diff --git "a/checkpoints/Qwen2.5-3B/babylm_reverse_control_10M_seed0/runs/checkpoint-1382/trainer_state.json" "b/checkpoints/Qwen2.5-3B/babylm_reverse_control_10M_seed0/runs/checkpoint-1382/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/checkpoints/Qwen2.5-3B/babylm_reverse_control_10M_seed0/runs/checkpoint-1382/trainer_state.json"
@@ -0,0 +1,10811 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.0,
+  "eval_steps": 10,
+  "global_step": 1382,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.001447178002894356,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0,
+      "loss": 1.5826,
+      "step": 1
+    },
+    {
+      "epoch": 0.002894356005788712,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0,
+      "loss": 1.6119,
+      "step": 2
+    },
+    {
+      "epoch": 0.004341534008683068,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0,
+      "loss": 1.62,
+      "step": 3
+    },
+    {
+      "epoch": 0.005788712011577424,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0,
+      "loss": 1.604,
+      "step": 4
+    },
+    {
+      "epoch": 0.00723589001447178,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0,
+      "loss": 1.6207,
+      "step": 5
+    },
+    {
+      "epoch": 0.008683068017366137,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0,
+      "loss": 1.6348,
+      "step": 6
+    },
+    {
+      "epoch": 0.010130246020260492,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0,
+      "loss": 1.627,
+      "step": 7
+    },
+    {
+      "epoch": 0.011577424023154847,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0,
+      "loss": 1.6195,
+      "step": 8
+    },
+    {
+      "epoch": 0.013024602026049204,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0,
+      "loss": 1.6251,
+      "step": 9
+    },
+    {
+      "epoch": 0.01447178002894356,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0,
+      "loss": 1.6017,
+      "step": 10
+    },
+    {
+      "epoch": 0.01447178002894356,
+      "eval_loss": 1.627951741218567,
+      "eval_runtime": 24.3872,
+      "eval_samples_per_second": 41.005,
+      "eval_steps_per_second": 2.583,
+      "step": 10
+    },
+    {
+      "epoch": 0.015918958031837915,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0,
+      "loss": 1.6406,
+      "step": 11
+    },
+    {
+      "epoch": 0.017366136034732273,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0,
+      "loss": 1.6387,
+      "step": 12
+    },
+    {
+      "epoch": 0.01881331403762663,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0,
+      "loss": 1.6292,
+      "step": 13
+    },
+    {
+      "epoch": 0.020260492040520984,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0,
+      "loss": 1.6097,
+      "step": 14
+    },
+    {
+      "epoch": 0.02170767004341534,
+      "grad_norm": 2.144015073776245,
+      "learning_rate": 3.597122302158274e-08,
+      "loss": 1.6483,
+      "step": 15
+    },
+    {
+      "epoch": 0.023154848046309694,
+      "grad_norm": 2.063476085662842,
+      "learning_rate": 7.194244604316547e-08,
+      "loss": 1.6203,
+      "step": 16
+    },
+    {
+      "epoch": 0.024602026049204053,
+      "grad_norm": 2.117913007736206,
+      "learning_rate": 1.0791366906474822e-07,
+      "loss": 1.6195,
+      "step": 17
+    },
+    {
+      "epoch": 0.02604920405209841,
+      "grad_norm": 2.244119167327881,
+      "learning_rate": 1.4388489208633095e-07,
+      "loss": 1.6481,
+      "step": 18
+    },
+    {
+      "epoch": 0.027496382054992764,
+      "grad_norm": 2.3340413570404053,
+      "learning_rate": 1.7985611510791368e-07,
+      "loss": 1.6166,
+      "step": 19
+    },
+    {
+      "epoch": 0.02894356005788712,
+      "grad_norm": 2.1988165378570557,
+      "learning_rate": 2.1582733812949643e-07,
+      "loss": 1.6399,
+      "step": 20
+    },
+    {
+      "epoch": 0.02894356005788712,
+      "eval_loss": 1.6270402669906616,
+      "eval_runtime": 24.6946,
+      "eval_samples_per_second": 40.495,
+      "eval_steps_per_second": 2.551,
+      "step": 20
+    },
+    {
+      "epoch": 0.030390738060781478,
+      "grad_norm": 2.1431221961975098,
+      "learning_rate": 2.5179856115107916e-07,
+      "loss": 1.6301,
+      "step": 21
+    },
+    {
+      "epoch": 0.03183791606367583,
+      "grad_norm": 2.168792247772217,
+      "learning_rate": 2.877697841726619e-07,
+      "loss": 1.5952,
+      "step": 22
+    },
+    {
+      "epoch": 0.03328509406657019,
+      "grad_norm": 2.015110731124878,
+      "learning_rate": 3.237410071942446e-07,
+      "loss": 1.6061,
+      "step": 23
+    },
+    {
+      "epoch": 0.03473227206946455,
+      "grad_norm": 2.050848960876465,
+      "learning_rate": 3.5971223021582736e-07,
+      "loss": 1.6472,
+      "step": 24
+    },
+    {
+      "epoch": 0.0361794500723589,
+      "grad_norm": 2.036330223083496,
+      "learning_rate": 3.956834532374101e-07,
+      "loss": 1.6324,
+      "step": 25
+    },
+    {
+      "epoch": 0.03762662807525326,
+      "grad_norm": 1.9470363855361938,
+      "learning_rate": 4.3165467625899287e-07,
+      "loss": 1.5941,
+      "step": 26
+    },
+    {
+      "epoch": 0.03907380607814761,
+      "grad_norm": 1.6043027639389038,
+      "learning_rate": 4.676258992805756e-07,
+      "loss": 1.6025,
+      "step": 27
+    },
+    {
+      "epoch": 0.04052098408104197,
+      "grad_norm": 1.7497000694274902,
+      "learning_rate": 5.035971223021583e-07,
+      "loss": 1.6381,
+      "step": 28
+    },
+    {
+      "epoch": 0.041968162083936326,
+      "grad_norm": 1.436023473739624,
+      "learning_rate": 5.39568345323741e-07,
+      "loss": 1.5767,
+      "step": 29
+    },
+    {
+      "epoch": 0.04341534008683068,
+      "grad_norm": 1.6225666999816895,
+      "learning_rate": 5.755395683453238e-07,
+      "loss": 1.5999,
+      "step": 30
+    },
+    {
+      "epoch": 0.04341534008683068,
+      "eval_loss": 1.600980520248413,
+      "eval_runtime": 25.1149,
+      "eval_samples_per_second": 39.817,
+      "eval_steps_per_second": 2.508,
+      "step": 30
+    },
+    {
+      "epoch": 0.04486251808972504,
+      "grad_norm": 1.3526175022125244,
+      "learning_rate": 6.115107913669066e-07,
+      "loss": 1.5752,
+      "step": 31
+    },
+    {
+      "epoch": 0.04630969609261939,
+      "grad_norm": 1.2586437463760376,
+      "learning_rate": 6.474820143884893e-07,
+      "loss": 1.6016,
+      "step": 32
+    },
+    {
+      "epoch": 0.04775687409551375,
+      "grad_norm": 1.216167688369751,
+      "learning_rate": 6.83453237410072e-07,
+      "loss": 1.6009,
+      "step": 33
+    },
+    {
+      "epoch": 0.049204052098408106,
+      "grad_norm": 1.1464942693710327,
+      "learning_rate": 7.194244604316547e-07,
+      "loss": 1.5922,
+      "step": 34
+    },
+    {
+      "epoch": 0.05065123010130246,
+      "grad_norm": 1.191906213760376,
+      "learning_rate": 7.553956834532375e-07,
+      "loss": 1.5621,
+      "step": 35
+    },
+    {
+      "epoch": 0.05209840810419682,
+      "grad_norm": 1.065446138381958,
+      "learning_rate": 7.913669064748202e-07,
+      "loss": 1.5611,
+      "step": 36
+    },
+    {
+      "epoch": 0.053545586107091175,
+      "grad_norm": 0.9210651516914368,
+      "learning_rate": 8.27338129496403e-07,
+      "loss": 1.592,
+      "step": 37
+    },
+    {
+      "epoch": 0.05499276410998553,
+      "grad_norm": 1.0145928859710693,
+      "learning_rate": 8.633093525179857e-07,
+      "loss": 1.551,
+      "step": 38
+    },
+    {
+      "epoch": 0.056439942112879886,
+      "grad_norm": 0.8538812398910522,
+      "learning_rate": 8.992805755395684e-07,
+      "loss": 1.564,
+      "step": 39
+    },
+    {
+      "epoch": 0.05788712011577424,
+      "grad_norm": 0.935691773891449,
+      "learning_rate": 9.352517985611512e-07,
+      "loss": 1.5504,
+      "step": 40
+    },
+    {
+      "epoch": 0.05788712011577424,
+      "eval_loss": 1.5676774978637695,
+      "eval_runtime": 25.5185,
+      "eval_samples_per_second": 39.187,
+      "eval_steps_per_second": 2.469,
+      "step": 40
+    },
+    {
+      "epoch": 0.059334298118668596,
+      "grad_norm": 0.9620401263237,
+      "learning_rate": 9.71223021582734e-07,
+      "loss": 1.5504,
+      "step": 41
+    },
+    {
+      "epoch": 0.060781476121562955,
+      "grad_norm": 0.9131060838699341,
+      "learning_rate": 1.0071942446043167e-06,
+      "loss": 1.5696,
+      "step": 42
+    },
+    {
+      "epoch": 0.06222865412445731,
+      "grad_norm": 0.875487744808197,
+      "learning_rate": 1.0431654676258993e-06,
+      "loss": 1.5916,
+      "step": 43
+    },
+    {
+      "epoch": 0.06367583212735166,
+      "grad_norm": 0.8755046725273132,
+      "learning_rate": 1.079136690647482e-06,
+      "loss": 1.5706,
+      "step": 44
+    },
+    {
+      "epoch": 0.06512301013024602,
+      "grad_norm": 0.8441756367683411,
+      "learning_rate": 1.115107913669065e-06,
+      "loss": 1.5616,
+      "step": 45
+    },
+    {
+      "epoch": 0.06657018813314038,
+      "grad_norm": 0.7749464511871338,
+      "learning_rate": 1.1510791366906476e-06,
+      "loss": 1.5414,
+      "step": 46
+    },
+    {
+      "epoch": 0.06801736613603473,
+      "grad_norm": 0.8531357645988464,
+      "learning_rate": 1.1870503597122303e-06,
+      "loss": 1.5627,
+      "step": 47
+    },
+    {
+      "epoch": 0.0694645441389291,
+      "grad_norm": 0.8716596364974976,
+      "learning_rate": 1.2230215827338131e-06,
+      "loss": 1.5464,
+      "step": 48
+    },
+    {
+      "epoch": 0.07091172214182344,
+      "grad_norm": 0.8619104027748108,
+      "learning_rate": 1.2589928057553958e-06,
+      "loss": 1.5329,
+      "step": 49
+    },
+    {
+      "epoch": 0.0723589001447178,
+      "grad_norm": 0.8491005897521973,
+      "learning_rate": 1.2949640287769785e-06,
+      "loss": 1.5075,
+      "step": 50
+    },
+    {
+      "epoch": 0.0723589001447178,
+      "eval_loss": 1.5282820463180542,
+      "eval_runtime": 25.6731,
+      "eval_samples_per_second": 38.951,
+      "eval_steps_per_second": 2.454,
+      "step": 50
+    },
+    {
+      "epoch": 0.07380607814761216,
+      "grad_norm": 0.8045928478240967,
+      "learning_rate": 1.3309352517985614e-06,
+      "loss": 1.4949,
+      "step": 51
+    },
+    {
+      "epoch": 0.07525325615050651,
+      "grad_norm": 0.9498971700668335,
+      "learning_rate": 1.366906474820144e-06,
+      "loss": 1.5126,
+      "step": 52
+    },
+    {
+      "epoch": 0.07670043415340087,
+      "grad_norm": 0.9595310091972351,
+      "learning_rate": 1.4028776978417265e-06,
+      "loss": 1.4954,
+      "step": 53
+    },
+    {
+      "epoch": 0.07814761215629522,
+      "grad_norm": 1.2015273571014404,
+      "learning_rate": 1.4388489208633094e-06,
+      "loss": 1.4952,
+      "step": 54
+    },
+    {
+      "epoch": 0.07959479015918958,
+      "grad_norm": 1.2412959337234497,
+      "learning_rate": 1.474820143884892e-06,
+      "loss": 1.5102,
+      "step": 55
+    },
+    {
+      "epoch": 0.08104196816208394,
+      "grad_norm": 1.0004088878631592,
+      "learning_rate": 1.510791366906475e-06,
+      "loss": 1.4709,
+      "step": 56
+    },
+    {
+      "epoch": 0.0824891461649783,
+      "grad_norm": 1.1461646556854248,
+      "learning_rate": 1.5467625899280579e-06,
+      "loss": 1.4214,
+      "step": 57
+    },
+    {
+      "epoch": 0.08393632416787265,
+      "grad_norm": 1.2485071420669556,
+      "learning_rate": 1.5827338129496403e-06,
+      "loss": 1.4698,
+      "step": 58
+    },
+    {
+      "epoch": 0.085383502170767,
+      "grad_norm": 1.2286319732666016,
+      "learning_rate": 1.618705035971223e-06,
+      "loss": 1.4802,
+      "step": 59
+    },
+    {
+      "epoch": 0.08683068017366136,
+      "grad_norm": 1.243710994720459,
+      "learning_rate": 1.654676258992806e-06,
+      "loss": 1.4204,
+      "step": 60
+    },
+    {
+      "epoch": 0.08683068017366136,
+      "eval_loss": 1.4558827877044678,
+      "eval_runtime": 25.629,
+      "eval_samples_per_second": 39.018,
+      "eval_steps_per_second": 2.458,
+      "step": 60
+    },
+    {
+      "epoch": 0.08827785817655572,
+      "grad_norm": 1.3729641437530518,
+      "learning_rate": 1.6906474820143886e-06,
+      "loss": 1.4566,
+      "step": 61
+    },
+    {
+      "epoch": 0.08972503617945007,
+      "grad_norm": 1.332112193107605,
+      "learning_rate": 1.7266187050359715e-06,
+      "loss": 1.453,
+      "step": 62
+    },
+    {
+      "epoch": 0.09117221418234443,
+      "grad_norm": 1.2128121852874756,
+      "learning_rate": 1.762589928057554e-06,
+      "loss": 1.4244,
+      "step": 63
+    },
+    {
+      "epoch": 0.09261939218523878,
+      "grad_norm": 1.3846666812896729,
+      "learning_rate": 1.7985611510791368e-06,
+      "loss": 1.4078,
+      "step": 64
+    },
+    {
+      "epoch": 0.09406657018813314,
+      "grad_norm": 1.2263684272766113,
+      "learning_rate": 1.8345323741007195e-06,
+      "loss": 1.3985,
+      "step": 65
+    },
+    {
+      "epoch": 0.0955137481910275,
+      "grad_norm": 1.3221999406814575,
+      "learning_rate": 1.8705035971223024e-06,
+      "loss": 1.3883,
+      "step": 66
+    },
+    {
+      "epoch": 0.09696092619392185,
+      "grad_norm": 1.3177974224090576,
+      "learning_rate": 1.906474820143885e-06,
+      "loss": 1.3672,
+      "step": 67
+    },
+    {
+      "epoch": 0.09840810419681621,
+      "grad_norm": 1.226685881614685,
+      "learning_rate": 1.942446043165468e-06,
+      "loss": 1.3625,
+      "step": 68
+    },
+    {
+      "epoch": 0.09985528219971057,
+      "grad_norm": 1.2232762575149536,
+      "learning_rate": 1.9784172661870504e-06,
+      "loss": 1.3844,
+      "step": 69
+    },
+    {
+      "epoch": 0.10130246020260492,
+      "grad_norm": 1.254563808441162,
+      "learning_rate": 2.0143884892086333e-06,
+      "loss": 1.3607,
+      "step": 70
+    },
+    {
+      "epoch": 0.10130246020260492,
+      "eval_loss": 1.3712953329086304,
+      "eval_runtime": 25.9893,
+      "eval_samples_per_second": 38.477,
+      "eval_steps_per_second": 2.424,
+      "step": 70
+    },
+    {
+      "epoch": 0.10274963820549927,
+      "grad_norm": 1.3770047426223755,
+      "learning_rate": 2.050359712230216e-06,
+      "loss": 1.3771,
+      "step": 71
+    },
+    {
+      "epoch": 0.10419681620839363,
+      "grad_norm": 1.3217036724090576,
+      "learning_rate": 2.0863309352517987e-06,
+      "loss": 1.3172,
+      "step": 72
+    },
+    {
+      "epoch": 0.10564399421128799,
+      "grad_norm": 1.3304638862609863,
+      "learning_rate": 2.1223021582733816e-06,
+      "loss": 1.318,
+      "step": 73
+    },
+    {
+      "epoch": 0.10709117221418235,
+      "grad_norm": 1.216414213180542,
+      "learning_rate": 2.158273381294964e-06,
+      "loss": 1.3,
+      "step": 74
+    },
+    {
+      "epoch": 0.1085383502170767,
+      "grad_norm": 1.1022595167160034,
+      "learning_rate": 2.194244604316547e-06,
+      "loss": 1.2698,
+      "step": 75
+    },
+    {
+      "epoch": 0.10998552821997105,
+      "grad_norm": 1.01797616481781,
+      "learning_rate": 2.23021582733813e-06,
+      "loss": 1.3161,
+      "step": 76
+    },
+    {
+      "epoch": 0.11143270622286541,
+      "grad_norm": 1.2778011560440063,
+      "learning_rate": 2.2661870503597123e-06,
+      "loss": 1.3176,
+      "step": 77
+    },
+    {
+      "epoch": 0.11287988422575977,
+      "grad_norm": 1.1276257038116455,
+      "learning_rate": 2.302158273381295e-06,
+      "loss": 1.3164,
+      "step": 78
+    },
+    {
+      "epoch": 0.11432706222865413,
+      "grad_norm": 1.0312668085098267,
+      "learning_rate": 2.3381294964028776e-06,
+      "loss": 1.3201,
+      "step": 79
+    },
+    {
+      "epoch": 0.11577424023154848,
+      "grad_norm": 1.1506502628326416,
+      "learning_rate": 2.3741007194244605e-06,
+      "loss": 1.2898,
+      "step": 80
+    },
+    {
+      "epoch": 0.11577424023154848,
+      "eval_loss": 1.3133100271224976,
+      "eval_runtime": 25.9424,
+      "eval_samples_per_second": 38.547,
+      "eval_steps_per_second": 2.428,
+      "step": 80
+    },
+    {
+      "epoch": 0.11722141823444283,
+      "grad_norm": 1.255002737045288,
+      "learning_rate": 2.4100719424460434e-06,
+      "loss": 1.247,
+      "step": 81
+    },
+    {
+      "epoch": 0.11866859623733719,
+      "grad_norm": 1.0065932273864746,
+      "learning_rate": 2.4460431654676263e-06,
+      "loss": 1.2602,
+      "step": 82
+    },
+    {
+      "epoch": 0.12011577424023155,
+      "grad_norm": 1.1586326360702515,
+      "learning_rate": 2.4820143884892088e-06,
+      "loss": 1.2704,
+      "step": 83
+    },
+    {
+      "epoch": 0.12156295224312591,
+      "grad_norm": 1.0571246147155762,
+      "learning_rate": 2.5179856115107916e-06,
+      "loss": 1.2633,
+      "step": 84
+    },
+    {
+      "epoch": 0.12301013024602026,
+      "grad_norm": 1.0377482175827026,
+      "learning_rate": 2.5539568345323745e-06,
+      "loss": 1.2238,
+      "step": 85
+    },
+    {
+      "epoch": 0.12445730824891461,
+      "grad_norm": 1.1468099355697632,
+      "learning_rate": 2.589928057553957e-06,
+      "loss": 1.2596,
+      "step": 86
+    },
+    {
+      "epoch": 0.12590448625180897,
+      "grad_norm": 1.0852339267730713,
+      "learning_rate": 2.6258992805755395e-06,
+      "loss": 1.2081,
+      "step": 87
+    },
+    {
+      "epoch": 0.12735166425470332,
+      "grad_norm": 1.1638356447219849,
+      "learning_rate": 2.6618705035971228e-06,
+      "loss": 1.2345,
+      "step": 88
+    },
+    {
+      "epoch": 0.1287988422575977,
+      "grad_norm": 1.0700938701629639,
+      "learning_rate": 2.6978417266187052e-06,
+      "loss": 1.257,
+      "step": 89
+    },
+    {
+      "epoch": 0.13024602026049203,
+      "grad_norm": 1.0385199785232544,
+      "learning_rate": 2.733812949640288e-06,
+      "loss": 1.2391,
+      "step": 90
+    },
+    {
+      "epoch": 0.13024602026049203,
+      "eval_loss": 1.2747834920883179,
+      "eval_runtime": 25.8114,
+      "eval_samples_per_second": 38.743,
+      "eval_steps_per_second": 2.441,
+      "step": 90
+    },
+    {
+      "epoch": 0.1316931982633864,
+      "grad_norm": 1.0480972528457642,
+      "learning_rate": 2.7697841726618706e-06,
+      "loss": 1.251,
+      "step": 91
+    },
+    {
+      "epoch": 0.13314037626628075,
+      "grad_norm": 1.0574753284454346,
+      "learning_rate": 2.805755395683453e-06,
+      "loss": 1.2553,
+      "step": 92
+    },
+    {
+      "epoch": 0.1345875542691751,
+      "grad_norm": 1.0661845207214355,
+      "learning_rate": 2.8417266187050364e-06,
+      "loss": 1.2123,
+      "step": 93
+    },
+    {
+      "epoch": 0.13603473227206947,
+      "grad_norm": 1.0629788637161255,
+      "learning_rate": 2.877697841726619e-06,
+      "loss": 1.2224,
+      "step": 94
+    },
+    {
+      "epoch": 0.13748191027496381,
+      "grad_norm": 1.0361292362213135,
+      "learning_rate": 2.9136690647482017e-06,
+      "loss": 1.2173,
+      "step": 95
+    },
+    {
+      "epoch": 0.1389290882778582,
+      "grad_norm": 0.9286133050918579,
+      "learning_rate": 2.949640287769784e-06,
+      "loss": 1.1927,
+      "step": 96
+    },
+    {
+      "epoch": 0.14037626628075253,
+      "grad_norm": 1.048107624053955,
+      "learning_rate": 2.985611510791367e-06,
+      "loss": 1.2088,
+      "step": 97
+    },
+    {
+      "epoch": 0.14182344428364688,
+      "grad_norm": 1.0357203483581543,
+      "learning_rate": 3.02158273381295e-06,
+      "loss": 1.1913,
+      "step": 98
+    },
+    {
+      "epoch": 0.14327062228654125,
+      "grad_norm": 0.9728686213493347,
+      "learning_rate": 3.0575539568345324e-06,
+      "loss": 1.2347,
+      "step": 99
+    },
+    {
+      "epoch": 0.1447178002894356,
+      "grad_norm": 1.0187407732009888,
+      "learning_rate": 3.0935251798561158e-06,
+      "loss": 1.2401,
+      "step": 100
+    },
+    {
+      "epoch": 0.1447178002894356,
+      "eval_loss": 1.246801495552063,
+      "eval_runtime": 26.0552,
+      "eval_samples_per_second": 38.38,
+      "eval_steps_per_second": 2.418,
+      "step": 100
+    },
+    {
+      "epoch": 0.14616497829232997,
+      "grad_norm": 1.071919322013855,
+      "learning_rate": 3.1294964028776982e-06,
+      "loss": 1.1819,
+      "step": 101
+    },
+    {
+      "epoch": 0.1476121562952243,
+      "grad_norm": 1.1320074796676636,
+      "learning_rate": 3.1654676258992807e-06,
+      "loss": 1.1978,
+      "step": 102
+    },
+    {
+      "epoch": 0.14905933429811866,
+      "grad_norm": 1.1210949420928955,
+      "learning_rate": 3.2014388489208636e-06,
+      "loss": 1.158,
+      "step": 103
+    },
+    {
+      "epoch": 0.15050651230101303,
+      "grad_norm": 1.0182701349258423,
+      "learning_rate": 3.237410071942446e-06,
+      "loss": 1.1825,
+      "step": 104
+    },
+    {
+      "epoch": 0.15195369030390737,
+      "grad_norm": 1.1875406503677368,
+      "learning_rate": 3.2733812949640294e-06,
+      "loss": 1.1763,
+      "step": 105
+    },
+    {
+      "epoch": 0.15340086830680175,
+      "grad_norm": 1.08661687374115,
+      "learning_rate": 3.309352517985612e-06,
+      "loss": 1.163,
+      "step": 106
+    },
+    {
+      "epoch": 0.1548480463096961,
+      "grad_norm": 1.0423978567123413,
+      "learning_rate": 3.3453237410071943e-06,
+      "loss": 1.1796,
+      "step": 107
+    },
+    {
+      "epoch": 0.15629522431259044,
+      "grad_norm": 1.0998430252075195,
+      "learning_rate": 3.381294964028777e-06,
+      "loss": 1.1686,
+      "step": 108
+    },
+    {
+      "epoch": 0.1577424023154848,
+      "grad_norm": 1.279999852180481,
+      "learning_rate": 3.4172661870503596e-06,
+      "loss": 1.1859,
+      "step": 109
+    },
+    {
+      "epoch": 0.15918958031837915,
+      "grad_norm": 1.2889167070388794,
+      "learning_rate": 3.453237410071943e-06,
+      "loss": 1.1262,
+      "step": 110
+    },
+    {
+      "epoch": 0.15918958031837915,
+      "eval_loss": 1.2298810482025146,
+      "eval_runtime": 25.854,
+      "eval_samples_per_second": 38.679,
+      "eval_steps_per_second": 2.437,
+      "step": 110
+    },
+    {
+      "epoch": 0.16063675832127353,
+      "grad_norm": 1.3285552263259888,
+      "learning_rate": 3.4892086330935254e-06,
+      "loss": 1.2033,
+      "step": 111
+    },
+    {
+      "epoch": 0.16208393632416787,
+      "grad_norm": 1.6612026691436768,
+      "learning_rate": 3.525179856115108e-06,
+      "loss": 1.1981,
+      "step": 112
+    },
+    {
+      "epoch": 0.16353111432706222,
+      "grad_norm": 1.6106890439987183,
+      "learning_rate": 3.561151079136691e-06,
+      "loss": 1.1716,
+      "step": 113
+    },
+    {
+      "epoch": 0.1649782923299566,
+      "grad_norm": 1.4226655960083008,
+      "learning_rate": 3.5971223021582737e-06,
+      "loss": 1.1666,
+      "step": 114
+    },
+    {
+      "epoch": 0.16642547033285093,
+      "grad_norm": 1.284498929977417,
+      "learning_rate": 3.6330935251798566e-06,
+      "loss": 1.1461,
+      "step": 115
+    },
+    {
+      "epoch": 0.1678726483357453,
+      "grad_norm": 1.3292125463485718,
+      "learning_rate": 3.669064748201439e-06,
+      "loss": 1.1256,
+      "step": 116
+    },
+    {
+      "epoch": 0.16931982633863965,
+      "grad_norm": 1.376024842262268,
+      "learning_rate": 3.7050359712230215e-06,
+      "loss": 1.1779,
+      "step": 117
+    },
+    {
+      "epoch": 0.170767004341534,
+      "grad_norm": 1.2837554216384888,
+      "learning_rate": 3.741007194244605e-06,
+      "loss": 1.1377,
+      "step": 118
+    },
+    {
+      "epoch": 0.17221418234442837,
+      "grad_norm": 1.2247073650360107,
+      "learning_rate": 3.7769784172661873e-06,
+      "loss": 1.1634,
+      "step": 119
+    },
+    {
+      "epoch": 0.1736613603473227,
+      "grad_norm": 1.1542692184448242,
+      "learning_rate": 3.81294964028777e-06,
+      "loss": 1.1496,
+      "step": 120
+    },
+    {
+      "epoch": 0.1736613603473227,
+      "eval_loss": 1.2104259729385376,
+      "eval_runtime": 25.8178,
+      "eval_samples_per_second": 38.733,
+      "eval_steps_per_second": 2.44,
+      "step": 120
+    },
+    {
+      "epoch": 0.17510853835021709,
+      "grad_norm": 1.136985421180725,
+      "learning_rate": 3.848920863309353e-06,
+      "loss": 1.1459,
+      "step": 121
+    },
+    {
+      "epoch": 0.17655571635311143,
+      "grad_norm": 1.451317548751831,
+      "learning_rate": 3.884892086330936e-06,
+      "loss": 1.1443,
+      "step": 122
+    },
+    {
+      "epoch": 0.17800289435600578,
+      "grad_norm": 1.2180333137512207,
+      "learning_rate": 3.920863309352518e-06,
+      "loss": 1.1108,
+      "step": 123
+    },
+    {
+      "epoch": 0.17945007235890015,
+      "grad_norm": 1.244494080543518,
+      "learning_rate": 3.956834532374101e-06,
+      "loss": 1.1416,
+      "step": 124
+    },
+    {
+      "epoch": 0.1808972503617945,
+      "grad_norm": 1.394103765487671,
+      "learning_rate": 3.992805755395684e-06,
+      "loss": 1.1612,
+      "step": 125
+    },
+    {
+      "epoch": 0.18234442836468887,
+      "grad_norm": 1.4224308729171753,
+      "learning_rate": 4.028776978417267e-06,
+      "loss": 1.1541,
+      "step": 126
+    },
+    {
+      "epoch": 0.1837916063675832,
+      "grad_norm": 1.1226342916488647,
+      "learning_rate": 4.0647482014388495e-06,
+      "loss": 1.158,
+      "step": 127
+    },
+    {
+      "epoch": 0.18523878437047755,
+      "grad_norm": 1.6892023086547852,
+      "learning_rate": 4.100719424460432e-06,
+      "loss": 1.1361,
+      "step": 128
+    },
+    {
+      "epoch": 0.18668596237337193,
+      "grad_norm": 1.2189652919769287,
+      "learning_rate": 4.1366906474820145e-06,
+      "loss": 1.1349,
+      "step": 129
+    },
+    {
+      "epoch": 0.18813314037626627,
+      "grad_norm": 1.3171944618225098,
+      "learning_rate": 4.172661870503597e-06,
+      "loss": 1.1279,
+      "step": 130
+    },
+    {
+      "epoch": 0.18813314037626627,
+      "eval_loss": 1.1947468519210815,
+      "eval_runtime": 25.8646,
+      "eval_samples_per_second": 38.663,
+      "eval_steps_per_second": 2.436,
+      "step": 130
+    },
+    {
+      "epoch": 0.18958031837916064,
+      "grad_norm": 1.3438791036605835,
+      "learning_rate": 4.20863309352518e-06,
+      "loss": 1.1665,
+      "step": 131
+    },
+    {
+      "epoch": 0.191027496382055,
+      "grad_norm": 1.237427830696106,
+      "learning_rate": 4.244604316546763e-06,
+      "loss": 1.1249,
+      "step": 132
+    },
+    {
+      "epoch": 0.19247467438494936,
+      "grad_norm": 1.2919330596923828,
+      "learning_rate": 4.280575539568346e-06,
+      "loss": 1.165,
+      "step": 133
+    },
+    {
+      "epoch": 0.1939218523878437,
+      "grad_norm": 1.0939977169036865,
+      "learning_rate": 4.316546762589928e-06,
+      "loss": 1.1565,
+      "step": 134
+    },
+    {
+      "epoch": 0.19536903039073805,
+      "grad_norm": 1.280110478401184,
+      "learning_rate": 4.352517985611511e-06,
+      "loss": 1.1298,
+      "step": 135
+    },
+    {
+      "epoch": 0.19681620839363242,
+      "grad_norm": 1.3320883512496948,
+      "learning_rate": 4.388489208633094e-06,
+      "loss": 1.1175,
+      "step": 136
+    },
+    {
+      "epoch": 0.19826338639652677,
+      "grad_norm": 1.280161738395691,
+      "learning_rate": 4.424460431654677e-06,
+      "loss": 1.1015,
+      "step": 137
+    },
+    {
+      "epoch": 0.19971056439942114,
+      "grad_norm": 1.1698580980300903,
+      "learning_rate": 4.46043165467626e-06,
+      "loss": 1.0857,
+      "step": 138
+    },
+    {
+      "epoch": 0.2011577424023155,
+      "grad_norm": 1.2695364952087402,
+      "learning_rate": 4.496402877697842e-06,
+      "loss": 1.1271,
+      "step": 139
+    },
+    {
+      "epoch": 0.20260492040520983,
+      "grad_norm": 1.2971086502075195,
+      "learning_rate": 4.5323741007194245e-06,
+      "loss": 1.1204,
+      "step": 140
+    },
+    {
+      "epoch": 0.20260492040520983,
+      "eval_loss": 1.1829454898834229,
+      "eval_runtime": 25.6947,
+      "eval_samples_per_second": 38.918,
+      "eval_steps_per_second": 2.452,
+      "step": 140
+    },
+    {
+      "epoch": 0.2040520984081042,
+      "grad_norm": 1.4637281894683838,
+      "learning_rate": 4.5683453237410074e-06,
+      "loss": 1.099,
+      "step": 141
+    },
+    {
+      "epoch": 0.20549927641099855,
+      "grad_norm": 1.3764904737472534,
+      "learning_rate": 4.60431654676259e-06,
+      "loss": 1.1115,
+      "step": 142
+    },
+    {
+      "epoch": 0.20694645441389292,
+      "grad_norm": 1.2485584020614624,
+      "learning_rate": 4.640287769784173e-06,
+      "loss": 1.1036,
+      "step": 143
+    },
+    {
+      "epoch": 0.20839363241678727,
+      "grad_norm": 1.5968283414840698,
+      "learning_rate": 4.676258992805755e-06,
+      "loss": 1.1008,
+      "step": 144
+    },
+    {
+      "epoch": 0.2098408104196816,
+      "grad_norm": 1.275233268737793,
+      "learning_rate": 4.712230215827339e-06,
+      "loss": 1.1326,
+      "step": 145
+    },
+    {
+      "epoch": 0.21128798842257598,
+      "grad_norm": 1.4760442972183228,
+      "learning_rate": 4.748201438848921e-06,
+      "loss": 1.1176,
+      "step": 146
+    },
+    {
+      "epoch": 0.21273516642547033,
+      "grad_norm": 1.381948709487915,
+      "learning_rate": 4.784172661870504e-06,
+      "loss": 1.0668,
+      "step": 147
+    },
+    {
+      "epoch": 0.2141823444283647,
+      "grad_norm": 1.8259927034378052,
+      "learning_rate": 4.820143884892087e-06,
+      "loss": 1.1167,
+      "step": 148
+    },
+    {
+      "epoch": 0.21562952243125905,
+      "grad_norm": 1.4848363399505615,
+      "learning_rate": 4.856115107913669e-06,
+      "loss": 1.1014,
+      "step": 149
+    },
+    {
+      "epoch": 0.2170767004341534,
+      "grad_norm": 1.5530890226364136,
+      "learning_rate": 4.892086330935253e-06,
+      "loss": 1.1171,
+      "step": 150
+    },
+    {
+      "epoch": 0.2170767004341534,
+      "eval_loss": 1.1686047315597534,
+      "eval_runtime": 25.7032,
+      "eval_samples_per_second": 38.906,
+      "eval_steps_per_second": 2.451,
+      "step": 150
+    },
+    {
+      "epoch": 0.21852387843704776,
+      "grad_norm": 1.445328712463379,
+      "learning_rate": 4.928057553956835e-06,
+      "loss": 1.112,
+      "step": 151
+    },
+    {
+      "epoch": 0.2199710564399421,
+      "grad_norm": 1.6887681484222412,
+      "learning_rate": 4.9640287769784175e-06,
+      "loss": 1.0563,
+      "step": 152
+    },
+    {
+      "epoch": 0.22141823444283648,
+      "grad_norm": 1.165201187133789,
+      "learning_rate": 5e-06,
+      "loss": 1.0794,
+      "step": 153
+    },
+    {
+      "epoch": 0.22286541244573083,
+      "grad_norm": 1.548592448234558,
+      "learning_rate": 4.9959774738535805e-06,
+      "loss": 1.0889,
+      "step": 154
+    },
+    {
+      "epoch": 0.22431259044862517,
+      "grad_norm": 1.2718911170959473,
+      "learning_rate": 4.991954947707161e-06,
+      "loss": 1.0748,
+      "step": 155
+    },
+    {
+      "epoch": 0.22575976845151954,
+      "grad_norm": 1.3692930936813354,
+      "learning_rate": 4.987932421560741e-06,
+      "loss": 1.103,
+      "step": 156
+    },
+    {
+      "epoch": 0.2272069464544139,
+      "grad_norm": 1.3592978715896606,
+      "learning_rate": 4.983909895414321e-06,
+      "loss": 1.1079,
+      "step": 157
+    },
+    {
+      "epoch": 0.22865412445730826,
+      "grad_norm": 1.5015676021575928,
+      "learning_rate": 4.9798873692679e-06,
+      "loss": 1.0584,
+      "step": 158
+    },
+    {
+      "epoch": 0.2301013024602026,
+      "grad_norm": 1.4109690189361572,
+      "learning_rate": 4.97586484312148e-06,
+      "loss": 1.0821,
+      "step": 159
+    },
+    {
+      "epoch": 0.23154848046309695,
+      "grad_norm": 1.3715922832489014,
+      "learning_rate": 4.9718423169750605e-06,
+      "loss": 1.1053,
+      "step": 160
+    },
+    {
+      "epoch": 0.23154848046309695,
+      "eval_loss": 1.1580647230148315,
+      "eval_runtime": 25.6393,
+      "eval_samples_per_second": 39.003,
+      "eval_steps_per_second": 2.457,
+      "step": 160
+    },
+    {
+      "epoch": 0.23299565846599132,
+      "grad_norm": 1.1541366577148438,
+      "learning_rate": 4.967819790828641e-06,
+      "loss": 1.0857,
+      "step": 161
+    },
+    {
+      "epoch": 0.23444283646888567,
+      "grad_norm": 1.5763776302337646,
+      "learning_rate": 4.963797264682221e-06,
+      "loss": 1.0539,
+      "step": 162
+    },
+    {
+      "epoch": 0.23589001447178004,
+      "grad_norm": 1.3444937467575073,
+      "learning_rate": 4.959774738535801e-06,
+      "loss": 1.0772,
+      "step": 163
+    },
+    {
+      "epoch": 0.23733719247467439,
+      "grad_norm": 1.4197810888290405,
+      "learning_rate": 4.955752212389381e-06,
+      "loss": 1.0827,
+      "step": 164
+    },
+    {
+      "epoch": 0.23878437047756873,
+      "grad_norm": 1.1938902139663696,
+      "learning_rate": 4.951729686242961e-06,
+      "loss": 1.034,
+      "step": 165
+    },
+    {
+      "epoch": 0.2402315484804631,
+      "grad_norm": 1.2534770965576172,
+      "learning_rate": 4.947707160096541e-06,
+      "loss": 1.0898,
+      "step": 166
+    },
+    {
+      "epoch": 0.24167872648335745,
+      "grad_norm": 1.3372267484664917,
+      "learning_rate": 4.943684633950121e-06,
+      "loss": 1.0581,
+      "step": 167
+    },
+    {
+      "epoch": 0.24312590448625182,
+      "grad_norm": 1.3008803129196167,
+      "learning_rate": 4.939662107803701e-06,
+      "loss": 1.085,
+      "step": 168
+    },
+    {
+      "epoch": 0.24457308248914617,
+      "grad_norm": 1.2555029392242432,
+      "learning_rate": 4.935639581657281e-06,
+      "loss": 1.0546,
+      "step": 169
+    },
+    {
+      "epoch": 0.2460202604920405,
+      "grad_norm": 1.3190017938613892,
+      "learning_rate": 4.931617055510861e-06,
+      "loss": 1.0895,
+      "step": 170
+    },
+    {
+      "epoch": 0.2460202604920405,
+      "eval_loss": 1.1438969373703003,
+      "eval_runtime": 25.6063,
+      "eval_samples_per_second": 39.053,
+      "eval_steps_per_second": 2.46,
+      "step": 170
+    },
+    {
+      "epoch": 0.24746743849493488,
+      "grad_norm": 1.3019165992736816,
+      "learning_rate": 4.927594529364441e-06,
+      "loss": 1.1047,
+      "step": 171
+    },
+    {
+      "epoch": 0.24891461649782923,
+      "grad_norm": 1.232087254524231,
+      "learning_rate": 4.923572003218021e-06,
+      "loss": 1.0703,
+      "step": 172
+    },
+    {
+      "epoch": 0.2503617945007236,
+      "grad_norm": 1.3068221807479858,
+      "learning_rate": 4.919549477071601e-06,
+      "loss": 1.0331,
+      "step": 173
+    },
+    {
+      "epoch": 0.25180897250361794,
+      "grad_norm": 1.521071434020996,
+      "learning_rate": 4.915526950925181e-06,
+      "loss": 1.0559,
+      "step": 174
+    },
+    {
+      "epoch": 0.2532561505065123,
+      "grad_norm": 1.411606788635254,
+      "learning_rate": 4.9115044247787615e-06,
+      "loss": 1.0705,
+      "step": 175
+    },
+    {
+      "epoch": 0.25470332850940663,
+      "grad_norm": 1.6073122024536133,
+      "learning_rate": 4.907481898632342e-06,
+      "loss": 1.0492,
+      "step": 176
+    },
+    {
+      "epoch": 0.25615050651230103,
+      "grad_norm": 1.4059184789657593,
+      "learning_rate": 4.903459372485922e-06,
+      "loss": 1.1006,
+      "step": 177
+    },
+    {
+      "epoch": 0.2575976845151954,
+      "grad_norm": 1.370344638824463,
+      "learning_rate": 4.899436846339501e-06,
+      "loss": 1.0533,
+      "step": 178
+    },
+    {
+      "epoch": 0.2590448625180897,
+      "grad_norm": 1.4646189212799072,
+      "learning_rate": 4.895414320193081e-06,
+      "loss": 1.0602,
+      "step": 179
+    },
+    {
+      "epoch": 0.26049204052098407,
+      "grad_norm": 1.5465567111968994,
+      "learning_rate": 4.891391794046661e-06,
+      "loss": 1.0773,
+      "step": 180
+    },
+    {
+      "epoch": 0.26049204052098407,
+      "eval_loss": 1.1373363733291626,
+      "eval_runtime": 25.632,
+      "eval_samples_per_second": 39.014,
+      "eval_steps_per_second": 2.458,
+      "step": 180
+    },
+    {
+      "epoch": 0.2619392185238784,
+      "grad_norm": 1.7964074611663818,
+      "learning_rate": 4.8873692679002414e-06,
+      "loss": 1.0827,
+      "step": 181
+    },
+    {
+      "epoch": 0.2633863965267728,
+      "grad_norm": 1.2767939567565918,
+      "learning_rate": 4.8833467417538216e-06,
+      "loss": 1.0588,
+      "step": 182
+    },
+    {
+      "epoch": 0.26483357452966716,
+      "grad_norm": 1.7200208902359009,
+      "learning_rate": 4.879324215607402e-06,
+      "loss": 1.069,
+      "step": 183
+    },
+    {
+      "epoch": 0.2662807525325615,
+      "grad_norm": 1.3738912343978882,
+      "learning_rate": 4.875301689460982e-06,
+      "loss": 1.0447,
+      "step": 184
+    },
+    {
+      "epoch": 0.26772793053545585,
+      "grad_norm": 1.4727184772491455,
+      "learning_rate": 4.871279163314562e-06,
+      "loss": 1.0738,
+      "step": 185
+    },
+    {
+      "epoch": 0.2691751085383502,
+      "grad_norm": 1.7510229349136353,
+      "learning_rate": 4.867256637168142e-06,
+      "loss": 1.0569,
+      "step": 186
+    },
+    {
+      "epoch": 0.2706222865412446,
+      "grad_norm": 1.5435395240783691,
+      "learning_rate": 4.863234111021722e-06,
+      "loss": 1.0365,
+      "step": 187
+    },
+    {
+      "epoch": 0.27206946454413894,
+      "grad_norm": 1.9311236143112183,
+      "learning_rate": 4.8592115848753015e-06,
+      "loss": 1.0527,
+      "step": 188
+    },
+    {
+      "epoch": 0.2735166425470333,
+      "grad_norm": 1.4670462608337402,
+      "learning_rate": 4.855189058728882e-06,
+      "loss": 1.0547,
+      "step": 189
+    },
+    {
+      "epoch": 0.27496382054992763,
+      "grad_norm": 1.773186445236206,
+      "learning_rate": 4.851166532582462e-06,
+      "loss": 1.0374,
+      "step": 190
+    },
+    {
+      "epoch": 0.27496382054992763,
+      "eval_loss": 1.1362701654434204,
+      "eval_runtime": 25.664,
+      "eval_samples_per_second": 38.965,
+      "eval_steps_per_second": 2.455,
+      "step": 190
+    },
+    {
+      "epoch": 0.276410998552822,
+      "grad_norm": 1.3181395530700684,
+      "learning_rate": 4.847144006436042e-06,
+      "loss": 1.0727,
+      "step": 191
+    },
+    {
+      "epoch": 0.2778581765557164,
+      "grad_norm": 1.30997633934021,
+      "learning_rate": 4.843121480289622e-06,
+      "loss": 1.0038,
+      "step": 192
+    },
+    {
+      "epoch": 0.2793053545586107,
+      "grad_norm": 1.6861798763275146,
+      "learning_rate": 4.839098954143202e-06,
+      "loss": 1.0219,
+      "step": 193
+    },
+    {
+      "epoch": 0.28075253256150506,
+      "grad_norm": 1.3235957622528076,
+      "learning_rate": 4.835076427996782e-06,
+      "loss": 1.0406,
+      "step": 194
+    },
+    {
+      "epoch": 0.2821997105643994,
+      "grad_norm": 1.4384372234344482,
+      "learning_rate": 4.831053901850362e-06,
+      "loss": 1.0828,
+      "step": 195
+    },
+    {
+      "epoch": 0.28364688856729375,
+      "grad_norm": 1.4204976558685303,
+      "learning_rate": 4.8270313757039425e-06,
+      "loss": 1.0542,
+      "step": 196
+    },
+    {
+      "epoch": 0.28509406657018815,
+      "grad_norm": 1.207572340965271,
+      "learning_rate": 4.823008849557523e-06,
+      "loss": 1.033,
+      "step": 197
+    },
+    {
+      "epoch": 0.2865412445730825,
+      "grad_norm": 1.4599734544754028,
+      "learning_rate": 4.818986323411103e-06,
+      "loss": 1.0303,
+      "step": 198
+    },
+    {
+      "epoch": 0.28798842257597684,
+      "grad_norm": 1.5304739475250244,
+      "learning_rate": 4.814963797264683e-06,
+      "loss": 1.0334,
+      "step": 199
+    },
+    {
+      "epoch": 0.2894356005788712,
+      "grad_norm": 1.3051408529281616,
+      "learning_rate": 4.810941271118263e-06,
+      "loss": 1.0265,
+      "step": 200
+    },
+    {
+      "epoch": 0.2894356005788712,
+      "eval_loss": 1.1237152814865112,
+      "eval_runtime": 25.6565,
+      "eval_samples_per_second": 38.977,
+      "eval_steps_per_second": 2.456,
+      "step": 200
+    },
+    {
+      "epoch": 0.29088277858176553,
+      "grad_norm": 1.3950132131576538,
+      "learning_rate": 4.806918744971843e-06,
+      "loss": 1.0247,
+      "step": 201
+    },
+    {
+      "epoch": 0.29232995658465993,
+      "grad_norm": 1.3100475072860718,
+      "learning_rate": 4.802896218825423e-06,
+      "loss": 1.0257,
+      "step": 202
+    },
+    {
+      "epoch": 0.2937771345875543,
+      "grad_norm": 1.1980586051940918,
+      "learning_rate": 4.798873692679003e-06,
+      "loss": 1.063,
+      "step": 203
+    },
+    {
+      "epoch": 0.2952243125904486,
+      "grad_norm": 1.4957659244537354,
+      "learning_rate": 4.794851166532583e-06,
+      "loss": 1.1039,
+      "step": 204
+    },
+    {
+      "epoch": 0.29667149059334297,
+      "grad_norm": 1.406442403793335,
+      "learning_rate": 4.790828640386163e-06,
+      "loss": 1.0844,
+      "step": 205
+    },
+    {
+      "epoch": 0.2981186685962373,
+      "grad_norm": 1.2532386779785156,
+      "learning_rate": 4.786806114239743e-06,
+      "loss": 1.0421,
+      "step": 206
+    },
+    {
+      "epoch": 0.2995658465991317,
+      "grad_norm": 1.311007022857666,
+      "learning_rate": 4.782783588093323e-06,
+      "loss": 1.0657,
+      "step": 207
+    },
+    {
+      "epoch": 0.30101302460202606,
+      "grad_norm": 1.427161693572998,
+      "learning_rate": 4.778761061946903e-06,
+      "loss": 1.0112,
+      "step": 208
+    },
+    {
+      "epoch": 0.3024602026049204,
+      "grad_norm": 1.6766445636749268,
+      "learning_rate": 4.774738535800483e-06,
+      "loss": 1.0514,
+      "step": 209
+    },
+    {
+      "epoch": 0.30390738060781475,
+      "grad_norm": 1.422170639038086,
+      "learning_rate": 4.7707160096540635e-06,
+      "loss": 1.0221,
+      "step": 210
+    },
+    {
+      "epoch": 0.30390738060781475,
+      "eval_loss": 1.1181718111038208,
+      "eval_runtime": 25.6967,
+      "eval_samples_per_second": 38.916,
+      "eval_steps_per_second": 2.452,
+      "step": 210
+    },
+    {
+      "epoch": 0.3053545586107091,
+      "grad_norm": 1.2965795993804932,
+      "learning_rate": 4.766693483507644e-06,
+      "loss": 1.0388,
+      "step": 211
+    },
+    {
+      "epoch": 0.3068017366136035,
+      "grad_norm": 1.3259484767913818,
+      "learning_rate": 4.762670957361224e-06,
+      "loss": 1.0565,
+      "step": 212
+    },
+    {
+      "epoch": 0.30824891461649784,
+      "grad_norm": 1.2306101322174072,
+      "learning_rate": 4.758648431214804e-06,
+      "loss": 1.0349,
+      "step": 213
+    },
+    {
+      "epoch": 0.3096960926193922,
+      "grad_norm": 1.4872595071792603,
+      "learning_rate": 4.754625905068383e-06,
+      "loss": 1.0754,
+      "step": 214
+    },
+    {
+      "epoch": 0.3111432706222865,
+      "grad_norm": 1.1391340494155884,
+      "learning_rate": 4.750603378921963e-06,
+      "loss": 1.0491,
+      "step": 215
+    },
+    {
+      "epoch": 0.3125904486251809,
+      "grad_norm": 1.254473328590393,
+      "learning_rate": 4.746580852775543e-06,
+      "loss": 1.0425,
+      "step": 216
+    },
+    {
+      "epoch": 0.3140376266280753,
+      "grad_norm": 1.3624285459518433,
+      "learning_rate": 4.7425583266291235e-06,
+      "loss": 1.0578,
+      "step": 217
+    },
+    {
+      "epoch": 0.3154848046309696,
+      "grad_norm": 1.3315480947494507,
+      "learning_rate": 4.738535800482704e-06,
+      "loss": 1.0402,
+      "step": 218
+    },
+    {
+      "epoch": 0.31693198263386396,
+      "grad_norm": 1.4668720960617065,
+      "learning_rate": 4.734513274336284e-06,
+      "loss": 1.0421,
+      "step": 219
+    },
+    {
+      "epoch": 0.3183791606367583,
+      "grad_norm": 1.246150255203247,
+      "learning_rate": 4.730490748189864e-06,
+      "loss": 1.0138,
+      "step": 220
+    },
+    {
+      "epoch": 0.3183791606367583,
+      "eval_loss": 1.1130249500274658,
+      "eval_runtime": 25.9195,
+      "eval_samples_per_second": 38.581,
+      "eval_steps_per_second": 2.431,
+      "step": 220
+    },
+    {
+      "epoch": 0.31982633863965265,
+      "grad_norm": 1.2191036939620972,
+      "learning_rate": 4.726468222043444e-06,
+      "loss": 1.0245,
+      "step": 221
+    },
+    {
+      "epoch": 0.32127351664254705,
+      "grad_norm": 1.3822863101959229,
+      "learning_rate": 4.722445695897024e-06,
+      "loss": 1.0409,
+      "step": 222
+    },
+    {
+      "epoch": 0.3227206946454414,
+      "grad_norm": 1.2923561334609985,
+      "learning_rate": 4.718423169750604e-06,
+      "loss": 1.0555,
+      "step": 223
+    },
+    {
+      "epoch": 0.32416787264833574,
+      "grad_norm": 1.3411794900894165,
+      "learning_rate": 4.7144006436041835e-06,
+      "loss": 0.9884,
+      "step": 224
+    },
+    {
+      "epoch": 0.3256150506512301,
+      "grad_norm": 1.4061198234558105,
+      "learning_rate": 4.710378117457764e-06,
+      "loss": 0.9896,
+      "step": 225
+    },
+    {
+      "epoch": 0.32706222865412443,
+      "grad_norm": 1.361370325088501,
+      "learning_rate": 4.706355591311344e-06,
+      "loss": 1.015,
+      "step": 226
+    },
+    {
+      "epoch": 0.32850940665701883,
+      "grad_norm": 1.2427265644073486,
+      "learning_rate": 4.702333065164924e-06,
+      "loss": 1.0235,
+      "step": 227
+    },
+    {
+      "epoch": 0.3299565846599132,
+      "grad_norm": 1.3974968194961548,
+      "learning_rate": 4.698310539018504e-06,
+      "loss": 1.0351,
+      "step": 228
+    },
+    {
+      "epoch": 0.3314037626628075,
+      "grad_norm": 1.4032317399978638,
+      "learning_rate": 4.694288012872084e-06,
+      "loss": 1.0012,
+      "step": 229
+    },
+    {
+      "epoch": 0.33285094066570187,
+      "grad_norm": 1.3513522148132324,
+      "learning_rate": 4.690265486725664e-06,
+      "loss": 1.043,
+      "step": 230
+    },
+    {
+      "epoch": 0.33285094066570187,
+      "eval_loss": 1.1030467748641968,
+      "eval_runtime": 25.7003,
+      "eval_samples_per_second": 38.91,
+      "eval_steps_per_second": 2.451,
+      "step": 230
+    },
+    {
+      "epoch": 0.3342981186685962,
+      "grad_norm": 1.430236577987671,
+      "learning_rate": 4.6862429605792444e-06,
+      "loss": 1.0229,
+      "step": 231
+    },
+    {
+      "epoch": 0.3357452966714906,
+      "grad_norm": 1.3285548686981201,
+      "learning_rate": 4.6822204344328246e-06,
+      "loss": 1.0177,
+      "step": 232
+    },
+    {
+      "epoch": 0.33719247467438496,
+      "grad_norm": 1.5301393270492554,
+      "learning_rate": 4.678197908286405e-06,
+      "loss": 1.0396,
+      "step": 233
+    },
+    {
+      "epoch": 0.3386396526772793,
+      "grad_norm": 1.2442700862884521,
+      "learning_rate": 4.674175382139984e-06,
+      "loss": 1.0346,
+      "step": 234
+    },
+    {
+      "epoch": 0.34008683068017365,
+      "grad_norm": 1.3574509620666504,
+      "learning_rate": 4.670152855993564e-06,
+      "loss": 1.0752,
+      "step": 235
+    },
+    {
+      "epoch": 0.341534008683068,
+      "grad_norm": 1.5170680284500122,
+      "learning_rate": 4.666130329847144e-06,
+      "loss": 1.0498,
+      "step": 236
+    },
+    {
+      "epoch": 0.3429811866859624,
+      "grad_norm": 1.5622068643569946,
+      "learning_rate": 4.662107803700724e-06,
+      "loss": 1.0397,
+      "step": 237
+    },
+    {
+      "epoch": 0.34442836468885674,
+      "grad_norm": 1.319341778755188,
+      "learning_rate": 4.6580852775543045e-06,
+      "loss": 1.0564,
+      "step": 238
+    },
+    {
+      "epoch": 0.3458755426917511,
+      "grad_norm": 1.6022669076919556,
+      "learning_rate": 4.654062751407885e-06,
+      "loss": 1.0339,
+      "step": 239
+    },
+    {
+      "epoch": 0.3473227206946454,
+      "grad_norm": 1.5898162126541138,
+      "learning_rate": 4.650040225261465e-06,
+      "loss": 1.0128,
+      "step": 240
+    },
+    {
+      "epoch": 0.3473227206946454,
+      "eval_loss": 1.0971758365631104,
+      "eval_runtime": 25.8681,
+      "eval_samples_per_second": 38.658,
+      "eval_steps_per_second": 2.435,
+      "step": 240
+    },
+    {
+      "epoch": 0.34876989869753977,
+      "grad_norm": 1.5174083709716797,
+      "learning_rate": 4.646017699115045e-06,
+      "loss": 1.0162,
+      "step": 241
+    },
+    {
+      "epoch": 0.35021707670043417,
+      "grad_norm": 1.569929599761963,
+      "learning_rate": 4.641995172968625e-06,
+      "loss": 0.9892,
+      "step": 242
+    },
+    {
+      "epoch": 0.3516642547033285,
+      "grad_norm": 1.3473092317581177,
+      "learning_rate": 4.637972646822205e-06,
+      "loss": 1.0343,
+      "step": 243
+    },
+    {
+      "epoch": 0.35311143270622286,
+      "grad_norm": 1.4948253631591797,
+      "learning_rate": 4.633950120675784e-06,
+      "loss": 1.0244,
+      "step": 244
+    },
+    {
+      "epoch": 0.3545586107091172,
+      "grad_norm": 1.4232202768325806,
+      "learning_rate": 4.6299275945293645e-06,
+      "loss": 1.0367,
+      "step": 245
+    },
+    {
+      "epoch": 0.35600578871201155,
+      "grad_norm": 1.5077722072601318,
+      "learning_rate": 4.625905068382945e-06,
+      "loss": 1.0208,
+      "step": 246
+    },
+    {
+      "epoch": 0.35745296671490595,
+      "grad_norm": 1.389813780784607,
+      "learning_rate": 4.621882542236525e-06,
+      "loss": 1.027,
+      "step": 247
+    },
+    {
+      "epoch": 0.3589001447178003,
+      "grad_norm": 1.3511253595352173,
+      "learning_rate": 4.617860016090105e-06,
+      "loss": 1.0247,
+      "step": 248
+    },
+    {
+      "epoch": 0.36034732272069464,
+      "grad_norm": 1.6211826801300049,
+      "learning_rate": 4.613837489943685e-06,
+      "loss": 1.0376,
+      "step": 249
+    },
+    {
+      "epoch": 0.361794500723589,
+      "grad_norm": 1.2799283266067505,
+      "learning_rate": 4.609814963797265e-06,
+      "loss": 0.9995,
+      "step": 250
+    },
+    {
+      "epoch": 0.361794500723589,
+      "eval_loss": 1.0912978649139404,
+      "eval_runtime": 25.8351,
+      "eval_samples_per_second": 38.707,
+      "eval_steps_per_second": 2.439,
+      "step": 250
+    },
+    {
+      "epoch": 0.36324167872648333,
+      "grad_norm": 1.6001930236816406,
+      "learning_rate": 4.605792437650845e-06,
+      "loss": 1.0318,
+      "step": 251
+    },
+    {
+      "epoch": 0.36468885672937773,
+      "grad_norm": 1.3637175559997559,
+      "learning_rate": 4.6017699115044254e-06,
+      "loss": 1.0027,
+      "step": 252
+    },
+    {
+      "epoch": 0.3661360347322721,
+      "grad_norm": 1.3025727272033691,
+      "learning_rate": 4.5977473853580056e-06,
+      "loss": 0.9971,
+      "step": 253
+    },
+    {
+      "epoch": 0.3675832127351664,
+      "grad_norm": 1.450335144996643,
+      "learning_rate": 4.593724859211585e-06,
+      "loss": 1.0201,
+      "step": 254
+    },
+    {
+      "epoch": 0.36903039073806077,
+      "grad_norm": 1.428523063659668,
+      "learning_rate": 4.589702333065165e-06,
+      "loss": 1.0261,
+      "step": 255
+    },
+    {
+      "epoch": 0.3704775687409551,
+      "grad_norm": 1.4156044721603394,
+      "learning_rate": 4.585679806918745e-06,
+      "loss": 0.9958,
+      "step": 256
+    },
+    {
+      "epoch": 0.3719247467438495,
+      "grad_norm": 1.3857672214508057,
+      "learning_rate": 4.581657280772325e-06,
+      "loss": 1.0098,
+      "step": 257
+    },
+    {
+      "epoch": 0.37337192474674386,
+      "grad_norm": 1.3765766620635986,
+      "learning_rate": 4.577634754625905e-06,
+      "loss": 1.0258,
+      "step": 258
+    },
+    {
+      "epoch": 0.3748191027496382,
+      "grad_norm": 1.3213120698928833,
+      "learning_rate": 4.5736122284794855e-06,
+      "loss": 1.0156,
+      "step": 259
+    },
+    {
+      "epoch": 0.37626628075253254,
+      "grad_norm": 1.3624275922775269,
+      "learning_rate": 4.569589702333066e-06,
+      "loss": 1.0266,
+      "step": 260
+    },
+    {
+      "epoch": 0.37626628075253254,
+      "eval_loss": 1.087217092514038,
+      "eval_runtime": 25.6623,
+      "eval_samples_per_second": 38.968,
+      "eval_steps_per_second": 2.455,
+      "step": 260
+    },
+    {
+      "epoch": 0.37771345875542695,
+      "grad_norm": 1.4796395301818848,
+      "learning_rate": 4.565567176186646e-06,
+      "loss": 0.969,
+      "step": 261
+    },
+    {
+      "epoch": 0.3791606367583213,
+      "grad_norm": 1.348137617111206,
+      "learning_rate": 4.561544650040226e-06,
+      "loss": 1.0073,
+      "step": 262
+    },
+    {
+      "epoch": 0.38060781476121563,
+      "grad_norm": 1.3586536645889282,
+      "learning_rate": 4.557522123893805e-06,
+      "loss": 1.0308,
+      "step": 263
+    },
+    {
+      "epoch": 0.38205499276411,
+      "grad_norm": 1.5610628128051758,
+      "learning_rate": 4.553499597747385e-06,
+      "loss": 1.0403,
+      "step": 264
+    },
+    {
+      "epoch": 0.3835021707670043,
+      "grad_norm": 1.4678453207015991,
+      "learning_rate": 4.549477071600965e-06,
+      "loss": 0.9937,
+      "step": 265
+    },
+    {
+      "epoch": 0.3849493487698987,
+      "grad_norm": 1.3599241971969604,
+      "learning_rate": 4.5454545454545455e-06,
+      "loss": 1.0112,
+      "step": 266
+    },
+    {
+      "epoch": 0.38639652677279307,
+      "grad_norm": 1.471745491027832,
+      "learning_rate": 4.541432019308126e-06,
+      "loss": 1.0145,
+      "step": 267
+    },
+    {
+      "epoch": 0.3878437047756874,
+      "grad_norm": 1.5481610298156738,
+      "learning_rate": 4.537409493161706e-06,
+      "loss": 1.0332,
+      "step": 268
+    },
+    {
+      "epoch": 0.38929088277858176,
+      "grad_norm": 1.475701928138733,
+      "learning_rate": 4.533386967015286e-06,
+      "loss": 1.0038,
+      "step": 269
+    },
+    {
+      "epoch": 0.3907380607814761,
+      "grad_norm": 1.6974904537200928,
+      "learning_rate": 4.529364440868866e-06,
+      "loss": 1.0033,
+      "step": 270
+    },
+    {
+      "epoch": 0.3907380607814761,
+      "eval_loss": 1.089996099472046,
+      "eval_runtime": 25.715,
+      "eval_samples_per_second": 38.888,
+      "eval_steps_per_second": 2.45,
+      "step": 270
+    },
+    {
+      "epoch": 0.3921852387843705,
+      "grad_norm": 1.5095347166061401,
+      "learning_rate": 4.525341914722446e-06,
+      "loss": 0.9633,
+      "step": 271
+    },
+    {
+      "epoch": 0.39363241678726485,
+      "grad_norm": 1.4550669193267822,
+      "learning_rate": 4.521319388576026e-06,
+      "loss": 1.065,
+      "step": 272
+    },
+    {
+      "epoch": 0.3950795947901592,
+      "grad_norm": 1.5253571271896362,
+      "learning_rate": 4.5172968624296056e-06,
+      "loss": 0.9701,
+      "step": 273
+    },
+    {
+      "epoch": 0.39652677279305354,
+      "grad_norm": 1.3699054718017578,
+      "learning_rate": 4.513274336283186e-06,
+      "loss": 1.0078,
+      "step": 274
+    },
+    {
+      "epoch": 0.3979739507959479,
+      "grad_norm": 1.3667832612991333,
+      "learning_rate": 4.509251810136766e-06,
+      "loss": 0.9924,
+      "step": 275
+    },
+    {
+      "epoch": 0.3994211287988423,
+      "grad_norm": 1.3792893886566162,
+      "learning_rate": 4.505229283990346e-06,
+      "loss": 1.011,
+      "step": 276
+    },
+    {
+      "epoch": 0.40086830680173663,
+      "grad_norm": 1.3925573825836182,
+      "learning_rate": 4.501206757843926e-06,
+      "loss": 1.0122,
+      "step": 277
+    },
+    {
+      "epoch": 0.402315484804631,
+      "grad_norm": 1.3556911945343018,
+      "learning_rate": 4.497184231697506e-06,
+      "loss": 0.9894,
+      "step": 278
+    },
+    {
+      "epoch": 0.4037626628075253,
+      "grad_norm": 1.3186146020889282,
+      "learning_rate": 4.493161705551086e-06,
+      "loss": 0.9996,
+      "step": 279
+    },
+    {
+      "epoch": 0.40520984081041966,
+      "grad_norm": 1.3015190362930298,
+      "learning_rate": 4.4891391794046665e-06,
+      "loss": 0.9923,
+      "step": 280
+    },
+    {
+      "epoch": 0.40520984081041966,
+      "eval_loss": 1.0799185037612915,
+      "eval_runtime": 25.8526,
+      "eval_samples_per_second": 38.681,
+      "eval_steps_per_second": 2.437,
+      "step": 280
+    },
+    {
+      "epoch": 0.40665701881331406,
+      "grad_norm": 1.5820448398590088,
+      "learning_rate": 4.485116653258247e-06,
+      "loss": 1.0292,
+      "step": 281
+    },
+    {
+      "epoch": 0.4081041968162084,
+      "grad_norm": 1.3347865343093872,
+      "learning_rate": 4.481094127111827e-06,
+      "loss": 1.0391,
+      "step": 282
+    },
+    {
+      "epoch": 0.40955137481910275,
+      "grad_norm": 1.5359543561935425,
+      "learning_rate": 4.477071600965406e-06,
+      "loss": 0.9842,
+      "step": 283
+    },
+    {
+      "epoch": 0.4109985528219971,
+      "grad_norm": 1.5072472095489502,
+      "learning_rate": 4.473049074818986e-06,
+      "loss": 1.0094,
+      "step": 284
+    },
+    {
+      "epoch": 0.41244573082489144,
+      "grad_norm": 1.3954992294311523,
+      "learning_rate": 4.469026548672566e-06,
+      "loss": 1.0169,
+      "step": 285
+    },
+    {
+      "epoch": 0.41389290882778584,
+      "grad_norm": 1.4730685949325562,
+      "learning_rate": 4.465004022526146e-06,
+      "loss": 0.981,
+      "step": 286
+    },
+    {
+      "epoch": 0.4153400868306802,
+      "grad_norm": 1.4737553596496582,
+      "learning_rate": 4.460981496379727e-06,
+      "loss": 0.9913,
+      "step": 287
+    },
+    {
+      "epoch": 0.41678726483357453,
+      "grad_norm": 1.3746957778930664,
+      "learning_rate": 4.4569589702333075e-06,
+      "loss": 0.9927,
+      "step": 288
+    },
+    {
+      "epoch": 0.4182344428364689,
+      "grad_norm": 1.3467106819152832,
+      "learning_rate": 4.452936444086888e-06,
+      "loss": 1.0173,
+      "step": 289
+    },
+    {
+      "epoch": 0.4196816208393632,
+      "grad_norm": 1.3768117427825928,
+      "learning_rate": 4.448913917940467e-06,
+      "loss": 1.0287,
+      "step": 290
+    },
+    {
+      "epoch": 0.4196816208393632,
+      "eval_loss": 1.0746866464614868,
+      "eval_runtime": 25.7053,
+      "eval_samples_per_second": 38.903,
+      "eval_steps_per_second": 2.451,
+      "step": 290
+    },
+    {
+      "epoch": 0.4211287988422576,
+      "grad_norm": 1.3297845125198364,
+      "learning_rate": 4.444891391794047e-06,
+      "loss": 0.9854,
+      "step": 291
+    },
+    {
+      "epoch": 0.42257597684515197,
+      "grad_norm": 1.3490155935287476,
+      "learning_rate": 4.440868865647627e-06,
+      "loss": 1.0071,
+      "step": 292
+    },
+    {
+      "epoch": 0.4240231548480463,
+      "grad_norm": 1.409170389175415,
+      "learning_rate": 4.436846339501207e-06,
+      "loss": 0.9944,
+      "step": 293
+    },
+    {
+      "epoch": 0.42547033285094066,
+      "grad_norm": 1.3009305000305176,
+      "learning_rate": 4.432823813354787e-06,
+      "loss": 1.0176,
+      "step": 294
+    },
+    {
+      "epoch": 0.426917510853835,
+      "grad_norm": 1.236854910850525,
+      "learning_rate": 4.4288012872083675e-06,
+      "loss": 0.9725,
+      "step": 295
+    },
+    {
+      "epoch": 0.4283646888567294,
+      "grad_norm": 1.3198227882385254,
+      "learning_rate": 4.424778761061948e-06,
+      "loss": 0.9878,
+      "step": 296
+    },
+    {
+      "epoch": 0.42981186685962375,
+      "grad_norm": 1.360911250114441,
+      "learning_rate": 4.420756234915528e-06,
+      "loss": 0.9698,
+      "step": 297
+    },
+    {
+      "epoch": 0.4312590448625181,
+      "grad_norm": 1.3463726043701172,
+      "learning_rate": 4.416733708769108e-06,
+      "loss": 0.9868,
+      "step": 298
+    },
+    {
+      "epoch": 0.43270622286541244,
+      "grad_norm": 1.409676432609558,
+      "learning_rate": 4.412711182622687e-06,
+      "loss": 1.0173,
+      "step": 299
+    },
+    {
+      "epoch": 0.4341534008683068,
+      "grad_norm": 1.3323174715042114,
+      "learning_rate": 4.408688656476267e-06,
+      "loss": 0.9568,
+      "step": 300
+    },
+    {
+      "epoch": 0.4341534008683068,
+      "eval_loss": 1.0745749473571777,
+      "eval_runtime": 25.7849,
+      "eval_samples_per_second": 38.782,
+      "eval_steps_per_second": 2.443,
+      "step": 300
+    },
+    {
+      "epoch": 0.4356005788712012,
+      "grad_norm": 1.4331218004226685,
+      "learning_rate": 4.4046661303298474e-06,
+      "loss": 0.9586,
+      "step": 301
+    },
+    {
+      "epoch": 0.4370477568740955,
+      "grad_norm": 1.5185577869415283,
+      "learning_rate": 4.4006436041834276e-06,
+      "loss": 1.0248,
+      "step": 302
+    },
+    {
+      "epoch": 0.4384949348769899,
+      "grad_norm": 1.7651474475860596,
+      "learning_rate": 4.396621078037008e-06,
+      "loss": 0.9775,
+      "step": 303
+    },
+    {
+      "epoch": 0.4399421128798842,
+      "grad_norm": 1.5057427883148193,
+      "learning_rate": 4.392598551890588e-06,
+      "loss": 0.9845,
+      "step": 304
+    },
+    {
+      "epoch": 0.44138929088277856,
+      "grad_norm": 1.4012314081192017,
+      "learning_rate": 4.388576025744168e-06,
+      "loss": 0.9677,
+      "step": 305
+    },
+    {
+      "epoch": 0.44283646888567296,
+      "grad_norm": 1.4347463846206665,
+      "learning_rate": 4.384553499597748e-06,
+      "loss": 0.9813,
+      "step": 306
+    },
+    {
+      "epoch": 0.4442836468885673,
+      "grad_norm": 1.3435741662979126,
+      "learning_rate": 4.380530973451328e-06,
+      "loss": 0.9734,
+      "step": 307
+    },
+    {
+      "epoch": 0.44573082489146165,
+      "grad_norm": 1.5648781061172485,
+      "learning_rate": 4.376508447304908e-06,
+      "loss": 1.0614,
+      "step": 308
+    },
+    {
+      "epoch": 0.447178002894356,
+      "grad_norm": 1.403222680091858,
+      "learning_rate": 4.372485921158488e-06,
+      "loss": 0.986,
+      "step": 309
+    },
+    {
+      "epoch": 0.44862518089725034,
+      "grad_norm": 1.4489191770553589,
+      "learning_rate": 4.368463395012068e-06,
+      "loss": 0.9709,
+      "step": 310
+    },
+    {
+      "epoch": 0.44862518089725034,
+      "eval_loss": 1.0693542957305908,
+      "eval_runtime": 25.7542,
+      "eval_samples_per_second": 38.829,
+      "eval_steps_per_second": 2.446,
+      "step": 310
+    },
+    {
+      "epoch": 0.45007235890014474,
+      "grad_norm": 1.5860564708709717,
+      "learning_rate": 4.364440868865648e-06,
+      "loss": 1.0255,
+      "step": 311
+    },
+    {
+      "epoch": 0.4515195369030391,
+      "grad_norm": 1.3616255521774292,
+      "learning_rate": 4.360418342719228e-06,
+      "loss": 0.9487,
+      "step": 312
+    },
+    {
+      "epoch": 0.45296671490593343,
+      "grad_norm": 1.4219884872436523,
+      "learning_rate": 4.356395816572808e-06,
+      "loss": 0.9859,
+      "step": 313
+    },
+    {
+      "epoch": 0.4544138929088278,
+      "grad_norm": 1.3668162822723389,
+      "learning_rate": 4.352373290426388e-06,
+      "loss": 0.9575,
+      "step": 314
+    },
+    {
+      "epoch": 0.4558610709117221,
+      "grad_norm": 1.4213571548461914,
+      "learning_rate": 4.348350764279968e-06,
+      "loss": 1.0129,
+      "step": 315
+    },
+    {
+      "epoch": 0.4573082489146165,
+      "grad_norm": 1.430222988128662,
+      "learning_rate": 4.3443282381335485e-06,
+      "loss": 0.9554,
+      "step": 316
+    },
+    {
+      "epoch": 0.45875542691751087,
+      "grad_norm": 1.3727890253067017,
+      "learning_rate": 4.340305711987129e-06,
+      "loss": 1.0396,
+      "step": 317
+    },
+    {
+      "epoch": 0.4602026049204052,
+      "grad_norm": 1.4125868082046509,
+      "learning_rate": 4.336283185840709e-06,
+      "loss": 0.9571,
+      "step": 318
+    },
+    {
+      "epoch": 0.46164978292329956,
+      "grad_norm": 1.4137847423553467,
+      "learning_rate": 4.332260659694288e-06,
+      "loss": 0.9992,
+      "step": 319
+    },
+    {
+      "epoch": 0.4630969609261939,
+      "grad_norm": 1.435394525527954,
+      "learning_rate": 4.328238133547868e-06,
+      "loss": 0.998,
+      "step": 320
+    },
+    {
+      "epoch": 0.4630969609261939,
+      "eval_loss": 1.0627951622009277,
+      "eval_runtime": 25.7658,
+      "eval_samples_per_second": 38.811,
+      "eval_steps_per_second": 2.445,
+      "step": 320
+    },
+    {
+      "epoch": 0.4645441389290883,
+      "grad_norm": 1.2891018390655518,
+      "learning_rate": 4.324215607401448e-06,
+      "loss": 0.9908,
+      "step": 321
+    },
+    {
+      "epoch": 0.46599131693198265,
+      "grad_norm": 1.3445963859558105,
+      "learning_rate": 4.3201930812550284e-06,
+      "loss": 0.9724,
+      "step": 322
+    },
+    {
+      "epoch": 0.467438494934877,
+      "grad_norm": 1.4441720247268677,
+      "learning_rate": 4.3161705551086086e-06,
+      "loss": 1.0035,
+      "step": 323
+    },
+    {
+      "epoch": 0.46888567293777134,
+      "grad_norm": 1.3721234798431396,
+      "learning_rate": 4.312148028962189e-06,
+      "loss": 0.9945,
+      "step": 324
+    },
+    {
+      "epoch": 0.4703328509406657,
+      "grad_norm": 1.387130856513977,
+      "learning_rate": 4.308125502815769e-06,
+      "loss": 0.9808,
+      "step": 325
+    },
+    {
+      "epoch": 0.4717800289435601,
+      "grad_norm": 1.467307686805725,
+      "learning_rate": 4.304102976669349e-06,
+      "loss": 0.9651,
+      "step": 326
+    },
+    {
+      "epoch": 0.4732272069464544,
+      "grad_norm": 1.3222352266311646,
+      "learning_rate": 4.300080450522929e-06,
+      "loss": 0.9618,
+      "step": 327
+    },
+    {
+      "epoch": 0.47467438494934877,
+      "grad_norm": 1.5600520372390747,
+      "learning_rate": 4.296057924376509e-06,
+      "loss": 1.0447,
+      "step": 328
+    },
+    {
+      "epoch": 0.4761215629522431,
+      "grad_norm": 1.3152457475662231,
+      "learning_rate": 4.2920353982300885e-06,
+      "loss": 1.0105,
+      "step": 329
+    },
+    {
+      "epoch": 0.47756874095513746,
+      "grad_norm": 1.3301295042037964,
+      "learning_rate": 4.288012872083669e-06,
+      "loss": 0.9997,
+      "step": 330
+    },
+    {
+      "epoch": 0.47756874095513746,
+      "eval_loss": 1.0656896829605103,
+      "eval_runtime": 25.7854,
+      "eval_samples_per_second": 38.782,
+      "eval_steps_per_second": 2.443,
+      "step": 330
+    },
+    {
+      "epoch": 0.47901591895803186,
+      "grad_norm": 1.537719964981079,
+      "learning_rate": 4.283990345937249e-06,
+      "loss": 0.9521,
+      "step": 331
+    },
+    {
+      "epoch": 0.4804630969609262,
+      "grad_norm": 1.3024014234542847,
+      "learning_rate": 4.279967819790829e-06,
+      "loss": 0.9855,
+      "step": 332
+    },
+    {
+      "epoch": 0.48191027496382055,
+      "grad_norm": 1.5578094720840454,
+      "learning_rate": 4.275945293644409e-06,
+      "loss": 0.9954,
+      "step": 333
+    },
+    {
+      "epoch": 0.4833574529667149,
+      "grad_norm": 1.4333091974258423,
+      "learning_rate": 4.271922767497989e-06,
+      "loss": 0.9975,
+      "step": 334
+    },
+    {
+      "epoch": 0.48480463096960924,
+      "grad_norm": 1.3617147207260132,
+      "learning_rate": 4.267900241351569e-06,
+      "loss": 0.9761,
+      "step": 335
+    },
+    {
+      "epoch": 0.48625180897250364,
+      "grad_norm": 1.4757050275802612,
+      "learning_rate": 4.263877715205149e-06,
+      "loss": 0.9852,
+      "step": 336
+    },
+    {
+      "epoch": 0.487698986975398,
+      "grad_norm": 1.4386905431747437,
+      "learning_rate": 4.2598551890587295e-06,
+      "loss": 0.983,
+      "step": 337
+    },
+    {
+      "epoch": 0.48914616497829233,
+      "grad_norm": 1.528428316116333,
+      "learning_rate": 4.25583266291231e-06,
+      "loss": 0.9801,
+      "step": 338
+    },
+    {
+      "epoch": 0.4905933429811867,
+      "grad_norm": 1.5482014417648315,
+      "learning_rate": 4.251810136765889e-06,
+      "loss": 0.9839,
+      "step": 339
+    },
+    {
+      "epoch": 0.492040520984081,
+      "grad_norm": 1.2495687007904053,
+      "learning_rate": 4.247787610619469e-06,
+      "loss": 0.9476,
+      "step": 340
+    },
+    {
+      "epoch": 0.492040520984081,
+      "eval_loss": 1.0619398355484009,
+      "eval_runtime": 25.8553,
+      "eval_samples_per_second": 38.677,
+      "eval_steps_per_second": 2.437,
+      "step": 340
+    },
+    {
+      "epoch": 0.4934876989869754,
+      "grad_norm": 1.3904533386230469,
+      "learning_rate": 4.243765084473049e-06,
+      "loss": 0.9935,
+      "step": 341
+    },
+    {
+      "epoch": 0.49493487698986977,
+      "grad_norm": 1.4647581577301025,
+      "learning_rate": 4.239742558326629e-06,
+      "loss": 0.9955,
+      "step": 342
+    },
+    {
+      "epoch": 0.4963820549927641,
+      "grad_norm": 1.3266704082489014,
+      "learning_rate": 4.235720032180209e-06,
+      "loss": 0.9304,
+      "step": 343
+    },
+    {
+      "epoch": 0.49782923299565845,
+      "grad_norm": 1.3789790868759155,
+      "learning_rate": 4.2316975060337895e-06,
+      "loss": 0.9834,
+      "step": 344
+    },
+    {
+      "epoch": 0.4992764109985528,
+      "grad_norm": 1.2976341247558594,
+      "learning_rate": 4.22767497988737e-06,
+      "loss": 0.9766,
+      "step": 345
+    },
+    {
+      "epoch": 0.5007235890014472,
+      "grad_norm": 1.3830623626708984,
+      "learning_rate": 4.22365245374095e-06,
+      "loss": 0.9684,
+      "step": 346
+    },
+    {
+      "epoch": 0.5021707670043415,
+      "grad_norm": 1.3655767440795898,
+      "learning_rate": 4.21962992759453e-06,
+      "loss": 0.9484,
+      "step": 347
+    },
+    {
+      "epoch": 0.5036179450072359,
+      "grad_norm": 1.4754517078399658,
+      "learning_rate": 4.21560740144811e-06,
+      "loss": 0.9792,
+      "step": 348
+    },
+    {
+      "epoch": 0.5050651230101303,
+      "grad_norm": 1.3112934827804565,
+      "learning_rate": 4.211584875301689e-06,
+      "loss": 0.9588,
+      "step": 349
+    },
+    {
+      "epoch": 0.5065123010130246,
+      "grad_norm": 1.4535300731658936,
+      "learning_rate": 4.2075623491552695e-06,
+      "loss": 0.9573,
+      "step": 350
+    },
+    {
+      "epoch": 0.5065123010130246,
+      "eval_loss": 1.0596165657043457,
+      "eval_runtime": 25.761,
+      "eval_samples_per_second": 38.818,
+      "eval_steps_per_second": 2.446,
+      "step": 350
+    },
+    {
+      "epoch": 0.507959479015919,
+      "grad_norm": 1.3574364185333252,
+      "learning_rate": 4.20353982300885e-06,
+      "loss": 0.9949,
+      "step": 351
+    },
+    {
+      "epoch": 0.5094066570188133,
+      "grad_norm": 1.2600637674331665,
+      "learning_rate": 4.19951729686243e-06,
+      "loss": 0.9972,
+      "step": 352
+    },
+    {
+      "epoch": 0.5108538350217077,
+      "grad_norm": 1.34312903881073,
+      "learning_rate": 4.19549477071601e-06,
+      "loss": 0.9695,
+      "step": 353
+    },
+    {
+      "epoch": 0.5123010130246021,
+      "grad_norm": 1.3600518703460693,
+      "learning_rate": 4.19147224456959e-06,
+      "loss": 1.0045,
+      "step": 354
+    },
+    {
+      "epoch": 0.5137481910274964,
+      "grad_norm": 1.3012819290161133,
+      "learning_rate": 4.18744971842317e-06,
+      "loss": 0.9377,
+      "step": 355
+    },
+    {
+      "epoch": 0.5151953690303908,
+      "grad_norm": 1.4134474992752075,
+      "learning_rate": 4.18342719227675e-06,
+      "loss": 1.0233,
+      "step": 356
+    },
+    {
+      "epoch": 0.516642547033285,
+      "grad_norm": 1.3824727535247803,
+      "learning_rate": 4.17940466613033e-06,
+      "loss": 0.979,
+      "step": 357
+    },
+    {
+      "epoch": 0.5180897250361794,
+      "grad_norm": 1.3735177516937256,
+      "learning_rate": 4.1753821399839105e-06,
+      "loss": 0.9657,
+      "step": 358
+    },
+    {
+      "epoch": 0.5195369030390738,
+      "grad_norm": 1.4538931846618652,
+      "learning_rate": 4.17135961383749e-06,
+      "loss": 0.9978,
+      "step": 359
+    },
+    {
+      "epoch": 0.5209840810419681,
+      "grad_norm": 1.3481597900390625,
+      "learning_rate": 4.16733708769107e-06,
+      "loss": 0.9582,
+      "step": 360
+    },
+    {
+      "epoch": 0.5209840810419681,
+      "eval_loss": 1.051351547241211,
+      "eval_runtime": 25.6222,
+      "eval_samples_per_second": 39.029,
+      "eval_steps_per_second": 2.459,
+      "step": 360
+    },
+    {
+      "epoch": 0.5224312590448625,
+      "grad_norm": 1.2694309949874878,
+      "learning_rate": 4.16331456154465e-06,
+      "loss": 1.0004,
+      "step": 361
+    },
+    {
+      "epoch": 0.5238784370477568,
+      "grad_norm": 1.461315631866455,
+      "learning_rate": 4.15929203539823e-06,
+      "loss": 0.9663,
+      "step": 362
+    },
+    {
+      "epoch": 0.5253256150506512,
+      "grad_norm": 1.3080368041992188,
+      "learning_rate": 4.15526950925181e-06,
+      "loss": 0.9654,
+      "step": 363
+    },
+    {
+      "epoch": 0.5267727930535456,
+      "grad_norm": 1.4705910682678223,
+      "learning_rate": 4.15124698310539e-06,
+      "loss": 0.9707,
+      "step": 364
+    },
+    {
+      "epoch": 0.5282199710564399,
+      "grad_norm": 1.4834836721420288,
+      "learning_rate": 4.1472244569589705e-06,
+      "loss": 0.9758,
+      "step": 365
+    },
+    {
+      "epoch": 0.5296671490593343,
+      "grad_norm": 1.5037809610366821,
+      "learning_rate": 4.143201930812551e-06,
+      "loss": 0.9678,
+      "step": 366
+    },
+    {
+      "epoch": 0.5311143270622286,
+      "grad_norm": 1.4555189609527588,
+      "learning_rate": 4.139179404666131e-06,
+      "loss": 0.9475,
+      "step": 367
+    },
+    {
+      "epoch": 0.532561505065123,
+      "grad_norm": 1.3858565092086792,
+      "learning_rate": 4.13515687851971e-06,
+      "loss": 0.975,
+      "step": 368
+    },
+    {
+      "epoch": 0.5340086830680174,
+      "grad_norm": 1.3702665567398071,
+      "learning_rate": 4.13113435237329e-06,
+      "loss": 0.966,
+      "step": 369
+    },
+    {
+      "epoch": 0.5354558610709117,
+      "grad_norm": 1.3536401987075806,
+      "learning_rate": 4.12711182622687e-06,
+      "loss": 0.9649,
+      "step": 370
+    },
+    {
+      "epoch": 0.5354558610709117,
+      "eval_loss": 1.0490535497665405,
+      "eval_runtime": 25.7917,
+      "eval_samples_per_second": 38.772,
+      "eval_steps_per_second": 2.443,
+      "step": 370
+    },
+    {
+      "epoch": 0.5369030390738061,
+      "grad_norm": 1.2890545129776,
+      "learning_rate": 4.1230893000804505e-06,
+      "loss": 0.9618,
+      "step": 371
+    },
+    {
+      "epoch": 0.5383502170767004,
+      "grad_norm": 1.4141520261764526,
+      "learning_rate": 4.119066773934031e-06,
+      "loss": 0.9292,
+      "step": 372
+    },
+    {
+      "epoch": 0.5397973950795948,
+      "grad_norm": 1.4172197580337524,
+      "learning_rate": 4.115044247787611e-06,
+      "loss": 0.9642,
+      "step": 373
+    },
+    {
+      "epoch": 0.5412445730824892,
+      "grad_norm": 1.4644924402236938,
+      "learning_rate": 4.111021721641191e-06,
+      "loss": 0.9842,
+      "step": 374
+    },
+    {
+      "epoch": 0.5426917510853835,
+      "grad_norm": 1.4348403215408325,
+      "learning_rate": 4.106999195494771e-06,
+      "loss": 0.9734,
+      "step": 375
+    },
+    {
+      "epoch": 0.5441389290882779,
+      "grad_norm": 1.311895728111267,
+      "learning_rate": 4.102976669348351e-06,
+      "loss": 0.9442,
+      "step": 376
+    },
+    {
+      "epoch": 0.5455861070911722,
+      "grad_norm": 1.3387318849563599,
+      "learning_rate": 4.098954143201931e-06,
+      "loss": 0.9323,
+      "step": 377
+    },
+    {
+      "epoch": 0.5470332850940666,
+      "grad_norm": 1.4854440689086914,
+      "learning_rate": 4.094931617055511e-06,
+      "loss": 0.9577,
+      "step": 378
+    },
+    {
+      "epoch": 0.548480463096961,
+      "grad_norm": 1.4293216466903687,
+      "learning_rate": 4.0909090909090915e-06,
+      "loss": 0.9795,
+      "step": 379
+    },
+    {
+      "epoch": 0.5499276410998553,
+      "grad_norm": 1.4908519983291626,
+      "learning_rate": 4.086886564762672e-06,
+      "loss": 1.0036,
+      "step": 380
+    },
+    {
+      "epoch": 0.5499276410998553,
+      "eval_loss": 1.0515390634536743,
+      "eval_runtime": 25.9468,
+      "eval_samples_per_second": 38.54,
+      "eval_steps_per_second": 2.428,
+      "step": 380
+    },
+    {
+      "epoch": 0.5513748191027497,
+      "grad_norm": 1.4864846467971802,
+      "learning_rate": 4.082864038616252e-06,
+      "loss": 0.9308,
+      "step": 381
+    },
+    {
+      "epoch": 0.552821997105644,
+      "grad_norm": 1.3892290592193604,
+      "learning_rate": 4.078841512469832e-06,
+      "loss": 0.9669,
+      "step": 382
+    },
+    {
+      "epoch": 0.5542691751085383,
+      "grad_norm": 1.3630082607269287,
+      "learning_rate": 4.074818986323412e-06,
+      "loss": 0.9657,
+      "step": 383
+    },
+    {
+      "epoch": 0.5557163531114327,
+      "grad_norm": 1.4945286512374878,
+      "learning_rate": 4.070796460176992e-06,
+      "loss": 0.9953,
+      "step": 384
+    },
+    {
+      "epoch": 0.557163531114327,
+      "grad_norm": 1.4368705749511719,
+      "learning_rate": 4.066773934030571e-06,
+      "loss": 1.0021,
+      "step": 385
+    },
+    {
+      "epoch": 0.5586107091172214,
+      "grad_norm": 1.4859812259674072,
+      "learning_rate": 4.0627514078841515e-06,
+      "loss": 0.983,
+      "step": 386
+    },
+    {
+      "epoch": 0.5600578871201157,
+      "grad_norm": 1.4304081201553345,
+      "learning_rate": 4.058728881737732e-06,
+      "loss": 0.9178,
+      "step": 387
+    },
+    {
+      "epoch": 0.5615050651230101,
+      "grad_norm": 1.3401498794555664,
+      "learning_rate": 4.054706355591312e-06,
+      "loss": 0.9209,
+      "step": 388
+    },
+    {
+      "epoch": 0.5629522431259045,
+      "grad_norm": 1.5096213817596436,
+      "learning_rate": 4.050683829444892e-06,
+      "loss": 0.9733,
+      "step": 389
+    },
+    {
+      "epoch": 0.5643994211287988,
+      "grad_norm": 1.5285199880599976,
+      "learning_rate": 4.046661303298472e-06,
+      "loss": 0.9649,
+      "step": 390
+    },
+    {
+      "epoch": 0.5643994211287988,
+      "eval_loss": 1.0477445125579834,
+      "eval_runtime": 26.0028,
+      "eval_samples_per_second": 38.457,
+      "eval_steps_per_second": 2.423,
+      "step": 390
+    },
+    {
+      "epoch": 0.5658465991316932,
+      "grad_norm": 1.3823034763336182,
+      "learning_rate": 4.042638777152052e-06,
+      "loss": 0.9635,
+      "step": 391
+    },
+    {
+      "epoch": 0.5672937771345875,
+      "grad_norm": 1.4934637546539307,
+      "learning_rate": 4.038616251005632e-06,
+      "loss": 0.9799,
+      "step": 392
+    },
+    {
+      "epoch": 0.5687409551374819,
+      "grad_norm": 1.3570342063903809,
+      "learning_rate": 4.034593724859212e-06,
+      "loss": 0.9736,
+      "step": 393
+    },
+    {
+      "epoch": 0.5701881331403763,
+      "grad_norm": 1.4477742910385132,
+      "learning_rate": 4.0305711987127925e-06,
+      "loss": 0.9539,
+      "step": 394
+    },
+    {
+      "epoch": 0.5716353111432706,
+      "grad_norm": 1.4653184413909912,
+      "learning_rate": 4.026548672566372e-06,
+      "loss": 0.9698,
+      "step": 395
+    },
+    {
+      "epoch": 0.573082489146165,
+      "grad_norm": 1.3791393041610718,
+      "learning_rate": 4.022526146419952e-06,
+      "loss": 0.9754,
+      "step": 396
+    },
+    {
+      "epoch": 0.5745296671490593,
+      "grad_norm": 1.5420602560043335,
+      "learning_rate": 4.018503620273532e-06,
+      "loss": 0.9592,
+      "step": 397
+    },
+    {
+      "epoch": 0.5759768451519537,
+      "grad_norm": 1.494424819946289,
+      "learning_rate": 4.014481094127112e-06,
+      "loss": 0.9783,
+      "step": 398
+    },
+    {
+      "epoch": 0.5774240231548481,
+      "grad_norm": 1.4302630424499512,
+      "learning_rate": 4.010458567980692e-06,
+      "loss": 0.9697,
+      "step": 399
+    },
+    {
+      "epoch": 0.5788712011577424,
+      "grad_norm": 1.4920209646224976,
+      "learning_rate": 4.0064360418342725e-06,
+      "loss": 0.9724,
+      "step": 400
+    },
+    {
+      "epoch": 0.5788712011577424,
+      "eval_loss": 1.0458935499191284,
+      "eval_runtime": 25.7502,
+      "eval_samples_per_second": 38.835,
+      "eval_steps_per_second": 2.447,
+      "step": 400
+    },
+    {
+      "epoch": 0.5803183791606368,
+      "grad_norm": 1.4532008171081543,
+      "learning_rate": 4.002413515687853e-06,
+      "loss": 0.9576,
+      "step": 401
+    },
+    {
+      "epoch": 0.5817655571635311,
+      "grad_norm": 1.4824233055114746,
+      "learning_rate": 3.998390989541433e-06,
+      "loss": 0.9631,
+      "step": 402
+    },
+    {
+      "epoch": 0.5832127351664255,
+      "grad_norm": 1.4938057661056519,
+      "learning_rate": 3.994368463395013e-06,
+      "loss": 0.9522,
+      "step": 403
+    },
+    {
+      "epoch": 0.5846599131693199,
+      "grad_norm": 1.5085760354995728,
+      "learning_rate": 3.990345937248592e-06,
+      "loss": 0.9873,
+      "step": 404
+    },
+    {
+      "epoch": 0.5861070911722142,
+      "grad_norm": 1.4096283912658691,
+      "learning_rate": 3.986323411102172e-06,
+      "loss": 0.9912,
+      "step": 405
+    },
+    {
+      "epoch": 0.5875542691751086,
+      "grad_norm": 1.4348584413528442,
+      "learning_rate": 3.982300884955752e-06,
+      "loss": 1.0022,
+      "step": 406
+    },
+    {
+      "epoch": 0.5890014471780028,
+      "grad_norm": 1.4397310018539429,
+      "learning_rate": 3.9782783588093325e-06,
+      "loss": 0.9702,
+      "step": 407
+    },
+    {
+      "epoch": 0.5904486251808972,
+      "grad_norm": 1.3248670101165771,
+      "learning_rate": 3.974255832662913e-06,
+      "loss": 0.9476,
+      "step": 408
+    },
+    {
+      "epoch": 0.5918958031837916,
+      "grad_norm": 1.3723477125167847,
+      "learning_rate": 3.970233306516493e-06,
+      "loss": 0.9463,
+      "step": 409
+    },
+    {
+      "epoch": 0.5933429811866859,
+      "grad_norm": 1.4383715391159058,
+      "learning_rate": 3.966210780370073e-06,
+      "loss": 0.9341,
+      "step": 410
+    },
+    {
+      "epoch": 0.5933429811866859,
+      "eval_loss": 1.04234778881073,
+      "eval_runtime": 25.7716,
+      "eval_samples_per_second": 38.802,
+      "eval_steps_per_second": 2.445,
+      "step": 410
+    },
+    {
+      "epoch": 0.5947901591895803,
+      "grad_norm": 1.4978934526443481,
+      "learning_rate": 3.962188254223653e-06,
+      "loss": 0.9593,
+      "step": 411
+    },
+    {
+      "epoch": 0.5962373371924746,
+      "grad_norm": 1.4144810438156128,
+      "learning_rate": 3.958165728077233e-06,
+      "loss": 0.9422,
+      "step": 412
+    },
+    {
+      "epoch": 0.597684515195369,
+      "grad_norm": 1.4646399021148682,
+      "learning_rate": 3.954143201930813e-06,
+      "loss": 0.9329,
+      "step": 413
+    },
+    {
+      "epoch": 0.5991316931982634,
+      "grad_norm": 1.5337008237838745,
+      "learning_rate": 3.9501206757843926e-06,
+      "loss": 0.9847,
+      "step": 414
+    },
+    {
+      "epoch": 0.6005788712011577,
+      "grad_norm": 1.342451810836792,
+      "learning_rate": 3.946098149637973e-06,
+      "loss": 0.9827,
+      "step": 415
+    },
+    {
+      "epoch": 0.6020260492040521,
+      "grad_norm": 1.4263315200805664,
+      "learning_rate": 3.942075623491553e-06,
+      "loss": 0.9404,
+      "step": 416
+    },
+    {
+      "epoch": 0.6034732272069464,
+      "grad_norm": 1.4434232711791992,
+      "learning_rate": 3.938053097345133e-06,
+      "loss": 0.9266,
+      "step": 417
+    },
+    {
+      "epoch": 0.6049204052098408,
+      "grad_norm": 1.4536890983581543,
+      "learning_rate": 3.934030571198713e-06,
+      "loss": 0.9889,
+      "step": 418
+    },
+    {
+      "epoch": 0.6063675832127352,
+      "grad_norm": 1.3431605100631714,
+      "learning_rate": 3.930008045052293e-06,
+      "loss": 0.9614,
+      "step": 419
+    },
+    {
+      "epoch": 0.6078147612156295,
+      "grad_norm": 1.3683048486709595,
+      "learning_rate": 3.925985518905873e-06,
+      "loss": 0.9207,
+      "step": 420
+    },
+    {
+      "epoch": 0.6078147612156295,
+      "eval_loss": 1.0399519205093384,
+      "eval_runtime": 25.9477,
+      "eval_samples_per_second": 38.539,
+      "eval_steps_per_second": 2.428,
+      "step": 420
+    },
+    {
+      "epoch": 0.6092619392185239,
+      "grad_norm": 1.5089398622512817,
+      "learning_rate": 3.9219629927594534e-06,
+      "loss": 0.9382,
+      "step": 421
+    },
+    {
+      "epoch": 0.6107091172214182,
+      "grad_norm": 1.5236350297927856,
+      "learning_rate": 3.9179404666130336e-06,
+      "loss": 0.9683,
+      "step": 422
+    },
+    {
+      "epoch": 0.6121562952243126,
+      "grad_norm": 1.3001353740692139,
+      "learning_rate": 3.913917940466614e-06,
+      "loss": 0.9385,
+      "step": 423
+    },
+    {
+      "epoch": 0.613603473227207,
+      "grad_norm": 1.6009608507156372,
+      "learning_rate": 3.909895414320193e-06,
+      "loss": 0.9988,
+      "step": 424
+    },
+    {
+      "epoch": 0.6150506512301013,
+      "grad_norm": 1.3586127758026123,
+      "learning_rate": 3.905872888173773e-06,
+      "loss": 0.9613,
+      "step": 425
+    },
+    {
+      "epoch": 0.6164978292329957,
+      "grad_norm": 1.4171210527420044,
+      "learning_rate": 3.901850362027353e-06,
+      "loss": 0.9485,
+      "step": 426
+    },
+    {
+      "epoch": 0.61794500723589,
+      "grad_norm": 1.4831761121749878,
+      "learning_rate": 3.897827835880933e-06,
+      "loss": 0.9337,
+      "step": 427
+    },
+    {
+      "epoch": 0.6193921852387844,
+      "grad_norm": 1.3427209854125977,
+      "learning_rate": 3.8938053097345135e-06,
+      "loss": 0.9319,
+      "step": 428
+    },
+    {
+      "epoch": 0.6208393632416788,
+      "grad_norm": 1.3631221055984497,
+      "learning_rate": 3.889782783588094e-06,
+      "loss": 0.9416,
+      "step": 429
+    },
+    {
+      "epoch": 0.622286541244573,
+      "grad_norm": 1.3836332559585571,
+      "learning_rate": 3.885760257441674e-06,
+      "loss": 0.9422,
+      "step": 430
+    },
+    {
+      "epoch": 0.622286541244573,
+      "eval_loss": 1.0372964143753052,
+      "eval_runtime": 26.054,
+      "eval_samples_per_second": 38.382,
+      "eval_steps_per_second": 2.418,
+      "step": 430
+    },
+    {
+      "epoch": 0.6237337192474675,
+      "grad_norm": 1.4307619333267212,
+      "learning_rate": 3.881737731295254e-06,
+      "loss": 0.9395,
+      "step": 431
+    },
+    {
+      "epoch": 0.6251808972503617,
+      "grad_norm": 1.338143229484558,
+      "learning_rate": 3.877715205148834e-06,
+      "loss": 0.9527,
+      "step": 432
+    },
+    {
+      "epoch": 0.6266280752532561,
+      "grad_norm": 1.3655433654785156,
+      "learning_rate": 3.873692679002414e-06,
+      "loss": 0.9633,
+      "step": 433
+    },
+    {
+      "epoch": 0.6280752532561505,
+      "grad_norm": 1.4386481046676636,
+      "learning_rate": 3.869670152855993e-06,
+      "loss": 0.9878,
+      "step": 434
+    },
+    {
+      "epoch": 0.6295224312590448,
+      "grad_norm": 1.4618841409683228,
+      "learning_rate": 3.8656476267095735e-06,
+      "loss": 0.922,
+      "step": 435
+    },
+    {
+      "epoch": 0.6309696092619392,
+      "grad_norm": 1.4335823059082031,
+      "learning_rate": 3.861625100563154e-06,
+      "loss": 0.9346,
+      "step": 436
+    },
+    {
+      "epoch": 0.6324167872648335,
+      "grad_norm": 1.4543538093566895,
+      "learning_rate": 3.857602574416734e-06,
+      "loss": 0.9847,
+      "step": 437
+    },
+    {
+      "epoch": 0.6338639652677279,
+      "grad_norm": 1.460121750831604,
+      "learning_rate": 3.853580048270314e-06,
+      "loss": 0.9954,
+      "step": 438
+    },
+    {
+      "epoch": 0.6353111432706223,
+      "grad_norm": 1.5101383924484253,
+      "learning_rate": 3.849557522123894e-06,
+      "loss": 0.963,
+      "step": 439
+    },
+    {
+      "epoch": 0.6367583212735166,
+      "grad_norm": 1.5034980773925781,
+      "learning_rate": 3.845534995977474e-06,
+      "loss": 0.9378,
+      "step": 440
+    },
+    {
+      "epoch": 0.6367583212735166,
+      "eval_loss": 1.034120798110962,
+      "eval_runtime": 26.0364,
+      "eval_samples_per_second": 38.408,
+      "eval_steps_per_second": 2.42,
+      "step": 440
+    },
+    {
+      "epoch": 0.638205499276411,
+      "grad_norm": 1.375977873802185,
+      "learning_rate": 3.841512469831054e-06,
+      "loss": 0.9232,
+      "step": 441
+    },
+    {
+      "epoch": 0.6396526772793053,
+      "grad_norm": 1.486662745475769,
+      "learning_rate": 3.8374899436846344e-06,
+      "loss": 0.9634,
+      "step": 442
+    },
+    {
+      "epoch": 0.6410998552821997,
+      "grad_norm": 1.4522157907485962,
+      "learning_rate": 3.8334674175382146e-06,
+      "loss": 0.9577,
+      "step": 443
+    },
+    {
+      "epoch": 0.6425470332850941,
+      "grad_norm": 1.3580085039138794,
+      "learning_rate": 3.829444891391794e-06,
+      "loss": 0.9283,
+      "step": 444
+    },
+    {
+      "epoch": 0.6439942112879884,
+      "grad_norm": 1.5201516151428223,
+      "learning_rate": 3.825422365245374e-06,
+      "loss": 0.9358,
+      "step": 445
+    },
+    {
+      "epoch": 0.6454413892908828,
+      "grad_norm": 1.5232244729995728,
+      "learning_rate": 3.821399839098954e-06,
+      "loss": 0.9197,
+      "step": 446
+    },
+    {
+      "epoch": 0.6468885672937771,
+      "grad_norm": 1.3573877811431885,
+      "learning_rate": 3.817377312952534e-06,
+      "loss": 0.9888,
+      "step": 447
+    },
+    {
+      "epoch": 0.6483357452966715,
+      "grad_norm": 1.3900681734085083,
+      "learning_rate": 3.8133547868061144e-06,
+      "loss": 0.9388,
+      "step": 448
+    },
+    {
+      "epoch": 0.6497829232995659,
+      "grad_norm": 1.4748214483261108,
+      "learning_rate": 3.8093322606596945e-06,
+      "loss": 0.9883,
+      "step": 449
+    },
+    {
+      "epoch": 0.6512301013024602,
+      "grad_norm": 1.3832060098648071,
+      "learning_rate": 3.8053097345132746e-06,
+      "loss": 0.9676,
+      "step": 450
+    },
+    {
+      "epoch": 0.6512301013024602,
+      "eval_loss": 1.031374454498291,
+      "eval_runtime": 25.8404,
+      "eval_samples_per_second": 38.699,
+      "eval_steps_per_second": 2.438,
+      "step": 450
+    },
+    {
+      "epoch": 0.6526772793053546,
+      "grad_norm": 1.4417644739151,
+      "learning_rate": 3.8012872083668543e-06,
+      "loss": 0.9424,
+      "step": 451
+    },
+    {
+      "epoch": 0.6541244573082489,
+      "grad_norm": 1.3401044607162476,
+      "learning_rate": 3.7972646822204344e-06,
+      "loss": 0.9427,
+      "step": 452
+    },
+    {
+      "epoch": 0.6555716353111433,
+      "grad_norm": 1.4024505615234375,
+      "learning_rate": 3.7932421560740146e-06,
+      "loss": 0.9422,
+      "step": 453
+    },
+    {
+      "epoch": 0.6570188133140377,
+      "grad_norm": 1.4496229887008667,
+      "learning_rate": 3.7892196299275947e-06,
+      "loss": 0.9249,
+      "step": 454
+    },
+    {
+      "epoch": 0.658465991316932,
+      "grad_norm": 1.3784229755401611,
+      "learning_rate": 3.785197103781175e-06,
+      "loss": 0.908,
+      "step": 455
+    },
+    {
+      "epoch": 0.6599131693198264,
+      "grad_norm": 1.413669228553772,
+      "learning_rate": 3.7811745776347545e-06,
+      "loss": 0.9697,
+      "step": 456
+    },
+    {
+      "epoch": 0.6613603473227206,
+      "grad_norm": 1.5418171882629395,
+      "learning_rate": 3.7771520514883347e-06,
+      "loss": 0.9636,
+      "step": 457
+    },
+    {
+      "epoch": 0.662807525325615,
+      "grad_norm": 1.3097503185272217,
+      "learning_rate": 3.7731295253419148e-06,
+      "loss": 0.9562,
+      "step": 458
+    },
+    {
+      "epoch": 0.6642547033285094,
+      "grad_norm": 1.3388592004776,
+      "learning_rate": 3.769106999195495e-06,
+      "loss": 0.9441,
+      "step": 459
+    },
+    {
+      "epoch": 0.6657018813314037,
+      "grad_norm": 1.3556655645370483,
+      "learning_rate": 3.765084473049075e-06,
+      "loss": 0.9527,
+      "step": 460
+    },
+    {
+      "epoch": 0.6657018813314037,
+      "eval_loss": 1.0326937437057495,
+      "eval_runtime": 25.7402,
+      "eval_samples_per_second": 38.85,
+      "eval_steps_per_second": 2.448,
+      "step": 460
+    },
+    {
+      "epoch": 0.6671490593342981,
+      "grad_norm": 1.3645198345184326,
+      "learning_rate": 3.7610619469026547e-06,
+      "loss": 0.972,
+      "step": 461
+    },
+    {
+      "epoch": 0.6685962373371924,
+      "grad_norm": 1.4470856189727783,
+      "learning_rate": 3.757039420756235e-06,
+      "loss": 0.9749,
+      "step": 462
+    },
+    {
+      "epoch": 0.6700434153400868,
+      "grad_norm": 1.3615493774414062,
+      "learning_rate": 3.753016894609815e-06,
+      "loss": 0.9386,
+      "step": 463
+    },
+    {
+      "epoch": 0.6714905933429812,
+      "grad_norm": 1.355056643486023,
+      "learning_rate": 3.7489943684633956e-06,
+      "loss": 0.9761,
+      "step": 464
+    },
+    {
+      "epoch": 0.6729377713458755,
+      "grad_norm": 1.4716227054595947,
+      "learning_rate": 3.7449718423169757e-06,
+      "loss": 0.983,
+      "step": 465
+    },
+    {
+      "epoch": 0.6743849493487699,
+      "grad_norm": 1.4398634433746338,
+      "learning_rate": 3.740949316170556e-06,
+      "loss": 0.9279,
+      "step": 466
+    },
+    {
+      "epoch": 0.6758321273516642,
+      "grad_norm": 1.469275712966919,
+      "learning_rate": 3.7369267900241355e-06,
+      "loss": 0.9343,
+      "step": 467
+    },
+    {
+      "epoch": 0.6772793053545586,
+      "grad_norm": 1.383954405784607,
+      "learning_rate": 3.7329042638777156e-06,
+      "loss": 0.9372,
+      "step": 468
+    },
+    {
+      "epoch": 0.678726483357453,
+      "grad_norm": 1.3994921445846558,
+      "learning_rate": 3.7288817377312958e-06,
+      "loss": 0.9399,
+      "step": 469
+    },
+    {
+      "epoch": 0.6801736613603473,
+      "grad_norm": 1.4469072818756104,
+      "learning_rate": 3.724859211584876e-06,
+      "loss": 0.9696,
+      "step": 470
+    },
+    {
+      "epoch": 0.6801736613603473,
+      "eval_loss": 1.0329785346984863,
+      "eval_runtime": 25.7443,
+      "eval_samples_per_second": 38.843,
+      "eval_steps_per_second": 2.447,
+      "step": 470
+    },
+    {
+      "epoch": 0.6816208393632417,
+      "grad_norm": 1.447624921798706,
+      "learning_rate": 3.720836685438456e-06,
+      "loss": 0.985,
+      "step": 471
+    },
+    {
+      "epoch": 0.683068017366136,
+      "grad_norm": 1.409177303314209,
+      "learning_rate": 3.7168141592920357e-06,
+      "loss": 0.948,
+      "step": 472
+    },
+    {
+      "epoch": 0.6845151953690304,
+      "grad_norm": 1.4697754383087158,
+      "learning_rate": 3.712791633145616e-06,
+      "loss": 0.9365,
+      "step": 473
+    },
+    {
+      "epoch": 0.6859623733719248,
+      "grad_norm": 1.4154843091964722,
+      "learning_rate": 3.708769106999196e-06,
+      "loss": 0.9701,
+      "step": 474
+    },
+    {
+      "epoch": 0.6874095513748191,
+      "grad_norm": 1.4701588153839111,
+      "learning_rate": 3.704746580852776e-06,
+      "loss": 0.9548,
+      "step": 475
+    },
+    {
+      "epoch": 0.6888567293777135,
+      "grad_norm": 1.4201622009277344,
+      "learning_rate": 3.7007240547063562e-06,
+      "loss": 0.9288,
+      "step": 476
+    },
+    {
+      "epoch": 0.6903039073806078,
+      "grad_norm": 1.365224003791809,
+      "learning_rate": 3.696701528559936e-06,
+      "loss": 0.9461,
+      "step": 477
+    },
+    {
+      "epoch": 0.6917510853835022,
+      "grad_norm": 1.3176798820495605,
+      "learning_rate": 3.692679002413516e-06,
+      "loss": 0.9193,
+      "step": 478
+    },
+    {
+      "epoch": 0.6931982633863966,
+      "grad_norm": 1.4290233850479126,
+      "learning_rate": 3.688656476267096e-06,
+      "loss": 0.9356,
+      "step": 479
+    },
+    {
+      "epoch": 0.6946454413892909,
+      "grad_norm": 1.4030050039291382,
+      "learning_rate": 3.6846339501206763e-06,
+      "loss": 0.9634,
+      "step": 480
+    },
+    {
+      "epoch": 0.6946454413892909,
+      "eval_loss": 1.0300215482711792,
+      "eval_runtime": 25.9854,
+      "eval_samples_per_second": 38.483,
+      "eval_steps_per_second": 2.424,
+      "step": 480
+    },
+    {
+      "epoch": 0.6960926193921853,
+      "grad_norm": 1.3863651752471924,
+      "learning_rate": 3.6806114239742565e-06,
+      "loss": 0.8867,
+      "step": 481
+    },
+    {
+      "epoch": 0.6975397973950795,
+      "grad_norm": 1.4828954935073853,
+      "learning_rate": 3.676588897827836e-06,
+      "loss": 0.9357,
+      "step": 482
+    },
+    {
+      "epoch": 0.6989869753979739,
+      "grad_norm": 1.4383491277694702,
+      "learning_rate": 3.6725663716814163e-06,
+      "loss": 0.9295,
+      "step": 483
+    },
+    {
+      "epoch": 0.7004341534008683,
+      "grad_norm": 1.446408748626709,
+      "learning_rate": 3.6685438455349964e-06,
+      "loss": 0.9609,
+      "step": 484
+    },
+    {
+      "epoch": 0.7018813314037626,
+      "grad_norm": 1.3710352182388306,
+      "learning_rate": 3.6645213193885765e-06,
+      "loss": 0.9365,
+      "step": 485
+    },
+    {
+      "epoch": 0.703328509406657,
+      "grad_norm": 1.4681317806243896,
+      "learning_rate": 3.6604987932421567e-06,
+      "loss": 0.9732,
+      "step": 486
+    },
+    {
+      "epoch": 0.7047756874095513,
+      "grad_norm": 1.3563289642333984,
+      "learning_rate": 3.6564762670957364e-06,
+      "loss": 0.9393,
+      "step": 487
+    },
+    {
+      "epoch": 0.7062228654124457,
+      "grad_norm": 1.3670072555541992,
+      "learning_rate": 3.6524537409493165e-06,
+      "loss": 0.9363,
+      "step": 488
+    },
+    {
+      "epoch": 0.7076700434153401,
+      "grad_norm": 1.4312621355056763,
+      "learning_rate": 3.6484312148028966e-06,
+      "loss": 0.9229,
+      "step": 489
+    },
+    {
+      "epoch": 0.7091172214182344,
+      "grad_norm": 1.4633783102035522,
+      "learning_rate": 3.6444086886564768e-06,
+      "loss": 0.9515,
+      "step": 490
+    },
+    {
+      "epoch": 0.7091172214182344,
+      "eval_loss": 1.0295219421386719,
+      "eval_runtime": 25.7043,
+      "eval_samples_per_second": 38.904,
+      "eval_steps_per_second": 2.451,
+      "step": 490
+    },
+    {
+      "epoch": 0.7105643994211288,
+      "grad_norm": 1.5504322052001953,
+      "learning_rate": 3.640386162510057e-06,
+      "loss": 0.9707,
+      "step": 491
+    },
+    {
+      "epoch": 0.7120115774240231,
+      "grad_norm": 1.445961594581604,
+      "learning_rate": 3.6363636363636366e-06,
+      "loss": 0.9924,
+      "step": 492
+    },
+    {
+      "epoch": 0.7134587554269175,
+      "grad_norm": 1.49595308303833,
+      "learning_rate": 3.6323411102172167e-06,
+      "loss": 0.9619,
+      "step": 493
+    },
+    {
+      "epoch": 0.7149059334298119,
+      "grad_norm": 1.5091534852981567,
+      "learning_rate": 3.628318584070797e-06,
+      "loss": 0.9393,
+      "step": 494
+    },
+    {
+      "epoch": 0.7163531114327062,
+      "grad_norm": 1.3934762477874756,
+      "learning_rate": 3.624296057924377e-06,
+      "loss": 0.9252,
+      "step": 495
+    },
+    {
+      "epoch": 0.7178002894356006,
+      "grad_norm": 1.4442094564437866,
+      "learning_rate": 3.620273531777957e-06,
+      "loss": 0.9149,
+      "step": 496
+    },
+    {
+      "epoch": 0.7192474674384949,
+      "grad_norm": 1.3727437257766724,
+      "learning_rate": 3.616251005631537e-06,
+      "loss": 0.9136,
+      "step": 497
+    },
+    {
+      "epoch": 0.7206946454413893,
+      "grad_norm": 1.3915483951568604,
+      "learning_rate": 3.612228479485117e-06,
+      "loss": 0.9132,
+      "step": 498
+    },
+    {
+      "epoch": 0.7221418234442837,
+      "grad_norm": 1.5220942497253418,
+      "learning_rate": 3.608205953338697e-06,
+      "loss": 0.9576,
+      "step": 499
+    },
+    {
+      "epoch": 0.723589001447178,
+      "grad_norm": 1.4072246551513672,
+      "learning_rate": 3.604183427192277e-06,
+      "loss": 0.9372,
+      "step": 500
+    },
+    {
+      "epoch": 0.723589001447178,
+      "eval_loss": 1.029038906097412,
+      "eval_runtime": 25.9369,
+      "eval_samples_per_second": 38.555,
+      "eval_steps_per_second": 2.429,
+      "step": 500
+    },
+    {
+      "epoch": 0.7250361794500724,
+      "grad_norm": 1.3465371131896973,
+      "learning_rate": 3.6001609010458573e-06,
+      "loss": 0.9192,
+      "step": 501
+    },
+    {
+      "epoch": 0.7264833574529667,
+      "grad_norm": 1.5331004858016968,
+      "learning_rate": 3.596138374899437e-06,
+      "loss": 0.9672,
+      "step": 502
+    },
+    {
+      "epoch": 0.7279305354558611,
+      "grad_norm": 1.4934667348861694,
+      "learning_rate": 3.592115848753017e-06,
+      "loss": 0.9349,
+      "step": 503
+    },
+    {
+      "epoch": 0.7293777134587555,
+      "grad_norm": 1.3691191673278809,
+      "learning_rate": 3.5880933226065973e-06,
+      "loss": 0.9471,
+      "step": 504
+    },
+    {
+      "epoch": 0.7308248914616498,
+      "grad_norm": 1.450124979019165,
+      "learning_rate": 3.5840707964601774e-06,
+      "loss": 0.9424,
+      "step": 505
+    },
+    {
+      "epoch": 0.7322720694645442,
+      "grad_norm": 1.5474168062210083,
+      "learning_rate": 3.5800482703137575e-06,
+      "loss": 0.9187,
+      "step": 506
+    },
+    {
+      "epoch": 0.7337192474674384,
+      "grad_norm": 1.4401590824127197,
+      "learning_rate": 3.5760257441673372e-06,
+      "loss": 0.9218,
+      "step": 507
+    },
+    {
+      "epoch": 0.7351664254703328,
+      "grad_norm": 1.5775281190872192,
+      "learning_rate": 3.5720032180209174e-06,
+      "loss": 0.93,
+      "step": 508
+    },
+    {
+      "epoch": 0.7366136034732272,
+      "grad_norm": 1.5710840225219727,
+      "learning_rate": 3.5679806918744975e-06,
+      "loss": 0.8954,
+      "step": 509
+    },
+    {
+      "epoch": 0.7380607814761215,
+      "grad_norm": 1.5885707139968872,
+      "learning_rate": 3.5639581657280776e-06,
+      "loss": 0.9682,
+      "step": 510
+    },
+    {
+      "epoch": 0.7380607814761215,
+      "eval_loss": 1.0261106491088867,
+      "eval_runtime": 25.8126,
+      "eval_samples_per_second": 38.741,
+      "eval_steps_per_second": 2.441,
+      "step": 510
+    },
+    {
+      "epoch": 0.7395079594790159,
+      "grad_norm": 1.4182794094085693,
+      "learning_rate": 3.5599356395816577e-06,
+      "loss": 0.9417,
+      "step": 511
+    },
+    {
+      "epoch": 0.7409551374819102,
+      "grad_norm": 1.416455626487732,
+      "learning_rate": 3.5559131134352374e-06,
+      "loss": 0.9325,
+      "step": 512
+    },
+    {
+      "epoch": 0.7424023154848046,
+      "grad_norm": 1.511069416999817,
+      "learning_rate": 3.5518905872888176e-06,
+      "loss": 0.9621,
+      "step": 513
+    },
+    {
+      "epoch": 0.743849493487699,
+      "grad_norm": 1.5709325075149536,
+      "learning_rate": 3.5478680611423977e-06,
+      "loss": 0.9302,
+      "step": 514
+    },
+    {
+      "epoch": 0.7452966714905933,
+      "grad_norm": 1.5268837213516235,
+      "learning_rate": 3.543845534995978e-06,
+      "loss": 0.8761,
+      "step": 515
+    },
+    {
+      "epoch": 0.7467438494934877,
+      "grad_norm": 1.6121459007263184,
+      "learning_rate": 3.539823008849558e-06,
+      "loss": 0.9322,
+      "step": 516
+    },
+    {
+      "epoch": 0.748191027496382,
+      "grad_norm": 1.479650616645813,
+      "learning_rate": 3.5358004827031377e-06,
+      "loss": 0.9654,
+      "step": 517
+    },
+    {
+      "epoch": 0.7496382054992764,
+      "grad_norm": 1.6954939365386963,
+      "learning_rate": 3.5317779565567178e-06,
+      "loss": 0.9719,
+      "step": 518
+    },
+    {
+      "epoch": 0.7510853835021708,
+      "grad_norm": 1.4463775157928467,
+      "learning_rate": 3.527755430410298e-06,
+      "loss": 0.8981,
+      "step": 519
+    },
+    {
+      "epoch": 0.7525325615050651,
+      "grad_norm": 1.4564443826675415,
+      "learning_rate": 3.523732904263878e-06,
+      "loss": 0.9273,
+      "step": 520
+    },
+    {
+      "epoch": 0.7525325615050651,
+      "eval_loss": 1.0260487794876099,
+      "eval_runtime": 25.8612,
+      "eval_samples_per_second": 38.668,
+      "eval_steps_per_second": 2.436,
+      "step": 520
+    },
+    {
+      "epoch": 0.7539797395079595,
+      "grad_norm": 1.436152696609497,
+      "learning_rate": 3.519710378117458e-06,
+      "loss": 0.9541,
+      "step": 521
+    },
+    {
+      "epoch": 0.7554269175108539,
+      "grad_norm": 1.596771478652954,
+      "learning_rate": 3.515687851971038e-06,
+      "loss": 0.9138,
+      "step": 522
+    },
+    {
+      "epoch": 0.7568740955137482,
+      "grad_norm": 1.4977961778640747,
+      "learning_rate": 3.511665325824618e-06,
+      "loss": 0.9557,
+      "step": 523
+    },
+    {
+      "epoch": 0.7583212735166426,
+      "grad_norm": 1.4800033569335938,
+      "learning_rate": 3.507642799678198e-06,
+      "loss": 0.9372,
+      "step": 524
+    },
+    {
+      "epoch": 0.7597684515195369,
+      "grad_norm": 1.4876062870025635,
+      "learning_rate": 3.5036202735317783e-06,
+      "loss": 0.9345,
+      "step": 525
+    },
+    {
+      "epoch": 0.7612156295224313,
+      "grad_norm": 1.4231287240982056,
+      "learning_rate": 3.4995977473853584e-06,
+      "loss": 0.9444,
+      "step": 526
+    },
+    {
+      "epoch": 0.7626628075253257,
+      "grad_norm": 1.5092722177505493,
+      "learning_rate": 3.495575221238938e-06,
+      "loss": 0.9536,
+      "step": 527
+    },
+    {
+      "epoch": 0.76410998552822,
+      "grad_norm": 1.465430498123169,
+      "learning_rate": 3.4915526950925182e-06,
+      "loss": 0.9378,
+      "step": 528
+    },
+    {
+      "epoch": 0.7655571635311144,
+      "grad_norm": 1.4292782545089722,
+      "learning_rate": 3.4875301689460983e-06,
+      "loss": 0.9345,
+      "step": 529
+    },
+    {
+      "epoch": 0.7670043415340086,
+      "grad_norm": 1.606329321861267,
+      "learning_rate": 3.4835076427996785e-06,
+      "loss": 0.9446,
+      "step": 530
+    },
+    {
+      "epoch": 0.7670043415340086,
+      "eval_loss": 1.0236235857009888,
+      "eval_runtime": 25.9147,
+      "eval_samples_per_second": 38.588,
+      "eval_steps_per_second": 2.431,
+      "step": 530
+    },
+    {
+      "epoch": 0.768451519536903,
+      "grad_norm": 1.3893780708312988,
+      "learning_rate": 3.4794851166532586e-06,
+      "loss": 0.9234,
+      "step": 531
+    },
+    {
+      "epoch": 0.7698986975397974,
+      "grad_norm": 1.4836314916610718,
+      "learning_rate": 3.4754625905068383e-06,
+      "loss": 0.9164,
+      "step": 532
+    },
+    {
+      "epoch": 0.7713458755426917,
+      "grad_norm": 1.3877429962158203,
+      "learning_rate": 3.4714400643604184e-06,
+      "loss": 0.9433,
+      "step": 533
+    },
+    {
+      "epoch": 0.7727930535455861,
+      "grad_norm": 1.486474871635437,
+      "learning_rate": 3.4674175382139986e-06,
+      "loss": 0.9303,
+      "step": 534
+    },
+    {
+      "epoch": 0.7742402315484804,
+      "grad_norm": 1.4549676179885864,
+      "learning_rate": 3.4633950120675787e-06,
+      "loss": 0.9438,
+      "step": 535
+    },
+    {
+      "epoch": 0.7756874095513748,
+      "grad_norm": 1.4801554679870605,
+      "learning_rate": 3.4593724859211584e-06,
+      "loss": 0.9441,
+      "step": 536
+    },
+    {
+      "epoch": 0.7771345875542692,
+      "grad_norm": 1.4578830003738403,
+      "learning_rate": 3.4553499597747385e-06,
+      "loss": 0.9301,
+      "step": 537
+    },
+    {
+      "epoch": 0.7785817655571635,
+      "grad_norm": 1.4650003910064697,
+      "learning_rate": 3.4513274336283186e-06,
+      "loss": 0.939,
+      "step": 538
+    },
+    {
+      "epoch": 0.7800289435600579,
+      "grad_norm": 1.416143774986267,
+      "learning_rate": 3.4473049074818988e-06,
+      "loss": 0.9211,
+      "step": 539
+    },
+    {
+      "epoch": 0.7814761215629522,
+      "grad_norm": 1.768223762512207,
+      "learning_rate": 3.443282381335479e-06,
+      "loss": 0.9502,
+      "step": 540
+    },
+    {
+      "epoch": 0.7814761215629522,
+      "eval_loss": 1.024090051651001,
+      "eval_runtime": 25.9198,
+      "eval_samples_per_second": 38.58,
+      "eval_steps_per_second": 2.431,
+      "step": 540
+    },
+    {
+      "epoch": 0.7829232995658466,
+      "grad_norm": 1.5100836753845215,
+      "learning_rate": 3.4392598551890586e-06,
+      "loss": 0.957,
+      "step": 541
+    },
+    {
+      "epoch": 0.784370477568741,
+      "grad_norm": 1.4793955087661743,
+      "learning_rate": 3.4352373290426387e-06,
+      "loss": 0.9341,
+      "step": 542
+    },
+    {
+      "epoch": 0.7858176555716353,
+      "grad_norm": 1.5424785614013672,
+      "learning_rate": 3.431214802896219e-06,
+      "loss": 0.9246,
+      "step": 543
+    },
+    {
+      "epoch": 0.7872648335745297,
+      "grad_norm": 1.5629793405532837,
+      "learning_rate": 3.427192276749799e-06,
+      "loss": 0.9494,
+      "step": 544
+    },
+    {
+      "epoch": 0.788712011577424,
+      "grad_norm": 1.5148011445999146,
+      "learning_rate": 3.423169750603379e-06,
+      "loss": 0.9063,
+      "step": 545
+    },
+    {
+      "epoch": 0.7901591895803184,
+      "grad_norm": 1.489189624786377,
+      "learning_rate": 3.419147224456959e-06,
+      "loss": 0.9498,
+      "step": 546
+    },
+    {
+      "epoch": 0.7916063675832128,
+      "grad_norm": 1.5156149864196777,
+      "learning_rate": 3.415124698310539e-06,
+      "loss": 0.9377,
+      "step": 547
+    },
+    {
+      "epoch": 0.7930535455861071,
+      "grad_norm": 1.5290278196334839,
+      "learning_rate": 3.411102172164119e-06,
+      "loss": 0.9235,
+      "step": 548
+    },
+    {
+      "epoch": 0.7945007235890015,
+      "grad_norm": 1.5882658958435059,
+      "learning_rate": 3.407079646017699e-06,
+      "loss": 0.9498,
+      "step": 549
+    },
+    {
+      "epoch": 0.7959479015918958,
+      "grad_norm": 1.4402974843978882,
+      "learning_rate": 3.4030571198712793e-06,
+      "loss": 0.9396,
+      "step": 550
+    },
+    {
+      "epoch": 0.7959479015918958,
+      "eval_loss": 1.020573616027832,
+      "eval_runtime": 25.7599,
+      "eval_samples_per_second": 38.82,
+      "eval_steps_per_second": 2.446,
+      "step": 550
+    },
+    {
+      "epoch": 0.7973950795947902,
+      "grad_norm": 1.4785488843917847,
+      "learning_rate": 3.399034593724859e-06,
+      "loss": 0.9143,
+      "step": 551
+    },
+    {
+      "epoch": 0.7988422575976846,
+      "grad_norm": 1.4422612190246582,
+      "learning_rate": 3.395012067578439e-06,
+      "loss": 0.9585,
+      "step": 552
+    },
+    {
+      "epoch": 0.8002894356005789,
+      "grad_norm": 1.3907090425491333,
+      "learning_rate": 3.3909895414320197e-06,
+      "loss": 0.9288,
+      "step": 553
+    },
+    {
+      "epoch": 0.8017366136034733,
+      "grad_norm": 1.4476579427719116,
+      "learning_rate": 3.3869670152856e-06,
+      "loss": 0.9135,
+      "step": 554
+    },
+    {
+      "epoch": 0.8031837916063675,
+      "grad_norm": 1.2672919034957886,
+      "learning_rate": 3.38294448913918e-06,
+      "loss": 0.9093,
+      "step": 555
+    },
+    {
+      "epoch": 0.804630969609262,
+      "grad_norm": 1.4998122453689575,
+      "learning_rate": 3.37892196299276e-06,
+      "loss": 0.9147,
+      "step": 556
+    },
+    {
+      "epoch": 0.8060781476121563,
+      "grad_norm": 1.3593398332595825,
+      "learning_rate": 3.3748994368463402e-06,
+      "loss": 0.914,
+      "step": 557
+    },
+    {
+      "epoch": 0.8075253256150506,
+      "grad_norm": 1.5175474882125854,
+      "learning_rate": 3.37087691069992e-06,
+      "loss": 0.9232,
+      "step": 558
+    },
+    {
+      "epoch": 0.808972503617945,
+      "grad_norm": 1.7521854639053345,
+      "learning_rate": 3.3668543845535e-06,
+      "loss": 0.9683,
+      "step": 559
+    },
+    {
+      "epoch": 0.8104196816208393,
+      "grad_norm": 1.451035499572754,
+      "learning_rate": 3.36283185840708e-06,
+      "loss": 0.9462,
+      "step": 560
+    },
+    {
+      "epoch": 0.8104196816208393,
+      "eval_loss": 1.0183910131454468,
+      "eval_runtime": 25.7176,
+      "eval_samples_per_second": 38.884,
+      "eval_steps_per_second": 2.45,
+      "step": 560
+    },
+    {
+      "epoch": 0.8118668596237337,
+      "grad_norm": 1.3751353025436401,
+      "learning_rate": 3.3588093322606603e-06,
+      "loss": 0.9493,
+      "step": 561
+    },
+    {
+      "epoch": 0.8133140376266281,
+      "grad_norm": 1.437719464302063,
+      "learning_rate": 3.3547868061142404e-06,
+      "loss": 0.915,
+      "step": 562
+    },
+    {
+      "epoch": 0.8147612156295224,
+      "grad_norm": 1.3803507089614868,
+      "learning_rate": 3.35076427996782e-06,
+      "loss": 0.9247,
+      "step": 563
+    },
+    {
+      "epoch": 0.8162083936324168,
+      "grad_norm": 1.4442408084869385,
+      "learning_rate": 3.3467417538214003e-06,
+      "loss": 0.9232,
+      "step": 564
+    },
+    {
+      "epoch": 0.8176555716353111,
+      "grad_norm": 1.419447422027588,
+      "learning_rate": 3.3427192276749804e-06,
+      "loss": 0.9242,
+      "step": 565
+    },
+    {
+      "epoch": 0.8191027496382055,
+      "grad_norm": 1.4496979713439941,
+      "learning_rate": 3.3386967015285605e-06,
+      "loss": 0.9079,
+      "step": 566
+    },
+    {
+      "epoch": 0.8205499276410999,
+      "grad_norm": 1.5532276630401611,
+      "learning_rate": 3.3346741753821407e-06,
+      "loss": 0.922,
+      "step": 567
+    },
+    {
+      "epoch": 0.8219971056439942,
+      "grad_norm": 1.5804080963134766,
+      "learning_rate": 3.3306516492357204e-06,
+      "loss": 0.9258,
+      "step": 568
+    },
+    {
+      "epoch": 0.8234442836468886,
+      "grad_norm": 1.5104866027832031,
+      "learning_rate": 3.3266291230893005e-06,
+      "loss": 0.9419,
+      "step": 569
+    },
+    {
+      "epoch": 0.8248914616497829,
+      "grad_norm": 1.4507907629013062,
+      "learning_rate": 3.3226065969428806e-06,
+      "loss": 0.9495,
+      "step": 570
+    },
+    {
+      "epoch": 0.8248914616497829,
+      "eval_loss": 1.018306016921997,
+      "eval_runtime": 25.653,
+      "eval_samples_per_second": 38.982,
+      "eval_steps_per_second": 2.456,
+      "step": 570
+    },
+    {
+      "epoch": 0.8263386396526773,
+      "grad_norm": 1.8267414569854736,
+      "learning_rate": 3.3185840707964607e-06,
+      "loss": 0.9442,
+      "step": 571
+    },
+    {
+      "epoch": 0.8277858176555717,
+      "grad_norm": 1.4675520658493042,
+      "learning_rate": 3.3145615446500404e-06,
+      "loss": 0.9802,
+      "step": 572
+    },
+    {
+      "epoch": 0.829232995658466,
+      "grad_norm": 1.506719946861267,
+      "learning_rate": 3.3105390185036206e-06,
+      "loss": 0.9248,
+      "step": 573
+    },
+    {
+      "epoch": 0.8306801736613604,
+      "grad_norm": 1.4712932109832764,
+      "learning_rate": 3.3065164923572007e-06,
+      "loss": 0.9366,
+      "step": 574
+    },
+    {
+      "epoch": 0.8321273516642547,
+      "grad_norm": 1.5222026109695435,
+      "learning_rate": 3.302493966210781e-06,
+      "loss": 0.913,
+      "step": 575
+    },
+    {
+      "epoch": 0.8335745296671491,
+      "grad_norm": 1.4533673524856567,
+      "learning_rate": 3.298471440064361e-06,
+      "loss": 0.9223,
+      "step": 576
+    },
+    {
+      "epoch": 0.8350217076700435,
+      "grad_norm": 1.446425437927246,
+      "learning_rate": 3.2944489139179407e-06,
+      "loss": 0.8802,
+      "step": 577
+    },
+    {
+      "epoch": 0.8364688856729378,
+      "grad_norm": 1.699572205543518,
+      "learning_rate": 3.2904263877715208e-06,
+      "loss": 0.9203,
+      "step": 578
+    },
+    {
+      "epoch": 0.8379160636758322,
+      "grad_norm": 1.5956206321716309,
+      "learning_rate": 3.286403861625101e-06,
+      "loss": 0.9334,
+      "step": 579
+    },
+    {
+      "epoch": 0.8393632416787264,
+      "grad_norm": 1.4471442699432373,
+      "learning_rate": 3.282381335478681e-06,
+      "loss": 0.9492,
+      "step": 580
+    },
+    {
+      "epoch": 0.8393632416787264,
+      "eval_loss": 1.0188478231430054,
+      "eval_runtime": 25.7874,
+      "eval_samples_per_second": 38.779,
+      "eval_steps_per_second": 2.443,
+      "step": 580
+    },
+    {
+      "epoch": 0.8408104196816208,
+      "grad_norm": 1.542809247970581,
+      "learning_rate": 3.278358809332261e-06,
+      "loss": 0.9042,
+      "step": 581
+    },
+    {
+      "epoch": 0.8422575976845152,
+      "grad_norm": 1.451400876045227,
+      "learning_rate": 3.274336283185841e-06,
+      "loss": 0.9005,
+      "step": 582
+    },
+    {
+      "epoch": 0.8437047756874095,
+      "grad_norm": 1.357790470123291,
+      "learning_rate": 3.270313757039421e-06,
+      "loss": 0.9463,
+      "step": 583
+    },
+    {
+      "epoch": 0.8451519536903039,
+      "grad_norm": 1.4770903587341309,
+      "learning_rate": 3.266291230893001e-06,
+      "loss": 0.9599,
+      "step": 584
+    },
+    {
+      "epoch": 0.8465991316931982,
+      "grad_norm": 1.3903440237045288,
+      "learning_rate": 3.2622687047465813e-06,
+      "loss": 0.9017,
+      "step": 585
+    },
+    {
+      "epoch": 0.8480463096960926,
+      "grad_norm": 1.508465051651001,
+      "learning_rate": 3.2582461786001614e-06,
+      "loss": 0.9376,
+      "step": 586
+    },
+    {
+      "epoch": 0.849493487698987,
+      "grad_norm": 1.352357029914856,
+      "learning_rate": 3.254223652453741e-06,
+      "loss": 0.9077,
+      "step": 587
+    },
+    {
+      "epoch": 0.8509406657018813,
+      "grad_norm": 1.53770112991333,
+      "learning_rate": 3.2502011263073212e-06,
+      "loss": 0.9418,
+      "step": 588
+    },
+    {
+      "epoch": 0.8523878437047757,
+      "grad_norm": 1.4406723976135254,
+      "learning_rate": 3.2461786001609013e-06,
+      "loss": 0.9408,
+      "step": 589
+    },
+    {
+      "epoch": 0.85383502170767,
+      "grad_norm": 1.3737328052520752,
+      "learning_rate": 3.2421560740144815e-06,
+      "loss": 0.9574,
+      "step": 590
+    },
+    {
+      "epoch": 0.85383502170767,
+      "eval_loss": 1.0164623260498047,
+      "eval_runtime": 25.6732,
+      "eval_samples_per_second": 38.951,
+      "eval_steps_per_second": 2.454,
+      "step": 590
+    },
+    {
+      "epoch": 0.8552821997105644,
+      "grad_norm": 1.4223228693008423,
+      "learning_rate": 3.2381335478680616e-06,
+      "loss": 0.9165,
+      "step": 591
+    },
+    {
+      "epoch": 0.8567293777134588,
+      "grad_norm": 1.372409462928772,
+      "learning_rate": 3.2341110217216413e-06,
+      "loss": 0.9262,
+      "step": 592
+    },
+    {
+      "epoch": 0.8581765557163531,
+      "grad_norm": 1.4255372285842896,
+      "learning_rate": 3.2300884955752214e-06,
+      "loss": 0.9518,
+      "step": 593
+    },
+    {
+      "epoch": 0.8596237337192475,
+      "grad_norm": 1.3796833753585815,
+      "learning_rate": 3.2260659694288016e-06,
+      "loss": 0.9034,
+      "step": 594
+    },
+    {
+      "epoch": 0.8610709117221418,
+      "grad_norm": 1.4612222909927368,
+      "learning_rate": 3.2220434432823817e-06,
+      "loss": 0.9126,
+      "step": 595
+    },
+    {
+      "epoch": 0.8625180897250362,
+      "grad_norm": 1.4215260744094849,
+      "learning_rate": 3.218020917135962e-06,
+      "loss": 0.929,
+      "step": 596
+    },
+    {
+      "epoch": 0.8639652677279306,
+      "grad_norm": 1.493041753768921,
+      "learning_rate": 3.2139983909895415e-06,
+      "loss": 0.9343,
+      "step": 597
+    },
+    {
+      "epoch": 0.8654124457308249,
+      "grad_norm": 1.4106526374816895,
+      "learning_rate": 3.2099758648431216e-06,
+      "loss": 0.9249,
+      "step": 598
+    },
+    {
+      "epoch": 0.8668596237337193,
+      "grad_norm": 1.5474896430969238,
+      "learning_rate": 3.2059533386967018e-06,
+      "loss": 0.9114,
+      "step": 599
+    },
+    {
+      "epoch": 0.8683068017366136,
+      "grad_norm": 1.4770991802215576,
+      "learning_rate": 3.201930812550282e-06,
+      "loss": 0.919,
+      "step": 600
+    },
+    {
+      "epoch": 0.8683068017366136,
+      "eval_loss": 1.0175906419754028,
+      "eval_runtime": 25.7685,
+      "eval_samples_per_second": 38.807,
+      "eval_steps_per_second": 2.445,
+      "step": 600
+    },
+    {
+      "epoch": 0.869753979739508,
+      "grad_norm": 1.4260352849960327,
+      "learning_rate": 3.197908286403862e-06,
+      "loss": 0.935,
+      "step": 601
+    },
+    {
+      "epoch": 0.8712011577424024,
+      "grad_norm": 1.4157609939575195,
+      "learning_rate": 3.1938857602574417e-06,
+      "loss": 0.9046,
+      "step": 602
+    },
+    {
+      "epoch": 0.8726483357452967,
+      "grad_norm": 1.4820977449417114,
+      "learning_rate": 3.189863234111022e-06,
+      "loss": 0.9363,
+      "step": 603
+    },
+    {
+      "epoch": 0.874095513748191,
+      "grad_norm": 1.437635064125061,
+      "learning_rate": 3.185840707964602e-06,
+      "loss": 0.906,
+      "step": 604
+    },
+    {
+      "epoch": 0.8755426917510853,
+      "grad_norm": 1.4991382360458374,
+      "learning_rate": 3.181818181818182e-06,
+      "loss": 0.921,
+      "step": 605
+    },
+    {
+      "epoch": 0.8769898697539797,
+      "grad_norm": 1.4861739873886108,
+      "learning_rate": 3.1777956556717622e-06,
+      "loss": 0.9135,
+      "step": 606
+    },
+    {
+      "epoch": 0.8784370477568741,
+      "grad_norm": 1.3911547660827637,
+      "learning_rate": 3.173773129525342e-06,
+      "loss": 0.8854,
+      "step": 607
+    },
+    {
+      "epoch": 0.8798842257597684,
+      "grad_norm": 1.3838648796081543,
+      "learning_rate": 3.169750603378922e-06,
+      "loss": 0.8957,
+      "step": 608
+    },
+    {
+      "epoch": 0.8813314037626628,
+      "grad_norm": 1.507276177406311,
+      "learning_rate": 3.165728077232502e-06,
+      "loss": 0.9171,
+      "step": 609
+    },
+    {
+      "epoch": 0.8827785817655571,
+      "grad_norm": 1.3060482740402222,
+      "learning_rate": 3.1617055510860823e-06,
+      "loss": 0.8788,
+      "step": 610
+    },
+    {
+      "epoch": 0.8827785817655571,
+      "eval_loss": 1.0178226232528687,
+      "eval_runtime": 26.0182,
+      "eval_samples_per_second": 38.435,
+      "eval_steps_per_second": 2.421,
+      "step": 610
+    },
+    {
+      "epoch": 0.8842257597684515,
+      "grad_norm": 1.5606573820114136,
+      "learning_rate": 3.1576830249396625e-06,
+      "loss": 0.909,
+      "step": 611
+    },
+    {
+      "epoch": 0.8856729377713459,
+      "grad_norm": 1.4704793691635132,
+      "learning_rate": 3.153660498793242e-06,
+      "loss": 0.9206,
+      "step": 612
+    },
+    {
+      "epoch": 0.8871201157742402,
+      "grad_norm": 1.2905973196029663,
+      "learning_rate": 3.1496379726468223e-06,
+      "loss": 0.9362,
+      "step": 613
+    },
+    {
+      "epoch": 0.8885672937771346,
+      "grad_norm": 1.4948153495788574,
+      "learning_rate": 3.1456154465004024e-06,
+      "loss": 0.8864,
+      "step": 614
+    },
+    {
+      "epoch": 0.8900144717800289,
+      "grad_norm": 1.450865626335144,
+      "learning_rate": 3.1415929203539825e-06,
+      "loss": 0.9162,
+      "step": 615
+    },
+    {
+      "epoch": 0.8914616497829233,
+      "grad_norm": 1.5478347539901733,
+      "learning_rate": 3.1375703942075627e-06,
+      "loss": 0.9319,
+      "step": 616
+    },
+    {
+      "epoch": 0.8929088277858177,
+      "grad_norm": 1.5064746141433716,
+      "learning_rate": 3.1335478680611424e-06,
+      "loss": 0.917,
+      "step": 617
+    },
+    {
+      "epoch": 0.894356005788712,
+      "grad_norm": 1.4333183765411377,
+      "learning_rate": 3.1295253419147225e-06,
+      "loss": 0.9227,
+      "step": 618
+    },
+    {
+      "epoch": 0.8958031837916064,
+      "grad_norm": 1.5096293687820435,
+      "learning_rate": 3.1255028157683026e-06,
+      "loss": 0.9406,
+      "step": 619
+    },
+    {
+      "epoch": 0.8972503617945007,
+      "grad_norm": 1.5495786666870117,
+      "learning_rate": 3.1214802896218828e-06,
+      "loss": 0.9406,
+      "step": 620
+    },
+    {
+      "epoch": 0.8972503617945007,
+      "eval_loss": 1.016027808189392,
+      "eval_runtime": 26.0019,
+      "eval_samples_per_second": 38.459,
+      "eval_steps_per_second": 2.423,
+      "step": 620
+    },
+    {
+      "epoch": 0.8986975397973951,
+      "grad_norm": 1.4580657482147217,
+      "learning_rate": 3.117457763475463e-06,
+      "loss": 0.9445,
+      "step": 621
+    },
+    {
+      "epoch": 0.9001447178002895,
+      "grad_norm": 1.4101994037628174,
+      "learning_rate": 3.1134352373290426e-06,
+      "loss": 0.9345,
+      "step": 622
+    },
+    {
+      "epoch": 0.9015918958031838,
+      "grad_norm": 1.4174447059631348,
+      "learning_rate": 3.1094127111826227e-06,
+      "loss": 0.9529,
+      "step": 623
+    },
+    {
+      "epoch": 0.9030390738060782,
+      "grad_norm": 1.399423360824585,
+      "learning_rate": 3.105390185036203e-06,
+      "loss": 0.9062,
+      "step": 624
+    },
+    {
+      "epoch": 0.9044862518089725,
+      "grad_norm": 1.332127571105957,
+      "learning_rate": 3.101367658889783e-06,
+      "loss": 0.887,
+      "step": 625
+    },
+    {
+      "epoch": 0.9059334298118669,
+      "grad_norm": 1.4557148218154907,
+      "learning_rate": 3.097345132743363e-06,
+      "loss": 0.928,
+      "step": 626
+    },
+    {
+      "epoch": 0.9073806078147613,
+      "grad_norm": 1.4364054203033447,
+      "learning_rate": 3.093322606596943e-06,
+      "loss": 0.952,
+      "step": 627
+    },
+    {
+      "epoch": 0.9088277858176556,
+      "grad_norm": 1.4176812171936035,
+      "learning_rate": 3.089300080450523e-06,
+      "loss": 0.9103,
+      "step": 628
+    },
+    {
+      "epoch": 0.91027496382055,
+      "grad_norm": 1.4091321229934692,
+      "learning_rate": 3.085277554304103e-06,
+      "loss": 0.9195,
+      "step": 629
+    },
+    {
+      "epoch": 0.9117221418234442,
+      "grad_norm": 1.4213095903396606,
+      "learning_rate": 3.081255028157683e-06,
+      "loss": 0.8743,
+      "step": 630
+    },
+    {
+      "epoch": 0.9117221418234442,
+      "eval_loss": 1.012543797492981,
+      "eval_runtime": 25.909,
+      "eval_samples_per_second": 38.597,
+      "eval_steps_per_second": 2.432,
+      "step": 630
+    },
+    {
+      "epoch": 0.9131693198263386,
+      "grad_norm": 1.4346204996109009,
+      "learning_rate": 3.0772325020112633e-06,
+      "loss": 0.954,
+      "step": 631
+    },
+    {
+      "epoch": 0.914616497829233,
+      "grad_norm": 1.3710834980010986,
+      "learning_rate": 3.073209975864843e-06,
+      "loss": 0.9346,
+      "step": 632
+    },
+    {
+      "epoch": 0.9160636758321273,
+      "grad_norm": 1.4266773462295532,
+      "learning_rate": 3.069187449718423e-06,
+      "loss": 0.9253,
+      "step": 633
+    },
+    {
+      "epoch": 0.9175108538350217,
+      "grad_norm": 1.4468302726745605,
+      "learning_rate": 3.0651649235720033e-06,
+      "loss": 0.905,
+      "step": 634
+    },
+    {
+      "epoch": 0.918958031837916,
+      "grad_norm": 1.3775814771652222,
+      "learning_rate": 3.0611423974255834e-06,
+      "loss": 0.9227,
+      "step": 635
+    },
+    {
+      "epoch": 0.9204052098408104,
+      "grad_norm": 1.4380732774734497,
+      "learning_rate": 3.057119871279163e-06,
+      "loss": 0.9128,
+      "step": 636
+    },
+    {
+      "epoch": 0.9218523878437048,
+      "grad_norm": 1.4668720960617065,
+      "learning_rate": 3.0530973451327432e-06,
+      "loss": 0.9111,
+      "step": 637
+    },
+    {
+      "epoch": 0.9232995658465991,
+      "grad_norm": 1.5741231441497803,
+      "learning_rate": 3.0490748189863234e-06,
+      "loss": 0.9715,
+      "step": 638
+    },
+    {
+      "epoch": 0.9247467438494935,
+      "grad_norm": 1.44276762008667,
+      "learning_rate": 3.0450522928399035e-06,
+      "loss": 0.9382,
+      "step": 639
+    },
+    {
+      "epoch": 0.9261939218523878,
+      "grad_norm": 1.5068806409835815,
+      "learning_rate": 3.0410297666934836e-06,
+      "loss": 0.9328,
+      "step": 640
+    },
+    {
+      "epoch": 0.9261939218523878,
+      "eval_loss": 1.0154249668121338,
+      "eval_runtime": 25.7492,
+      "eval_samples_per_second": 38.836,
+      "eval_steps_per_second": 2.447,
+      "step": 640
+    },
+    {
+      "epoch": 0.9276410998552822,
+      "grad_norm": 1.437995195388794,
+      "learning_rate": 3.0370072405470633e-06,
+      "loss": 0.9104,
+      "step": 641
+    },
+    {
+      "epoch": 0.9290882778581766,
+      "grad_norm": 1.6219338178634644,
+      "learning_rate": 3.0329847144006443e-06,
+      "loss": 0.9355,
+      "step": 642
+    },
+    {
+      "epoch": 0.9305354558610709,
+      "grad_norm": 1.4921026229858398,
+      "learning_rate": 3.028962188254224e-06,
+      "loss": 0.9298,
+      "step": 643
+    },
+    {
+      "epoch": 0.9319826338639653,
+      "grad_norm": 1.3929303884506226,
+      "learning_rate": 3.024939662107804e-06,
+      "loss": 0.9133,
+      "step": 644
+    },
+    {
+      "epoch": 0.9334298118668596,
+      "grad_norm": 1.4663861989974976,
+      "learning_rate": 3.0209171359613843e-06,
+      "loss": 0.9199,
+      "step": 645
+    },
+    {
+      "epoch": 0.934876989869754,
+      "grad_norm": 1.4335482120513916,
+      "learning_rate": 3.0168946098149644e-06,
+      "loss": 0.8951,
+      "step": 646
+    },
+    {
+      "epoch": 0.9363241678726484,
+      "grad_norm": 1.4543933868408203,
+      "learning_rate": 3.0128720836685445e-06,
+      "loss": 0.9221,
+      "step": 647
+    },
+    {
+      "epoch": 0.9377713458755427,
+      "grad_norm": 1.3568743467330933,
+      "learning_rate": 3.0088495575221242e-06,
+      "loss": 0.8988,
+      "step": 648
+    },
+    {
+      "epoch": 0.9392185238784371,
+      "grad_norm": 1.497255563735962,
+      "learning_rate": 3.0048270313757043e-06,
+      "loss": 0.9321,
+      "step": 649
+    },
+    {
+      "epoch": 0.9406657018813314,
+      "grad_norm": 1.3754018545150757,
+      "learning_rate": 3.0008045052292845e-06,
+      "loss": 0.906,
+      "step": 650
+    },
+    {
+      "epoch": 0.9406657018813314,
+      "eval_loss": 1.0140572786331177,
+      "eval_runtime": 26.0118,
+      "eval_samples_per_second": 38.444,
+      "eval_steps_per_second": 2.422,
+      "step": 650
+    },
+    {
+      "epoch": 0.9421128798842258,
+      "grad_norm": 1.3779268264770508,
+      "learning_rate": 2.9967819790828646e-06,
+      "loss": 0.9066,
+      "step": 651
+    },
+    {
+      "epoch": 0.9435600578871202,
+      "grad_norm": 1.4161144495010376,
+      "learning_rate": 2.9927594529364447e-06,
+      "loss": 0.8982,
+      "step": 652
+    },
+    {
+      "epoch": 0.9450072358900145,
+      "grad_norm": 1.4134407043457031,
+      "learning_rate": 2.9887369267900244e-06,
+      "loss": 0.9157,
+      "step": 653
+    },
+    {
+      "epoch": 0.9464544138929089,
+      "grad_norm": 1.4312589168548584,
+      "learning_rate": 2.9847144006436046e-06,
+      "loss": 0.9232,
+      "step": 654
+    },
+    {
+      "epoch": 0.9479015918958031,
+      "grad_norm": 1.521687388420105,
+      "learning_rate": 2.9806918744971847e-06,
+      "loss": 0.9489,
+      "step": 655
+    },
+    {
+      "epoch": 0.9493487698986975,
+      "grad_norm": 1.5733003616333008,
+      "learning_rate": 2.976669348350765e-06,
+      "loss": 0.9585,
+      "step": 656
+    },
+    {
+      "epoch": 0.9507959479015919,
+      "grad_norm": 1.4925999641418457,
+      "learning_rate": 2.972646822204345e-06,
+      "loss": 0.8928,
+      "step": 657
+    },
+    {
+      "epoch": 0.9522431259044862,
+      "grad_norm": 1.4285452365875244,
+      "learning_rate": 2.9686242960579246e-06,
+      "loss": 0.8946,
+      "step": 658
+    },
+    {
+      "epoch": 0.9536903039073806,
+      "grad_norm": 1.4919170141220093,
+      "learning_rate": 2.9646017699115048e-06,
+      "loss": 0.9162,
+      "step": 659
+    },
+    {
+      "epoch": 0.9551374819102749,
+      "grad_norm": 1.4638007879257202,
+      "learning_rate": 2.960579243765085e-06,
+      "loss": 0.9025,
+      "step": 660
+    },
+    {
+      "epoch": 0.9551374819102749,
+      "eval_loss": 1.0097233057022095,
+      "eval_runtime": 25.6834,
+      "eval_samples_per_second": 38.936,
+      "eval_steps_per_second": 2.453,
+      "step": 660
+    },
+    {
+      "epoch": 0.9565846599131693,
+      "grad_norm": 1.4907405376434326,
+      "learning_rate": 2.956556717618665e-06,
+      "loss": 0.9428,
+      "step": 661
+    },
+    {
+      "epoch": 0.9580318379160637,
+      "grad_norm": 1.4305118322372437,
+      "learning_rate": 2.952534191472245e-06,
+      "loss": 0.9007,
+      "step": 662
+    },
+    {
+      "epoch": 0.959479015918958,
+      "grad_norm": 1.401001214981079,
+      "learning_rate": 2.948511665325825e-06,
+      "loss": 0.908,
+      "step": 663
+    },
+    {
+      "epoch": 0.9609261939218524,
+      "grad_norm": 1.4340101480484009,
+      "learning_rate": 2.944489139179405e-06,
+      "loss": 0.9302,
+      "step": 664
+    },
+    {
+      "epoch": 0.9623733719247467,
+      "grad_norm": 1.4313371181488037,
+      "learning_rate": 2.940466613032985e-06,
+      "loss": 0.852,
+      "step": 665
+    },
+    {
+      "epoch": 0.9638205499276411,
+      "grad_norm": 1.6063755750656128,
+      "learning_rate": 2.9364440868865652e-06,
+      "loss": 0.9331,
+      "step": 666
+    },
+    {
+      "epoch": 0.9652677279305355,
+      "grad_norm": 1.48060142993927,
+      "learning_rate": 2.9324215607401454e-06,
+      "loss": 0.9181,
+      "step": 667
+    },
+    {
+      "epoch": 0.9667149059334298,
+      "grad_norm": 1.4865727424621582,
+      "learning_rate": 2.928399034593725e-06,
+      "loss": 0.9356,
+      "step": 668
+    },
+    {
+      "epoch": 0.9681620839363242,
+      "grad_norm": 1.409006953239441,
+      "learning_rate": 2.924376508447305e-06,
+      "loss": 0.9176,
+      "step": 669
+    },
+    {
+      "epoch": 0.9696092619392185,
+      "grad_norm": 1.411401629447937,
+      "learning_rate": 2.9203539823008853e-06,
+      "loss": 0.865,
+      "step": 670
+    },
+    {
+      "epoch": 0.9696092619392185,
+      "eval_loss": 1.007845401763916,
+      "eval_runtime": 25.8558,
+      "eval_samples_per_second": 38.676,
+      "eval_steps_per_second": 2.437,
+      "step": 670
+    },
+    {
+      "epoch": 0.9710564399421129,
+      "grad_norm": 1.4949239492416382,
+      "learning_rate": 2.9163314561544655e-06,
+      "loss": 0.9204,
+      "step": 671
+    },
+    {
+      "epoch": 0.9725036179450073,
+      "grad_norm": 1.4543801546096802,
+      "learning_rate": 2.912308930008045e-06,
+      "loss": 0.8932,
+      "step": 672
+    },
+    {
+      "epoch": 0.9739507959479016,
+      "grad_norm": 1.48538339138031,
+      "learning_rate": 2.9082864038616253e-06,
+      "loss": 0.9089,
+      "step": 673
+    },
+    {
+      "epoch": 0.975397973950796,
+      "grad_norm": 1.3665274381637573,
+      "learning_rate": 2.9042638777152054e-06,
+      "loss": 0.8793,
+      "step": 674
+    },
+    {
+      "epoch": 0.9768451519536903,
+      "grad_norm": 1.4270809888839722,
+      "learning_rate": 2.9002413515687855e-06,
+      "loss": 0.9028,
+      "step": 675
+    },
+    {
+      "epoch": 0.9782923299565847,
+      "grad_norm": 1.4889215230941772,
+      "learning_rate": 2.8962188254223657e-06,
+      "loss": 0.9161,
+      "step": 676
+    },
+    {
+      "epoch": 0.9797395079594791,
+      "grad_norm": 1.50188410282135,
+      "learning_rate": 2.8921962992759454e-06,
+      "loss": 0.871,
+      "step": 677
+    },
+    {
+      "epoch": 0.9811866859623734,
+      "grad_norm": 1.4788942337036133,
+      "learning_rate": 2.8881737731295255e-06,
+      "loss": 0.9241,
+      "step": 678
+    },
+    {
+      "epoch": 0.9826338639652678,
+      "grad_norm": 1.4699136018753052,
+      "learning_rate": 2.8841512469831056e-06,
+      "loss": 0.9242,
+      "step": 679
+    },
+    {
+      "epoch": 0.984081041968162,
+      "grad_norm": 1.4581975936889648,
+      "learning_rate": 2.8801287208366858e-06,
+      "loss": 0.9461,
+      "step": 680
+    },
+    {
+      "epoch": 0.984081041968162,
+      "eval_loss": 1.0047694444656372,
+      "eval_runtime": 25.6669,
+      "eval_samples_per_second": 38.961,
+      "eval_steps_per_second": 2.455,
+      "step": 680
+    },
+    {
+      "epoch": 0.9855282199710564,
+      "grad_norm": 1.3976376056671143,
+      "learning_rate": 2.876106194690266e-06,
+      "loss": 0.9247,
+      "step": 681
+    },
+    {
+      "epoch": 0.9869753979739508,
+      "grad_norm": 1.4753456115722656,
+      "learning_rate": 2.8720836685438456e-06,
+      "loss": 0.8744,
+      "step": 682
+    },
+    {
+      "epoch": 0.9884225759768451,
+      "grad_norm": 1.3396860361099243,
+      "learning_rate": 2.8680611423974257e-06,
+      "loss": 0.9268,
+      "step": 683
+    },
+    {
+      "epoch": 0.9898697539797395,
+      "grad_norm": 1.41839599609375,
+      "learning_rate": 2.864038616251006e-06,
+      "loss": 0.8972,
+      "step": 684
+    },
+    {
+      "epoch": 0.9913169319826338,
+      "grad_norm": 1.5397756099700928,
+      "learning_rate": 2.860016090104586e-06,
+      "loss": 0.9036,
+      "step": 685
+    },
+    {
+      "epoch": 0.9927641099855282,
+      "grad_norm": 1.458146333694458,
+      "learning_rate": 2.855993563958166e-06,
+      "loss": 0.924,
+      "step": 686
+    },
+    {
+      "epoch": 0.9942112879884226,
+      "grad_norm": 1.3859862089157104,
+      "learning_rate": 2.851971037811746e-06,
+      "loss": 0.9045,
+      "step": 687
+    },
+    {
+      "epoch": 0.9956584659913169,
+      "grad_norm": 1.5419162511825562,
+      "learning_rate": 2.847948511665326e-06,
+      "loss": 0.8943,
+      "step": 688
+    },
+    {
+      "epoch": 0.9971056439942113,
+      "grad_norm": 1.4252957105636597,
+      "learning_rate": 2.843925985518906e-06,
+      "loss": 0.9126,
+      "step": 689
+    },
+    {
+      "epoch": 0.9985528219971056,
+      "grad_norm": 1.5327415466308594,
+      "learning_rate": 2.839903459372486e-06,
+      "loss": 0.9356,
+      "step": 690
+    },
+    {
+      "epoch": 0.9985528219971056,
+      "eval_loss": 1.0072345733642578,
+      "eval_runtime": 25.8176,
+      "eval_samples_per_second": 38.733,
+      "eval_steps_per_second": 2.44,
+      "step": 690
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.4523152112960815,
+      "learning_rate": 2.8358809332260663e-06,
+      "loss": 0.8922,
+      "step": 691
+    },
+    {
+      "epoch": 1.0014471780028944,
+      "grad_norm": 1.4414013624191284,
+      "learning_rate": 2.831858407079646e-06,
+      "loss": 0.8817,
+      "step": 692
+    },
+    {
+      "epoch": 1.0028943560057888,
+      "grad_norm": 1.4955841302871704,
+      "learning_rate": 2.827835880933226e-06,
+      "loss": 0.8755,
+      "step": 693
+    },
+    {
+      "epoch": 1.004341534008683,
+      "grad_norm": 1.4577631950378418,
+      "learning_rate": 2.8238133547868063e-06,
+      "loss": 0.916,
+      "step": 694
+    },
+    {
+      "epoch": 1.0057887120115774,
+      "grad_norm": 1.5309616327285767,
+      "learning_rate": 2.8197908286403864e-06,
+      "loss": 0.9334,
+      "step": 695
+    },
+    {
+      "epoch": 1.0072358900144718,
+      "grad_norm": 1.678214430809021,
+      "learning_rate": 2.8157683024939665e-06,
+      "loss": 0.9174,
+      "step": 696
+    },
+    {
+      "epoch": 1.0086830680173662,
+      "grad_norm": 1.4631013870239258,
+      "learning_rate": 2.8117457763475462e-06,
+      "loss": 0.8904,
+      "step": 697
+    },
+    {
+      "epoch": 1.0101302460202606,
+      "grad_norm": 1.5390706062316895,
+      "learning_rate": 2.8077232502011264e-06,
+      "loss": 0.8739,
+      "step": 698
+    },
+    {
+      "epoch": 1.0115774240231548,
+      "grad_norm": 1.4008797407150269,
+      "learning_rate": 2.8037007240547065e-06,
+      "loss": 0.8803,
+      "step": 699
+    },
+    {
+      "epoch": 1.0130246020260492,
+      "grad_norm": 1.5263433456420898,
+      "learning_rate": 2.7996781979082866e-06,
+      "loss": 0.9466,
+      "step": 700
+    },
+    {
+      "epoch": 1.0130246020260492,
+      "eval_loss": 1.0086634159088135,
+      "eval_runtime": 25.832,
+      "eval_samples_per_second": 38.712,
+      "eval_steps_per_second": 2.439,
+      "step": 700
+    },
+    {
+      "epoch": 1.0144717800289436,
+      "grad_norm": 1.528403639793396,
+      "learning_rate": 2.7956556717618667e-06,
+      "loss": 0.9317,
+      "step": 701
+    },
+    {
+      "epoch": 1.015918958031838,
+      "grad_norm": 1.6002906560897827,
+      "learning_rate": 2.7916331456154465e-06,
+      "loss": 0.9359,
+      "step": 702
+    },
+    {
+      "epoch": 1.0173661360347324,
+      "grad_norm": 1.6118007898330688,
+      "learning_rate": 2.7876106194690266e-06,
+      "loss": 0.9234,
+      "step": 703
+    },
+    {
+      "epoch": 1.0188133140376265,
+      "grad_norm": 1.5051100254058838,
+      "learning_rate": 2.7835880933226067e-06,
+      "loss": 0.9066,
+      "step": 704
+    },
+    {
+      "epoch": 1.020260492040521,
+      "grad_norm": 1.5129306316375732,
+      "learning_rate": 2.779565567176187e-06,
+      "loss": 0.9406,
+      "step": 705
+    },
+    {
+      "epoch": 1.0217076700434153,
+      "grad_norm": 1.5781571865081787,
+      "learning_rate": 2.775543041029767e-06,
+      "loss": 0.8667,
+      "step": 706
+    },
+    {
+      "epoch": 1.0231548480463097,
+      "grad_norm": 1.4130151271820068,
+      "learning_rate": 2.7715205148833467e-06,
+      "loss": 0.9081,
+      "step": 707
+    },
+    {
+      "epoch": 1.0246020260492041,
+      "grad_norm": 1.5489847660064697,
+      "learning_rate": 2.767497988736927e-06,
+      "loss": 0.9474,
+      "step": 708
+    },
+    {
+      "epoch": 1.0260492040520983,
+      "grad_norm": 1.490387201309204,
+      "learning_rate": 2.763475462590507e-06,
+      "loss": 0.8583,
+      "step": 709
+    },
+    {
+      "epoch": 1.0274963820549927,
+      "grad_norm": 1.6409109830856323,
+      "learning_rate": 2.759452936444087e-06,
+      "loss": 0.8878,
+      "step": 710
+    },
+    {
+      "epoch": 1.0274963820549927,
+      "eval_loss": 1.0066230297088623,
+      "eval_runtime": 25.8463,
+      "eval_samples_per_second": 38.69,
+      "eval_steps_per_second": 2.437,
+      "step": 710
+    },
+    {
+      "epoch": 1.0289435600578871,
+      "grad_norm": 1.537100076675415,
+      "learning_rate": 2.755430410297667e-06,
+      "loss": 0.9095,
+      "step": 711
+    },
+    {
+      "epoch": 1.0303907380607815,
+      "grad_norm": 1.5319218635559082,
+      "learning_rate": 2.751407884151247e-06,
+      "loss": 0.9023,
+      "step": 712
+    },
+    {
+      "epoch": 1.031837916063676,
+      "grad_norm": 1.5210882425308228,
+      "learning_rate": 2.747385358004827e-06,
+      "loss": 0.9183,
+      "step": 713
+    },
+    {
+      "epoch": 1.03328509406657,
+      "grad_norm": 1.5327351093292236,
+      "learning_rate": 2.743362831858407e-06,
+      "loss": 0.8974,
+      "step": 714
+    },
+    {
+      "epoch": 1.0347322720694645,
+      "grad_norm": 1.487595558166504,
+      "learning_rate": 2.7393403057119873e-06,
+      "loss": 0.9136,
+      "step": 715
+    },
+    {
+      "epoch": 1.036179450072359,
+      "grad_norm": 1.615623116493225,
+      "learning_rate": 2.7353177795655674e-06,
+      "loss": 0.9088,
+      "step": 716
+    },
+    {
+      "epoch": 1.0376266280752533,
+      "grad_norm": 1.4948785305023193,
+      "learning_rate": 2.731295253419147e-06,
+      "loss": 0.8943,
+      "step": 717
+    },
+    {
+      "epoch": 1.0390738060781477,
+      "grad_norm": 1.3898569345474243,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 0.8781,
+      "step": 718
+    },
+    {
+      "epoch": 1.0405209840810419,
+      "grad_norm": 1.607021450996399,
+      "learning_rate": 2.7232502011263074e-06,
+      "loss": 0.8876,
+      "step": 719
+    },
+    {
+      "epoch": 1.0419681620839363,
+      "grad_norm": 1.6453396081924438,
+      "learning_rate": 2.7192276749798875e-06,
+      "loss": 0.919,
+      "step": 720
+    },
+    {
+      "epoch": 1.0419681620839363,
+      "eval_loss": 1.0065933465957642,
+      "eval_runtime": 25.7628,
+      "eval_samples_per_second": 38.816,
+      "eval_steps_per_second": 2.445,
+      "step": 720
+    },
+    {
+      "epoch": 1.0434153400868307,
+      "grad_norm": 1.5655184984207153,
+      "learning_rate": 2.7152051488334676e-06,
+      "loss": 0.885,
+      "step": 721
+    },
+    {
+      "epoch": 1.044862518089725,
+      "grad_norm": 1.4670000076293945,
+      "learning_rate": 2.7111826226870473e-06,
+      "loss": 0.8701,
+      "step": 722
+    },
+    {
+      "epoch": 1.0463096960926195,
+      "grad_norm": 1.4562489986419678,
+      "learning_rate": 2.7071600965406274e-06,
+      "loss": 0.9035,
+      "step": 723
+    },
+    {
+      "epoch": 1.0477568740955137,
+      "grad_norm": 1.5567082166671753,
+      "learning_rate": 2.7031375703942076e-06,
+      "loss": 0.8974,
+      "step": 724
+    },
+    {
+      "epoch": 1.049204052098408,
+      "grad_norm": 1.4280647039413452,
+      "learning_rate": 2.6991150442477877e-06,
+      "loss": 0.8588,
+      "step": 725
+    },
+    {
+      "epoch": 1.0506512301013025,
+      "grad_norm": 1.4315814971923828,
+      "learning_rate": 2.695092518101368e-06,
+      "loss": 0.8464,
+      "step": 726
+    },
+    {
+      "epoch": 1.0520984081041969,
+      "grad_norm": 1.5172886848449707,
+      "learning_rate": 2.6910699919549475e-06,
+      "loss": 0.9014,
+      "step": 727
+    },
+    {
+      "epoch": 1.0535455861070913,
+      "grad_norm": 1.635730266571045,
+      "learning_rate": 2.6870474658085277e-06,
+      "loss": 0.8978,
+      "step": 728
+    },
+    {
+      "epoch": 1.0549927641099854,
+      "grad_norm": 1.5512837171554565,
+      "learning_rate": 2.6830249396621078e-06,
+      "loss": 0.9593,
+      "step": 729
+    },
+    {
+      "epoch": 1.0564399421128798,
+      "grad_norm": 1.5531891584396362,
+      "learning_rate": 2.679002413515688e-06,
+      "loss": 0.897,
+      "step": 730
+    },
+    {
+      "epoch": 1.0564399421128798,
+      "eval_loss": 1.00581693649292,
+      "eval_runtime": 25.6366,
+      "eval_samples_per_second": 39.007,
+      "eval_steps_per_second": 2.457,
+      "step": 730
+    },
+    {
+      "epoch": 1.0578871201157742,
+      "grad_norm": 1.5719125270843506,
+      "learning_rate": 2.6749798873692685e-06,
+      "loss": 0.8763,
+      "step": 731
+    },
+    {
+      "epoch": 1.0593342981186686,
+      "grad_norm": 1.559281826019287,
+      "learning_rate": 2.6709573612228486e-06,
+      "loss": 0.9001,
+      "step": 732
+    },
+    {
+      "epoch": 1.060781476121563,
+      "grad_norm": 1.7323325872421265,
+      "learning_rate": 2.6669348350764283e-06,
+      "loss": 0.9084,
+      "step": 733
+    },
+    {
+      "epoch": 1.0622286541244572,
+      "grad_norm": 1.6019538640975952,
+      "learning_rate": 2.6629123089300084e-06,
+      "loss": 0.9016,
+      "step": 734
+    },
+    {
+      "epoch": 1.0636758321273516,
+      "grad_norm": 1.521162748336792,
+      "learning_rate": 2.6588897827835885e-06,
+      "loss": 0.8473,
+      "step": 735
+    },
+    {
+      "epoch": 1.065123010130246,
+      "grad_norm": 1.58328115940094,
+      "learning_rate": 2.6548672566371687e-06,
+      "loss": 0.8973,
+      "step": 736
+    },
+    {
+      "epoch": 1.0665701881331404,
+      "grad_norm": 1.6900322437286377,
+      "learning_rate": 2.650844730490749e-06,
+      "loss": 0.8908,
+      "step": 737
+    },
+    {
+      "epoch": 1.0680173661360348,
+      "grad_norm": 1.6272144317626953,
+      "learning_rate": 2.6468222043443285e-06,
+      "loss": 0.9089,
+      "step": 738
+    },
+    {
+      "epoch": 1.069464544138929,
+      "grad_norm": 1.6188445091247559,
+      "learning_rate": 2.6427996781979086e-06,
+      "loss": 0.938,
+      "step": 739
+    },
+    {
+      "epoch": 1.0709117221418234,
+      "grad_norm": 1.6111786365509033,
+      "learning_rate": 2.6387771520514888e-06,
+      "loss": 0.9106,
+      "step": 740
+    },
+    {
+      "epoch": 1.0709117221418234,
+      "eval_loss": 1.0052382946014404,
+      "eval_runtime": 25.7043,
+      "eval_samples_per_second": 38.904,
+      "eval_steps_per_second": 2.451,
+      "step": 740
+    },
+    {
+      "epoch": 1.0723589001447178,
+      "grad_norm": 1.5151225328445435,
+      "learning_rate": 2.634754625905069e-06,
+      "loss": 0.8542,
+      "step": 741
+    },
+    {
+      "epoch": 1.0738060781476122,
+      "grad_norm": 1.5061923265457153,
+      "learning_rate": 2.630732099758649e-06,
+      "loss": 0.883,
+      "step": 742
+    },
+    {
+      "epoch": 1.0752532561505066,
+      "grad_norm": 1.5270133018493652,
+      "learning_rate": 2.6267095736122287e-06,
+      "loss": 0.8941,
+      "step": 743
+    },
+    {
+      "epoch": 1.0767004341534008,
+      "grad_norm": 1.5916814804077148,
+      "learning_rate": 2.622687047465809e-06,
+      "loss": 0.8985,
+      "step": 744
+    },
+    {
+      "epoch": 1.0781476121562952,
+      "grad_norm": 1.4444690942764282,
+      "learning_rate": 2.618664521319389e-06,
+      "loss": 0.8818,
+      "step": 745
+    },
+    {
+      "epoch": 1.0795947901591896,
+      "grad_norm": 1.4805785417556763,
+      "learning_rate": 2.614641995172969e-06,
+      "loss": 0.9,
+      "step": 746
+    },
+    {
+      "epoch": 1.081041968162084,
+      "grad_norm": 1.4908889532089233,
+      "learning_rate": 2.6106194690265492e-06,
+      "loss": 0.912,
+      "step": 747
+    },
+    {
+      "epoch": 1.0824891461649784,
+      "grad_norm": 1.5267603397369385,
+      "learning_rate": 2.606596942880129e-06,
+      "loss": 0.938,
+      "step": 748
+    },
+    {
+      "epoch": 1.0839363241678726,
+      "grad_norm": 1.4756078720092773,
+      "learning_rate": 2.602574416733709e-06,
+      "loss": 0.8712,
+      "step": 749
+    },
+    {
+      "epoch": 1.085383502170767,
+      "grad_norm": 1.508716106414795,
+      "learning_rate": 2.598551890587289e-06,
+      "loss": 0.9001,
+      "step": 750
+    },
+    {
+      "epoch": 1.085383502170767,
+      "eval_loss": 1.0030596256256104,
+      "eval_runtime": 25.5438,
+      "eval_samples_per_second": 39.148,
+      "eval_steps_per_second": 2.466,
+      "step": 750
+    },
+    {
+      "epoch": 1.0868306801736614,
+      "grad_norm": 1.5817303657531738,
+      "learning_rate": 2.5945293644408693e-06,
+      "loss": 0.9058,
+      "step": 751
+    },
+    {
+      "epoch": 1.0882778581765558,
+      "grad_norm": 1.497726321220398,
+      "learning_rate": 2.5905068382944494e-06,
+      "loss": 0.8961,
+      "step": 752
+    },
+    {
+      "epoch": 1.0897250361794502,
+      "grad_norm": 1.5296815633773804,
+      "learning_rate": 2.586484312148029e-06,
+      "loss": 0.9155,
+      "step": 753
+    },
+    {
+      "epoch": 1.0911722141823443,
+      "grad_norm": 1.609724760055542,
+      "learning_rate": 2.5824617860016093e-06,
+      "loss": 0.9105,
+      "step": 754
+    },
+    {
+      "epoch": 1.0926193921852387,
+      "grad_norm": 1.4852569103240967,
+      "learning_rate": 2.5784392598551894e-06,
+      "loss": 0.923,
+      "step": 755
+    },
+    {
+      "epoch": 1.0940665701881331,
+      "grad_norm": 1.5415880680084229,
+      "learning_rate": 2.5744167337087695e-06,
+      "loss": 0.8643,
+      "step": 756
+    },
+    {
+      "epoch": 1.0955137481910275,
+      "grad_norm": 1.465867280960083,
+      "learning_rate": 2.5703942075623497e-06,
+      "loss": 0.913,
+      "step": 757
+    },
+    {
+      "epoch": 1.096960926193922,
+      "grad_norm": 1.576643943786621,
+      "learning_rate": 2.5663716814159294e-06,
+      "loss": 0.8944,
+      "step": 758
+    },
+    {
+      "epoch": 1.0984081041968161,
+      "grad_norm": 1.6592824459075928,
+      "learning_rate": 2.5623491552695095e-06,
+      "loss": 0.8829,
+      "step": 759
+    },
+    {
+      "epoch": 1.0998552821997105,
+      "grad_norm": 1.6082524061203003,
+      "learning_rate": 2.5583266291230896e-06,
+      "loss": 0.8653,
+      "step": 760
+    },
+    {
+      "epoch": 1.0998552821997105,
+      "eval_loss": 1.0026334524154663,
+      "eval_runtime": 25.607,
+      "eval_samples_per_second": 39.052,
+      "eval_steps_per_second": 2.46,
+      "step": 760
+    },
+    {
+      "epoch": 1.101302460202605,
+      "grad_norm": 1.52219557762146,
+      "learning_rate": 2.5543041029766698e-06,
+      "loss": 0.8905,
+      "step": 761
+    },
+    {
+      "epoch": 1.1027496382054993,
+      "grad_norm": 1.6422560214996338,
+      "learning_rate": 2.55028157683025e-06,
+      "loss": 0.8654,
+      "step": 762
+    },
+    {
+      "epoch": 1.1041968162083937,
+      "grad_norm": 1.6546605825424194,
+      "learning_rate": 2.5462590506838296e-06,
+      "loss": 0.8655,
+      "step": 763
+    },
+    {
+      "epoch": 1.105643994211288,
+      "grad_norm": 1.5878373384475708,
+      "learning_rate": 2.5422365245374097e-06,
+      "loss": 0.9194,
+      "step": 764
+    },
+    {
+      "epoch": 1.1070911722141823,
+      "grad_norm": 1.5256876945495605,
+      "learning_rate": 2.53821399839099e-06,
+      "loss": 0.891,
+      "step": 765
+    },
+    {
+      "epoch": 1.1085383502170767,
+      "grad_norm": 1.5687922239303589,
+      "learning_rate": 2.53419147224457e-06,
+      "loss": 0.8885,
+      "step": 766
+    },
+    {
+      "epoch": 1.109985528219971,
+      "grad_norm": 1.6002429723739624,
+      "learning_rate": 2.53016894609815e-06,
+      "loss": 0.8896,
+      "step": 767
+    },
+    {
+      "epoch": 1.1114327062228655,
+      "grad_norm": 1.5805329084396362,
+      "learning_rate": 2.52614641995173e-06,
+      "loss": 0.9043,
+      "step": 768
+    },
+    {
+      "epoch": 1.1128798842257597,
+      "grad_norm": 1.5537420511245728,
+      "learning_rate": 2.52212389380531e-06,
+      "loss": 0.8831,
+      "step": 769
+    },
+    {
+      "epoch": 1.114327062228654,
+      "grad_norm": 1.6533360481262207,
+      "learning_rate": 2.51810136765889e-06,
+      "loss": 0.9479,
+      "step": 770
+    },
+    {
+      "epoch": 1.114327062228654,
+      "eval_loss": 1.002786636352539,
+      "eval_runtime": 25.8282,
+      "eval_samples_per_second": 38.717,
+      "eval_steps_per_second": 2.439,
+      "step": 770
+    },
+    {
+      "epoch": 1.1157742402315485,
+      "grad_norm": 1.6494132280349731,
+      "learning_rate": 2.51407884151247e-06,
+      "loss": 0.895,
+      "step": 771
+    },
+    {
+      "epoch": 1.1172214182344429,
+      "grad_norm": 1.5635745525360107,
+      "learning_rate": 2.5100563153660503e-06,
+      "loss": 0.8859,
+      "step": 772
+    },
+    {
+      "epoch": 1.1186685962373373,
+      "grad_norm": 1.501560091972351,
+      "learning_rate": 2.50603378921963e-06,
+      "loss": 0.9423,
+      "step": 773
+    },
+    {
+      "epoch": 1.1201157742402315,
+      "grad_norm": 1.5016065835952759,
+      "learning_rate": 2.50201126307321e-06,
+      "loss": 0.9226,
+      "step": 774
+    },
+    {
+      "epoch": 1.1215629522431259,
+      "grad_norm": 1.5102709531784058,
+      "learning_rate": 2.4979887369267903e-06,
+      "loss": 0.889,
+      "step": 775
+    },
+    {
+      "epoch": 1.1230101302460203,
+      "grad_norm": 1.4280509948730469,
+      "learning_rate": 2.4939662107803704e-06,
+      "loss": 0.8936,
+      "step": 776
+    },
+    {
+      "epoch": 1.1244573082489147,
+      "grad_norm": 1.5048205852508545,
+      "learning_rate": 2.48994368463395e-06,
+      "loss": 0.9029,
+      "step": 777
+    },
+    {
+      "epoch": 1.125904486251809,
+      "grad_norm": 1.5334604978561401,
+      "learning_rate": 2.4859211584875302e-06,
+      "loss": 0.8864,
+      "step": 778
+    },
+    {
+      "epoch": 1.1273516642547032,
+      "grad_norm": 1.6037338972091675,
+      "learning_rate": 2.4818986323411104e-06,
+      "loss": 0.921,
+      "step": 779
+    },
+    {
+      "epoch": 1.1287988422575976,
+      "grad_norm": 1.6144342422485352,
+      "learning_rate": 2.4778761061946905e-06,
+      "loss": 0.9006,
+      "step": 780
+    },
+    {
+      "epoch": 1.1287988422575976,
+      "eval_loss": 1.0016281604766846,
+      "eval_runtime": 25.8012,
+      "eval_samples_per_second": 38.758,
+      "eval_steps_per_second": 2.442,
+      "step": 780
+    },
+    {
+      "epoch": 1.130246020260492,
+      "grad_norm": 1.593278408050537,
+      "learning_rate": 2.4738535800482706e-06,
+      "loss": 0.8724,
+      "step": 781
+    },
+    {
+      "epoch": 1.1316931982633864,
+      "grad_norm": 1.5138260126113892,
+      "learning_rate": 2.4698310539018503e-06,
+      "loss": 0.9093,
+      "step": 782
+    },
+    {
+      "epoch": 1.1331403762662808,
+      "grad_norm": 1.5158487558364868,
+      "learning_rate": 2.4658085277554304e-06,
+      "loss": 0.9086,
+      "step": 783
+    },
+    {
+      "epoch": 1.134587554269175,
+      "grad_norm": 1.5141456127166748,
+      "learning_rate": 2.4617860016090106e-06,
+      "loss": 0.8756,
+      "step": 784
+    },
+    {
+      "epoch": 1.1360347322720694,
+      "grad_norm": 1.5087177753448486,
+      "learning_rate": 2.4577634754625907e-06,
+      "loss": 0.9197,
+      "step": 785
+    },
+    {
+      "epoch": 1.1374819102749638,
+      "grad_norm": 1.5388543605804443,
+      "learning_rate": 2.453740949316171e-06,
+      "loss": 0.8808,
+      "step": 786
+    },
+    {
+      "epoch": 1.1389290882778582,
+      "grad_norm": 1.575091004371643,
+      "learning_rate": 2.4497184231697505e-06,
+      "loss": 0.9123,
+      "step": 787
+    },
+    {
+      "epoch": 1.1403762662807526,
+      "grad_norm": 1.5432230234146118,
+      "learning_rate": 2.4456958970233307e-06,
+      "loss": 0.8898,
+      "step": 788
+    },
+    {
+      "epoch": 1.1418234442836468,
+      "grad_norm": 1.5404096841812134,
+      "learning_rate": 2.4416733708769108e-06,
+      "loss": 0.8835,
+      "step": 789
+    },
+    {
+      "epoch": 1.1432706222865412,
+      "grad_norm": 1.7286585569381714,
+      "learning_rate": 2.437650844730491e-06,
+      "loss": 0.8938,
+      "step": 790
+    },
+    {
+      "epoch": 1.1432706222865412,
+      "eval_loss": 1.0016593933105469,
+      "eval_runtime": 25.7801,
+      "eval_samples_per_second": 38.79,
+      "eval_steps_per_second": 2.444,
+      "step": 790
+    },
+    {
+      "epoch": 1.1447178002894356,
+      "grad_norm": 1.693106770515442,
+      "learning_rate": 2.433628318584071e-06,
+      "loss": 0.9065,
+      "step": 791
+    },
+    {
+      "epoch": 1.14616497829233,
+      "grad_norm": 1.589040994644165,
+      "learning_rate": 2.4296057924376507e-06,
+      "loss": 0.8858,
+      "step": 792
+    },
+    {
+      "epoch": 1.1476121562952244,
+      "grad_norm": 1.5030606985092163,
+      "learning_rate": 2.425583266291231e-06,
+      "loss": 0.8924,
+      "step": 793
+    },
+    {
+      "epoch": 1.1490593342981186,
+      "grad_norm": 1.6685343980789185,
+      "learning_rate": 2.421560740144811e-06,
+      "loss": 0.9408,
+      "step": 794
+    },
+    {
+      "epoch": 1.150506512301013,
+      "grad_norm": 1.59011709690094,
+      "learning_rate": 2.417538213998391e-06,
+      "loss": 0.8711,
+      "step": 795
+    },
+    {
+      "epoch": 1.1519536903039074,
+      "grad_norm": 1.5271319150924683,
+      "learning_rate": 2.4135156878519713e-06,
+      "loss": 0.8951,
+      "step": 796
+    },
+    {
+      "epoch": 1.1534008683068018,
+      "grad_norm": 1.4878593683242798,
+      "learning_rate": 2.4094931617055514e-06,
+      "loss": 0.8683,
+      "step": 797
+    },
+    {
+      "epoch": 1.1548480463096962,
+      "grad_norm": 1.572616696357727,
+      "learning_rate": 2.4054706355591315e-06,
+      "loss": 0.8463,
+      "step": 798
+    },
+    {
+      "epoch": 1.1562952243125904,
+      "grad_norm": 1.5298666954040527,
+      "learning_rate": 2.4014481094127116e-06,
+      "loss": 0.891,
+      "step": 799
+    },
+    {
+      "epoch": 1.1577424023154848,
+      "grad_norm": 1.6578890085220337,
+      "learning_rate": 2.3974255832662913e-06,
+      "loss": 0.8952,
+      "step": 800
+    },
+    {
+      "epoch": 1.1577424023154848,
+      "eval_loss": 1.0011423826217651,
+      "eval_runtime": 25.786,
+      "eval_samples_per_second": 38.781,
+      "eval_steps_per_second": 2.443,
+      "step": 800
+    },
+    {
+      "epoch": 1.1591895803183792,
+      "grad_norm": 1.6832380294799805,
+      "learning_rate": 2.3934030571198715e-06,
+      "loss": 0.9434,
+      "step": 801
+    },
+    {
+      "epoch": 1.1606367583212736,
+      "grad_norm": 1.712808609008789,
+      "learning_rate": 2.3893805309734516e-06,
+      "loss": 0.8573,
+      "step": 802
+    },
+    {
+      "epoch": 1.162083936324168,
+      "grad_norm": 1.7583296298980713,
+      "learning_rate": 2.3853580048270317e-06,
+      "loss": 0.9205,
+      "step": 803
+    },
+    {
+      "epoch": 1.1635311143270621,
+      "grad_norm": 1.5997307300567627,
+      "learning_rate": 2.381335478680612e-06,
+      "loss": 0.9089,
+      "step": 804
+    },
+    {
+      "epoch": 1.1649782923299565,
+      "grad_norm": 1.4809633493423462,
+      "learning_rate": 2.3773129525341916e-06,
+      "loss": 0.8914,
+      "step": 805
+    },
+    {
+      "epoch": 1.166425470332851,
+      "grad_norm": 1.48288094997406,
+      "learning_rate": 2.3732904263877717e-06,
+      "loss": 0.8872,
+      "step": 806
+    },
+    {
+      "epoch": 1.1678726483357453,
+      "grad_norm": 1.5765438079833984,
+      "learning_rate": 2.369267900241352e-06,
+      "loss": 0.921,
+      "step": 807
+    },
+    {
+      "epoch": 1.1693198263386397,
+      "grad_norm": 1.5146821737289429,
+      "learning_rate": 2.365245374094932e-06,
+      "loss": 0.8814,
+      "step": 808
+    },
+    {
+      "epoch": 1.170767004341534,
+      "grad_norm": 1.4884650707244873,
+      "learning_rate": 2.361222847948512e-06,
+      "loss": 0.8718,
+      "step": 809
+    },
+    {
+      "epoch": 1.1722141823444283,
+      "grad_norm": 1.5190155506134033,
+      "learning_rate": 2.3572003218020918e-06,
+      "loss": 0.9138,
+      "step": 810
+    },
+    {
+      "epoch": 1.1722141823444283,
+      "eval_loss": 1.0010331869125366,
+      "eval_runtime": 25.9025,
+      "eval_samples_per_second": 38.606,
+      "eval_steps_per_second": 2.432,
+      "step": 810
+    },
+    {
+      "epoch": 1.1736613603473227,
+      "grad_norm": 1.619706392288208,
+      "learning_rate": 2.353177795655672e-06,
+      "loss": 0.8873,
+      "step": 811
+    },
+    {
+      "epoch": 1.1751085383502171,
+      "grad_norm": 1.6402854919433594,
+      "learning_rate": 2.349155269509252e-06,
+      "loss": 0.8954,
+      "step": 812
+    },
+    {
+      "epoch": 1.1765557163531115,
+      "grad_norm": 1.5414272546768188,
+      "learning_rate": 2.345132743362832e-06,
+      "loss": 0.899,
+      "step": 813
+    },
+    {
+      "epoch": 1.1780028943560057,
+      "grad_norm": 1.7477384805679321,
+      "learning_rate": 2.3411102172164123e-06,
+      "loss": 0.9155,
+      "step": 814
+    },
+    {
+      "epoch": 1.1794500723589,
+      "grad_norm": 1.5654603242874146,
+      "learning_rate": 2.337087691069992e-06,
+      "loss": 0.9058,
+      "step": 815
+    },
+    {
+      "epoch": 1.1808972503617945,
+      "grad_norm": 1.5773078203201294,
+      "learning_rate": 2.333065164923572e-06,
+      "loss": 0.909,
+      "step": 816
+    },
+    {
+      "epoch": 1.182344428364689,
+      "grad_norm": 1.5746979713439941,
+      "learning_rate": 2.3290426387771522e-06,
+      "loss": 0.9036,
+      "step": 817
+    },
+    {
+      "epoch": 1.1837916063675833,
+      "grad_norm": 1.5165321826934814,
+      "learning_rate": 2.3250201126307324e-06,
+      "loss": 0.8874,
+      "step": 818
+    },
+    {
+      "epoch": 1.1852387843704775,
+      "grad_norm": 1.4424513578414917,
+      "learning_rate": 2.3209975864843125e-06,
+      "loss": 0.9118,
+      "step": 819
+    },
+    {
+      "epoch": 1.1866859623733719,
+      "grad_norm": 1.494571328163147,
+      "learning_rate": 2.316975060337892e-06,
+      "loss": 0.8679,
+      "step": 820
+    },
+    {
+      "epoch": 1.1866859623733719,
+      "eval_loss": 0.9990912079811096,
+      "eval_runtime": 25.843,
+      "eval_samples_per_second": 38.695,
+      "eval_steps_per_second": 2.438,
+      "step": 820
+    },
+    {
+      "epoch": 1.1881331403762663,
+      "grad_norm": 1.480198860168457,
+      "learning_rate": 2.3129525341914723e-06,
+      "loss": 0.9037,
+      "step": 821
+    },
+    {
+      "epoch": 1.1895803183791607,
+      "grad_norm": 1.584207534790039,
+      "learning_rate": 2.3089300080450525e-06,
+      "loss": 0.9139,
+      "step": 822
+    },
+    {
+      "epoch": 1.191027496382055,
+      "grad_norm": 1.612007975578308,
+      "learning_rate": 2.3049074818986326e-06,
+      "loss": 0.8963,
+      "step": 823
+    },
+    {
+      "epoch": 1.1924746743849495,
+      "grad_norm": 1.6596332788467407,
+      "learning_rate": 2.3008849557522127e-06,
+      "loss": 0.8807,
+      "step": 824
+    },
+    {
+      "epoch": 1.1939218523878437,
+      "grad_norm": 1.6010795831680298,
+      "learning_rate": 2.2968624296057924e-06,
+      "loss": 0.9194,
+      "step": 825
+    },
+    {
+      "epoch": 1.195369030390738,
+      "grad_norm": 1.525209903717041,
+      "learning_rate": 2.2928399034593725e-06,
+      "loss": 0.9004,
+      "step": 826
+    },
+    {
+      "epoch": 1.1968162083936325,
+      "grad_norm": 1.5119304656982422,
+      "learning_rate": 2.2888173773129527e-06,
+      "loss": 0.9177,
+      "step": 827
+    },
+    {
+      "epoch": 1.1982633863965269,
+      "grad_norm": 1.5764222145080566,
+      "learning_rate": 2.284794851166533e-06,
+      "loss": 0.8945,
+      "step": 828
+    },
+    {
+      "epoch": 1.199710564399421,
+      "grad_norm": 1.632712960243225,
+      "learning_rate": 2.280772325020113e-06,
+      "loss": 0.9267,
+      "step": 829
+    },
+    {
+      "epoch": 1.2011577424023154,
+      "grad_norm": 1.6575121879577637,
+      "learning_rate": 2.2767497988736926e-06,
+      "loss": 0.8893,
+      "step": 830
+    },
+    {
+      "epoch": 1.2011577424023154,
+      "eval_loss": 0.9975345730781555,
+      "eval_runtime": 25.7991,
+      "eval_samples_per_second": 38.761,
+      "eval_steps_per_second": 2.442,
+      "step": 830
+    },
+    {
+      "epoch": 1.2026049204052098,
+      "grad_norm": 1.5680720806121826,
+      "learning_rate": 2.2727272727272728e-06,
+      "loss": 0.8726,
+      "step": 831
+    },
+    {
+      "epoch": 1.2040520984081042,
+      "grad_norm": 1.5080980062484741,
+      "learning_rate": 2.268704746580853e-06,
+      "loss": 0.8918,
+      "step": 832
+    },
+    {
+      "epoch": 1.2054992764109986,
+      "grad_norm": 1.4733283519744873,
+      "learning_rate": 2.264682220434433e-06,
+      "loss": 0.8833,
+      "step": 833
+    },
+    {
+      "epoch": 1.206946454413893,
+      "grad_norm": 1.526259183883667,
+      "learning_rate": 2.260659694288013e-06,
+      "loss": 0.8811,
+      "step": 834
+    },
+    {
+      "epoch": 1.2083936324167872,
+      "grad_norm": 1.5349605083465576,
+      "learning_rate": 2.256637168141593e-06,
+      "loss": 0.8577,
+      "step": 835
+    },
+    {
+      "epoch": 1.2098408104196816,
+      "grad_norm": 1.5627975463867188,
+      "learning_rate": 2.252614641995173e-06,
+      "loss": 0.884,
+      "step": 836
+    },
+    {
+      "epoch": 1.211287988422576,
+      "grad_norm": 1.6055257320404053,
+      "learning_rate": 2.248592115848753e-06,
+      "loss": 0.8896,
+      "step": 837
+    },
+    {
+      "epoch": 1.2127351664254704,
+      "grad_norm": 1.5470566749572754,
+      "learning_rate": 2.2445695897023332e-06,
+      "loss": 0.8953,
+      "step": 838
+    },
+    {
+      "epoch": 1.2141823444283646,
+      "grad_norm": 1.4979593753814697,
+      "learning_rate": 2.2405470635559134e-06,
+      "loss": 0.8676,
+      "step": 839
+    },
+    {
+      "epoch": 1.215629522431259,
+      "grad_norm": 1.5414305925369263,
+      "learning_rate": 2.236524537409493e-06,
+      "loss": 0.9068,
+      "step": 840
+    },
+    {
+      "epoch": 1.215629522431259,
+      "eval_loss": 0.9961005449295044,
+      "eval_runtime": 25.6585,
+      "eval_samples_per_second": 38.973,
+      "eval_steps_per_second": 2.455,
+      "step": 840
+    },
+    {
+      "epoch": 1.2170767004341534,
+      "grad_norm": 1.518509864807129,
+      "learning_rate": 2.232502011263073e-06,
+      "loss": 0.8575,
+      "step": 841
+    },
+    {
+      "epoch": 1.2185238784370478,
+      "grad_norm": 1.4536638259887695,
+      "learning_rate": 2.2284794851166537e-06,
+      "loss": 0.8858,
+      "step": 842
+    },
+    {
+      "epoch": 1.2199710564399422,
+      "grad_norm": 1.606372356414795,
+      "learning_rate": 2.2244569589702334e-06,
+      "loss": 0.8667,
+      "step": 843
+    },
+    {
+      "epoch": 1.2214182344428366,
+      "grad_norm": 1.526896357536316,
+      "learning_rate": 2.2204344328238136e-06,
+      "loss": 0.8832,
+      "step": 844
+    },
+    {
+      "epoch": 1.2228654124457308,
+      "grad_norm": 1.5027129650115967,
+      "learning_rate": 2.2164119066773937e-06,
+      "loss": 0.8948,
+      "step": 845
+    },
+    {
+      "epoch": 1.2243125904486252,
+      "grad_norm": 1.4722239971160889,
+      "learning_rate": 2.212389380530974e-06,
+      "loss": 0.9062,
+      "step": 846
+    },
+    {
+      "epoch": 1.2257597684515196,
+      "grad_norm": 1.4409259557724,
+      "learning_rate": 2.208366854384554e-06,
+      "loss": 0.9001,
+      "step": 847
+    },
+    {
+      "epoch": 1.227206946454414,
+      "grad_norm": 1.4950021505355835,
+      "learning_rate": 2.2043443282381337e-06,
+      "loss": 0.9215,
+      "step": 848
+    },
+    {
+      "epoch": 1.2286541244573081,
+      "grad_norm": 1.6024349927902222,
+      "learning_rate": 2.2003218020917138e-06,
+      "loss": 0.889,
+      "step": 849
+    },
+    {
+      "epoch": 1.2301013024602026,
+      "grad_norm": 1.4954161643981934,
+      "learning_rate": 2.196299275945294e-06,
+      "loss": 0.8865,
+      "step": 850
+    },
+    {
+      "epoch": 1.2301013024602026,
+      "eval_loss": 0.9965184330940247,
+      "eval_runtime": 26.0623,
+      "eval_samples_per_second": 38.37,
+      "eval_steps_per_second": 2.417,
+      "step": 850
+    },
+    {
+      "epoch": 1.231548480463097,
+      "grad_norm": 1.5020195245742798,
+      "learning_rate": 2.192276749798874e-06,
+      "loss": 0.9048,
+      "step": 851
+    },
+    {
+      "epoch": 1.2329956584659914,
+      "grad_norm": 1.5896894931793213,
+      "learning_rate": 2.188254223652454e-06,
+      "loss": 0.8976,
+      "step": 852
+    },
+    {
+      "epoch": 1.2344428364688858,
+      "grad_norm": 1.6095257997512817,
+      "learning_rate": 2.184231697506034e-06,
+      "loss": 0.8699,
+      "step": 853
+    },
+    {
+      "epoch": 1.2358900144717802,
+      "grad_norm": 1.5864946842193604,
+      "learning_rate": 2.180209171359614e-06,
+      "loss": 0.8791,
+      "step": 854
+    },
+    {
+      "epoch": 1.2373371924746743,
+      "grad_norm": 1.5503919124603271,
+      "learning_rate": 2.176186645213194e-06,
+      "loss": 0.8984,
+      "step": 855
+    },
+    {
+      "epoch": 1.2387843704775687,
+      "grad_norm": 1.5076204538345337,
+      "learning_rate": 2.1721641190667743e-06,
+      "loss": 0.8658,
+      "step": 856
+    },
+    {
+      "epoch": 1.2402315484804631,
+      "grad_norm": 1.5990691184997559,
+      "learning_rate": 2.1681415929203544e-06,
+      "loss": 0.8868,
+      "step": 857
+    },
+    {
+      "epoch": 1.2416787264833575,
+      "grad_norm": 1.5078307390213013,
+      "learning_rate": 2.164119066773934e-06,
+      "loss": 0.895,
+      "step": 858
+    },
+    {
+      "epoch": 1.2431259044862517,
+      "grad_norm": 1.5056289434432983,
+      "learning_rate": 2.1600965406275142e-06,
+      "loss": 0.8813,
+      "step": 859
+    },
+    {
+      "epoch": 1.244573082489146,
+      "grad_norm": 1.5606762170791626,
+      "learning_rate": 2.1560740144810943e-06,
+      "loss": 0.9292,
+      "step": 860
+    },
+    {
+      "epoch": 1.244573082489146,
+      "eval_loss": 0.9943708181381226,
+      "eval_runtime": 25.6768,
+      "eval_samples_per_second": 38.946,
+      "eval_steps_per_second": 2.454,
+      "step": 860
+    },
+    {
+      "epoch": 1.2460202604920405,
+      "grad_norm": 1.4824141263961792,
+      "learning_rate": 2.1520514883346745e-06,
+      "loss": 0.8732,
+      "step": 861
+    },
+    {
+      "epoch": 1.247467438494935,
+      "grad_norm": 1.5555789470672607,
+      "learning_rate": 2.1480289621882546e-06,
+      "loss": 0.8937,
+      "step": 862
+    },
+    {
+      "epoch": 1.2489146164978293,
+      "grad_norm": 1.5916050672531128,
+      "learning_rate": 2.1440064360418343e-06,
+      "loss": 0.8728,
+      "step": 863
+    },
+    {
+      "epoch": 1.2503617945007237,
+      "grad_norm": 1.6065150499343872,
+      "learning_rate": 2.1399839098954144e-06,
+      "loss": 0.8511,
+      "step": 864
+    },
+    {
+      "epoch": 1.251808972503618,
+      "grad_norm": 1.569021224975586,
+      "learning_rate": 2.1359613837489946e-06,
+      "loss": 0.8922,
+      "step": 865
+    },
+    {
+      "epoch": 1.2532561505065123,
+      "grad_norm": 1.5763746500015259,
+      "learning_rate": 2.1319388576025747e-06,
+      "loss": 0.9,
+      "step": 866
+    },
+    {
+      "epoch": 1.2547033285094067,
+      "grad_norm": 1.5964343547821045,
+      "learning_rate": 2.127916331456155e-06,
+      "loss": 0.9042,
+      "step": 867
+    },
+    {
+      "epoch": 1.256150506512301,
+      "grad_norm": 1.457443118095398,
+      "learning_rate": 2.1238938053097345e-06,
+      "loss": 0.8802,
+      "step": 868
+    },
+    {
+      "epoch": 1.2575976845151953,
+      "grad_norm": 1.4925570487976074,
+      "learning_rate": 2.1198712791633146e-06,
+      "loss": 0.8771,
+      "step": 869
+    },
+    {
+      "epoch": 1.2590448625180897,
+      "grad_norm": 1.543954849243164,
+      "learning_rate": 2.1158487530168948e-06,
+      "loss": 0.9085,
+      "step": 870
+    },
+    {
+      "epoch": 1.2590448625180897,
+      "eval_loss": 0.993406355381012,
+      "eval_runtime": 25.8004,
+      "eval_samples_per_second": 38.759,
+      "eval_steps_per_second": 2.442,
+      "step": 870
+    },
+    {
+      "epoch": 1.260492040520984,
+      "grad_norm": 1.5981473922729492,
+      "learning_rate": 2.111826226870475e-06,
+      "loss": 0.885,
+      "step": 871
+    },
+    {
+      "epoch": 1.2619392185238785,
+      "grad_norm": 1.5987436771392822,
+      "learning_rate": 2.107803700724055e-06,
+      "loss": 0.8883,
+      "step": 872
+    },
+    {
+      "epoch": 1.2633863965267729,
+      "grad_norm": 1.5223180055618286,
+      "learning_rate": 2.1037811745776347e-06,
+      "loss": 0.8445,
+      "step": 873
+    },
+    {
+      "epoch": 1.2648335745296673,
+      "grad_norm": 1.5502173900604248,
+      "learning_rate": 2.099758648431215e-06,
+      "loss": 0.8799,
+      "step": 874
+    },
+    {
+      "epoch": 1.2662807525325614,
+      "grad_norm": 1.574419379234314,
+      "learning_rate": 2.095736122284795e-06,
+      "loss": 0.8862,
+      "step": 875
+    },
+    {
+      "epoch": 1.2677279305354558,
+      "grad_norm": 1.5151711702346802,
+      "learning_rate": 2.091713596138375e-06,
+      "loss": 0.842,
+      "step": 876
+    },
+    {
+      "epoch": 1.2691751085383502,
+      "grad_norm": 1.5926389694213867,
+      "learning_rate": 2.0876910699919552e-06,
+      "loss": 0.8812,
+      "step": 877
+    },
+    {
+      "epoch": 1.2706222865412446,
+      "grad_norm": 1.5817290544509888,
+      "learning_rate": 2.083668543845535e-06,
+      "loss": 0.8927,
+      "step": 878
+    },
+    {
+      "epoch": 1.2720694645441388,
+      "grad_norm": 1.5659255981445312,
+      "learning_rate": 2.079646017699115e-06,
+      "loss": 0.8973,
+      "step": 879
+    },
+    {
+      "epoch": 1.2735166425470332,
+      "grad_norm": 1.5826165676116943,
+      "learning_rate": 2.075623491552695e-06,
+      "loss": 0.8934,
+      "step": 880
+    },
+    {
+      "epoch": 1.2735166425470332,
+      "eval_loss": 0.994766116142273,
+      "eval_runtime": 25.9544,
+      "eval_samples_per_second": 38.529,
+      "eval_steps_per_second": 2.427,
+      "step": 880
+    },
+    {
+      "epoch": 1.2749638205499276,
+      "grad_norm": 1.5688477754592896,
+      "learning_rate": 2.0716009654062753e-06,
+      "loss": 0.8909,
+      "step": 881
+    },
+    {
+      "epoch": 1.276410998552822,
+      "grad_norm": 1.6281142234802246,
+      "learning_rate": 2.067578439259855e-06,
+      "loss": 0.9109,
+      "step": 882
+    },
+    {
+      "epoch": 1.2778581765557164,
+      "grad_norm": 1.5646711587905884,
+      "learning_rate": 2.063555913113435e-06,
+      "loss": 0.8628,
+      "step": 883
+    },
+    {
+      "epoch": 1.2793053545586108,
+      "grad_norm": 1.5538504123687744,
+      "learning_rate": 2.0595333869670153e-06,
+      "loss": 0.8856,
+      "step": 884
+    },
+    {
+      "epoch": 1.280752532561505,
+      "grad_norm": 1.5453202724456787,
+      "learning_rate": 2.0555108608205954e-06,
+      "loss": 0.8892,
+      "step": 885
+    },
+    {
+      "epoch": 1.2821997105643994,
+      "grad_norm": 1.520643949508667,
+      "learning_rate": 2.0514883346741755e-06,
+      "loss": 0.8811,
+      "step": 886
+    },
+    {
+      "epoch": 1.2836468885672938,
+      "grad_norm": 1.4514366388320923,
+      "learning_rate": 2.0474658085277557e-06,
+      "loss": 0.8871,
+      "step": 887
+    },
+    {
+      "epoch": 1.2850940665701882,
+      "grad_norm": 1.5639816522598267,
+      "learning_rate": 2.043443282381336e-06,
+      "loss": 0.8786,
+      "step": 888
+    },
+    {
+      "epoch": 1.2865412445730824,
+      "grad_norm": 1.5956770181655884,
+      "learning_rate": 2.039420756234916e-06,
+      "loss": 0.8959,
+      "step": 889
+    },
+    {
+      "epoch": 1.2879884225759768,
+      "grad_norm": 1.6308395862579346,
+      "learning_rate": 2.035398230088496e-06,
+      "loss": 0.884,
+      "step": 890
+    },
+    {
+      "epoch": 1.2879884225759768,
+      "eval_loss": 0.9930154085159302,
+      "eval_runtime": 25.8341,
+      "eval_samples_per_second": 38.709,
+      "eval_steps_per_second": 2.439,
+      "step": 890
+    },
+    {
+      "epoch": 1.2894356005788712,
+      "grad_norm": 1.505920171737671,
+      "learning_rate": 2.0313757039420758e-06,
+      "loss": 0.85,
+      "step": 891
+    },
+    {
+      "epoch": 1.2908827785817656,
+      "grad_norm": 1.5757458209991455,
+      "learning_rate": 2.027353177795656e-06,
+      "loss": 0.8968,
+      "step": 892
+    },
+    {
+      "epoch": 1.29232995658466,
+      "grad_norm": 1.5903902053833008,
+      "learning_rate": 2.023330651649236e-06,
+      "loss": 0.8994,
+      "step": 893
+    },
+    {
+      "epoch": 1.2937771345875544,
+      "grad_norm": 1.4955521821975708,
+      "learning_rate": 2.019308125502816e-06,
+      "loss": 0.8982,
+      "step": 894
+    },
+    {
+      "epoch": 1.2952243125904486,
+      "grad_norm": 1.4922794103622437,
+      "learning_rate": 2.0152855993563963e-06,
+      "loss": 0.8778,
+      "step": 895
+    },
+    {
+      "epoch": 1.296671490593343,
+      "grad_norm": 1.5851147174835205,
+      "learning_rate": 2.011263073209976e-06,
+      "loss": 0.8602,
+      "step": 896
+    },
+    {
+      "epoch": 1.2981186685962374,
+      "grad_norm": 1.5002673864364624,
+      "learning_rate": 2.007240547063556e-06,
+      "loss": 0.8868,
+      "step": 897
+    },
+    {
+      "epoch": 1.2995658465991318,
+      "grad_norm": 1.5019080638885498,
+      "learning_rate": 2.0032180209171362e-06,
+      "loss": 0.8955,
+      "step": 898
+    },
+    {
+      "epoch": 1.301013024602026,
+      "grad_norm": 1.544146180152893,
+      "learning_rate": 1.9991954947707164e-06,
+      "loss": 0.8934,
+      "step": 899
+    },
+    {
+      "epoch": 1.3024602026049203,
+      "grad_norm": 1.5680269002914429,
+      "learning_rate": 1.995172968624296e-06,
+      "loss": 0.8784,
+      "step": 900
+    },
+    {
+      "epoch": 1.3024602026049203,
+      "eval_loss": 0.9916625618934631,
+      "eval_runtime": 25.7891,
+      "eval_samples_per_second": 38.776,
+      "eval_steps_per_second": 2.443,
+      "step": 900
+    },
+    {
+      "epoch": 1.3039073806078147,
+      "grad_norm": 1.6523807048797607,
+      "learning_rate": 1.991150442477876e-06,
+      "loss": 0.9072,
+      "step": 901
+    },
+    {
+      "epoch": 1.3053545586107091,
+      "grad_norm": 1.5159372091293335,
+      "learning_rate": 1.9871279163314563e-06,
+      "loss": 0.8983,
+      "step": 902
+    },
+    {
+      "epoch": 1.3068017366136035,
+      "grad_norm": 1.7033772468566895,
+      "learning_rate": 1.9831053901850364e-06,
+      "loss": 0.8733,
+      "step": 903
+    },
+    {
+      "epoch": 1.308248914616498,
+      "grad_norm": 1.5757745504379272,
+      "learning_rate": 1.9790828640386166e-06,
+      "loss": 0.8769,
+      "step": 904
+    },
+    {
+      "epoch": 1.3096960926193921,
+      "grad_norm": 1.5687916278839111,
+      "learning_rate": 1.9750603378921963e-06,
+      "loss": 0.9141,
+      "step": 905
+    },
+    {
+      "epoch": 1.3111432706222865,
+      "grad_norm": 1.6125038862228394,
+      "learning_rate": 1.9710378117457764e-06,
+      "loss": 0.9043,
+      "step": 906
+    },
+    {
+      "epoch": 1.312590448625181,
+      "grad_norm": 1.5606075525283813,
+      "learning_rate": 1.9670152855993565e-06,
+      "loss": 0.8921,
+      "step": 907
+    },
+    {
+      "epoch": 1.3140376266280753,
+      "grad_norm": 1.6325387954711914,
+      "learning_rate": 1.9629927594529367e-06,
+      "loss": 0.8908,
+      "step": 908
+    },
+    {
+      "epoch": 1.3154848046309695,
+      "grad_norm": 1.5046014785766602,
+      "learning_rate": 1.9589702333065168e-06,
+      "loss": 0.8781,
+      "step": 909
+    },
+    {
+      "epoch": 1.316931982633864,
+      "grad_norm": 1.5318217277526855,
+      "learning_rate": 1.9549477071600965e-06,
+      "loss": 0.8553,
+      "step": 910
+    },
+    {
+      "epoch": 1.316931982633864,
+      "eval_loss": 0.9922801852226257,
+      "eval_runtime": 25.6515,
+      "eval_samples_per_second": 38.984,
+      "eval_steps_per_second": 2.456,
+      "step": 910
+    },
+    {
+      "epoch": 1.3183791606367583,
+      "grad_norm": 1.5576587915420532,
+      "learning_rate": 1.9509251810136766e-06,
+      "loss": 0.9139,
+      "step": 911
+    },
+    {
+      "epoch": 1.3198263386396527,
+      "grad_norm": 1.6574680805206299,
+      "learning_rate": 1.9469026548672567e-06,
+      "loss": 0.8644,
+      "step": 912
+    },
+    {
+      "epoch": 1.321273516642547,
+      "grad_norm": 1.6218926906585693,
+      "learning_rate": 1.942880128720837e-06,
+      "loss": 0.8941,
+      "step": 913
+    },
+    {
+      "epoch": 1.3227206946454415,
+      "grad_norm": 1.6747978925704956,
+      "learning_rate": 1.938857602574417e-06,
+      "loss": 0.9131,
+      "step": 914
+    },
+    {
+      "epoch": 1.3241678726483357,
+      "grad_norm": 1.5592243671417236,
+      "learning_rate": 1.9348350764279967e-06,
+      "loss": 0.9109,
+      "step": 915
+    },
+    {
+      "epoch": 1.32561505065123,
+      "grad_norm": 1.5739071369171143,
+      "learning_rate": 1.930812550281577e-06,
+      "loss": 0.8956,
+      "step": 916
+    },
+    {
+      "epoch": 1.3270622286541245,
+      "grad_norm": 1.529422402381897,
+      "learning_rate": 1.926790024135157e-06,
+      "loss": 0.872,
+      "step": 917
+    },
+    {
+      "epoch": 1.3285094066570189,
+      "grad_norm": 1.4759328365325928,
+      "learning_rate": 1.922767497988737e-06,
+      "loss": 0.8646,
+      "step": 918
+    },
+    {
+      "epoch": 1.329956584659913,
+      "grad_norm": 1.5779008865356445,
+      "learning_rate": 1.9187449718423172e-06,
+      "loss": 0.8933,
+      "step": 919
+    },
+    {
+      "epoch": 1.3314037626628075,
+      "grad_norm": 1.5866317749023438,
+      "learning_rate": 1.914722445695897e-06,
+      "loss": 0.9112,
+      "step": 920
+    },
+    {
+      "epoch": 1.3314037626628075,
+      "eval_loss": 0.9918884634971619,
+      "eval_runtime": 26.0278,
+      "eval_samples_per_second": 38.42,
+      "eval_steps_per_second": 2.42,
+      "step": 920
+    },
+    {
+      "epoch": 1.3328509406657019,
+      "grad_norm": 1.5680437088012695,
+      "learning_rate": 1.910699919549477e-06,
+      "loss": 0.9152,
+      "step": 921
+    },
+    {
+      "epoch": 1.3342981186685963,
+      "grad_norm": 1.5756558179855347,
+      "learning_rate": 1.9066773934030572e-06,
+      "loss": 0.9017,
+      "step": 922
+    },
+    {
+      "epoch": 1.3357452966714907,
+      "grad_norm": 1.6703917980194092,
+      "learning_rate": 1.9026548672566373e-06,
+      "loss": 0.8885,
+      "step": 923
+    },
+    {
+      "epoch": 1.337192474674385,
+      "grad_norm": 1.567590355873108,
+      "learning_rate": 1.8986323411102172e-06,
+      "loss": 0.8838,
+      "step": 924
+    },
+    {
+      "epoch": 1.3386396526772792,
+      "grad_norm": 1.5318201780319214,
+      "learning_rate": 1.8946098149637973e-06,
+      "loss": 0.869,
+      "step": 925
+    },
+    {
+      "epoch": 1.3400868306801736,
+      "grad_norm": 1.553001880645752,
+      "learning_rate": 1.8905872888173773e-06,
+      "loss": 0.8781,
+      "step": 926
+    },
+    {
+      "epoch": 1.341534008683068,
+      "grad_norm": 1.5244364738464355,
+      "learning_rate": 1.8865647626709574e-06,
+      "loss": 0.8726,
+      "step": 927
+    },
+    {
+      "epoch": 1.3429811866859624,
+      "grad_norm": 1.4793615341186523,
+      "learning_rate": 1.8825422365245375e-06,
+      "loss": 0.8623,
+      "step": 928
+    },
+    {
+      "epoch": 1.3444283646888566,
+      "grad_norm": 1.5498313903808594,
+      "learning_rate": 1.8785197103781174e-06,
+      "loss": 0.8663,
+      "step": 929
+    },
+    {
+      "epoch": 1.345875542691751,
+      "grad_norm": 1.5433320999145508,
+      "learning_rate": 1.8744971842316978e-06,
+      "loss": 0.8864,
+      "step": 930
+    },
+    {
+      "epoch": 1.345875542691751,
+      "eval_loss": 0.9909474849700928,
+      "eval_runtime": 25.7846,
+      "eval_samples_per_second": 38.783,
+      "eval_steps_per_second": 2.443,
+      "step": 930
+    },
+    {
+      "epoch": 1.3473227206946454,
+      "grad_norm": 1.4593886137008667,
+      "learning_rate": 1.870474658085278e-06,
+      "loss": 0.8686,
+      "step": 931
+    },
+    {
+      "epoch": 1.3487698986975398,
+      "grad_norm": 1.5526374578475952,
+      "learning_rate": 1.8664521319388578e-06,
+      "loss": 0.8927,
+      "step": 932
+    },
+    {
+      "epoch": 1.3502170767004342,
+      "grad_norm": 1.6133079528808594,
+      "learning_rate": 1.862429605792438e-06,
+      "loss": 0.8653,
+      "step": 933
+    },
+    {
+      "epoch": 1.3516642547033286,
+      "grad_norm": 1.4973291158676147,
+      "learning_rate": 1.8584070796460179e-06,
+      "loss": 0.8763,
+      "step": 934
+    },
+    {
+      "epoch": 1.3531114327062228,
+      "grad_norm": 1.5784869194030762,
+      "learning_rate": 1.854384553499598e-06,
+      "loss": 0.9069,
+      "step": 935
+    },
+    {
+      "epoch": 1.3545586107091172,
+      "grad_norm": 1.5758600234985352,
+      "learning_rate": 1.8503620273531781e-06,
+      "loss": 0.9041,
+      "step": 936
+    },
+    {
+      "epoch": 1.3560057887120116,
+      "grad_norm": 1.4842194318771362,
+      "learning_rate": 1.846339501206758e-06,
+      "loss": 0.8912,
+      "step": 937
+    },
+    {
+      "epoch": 1.357452966714906,
+      "grad_norm": 1.522415280342102,
+      "learning_rate": 1.8423169750603382e-06,
+      "loss": 0.8582,
+      "step": 938
+    },
+    {
+      "epoch": 1.3589001447178002,
+      "grad_norm": 1.483718752861023,
+      "learning_rate": 1.838294448913918e-06,
+      "loss": 0.8774,
+      "step": 939
+    },
+    {
+      "epoch": 1.3603473227206946,
+      "grad_norm": 1.4885591268539429,
+      "learning_rate": 1.8342719227674982e-06,
+      "loss": 0.8778,
+      "step": 940
+    },
+    {
+      "epoch": 1.3603473227206946,
+      "eval_loss": 0.9908953905105591,
+      "eval_runtime": 25.6816,
+      "eval_samples_per_second": 38.938,
+      "eval_steps_per_second": 2.453,
+      "step": 940
+    },
+    {
+      "epoch": 1.361794500723589,
+      "grad_norm": 1.5625662803649902,
+      "learning_rate": 1.8302493966210783e-06,
+      "loss": 0.8922,
+      "step": 941
+    },
+    {
+      "epoch": 1.3632416787264834,
+      "grad_norm": 1.5827653408050537,
+      "learning_rate": 1.8262268704746582e-06,
+      "loss": 0.8762,
+      "step": 942
+    },
+    {
+      "epoch": 1.3646888567293778,
+      "grad_norm": 1.532116174697876,
+      "learning_rate": 1.8222043443282384e-06,
+      "loss": 0.8838,
+      "step": 943
+    },
+    {
+      "epoch": 1.3661360347322722,
+      "grad_norm": 1.5727930068969727,
+      "learning_rate": 1.8181818181818183e-06,
+      "loss": 0.8761,
+      "step": 944
+    },
+    {
+      "epoch": 1.3675832127351664,
+      "grad_norm": 1.5438214540481567,
+      "learning_rate": 1.8141592920353984e-06,
+      "loss": 0.8739,
+      "step": 945
+    },
+    {
+      "epoch": 1.3690303907380608,
+      "grad_norm": 1.5137232542037964,
+      "learning_rate": 1.8101367658889785e-06,
+      "loss": 0.885,
+      "step": 946
+    },
+    {
+      "epoch": 1.3704775687409552,
+      "grad_norm": 1.5985186100006104,
+      "learning_rate": 1.8061142397425585e-06,
+      "loss": 0.8967,
+      "step": 947
+    },
+    {
+      "epoch": 1.3719247467438496,
+      "grad_norm": 1.607809066772461,
+      "learning_rate": 1.8020917135961386e-06,
+      "loss": 0.9175,
+      "step": 948
+    },
+    {
+      "epoch": 1.3733719247467437,
+      "grad_norm": 1.5717999935150146,
+      "learning_rate": 1.7980691874497185e-06,
+      "loss": 0.8862,
+      "step": 949
+    },
+    {
+      "epoch": 1.3748191027496381,
+      "grad_norm": 1.490583062171936,
+      "learning_rate": 1.7940466613032986e-06,
+      "loss": 0.8425,
+      "step": 950
+    },
+    {
+      "epoch": 1.3748191027496381,
+      "eval_loss": 0.989203929901123,
+      "eval_runtime": 25.7453,
+      "eval_samples_per_second": 38.842,
+      "eval_steps_per_second": 2.447,
+      "step": 950
+    },
+    {
+      "epoch": 1.3762662807525325,
+      "grad_norm": 1.5401087999343872,
+      "learning_rate": 1.7900241351568788e-06,
+      "loss": 0.8418,
+      "step": 951
+    },
+    {
+      "epoch": 1.377713458755427,
+      "grad_norm": 1.527870774269104,
+      "learning_rate": 1.7860016090104587e-06,
+      "loss": 0.8591,
+      "step": 952
+    },
+    {
+      "epoch": 1.3791606367583213,
+      "grad_norm": 1.5778286457061768,
+      "learning_rate": 1.7819790828640388e-06,
+      "loss": 0.8959,
+      "step": 953
+    },
+    {
+      "epoch": 1.3806078147612157,
+      "grad_norm": 1.5266025066375732,
+      "learning_rate": 1.7779565567176187e-06,
+      "loss": 0.8449,
+      "step": 954
+    },
+    {
+      "epoch": 1.38205499276411,
+      "grad_norm": 1.508654236793518,
+      "learning_rate": 1.7739340305711988e-06,
+      "loss": 0.8578,
+      "step": 955
+    },
+    {
+      "epoch": 1.3835021707670043,
+      "grad_norm": 1.515903353691101,
+      "learning_rate": 1.769911504424779e-06,
+      "loss": 0.8801,
+      "step": 956
+    },
+    {
+      "epoch": 1.3849493487698987,
+      "grad_norm": 1.4178016185760498,
+      "learning_rate": 1.7658889782783589e-06,
+      "loss": 0.8601,
+      "step": 957
+    },
+    {
+      "epoch": 1.3863965267727931,
+      "grad_norm": 1.5548313856124878,
+      "learning_rate": 1.761866452131939e-06,
+      "loss": 0.8874,
+      "step": 958
+    },
+    {
+      "epoch": 1.3878437047756873,
+      "grad_norm": 1.6333560943603516,
+      "learning_rate": 1.757843925985519e-06,
+      "loss": 0.874,
+      "step": 959
+    },
+    {
+      "epoch": 1.3892908827785817,
+      "grad_norm": 1.6060621738433838,
+      "learning_rate": 1.753821399839099e-06,
+      "loss": 0.8987,
+      "step": 960
+    },
+    {
+      "epoch": 1.3892908827785817,
+      "eval_loss": 0.9903543591499329,
+      "eval_runtime": 25.6673,
+      "eval_samples_per_second": 38.96,
+      "eval_steps_per_second": 2.454,
+      "step": 960
+    },
+    {
+      "epoch": 1.390738060781476,
+      "grad_norm": 1.526719331741333,
+      "learning_rate": 1.7497988736926792e-06,
+      "loss": 0.9042,
+      "step": 961
+    },
+    {
+      "epoch": 1.3921852387843705,
+      "grad_norm": 1.654146671295166,
+      "learning_rate": 1.7457763475462591e-06,
+      "loss": 0.8697,
+      "step": 962
+    },
+    {
+      "epoch": 1.393632416787265,
+      "grad_norm": 1.5884320735931396,
+      "learning_rate": 1.7417538213998392e-06,
+      "loss": 0.8967,
+      "step": 963
+    },
+    {
+      "epoch": 1.3950795947901593,
+      "grad_norm": 1.5201061964035034,
+      "learning_rate": 1.7377312952534192e-06,
+      "loss": 0.9149,
+      "step": 964
+    },
+    {
+      "epoch": 1.3965267727930535,
+      "grad_norm": 1.4852148294448853,
+      "learning_rate": 1.7337087691069993e-06,
+      "loss": 0.8937,
+      "step": 965
+    },
+    {
+      "epoch": 1.3979739507959479,
+      "grad_norm": 1.4756934642791748,
+      "learning_rate": 1.7296862429605792e-06,
+      "loss": 0.8906,
+      "step": 966
+    },
+    {
+      "epoch": 1.3994211287988423,
+      "grad_norm": 1.4979703426361084,
+      "learning_rate": 1.7256637168141593e-06,
+      "loss": 0.8766,
+      "step": 967
+    },
+    {
+      "epoch": 1.4008683068017367,
+      "grad_norm": 1.4638196229934692,
+      "learning_rate": 1.7216411906677395e-06,
+      "loss": 0.8316,
+      "step": 968
+    },
+    {
+      "epoch": 1.4023154848046309,
+      "grad_norm": 1.4833625555038452,
+      "learning_rate": 1.7176186645213194e-06,
+      "loss": 0.8643,
+      "step": 969
+    },
+    {
+      "epoch": 1.4037626628075253,
+      "grad_norm": 1.6497318744659424,
+      "learning_rate": 1.7135961383748995e-06,
+      "loss": 0.9124,
+      "step": 970
+    },
+    {
+      "epoch": 1.4037626628075253,
+      "eval_loss": 0.9889197945594788,
+      "eval_runtime": 26.0218,
+      "eval_samples_per_second": 38.429,
+      "eval_steps_per_second": 2.421,
+      "step": 970
+    },
+    {
+      "epoch": 1.4052098408104197,
+      "grad_norm": 1.4444023370742798,
+      "learning_rate": 1.7095736122284794e-06,
+      "loss": 0.8733,
+      "step": 971
+    },
+    {
+      "epoch": 1.406657018813314,
+      "grad_norm": 1.5876818895339966,
+      "learning_rate": 1.7055510860820595e-06,
+      "loss": 0.8442,
+      "step": 972
+    },
+    {
+      "epoch": 1.4081041968162085,
+      "grad_norm": 1.6315959692001343,
+      "learning_rate": 1.7015285599356397e-06,
+      "loss": 0.8869,
+      "step": 973
+    },
+    {
+      "epoch": 1.4095513748191029,
+      "grad_norm": 1.6600921154022217,
+      "learning_rate": 1.6975060337892196e-06,
+      "loss": 0.9176,
+      "step": 974
+    },
+    {
+      "epoch": 1.410998552821997,
+      "grad_norm": 1.4976136684417725,
+      "learning_rate": 1.6934835076428e-06,
+      "loss": 0.8663,
+      "step": 975
+    },
+    {
+      "epoch": 1.4124457308248914,
+      "grad_norm": 1.5567373037338257,
+      "learning_rate": 1.68946098149638e-06,
+      "loss": 0.9061,
+      "step": 976
+    },
+    {
+      "epoch": 1.4138929088277858,
+      "grad_norm": 1.483586311340332,
+      "learning_rate": 1.68543845534996e-06,
+      "loss": 0.8904,
+      "step": 977
+    },
+    {
+      "epoch": 1.4153400868306802,
+      "grad_norm": 1.5179016590118408,
+      "learning_rate": 1.68141592920354e-06,
+      "loss": 0.8972,
+      "step": 978
+    },
+    {
+      "epoch": 1.4167872648335744,
+      "grad_norm": 1.4706259965896606,
+      "learning_rate": 1.6773934030571202e-06,
+      "loss": 0.8727,
+      "step": 979
+    },
+    {
+      "epoch": 1.4182344428364688,
+      "grad_norm": 1.6346557140350342,
+      "learning_rate": 1.6733708769107001e-06,
+      "loss": 0.9173,
+      "step": 980
+    },
+    {
+      "epoch": 1.4182344428364688,
+      "eval_loss": 0.9879488348960876,
+      "eval_runtime": 25.8173,
+      "eval_samples_per_second": 38.734,
+      "eval_steps_per_second": 2.44,
+      "step": 980
+    },
+    {
+      "epoch": 1.4196816208393632,
+      "grad_norm": 1.6798738241195679,
+      "learning_rate": 1.6693483507642803e-06,
+      "loss": 0.8445,
+      "step": 981
+    },
+    {
+      "epoch": 1.4211287988422576,
+      "grad_norm": 1.5508772134780884,
+      "learning_rate": 1.6653258246178602e-06,
+      "loss": 0.8713,
+      "step": 982
+    },
+    {
+      "epoch": 1.422575976845152,
+      "grad_norm": 1.557064414024353,
+      "learning_rate": 1.6613032984714403e-06,
+      "loss": 0.8596,
+      "step": 983
+    },
+    {
+      "epoch": 1.4240231548480464,
+      "grad_norm": 1.6628038883209229,
+      "learning_rate": 1.6572807723250202e-06,
+      "loss": 0.9002,
+      "step": 984
+    },
+    {
+      "epoch": 1.4254703328509406,
+      "grad_norm": 1.606233835220337,
+      "learning_rate": 1.6532582461786004e-06,
+      "loss": 0.8555,
+      "step": 985
+    },
+    {
+      "epoch": 1.426917510853835,
+      "grad_norm": 1.694331407546997,
+      "learning_rate": 1.6492357200321805e-06,
+      "loss": 0.8903,
+      "step": 986
+    },
+    {
+      "epoch": 1.4283646888567294,
+      "grad_norm": 1.5641584396362305,
+      "learning_rate": 1.6452131938857604e-06,
+      "loss": 0.8864,
+      "step": 987
+    },
+    {
+      "epoch": 1.4298118668596238,
+      "grad_norm": 1.524124264717102,
+      "learning_rate": 1.6411906677393405e-06,
+      "loss": 0.8899,
+      "step": 988
+    },
+    {
+      "epoch": 1.431259044862518,
+      "grad_norm": 1.5322781801223755,
+      "learning_rate": 1.6371681415929204e-06,
+      "loss": 0.8723,
+      "step": 989
+    },
+    {
+      "epoch": 1.4327062228654124,
+      "grad_norm": 1.5665675401687622,
+      "learning_rate": 1.6331456154465006e-06,
+      "loss": 0.9193,
+      "step": 990
+    },
+    {
+      "epoch": 1.4327062228654124,
+      "eval_loss": 0.9871683120727539,
+      "eval_runtime": 25.7573,
+      "eval_samples_per_second": 38.824,
+      "eval_steps_per_second": 2.446,
+      "step": 990
+    },
+    {
+      "epoch": 1.4341534008683068,
+      "grad_norm": 1.5333694219589233,
+      "learning_rate": 1.6291230893000807e-06,
+      "loss": 0.8669,
+      "step": 991
+    },
+    {
+      "epoch": 1.4356005788712012,
+      "grad_norm": 1.4930754899978638,
+      "learning_rate": 1.6251005631536606e-06,
+      "loss": 0.8744,
+      "step": 992
+    },
+    {
+      "epoch": 1.4370477568740956,
+      "grad_norm": 1.5360321998596191,
+      "learning_rate": 1.6210780370072407e-06,
+      "loss": 0.8883,
+      "step": 993
+    },
+    {
+      "epoch": 1.43849493487699,
+      "grad_norm": 1.568663477897644,
+      "learning_rate": 1.6170555108608207e-06,
+      "loss": 0.8927,
+      "step": 994
+    },
+    {
+      "epoch": 1.4399421128798842,
+      "grad_norm": 1.62112295627594,
+      "learning_rate": 1.6130329847144008e-06,
+      "loss": 0.9045,
+      "step": 995
+    },
+    {
+      "epoch": 1.4413892908827786,
+      "grad_norm": 1.5460090637207031,
+      "learning_rate": 1.609010458567981e-06,
+      "loss": 0.9154,
+      "step": 996
+    },
+    {
+      "epoch": 1.442836468885673,
+      "grad_norm": 1.470872402191162,
+      "learning_rate": 1.6049879324215608e-06,
+      "loss": 0.861,
+      "step": 997
+    },
+    {
+      "epoch": 1.4442836468885674,
+      "grad_norm": 1.552982211112976,
+      "learning_rate": 1.600965406275141e-06,
+      "loss": 0.8359,
+      "step": 998
+    },
+    {
+      "epoch": 1.4457308248914615,
+      "grad_norm": 1.5416020154953003,
+      "learning_rate": 1.5969428801287209e-06,
+      "loss": 0.885,
+      "step": 999
+    },
+    {
+      "epoch": 1.447178002894356,
+      "grad_norm": 1.5289608240127563,
+      "learning_rate": 1.592920353982301e-06,
+      "loss": 0.8767,
+      "step": 1000
+    },
+    {
+      "epoch": 1.447178002894356,
+      "eval_loss": 0.9877498745918274,
+      "eval_runtime": 26.065,
+      "eval_samples_per_second": 38.366,
+      "eval_steps_per_second": 2.417,
+      "step": 1000
+    },
+    {
+      "epoch": 1.4486251808972503,
+      "grad_norm": 1.5533311367034912,
+      "learning_rate": 1.5888978278358811e-06,
+      "loss": 0.837,
+      "step": 1001
+    },
+    {
+      "epoch": 1.4500723589001447,
+      "grad_norm": 1.605147361755371,
+      "learning_rate": 1.584875301689461e-06,
+      "loss": 0.8792,
+      "step": 1002
+    },
+    {
+      "epoch": 1.4515195369030391,
+      "grad_norm": 1.6621363162994385,
+      "learning_rate": 1.5808527755430412e-06,
+      "loss": 0.8586,
+      "step": 1003
+    },
+    {
+      "epoch": 1.4529667149059335,
+      "grad_norm": 1.5832258462905884,
+      "learning_rate": 1.576830249396621e-06,
+      "loss": 0.915,
+      "step": 1004
+    },
+    {
+      "epoch": 1.4544138929088277,
+      "grad_norm": 1.4973900318145752,
+      "learning_rate": 1.5728077232502012e-06,
+      "loss": 0.8703,
+      "step": 1005
+    },
+    {
+      "epoch": 1.4558610709117221,
+      "grad_norm": 1.5472540855407715,
+      "learning_rate": 1.5687851971037813e-06,
+      "loss": 0.8971,
+      "step": 1006
+    },
+    {
+      "epoch": 1.4573082489146165,
+      "grad_norm": 1.6117088794708252,
+      "learning_rate": 1.5647626709573613e-06,
+      "loss": 0.8401,
+      "step": 1007
+    },
+    {
+      "epoch": 1.458755426917511,
+      "grad_norm": 1.5230553150177002,
+      "learning_rate": 1.5607401448109414e-06,
+      "loss": 0.8998,
+      "step": 1008
+    },
+    {
+      "epoch": 1.460202604920405,
+      "grad_norm": 1.5158506631851196,
+      "learning_rate": 1.5567176186645213e-06,
+      "loss": 0.8469,
+      "step": 1009
+    },
+    {
+      "epoch": 1.4616497829232995,
+      "grad_norm": 1.6470974683761597,
+      "learning_rate": 1.5526950925181014e-06,
+      "loss": 0.8783,
+      "step": 1010
+    },
+    {
+      "epoch": 1.4616497829232995,
+      "eval_loss": 0.9876880049705505,
+      "eval_runtime": 25.7522,
+      "eval_samples_per_second": 38.832,
+      "eval_steps_per_second": 2.446,
+      "step": 1010
+    },
+    {
+      "epoch": 1.463096960926194,
+      "grad_norm": 1.6227943897247314,
+      "learning_rate": 1.5486725663716816e-06,
+      "loss": 0.8993,
+      "step": 1011
+    },
+    {
+      "epoch": 1.4645441389290883,
+      "grad_norm": 1.558890700340271,
+      "learning_rate": 1.5446500402252615e-06,
+      "loss": 0.8601,
+      "step": 1012
+    },
+    {
+      "epoch": 1.4659913169319827,
+      "grad_norm": 1.5884944200515747,
+      "learning_rate": 1.5406275140788416e-06,
+      "loss": 0.8886,
+      "step": 1013
+    },
+    {
+      "epoch": 1.467438494934877,
+      "grad_norm": 1.463099479675293,
+      "learning_rate": 1.5366049879324215e-06,
+      "loss": 0.8625,
+      "step": 1014
+    },
+    {
+      "epoch": 1.4688856729377713,
+      "grad_norm": 1.5330357551574707,
+      "learning_rate": 1.5325824617860016e-06,
+      "loss": 0.8759,
+      "step": 1015
+    },
+    {
+      "epoch": 1.4703328509406657,
+      "grad_norm": 1.5273404121398926,
+      "learning_rate": 1.5285599356395816e-06,
+      "loss": 0.9061,
+      "step": 1016
+    },
+    {
+      "epoch": 1.47178002894356,
+      "grad_norm": 1.5751070976257324,
+      "learning_rate": 1.5245374094931617e-06,
+      "loss": 0.9084,
+      "step": 1017
+    },
+    {
+      "epoch": 1.4732272069464545,
+      "grad_norm": 1.5140705108642578,
+      "learning_rate": 1.5205148833467418e-06,
+      "loss": 0.8522,
+      "step": 1018
+    },
+    {
+      "epoch": 1.4746743849493487,
+      "grad_norm": 1.5287972688674927,
+      "learning_rate": 1.5164923572003221e-06,
+      "loss": 0.8384,
+      "step": 1019
+    },
+    {
+      "epoch": 1.476121562952243,
+      "grad_norm": 1.5287972688674927,
+      "learning_rate": 1.5164923572003221e-06,
+      "loss": 0.929,
+      "step": 1020
+    },
+    {
+      "epoch": 1.476121562952243,
+      "eval_loss": 0.9861629605293274,
+      "eval_runtime": 25.9299,
+      "eval_samples_per_second": 38.566,
+      "eval_steps_per_second": 2.43,
+      "step": 1020
+    },
+    {
+      "epoch": 1.4775687409551375,
+      "grad_norm": 1.6659374237060547,
+      "learning_rate": 1.512469831053902e-06,
+      "loss": 0.8733,
+      "step": 1021
+    },
+    {
+      "epoch": 1.4790159189580319,
+      "grad_norm": 1.5888694524765015,
+      "learning_rate": 1.5084473049074822e-06,
+      "loss": 0.8927,
+      "step": 1022
+    },
+    {
+      "epoch": 1.4804630969609263,
+      "grad_norm": 1.5705657005310059,
+      "learning_rate": 1.5044247787610621e-06,
+      "loss": 0.8729,
+      "step": 1023
+    },
+    {
+      "epoch": 1.4819102749638207,
+      "grad_norm": 1.5564839839935303,
+      "learning_rate": 1.5004022526146422e-06,
+      "loss": 0.9164,
+      "step": 1024
+    },
+    {
+      "epoch": 1.4833574529667148,
+      "grad_norm": 1.5336191654205322,
+      "learning_rate": 1.4963797264682224e-06,
+      "loss": 0.8673,
+      "step": 1025
+    },
+    {
+      "epoch": 1.4848046309696092,
+      "grad_norm": 1.5255659818649292,
+      "learning_rate": 1.4923572003218023e-06,
+      "loss": 0.8689,
+      "step": 1026
+    },
+    {
+      "epoch": 1.4862518089725036,
+      "grad_norm": 1.5760184526443481,
+      "learning_rate": 1.4883346741753824e-06,
+      "loss": 0.8825,
+      "step": 1027
+    },
+    {
+      "epoch": 1.487698986975398,
+      "grad_norm": 1.5247989892959595,
+      "learning_rate": 1.4843121480289623e-06,
+      "loss": 0.8774,
+      "step": 1028
+    },
+    {
+      "epoch": 1.4891461649782922,
+      "grad_norm": 1.5857770442962646,
+      "learning_rate": 1.4802896218825425e-06,
+      "loss": 0.8552,
+      "step": 1029
+    },
+    {
+      "epoch": 1.4905933429811866,
+      "grad_norm": 1.5850846767425537,
+      "learning_rate": 1.4762670957361226e-06,
+      "loss": 0.8623,
+      "step": 1030
+    },
+    {
+      "epoch": 1.4905933429811866,
+      "eval_loss": 0.9863028526306152,
+      "eval_runtime": 25.8299,
+      "eval_samples_per_second": 38.715,
+      "eval_steps_per_second": 2.439,
+      "step": 1030
+    },
+    {
+      "epoch": 1.492040520984081,
+      "grad_norm": 1.5669705867767334,
+      "learning_rate": 1.4722445695897025e-06,
+      "loss": 0.915,
+      "step": 1031
+    },
+    {
+      "epoch": 1.4934876989869754,
+      "grad_norm": 1.5132821798324585,
+      "learning_rate": 1.4682220434432826e-06,
+      "loss": 0.8721,
+      "step": 1032
+    },
+    {
+      "epoch": 1.4949348769898698,
+      "grad_norm": 1.519844651222229,
+      "learning_rate": 1.4641995172968625e-06,
+      "loss": 0.8907,
+      "step": 1033
+    },
+    {
+      "epoch": 1.4963820549927642,
+      "grad_norm": 1.488808512687683,
+      "learning_rate": 1.4601769911504427e-06,
+      "loss": 0.854,
+      "step": 1034
+    },
+    {
+      "epoch": 1.4978292329956584,
+      "grad_norm": 1.5658503770828247,
+      "learning_rate": 1.4561544650040226e-06,
+      "loss": 0.8855,
+      "step": 1035
+    },
+    {
+      "epoch": 1.4992764109985528,
+      "grad_norm": 1.5756577253341675,
+      "learning_rate": 1.4521319388576027e-06,
+      "loss": 0.8593,
+      "step": 1036
+    },
+    {
+      "epoch": 1.5007235890014472,
+      "grad_norm": 1.6409657001495361,
+      "learning_rate": 1.4481094127111828e-06,
+      "loss": 0.887,
+      "step": 1037
+    },
+    {
+      "epoch": 1.5021707670043414,
+      "grad_norm": 1.5633593797683716,
+      "learning_rate": 1.4440868865647628e-06,
+      "loss": 0.8393,
+      "step": 1038
+    },
+    {
+      "epoch": 1.5036179450072358,
+      "grad_norm": 1.5313143730163574,
+      "learning_rate": 1.4400643604183429e-06,
+      "loss": 0.8633,
+      "step": 1039
+    },
+    {
+      "epoch": 1.5050651230101302,
+      "grad_norm": 1.6449614763259888,
+      "learning_rate": 1.4360418342719228e-06,
+      "loss": 0.8755,
+      "step": 1040
+    },
+    {
+      "epoch": 1.5050651230101302,
+      "eval_loss": 0.985752284526825,
+      "eval_runtime": 25.9811,
+      "eval_samples_per_second": 38.49,
+      "eval_steps_per_second": 2.425,
+      "step": 1040
+    },
+    {
+      "epoch": 1.5065123010130246,
+      "grad_norm": 1.5782673358917236,
+      "learning_rate": 1.432019308125503e-06,
+      "loss": 0.8965,
+      "step": 1041
+    },
+    {
+      "epoch": 1.507959479015919,
+      "grad_norm": 1.5749887228012085,
+      "learning_rate": 1.427996781979083e-06,
+      "loss": 0.8864,
+      "step": 1042
+    },
+    {
+      "epoch": 1.5094066570188134,
+      "grad_norm": 1.4836838245391846,
+      "learning_rate": 1.423974255832663e-06,
+      "loss": 0.8881,
+      "step": 1043
+    },
+    {
+      "epoch": 1.5108538350217078,
+      "grad_norm": 1.5988322496414185,
+      "learning_rate": 1.419951729686243e-06,
+      "loss": 0.8877,
+      "step": 1044
+    },
+    {
+      "epoch": 1.5123010130246022,
+      "grad_norm": 1.5654700994491577,
+      "learning_rate": 1.415929203539823e-06,
+      "loss": 0.8933,
+      "step": 1045
+    },
+    {
+      "epoch": 1.5137481910274964,
+      "grad_norm": 1.462059497833252,
+      "learning_rate": 1.4119066773934031e-06,
+      "loss": 0.8677,
+      "step": 1046
+    },
+    {
+      "epoch": 1.5151953690303908,
+      "grad_norm": 1.4891657829284668,
+      "learning_rate": 1.4078841512469833e-06,
+      "loss": 0.8486,
+      "step": 1047
+    },
+    {
+      "epoch": 1.516642547033285,
+      "grad_norm": 1.6066802740097046,
+      "learning_rate": 1.4038616251005632e-06,
+      "loss": 0.8887,
+      "step": 1048
+    },
+    {
+      "epoch": 1.5180897250361793,
+      "grad_norm": 1.5902212858200073,
+      "learning_rate": 1.3998390989541433e-06,
+      "loss": 0.9109,
+      "step": 1049
+    },
+    {
+      "epoch": 1.5195369030390737,
+      "grad_norm": 1.545209527015686,
+      "learning_rate": 1.3958165728077232e-06,
+      "loss": 0.8758,
+      "step": 1050
+    },
+    {
+      "epoch": 1.5195369030390737,
+      "eval_loss": 0.9853872060775757,
+      "eval_runtime": 25.8212,
+      "eval_samples_per_second": 38.728,
+      "eval_steps_per_second": 2.44,
+      "step": 1050
+    },
+    {
+      "epoch": 1.5209840810419681,
+      "grad_norm": 1.5264979600906372,
+      "learning_rate": 1.3917940466613034e-06,
+      "loss": 0.8628,
+      "step": 1051
+    },
+    {
+      "epoch": 1.5224312590448625,
+      "grad_norm": 1.5595197677612305,
+      "learning_rate": 1.3877715205148835e-06,
+      "loss": 0.885,
+      "step": 1052
+    },
+    {
+      "epoch": 1.523878437047757,
+      "grad_norm": 1.6068772077560425,
+      "learning_rate": 1.3837489943684634e-06,
+      "loss": 0.902,
+      "step": 1053
+    },
+    {
+      "epoch": 1.5253256150506513,
+      "grad_norm": 1.7956867218017578,
+      "learning_rate": 1.3797264682220435e-06,
+      "loss": 0.9056,
+      "step": 1054
+    },
+    {
+      "epoch": 1.5267727930535457,
+      "grad_norm": 1.6634644269943237,
+      "learning_rate": 1.3757039420756234e-06,
+      "loss": 0.8778,
+      "step": 1055
+    },
+    {
+      "epoch": 1.52821997105644,
+      "grad_norm": 1.535996437072754,
+      "learning_rate": 1.3716814159292036e-06,
+      "loss": 0.8736,
+      "step": 1056
+    },
+    {
+      "epoch": 1.5296671490593343,
+      "grad_norm": 1.5196404457092285,
+      "learning_rate": 1.3676588897827837e-06,
+      "loss": 0.8596,
+      "step": 1057
+    },
+    {
+      "epoch": 1.5311143270622285,
+      "grad_norm": 1.5545907020568848,
+      "learning_rate": 1.3636363636363636e-06,
+      "loss": 0.8605,
+      "step": 1058
+    },
+    {
+      "epoch": 1.532561505065123,
+      "grad_norm": 1.5114647150039673,
+      "learning_rate": 1.3596138374899437e-06,
+      "loss": 0.8693,
+      "step": 1059
+    },
+    {
+      "epoch": 1.5340086830680173,
+      "grad_norm": 1.587093710899353,
+      "learning_rate": 1.3555913113435237e-06,
+      "loss": 0.867,
+      "step": 1060
+    },
+    {
+      "epoch": 1.5340086830680173,
+      "eval_loss": 0.9839922189712524,
+      "eval_runtime": 25.7893,
+      "eval_samples_per_second": 38.776,
+      "eval_steps_per_second": 2.443,
+      "step": 1060
+    },
+    {
+      "epoch": 1.5354558610709117,
+      "grad_norm": 1.618294358253479,
+      "learning_rate": 1.3515687851971038e-06,
+      "loss": 0.8773,
+      "step": 1061
+    },
+    {
+      "epoch": 1.536903039073806,
+      "grad_norm": 1.6108827590942383,
+      "learning_rate": 1.347546259050684e-06,
+      "loss": 0.8939,
+      "step": 1062
+    },
+    {
+      "epoch": 1.5383502170767005,
+      "grad_norm": 1.555820345878601,
+      "learning_rate": 1.3435237329042638e-06,
+      "loss": 0.864,
+      "step": 1063
+    },
+    {
+      "epoch": 1.539797395079595,
+      "grad_norm": 1.5706232786178589,
+      "learning_rate": 1.339501206757844e-06,
+      "loss": 0.8698,
+      "step": 1064
+    },
+    {
+      "epoch": 1.5412445730824893,
+      "grad_norm": 1.6310842037200928,
+      "learning_rate": 1.3354786806114243e-06,
+      "loss": 0.8755,
+      "step": 1065
+    },
+    {
+      "epoch": 1.5426917510853835,
+      "grad_norm": 1.602475881576538,
+      "learning_rate": 1.3314561544650042e-06,
+      "loss": 0.8974,
+      "step": 1066
+    },
+    {
+      "epoch": 1.5441389290882779,
+      "grad_norm": 1.5371959209442139,
+      "learning_rate": 1.3274336283185843e-06,
+      "loss": 0.8691,
+      "step": 1067
+    },
+    {
+      "epoch": 1.545586107091172,
+      "grad_norm": 1.6384830474853516,
+      "learning_rate": 1.3234111021721643e-06,
+      "loss": 0.8818,
+      "step": 1068
+    },
+    {
+      "epoch": 1.5470332850940665,
+      "grad_norm": 1.5125739574432373,
+      "learning_rate": 1.3193885760257444e-06,
+      "loss": 0.8694,
+      "step": 1069
+    },
+    {
+      "epoch": 1.5484804630969609,
+      "grad_norm": 1.5838148593902588,
+      "learning_rate": 1.3153660498793245e-06,
+      "loss": 0.8898,
+      "step": 1070
+    },
+    {
+      "epoch": 1.5484804630969609,
+      "eval_loss": 0.9824967384338379,
+      "eval_runtime": 25.9096,
+      "eval_samples_per_second": 38.596,
+      "eval_steps_per_second": 2.432,
+      "step": 1070
+    },
+    {
+      "epoch": 1.5499276410998553,
+      "grad_norm": 1.6065926551818848,
+      "learning_rate": 1.3113435237329044e-06,
+      "loss": 0.8909,
+      "step": 1071
+    },
+    {
+      "epoch": 1.5513748191027497,
+      "grad_norm": 1.5557194948196411,
+      "learning_rate": 1.3073209975864846e-06,
+      "loss": 0.8545,
+      "step": 1072
+    },
+    {
+      "epoch": 1.552821997105644,
+      "grad_norm": 1.6023532152175903,
+      "learning_rate": 1.3032984714400645e-06,
+      "loss": 0.8724,
+      "step": 1073
+    },
+    {
+      "epoch": 1.5542691751085385,
+      "grad_norm": 1.5924384593963623,
+      "learning_rate": 1.2992759452936446e-06,
+      "loss": 0.8845,
+      "step": 1074
+    },
+    {
+      "epoch": 1.5557163531114329,
+      "grad_norm": 1.6290538311004639,
+      "learning_rate": 1.2952534191472247e-06,
+      "loss": 0.878,
+      "step": 1075
+    },
+    {
+      "epoch": 1.557163531114327,
+      "grad_norm": 1.6020493507385254,
+      "learning_rate": 1.2912308930008046e-06,
+      "loss": 0.8564,
+      "step": 1076
+    },
+    {
+      "epoch": 1.5586107091172214,
+      "grad_norm": 1.6875399351119995,
+      "learning_rate": 1.2872083668543848e-06,
+      "loss": 0.8443,
+      "step": 1077
+    },
+    {
+      "epoch": 1.5600578871201156,
+      "grad_norm": 1.662969708442688,
+      "learning_rate": 1.2831858407079647e-06,
+      "loss": 0.8879,
+      "step": 1078
+    },
+    {
+      "epoch": 1.56150506512301,
+      "grad_norm": 1.5883301496505737,
+      "learning_rate": 1.2791633145615448e-06,
+      "loss": 0.841,
+      "step": 1079
+    },
+    {
+      "epoch": 1.5629522431259044,
+      "grad_norm": 1.564860224723816,
+      "learning_rate": 1.275140788415125e-06,
+      "loss": 0.8378,
+      "step": 1080
+    },
+    {
+      "epoch": 1.5629522431259044,
+      "eval_loss": 0.9821707606315613,
+      "eval_runtime": 25.8456,
+      "eval_samples_per_second": 38.691,
+      "eval_steps_per_second": 2.438,
+      "step": 1080
+    },
+    {
+      "epoch": 1.5643994211287988,
+      "grad_norm": 1.690390944480896,
+      "learning_rate": 1.2711182622687049e-06,
+      "loss": 0.9134,
+      "step": 1081
+    },
+    {
+      "epoch": 1.5658465991316932,
+      "grad_norm": 1.5780271291732788,
+      "learning_rate": 1.267095736122285e-06,
+      "loss": 0.8815,
+      "step": 1082
+    },
+    {
+      "epoch": 1.5672937771345876,
+      "grad_norm": 1.6903564929962158,
+      "learning_rate": 1.263073209975865e-06,
+      "loss": 0.8806,
+      "step": 1083
+    },
+    {
+      "epoch": 1.568740955137482,
+      "grad_norm": 1.5610105991363525,
+      "learning_rate": 1.259050683829445e-06,
+      "loss": 0.9031,
+      "step": 1084
+    },
+    {
+      "epoch": 1.5701881331403764,
+      "grad_norm": 1.463962435722351,
+      "learning_rate": 1.2550281576830252e-06,
+      "loss": 0.8491,
+      "step": 1085
+    },
+    {
+      "epoch": 1.5716353111432706,
+      "grad_norm": 1.5153679847717285,
+      "learning_rate": 1.251005631536605e-06,
+      "loss": 0.866,
+      "step": 1086
+    },
+    {
+      "epoch": 1.573082489146165,
+      "grad_norm": 1.6320936679840088,
+      "learning_rate": 1.2469831053901852e-06,
+      "loss": 0.8712,
+      "step": 1087
+    },
+    {
+      "epoch": 1.5745296671490592,
+      "grad_norm": 1.5522252321243286,
+      "learning_rate": 1.2429605792437651e-06,
+      "loss": 0.8937,
+      "step": 1088
+    },
+    {
+      "epoch": 1.5759768451519536,
+      "grad_norm": 1.5487126111984253,
+      "learning_rate": 1.2389380530973452e-06,
+      "loss": 0.931,
+      "step": 1089
+    },
+    {
+      "epoch": 1.577424023154848,
+      "grad_norm": 1.602441430091858,
+      "learning_rate": 1.2349155269509252e-06,
+      "loss": 0.8811,
+      "step": 1090
+    },
+    {
+      "epoch": 1.577424023154848,
+      "eval_loss": 0.982520341873169,
+      "eval_runtime": 25.7693,
+      "eval_samples_per_second": 38.806,
+      "eval_steps_per_second": 2.445,
+      "step": 1090
+    },
+    {
+      "epoch": 1.5788712011577424,
+      "grad_norm": 1.633206844329834,
+      "learning_rate": 1.2308930008045053e-06,
+      "loss": 0.8697,
+      "step": 1091
+    },
+    {
+      "epoch": 1.5803183791606368,
+      "grad_norm": 1.5206350088119507,
+      "learning_rate": 1.2268704746580854e-06,
+      "loss": 0.8645,
+      "step": 1092
+    },
+    {
+      "epoch": 1.5817655571635312,
+      "grad_norm": 1.5659570693969727,
+      "learning_rate": 1.2228479485116653e-06,
+      "loss": 0.8569,
+      "step": 1093
+    },
+    {
+      "epoch": 1.5832127351664256,
+      "grad_norm": 1.5603435039520264,
+      "learning_rate": 1.2188254223652455e-06,
+      "loss": 0.8741,
+      "step": 1094
+    },
+    {
+      "epoch": 1.58465991316932,
+      "grad_norm": 1.5861403942108154,
+      "learning_rate": 1.2148028962188254e-06,
+      "loss": 0.9042,
+      "step": 1095
+    },
+    {
+      "epoch": 1.5861070911722142,
+      "grad_norm": 1.7639175653457642,
+      "learning_rate": 1.2107803700724055e-06,
+      "loss": 0.8556,
+      "step": 1096
+    },
+    {
+      "epoch": 1.5875542691751086,
+      "grad_norm": 1.5393552780151367,
+      "learning_rate": 1.2067578439259856e-06,
+      "loss": 0.8267,
+      "step": 1097
+    },
+    {
+      "epoch": 1.5890014471780027,
+      "grad_norm": 1.5538761615753174,
+      "learning_rate": 1.2027353177795658e-06,
+      "loss": 0.8574,
+      "step": 1098
+    },
+    {
+      "epoch": 1.5904486251808971,
+      "grad_norm": 1.6127047538757324,
+      "learning_rate": 1.1987127916331457e-06,
+      "loss": 0.8667,
+      "step": 1099
+    },
+    {
+      "epoch": 1.5918958031837915,
+      "grad_norm": 1.6033275127410889,
+      "learning_rate": 1.1946902654867258e-06,
+      "loss": 0.8788,
+      "step": 1100
+    },
+    {
+      "epoch": 1.5918958031837915,
+      "eval_loss": 0.9810674786567688,
+      "eval_runtime": 25.866,
+      "eval_samples_per_second": 38.661,
+      "eval_steps_per_second": 2.436,
+      "step": 1100
+    },
+    {
+      "epoch": 1.593342981186686,
+      "grad_norm": 1.6297693252563477,
+      "learning_rate": 1.190667739340306e-06,
+      "loss": 0.8847,
+      "step": 1101
+    },
+    {
+      "epoch": 1.5947901591895803,
+      "grad_norm": 1.6360310316085815,
+      "learning_rate": 1.1866452131938858e-06,
+      "loss": 0.8544,
+      "step": 1102
+    },
+    {
+      "epoch": 1.5962373371924747,
+      "grad_norm": 1.541109561920166,
+      "learning_rate": 1.182622687047466e-06,
+      "loss": 0.8759,
+      "step": 1103
+    },
+    {
+      "epoch": 1.5976845151953691,
+      "grad_norm": 1.5267829895019531,
+      "learning_rate": 1.1786001609010459e-06,
+      "loss": 0.8544,
+      "step": 1104
+    },
+    {
+      "epoch": 1.5991316931982635,
+      "grad_norm": 1.610019326210022,
+      "learning_rate": 1.174577634754626e-06,
+      "loss": 0.8711,
+      "step": 1105
+    },
+    {
+      "epoch": 1.6005788712011577,
+      "grad_norm": 1.5717980861663818,
+      "learning_rate": 1.1705551086082061e-06,
+      "loss": 0.8596,
+      "step": 1106
+    },
+    {
+      "epoch": 1.6020260492040521,
+      "grad_norm": 1.5795584917068481,
+      "learning_rate": 1.166532582461786e-06,
+      "loss": 0.8682,
+      "step": 1107
+    },
+    {
+      "epoch": 1.6034732272069463,
+      "grad_norm": 1.5268584489822388,
+      "learning_rate": 1.1625100563153662e-06,
+      "loss": 0.8469,
+      "step": 1108
+    },
+    {
+      "epoch": 1.6049204052098407,
+      "grad_norm": 1.5472923517227173,
+      "learning_rate": 1.158487530168946e-06,
+      "loss": 0.8614,
+      "step": 1109
+    },
+    {
+      "epoch": 1.606367583212735,
+      "grad_norm": 1.5849939584732056,
+      "learning_rate": 1.1544650040225262e-06,
+      "loss": 0.8891,
+      "step": 1110
+    },
+    {
+      "epoch": 1.606367583212735,
+      "eval_loss": 0.9799522161483765,
+      "eval_runtime": 25.7004,
+      "eval_samples_per_second": 38.91,
+      "eval_steps_per_second": 2.451,
+      "step": 1110
+    },
+    {
+      "epoch": 1.6078147612156295,
+      "grad_norm": 1.5831488370895386,
+      "learning_rate": 1.1504424778761064e-06,
+      "loss": 0.8719,
+      "step": 1111
+    },
+    {
+      "epoch": 1.609261939218524,
+      "grad_norm": 1.5813566446304321,
+      "learning_rate": 1.1464199517296863e-06,
+      "loss": 0.8889,
+      "step": 1112
+    },
+    {
+      "epoch": 1.6107091172214183,
+      "grad_norm": 1.7443983554840088,
+      "learning_rate": 1.1423974255832664e-06,
+      "loss": 0.8839,
+      "step": 1113
+    },
+    {
+      "epoch": 1.6121562952243127,
+      "grad_norm": 1.5268986225128174,
+      "learning_rate": 1.1383748994368463e-06,
+      "loss": 0.8705,
+      "step": 1114
+    },
+    {
+      "epoch": 1.613603473227207,
+      "grad_norm": 1.525639533996582,
+      "learning_rate": 1.1343523732904264e-06,
+      "loss": 0.8671,
+      "step": 1115
+    },
+    {
+      "epoch": 1.6150506512301013,
+      "grad_norm": 1.8092018365859985,
+      "learning_rate": 1.1303298471440066e-06,
+      "loss": 0.9264,
+      "step": 1116
+    },
+    {
+      "epoch": 1.6164978292329957,
+      "grad_norm": 1.5909322500228882,
+      "learning_rate": 1.1263073209975865e-06,
+      "loss": 0.8827,
+      "step": 1117
+    },
+    {
+      "epoch": 1.6179450072358899,
+      "grad_norm": 1.6932170391082764,
+      "learning_rate": 1.1222847948511666e-06,
+      "loss": 0.8555,
+      "step": 1118
+    },
+    {
+      "epoch": 1.6193921852387843,
+      "grad_norm": 1.5742934942245483,
+      "learning_rate": 1.1182622687047465e-06,
+      "loss": 0.8946,
+      "step": 1119
+    },
+    {
+      "epoch": 1.6208393632416787,
+      "grad_norm": 1.5503621101379395,
+      "learning_rate": 1.1142397425583269e-06,
+      "loss": 0.8803,
+      "step": 1120
+    },
+    {
+      "epoch": 1.6208393632416787,
+      "eval_loss": 0.9796671867370605,
+      "eval_runtime": 25.8219,
+      "eval_samples_per_second": 38.727,
+      "eval_steps_per_second": 2.44,
+      "step": 1120
+    },
+    {
+      "epoch": 1.622286541244573,
+      "grad_norm": 1.5717918872833252,
+      "learning_rate": 1.1102172164119068e-06,
+      "loss": 0.8869,
+      "step": 1121
+    },
+    {
+      "epoch": 1.6237337192474675,
+      "grad_norm": 1.5179367065429688,
+      "learning_rate": 1.106194690265487e-06,
+      "loss": 0.8859,
+      "step": 1122
+    },
+    {
+      "epoch": 1.6251808972503619,
+      "grad_norm": 1.5226870775222778,
+      "learning_rate": 1.1021721641190668e-06,
+      "loss": 0.8616,
+      "step": 1123
+    },
+    {
+      "epoch": 1.6266280752532563,
+      "grad_norm": 1.6265931129455566,
+      "learning_rate": 1.098149637972647e-06,
+      "loss": 0.8787,
+      "step": 1124
+    },
+    {
+      "epoch": 1.6280752532561507,
+      "grad_norm": 1.576897382736206,
+      "learning_rate": 1.094127111826227e-06,
+      "loss": 0.8592,
+      "step": 1125
+    },
+    {
+      "epoch": 1.6295224312590448,
+      "grad_norm": 1.5493236780166626,
+      "learning_rate": 1.090104585679807e-06,
+      "loss": 0.8601,
+      "step": 1126
+    },
+    {
+      "epoch": 1.6309696092619392,
+      "grad_norm": 1.5343093872070312,
+      "learning_rate": 1.0860820595333871e-06,
+      "loss": 0.8661,
+      "step": 1127
+    },
+    {
+      "epoch": 1.6324167872648334,
+      "grad_norm": 1.6531797647476196,
+      "learning_rate": 1.082059533386967e-06,
+      "loss": 0.8787,
+      "step": 1128
+    },
+    {
+      "epoch": 1.6338639652677278,
+      "grad_norm": 1.5565072298049927,
+      "learning_rate": 1.0780370072405472e-06,
+      "loss": 0.8843,
+      "step": 1129
+    },
+    {
+      "epoch": 1.6353111432706222,
+      "grad_norm": 1.5990854501724243,
+      "learning_rate": 1.0740144810941273e-06,
+      "loss": 0.8744,
+      "step": 1130
+    },
+    {
+      "epoch": 1.6353111432706222,
+      "eval_loss": 0.9809547066688538,
+      "eval_runtime": 26.0231,
+      "eval_samples_per_second": 38.427,
+      "eval_steps_per_second": 2.421,
+      "step": 1130
+    },
+    {
+      "epoch": 1.6367583212735166,
+      "grad_norm": 1.5343475341796875,
+      "learning_rate": 1.0699919549477072e-06,
+      "loss": 0.8587,
+      "step": 1131
+    },
+    {
+      "epoch": 1.638205499276411,
+      "grad_norm": 1.602108120918274,
+      "learning_rate": 1.0659694288012873e-06,
+      "loss": 0.889,
+      "step": 1132
+    },
+    {
+      "epoch": 1.6396526772793054,
+      "grad_norm": 1.6065183877944946,
+      "learning_rate": 1.0619469026548673e-06,
+      "loss": 0.8443,
+      "step": 1133
+    },
+    {
+      "epoch": 1.6410998552821998,
+      "grad_norm": 1.6848286390304565,
+      "learning_rate": 1.0579243765084474e-06,
+      "loss": 0.8857,
+      "step": 1134
+    },
+    {
+      "epoch": 1.6425470332850942,
+      "grad_norm": 1.5931259393692017,
+      "learning_rate": 1.0539018503620275e-06,
+      "loss": 0.9061,
+      "step": 1135
+    },
+    {
+      "epoch": 1.6439942112879884,
+      "grad_norm": 1.5638717412948608,
+      "learning_rate": 1.0498793242156074e-06,
+      "loss": 0.8512,
+      "step": 1136
+    },
+    {
+      "epoch": 1.6454413892908828,
+      "grad_norm": 1.6239174604415894,
+      "learning_rate": 1.0458567980691876e-06,
+      "loss": 0.9118,
+      "step": 1137
+    },
+    {
+      "epoch": 1.646888567293777,
+      "grad_norm": 1.6062713861465454,
+      "learning_rate": 1.0418342719227675e-06,
+      "loss": 0.8806,
+      "step": 1138
+    },
+    {
+      "epoch": 1.6483357452966714,
+      "grad_norm": 1.5737882852554321,
+      "learning_rate": 1.0378117457763476e-06,
+      "loss": 0.8444,
+      "step": 1139
+    },
+    {
+      "epoch": 1.6497829232995658,
+      "grad_norm": 1.5323673486709595,
+      "learning_rate": 1.0337892196299275e-06,
+      "loss": 0.8587,
+      "step": 1140
+    },
+    {
+      "epoch": 1.6497829232995658,
+      "eval_loss": 0.9802071452140808,
+      "eval_runtime": 25.8256,
+      "eval_samples_per_second": 38.721,
+      "eval_steps_per_second": 2.439,
+      "step": 1140
+    },
+    {
+      "epoch": 1.6512301013024602,
+      "grad_norm": 1.5475029945373535,
+      "learning_rate": 1.0297666934835076e-06,
+      "loss": 0.886,
+      "step": 1141
+    },
+    {
+      "epoch": 1.6526772793053546,
+      "grad_norm": 1.5495952367782593,
+      "learning_rate": 1.0257441673370878e-06,
+      "loss": 0.8586,
+      "step": 1142
+    },
+    {
+      "epoch": 1.654124457308249,
+      "grad_norm": 1.6458721160888672,
+      "learning_rate": 1.021721641190668e-06,
+      "loss": 0.8912,
+      "step": 1143
+    },
+    {
+      "epoch": 1.6555716353111434,
+      "grad_norm": 1.5719788074493408,
+      "learning_rate": 1.017699115044248e-06,
+      "loss": 0.86,
+      "step": 1144
+    },
+    {
+      "epoch": 1.6570188133140378,
+      "grad_norm": 1.574139952659607,
+      "learning_rate": 1.013676588897828e-06,
+      "loss": 0.8846,
+      "step": 1145
+    },
+    {
+      "epoch": 1.658465991316932,
+      "grad_norm": 1.6863696575164795,
+      "learning_rate": 1.009654062751408e-06,
+      "loss": 0.8461,
+      "step": 1146
+    },
+    {
+      "epoch": 1.6599131693198264,
+      "grad_norm": 1.5795451402664185,
+      "learning_rate": 1.005631536604988e-06,
+      "loss": 0.8725,
+      "step": 1147
+    },
+    {
+      "epoch": 1.6613603473227205,
+      "grad_norm": 1.6984606981277466,
+      "learning_rate": 1.0016090104585681e-06,
+      "loss": 0.8959,
+      "step": 1148
+    },
+    {
+      "epoch": 1.662807525325615,
+      "grad_norm": 1.682733416557312,
+      "learning_rate": 9.97586484312148e-07,
+      "loss": 0.8583,
+      "step": 1149
+    },
+    {
+      "epoch": 1.6642547033285093,
+      "grad_norm": 1.607202172279358,
+      "learning_rate": 9.935639581657282e-07,
+      "loss": 0.8601,
+      "step": 1150
+    },
+    {
+      "epoch": 1.6642547033285093,
+      "eval_loss": 0.9815489053726196,
+      "eval_runtime": 25.9889,
+      "eval_samples_per_second": 38.478,
+      "eval_steps_per_second": 2.424,
+      "step": 1150
+    },
+    {
+      "epoch": 1.6657018813314037,
+      "grad_norm": 1.6263314485549927,
+      "learning_rate": 9.895414320193083e-07,
+      "loss": 0.8728,
+      "step": 1151
+    },
+    {
+      "epoch": 1.6671490593342981,
+      "grad_norm": 1.5998060703277588,
+      "learning_rate": 9.855189058728882e-07,
+      "loss": 0.8781,
+      "step": 1152
+    },
+    {
+      "epoch": 1.6685962373371925,
+      "grad_norm": 1.6003531217575073,
+      "learning_rate": 9.814963797264683e-07,
+      "loss": 0.8686,
+      "step": 1153
+    },
+    {
+      "epoch": 1.670043415340087,
+      "grad_norm": 1.5383695363998413,
+      "learning_rate": 9.774738535800482e-07,
+      "loss": 0.8583,
+      "step": 1154
+    },
+    {
+      "epoch": 1.6714905933429813,
+      "grad_norm": 1.5972706079483032,
+      "learning_rate": 9.734513274336284e-07,
+      "loss": 0.8915,
+      "step": 1155
+    },
+    {
+      "epoch": 1.6729377713458755,
+      "grad_norm": 1.6024503707885742,
+      "learning_rate": 9.694288012872085e-07,
+      "loss": 0.8842,
+      "step": 1156
+    },
+    {
+      "epoch": 1.67438494934877,
+      "grad_norm": 1.5077908039093018,
+      "learning_rate": 9.654062751407884e-07,
+      "loss": 0.9109,
+      "step": 1157
+    },
+    {
+      "epoch": 1.675832127351664,
+      "grad_norm": 1.5892987251281738,
+      "learning_rate": 9.613837489943685e-07,
+      "loss": 0.8554,
+      "step": 1158
+    },
+    {
+      "epoch": 1.6772793053545585,
+      "grad_norm": 1.5132979154586792,
+      "learning_rate": 9.573612228479485e-07,
+      "loss": 0.8424,
+      "step": 1159
+    },
+    {
+      "epoch": 1.678726483357453,
+      "grad_norm": 1.559235692024231,
+      "learning_rate": 9.533386967015286e-07,
+      "loss": 0.8469,
+      "step": 1160
+    },
+    {
+      "epoch": 1.678726483357453,
+      "eval_loss": 0.9792452454566956,
+      "eval_runtime": 25.8401,
+      "eval_samples_per_second": 38.699,
+      "eval_steps_per_second": 2.438,
+      "step": 1160
+    },
+    {
+      "epoch": 1.6801736613603473,
+      "grad_norm": 1.5679746866226196,
+      "learning_rate": 9.493161705551086e-07,
+      "loss": 0.8348,
+      "step": 1161
+    },
+    {
+      "epoch": 1.6816208393632417,
+      "grad_norm": 1.5585919618606567,
+      "learning_rate": 9.452936444086886e-07,
+      "loss": 0.8399,
+      "step": 1162
+    },
+    {
+      "epoch": 1.683068017366136,
+      "grad_norm": 1.607049584388733,
+      "learning_rate": 9.412711182622688e-07,
+      "loss": 0.8437,
+      "step": 1163
+    },
+    {
+      "epoch": 1.6845151953690305,
+      "grad_norm": 1.5190579891204834,
+      "learning_rate": 9.372485921158489e-07,
+      "loss": 0.8597,
+      "step": 1164
+    },
+    {
+      "epoch": 1.685962373371925,
+      "grad_norm": 1.601736307144165,
+      "learning_rate": 9.332260659694289e-07,
+      "loss": 0.8707,
+      "step": 1165
+    },
+    {
+      "epoch": 1.687409551374819,
+      "grad_norm": 1.6469128131866455,
+      "learning_rate": 9.292035398230089e-07,
+      "loss": 0.9113,
+      "step": 1166
+    },
+    {
+      "epoch": 1.6888567293777135,
+      "grad_norm": 1.6424798965454102,
+      "learning_rate": 9.251810136765891e-07,
+      "loss": 0.8596,
+      "step": 1167
+    },
+    {
+      "epoch": 1.6903039073806077,
+      "grad_norm": 1.683808445930481,
+      "learning_rate": 9.211584875301691e-07,
+      "loss": 0.9157,
+      "step": 1168
+    },
+    {
+      "epoch": 1.691751085383502,
+      "grad_norm": 1.6408365964889526,
+      "learning_rate": 9.171359613837491e-07,
+      "loss": 0.853,
+      "step": 1169
+    },
+    {
+      "epoch": 1.6931982633863965,
+      "grad_norm": 1.7329471111297607,
+      "learning_rate": 9.131134352373291e-07,
+      "loss": 0.8696,
+      "step": 1170
+    },
+    {
+      "epoch": 1.6931982633863965,
+      "eval_loss": 0.9784539937973022,
+      "eval_runtime": 26.0423,
+      "eval_samples_per_second": 38.399,
+      "eval_steps_per_second": 2.419,
+      "step": 1170
+    },
+    {
+      "epoch": 1.6946454413892909,
+      "grad_norm": 1.599411964416504,
+      "learning_rate": 9.090909090909091e-07,
+      "loss": 0.8722,
+      "step": 1171
+    },
+    {
+      "epoch": 1.6960926193921853,
+      "grad_norm": 1.5941879749298096,
+      "learning_rate": 9.050683829444893e-07,
+      "loss": 0.8576,
+      "step": 1172
+    },
+    {
+      "epoch": 1.6975397973950797,
+      "grad_norm": 1.5630899667739868,
+      "learning_rate": 9.010458567980693e-07,
+      "loss": 0.8809,
+      "step": 1173
+    },
+    {
+      "epoch": 1.698986975397974,
+      "grad_norm": 1.5148115158081055,
+      "learning_rate": 8.970233306516493e-07,
+      "loss": 0.8841,
+      "step": 1174
+    },
+    {
+      "epoch": 1.7004341534008685,
+      "grad_norm": 1.5821913480758667,
+      "learning_rate": 8.930008045052293e-07,
+      "loss": 0.8772,
+      "step": 1175
+    },
+    {
+      "epoch": 1.7018813314037626,
+      "grad_norm": 1.7660754919052124,
+      "learning_rate": 8.889782783588094e-07,
+      "loss": 0.8731,
+      "step": 1176
+    },
+    {
+      "epoch": 1.703328509406657,
+      "grad_norm": 1.5370192527770996,
+      "learning_rate": 8.849557522123895e-07,
+      "loss": 0.8584,
+      "step": 1177
+    },
+    {
+      "epoch": 1.7047756874095512,
+      "grad_norm": 1.5817389488220215,
+      "learning_rate": 8.809332260659695e-07,
+      "loss": 0.8786,
+      "step": 1178
+    },
+    {
+      "epoch": 1.7062228654124456,
+      "grad_norm": 1.5897088050842285,
+      "learning_rate": 8.769106999195495e-07,
+      "loss": 0.8448,
+      "step": 1179
+    },
+    {
+      "epoch": 1.70767004341534,
+      "grad_norm": 1.5718865394592285,
+      "learning_rate": 8.728881737731296e-07,
+      "loss": 0.8432,
+      "step": 1180
+    },
+    {
+      "epoch": 1.70767004341534,
+      "eval_loss": 0.978662371635437,
+      "eval_runtime": 25.9113,
+      "eval_samples_per_second": 38.593,
+      "eval_steps_per_second": 2.431,
+      "step": 1180
+    },
+    {
+      "epoch": 1.7091172214182344,
+      "grad_norm": 1.6017844676971436,
+      "learning_rate": 8.688656476267096e-07,
+      "loss": 0.868,
+      "step": 1181
+    },
+    {
+      "epoch": 1.7105643994211288,
+      "grad_norm": 1.6823580265045166,
+      "learning_rate": 8.648431214802896e-07,
+      "loss": 0.8621,
+      "step": 1182
+    },
+    {
+      "epoch": 1.7120115774240232,
+      "grad_norm": 1.6109862327575684,
+      "learning_rate": 8.608205953338697e-07,
+      "loss": 0.8614,
+      "step": 1183
+    },
+    {
+      "epoch": 1.7134587554269176,
+      "grad_norm": 1.5950835943222046,
+      "learning_rate": 8.567980691874497e-07,
+      "loss": 0.8481,
+      "step": 1184
+    },
+    {
+      "epoch": 1.714905933429812,
+      "grad_norm": 1.7378734350204468,
+      "learning_rate": 8.527755430410298e-07,
+      "loss": 0.8863,
+      "step": 1185
+    },
+    {
+      "epoch": 1.7163531114327062,
+      "grad_norm": 1.6182748079299927,
+      "learning_rate": 8.487530168946098e-07,
+      "loss": 0.8982,
+      "step": 1186
+    },
+    {
+      "epoch": 1.7178002894356006,
+      "grad_norm": 1.61431884765625,
+      "learning_rate": 8.4473049074819e-07,
+      "loss": 0.8855,
+      "step": 1187
+    },
+    {
+      "epoch": 1.7192474674384948,
+      "grad_norm": 1.593196153640747,
+      "learning_rate": 8.4070796460177e-07,
+      "loss": 0.8745,
+      "step": 1188
+    },
+    {
+      "epoch": 1.7206946454413892,
+      "grad_norm": 1.5465645790100098,
+      "learning_rate": 8.366854384553501e-07,
+      "loss": 0.8863,
+      "step": 1189
+    },
+    {
+      "epoch": 1.7221418234442836,
+      "grad_norm": 1.6166539192199707,
+      "learning_rate": 8.326629123089301e-07,
+      "loss": 0.8826,
+      "step": 1190
+    },
+    {
+      "epoch": 1.7221418234442836,
+      "eval_loss": 0.977293848991394,
+      "eval_runtime": 25.8688,
+      "eval_samples_per_second": 38.657,
+      "eval_steps_per_second": 2.435,
+      "step": 1190
+    },
+    {
+      "epoch": 1.723589001447178,
+      "grad_norm": 1.5473971366882324,
+      "learning_rate": 8.286403861625101e-07,
+      "loss": 0.9087,
+      "step": 1191
+    },
+    {
+      "epoch": 1.7250361794500724,
+      "grad_norm": 1.557051420211792,
+      "learning_rate": 8.246178600160902e-07,
+      "loss": 0.8697,
+      "step": 1192
+    },
+    {
+      "epoch": 1.7264833574529668,
+      "grad_norm": 1.473290205001831,
+      "learning_rate": 8.205953338696703e-07,
+      "loss": 0.8659,
+      "step": 1193
+    },
+    {
+      "epoch": 1.7279305354558612,
+      "grad_norm": 1.5358150005340576,
+      "learning_rate": 8.165728077232503e-07,
+      "loss": 0.8668,
+      "step": 1194
+    },
+    {
+      "epoch": 1.7293777134587556,
+      "grad_norm": 1.5885599851608276,
+      "learning_rate": 8.125502815768303e-07,
+      "loss": 0.8705,
+      "step": 1195
+    },
+    {
+      "epoch": 1.7308248914616498,
+      "grad_norm": 1.5535321235656738,
+      "learning_rate": 8.085277554304103e-07,
+      "loss": 0.8895,
+      "step": 1196
+    },
+    {
+      "epoch": 1.7322720694645442,
+      "grad_norm": 1.5673221349716187,
+      "learning_rate": 8.045052292839905e-07,
+      "loss": 0.8685,
+      "step": 1197
+    },
+    {
+      "epoch": 1.7337192474674383,
+      "grad_norm": 1.768881916999817,
+      "learning_rate": 8.004827031375705e-07,
+      "loss": 0.9055,
+      "step": 1198
+    },
+    {
+      "epoch": 1.7351664254703327,
+      "grad_norm": 1.597657322883606,
+      "learning_rate": 7.964601769911505e-07,
+      "loss": 0.8558,
+      "step": 1199
+    },
+    {
+      "epoch": 1.7366136034732271,
+      "grad_norm": 1.5506863594055176,
+      "learning_rate": 7.924376508447305e-07,
+      "loss": 0.8852,
+      "step": 1200
+    },
+    {
+      "epoch": 1.7366136034732271,
+      "eval_loss": 0.9770252704620361,
+      "eval_runtime": 25.9917,
+      "eval_samples_per_second": 38.474,
+      "eval_steps_per_second": 2.424,
+      "step": 1200
+    },
+    {
+      "epoch": 1.7380607814761215,
+      "grad_norm": 1.6115679740905762,
+      "learning_rate": 7.884151246983105e-07,
+      "loss": 0.85,
+      "step": 1201
+    },
+    {
+      "epoch": 1.739507959479016,
+      "grad_norm": 1.662935733795166,
+      "learning_rate": 7.843925985518907e-07,
+      "loss": 0.8807,
+      "step": 1202
+    },
+    {
+      "epoch": 1.7409551374819103,
+      "grad_norm": 1.617238998413086,
+      "learning_rate": 7.803700724054707e-07,
+      "loss": 0.8544,
+      "step": 1203
+    },
+    {
+      "epoch": 1.7424023154848047,
+      "grad_norm": 1.5308291912078857,
+      "learning_rate": 7.763475462590507e-07,
+      "loss": 0.8601,
+      "step": 1204
+    },
+    {
+      "epoch": 1.7438494934876991,
+      "grad_norm": 1.603887915611267,
+      "learning_rate": 7.723250201126307e-07,
+      "loss": 0.889,
+      "step": 1205
+    },
+    {
+      "epoch": 1.7452966714905933,
+      "grad_norm": 1.546451449394226,
+      "learning_rate": 7.683024939662108e-07,
+      "loss": 0.8936,
+      "step": 1206
+    },
+    {
+      "epoch": 1.7467438494934877,
+      "grad_norm": 1.5847758054733276,
+      "learning_rate": 7.642799678197908e-07,
+      "loss": 0.866,
+      "step": 1207
+    },
+    {
+      "epoch": 1.7481910274963819,
+      "grad_norm": 1.5509288311004639,
+      "learning_rate": 7.602574416733709e-07,
+      "loss": 0.8593,
+      "step": 1208
+    },
+    {
+      "epoch": 1.7496382054992763,
+      "grad_norm": 1.615058183670044,
+      "learning_rate": 7.56234915526951e-07,
+      "loss": 0.8949,
+      "step": 1209
+    },
+    {
+      "epoch": 1.7510853835021707,
+      "grad_norm": 1.5887871980667114,
+      "learning_rate": 7.522123893805311e-07,
+      "loss": 0.8401,
+      "step": 1210
+    },
+    {
+      "epoch": 1.7510853835021707,
+      "eval_loss": 0.9778628349304199,
+      "eval_runtime": 25.8847,
+      "eval_samples_per_second": 38.633,
+      "eval_steps_per_second": 2.434,
+      "step": 1210
+    },
+    {
+      "epoch": 1.752532561505065,
+      "grad_norm": 1.772998332977295,
+      "learning_rate": 7.481898632341112e-07,
+      "loss": 0.9199,
+      "step": 1211
+    },
+    {
+      "epoch": 1.7539797395079595,
+      "grad_norm": 1.57089364528656,
+      "learning_rate": 7.441673370876912e-07,
+      "loss": 0.864,
+      "step": 1212
+    },
+    {
+      "epoch": 1.755426917510854,
+      "grad_norm": 1.603686809539795,
+      "learning_rate": 7.401448109412712e-07,
+      "loss": 0.8511,
+      "step": 1213
+    },
+    {
+      "epoch": 1.7568740955137483,
+      "grad_norm": 1.6855931282043457,
+      "learning_rate": 7.361222847948512e-07,
+      "loss": 0.8688,
+      "step": 1214
+    },
+    {
+      "epoch": 1.7583212735166427,
+      "grad_norm": 1.553842544555664,
+      "learning_rate": 7.320997586484313e-07,
+      "loss": 0.8604,
+      "step": 1215
+    },
+    {
+      "epoch": 1.7597684515195369,
+      "grad_norm": 1.6918758153915405,
+      "learning_rate": 7.280772325020113e-07,
+      "loss": 0.8767,
+      "step": 1216
+    },
+    {
+      "epoch": 1.7612156295224313,
+      "grad_norm": 1.5926964282989502,
+      "learning_rate": 7.240547063555914e-07,
+      "loss": 0.8871,
+      "step": 1217
+    },
+    {
+      "epoch": 1.7626628075253257,
+      "grad_norm": 1.5411027669906616,
+      "learning_rate": 7.200321802091714e-07,
+      "loss": 0.8663,
+      "step": 1218
+    },
+    {
+      "epoch": 1.7641099855282198,
+      "grad_norm": 1.595458745956421,
+      "learning_rate": 7.160096540627515e-07,
+      "loss": 0.8645,
+      "step": 1219
+    },
+    {
+      "epoch": 1.7655571635311142,
+      "grad_norm": 1.5208485126495361,
+      "learning_rate": 7.119871279163315e-07,
+      "loss": 0.8804,
+      "step": 1220
+    },
+    {
+      "epoch": 1.7655571635311142,
+      "eval_loss": 0.9778464436531067,
+      "eval_runtime": 25.9514,
+      "eval_samples_per_second": 38.534,
+      "eval_steps_per_second": 2.428,
+      "step": 1220
+    },
+    {
+      "epoch": 1.7670043415340086,
+      "grad_norm": 1.6658185720443726,
+      "learning_rate": 7.079646017699115e-07,
+      "loss": 0.8534,
+      "step": 1221
+    },
+    {
+      "epoch": 1.768451519536903,
+      "grad_norm": 1.6133363246917725,
+      "learning_rate": 7.039420756234916e-07,
+      "loss": 0.8685,
+      "step": 1222
+    },
+    {
+      "epoch": 1.7698986975397974,
+      "grad_norm": 1.5848267078399658,
+      "learning_rate": 6.999195494770717e-07,
+      "loss": 0.8888,
+      "step": 1223
+    },
+    {
+      "epoch": 1.7713458755426919,
+      "grad_norm": 1.6191439628601074,
+      "learning_rate": 6.958970233306517e-07,
+      "loss": 0.8707,
+      "step": 1224
+    },
+    {
+      "epoch": 1.7727930535455863,
+      "grad_norm": 1.5733712911605835,
+      "learning_rate": 6.918744971842317e-07,
+      "loss": 0.8581,
+      "step": 1225
+    },
+    {
+      "epoch": 1.7742402315484804,
+      "grad_norm": 1.6595110893249512,
+      "learning_rate": 6.878519710378117e-07,
+      "loss": 0.8882,
+      "step": 1226
+    },
+    {
+      "epoch": 1.7756874095513748,
+      "grad_norm": 1.6195080280303955,
+      "learning_rate": 6.838294448913918e-07,
+      "loss": 0.8386,
+      "step": 1227
+    },
+    {
+      "epoch": 1.7771345875542692,
+      "grad_norm": 1.535504937171936,
+      "learning_rate": 6.798069187449719e-07,
+      "loss": 0.8556,
+      "step": 1228
+    },
+    {
+      "epoch": 1.7785817655571634,
+      "grad_norm": 1.5953824520111084,
+      "learning_rate": 6.757843925985519e-07,
+      "loss": 0.8291,
+      "step": 1229
+    },
+    {
+      "epoch": 1.7800289435600578,
+      "grad_norm": 1.5881785154342651,
+      "learning_rate": 6.717618664521319e-07,
+      "loss": 0.8765,
+      "step": 1230
+    },
+    {
+      "epoch": 1.7800289435600578,
+      "eval_loss": 0.9774501919746399,
+      "eval_runtime": 26.0326,
+      "eval_samples_per_second": 38.413,
+      "eval_steps_per_second": 2.42,
+      "step": 1230
+    },
+    {
+      "epoch": 1.7814761215629522,
+      "grad_norm": 1.633536696434021,
+      "learning_rate": 6.677393403057121e-07,
+      "loss": 0.8795,
+      "step": 1231
+    },
+    {
+      "epoch": 1.7829232995658466,
+      "grad_norm": 1.5229909420013428,
+      "learning_rate": 6.637168141592922e-07,
+      "loss": 0.8595,
+      "step": 1232
+    },
+    {
+      "epoch": 1.784370477568741,
+      "grad_norm": 1.6129873991012573,
+      "learning_rate": 6.596942880128722e-07,
+      "loss": 0.8634,
+      "step": 1233
+    },
+    {
+      "epoch": 1.7858176555716354,
+      "grad_norm": 1.5728979110717773,
+      "learning_rate": 6.556717618664522e-07,
+      "loss": 0.8685,
+      "step": 1234
+    },
+    {
+      "epoch": 1.7872648335745298,
+      "grad_norm": 1.6559734344482422,
+      "learning_rate": 6.516492357200322e-07,
+      "loss": 0.8849,
+      "step": 1235
+    },
+    {
+      "epoch": 1.788712011577424,
+      "grad_norm": 1.5157029628753662,
+      "learning_rate": 6.476267095736124e-07,
+      "loss": 0.868,
+      "step": 1236
+    },
+    {
+      "epoch": 1.7901591895803184,
+      "grad_norm": 1.5743296146392822,
+      "learning_rate": 6.436041834271924e-07,
+      "loss": 0.8446,
+      "step": 1237
+    },
+    {
+      "epoch": 1.7916063675832128,
+      "grad_norm": 1.5727088451385498,
+      "learning_rate": 6.395816572807724e-07,
+      "loss": 0.8703,
+      "step": 1238
+    },
+    {
+      "epoch": 1.793053545586107,
+      "grad_norm": 1.6340490579605103,
+      "learning_rate": 6.355591311343524e-07,
+      "loss": 0.8852,
+      "step": 1239
+    },
+    {
+      "epoch": 1.7945007235890014,
+      "grad_norm": 1.592427134513855,
+      "learning_rate": 6.315366049879324e-07,
+      "loss": 0.8611,
+      "step": 1240
+    },
+    {
+      "epoch": 1.7945007235890014,
+      "eval_loss": 0.9771277904510498,
+      "eval_runtime": 25.8154,
+      "eval_samples_per_second": 38.737,
+      "eval_steps_per_second": 2.44,
+      "step": 1240
+    },
+    {
+      "epoch": 1.7959479015918958,
+      "grad_norm": 1.6300837993621826,
+      "learning_rate": 6.275140788415126e-07,
+      "loss": 0.8471,
+      "step": 1241
+    },
+    {
+      "epoch": 1.7973950795947902,
+      "grad_norm": 1.614335298538208,
+      "learning_rate": 6.234915526950926e-07,
+      "loss": 0.888,
+      "step": 1242
+    },
+    {
+      "epoch": 1.7988422575976846,
+      "grad_norm": 1.5964536666870117,
+      "learning_rate": 6.194690265486726e-07,
+      "loss": 0.8876,
+      "step": 1243
+    },
+    {
+      "epoch": 1.800289435600579,
+      "grad_norm": 1.601259708404541,
+      "learning_rate": 6.154465004022526e-07,
+      "loss": 0.8572,
+      "step": 1244
+    },
+    {
+      "epoch": 1.8017366136034734,
+      "grad_norm": 1.6280089616775513,
+      "learning_rate": 6.114239742558327e-07,
+      "loss": 0.8606,
+      "step": 1245
+    },
+    {
+      "epoch": 1.8031837916063675,
+      "grad_norm": 1.6801878213882446,
+      "learning_rate": 6.074014481094127e-07,
+      "loss": 0.8871,
+      "step": 1246
+    },
+    {
+      "epoch": 1.804630969609262,
+      "grad_norm": 1.5877913236618042,
+      "learning_rate": 6.033789219629928e-07,
+      "loss": 0.8912,
+      "step": 1247
+    },
+    {
+      "epoch": 1.8060781476121563,
+      "grad_norm": 1.6258810758590698,
+      "learning_rate": 5.993563958165728e-07,
+      "loss": 0.8619,
+      "step": 1248
+    },
+    {
+      "epoch": 1.8075253256150505,
+      "grad_norm": 1.5987861156463623,
+      "learning_rate": 5.95333869670153e-07,
+      "loss": 0.8819,
+      "step": 1249
+    },
+    {
+      "epoch": 1.808972503617945,
+      "grad_norm": 1.5958012342453003,
+      "learning_rate": 5.91311343523733e-07,
+      "loss": 0.8795,
+      "step": 1250
+    },
+    {
+      "epoch": 1.808972503617945,
+      "eval_loss": 0.9765346646308899,
+      "eval_runtime": 26.1073,
+      "eval_samples_per_second": 38.303,
+      "eval_steps_per_second": 2.413,
+      "step": 1250
+    },
+    {
+      "epoch": 1.8104196816208393,
+      "grad_norm": 1.5270453691482544,
+      "learning_rate": 5.87288817377313e-07,
+      "loss": 0.8617,
+      "step": 1251
+    },
+    {
+      "epoch": 1.8118668596237337,
+      "grad_norm": 1.575887680053711,
+      "learning_rate": 5.83266291230893e-07,
+      "loss": 0.8664,
+      "step": 1252
+    },
+    {
+      "epoch": 1.8133140376266281,
+      "grad_norm": 1.6458449363708496,
+      "learning_rate": 5.79243765084473e-07,
+      "loss": 0.8628,
+      "step": 1253
+    },
+    {
+      "epoch": 1.8147612156295225,
+      "grad_norm": 1.5487672090530396,
+      "learning_rate": 5.752212389380532e-07,
+      "loss": 0.871,
+      "step": 1254
+    },
+    {
+      "epoch": 1.816208393632417,
+      "grad_norm": 1.5608482360839844,
+      "learning_rate": 5.711987127916332e-07,
+      "loss": 0.898,
+      "step": 1255
+    },
+    {
+      "epoch": 1.817655571635311,
+      "grad_norm": 1.471292495727539,
+      "learning_rate": 5.671761866452132e-07,
+      "loss": 0.8563,
+      "step": 1256
+    },
+    {
+      "epoch": 1.8191027496382055,
+      "grad_norm": 1.6494553089141846,
+      "learning_rate": 5.631536604987932e-07,
+      "loss": 0.8673,
+      "step": 1257
+    },
+    {
+      "epoch": 1.8205499276411,
+      "grad_norm": 1.580310583114624,
+      "learning_rate": 5.591311343523733e-07,
+      "loss": 0.8655,
+      "step": 1258
+    },
+    {
+      "epoch": 1.821997105643994,
+      "grad_norm": 1.5952807664871216,
+      "learning_rate": 5.551086082059534e-07,
+      "loss": 0.8646,
+      "step": 1259
+    },
+    {
+      "epoch": 1.8234442836468885,
+      "grad_norm": 1.6021016836166382,
+      "learning_rate": 5.510860820595334e-07,
+      "loss": 0.8906,
+      "step": 1260
+    },
+    {
+      "epoch": 1.8234442836468885,
+      "eval_loss": 0.975847601890564,
+      "eval_runtime": 25.9152,
+      "eval_samples_per_second": 38.587,
+      "eval_steps_per_second": 2.431,
+      "step": 1260
+    },
+    {
+      "epoch": 1.8248914616497829,
+      "grad_norm": 1.6215472221374512,
+      "learning_rate": 5.470635559131135e-07,
+      "loss": 0.8775,
+      "step": 1261
+    },
+    {
+      "epoch": 1.8263386396526773,
+      "grad_norm": 1.6024051904678345,
+      "learning_rate": 5.430410297666936e-07,
+      "loss": 0.8467,
+      "step": 1262
+    },
+    {
+      "epoch": 1.8277858176555717,
+      "grad_norm": 1.5474594831466675,
+      "learning_rate": 5.390185036202736e-07,
+      "loss": 0.8411,
+      "step": 1263
+    },
+    {
+      "epoch": 1.829232995658466,
+      "grad_norm": 1.5789331197738647,
+      "learning_rate": 5.349959774738536e-07,
+      "loss": 0.8808,
+      "step": 1264
+    },
+    {
+      "epoch": 1.8306801736613605,
+      "grad_norm": 1.5437642335891724,
+      "learning_rate": 5.309734513274336e-07,
+      "loss": 0.8633,
+      "step": 1265
+    },
+    {
+      "epoch": 1.8321273516642547,
+      "grad_norm": 1.589099407196045,
+      "learning_rate": 5.269509251810138e-07,
+      "loss": 0.8466,
+      "step": 1266
+    },
+    {
+      "epoch": 1.833574529667149,
+      "grad_norm": 1.5559078454971313,
+      "learning_rate": 5.229283990345938e-07,
+      "loss": 0.8754,
+      "step": 1267
+    },
+    {
+      "epoch": 1.8350217076700435,
+      "grad_norm": 1.6219558715820312,
+      "learning_rate": 5.189058728881738e-07,
+      "loss": 0.8313,
+      "step": 1268
+    },
+    {
+      "epoch": 1.8364688856729376,
+      "grad_norm": 1.639571189880371,
+      "learning_rate": 5.148833467417538e-07,
+      "loss": 0.8791,
+      "step": 1269
+    },
+    {
+      "epoch": 1.837916063675832,
+      "grad_norm": 1.7509524822235107,
+      "learning_rate": 5.10860820595334e-07,
+      "loss": 0.8943,
+      "step": 1270
+    },
+    {
+      "epoch": 1.837916063675832,
+      "eval_loss": 0.9765909314155579,
+      "eval_runtime": 26.1083,
+      "eval_samples_per_second": 38.302,
+      "eval_steps_per_second": 2.413,
+      "step": 1270
+    },
+    {
+      "epoch": 1.8393632416787264,
+      "grad_norm": 1.5593583583831787,
+      "learning_rate": 5.06838294448914e-07,
+      "loss": 0.8662,
+      "step": 1271
+    },
+    {
+      "epoch": 1.8408104196816208,
+      "grad_norm": 1.6008491516113281,
+      "learning_rate": 5.02815768302494e-07,
+      "loss": 0.8895,
+      "step": 1272
+    },
+    {
+      "epoch": 1.8422575976845152,
+      "grad_norm": 1.5698717832565308,
+      "learning_rate": 4.98793242156074e-07,
+      "loss": 0.857,
+      "step": 1273
+    },
+    {
+      "epoch": 1.8437047756874096,
+      "grad_norm": 1.6328340768814087,
+      "learning_rate": 4.947707160096541e-07,
+      "loss": 0.8832,
+      "step": 1274
+    },
+    {
+      "epoch": 1.845151953690304,
+      "grad_norm": 1.5507820844650269,
+      "learning_rate": 4.907481898632342e-07,
+      "loss": 0.8703,
+      "step": 1275
+    },
+    {
+      "epoch": 1.8465991316931982,
+      "grad_norm": 1.6311931610107422,
+      "learning_rate": 4.867256637168142e-07,
+      "loss": 0.8491,
+      "step": 1276
+    },
+    {
+      "epoch": 1.8480463096960926,
+      "grad_norm": 1.5922417640686035,
+      "learning_rate": 4.827031375703942e-07,
+      "loss": 0.8691,
+      "step": 1277
+    },
+    {
+      "epoch": 1.849493487698987,
+      "grad_norm": 1.4700504541397095,
+      "learning_rate": 4.786806114239742e-07,
+      "loss": 0.8801,
+      "step": 1278
+    },
+    {
+      "epoch": 1.8509406657018812,
+      "grad_norm": 1.6556566953659058,
+      "learning_rate": 4.746580852775543e-07,
+      "loss": 0.8594,
+      "step": 1279
+    },
+    {
+      "epoch": 1.8523878437047756,
+      "grad_norm": 1.5696501731872559,
+      "learning_rate": 4.706355591311344e-07,
+      "loss": 0.8682,
+      "step": 1280
+    },
+    {
+      "epoch": 1.8523878437047756,
+      "eval_loss": 0.9761188626289368,
+      "eval_runtime": 25.9086,
+      "eval_samples_per_second": 38.597,
+      "eval_steps_per_second": 2.432,
+      "step": 1280
+    },
+    {
+      "epoch": 1.85383502170767,
+      "grad_norm": 1.5777618885040283,
+      "learning_rate": 4.6661303298471445e-07,
+      "loss": 0.8454,
+      "step": 1281
+    },
+    {
+      "epoch": 1.8552821997105644,
+      "grad_norm": 1.5324792861938477,
+      "learning_rate": 4.6259050683829453e-07,
+      "loss": 0.8363,
+      "step": 1282
+    },
+    {
+      "epoch": 1.8567293777134588,
+      "grad_norm": 1.6020021438598633,
+      "learning_rate": 4.5856798069187455e-07,
+      "loss": 0.8665,
+      "step": 1283
+    },
+    {
+      "epoch": 1.8581765557163532,
+      "grad_norm": 1.6646153926849365,
+      "learning_rate": 4.5454545454545457e-07,
+      "loss": 0.8926,
+      "step": 1284
+    },
+    {
+      "epoch": 1.8596237337192476,
+      "grad_norm": 1.72982919216156,
+      "learning_rate": 4.5052292839903465e-07,
+      "loss": 0.8532,
+      "step": 1285
+    },
+    {
+      "epoch": 1.8610709117221418,
+      "grad_norm": 1.6174458265304565,
+      "learning_rate": 4.4650040225261467e-07,
+      "loss": 0.8763,
+      "step": 1286
+    },
+    {
+      "epoch": 1.8625180897250362,
+      "grad_norm": 1.4950498342514038,
+      "learning_rate": 4.4247787610619474e-07,
+      "loss": 0.848,
+      "step": 1287
+    },
+    {
+      "epoch": 1.8639652677279306,
+      "grad_norm": 1.6399447917938232,
+      "learning_rate": 4.3845534995977477e-07,
+      "loss": 0.8663,
+      "step": 1288
+    },
+    {
+      "epoch": 1.8654124457308248,
+      "grad_norm": 1.5628941059112549,
+      "learning_rate": 4.344328238133548e-07,
+      "loss": 0.8834,
+      "step": 1289
+    },
+    {
+      "epoch": 1.8668596237337192,
+      "grad_norm": 1.5437206029891968,
+      "learning_rate": 4.3041029766693486e-07,
+      "loss": 0.8897,
+      "step": 1290
+    },
+    {
+      "epoch": 1.8668596237337192,
+      "eval_loss": 0.9763774871826172,
+      "eval_runtime": 26.0791,
+      "eval_samples_per_second": 38.345,
+      "eval_steps_per_second": 2.416,
+      "step": 1290
+    },
+    {
+      "epoch": 1.8683068017366136,
+      "grad_norm": 1.6498969793319702,
+      "learning_rate": 4.263877715205149e-07,
+      "loss": 0.841,
+      "step": 1291
+    },
+    {
+      "epoch": 1.869753979739508,
+      "grad_norm": 1.5987532138824463,
+      "learning_rate": 4.22365245374095e-07,
+      "loss": 0.8556,
+      "step": 1292
+    },
+    {
+      "epoch": 1.8712011577424024,
+      "grad_norm": 1.6004939079284668,
+      "learning_rate": 4.1834271922767503e-07,
+      "loss": 0.8762,
+      "step": 1293
+    },
+    {
+      "epoch": 1.8726483357452968,
+      "grad_norm": 1.5418647527694702,
+      "learning_rate": 4.1432019308125506e-07,
+      "loss": 0.8455,
+      "step": 1294
+    },
+    {
+      "epoch": 1.8740955137481912,
+      "grad_norm": 1.6261379718780518,
+      "learning_rate": 4.1029766693483513e-07,
+      "loss": 0.9002,
+      "step": 1295
+    },
+    {
+      "epoch": 1.8755426917510853,
+      "grad_norm": 1.6570844650268555,
+      "learning_rate": 4.0627514078841515e-07,
+      "loss": 0.8792,
+      "step": 1296
+    },
+    {
+      "epoch": 1.8769898697539797,
+      "grad_norm": 1.5834189653396606,
+      "learning_rate": 4.0225261464199523e-07,
+      "loss": 0.8761,
+      "step": 1297
+    },
+    {
+      "epoch": 1.8784370477568741,
+      "grad_norm": 1.6012641191482544,
+      "learning_rate": 3.9823008849557525e-07,
+      "loss": 0.8468,
+      "step": 1298
+    },
+    {
+      "epoch": 1.8798842257597683,
+      "grad_norm": 1.5698151588439941,
+      "learning_rate": 3.9420756234915527e-07,
+      "loss": 0.8669,
+      "step": 1299
+    },
+    {
+      "epoch": 1.8813314037626627,
+      "grad_norm": 1.5631901025772095,
+      "learning_rate": 3.9018503620273535e-07,
+      "loss": 0.8742,
+      "step": 1300
+    },
+    {
+      "epoch": 1.8813314037626627,
+      "eval_loss": 0.9766185283660889,
+      "eval_runtime": 25.8927,
+      "eval_samples_per_second": 38.621,
+      "eval_steps_per_second": 2.433,
+      "step": 1300
+    },
+    {
+      "epoch": 1.8827785817655571,
+      "grad_norm": 1.5822370052337646,
+      "learning_rate": 3.8616251005631537e-07,
+      "loss": 0.8775,
+      "step": 1301
+    },
+    {
+      "epoch": 1.8842257597684515,
+      "grad_norm": 1.739130973815918,
+      "learning_rate": 3.821399839098954e-07,
+      "loss": 0.8917,
+      "step": 1302
+    },
+    {
+      "epoch": 1.885672937771346,
+      "grad_norm": 1.565758466720581,
+      "learning_rate": 3.781174577634755e-07,
+      "loss": 0.8703,
+      "step": 1303
+    },
+    {
+      "epoch": 1.8871201157742403,
+      "grad_norm": 1.6006062030792236,
+      "learning_rate": 3.740949316170556e-07,
+      "loss": 0.8605,
+      "step": 1304
+    },
+    {
+      "epoch": 1.8885672937771347,
+      "grad_norm": 1.6097816228866577,
+      "learning_rate": 3.700724054706356e-07,
+      "loss": 0.8486,
+      "step": 1305
+    },
+    {
+      "epoch": 1.890014471780029,
+      "grad_norm": 1.5582165718078613,
+      "learning_rate": 3.6604987932421563e-07,
+      "loss": 0.8331,
+      "step": 1306
+    },
+    {
+      "epoch": 1.8914616497829233,
+      "grad_norm": 1.5906857252120972,
+      "learning_rate": 3.620273531777957e-07,
+      "loss": 0.8695,
+      "step": 1307
+    },
+    {
+      "epoch": 1.8929088277858177,
+      "grad_norm": 1.5649751424789429,
+      "learning_rate": 3.5800482703137573e-07,
+      "loss": 0.87,
+      "step": 1308
+    },
+    {
+      "epoch": 1.8943560057887119,
+      "grad_norm": 1.571973204612732,
+      "learning_rate": 3.5398230088495575e-07,
+      "loss": 0.8755,
+      "step": 1309
+    },
+    {
+      "epoch": 1.8958031837916063,
+      "grad_norm": 1.5199156999588013,
+      "learning_rate": 3.4995977473853583e-07,
+      "loss": 0.8709,
+      "step": 1310
+    },
+    {
+      "epoch": 1.8958031837916063,
+      "eval_loss": 0.9759008884429932,
+      "eval_runtime": 25.8688,
+      "eval_samples_per_second": 38.657,
+      "eval_steps_per_second": 2.435,
+      "step": 1310
+    },
+    {
+      "epoch": 1.8972503617945007,
+      "grad_norm": 1.60419499874115,
+      "learning_rate": 3.4593724859211585e-07,
+      "loss": 0.8648,
+      "step": 1311
+    },
+    {
+      "epoch": 1.898697539797395,
+      "grad_norm": 1.4729905128479004,
+      "learning_rate": 3.419147224456959e-07,
+      "loss": 0.8495,
+      "step": 1312
+    },
+    {
+      "epoch": 1.9001447178002895,
+      "grad_norm": 1.7213385105133057,
+      "learning_rate": 3.3789219629927595e-07,
+      "loss": 0.88,
+      "step": 1313
+    },
+    {
+      "epoch": 1.9015918958031839,
+      "grad_norm": 1.616490364074707,
+      "learning_rate": 3.338696701528561e-07,
+      "loss": 0.8703,
+      "step": 1314
+    },
+    {
+      "epoch": 1.9030390738060783,
+      "grad_norm": 1.6348494291305542,
+      "learning_rate": 3.298471440064361e-07,
+      "loss": 0.8844,
+      "step": 1315
+    },
+    {
+      "epoch": 1.9044862518089725,
+      "grad_norm": 1.6609773635864258,
+      "learning_rate": 3.258246178600161e-07,
+      "loss": 0.8496,
+      "step": 1316
+    },
+    {
+      "epoch": 1.9059334298118669,
+      "grad_norm": 1.5871094465255737,
+      "learning_rate": 3.218020917135962e-07,
+      "loss": 0.8451,
+      "step": 1317
+    },
+    {
+      "epoch": 1.9073806078147613,
+      "grad_norm": 1.6062517166137695,
+      "learning_rate": 3.177795655671762e-07,
+      "loss": 0.8885,
+      "step": 1318
+    },
+    {
+      "epoch": 1.9088277858176554,
+      "grad_norm": 1.636926293373108,
+      "learning_rate": 3.137570394207563e-07,
+      "loss": 0.8524,
+      "step": 1319
+    },
+    {
+      "epoch": 1.9102749638205498,
+      "grad_norm": 1.4566848278045654,
+      "learning_rate": 3.097345132743363e-07,
+      "loss": 0.8507,
+      "step": 1320
+    },
+    {
+      "epoch": 1.9102749638205498,
+      "eval_loss": 0.975534200668335,
+      "eval_runtime": 25.9357,
+      "eval_samples_per_second": 38.557,
+      "eval_steps_per_second": 2.429,
+      "step": 1320
+    },
+    {
+      "epoch": 1.9117221418234442,
+      "grad_norm": 1.643818736076355,
+      "learning_rate": 3.0571198712791633e-07,
+      "loss": 0.8868,
+      "step": 1321
+    },
+    {
+      "epoch": 1.9131693198263386,
+      "grad_norm": 1.5635424852371216,
+      "learning_rate": 3.016894609814964e-07,
+      "loss": 0.8856,
+      "step": 1322
+    },
+    {
+      "epoch": 1.914616497829233,
+      "grad_norm": 1.5588244199752808,
+      "learning_rate": 2.976669348350765e-07,
+      "loss": 0.8923,
+      "step": 1323
+    },
+    {
+      "epoch": 1.9160636758321274,
+      "grad_norm": 1.5307813882827759,
+      "learning_rate": 2.936444086886565e-07,
+      "loss": 0.8594,
+      "step": 1324
+    },
+    {
+      "epoch": 1.9175108538350218,
+      "grad_norm": 1.5048149824142456,
+      "learning_rate": 2.896218825422365e-07,
+      "loss": 0.8659,
+      "step": 1325
+    },
+    {
+      "epoch": 1.918958031837916,
+      "grad_norm": 1.5086400508880615,
+      "learning_rate": 2.855993563958166e-07,
+      "loss": 0.8608,
+      "step": 1326
+    },
+    {
+      "epoch": 1.9204052098408104,
+      "grad_norm": 1.659081220626831,
+      "learning_rate": 2.815768302493966e-07,
+      "loss": 0.9087,
+      "step": 1327
+    },
+    {
+      "epoch": 1.9218523878437048,
+      "grad_norm": 1.5549503564834595,
+      "learning_rate": 2.775543041029767e-07,
+      "loss": 0.8513,
+      "step": 1328
+    },
+    {
+      "epoch": 1.923299565846599,
+      "grad_norm": 1.531968116760254,
+      "learning_rate": 2.7353177795655677e-07,
+      "loss": 0.8641,
+      "step": 1329
+    },
+    {
+      "epoch": 1.9247467438494934,
+      "grad_norm": 1.5982201099395752,
+      "learning_rate": 2.695092518101368e-07,
+      "loss": 0.875,
+      "step": 1330
+    },
+    {
+      "epoch": 1.9247467438494934,
+      "eval_loss": 0.9751847982406616,
+      "eval_runtime": 25.9991,
+      "eval_samples_per_second": 38.463,
+      "eval_steps_per_second": 2.423,
+      "step": 1330
+    },
+    {
+      "epoch": 1.9261939218523878,
+      "grad_norm": 1.6116726398468018,
+      "learning_rate": 2.654867256637168e-07,
+      "loss": 0.8798,
+      "step": 1331
+    },
+    {
+      "epoch": 1.9276410998552822,
+      "grad_norm": 1.5425564050674438,
+      "learning_rate": 2.614641995172969e-07,
+      "loss": 0.8395,
+      "step": 1332
+    },
+    {
+      "epoch": 1.9290882778581766,
+      "grad_norm": 1.5614674091339111,
+      "learning_rate": 2.574416733708769e-07,
+      "loss": 0.8619,
+      "step": 1333
+    },
+    {
+      "epoch": 1.930535455861071,
+      "grad_norm": 1.579698085784912,
+      "learning_rate": 2.53419147224457e-07,
+      "loss": 0.8953,
+      "step": 1334
+    },
+    {
+      "epoch": 1.9319826338639654,
+      "grad_norm": 1.5420680046081543,
+      "learning_rate": 2.49396621078037e-07,
+      "loss": 0.8543,
+      "step": 1335
+    },
+    {
+      "epoch": 1.9334298118668596,
+      "grad_norm": 1.5527102947235107,
+      "learning_rate": 2.453740949316171e-07,
+      "loss": 0.8449,
+      "step": 1336
+    },
+    {
+      "epoch": 1.934876989869754,
+      "grad_norm": 1.6574015617370605,
+      "learning_rate": 2.413515687851971e-07,
+      "loss": 0.8719,
+      "step": 1337
+    },
+    {
+      "epoch": 1.9363241678726484,
+      "grad_norm": 1.5566723346710205,
+      "learning_rate": 2.3732904263877715e-07,
+      "loss": 0.8617,
+      "step": 1338
+    },
+    {
+      "epoch": 1.9377713458755426,
+      "grad_norm": 1.6653348207473755,
+      "learning_rate": 2.3330651649235723e-07,
+      "loss": 0.8858,
+      "step": 1339
+    },
+    {
+      "epoch": 1.939218523878437,
+      "grad_norm": 1.590993046760559,
+      "learning_rate": 2.2928399034593728e-07,
+      "loss": 0.8494,
+      "step": 1340
+    },
+    {
+      "epoch": 1.939218523878437,
+      "eval_loss": 0.9752265810966492,
+      "eval_runtime": 25.9274,
+      "eval_samples_per_second": 38.569,
+      "eval_steps_per_second": 2.43,
+      "step": 1340
+    },
+    {
+      "epoch": 1.9406657018813314,
+      "grad_norm": 1.5773200988769531,
+      "learning_rate": 2.2526146419951732e-07,
+      "loss": 0.8564,
+      "step": 1341
+    },
+    {
+      "epoch": 1.9421128798842258,
+      "grad_norm": 1.6304457187652588,
+      "learning_rate": 2.2123893805309737e-07,
+      "loss": 0.8367,
+      "step": 1342
+    },
+    {
+      "epoch": 1.9435600578871202,
+      "grad_norm": 1.6174850463867188,
+      "learning_rate": 2.172164119066774e-07,
+      "loss": 0.8704,
+      "step": 1343
+    },
+    {
+      "epoch": 1.9450072358900146,
+      "grad_norm": 1.6023101806640625,
+      "learning_rate": 2.1319388576025744e-07,
+      "loss": 0.8876,
+      "step": 1344
+    },
+    {
+      "epoch": 1.946454413892909,
+      "grad_norm": 1.6506774425506592,
+      "learning_rate": 2.0917135961383752e-07,
+      "loss": 0.8931,
+      "step": 1345
+    },
+    {
+      "epoch": 1.9479015918958031,
+      "grad_norm": 1.6398158073425293,
+      "learning_rate": 2.0514883346741757e-07,
+      "loss": 0.915,
+      "step": 1346
+    },
+    {
+      "epoch": 1.9493487698986975,
+      "grad_norm": 1.612850308418274,
+      "learning_rate": 2.0112630732099761e-07,
+      "loss": 0.8516,
+      "step": 1347
+    },
+    {
+      "epoch": 1.950795947901592,
+      "grad_norm": 1.5886873006820679,
+      "learning_rate": 1.9710378117457764e-07,
+      "loss": 0.8681,
+      "step": 1348
+    },
+    {
+      "epoch": 1.9522431259044861,
+      "grad_norm": 1.5209591388702393,
+      "learning_rate": 1.9308125502815768e-07,
+      "loss": 0.8493,
+      "step": 1349
+    },
+    {
+      "epoch": 1.9536903039073805,
+      "grad_norm": 1.6258975267410278,
+      "learning_rate": 1.8905872888173776e-07,
+      "loss": 0.8929,
+      "step": 1350
+    },
+    {
+      "epoch": 1.9536903039073805,
+      "eval_loss": 0.974741518497467,
+      "eval_runtime": 26.0872,
+      "eval_samples_per_second": 38.333,
+      "eval_steps_per_second": 2.415,
+      "step": 1350
+    },
+    {
+      "epoch": 1.955137481910275,
+      "grad_norm": 1.6310229301452637,
+      "learning_rate": 1.850362027353178e-07,
+      "loss": 0.8492,
+      "step": 1351
+    },
+    {
+      "epoch": 1.9565846599131693,
+      "grad_norm": 1.5790897607803345,
+      "learning_rate": 1.8101367658889785e-07,
+      "loss": 0.8644,
+      "step": 1352
+    },
+    {
+      "epoch": 1.9580318379160637,
+      "grad_norm": 1.6126723289489746,
+      "learning_rate": 1.7699115044247788e-07,
+      "loss": 0.8832,
+      "step": 1353
+    },
+    {
+      "epoch": 1.9594790159189581,
+      "grad_norm": 1.5507099628448486,
+      "learning_rate": 1.7296862429605792e-07,
+      "loss": 0.8901,
+      "step": 1354
+    },
+    {
+      "epoch": 1.9609261939218525,
+      "grad_norm": 1.5444624423980713,
+      "learning_rate": 1.6894609814963797e-07,
+      "loss": 0.8834,
+      "step": 1355
+    },
+    {
+      "epoch": 1.9623733719247467,
+      "grad_norm": 1.5959510803222656,
+      "learning_rate": 1.6492357200321805e-07,
+      "loss": 0.8432,
+      "step": 1356
+    },
+    {
+      "epoch": 1.963820549927641,
+      "grad_norm": 1.5451171398162842,
+      "learning_rate": 1.609010458567981e-07,
+      "loss": 0.8675,
+      "step": 1357
+    },
+    {
+      "epoch": 1.9652677279305355,
+      "grad_norm": 1.5004843473434448,
+      "learning_rate": 1.5687851971037814e-07,
+      "loss": 0.8437,
+      "step": 1358
+    },
+    {
+      "epoch": 1.9667149059334297,
+      "grad_norm": 1.4817488193511963,
+      "learning_rate": 1.5285599356395817e-07,
+      "loss": 0.8518,
+      "step": 1359
+    },
+    {
+      "epoch": 1.968162083936324,
+      "grad_norm": 1.6038738489151,
+      "learning_rate": 1.4883346741753824e-07,
+      "loss": 0.8968,
+      "step": 1360
+    },
+    {
+      "epoch": 1.968162083936324,
+      "eval_loss": 0.9743183255195618,
+      "eval_runtime": 26.0266,
+      "eval_samples_per_second": 38.422,
+      "eval_steps_per_second": 2.421,
+      "step": 1360
+    },
+    {
+      "epoch": 1.9696092619392185,
+      "grad_norm": 1.558771014213562,
+      "learning_rate": 1.4481094127111826e-07,
+      "loss": 0.8535,
+      "step": 1361
+    },
+    {
+      "epoch": 1.9710564399421129,
+      "grad_norm": 1.644518256187439,
+      "learning_rate": 1.407884151246983e-07,
+      "loss": 0.884,
+      "step": 1362
+    },
+    {
+      "epoch": 1.9725036179450073,
+      "grad_norm": 1.5933536291122437,
+      "learning_rate": 1.3676588897827839e-07,
+      "loss": 0.8643,
+      "step": 1363
+    },
+    {
+      "epoch": 1.9739507959479017,
+      "grad_norm": 1.6139038801193237,
+      "learning_rate": 1.327433628318584e-07,
+      "loss": 0.8814,
+      "step": 1364
+    },
+    {
+      "epoch": 1.975397973950796,
+      "grad_norm": 1.5435553789138794,
+      "learning_rate": 1.2872083668543846e-07,
+      "loss": 0.8626,
+      "step": 1365
+    },
+    {
+      "epoch": 1.9768451519536903,
+      "grad_norm": 1.5796884298324585,
+      "learning_rate": 1.246983105390185e-07,
+      "loss": 0.8887,
+      "step": 1366
+    },
+    {
+      "epoch": 1.9782923299565847,
+      "grad_norm": 1.560550332069397,
+      "learning_rate": 1.2067578439259855e-07,
+      "loss": 0.9024,
+      "step": 1367
+    },
+    {
+      "epoch": 1.979739507959479,
+      "grad_norm": 1.6081390380859375,
+      "learning_rate": 1.1665325824617861e-07,
+      "loss": 0.834,
+      "step": 1368
+    },
+    {
+      "epoch": 1.9811866859623732,
+      "grad_norm": 1.60971200466156,
+      "learning_rate": 1.1263073209975866e-07,
+      "loss": 0.8688,
+      "step": 1369
+    },
+    {
+      "epoch": 1.9826338639652676,
+      "grad_norm": 1.5743461847305298,
+      "learning_rate": 1.086082059533387e-07,
+      "loss": 0.9112,
+      "step": 1370
+    },
+    {
+      "epoch": 1.9826338639652676,
+      "eval_loss": 0.9745351672172546,
+      "eval_runtime": 26.141,
+      "eval_samples_per_second": 38.254,
+      "eval_steps_per_second": 2.41,
+      "step": 1370
+    },
+    {
+      "epoch": 1.984081041968162,
+      "grad_norm": 1.5900604724884033,
+      "learning_rate": 1.0458567980691876e-07,
+      "loss": 0.8531,
+      "step": 1371
+    },
+    {
+      "epoch": 1.9855282199710564,
+      "grad_norm": 1.6036739349365234,
+      "learning_rate": 1.0056315366049881e-07,
+      "loss": 0.8617,
+      "step": 1372
+    },
+    {
+      "epoch": 1.9869753979739508,
+      "grad_norm": 1.6389803886413574,
+      "learning_rate": 9.654062751407884e-08,
+      "loss": 0.8873,
+      "step": 1373
+    },
+    {
+      "epoch": 1.9884225759768452,
+      "grad_norm": 1.5734872817993164,
+      "learning_rate": 9.25181013676589e-08,
+      "loss": 0.8895,
+      "step": 1374
+    },
+    {
+      "epoch": 1.9898697539797396,
+      "grad_norm": 1.530853509902954,
+      "learning_rate": 8.849557522123894e-08,
+      "loss": 0.8509,
+      "step": 1375
+    },
+    {
+      "epoch": 1.9913169319826338,
+      "grad_norm": 1.5750211477279663,
+      "learning_rate": 8.447304907481899e-08,
+      "loss": 0.8455,
+      "step": 1376
+    },
+    {
+      "epoch": 1.9927641099855282,
+      "grad_norm": 1.5041974782943726,
+      "learning_rate": 8.045052292839905e-08,
+      "loss": 0.8677,
+      "step": 1377
+    },
+    {
+      "epoch": 1.9942112879884226,
+      "grad_norm": 1.6976056098937988,
+      "learning_rate": 7.642799678197908e-08,
+      "loss": 0.8759,
+      "step": 1378
+    },
+    {
+      "epoch": 1.9956584659913168,
+      "grad_norm": 1.535250186920166,
+      "learning_rate": 7.240547063555913e-08,
+      "loss": 0.8582,
+      "step": 1379
+    },
+    {
+      "epoch": 1.9971056439942112,
+      "grad_norm": 1.4982575178146362,
+      "learning_rate": 6.838294448913919e-08,
+      "loss": 0.8539,
+      "step": 1380
+    },
+    {
+      "epoch": 1.9971056439942112,
+      "eval_loss": 0.9745309352874756,
+      "eval_runtime": 26.0712,
+      "eval_samples_per_second": 38.356,
+      "eval_steps_per_second": 2.416,
+      "step": 1380
+    },
+    {
+      "epoch": 1.9985528219971056,
+      "grad_norm": 1.5955522060394287,
+      "learning_rate": 6.436041834271923e-08,
+      "loss": 0.881,
+      "step": 1381
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 1.5880597829818726,
+      "learning_rate": 6.033789219629928e-08,
+      "loss": 0.8396,
+      "step": 1382
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 1382,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 50,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 5.654550286644019e+17,
+  "train_batch_size": 6,
+  "trial_name": null,
+  "trial_params": null
+}