web-doc-refining-lm / trainer_state.json
koalazf99's picture
Upload folder using huggingface_hub
f66c0b2 verified
raw
history blame
127 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9900990099009901,
"eval_steps": 500,
"global_step": 800,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 73.68818664550781,
"learning_rate": 1.0000000000000002e-06,
"loss": 8.0388,
"step": 1
},
{
"epoch": 0.0,
"grad_norm": 71.36270904541016,
"learning_rate": 2.0000000000000003e-06,
"loss": 8.0003,
"step": 2
},
{
"epoch": 0.0,
"grad_norm": 73.16751861572266,
"learning_rate": 3e-06,
"loss": 7.9032,
"step": 3
},
{
"epoch": 0.0,
"grad_norm": 74.18943786621094,
"learning_rate": 4.000000000000001e-06,
"loss": 7.921,
"step": 4
},
{
"epoch": 0.01,
"grad_norm": 70.63272857666016,
"learning_rate": 5e-06,
"loss": 8.032,
"step": 5
},
{
"epoch": 0.01,
"grad_norm": 64.6897964477539,
"learning_rate": 6e-06,
"loss": 7.68,
"step": 6
},
{
"epoch": 0.01,
"grad_norm": 65.79997253417969,
"learning_rate": 7e-06,
"loss": 7.5291,
"step": 7
},
{
"epoch": 0.01,
"grad_norm": 63.4569091796875,
"learning_rate": 8.000000000000001e-06,
"loss": 5.6132,
"step": 8
},
{
"epoch": 0.01,
"grad_norm": 63.28990173339844,
"learning_rate": 9e-06,
"loss": 5.0102,
"step": 9
},
{
"epoch": 0.01,
"grad_norm": 46.30258560180664,
"learning_rate": 1e-05,
"loss": 2.2227,
"step": 10
},
{
"epoch": 0.01,
"grad_norm": 46.01011657714844,
"learning_rate": 9.99999848074862e-06,
"loss": 1.6679,
"step": 11
},
{
"epoch": 0.01,
"grad_norm": 42.595951080322266,
"learning_rate": 9.9999939229954e-06,
"loss": 1.5493,
"step": 12
},
{
"epoch": 0.02,
"grad_norm": 11.979974746704102,
"learning_rate": 9.999986326743111e-06,
"loss": 0.892,
"step": 13
},
{
"epoch": 0.02,
"grad_norm": 13.096778869628906,
"learning_rate": 9.99997569199637e-06,
"loss": 0.9386,
"step": 14
},
{
"epoch": 0.02,
"grad_norm": 35.61207962036133,
"learning_rate": 9.99996201876164e-06,
"loss": 1.3573,
"step": 15
},
{
"epoch": 0.02,
"grad_norm": 18.184959411621094,
"learning_rate": 9.999945307047228e-06,
"loss": 0.9778,
"step": 16
},
{
"epoch": 0.02,
"grad_norm": 6.461019992828369,
"learning_rate": 9.99992555686329e-06,
"loss": 1.0665,
"step": 17
},
{
"epoch": 0.02,
"grad_norm": 4.743849277496338,
"learning_rate": 9.99990276822183e-06,
"loss": 0.5975,
"step": 18
},
{
"epoch": 0.02,
"grad_norm": 5.654608726501465,
"learning_rate": 9.999876941136697e-06,
"loss": 0.856,
"step": 19
},
{
"epoch": 0.02,
"grad_norm": 5.488308906555176,
"learning_rate": 9.999848075623584e-06,
"loss": 0.7874,
"step": 20
},
{
"epoch": 0.03,
"grad_norm": 5.833119869232178,
"learning_rate": 9.999816171700034e-06,
"loss": 0.8777,
"step": 21
},
{
"epoch": 0.03,
"grad_norm": 3.678900718688965,
"learning_rate": 9.999781229385433e-06,
"loss": 0.5888,
"step": 22
},
{
"epoch": 0.03,
"grad_norm": 6.357454776763916,
"learning_rate": 9.99974324870102e-06,
"loss": 0.9263,
"step": 23
},
{
"epoch": 0.03,
"grad_norm": 5.7684149742126465,
"learning_rate": 9.99970222966987e-06,
"loss": 0.7734,
"step": 24
},
{
"epoch": 0.03,
"grad_norm": 6.77016019821167,
"learning_rate": 9.999658172316915e-06,
"loss": 0.7735,
"step": 25
},
{
"epoch": 0.03,
"grad_norm": 4.0211334228515625,
"learning_rate": 9.999611076668926e-06,
"loss": 0.5645,
"step": 26
},
{
"epoch": 0.03,
"grad_norm": 7.3770222663879395,
"learning_rate": 9.999560942754525e-06,
"loss": 1.0185,
"step": 27
},
{
"epoch": 0.03,
"grad_norm": 4.433741569519043,
"learning_rate": 9.999507770604177e-06,
"loss": 0.3547,
"step": 28
},
{
"epoch": 0.04,
"grad_norm": 6.0549492835998535,
"learning_rate": 9.999451560250196e-06,
"loss": 0.4961,
"step": 29
},
{
"epoch": 0.04,
"grad_norm": 7.8142619132995605,
"learning_rate": 9.999392311726738e-06,
"loss": 0.4398,
"step": 30
},
{
"epoch": 0.04,
"grad_norm": 5.785826683044434,
"learning_rate": 9.999330025069812e-06,
"loss": 0.6431,
"step": 31
},
{
"epoch": 0.04,
"grad_norm": 6.010104656219482,
"learning_rate": 9.999264700317268e-06,
"loss": 0.6129,
"step": 32
},
{
"epoch": 0.04,
"grad_norm": 8.289867401123047,
"learning_rate": 9.999196337508804e-06,
"loss": 0.3771,
"step": 33
},
{
"epoch": 0.04,
"grad_norm": 5.59083890914917,
"learning_rate": 9.999124936685965e-06,
"loss": 0.3964,
"step": 34
},
{
"epoch": 0.04,
"grad_norm": 6.018394947052002,
"learning_rate": 9.99905049789214e-06,
"loss": 0.4801,
"step": 35
},
{
"epoch": 0.04,
"grad_norm": 10.878011703491211,
"learning_rate": 9.998973021172564e-06,
"loss": 0.2996,
"step": 36
},
{
"epoch": 0.05,
"grad_norm": 31.676380157470703,
"learning_rate": 9.998892506574325e-06,
"loss": 0.5261,
"step": 37
},
{
"epoch": 0.05,
"grad_norm": 16.133407592773438,
"learning_rate": 9.998808954146347e-06,
"loss": 0.3843,
"step": 38
},
{
"epoch": 0.05,
"grad_norm": 8.785749435424805,
"learning_rate": 9.998722363939407e-06,
"loss": 0.2476,
"step": 39
},
{
"epoch": 0.05,
"grad_norm": 4.326422214508057,
"learning_rate": 9.998632736006124e-06,
"loss": 0.2334,
"step": 40
},
{
"epoch": 0.05,
"grad_norm": 6.171711444854736,
"learning_rate": 9.998540070400966e-06,
"loss": 0.1671,
"step": 41
},
{
"epoch": 0.05,
"grad_norm": 3.5893757343292236,
"learning_rate": 9.998444367180247e-06,
"loss": 0.1732,
"step": 42
},
{
"epoch": 0.05,
"grad_norm": 2.918233633041382,
"learning_rate": 9.998345626402124e-06,
"loss": 0.1127,
"step": 43
},
{
"epoch": 0.05,
"grad_norm": 2.620290517807007,
"learning_rate": 9.998243848126604e-06,
"loss": 0.1337,
"step": 44
},
{
"epoch": 0.06,
"grad_norm": 4.31186056137085,
"learning_rate": 9.998139032415534e-06,
"loss": 0.144,
"step": 45
},
{
"epoch": 0.06,
"grad_norm": 3.399256706237793,
"learning_rate": 9.998031179332618e-06,
"loss": 0.0878,
"step": 46
},
{
"epoch": 0.06,
"grad_norm": 1.658913016319275,
"learning_rate": 9.997920288943388e-06,
"loss": 0.0651,
"step": 47
},
{
"epoch": 0.06,
"grad_norm": 3.0306262969970703,
"learning_rate": 9.99780636131524e-06,
"loss": 0.1051,
"step": 48
},
{
"epoch": 0.06,
"grad_norm": 2.099931478500366,
"learning_rate": 9.997689396517408e-06,
"loss": 0.0937,
"step": 49
},
{
"epoch": 0.06,
"grad_norm": 2.8879055976867676,
"learning_rate": 9.997569394620965e-06,
"loss": 0.0701,
"step": 50
},
{
"epoch": 0.06,
"grad_norm": 1.2706345319747925,
"learning_rate": 9.997446355698843e-06,
"loss": 0.0559,
"step": 51
},
{
"epoch": 0.06,
"grad_norm": 1.7181872129440308,
"learning_rate": 9.99732027982581e-06,
"loss": 0.06,
"step": 52
},
{
"epoch": 0.07,
"grad_norm": 1.4794338941574097,
"learning_rate": 9.997191167078479e-06,
"loss": 0.0715,
"step": 53
},
{
"epoch": 0.07,
"grad_norm": 1.40012788772583,
"learning_rate": 9.99705901753532e-06,
"loss": 0.0608,
"step": 54
},
{
"epoch": 0.07,
"grad_norm": 3.034327983856201,
"learning_rate": 9.996923831276632e-06,
"loss": 0.0603,
"step": 55
},
{
"epoch": 0.07,
"grad_norm": 3.1301584243774414,
"learning_rate": 9.996785608384573e-06,
"loss": 0.0762,
"step": 56
},
{
"epoch": 0.07,
"grad_norm": 2.8218867778778076,
"learning_rate": 9.996644348943141e-06,
"loss": 0.0956,
"step": 57
},
{
"epoch": 0.07,
"grad_norm": 1.7874306440353394,
"learning_rate": 9.996500053038176e-06,
"loss": 0.0693,
"step": 58
},
{
"epoch": 0.07,
"grad_norm": 1.582387089729309,
"learning_rate": 9.99635272075737e-06,
"loss": 0.0679,
"step": 59
},
{
"epoch": 0.07,
"grad_norm": 1.9699870347976685,
"learning_rate": 9.996202352190256e-06,
"loss": 0.0565,
"step": 60
},
{
"epoch": 0.08,
"grad_norm": 2.0471878051757812,
"learning_rate": 9.996048947428212e-06,
"loss": 0.0725,
"step": 61
},
{
"epoch": 0.08,
"grad_norm": 1.3680695295333862,
"learning_rate": 9.995892506564461e-06,
"loss": 0.0586,
"step": 62
},
{
"epoch": 0.08,
"grad_norm": 4.124834060668945,
"learning_rate": 9.995733029694077e-06,
"loss": 0.0724,
"step": 63
},
{
"epoch": 0.08,
"grad_norm": 2.7247884273529053,
"learning_rate": 9.995570516913971e-06,
"loss": 0.095,
"step": 64
},
{
"epoch": 0.08,
"grad_norm": 5.306038856506348,
"learning_rate": 9.995404968322902e-06,
"loss": 0.0783,
"step": 65
},
{
"epoch": 0.08,
"grad_norm": 1.6850618124008179,
"learning_rate": 9.995236384021474e-06,
"loss": 0.0602,
"step": 66
},
{
"epoch": 0.08,
"grad_norm": 7.234889984130859,
"learning_rate": 9.995064764112135e-06,
"loss": 0.0852,
"step": 67
},
{
"epoch": 0.08,
"grad_norm": 3.0967376232147217,
"learning_rate": 9.994890108699182e-06,
"loss": 0.0905,
"step": 68
},
{
"epoch": 0.09,
"grad_norm": 3.0317909717559814,
"learning_rate": 9.99471241788875e-06,
"loss": 0.0728,
"step": 69
},
{
"epoch": 0.09,
"grad_norm": 4.1822967529296875,
"learning_rate": 9.994531691788822e-06,
"loss": 0.0919,
"step": 70
},
{
"epoch": 0.09,
"grad_norm": 2.137779474258423,
"learning_rate": 9.994347930509225e-06,
"loss": 0.0496,
"step": 71
},
{
"epoch": 0.09,
"grad_norm": 1.5484669208526611,
"learning_rate": 9.994161134161635e-06,
"loss": 0.0696,
"step": 72
},
{
"epoch": 0.09,
"grad_norm": 4.332581996917725,
"learning_rate": 9.993971302859561e-06,
"loss": 0.0769,
"step": 73
},
{
"epoch": 0.09,
"grad_norm": 1.838725209236145,
"learning_rate": 9.99377843671837e-06,
"loss": 0.1011,
"step": 74
},
{
"epoch": 0.09,
"grad_norm": 3.3594350814819336,
"learning_rate": 9.993582535855265e-06,
"loss": 0.075,
"step": 75
},
{
"epoch": 0.09,
"grad_norm": 1.8417227268218994,
"learning_rate": 9.993383600389294e-06,
"loss": 0.0529,
"step": 76
},
{
"epoch": 0.1,
"grad_norm": 4.550814151763916,
"learning_rate": 9.993181630441352e-06,
"loss": 0.1104,
"step": 77
},
{
"epoch": 0.1,
"grad_norm": 1.729711651802063,
"learning_rate": 9.992976626134171e-06,
"loss": 0.0601,
"step": 78
},
{
"epoch": 0.1,
"grad_norm": 2.1105282306671143,
"learning_rate": 9.99276858759234e-06,
"loss": 0.0423,
"step": 79
},
{
"epoch": 0.1,
"grad_norm": 2.180546522140503,
"learning_rate": 9.992557514942278e-06,
"loss": 0.0691,
"step": 80
},
{
"epoch": 0.1,
"grad_norm": 3.0761630535125732,
"learning_rate": 9.992343408312258e-06,
"loss": 0.0503,
"step": 81
},
{
"epoch": 0.1,
"grad_norm": 0.8641157150268555,
"learning_rate": 9.992126267832392e-06,
"loss": 0.0425,
"step": 82
},
{
"epoch": 0.1,
"grad_norm": 10.724833488464355,
"learning_rate": 9.991906093634633e-06,
"loss": 0.0603,
"step": 83
},
{
"epoch": 0.1,
"grad_norm": 1.1756705045700073,
"learning_rate": 9.991682885852784e-06,
"loss": 0.0392,
"step": 84
},
{
"epoch": 0.11,
"grad_norm": 1.1171228885650635,
"learning_rate": 9.991456644622489e-06,
"loss": 0.0454,
"step": 85
},
{
"epoch": 0.11,
"grad_norm": 1.6004431247711182,
"learning_rate": 9.991227370081233e-06,
"loss": 0.0496,
"step": 86
},
{
"epoch": 0.11,
"grad_norm": 3.264841318130493,
"learning_rate": 9.990995062368346e-06,
"loss": 0.0339,
"step": 87
},
{
"epoch": 0.11,
"grad_norm": 2.4765560626983643,
"learning_rate": 9.990759721625005e-06,
"loss": 0.0698,
"step": 88
},
{
"epoch": 0.11,
"grad_norm": 6.907183647155762,
"learning_rate": 9.990521347994224e-06,
"loss": 0.1026,
"step": 89
},
{
"epoch": 0.11,
"grad_norm": 5.384580135345459,
"learning_rate": 9.990279941620861e-06,
"loss": 0.0664,
"step": 90
},
{
"epoch": 0.11,
"grad_norm": 1.2060827016830444,
"learning_rate": 9.990035502651624e-06,
"loss": 0.0324,
"step": 91
},
{
"epoch": 0.11,
"grad_norm": 1.0956050157546997,
"learning_rate": 9.989788031235054e-06,
"loss": 0.0593,
"step": 92
},
{
"epoch": 0.12,
"grad_norm": 2.1994054317474365,
"learning_rate": 9.98953752752154e-06,
"loss": 0.0484,
"step": 93
},
{
"epoch": 0.12,
"grad_norm": 3.490142583847046,
"learning_rate": 9.989283991663316e-06,
"loss": 0.0561,
"step": 94
},
{
"epoch": 0.12,
"grad_norm": 4.274105072021484,
"learning_rate": 9.989027423814454e-06,
"loss": 0.1123,
"step": 95
},
{
"epoch": 0.12,
"grad_norm": 3.0847527980804443,
"learning_rate": 9.98876782413087e-06,
"loss": 0.0606,
"step": 96
},
{
"epoch": 0.12,
"grad_norm": 1.8111186027526855,
"learning_rate": 9.988505192770324e-06,
"loss": 0.0681,
"step": 97
},
{
"epoch": 0.12,
"grad_norm": 1.2713731527328491,
"learning_rate": 9.988239529892416e-06,
"loss": 0.0516,
"step": 98
},
{
"epoch": 0.12,
"grad_norm": 1.189513087272644,
"learning_rate": 9.987970835658592e-06,
"loss": 0.0768,
"step": 99
},
{
"epoch": 0.12,
"grad_norm": 0.9951283931732178,
"learning_rate": 9.987699110232134e-06,
"loss": 0.0416,
"step": 100
},
{
"epoch": 0.12,
"grad_norm": 1.4628676176071167,
"learning_rate": 9.987424353778172e-06,
"loss": 0.0751,
"step": 101
},
{
"epoch": 0.13,
"grad_norm": 1.41041100025177,
"learning_rate": 9.987146566463677e-06,
"loss": 0.0681,
"step": 102
},
{
"epoch": 0.13,
"grad_norm": 1.9383851289749146,
"learning_rate": 9.986865748457457e-06,
"loss": 0.1003,
"step": 103
},
{
"epoch": 0.13,
"grad_norm": 1.1434725522994995,
"learning_rate": 9.986581899930167e-06,
"loss": 0.049,
"step": 104
},
{
"epoch": 0.13,
"grad_norm": 3.613456964492798,
"learning_rate": 9.986295021054302e-06,
"loss": 0.0519,
"step": 105
},
{
"epoch": 0.13,
"grad_norm": 3.5484371185302734,
"learning_rate": 9.986005112004198e-06,
"loss": 0.0571,
"step": 106
},
{
"epoch": 0.13,
"grad_norm": 1.9423480033874512,
"learning_rate": 9.985712172956035e-06,
"loss": 0.039,
"step": 107
},
{
"epoch": 0.13,
"grad_norm": 2.0560059547424316,
"learning_rate": 9.985416204087828e-06,
"loss": 0.0904,
"step": 108
},
{
"epoch": 0.13,
"grad_norm": 6.695100784301758,
"learning_rate": 9.985117205579442e-06,
"loss": 0.1549,
"step": 109
},
{
"epoch": 0.14,
"grad_norm": 2.4656105041503906,
"learning_rate": 9.984815177612574e-06,
"loss": 0.079,
"step": 110
},
{
"epoch": 0.14,
"grad_norm": 1.974007487297058,
"learning_rate": 9.984510120370771e-06,
"loss": 0.0585,
"step": 111
},
{
"epoch": 0.14,
"grad_norm": 1.3341798782348633,
"learning_rate": 9.984202034039414e-06,
"loss": 0.0585,
"step": 112
},
{
"epoch": 0.14,
"grad_norm": 2.7250359058380127,
"learning_rate": 9.983890918805727e-06,
"loss": 0.0651,
"step": 113
},
{
"epoch": 0.14,
"grad_norm": 4.140810489654541,
"learning_rate": 9.983576774858776e-06,
"loss": 0.0748,
"step": 114
},
{
"epoch": 0.14,
"grad_norm": 6.119039058685303,
"learning_rate": 9.983259602389469e-06,
"loss": 0.0818,
"step": 115
},
{
"epoch": 0.14,
"grad_norm": 1.3782867193222046,
"learning_rate": 9.982939401590545e-06,
"loss": 0.0563,
"step": 116
},
{
"epoch": 0.14,
"grad_norm": 1.240810513496399,
"learning_rate": 9.982616172656594e-06,
"loss": 0.0555,
"step": 117
},
{
"epoch": 0.15,
"grad_norm": 2.0260303020477295,
"learning_rate": 9.982289915784044e-06,
"loss": 0.0554,
"step": 118
},
{
"epoch": 0.15,
"grad_norm": 2.1243703365325928,
"learning_rate": 9.981960631171162e-06,
"loss": 0.0584,
"step": 119
},
{
"epoch": 0.15,
"grad_norm": 2.7996938228607178,
"learning_rate": 9.98162831901805e-06,
"loss": 0.0854,
"step": 120
},
{
"epoch": 0.15,
"grad_norm": 1.3062973022460938,
"learning_rate": 9.981292979526656e-06,
"loss": 0.0821,
"step": 121
},
{
"epoch": 0.15,
"grad_norm": 1.2655537128448486,
"learning_rate": 9.980954612900768e-06,
"loss": 0.0643,
"step": 122
},
{
"epoch": 0.15,
"grad_norm": 4.0950798988342285,
"learning_rate": 9.980613219346012e-06,
"loss": 0.0994,
"step": 123
},
{
"epoch": 0.15,
"grad_norm": 1.522292971611023,
"learning_rate": 9.980268799069848e-06,
"loss": 0.0369,
"step": 124
},
{
"epoch": 0.15,
"grad_norm": 2.5451443195343018,
"learning_rate": 9.979921352281585e-06,
"loss": 0.0286,
"step": 125
},
{
"epoch": 0.16,
"grad_norm": 1.8015575408935547,
"learning_rate": 9.979570879192365e-06,
"loss": 0.0736,
"step": 126
},
{
"epoch": 0.16,
"grad_norm": 3.2620017528533936,
"learning_rate": 9.979217380015173e-06,
"loss": 0.0662,
"step": 127
},
{
"epoch": 0.16,
"grad_norm": 0.5585700273513794,
"learning_rate": 9.978860854964827e-06,
"loss": 0.0248,
"step": 128
},
{
"epoch": 0.16,
"grad_norm": 1.1841486692428589,
"learning_rate": 9.978501304257991e-06,
"loss": 0.0386,
"step": 129
},
{
"epoch": 0.16,
"grad_norm": 1.1351743936538696,
"learning_rate": 9.97813872811316e-06,
"loss": 0.0437,
"step": 130
},
{
"epoch": 0.16,
"grad_norm": 2.4472172260284424,
"learning_rate": 9.977773126750677e-06,
"loss": 0.074,
"step": 131
},
{
"epoch": 0.16,
"grad_norm": 0.9335076808929443,
"learning_rate": 9.977404500392711e-06,
"loss": 0.034,
"step": 132
},
{
"epoch": 0.16,
"grad_norm": 1.9846038818359375,
"learning_rate": 9.977032849263284e-06,
"loss": 0.0488,
"step": 133
},
{
"epoch": 0.17,
"grad_norm": 1.003464698791504,
"learning_rate": 9.976658173588244e-06,
"loss": 0.0199,
"step": 134
},
{
"epoch": 0.17,
"grad_norm": 1.1298803091049194,
"learning_rate": 9.976280473595284e-06,
"loss": 0.0507,
"step": 135
},
{
"epoch": 0.17,
"grad_norm": 4.0241546630859375,
"learning_rate": 9.975899749513928e-06,
"loss": 0.097,
"step": 136
},
{
"epoch": 0.17,
"grad_norm": 2.1224637031555176,
"learning_rate": 9.975516001575549e-06,
"loss": 0.0656,
"step": 137
},
{
"epoch": 0.17,
"grad_norm": 1.3180643320083618,
"learning_rate": 9.975129230013347e-06,
"loss": 0.0839,
"step": 138
},
{
"epoch": 0.17,
"grad_norm": 2.089977979660034,
"learning_rate": 9.974739435062364e-06,
"loss": 0.0571,
"step": 139
},
{
"epoch": 0.17,
"grad_norm": 1.773493766784668,
"learning_rate": 9.974346616959476e-06,
"loss": 0.025,
"step": 140
},
{
"epoch": 0.17,
"grad_norm": 2.1019980907440186,
"learning_rate": 9.973950775943403e-06,
"loss": 0.0447,
"step": 141
},
{
"epoch": 0.18,
"grad_norm": 1.4967840909957886,
"learning_rate": 9.973551912254696e-06,
"loss": 0.0422,
"step": 142
},
{
"epoch": 0.18,
"grad_norm": 1.1371103525161743,
"learning_rate": 9.973150026135743e-06,
"loss": 0.0648,
"step": 143
},
{
"epoch": 0.18,
"grad_norm": 0.8660270571708679,
"learning_rate": 9.972745117830774e-06,
"loss": 0.0344,
"step": 144
},
{
"epoch": 0.18,
"grad_norm": 5.05332088470459,
"learning_rate": 9.972337187585848e-06,
"loss": 0.1036,
"step": 145
},
{
"epoch": 0.18,
"grad_norm": 1.1562827825546265,
"learning_rate": 9.971926235648868e-06,
"loss": 0.041,
"step": 146
},
{
"epoch": 0.18,
"grad_norm": 3.426886558532715,
"learning_rate": 9.971512262269568e-06,
"loss": 0.127,
"step": 147
},
{
"epoch": 0.18,
"grad_norm": 1.173113465309143,
"learning_rate": 9.97109526769952e-06,
"loss": 0.0525,
"step": 148
},
{
"epoch": 0.18,
"grad_norm": 1.1487282514572144,
"learning_rate": 9.970675252192133e-06,
"loss": 0.052,
"step": 149
},
{
"epoch": 0.19,
"grad_norm": 1.5633060932159424,
"learning_rate": 9.970252216002647e-06,
"loss": 0.0389,
"step": 150
},
{
"epoch": 0.19,
"grad_norm": 1.445123314857483,
"learning_rate": 9.969826159388145e-06,
"loss": 0.0521,
"step": 151
},
{
"epoch": 0.19,
"grad_norm": 0.8425119519233704,
"learning_rate": 9.96939708260754e-06,
"loss": 0.0513,
"step": 152
},
{
"epoch": 0.19,
"grad_norm": 0.9555310606956482,
"learning_rate": 9.968964985921584e-06,
"loss": 0.0574,
"step": 153
},
{
"epoch": 0.19,
"grad_norm": 1.8024086952209473,
"learning_rate": 9.96852986959286e-06,
"loss": 0.058,
"step": 154
},
{
"epoch": 0.19,
"grad_norm": 1.4136022329330444,
"learning_rate": 9.96809173388579e-06,
"loss": 0.0402,
"step": 155
},
{
"epoch": 0.19,
"grad_norm": 0.9865325093269348,
"learning_rate": 9.96765057906663e-06,
"loss": 0.0555,
"step": 156
},
{
"epoch": 0.19,
"grad_norm": 1.3715591430664062,
"learning_rate": 9.967206405403468e-06,
"loss": 0.0549,
"step": 157
},
{
"epoch": 0.2,
"grad_norm": 1.10662841796875,
"learning_rate": 9.966759213166231e-06,
"loss": 0.0584,
"step": 158
},
{
"epoch": 0.2,
"grad_norm": 1.3035138845443726,
"learning_rate": 9.966309002626676e-06,
"loss": 0.0398,
"step": 159
},
{
"epoch": 0.2,
"grad_norm": 2.7275445461273193,
"learning_rate": 9.965855774058395e-06,
"loss": 0.0583,
"step": 160
},
{
"epoch": 0.2,
"grad_norm": 1.4070425033569336,
"learning_rate": 9.965399527736819e-06,
"loss": 0.0476,
"step": 161
},
{
"epoch": 0.2,
"grad_norm": 1.2913644313812256,
"learning_rate": 9.964940263939206e-06,
"loss": 0.0693,
"step": 162
},
{
"epoch": 0.2,
"grad_norm": 5.090683937072754,
"learning_rate": 9.964477982944654e-06,
"loss": 0.0737,
"step": 163
},
{
"epoch": 0.2,
"grad_norm": 4.244226455688477,
"learning_rate": 9.964012685034087e-06,
"loss": 0.0659,
"step": 164
},
{
"epoch": 0.2,
"grad_norm": 1.7967549562454224,
"learning_rate": 9.96354437049027e-06,
"loss": 0.0226,
"step": 165
},
{
"epoch": 0.21,
"grad_norm": 1.695214033126831,
"learning_rate": 9.963073039597798e-06,
"loss": 0.0772,
"step": 166
},
{
"epoch": 0.21,
"grad_norm": 2.0708000659942627,
"learning_rate": 9.962598692643098e-06,
"loss": 0.053,
"step": 167
},
{
"epoch": 0.21,
"grad_norm": 2.1509592533111572,
"learning_rate": 9.962121329914432e-06,
"loss": 0.0714,
"step": 168
},
{
"epoch": 0.21,
"grad_norm": 2.4323039054870605,
"learning_rate": 9.961640951701892e-06,
"loss": 0.0456,
"step": 169
},
{
"epoch": 0.21,
"grad_norm": 2.304720640182495,
"learning_rate": 9.961157558297404e-06,
"loss": 0.0854,
"step": 170
},
{
"epoch": 0.21,
"grad_norm": 0.8575959205627441,
"learning_rate": 9.960671149994727e-06,
"loss": 0.0374,
"step": 171
},
{
"epoch": 0.21,
"grad_norm": 1.106746792793274,
"learning_rate": 9.960181727089455e-06,
"loss": 0.0515,
"step": 172
},
{
"epoch": 0.21,
"grad_norm": 1.6459972858428955,
"learning_rate": 9.959689289879003e-06,
"loss": 0.0514,
"step": 173
},
{
"epoch": 0.22,
"grad_norm": 1.5684750080108643,
"learning_rate": 9.959193838662634e-06,
"loss": 0.0669,
"step": 174
},
{
"epoch": 0.22,
"grad_norm": 1.1011048555374146,
"learning_rate": 9.958695373741428e-06,
"loss": 0.0406,
"step": 175
},
{
"epoch": 0.22,
"grad_norm": 0.9976766109466553,
"learning_rate": 9.958193895418305e-06,
"loss": 0.0377,
"step": 176
},
{
"epoch": 0.22,
"grad_norm": 1.4583932161331177,
"learning_rate": 9.957689403998012e-06,
"loss": 0.06,
"step": 177
},
{
"epoch": 0.22,
"grad_norm": 1.1599044799804688,
"learning_rate": 9.95718189978713e-06,
"loss": 0.0406,
"step": 178
},
{
"epoch": 0.22,
"grad_norm": 0.9436582326889038,
"learning_rate": 9.95667138309407e-06,
"loss": 0.0361,
"step": 179
},
{
"epoch": 0.22,
"grad_norm": 2.8169147968292236,
"learning_rate": 9.956157854229072e-06,
"loss": 0.0597,
"step": 180
},
{
"epoch": 0.22,
"grad_norm": 0.9190147519111633,
"learning_rate": 9.955641313504208e-06,
"loss": 0.0258,
"step": 181
},
{
"epoch": 0.23,
"grad_norm": 0.8643155694007874,
"learning_rate": 9.95512176123338e-06,
"loss": 0.0327,
"step": 182
},
{
"epoch": 0.23,
"grad_norm": 1.2514710426330566,
"learning_rate": 9.95459919773232e-06,
"loss": 0.0723,
"step": 183
},
{
"epoch": 0.23,
"grad_norm": 1.3103550672531128,
"learning_rate": 9.954073623318593e-06,
"loss": 0.0576,
"step": 184
},
{
"epoch": 0.23,
"grad_norm": 2.092473268508911,
"learning_rate": 9.953545038311587e-06,
"loss": 0.0734,
"step": 185
},
{
"epoch": 0.23,
"grad_norm": 2.5062074661254883,
"learning_rate": 9.953013443032524e-06,
"loss": 0.0483,
"step": 186
},
{
"epoch": 0.23,
"grad_norm": 2.1158766746520996,
"learning_rate": 9.952478837804459e-06,
"loss": 0.0345,
"step": 187
},
{
"epoch": 0.23,
"grad_norm": 2.5865800380706787,
"learning_rate": 9.951941222952264e-06,
"loss": 0.0557,
"step": 188
},
{
"epoch": 0.23,
"grad_norm": 2.171496868133545,
"learning_rate": 9.951400598802655e-06,
"loss": 0.062,
"step": 189
},
{
"epoch": 0.24,
"grad_norm": 0.9497528076171875,
"learning_rate": 9.950856965684167e-06,
"loss": 0.0365,
"step": 190
},
{
"epoch": 0.24,
"grad_norm": 1.4575358629226685,
"learning_rate": 9.950310323927165e-06,
"loss": 0.0648,
"step": 191
},
{
"epoch": 0.24,
"grad_norm": 2.8335795402526855,
"learning_rate": 9.949760673863846e-06,
"loss": 0.0611,
"step": 192
},
{
"epoch": 0.24,
"grad_norm": 1.1269536018371582,
"learning_rate": 9.949208015828232e-06,
"loss": 0.0541,
"step": 193
},
{
"epoch": 0.24,
"grad_norm": 0.9925274848937988,
"learning_rate": 9.948652350156172e-06,
"loss": 0.0275,
"step": 194
},
{
"epoch": 0.24,
"grad_norm": 1.2717292308807373,
"learning_rate": 9.948093677185345e-06,
"loss": 0.041,
"step": 195
},
{
"epoch": 0.24,
"grad_norm": 1.1867843866348267,
"learning_rate": 9.947531997255256e-06,
"loss": 0.0517,
"step": 196
},
{
"epoch": 0.24,
"grad_norm": 1.1004167795181274,
"learning_rate": 9.946967310707241e-06,
"loss": 0.0503,
"step": 197
},
{
"epoch": 0.25,
"grad_norm": 1.8476804494857788,
"learning_rate": 9.946399617884457e-06,
"loss": 0.0419,
"step": 198
},
{
"epoch": 0.25,
"grad_norm": 1.3617258071899414,
"learning_rate": 9.945828919131894e-06,
"loss": 0.0273,
"step": 199
},
{
"epoch": 0.25,
"grad_norm": 1.4114432334899902,
"learning_rate": 9.945255214796366e-06,
"loss": 0.0448,
"step": 200
},
{
"epoch": 0.25,
"grad_norm": 1.4074312448501587,
"learning_rate": 9.944678505226511e-06,
"loss": 0.0637,
"step": 201
},
{
"epoch": 0.25,
"grad_norm": 1.2234091758728027,
"learning_rate": 9.944098790772797e-06,
"loss": 0.0497,
"step": 202
},
{
"epoch": 0.25,
"grad_norm": 1.3652763366699219,
"learning_rate": 9.943516071787517e-06,
"loss": 0.0555,
"step": 203
},
{
"epoch": 0.25,
"grad_norm": 2.020076036453247,
"learning_rate": 9.942930348624788e-06,
"loss": 0.0488,
"step": 204
},
{
"epoch": 0.25,
"grad_norm": 1.1463106870651245,
"learning_rate": 9.942341621640558e-06,
"loss": 0.0498,
"step": 205
},
{
"epoch": 0.25,
"grad_norm": 1.1451953649520874,
"learning_rate": 9.941749891192594e-06,
"loss": 0.0485,
"step": 206
},
{
"epoch": 0.26,
"grad_norm": 2.710951805114746,
"learning_rate": 9.94115515764049e-06,
"loss": 0.0485,
"step": 207
},
{
"epoch": 0.26,
"grad_norm": 1.6404072046279907,
"learning_rate": 9.940557421345667e-06,
"loss": 0.0387,
"step": 208
},
{
"epoch": 0.26,
"grad_norm": 1.1222543716430664,
"learning_rate": 9.939956682671372e-06,
"loss": 0.0586,
"step": 209
},
{
"epoch": 0.26,
"grad_norm": 1.6379327774047852,
"learning_rate": 9.939352941982671e-06,
"loss": 0.068,
"step": 210
},
{
"epoch": 0.26,
"grad_norm": 1.2636500597000122,
"learning_rate": 9.938746199646458e-06,
"loss": 0.0413,
"step": 211
},
{
"epoch": 0.26,
"grad_norm": 1.1981465816497803,
"learning_rate": 9.938136456031454e-06,
"loss": 0.0259,
"step": 212
},
{
"epoch": 0.26,
"grad_norm": 1.2407490015029907,
"learning_rate": 9.937523711508196e-06,
"loss": 0.0413,
"step": 213
},
{
"epoch": 0.26,
"grad_norm": 1.5851786136627197,
"learning_rate": 9.93690796644905e-06,
"loss": 0.0452,
"step": 214
},
{
"epoch": 0.27,
"grad_norm": 1.1833544969558716,
"learning_rate": 9.936289221228207e-06,
"loss": 0.0415,
"step": 215
},
{
"epoch": 0.27,
"grad_norm": 5.073670387268066,
"learning_rate": 9.935667476221678e-06,
"loss": 0.1248,
"step": 216
},
{
"epoch": 0.27,
"grad_norm": 2.5642805099487305,
"learning_rate": 9.935042731807297e-06,
"loss": 0.0708,
"step": 217
},
{
"epoch": 0.27,
"grad_norm": 3.680995464324951,
"learning_rate": 9.934414988364722e-06,
"loss": 0.0587,
"step": 218
},
{
"epoch": 0.27,
"grad_norm": 2.164574146270752,
"learning_rate": 9.933784246275432e-06,
"loss": 0.0532,
"step": 219
},
{
"epoch": 0.27,
"grad_norm": 1.1444894075393677,
"learning_rate": 9.93315050592273e-06,
"loss": 0.0486,
"step": 220
},
{
"epoch": 0.27,
"grad_norm": 0.9272328615188599,
"learning_rate": 9.932513767691743e-06,
"loss": 0.0465,
"step": 221
},
{
"epoch": 0.27,
"grad_norm": 3.0213119983673096,
"learning_rate": 9.931874031969411e-06,
"loss": 0.0679,
"step": 222
},
{
"epoch": 0.28,
"grad_norm": 2.7126073837280273,
"learning_rate": 9.931231299144509e-06,
"loss": 0.0849,
"step": 223
},
{
"epoch": 0.28,
"grad_norm": 1.2266963720321655,
"learning_rate": 9.93058556960762e-06,
"loss": 0.0722,
"step": 224
},
{
"epoch": 0.28,
"grad_norm": 2.530362844467163,
"learning_rate": 9.929936843751158e-06,
"loss": 0.0477,
"step": 225
},
{
"epoch": 0.28,
"grad_norm": 2.087737798690796,
"learning_rate": 9.929285121969352e-06,
"loss": 0.0698,
"step": 226
},
{
"epoch": 0.28,
"grad_norm": 1.2407419681549072,
"learning_rate": 9.928630404658255e-06,
"loss": 0.0501,
"step": 227
},
{
"epoch": 0.28,
"grad_norm": 1.7187033891677856,
"learning_rate": 9.927972692215739e-06,
"loss": 0.0537,
"step": 228
},
{
"epoch": 0.28,
"grad_norm": 2.143998861312866,
"learning_rate": 9.927311985041495e-06,
"loss": 0.0554,
"step": 229
},
{
"epoch": 0.28,
"grad_norm": 2.8843326568603516,
"learning_rate": 9.926648283537037e-06,
"loss": 0.0544,
"step": 230
},
{
"epoch": 0.29,
"grad_norm": 1.6308791637420654,
"learning_rate": 9.925981588105695e-06,
"loss": 0.0505,
"step": 231
},
{
"epoch": 0.29,
"grad_norm": 1.8796863555908203,
"learning_rate": 9.92531189915262e-06,
"loss": 0.0537,
"step": 232
},
{
"epoch": 0.29,
"grad_norm": 1.4090087413787842,
"learning_rate": 9.924639217084783e-06,
"loss": 0.0589,
"step": 233
},
{
"epoch": 0.29,
"grad_norm": 0.9706072807312012,
"learning_rate": 9.923963542310975e-06,
"loss": 0.049,
"step": 234
},
{
"epoch": 0.29,
"grad_norm": 0.9905783534049988,
"learning_rate": 9.923284875241802e-06,
"loss": 0.0537,
"step": 235
},
{
"epoch": 0.29,
"grad_norm": 0.5304461717605591,
"learning_rate": 9.92260321628969e-06,
"loss": 0.0291,
"step": 236
},
{
"epoch": 0.29,
"grad_norm": 1.2716902494430542,
"learning_rate": 9.921918565868887e-06,
"loss": 0.0652,
"step": 237
},
{
"epoch": 0.29,
"grad_norm": 0.9943916201591492,
"learning_rate": 9.921230924395449e-06,
"loss": 0.0543,
"step": 238
},
{
"epoch": 0.3,
"grad_norm": 1.3783643245697021,
"learning_rate": 9.920540292287262e-06,
"loss": 0.0536,
"step": 239
},
{
"epoch": 0.3,
"grad_norm": 1.389773964881897,
"learning_rate": 9.91984666996402e-06,
"loss": 0.0376,
"step": 240
},
{
"epoch": 0.3,
"grad_norm": 0.7887927293777466,
"learning_rate": 9.91915005784724e-06,
"loss": 0.0272,
"step": 241
},
{
"epoch": 0.3,
"grad_norm": 1.902744174003601,
"learning_rate": 9.918450456360252e-06,
"loss": 0.0543,
"step": 242
},
{
"epoch": 0.3,
"grad_norm": 0.6114033460617065,
"learning_rate": 9.917747865928206e-06,
"loss": 0.0262,
"step": 243
},
{
"epoch": 0.3,
"grad_norm": 1.1496695280075073,
"learning_rate": 9.917042286978064e-06,
"loss": 0.0643,
"step": 244
},
{
"epoch": 0.3,
"grad_norm": 0.8322230577468872,
"learning_rate": 9.916333719938608e-06,
"loss": 0.0435,
"step": 245
},
{
"epoch": 0.3,
"grad_norm": 0.9281955361366272,
"learning_rate": 9.915622165240435e-06,
"loss": 0.0399,
"step": 246
},
{
"epoch": 0.31,
"grad_norm": 0.7492028474807739,
"learning_rate": 9.914907623315958e-06,
"loss": 0.0367,
"step": 247
},
{
"epoch": 0.31,
"grad_norm": 2.0944385528564453,
"learning_rate": 9.914190094599403e-06,
"loss": 0.0488,
"step": 248
},
{
"epoch": 0.31,
"grad_norm": 1.0233027935028076,
"learning_rate": 9.913469579526811e-06,
"loss": 0.0475,
"step": 249
},
{
"epoch": 0.31,
"grad_norm": 0.9051103591918945,
"learning_rate": 9.912746078536044e-06,
"loss": 0.0374,
"step": 250
},
{
"epoch": 0.31,
"grad_norm": 0.6250872015953064,
"learning_rate": 9.91201959206677e-06,
"loss": 0.0236,
"step": 251
},
{
"epoch": 0.31,
"grad_norm": 1.0147565603256226,
"learning_rate": 9.911290120560477e-06,
"loss": 0.0408,
"step": 252
},
{
"epoch": 0.31,
"grad_norm": 1.8525872230529785,
"learning_rate": 9.910557664460464e-06,
"loss": 0.0485,
"step": 253
},
{
"epoch": 0.31,
"grad_norm": 2.040386915206909,
"learning_rate": 9.909822224211845e-06,
"loss": 0.0716,
"step": 254
},
{
"epoch": 0.32,
"grad_norm": 1.2481484413146973,
"learning_rate": 9.90908380026155e-06,
"loss": 0.0376,
"step": 255
},
{
"epoch": 0.32,
"grad_norm": 2.1175787448883057,
"learning_rate": 9.908342393058317e-06,
"loss": 0.0657,
"step": 256
},
{
"epoch": 0.32,
"grad_norm": 0.9903053641319275,
"learning_rate": 9.907598003052701e-06,
"loss": 0.0378,
"step": 257
},
{
"epoch": 0.32,
"grad_norm": 1.7109051942825317,
"learning_rate": 9.906850630697068e-06,
"loss": 0.0624,
"step": 258
},
{
"epoch": 0.32,
"grad_norm": 1.9067022800445557,
"learning_rate": 9.906100276445596e-06,
"loss": 0.0492,
"step": 259
},
{
"epoch": 0.32,
"grad_norm": 0.9397685527801514,
"learning_rate": 9.905346940754274e-06,
"loss": 0.0147,
"step": 260
},
{
"epoch": 0.32,
"grad_norm": 3.0456113815307617,
"learning_rate": 9.90459062408091e-06,
"loss": 0.0812,
"step": 261
},
{
"epoch": 0.32,
"grad_norm": 2.6053810119628906,
"learning_rate": 9.903831326885112e-06,
"loss": 0.0623,
"step": 262
},
{
"epoch": 0.33,
"grad_norm": 2.0448148250579834,
"learning_rate": 9.90306904962831e-06,
"loss": 0.0803,
"step": 263
},
{
"epoch": 0.33,
"grad_norm": 1.1430933475494385,
"learning_rate": 9.902303792773736e-06,
"loss": 0.0305,
"step": 264
},
{
"epoch": 0.33,
"grad_norm": 0.8864290714263916,
"learning_rate": 9.90153555678644e-06,
"loss": 0.0488,
"step": 265
},
{
"epoch": 0.33,
"grad_norm": 1.6222556829452515,
"learning_rate": 9.900764342133277e-06,
"loss": 0.021,
"step": 266
},
{
"epoch": 0.33,
"grad_norm": 1.0808035135269165,
"learning_rate": 9.899990149282917e-06,
"loss": 0.0326,
"step": 267
},
{
"epoch": 0.33,
"grad_norm": 2.029120683670044,
"learning_rate": 9.899212978705836e-06,
"loss": 0.0384,
"step": 268
},
{
"epoch": 0.33,
"grad_norm": 1.2418546676635742,
"learning_rate": 9.898432830874324e-06,
"loss": 0.0365,
"step": 269
},
{
"epoch": 0.33,
"grad_norm": 1.3441228866577148,
"learning_rate": 9.897649706262474e-06,
"loss": 0.0692,
"step": 270
},
{
"epoch": 0.34,
"grad_norm": 1.4092243909835815,
"learning_rate": 9.896863605346191e-06,
"loss": 0.0472,
"step": 271
},
{
"epoch": 0.34,
"grad_norm": 1.3884505033493042,
"learning_rate": 9.89607452860319e-06,
"loss": 0.088,
"step": 272
},
{
"epoch": 0.34,
"grad_norm": 2.6695573329925537,
"learning_rate": 9.895282476512995e-06,
"loss": 0.043,
"step": 273
},
{
"epoch": 0.34,
"grad_norm": 1.7949867248535156,
"learning_rate": 9.894487449556934e-06,
"loss": 0.0514,
"step": 274
},
{
"epoch": 0.34,
"grad_norm": 1.3810291290283203,
"learning_rate": 9.893689448218146e-06,
"loss": 0.0472,
"step": 275
},
{
"epoch": 0.34,
"grad_norm": 1.0681228637695312,
"learning_rate": 9.892888472981577e-06,
"loss": 0.0389,
"step": 276
},
{
"epoch": 0.34,
"grad_norm": 0.6548139452934265,
"learning_rate": 9.89208452433398e-06,
"loss": 0.0339,
"step": 277
},
{
"epoch": 0.34,
"grad_norm": 0.8944026231765747,
"learning_rate": 9.891277602763916e-06,
"loss": 0.037,
"step": 278
},
{
"epoch": 0.35,
"grad_norm": 1.7463440895080566,
"learning_rate": 9.89046770876175e-06,
"loss": 0.048,
"step": 279
},
{
"epoch": 0.35,
"grad_norm": 3.2079529762268066,
"learning_rate": 9.889654842819658e-06,
"loss": 0.0721,
"step": 280
},
{
"epoch": 0.35,
"grad_norm": 2.0868616104125977,
"learning_rate": 9.888839005431615e-06,
"loss": 0.0573,
"step": 281
},
{
"epoch": 0.35,
"grad_norm": 1.23513662815094,
"learning_rate": 9.888020197093409e-06,
"loss": 0.0542,
"step": 282
},
{
"epoch": 0.35,
"grad_norm": 0.7781217694282532,
"learning_rate": 9.887198418302629e-06,
"loss": 0.0386,
"step": 283
},
{
"epoch": 0.35,
"grad_norm": 1.390410304069519,
"learning_rate": 9.886373669558669e-06,
"loss": 0.0338,
"step": 284
},
{
"epoch": 0.35,
"grad_norm": 1.6135231256484985,
"learning_rate": 9.885545951362733e-06,
"loss": 0.0403,
"step": 285
},
{
"epoch": 0.35,
"grad_norm": 1.1802467107772827,
"learning_rate": 9.884715264217823e-06,
"loss": 0.0716,
"step": 286
},
{
"epoch": 0.36,
"grad_norm": 1.1783833503723145,
"learning_rate": 9.883881608628748e-06,
"loss": 0.0426,
"step": 287
},
{
"epoch": 0.36,
"grad_norm": 0.994340181350708,
"learning_rate": 9.883044985102122e-06,
"loss": 0.047,
"step": 288
},
{
"epoch": 0.36,
"grad_norm": 0.9849565625190735,
"learning_rate": 9.882205394146362e-06,
"loss": 0.0416,
"step": 289
},
{
"epoch": 0.36,
"grad_norm": 1.2525103092193604,
"learning_rate": 9.881362836271686e-06,
"loss": 0.0672,
"step": 290
},
{
"epoch": 0.36,
"grad_norm": 0.8505926728248596,
"learning_rate": 9.880517311990118e-06,
"loss": 0.0455,
"step": 291
},
{
"epoch": 0.36,
"grad_norm": 1.3629908561706543,
"learning_rate": 9.879668821815484e-06,
"loss": 0.0357,
"step": 292
},
{
"epoch": 0.36,
"grad_norm": 1.1365973949432373,
"learning_rate": 9.878817366263412e-06,
"loss": 0.0666,
"step": 293
},
{
"epoch": 0.36,
"grad_norm": 1.0324252843856812,
"learning_rate": 9.87796294585133e-06,
"loss": 0.0449,
"step": 294
},
{
"epoch": 0.37,
"grad_norm": 0.757729172706604,
"learning_rate": 9.877105561098473e-06,
"loss": 0.0248,
"step": 295
},
{
"epoch": 0.37,
"grad_norm": 1.2894716262817383,
"learning_rate": 9.87624521252587e-06,
"loss": 0.0382,
"step": 296
},
{
"epoch": 0.37,
"grad_norm": 1.5887492895126343,
"learning_rate": 9.87538190065636e-06,
"loss": 0.0459,
"step": 297
},
{
"epoch": 0.37,
"grad_norm": 1.5617096424102783,
"learning_rate": 9.874515626014576e-06,
"loss": 0.0673,
"step": 298
},
{
"epoch": 0.37,
"grad_norm": 2.4001352787017822,
"learning_rate": 9.873646389126954e-06,
"loss": 0.0937,
"step": 299
},
{
"epoch": 0.37,
"grad_norm": 1.1498814821243286,
"learning_rate": 9.872774190521727e-06,
"loss": 0.0609,
"step": 300
},
{
"epoch": 0.37,
"grad_norm": 3.620199680328369,
"learning_rate": 9.871899030728932e-06,
"loss": 0.078,
"step": 301
},
{
"epoch": 0.37,
"grad_norm": 1.5257648229599,
"learning_rate": 9.871020910280408e-06,
"loss": 0.0456,
"step": 302
},
{
"epoch": 0.38,
"grad_norm": 2.344609498977661,
"learning_rate": 9.870139829709784e-06,
"loss": 0.0579,
"step": 303
},
{
"epoch": 0.38,
"grad_norm": 0.6787387132644653,
"learning_rate": 9.869255789552496e-06,
"loss": 0.036,
"step": 304
},
{
"epoch": 0.38,
"grad_norm": 0.7965288162231445,
"learning_rate": 9.868368790345777e-06,
"loss": 0.0347,
"step": 305
},
{
"epoch": 0.38,
"grad_norm": 1.3934015035629272,
"learning_rate": 9.867478832628652e-06,
"loss": 0.0504,
"step": 306
},
{
"epoch": 0.38,
"grad_norm": 0.6102665662765503,
"learning_rate": 9.866585916941951e-06,
"loss": 0.0303,
"step": 307
},
{
"epoch": 0.38,
"grad_norm": 0.6944254636764526,
"learning_rate": 9.865690043828302e-06,
"loss": 0.0389,
"step": 308
},
{
"epoch": 0.38,
"grad_norm": 0.5572813153266907,
"learning_rate": 9.864791213832125e-06,
"loss": 0.0249,
"step": 309
},
{
"epoch": 0.38,
"grad_norm": 0.9218201041221619,
"learning_rate": 9.863889427499641e-06,
"loss": 0.0579,
"step": 310
},
{
"epoch": 0.38,
"grad_norm": 2.7617053985595703,
"learning_rate": 9.862984685378864e-06,
"loss": 0.0942,
"step": 311
},
{
"epoch": 0.39,
"grad_norm": 2.5800890922546387,
"learning_rate": 9.862076988019609e-06,
"loss": 0.0705,
"step": 312
},
{
"epoch": 0.39,
"grad_norm": 0.5009744763374329,
"learning_rate": 9.86116633597348e-06,
"loss": 0.0187,
"step": 313
},
{
"epoch": 0.39,
"grad_norm": 0.8876914381980896,
"learning_rate": 9.860252729793885e-06,
"loss": 0.0574,
"step": 314
},
{
"epoch": 0.39,
"grad_norm": 2.8853681087493896,
"learning_rate": 9.859336170036022e-06,
"loss": 0.0509,
"step": 315
},
{
"epoch": 0.39,
"grad_norm": 3.341853141784668,
"learning_rate": 9.858416657256883e-06,
"loss": 0.0697,
"step": 316
},
{
"epoch": 0.39,
"grad_norm": 1.9934710264205933,
"learning_rate": 9.857494192015258e-06,
"loss": 0.0531,
"step": 317
},
{
"epoch": 0.39,
"grad_norm": 1.259093165397644,
"learning_rate": 9.85656877487173e-06,
"loss": 0.0349,
"step": 318
},
{
"epoch": 0.39,
"grad_norm": 0.9945093393325806,
"learning_rate": 9.855640406388673e-06,
"loss": 0.0393,
"step": 319
},
{
"epoch": 0.4,
"grad_norm": 1.5558804273605347,
"learning_rate": 9.854709087130261e-06,
"loss": 0.0584,
"step": 320
},
{
"epoch": 0.4,
"grad_norm": 2.9720606803894043,
"learning_rate": 9.853774817662453e-06,
"loss": 0.0767,
"step": 321
},
{
"epoch": 0.4,
"grad_norm": 0.8328733444213867,
"learning_rate": 9.85283759855301e-06,
"loss": 0.0312,
"step": 322
},
{
"epoch": 0.4,
"grad_norm": 2.4241795539855957,
"learning_rate": 9.851897430371475e-06,
"loss": 0.0613,
"step": 323
},
{
"epoch": 0.4,
"grad_norm": 1.2547311782836914,
"learning_rate": 9.850954313689193e-06,
"loss": 0.0378,
"step": 324
},
{
"epoch": 0.4,
"grad_norm": 0.9641187191009521,
"learning_rate": 9.850008249079295e-06,
"loss": 0.0301,
"step": 325
},
{
"epoch": 0.4,
"grad_norm": 3.5166923999786377,
"learning_rate": 9.849059237116702e-06,
"loss": 0.0651,
"step": 326
},
{
"epoch": 0.4,
"grad_norm": 1.5394651889801025,
"learning_rate": 9.848107278378136e-06,
"loss": 0.0483,
"step": 327
},
{
"epoch": 0.41,
"grad_norm": 1.9585269689559937,
"learning_rate": 9.847152373442096e-06,
"loss": 0.0548,
"step": 328
},
{
"epoch": 0.41,
"grad_norm": 1.0429555177688599,
"learning_rate": 9.846194522888884e-06,
"loss": 0.0481,
"step": 329
},
{
"epoch": 0.41,
"grad_norm": 1.1581437587738037,
"learning_rate": 9.84523372730058e-06,
"loss": 0.0603,
"step": 330
},
{
"epoch": 0.41,
"grad_norm": 0.7063565850257874,
"learning_rate": 9.844269987261066e-06,
"loss": 0.0326,
"step": 331
},
{
"epoch": 0.41,
"grad_norm": 1.5360925197601318,
"learning_rate": 9.843303303356005e-06,
"loss": 0.0456,
"step": 332
},
{
"epoch": 0.41,
"grad_norm": 1.3182265758514404,
"learning_rate": 9.84233367617285e-06,
"loss": 0.0336,
"step": 333
},
{
"epoch": 0.41,
"grad_norm": 0.8530195951461792,
"learning_rate": 9.841361106300846e-06,
"loss": 0.0375,
"step": 334
},
{
"epoch": 0.41,
"grad_norm": 0.9681763052940369,
"learning_rate": 9.840385594331022e-06,
"loss": 0.0265,
"step": 335
},
{
"epoch": 0.42,
"grad_norm": 1.2474390268325806,
"learning_rate": 9.839407140856199e-06,
"loss": 0.0438,
"step": 336
},
{
"epoch": 0.42,
"grad_norm": 1.427484393119812,
"learning_rate": 9.838425746470984e-06,
"loss": 0.0506,
"step": 337
},
{
"epoch": 0.42,
"grad_norm": 0.8225058317184448,
"learning_rate": 9.837441411771771e-06,
"loss": 0.0355,
"step": 338
},
{
"epoch": 0.42,
"grad_norm": 0.9241979122161865,
"learning_rate": 9.836454137356739e-06,
"loss": 0.0386,
"step": 339
},
{
"epoch": 0.42,
"grad_norm": 0.8418800234794617,
"learning_rate": 9.835463923825854e-06,
"loss": 0.0392,
"step": 340
},
{
"epoch": 0.42,
"grad_norm": 0.9536418914794922,
"learning_rate": 9.834470771780875e-06,
"loss": 0.0577,
"step": 341
},
{
"epoch": 0.42,
"grad_norm": 0.7787923216819763,
"learning_rate": 9.833474681825334e-06,
"loss": 0.0325,
"step": 342
},
{
"epoch": 0.42,
"grad_norm": 2.5342555046081543,
"learning_rate": 9.832475654564562e-06,
"loss": 0.0413,
"step": 343
},
{
"epoch": 0.43,
"grad_norm": 1.160288691520691,
"learning_rate": 9.831473690605664e-06,
"loss": 0.0609,
"step": 344
},
{
"epoch": 0.43,
"grad_norm": 2.0293076038360596,
"learning_rate": 9.830468790557536e-06,
"loss": 0.0376,
"step": 345
},
{
"epoch": 0.43,
"grad_norm": 1.1950795650482178,
"learning_rate": 9.829460955030854e-06,
"loss": 0.0285,
"step": 346
},
{
"epoch": 0.43,
"grad_norm": 1.130022644996643,
"learning_rate": 9.828450184638082e-06,
"loss": 0.0725,
"step": 347
},
{
"epoch": 0.43,
"grad_norm": 1.2049533128738403,
"learning_rate": 9.827436479993468e-06,
"loss": 0.0345,
"step": 348
},
{
"epoch": 0.43,
"grad_norm": 1.9585927724838257,
"learning_rate": 9.826419841713038e-06,
"loss": 0.0539,
"step": 349
},
{
"epoch": 0.43,
"grad_norm": 0.7200453281402588,
"learning_rate": 9.825400270414602e-06,
"loss": 0.0358,
"step": 350
},
{
"epoch": 0.43,
"grad_norm": 0.9681141972541809,
"learning_rate": 9.824377766717758e-06,
"loss": 0.0288,
"step": 351
},
{
"epoch": 0.44,
"grad_norm": 0.843163788318634,
"learning_rate": 9.823352331243881e-06,
"loss": 0.0396,
"step": 352
},
{
"epoch": 0.44,
"grad_norm": 0.8464294075965881,
"learning_rate": 9.822323964616125e-06,
"loss": 0.0394,
"step": 353
},
{
"epoch": 0.44,
"grad_norm": 0.6887583136558533,
"learning_rate": 9.821292667459435e-06,
"loss": 0.0295,
"step": 354
},
{
"epoch": 0.44,
"grad_norm": 1.815610408782959,
"learning_rate": 9.820258440400525e-06,
"loss": 0.0372,
"step": 355
},
{
"epoch": 0.44,
"grad_norm": 1.1596908569335938,
"learning_rate": 9.8192212840679e-06,
"loss": 0.0247,
"step": 356
},
{
"epoch": 0.44,
"grad_norm": 1.0240830183029175,
"learning_rate": 9.818181199091838e-06,
"loss": 0.0497,
"step": 357
},
{
"epoch": 0.44,
"grad_norm": 0.9827424883842468,
"learning_rate": 9.817138186104401e-06,
"loss": 0.0585,
"step": 358
},
{
"epoch": 0.44,
"grad_norm": 0.8876912593841553,
"learning_rate": 9.816092245739426e-06,
"loss": 0.039,
"step": 359
},
{
"epoch": 0.45,
"grad_norm": 1.8267855644226074,
"learning_rate": 9.81504337863253e-06,
"loss": 0.0393,
"step": 360
},
{
"epoch": 0.45,
"grad_norm": 0.7727996706962585,
"learning_rate": 9.813991585421118e-06,
"loss": 0.0442,
"step": 361
},
{
"epoch": 0.45,
"grad_norm": 2.0796356201171875,
"learning_rate": 9.812936866744358e-06,
"loss": 0.0525,
"step": 362
},
{
"epoch": 0.45,
"grad_norm": 0.8108832836151123,
"learning_rate": 9.811879223243207e-06,
"loss": 0.0367,
"step": 363
},
{
"epoch": 0.45,
"grad_norm": 0.9708784818649292,
"learning_rate": 9.810818655560393e-06,
"loss": 0.0436,
"step": 364
},
{
"epoch": 0.45,
"grad_norm": 1.442888855934143,
"learning_rate": 9.809755164340423e-06,
"loss": 0.0432,
"step": 365
},
{
"epoch": 0.45,
"grad_norm": 0.8913246989250183,
"learning_rate": 9.808688750229584e-06,
"loss": 0.046,
"step": 366
},
{
"epoch": 0.45,
"grad_norm": 2.196491003036499,
"learning_rate": 9.807619413875937e-06,
"loss": 0.0466,
"step": 367
},
{
"epoch": 0.46,
"grad_norm": 0.9138450622558594,
"learning_rate": 9.806547155929315e-06,
"loss": 0.0355,
"step": 368
},
{
"epoch": 0.46,
"grad_norm": 0.3624818027019501,
"learning_rate": 9.80547197704133e-06,
"loss": 0.0186,
"step": 369
},
{
"epoch": 0.46,
"grad_norm": 1.0726361274719238,
"learning_rate": 9.804393877865373e-06,
"loss": 0.0497,
"step": 370
},
{
"epoch": 0.46,
"grad_norm": 0.8961818218231201,
"learning_rate": 9.8033128590566e-06,
"loss": 0.0356,
"step": 371
},
{
"epoch": 0.46,
"grad_norm": 2.240262746810913,
"learning_rate": 9.80222892127195e-06,
"loss": 0.0794,
"step": 372
},
{
"epoch": 0.46,
"grad_norm": 2.4816982746124268,
"learning_rate": 9.801142065170132e-06,
"loss": 0.0631,
"step": 373
},
{
"epoch": 0.46,
"grad_norm": 1.1969040632247925,
"learning_rate": 9.80005229141163e-06,
"loss": 0.0559,
"step": 374
},
{
"epoch": 0.46,
"grad_norm": 1.4784609079360962,
"learning_rate": 9.798959600658697e-06,
"loss": 0.0746,
"step": 375
},
{
"epoch": 0.47,
"grad_norm": 0.7828866839408875,
"learning_rate": 9.797863993575365e-06,
"loss": 0.0396,
"step": 376
},
{
"epoch": 0.47,
"grad_norm": 0.7891765832901001,
"learning_rate": 9.796765470827435e-06,
"loss": 0.0567,
"step": 377
},
{
"epoch": 0.47,
"grad_norm": 0.7710642218589783,
"learning_rate": 9.795664033082476e-06,
"loss": 0.0442,
"step": 378
},
{
"epoch": 0.47,
"grad_norm": 0.8450149297714233,
"learning_rate": 9.794559681009837e-06,
"loss": 0.036,
"step": 379
},
{
"epoch": 0.47,
"grad_norm": 0.545617401599884,
"learning_rate": 9.79345241528063e-06,
"loss": 0.0302,
"step": 380
},
{
"epoch": 0.47,
"grad_norm": 1.7093480825424194,
"learning_rate": 9.792342236567743e-06,
"loss": 0.0494,
"step": 381
},
{
"epoch": 0.47,
"grad_norm": 0.8590899109840393,
"learning_rate": 9.791229145545832e-06,
"loss": 0.0389,
"step": 382
},
{
"epoch": 0.47,
"grad_norm": 1.1689053773880005,
"learning_rate": 9.790113142891323e-06,
"loss": 0.0505,
"step": 383
},
{
"epoch": 0.48,
"grad_norm": 0.6099830269813538,
"learning_rate": 9.78899422928241e-06,
"loss": 0.036,
"step": 384
},
{
"epoch": 0.48,
"grad_norm": 1.2200748920440674,
"learning_rate": 9.787872405399059e-06,
"loss": 0.0557,
"step": 385
},
{
"epoch": 0.48,
"grad_norm": 1.0489903688430786,
"learning_rate": 9.786747671923003e-06,
"loss": 0.0719,
"step": 386
},
{
"epoch": 0.48,
"grad_norm": 1.578433871269226,
"learning_rate": 9.785620029537741e-06,
"loss": 0.03,
"step": 387
},
{
"epoch": 0.48,
"grad_norm": 0.9253179430961609,
"learning_rate": 9.784489478928545e-06,
"loss": 0.0527,
"step": 388
},
{
"epoch": 0.48,
"grad_norm": 0.7473218441009521,
"learning_rate": 9.783356020782448e-06,
"loss": 0.035,
"step": 389
},
{
"epoch": 0.48,
"grad_norm": 1.4502854347229004,
"learning_rate": 9.782219655788257e-06,
"loss": 0.0423,
"step": 390
},
{
"epoch": 0.48,
"grad_norm": 0.946733295917511,
"learning_rate": 9.781080384636539e-06,
"loss": 0.0413,
"step": 391
},
{
"epoch": 0.49,
"grad_norm": 1.4826123714447021,
"learning_rate": 9.77993820801963e-06,
"loss": 0.0414,
"step": 392
},
{
"epoch": 0.49,
"grad_norm": 2.0471692085266113,
"learning_rate": 9.778793126631632e-06,
"loss": 0.0466,
"step": 393
},
{
"epoch": 0.49,
"grad_norm": 1.7681257724761963,
"learning_rate": 9.777645141168411e-06,
"loss": 0.0504,
"step": 394
},
{
"epoch": 0.49,
"grad_norm": 0.7187155485153198,
"learning_rate": 9.776494252327597e-06,
"loss": 0.0447,
"step": 395
},
{
"epoch": 0.49,
"grad_norm": 0.7922236323356628,
"learning_rate": 9.775340460808589e-06,
"loss": 0.0313,
"step": 396
},
{
"epoch": 0.49,
"grad_norm": 2.724630117416382,
"learning_rate": 9.774183767312545e-06,
"loss": 0.0616,
"step": 397
},
{
"epoch": 0.49,
"grad_norm": 0.47513461112976074,
"learning_rate": 9.773024172542389e-06,
"loss": 0.0163,
"step": 398
},
{
"epoch": 0.49,
"grad_norm": 0.6144838333129883,
"learning_rate": 9.771861677202804e-06,
"loss": 0.0271,
"step": 399
},
{
"epoch": 0.5,
"grad_norm": 1.0170230865478516,
"learning_rate": 9.770696282000245e-06,
"loss": 0.0438,
"step": 400
},
{
"epoch": 0.5,
"grad_norm": 0.5385282635688782,
"learning_rate": 9.76952798764292e-06,
"loss": 0.0169,
"step": 401
},
{
"epoch": 0.5,
"grad_norm": 1.6152381896972656,
"learning_rate": 9.7683567948408e-06,
"loss": 0.068,
"step": 402
},
{
"epoch": 0.5,
"grad_norm": 0.9734664559364319,
"learning_rate": 9.767182704305625e-06,
"loss": 0.0681,
"step": 403
},
{
"epoch": 0.5,
"grad_norm": 1.7027530670166016,
"learning_rate": 9.766005716750884e-06,
"loss": 0.04,
"step": 404
},
{
"epoch": 0.5,
"grad_norm": 0.7407202124595642,
"learning_rate": 9.764825832891837e-06,
"loss": 0.033,
"step": 405
},
{
"epoch": 0.5,
"grad_norm": 0.8196337223052979,
"learning_rate": 9.7636430534455e-06,
"loss": 0.0451,
"step": 406
},
{
"epoch": 0.5,
"grad_norm": 2.600836753845215,
"learning_rate": 9.762457379130649e-06,
"loss": 0.075,
"step": 407
},
{
"epoch": 0.5,
"grad_norm": 1.4206620454788208,
"learning_rate": 9.761268810667817e-06,
"loss": 0.0255,
"step": 408
},
{
"epoch": 0.51,
"grad_norm": 0.9220699071884155,
"learning_rate": 9.760077348779298e-06,
"loss": 0.0564,
"step": 409
},
{
"epoch": 0.51,
"grad_norm": 0.6927193999290466,
"learning_rate": 9.758882994189145e-06,
"loss": 0.0375,
"step": 410
},
{
"epoch": 0.51,
"grad_norm": 0.9594948291778564,
"learning_rate": 9.757685747623169e-06,
"loss": 0.0523,
"step": 411
},
{
"epoch": 0.51,
"grad_norm": 1.9151678085327148,
"learning_rate": 9.756485609808934e-06,
"loss": 0.0634,
"step": 412
},
{
"epoch": 0.51,
"grad_norm": 1.0471961498260498,
"learning_rate": 9.755282581475769e-06,
"loss": 0.027,
"step": 413
},
{
"epoch": 0.51,
"grad_norm": 1.2358285188674927,
"learning_rate": 9.75407666335475e-06,
"loss": 0.0705,
"step": 414
},
{
"epoch": 0.51,
"grad_norm": 0.8452746272087097,
"learning_rate": 9.752867856178719e-06,
"loss": 0.0485,
"step": 415
},
{
"epoch": 0.51,
"grad_norm": 1.2570796012878418,
"learning_rate": 9.751656160682265e-06,
"loss": 0.0375,
"step": 416
},
{
"epoch": 0.52,
"grad_norm": 1.8666393756866455,
"learning_rate": 9.750441577601738e-06,
"loss": 0.0418,
"step": 417
},
{
"epoch": 0.52,
"grad_norm": 0.7684221267700195,
"learning_rate": 9.749224107675239e-06,
"loss": 0.0477,
"step": 418
},
{
"epoch": 0.52,
"grad_norm": 1.430303931236267,
"learning_rate": 9.748003751642628e-06,
"loss": 0.0389,
"step": 419
},
{
"epoch": 0.52,
"grad_norm": 4.4301066398620605,
"learning_rate": 9.746780510245512e-06,
"loss": 0.0868,
"step": 420
},
{
"epoch": 0.52,
"grad_norm": 2.655571699142456,
"learning_rate": 9.74555438422726e-06,
"loss": 0.0423,
"step": 421
},
{
"epoch": 0.52,
"grad_norm": 1.7431411743164062,
"learning_rate": 9.744325374332986e-06,
"loss": 0.0235,
"step": 422
},
{
"epoch": 0.52,
"grad_norm": 1.7228596210479736,
"learning_rate": 9.743093481309563e-06,
"loss": 0.0361,
"step": 423
},
{
"epoch": 0.52,
"grad_norm": 0.5912590026855469,
"learning_rate": 9.741858705905609e-06,
"loss": 0.0254,
"step": 424
},
{
"epoch": 0.53,
"grad_norm": 0.8103305101394653,
"learning_rate": 9.740621048871501e-06,
"loss": 0.0159,
"step": 425
},
{
"epoch": 0.53,
"grad_norm": 2.466233253479004,
"learning_rate": 9.739380510959365e-06,
"loss": 0.0803,
"step": 426
},
{
"epoch": 0.53,
"grad_norm": 0.5837281942367554,
"learning_rate": 9.738137092923072e-06,
"loss": 0.0293,
"step": 427
},
{
"epoch": 0.53,
"grad_norm": 1.528012990951538,
"learning_rate": 9.73689079551825e-06,
"loss": 0.0549,
"step": 428
},
{
"epoch": 0.53,
"grad_norm": 2.025675058364868,
"learning_rate": 9.735641619502277e-06,
"loss": 0.0663,
"step": 429
},
{
"epoch": 0.53,
"grad_norm": 1.34830641746521,
"learning_rate": 9.734389565634277e-06,
"loss": 0.0483,
"step": 430
},
{
"epoch": 0.53,
"grad_norm": 1.644051194190979,
"learning_rate": 9.73313463467512e-06,
"loss": 0.053,
"step": 431
},
{
"epoch": 0.53,
"grad_norm": 1.6768667697906494,
"learning_rate": 9.731876827387433e-06,
"loss": 0.0626,
"step": 432
},
{
"epoch": 0.54,
"grad_norm": 2.0125842094421387,
"learning_rate": 9.730616144535581e-06,
"loss": 0.0424,
"step": 433
},
{
"epoch": 0.54,
"grad_norm": 4.256353378295898,
"learning_rate": 9.729352586885687e-06,
"loss": 0.0734,
"step": 434
},
{
"epoch": 0.54,
"grad_norm": 3.4163427352905273,
"learning_rate": 9.728086155205614e-06,
"loss": 0.0544,
"step": 435
},
{
"epoch": 0.54,
"grad_norm": 2.842038154602051,
"learning_rate": 9.726816850264971e-06,
"loss": 0.0465,
"step": 436
},
{
"epoch": 0.54,
"grad_norm": 2.0849742889404297,
"learning_rate": 9.725544672835118e-06,
"loss": 0.0684,
"step": 437
},
{
"epoch": 0.54,
"grad_norm": 0.677302360534668,
"learning_rate": 9.724269623689158e-06,
"loss": 0.0284,
"step": 438
},
{
"epoch": 0.54,
"grad_norm": 1.040449619293213,
"learning_rate": 9.722991703601936e-06,
"loss": 0.0384,
"step": 439
},
{
"epoch": 0.54,
"grad_norm": 0.6753067374229431,
"learning_rate": 9.721710913350048e-06,
"loss": 0.0436,
"step": 440
},
{
"epoch": 0.55,
"grad_norm": 2.006178617477417,
"learning_rate": 9.720427253711831e-06,
"loss": 0.046,
"step": 441
},
{
"epoch": 0.55,
"grad_norm": 1.1364405155181885,
"learning_rate": 9.719140725467362e-06,
"loss": 0.0512,
"step": 442
},
{
"epoch": 0.55,
"grad_norm": 0.7395780086517334,
"learning_rate": 9.717851329398469e-06,
"loss": 0.0239,
"step": 443
},
{
"epoch": 0.55,
"grad_norm": 1.4531809091567993,
"learning_rate": 9.716559066288716e-06,
"loss": 0.0505,
"step": 444
},
{
"epoch": 0.55,
"grad_norm": 0.9090608954429626,
"learning_rate": 9.715263936923413e-06,
"loss": 0.0272,
"step": 445
},
{
"epoch": 0.55,
"grad_norm": 0.9618948698043823,
"learning_rate": 9.713965942089612e-06,
"loss": 0.0491,
"step": 446
},
{
"epoch": 0.55,
"grad_norm": 0.5173948407173157,
"learning_rate": 9.712665082576104e-06,
"loss": 0.0264,
"step": 447
},
{
"epoch": 0.55,
"grad_norm": 0.5747056603431702,
"learning_rate": 9.711361359173422e-06,
"loss": 0.0231,
"step": 448
},
{
"epoch": 0.56,
"grad_norm": 1.7778929471969604,
"learning_rate": 9.710054772673839e-06,
"loss": 0.0492,
"step": 449
},
{
"epoch": 0.56,
"grad_norm": 2.290955066680908,
"learning_rate": 9.708745323871369e-06,
"loss": 0.0465,
"step": 450
},
{
"epoch": 0.56,
"grad_norm": 1.1455390453338623,
"learning_rate": 9.707433013561765e-06,
"loss": 0.0625,
"step": 451
},
{
"epoch": 0.56,
"grad_norm": 2.4170002937316895,
"learning_rate": 9.706117842542517e-06,
"loss": 0.0761,
"step": 452
},
{
"epoch": 0.56,
"grad_norm": 1.6311193704605103,
"learning_rate": 9.704799811612858e-06,
"loss": 0.0736,
"step": 453
},
{
"epoch": 0.56,
"grad_norm": 1.4031122922897339,
"learning_rate": 9.703478921573753e-06,
"loss": 0.0362,
"step": 454
},
{
"epoch": 0.56,
"grad_norm": 1.10888671875,
"learning_rate": 9.702155173227911e-06,
"loss": 0.0468,
"step": 455
},
{
"epoch": 0.56,
"grad_norm": 2.612172842025757,
"learning_rate": 9.700828567379772e-06,
"loss": 0.0709,
"step": 456
},
{
"epoch": 0.57,
"grad_norm": 1.2346030473709106,
"learning_rate": 9.699499104835514e-06,
"loss": 0.0587,
"step": 457
},
{
"epoch": 0.57,
"grad_norm": 1.7313090562820435,
"learning_rate": 9.698166786403057e-06,
"loss": 0.0372,
"step": 458
},
{
"epoch": 0.57,
"grad_norm": 1.303956389427185,
"learning_rate": 9.696831612892048e-06,
"loss": 0.0415,
"step": 459
},
{
"epoch": 0.57,
"grad_norm": 0.4627138674259186,
"learning_rate": 9.695493585113873e-06,
"loss": 0.0276,
"step": 460
},
{
"epoch": 0.57,
"grad_norm": 0.7128018140792847,
"learning_rate": 9.694152703881653e-06,
"loss": 0.0265,
"step": 461
},
{
"epoch": 0.57,
"grad_norm": 0.8362938165664673,
"learning_rate": 9.69280897001024e-06,
"loss": 0.0597,
"step": 462
},
{
"epoch": 0.57,
"grad_norm": 0.9412689208984375,
"learning_rate": 9.691462384316226e-06,
"loss": 0.062,
"step": 463
},
{
"epoch": 0.57,
"grad_norm": 1.3194217681884766,
"learning_rate": 9.690112947617929e-06,
"loss": 0.0526,
"step": 464
},
{
"epoch": 0.58,
"grad_norm": 1.3153883218765259,
"learning_rate": 9.688760660735403e-06,
"loss": 0.0497,
"step": 465
},
{
"epoch": 0.58,
"grad_norm": 1.290602684020996,
"learning_rate": 9.687405524490433e-06,
"loss": 0.0277,
"step": 466
},
{
"epoch": 0.58,
"grad_norm": 0.6527288556098938,
"learning_rate": 9.686047539706536e-06,
"loss": 0.0353,
"step": 467
},
{
"epoch": 0.58,
"grad_norm": 1.1408582925796509,
"learning_rate": 9.684686707208962e-06,
"loss": 0.0407,
"step": 468
},
{
"epoch": 0.58,
"grad_norm": 0.5641573071479797,
"learning_rate": 9.683323027824687e-06,
"loss": 0.0311,
"step": 469
},
{
"epoch": 0.58,
"grad_norm": 0.8712812066078186,
"learning_rate": 9.681956502382423e-06,
"loss": 0.0484,
"step": 470
},
{
"epoch": 0.58,
"grad_norm": 1.6026149988174438,
"learning_rate": 9.680587131712605e-06,
"loss": 0.0697,
"step": 471
},
{
"epoch": 0.58,
"grad_norm": 0.7954007983207703,
"learning_rate": 9.6792149166474e-06,
"loss": 0.0621,
"step": 472
},
{
"epoch": 0.59,
"grad_norm": 1.8472158908843994,
"learning_rate": 9.677839858020709e-06,
"loss": 0.0437,
"step": 473
},
{
"epoch": 0.59,
"grad_norm": 0.9168758988380432,
"learning_rate": 9.676461956668148e-06,
"loss": 0.0535,
"step": 474
},
{
"epoch": 0.59,
"grad_norm": 1.1088653802871704,
"learning_rate": 9.675081213427076e-06,
"loss": 0.038,
"step": 475
},
{
"epoch": 0.59,
"grad_norm": 0.6966286301612854,
"learning_rate": 9.673697629136566e-06,
"loss": 0.0304,
"step": 476
},
{
"epoch": 0.59,
"grad_norm": 1.734716534614563,
"learning_rate": 9.672311204637426e-06,
"loss": 0.0705,
"step": 477
},
{
"epoch": 0.59,
"grad_norm": 0.8543561697006226,
"learning_rate": 9.670921940772186e-06,
"loss": 0.0585,
"step": 478
},
{
"epoch": 0.59,
"grad_norm": 0.6839298605918884,
"learning_rate": 9.669529838385102e-06,
"loss": 0.0381,
"step": 479
},
{
"epoch": 0.59,
"grad_norm": 0.794438362121582,
"learning_rate": 9.668134898322157e-06,
"loss": 0.0485,
"step": 480
},
{
"epoch": 0.6,
"grad_norm": 0.585090696811676,
"learning_rate": 9.666737121431055e-06,
"loss": 0.0295,
"step": 481
},
{
"epoch": 0.6,
"grad_norm": 1.14494788646698,
"learning_rate": 9.665336508561225e-06,
"loss": 0.0248,
"step": 482
},
{
"epoch": 0.6,
"grad_norm": 0.7456786632537842,
"learning_rate": 9.663933060563824e-06,
"loss": 0.0384,
"step": 483
},
{
"epoch": 0.6,
"grad_norm": 1.0646755695343018,
"learning_rate": 9.662526778291725e-06,
"loss": 0.056,
"step": 484
},
{
"epoch": 0.6,
"grad_norm": 0.6966055631637573,
"learning_rate": 9.661117662599527e-06,
"loss": 0.0279,
"step": 485
},
{
"epoch": 0.6,
"grad_norm": 0.8128595948219299,
"learning_rate": 9.659705714343551e-06,
"loss": 0.0421,
"step": 486
},
{
"epoch": 0.6,
"grad_norm": 1.1546441316604614,
"learning_rate": 9.658290934381837e-06,
"loss": 0.0527,
"step": 487
},
{
"epoch": 0.6,
"grad_norm": 0.7882161736488342,
"learning_rate": 9.656873323574152e-06,
"loss": 0.041,
"step": 488
},
{
"epoch": 0.61,
"grad_norm": 0.9414128065109253,
"learning_rate": 9.655452882781972e-06,
"loss": 0.0198,
"step": 489
},
{
"epoch": 0.61,
"grad_norm": 1.0596210956573486,
"learning_rate": 9.654029612868507e-06,
"loss": 0.0606,
"step": 490
},
{
"epoch": 0.61,
"grad_norm": 0.676780641078949,
"learning_rate": 9.652603514698674e-06,
"loss": 0.0232,
"step": 491
},
{
"epoch": 0.61,
"grad_norm": 0.8404201865196228,
"learning_rate": 9.651174589139115e-06,
"loss": 0.0314,
"step": 492
},
{
"epoch": 0.61,
"grad_norm": 0.47275248169898987,
"learning_rate": 9.649742837058189e-06,
"loss": 0.0169,
"step": 493
},
{
"epoch": 0.61,
"grad_norm": 3.815514087677002,
"learning_rate": 9.648308259325973e-06,
"loss": 0.0986,
"step": 494
},
{
"epoch": 0.61,
"grad_norm": 1.271995186805725,
"learning_rate": 9.646870856814259e-06,
"loss": 0.0271,
"step": 495
},
{
"epoch": 0.61,
"grad_norm": 0.6948990821838379,
"learning_rate": 9.64543063039656e-06,
"loss": 0.0224,
"step": 496
},
{
"epoch": 0.62,
"grad_norm": 1.3301115036010742,
"learning_rate": 9.6439875809481e-06,
"loss": 0.0375,
"step": 497
},
{
"epoch": 0.62,
"grad_norm": 0.6250678896903992,
"learning_rate": 9.64254170934582e-06,
"loss": 0.0184,
"step": 498
},
{
"epoch": 0.62,
"grad_norm": 0.9256348609924316,
"learning_rate": 9.641093016468381e-06,
"loss": 0.0375,
"step": 499
},
{
"epoch": 0.62,
"grad_norm": 1.3027982711791992,
"learning_rate": 9.639641503196152e-06,
"loss": 0.0276,
"step": 500
},
{
"epoch": 0.62,
"grad_norm": 2.560512065887451,
"learning_rate": 9.638187170411218e-06,
"loss": 0.0482,
"step": 501
},
{
"epoch": 0.62,
"grad_norm": 1.6088508367538452,
"learning_rate": 9.63673001899738e-06,
"loss": 0.0436,
"step": 502
},
{
"epoch": 0.62,
"grad_norm": 1.439906358718872,
"learning_rate": 9.635270049840146e-06,
"loss": 0.0772,
"step": 503
},
{
"epoch": 0.62,
"grad_norm": 1.1696199178695679,
"learning_rate": 9.633807263826745e-06,
"loss": 0.0388,
"step": 504
},
{
"epoch": 0.62,
"grad_norm": 1.6363476514816284,
"learning_rate": 9.632341661846107e-06,
"loss": 0.0592,
"step": 505
},
{
"epoch": 0.63,
"grad_norm": 3.1684820652008057,
"learning_rate": 9.630873244788884e-06,
"loss": 0.0696,
"step": 506
},
{
"epoch": 0.63,
"grad_norm": 2.787458658218384,
"learning_rate": 9.629402013547432e-06,
"loss": 0.0842,
"step": 507
},
{
"epoch": 0.63,
"grad_norm": 0.8504316806793213,
"learning_rate": 9.627927969015817e-06,
"loss": 0.0413,
"step": 508
},
{
"epoch": 0.63,
"grad_norm": 0.9233881235122681,
"learning_rate": 9.62645111208982e-06,
"loss": 0.0315,
"step": 509
},
{
"epoch": 0.63,
"grad_norm": 1.571606159210205,
"learning_rate": 9.62497144366693e-06,
"loss": 0.0716,
"step": 510
},
{
"epoch": 0.63,
"grad_norm": 2.602965831756592,
"learning_rate": 9.623488964646334e-06,
"loss": 0.0526,
"step": 511
},
{
"epoch": 0.63,
"grad_norm": 1.687855839729309,
"learning_rate": 9.622003675928943e-06,
"loss": 0.0517,
"step": 512
},
{
"epoch": 0.63,
"grad_norm": 1.535513162612915,
"learning_rate": 9.620515578417364e-06,
"loss": 0.0368,
"step": 513
},
{
"epoch": 0.64,
"grad_norm": 0.5331669449806213,
"learning_rate": 9.619024673015916e-06,
"loss": 0.0273,
"step": 514
},
{
"epoch": 0.64,
"grad_norm": 0.7347199320793152,
"learning_rate": 9.617530960630624e-06,
"loss": 0.022,
"step": 515
},
{
"epoch": 0.64,
"grad_norm": 1.8210560083389282,
"learning_rate": 9.616034442169214e-06,
"loss": 0.0625,
"step": 516
},
{
"epoch": 0.64,
"grad_norm": 1.0366301536560059,
"learning_rate": 9.614535118541126e-06,
"loss": 0.0409,
"step": 517
},
{
"epoch": 0.64,
"grad_norm": 0.8622118234634399,
"learning_rate": 9.613032990657495e-06,
"loss": 0.0529,
"step": 518
},
{
"epoch": 0.64,
"grad_norm": 1.1612430810928345,
"learning_rate": 9.61152805943117e-06,
"loss": 0.0298,
"step": 519
},
{
"epoch": 0.64,
"grad_norm": 0.6844496726989746,
"learning_rate": 9.610020325776694e-06,
"loss": 0.0306,
"step": 520
},
{
"epoch": 0.64,
"grad_norm": 0.7687200307846069,
"learning_rate": 9.608509790610322e-06,
"loss": 0.0416,
"step": 521
},
{
"epoch": 0.65,
"grad_norm": 0.7224605083465576,
"learning_rate": 9.606996454850002e-06,
"loss": 0.036,
"step": 522
},
{
"epoch": 0.65,
"grad_norm": 0.6508851051330566,
"learning_rate": 9.605480319415391e-06,
"loss": 0.0368,
"step": 523
},
{
"epoch": 0.65,
"grad_norm": 1.3081005811691284,
"learning_rate": 9.603961385227848e-06,
"loss": 0.0284,
"step": 524
},
{
"epoch": 0.65,
"grad_norm": 0.5530818700790405,
"learning_rate": 9.602439653210426e-06,
"loss": 0.0273,
"step": 525
},
{
"epoch": 0.65,
"grad_norm": 0.5170778036117554,
"learning_rate": 9.600915124287886e-06,
"loss": 0.0181,
"step": 526
},
{
"epoch": 0.65,
"grad_norm": 0.5652095079421997,
"learning_rate": 9.599387799386684e-06,
"loss": 0.0213,
"step": 527
},
{
"epoch": 0.65,
"grad_norm": 1.0414352416992188,
"learning_rate": 9.597857679434974e-06,
"loss": 0.0389,
"step": 528
},
{
"epoch": 0.65,
"grad_norm": 0.6755688786506653,
"learning_rate": 9.596324765362614e-06,
"loss": 0.0343,
"step": 529
},
{
"epoch": 0.66,
"grad_norm": 1.5740824937820435,
"learning_rate": 9.594789058101154e-06,
"loss": 0.0562,
"step": 530
},
{
"epoch": 0.66,
"grad_norm": 1.410057544708252,
"learning_rate": 9.593250558583846e-06,
"loss": 0.0394,
"step": 531
},
{
"epoch": 0.66,
"grad_norm": 1.4377081394195557,
"learning_rate": 9.591709267745635e-06,
"loss": 0.0255,
"step": 532
},
{
"epoch": 0.66,
"grad_norm": 0.9751909971237183,
"learning_rate": 9.590165186523166e-06,
"loss": 0.0395,
"step": 533
},
{
"epoch": 0.66,
"grad_norm": 0.8450660109519958,
"learning_rate": 9.588618315854779e-06,
"loss": 0.0331,
"step": 534
},
{
"epoch": 0.66,
"grad_norm": 1.8118575811386108,
"learning_rate": 9.587068656680506e-06,
"loss": 0.0346,
"step": 535
},
{
"epoch": 0.66,
"grad_norm": 0.7216983437538147,
"learning_rate": 9.585516209942077e-06,
"loss": 0.0242,
"step": 536
},
{
"epoch": 0.66,
"grad_norm": 1.0194247961044312,
"learning_rate": 9.583960976582914e-06,
"loss": 0.0478,
"step": 537
},
{
"epoch": 0.67,
"grad_norm": 1.1861456632614136,
"learning_rate": 9.582402957548132e-06,
"loss": 0.0224,
"step": 538
},
{
"epoch": 0.67,
"grad_norm": 0.8888005614280701,
"learning_rate": 9.580842153784542e-06,
"loss": 0.0393,
"step": 539
},
{
"epoch": 0.67,
"grad_norm": 1.0420960187911987,
"learning_rate": 9.579278566240646e-06,
"loss": 0.035,
"step": 540
},
{
"epoch": 0.67,
"grad_norm": 0.7932503819465637,
"learning_rate": 9.577712195866634e-06,
"loss": 0.0361,
"step": 541
},
{
"epoch": 0.67,
"grad_norm": 2.295933246612549,
"learning_rate": 9.576143043614393e-06,
"loss": 0.0798,
"step": 542
},
{
"epoch": 0.67,
"grad_norm": 0.795536458492279,
"learning_rate": 9.574571110437496e-06,
"loss": 0.034,
"step": 543
},
{
"epoch": 0.67,
"grad_norm": 1.269714117050171,
"learning_rate": 9.572996397291209e-06,
"loss": 0.0308,
"step": 544
},
{
"epoch": 0.67,
"grad_norm": 0.7194578051567078,
"learning_rate": 9.571418905132486e-06,
"loss": 0.0303,
"step": 545
},
{
"epoch": 0.68,
"grad_norm": 0.9299863576889038,
"learning_rate": 9.569838634919968e-06,
"loss": 0.0549,
"step": 546
},
{
"epoch": 0.68,
"grad_norm": 1.1913076639175415,
"learning_rate": 9.568255587613986e-06,
"loss": 0.0419,
"step": 547
},
{
"epoch": 0.68,
"grad_norm": 0.6721378564834595,
"learning_rate": 9.566669764176562e-06,
"loss": 0.0227,
"step": 548
},
{
"epoch": 0.68,
"grad_norm": 0.9450292587280273,
"learning_rate": 9.5650811655714e-06,
"loss": 0.0272,
"step": 549
},
{
"epoch": 0.68,
"grad_norm": 1.6691453456878662,
"learning_rate": 9.56348979276389e-06,
"loss": 0.0506,
"step": 550
},
{
"epoch": 0.68,
"grad_norm": 1.0706772804260254,
"learning_rate": 9.561895646721113e-06,
"loss": 0.0438,
"step": 551
},
{
"epoch": 0.68,
"grad_norm": 1.0017832517623901,
"learning_rate": 9.560298728411833e-06,
"loss": 0.0604,
"step": 552
},
{
"epoch": 0.68,
"grad_norm": 1.9847087860107422,
"learning_rate": 9.558699038806494e-06,
"loss": 0.0827,
"step": 553
},
{
"epoch": 0.69,
"grad_norm": 1.05272376537323,
"learning_rate": 9.557096578877232e-06,
"loss": 0.0315,
"step": 554
},
{
"epoch": 0.69,
"grad_norm": 1.6529170274734497,
"learning_rate": 9.555491349597862e-06,
"loss": 0.0438,
"step": 555
},
{
"epoch": 0.69,
"grad_norm": 1.5359541177749634,
"learning_rate": 9.553883351943882e-06,
"loss": 0.0453,
"step": 556
},
{
"epoch": 0.69,
"grad_norm": 0.7716813087463379,
"learning_rate": 9.552272586892475e-06,
"loss": 0.0395,
"step": 557
},
{
"epoch": 0.69,
"grad_norm": 1.0042527914047241,
"learning_rate": 9.550659055422502e-06,
"loss": 0.0524,
"step": 558
},
{
"epoch": 0.69,
"grad_norm": 0.9220654368400574,
"learning_rate": 9.549042758514505e-06,
"loss": 0.052,
"step": 559
},
{
"epoch": 0.69,
"grad_norm": 1.202533483505249,
"learning_rate": 9.547423697150714e-06,
"loss": 0.0315,
"step": 560
},
{
"epoch": 0.69,
"grad_norm": 1.441113829612732,
"learning_rate": 9.545801872315028e-06,
"loss": 0.0406,
"step": 561
},
{
"epoch": 0.7,
"grad_norm": 1.1032451391220093,
"learning_rate": 9.544177284993035e-06,
"loss": 0.0562,
"step": 562
},
{
"epoch": 0.7,
"grad_norm": 0.613166332244873,
"learning_rate": 9.542549936171994e-06,
"loss": 0.0264,
"step": 563
},
{
"epoch": 0.7,
"grad_norm": 0.6434498429298401,
"learning_rate": 9.540919826840848e-06,
"loss": 0.0326,
"step": 564
},
{
"epoch": 0.7,
"grad_norm": 0.4755064845085144,
"learning_rate": 9.539286957990215e-06,
"loss": 0.0271,
"step": 565
},
{
"epoch": 0.7,
"grad_norm": 0.6659818887710571,
"learning_rate": 9.53765133061239e-06,
"loss": 0.0493,
"step": 566
},
{
"epoch": 0.7,
"grad_norm": 0.9639627933502197,
"learning_rate": 9.536012945701345e-06,
"loss": 0.0384,
"step": 567
},
{
"epoch": 0.7,
"grad_norm": 0.8150410056114197,
"learning_rate": 9.534371804252727e-06,
"loss": 0.0306,
"step": 568
},
{
"epoch": 0.7,
"grad_norm": 1.4704219102859497,
"learning_rate": 9.532727907263861e-06,
"loss": 0.0563,
"step": 569
},
{
"epoch": 0.71,
"grad_norm": 0.6380606889724731,
"learning_rate": 9.53108125573374e-06,
"loss": 0.0183,
"step": 570
},
{
"epoch": 0.71,
"grad_norm": 0.7984311580657959,
"learning_rate": 9.529431850663036e-06,
"loss": 0.0469,
"step": 571
},
{
"epoch": 0.71,
"grad_norm": 0.8775026798248291,
"learning_rate": 9.527779693054095e-06,
"loss": 0.0285,
"step": 572
},
{
"epoch": 0.71,
"grad_norm": 0.5551888346672058,
"learning_rate": 9.526124783910935e-06,
"loss": 0.0322,
"step": 573
},
{
"epoch": 0.71,
"grad_norm": 1.0795842409133911,
"learning_rate": 9.524467124239243e-06,
"loss": 0.0478,
"step": 574
},
{
"epoch": 0.71,
"grad_norm": 1.2850500345230103,
"learning_rate": 9.52280671504638e-06,
"loss": 0.0223,
"step": 575
},
{
"epoch": 0.71,
"grad_norm": 0.5365849733352661,
"learning_rate": 9.521143557341378e-06,
"loss": 0.0285,
"step": 576
},
{
"epoch": 0.71,
"grad_norm": 0.7505818605422974,
"learning_rate": 9.519477652134938e-06,
"loss": 0.0301,
"step": 577
},
{
"epoch": 0.72,
"grad_norm": 0.4962819516658783,
"learning_rate": 9.517809000439432e-06,
"loss": 0.0299,
"step": 578
},
{
"epoch": 0.72,
"grad_norm": 1.9355813264846802,
"learning_rate": 9.516137603268903e-06,
"loss": 0.0715,
"step": 579
},
{
"epoch": 0.72,
"grad_norm": 1.3954781293869019,
"learning_rate": 9.514463461639055e-06,
"loss": 0.0512,
"step": 580
},
{
"epoch": 0.72,
"grad_norm": 1.0368856191635132,
"learning_rate": 9.51278657656727e-06,
"loss": 0.0445,
"step": 581
},
{
"epoch": 0.72,
"grad_norm": 0.7911268472671509,
"learning_rate": 9.511106949072588e-06,
"loss": 0.0475,
"step": 582
},
{
"epoch": 0.72,
"grad_norm": 1.1066776514053345,
"learning_rate": 9.509424580175724e-06,
"loss": 0.049,
"step": 583
},
{
"epoch": 0.72,
"grad_norm": 1.1990307569503784,
"learning_rate": 9.507739470899048e-06,
"loss": 0.0574,
"step": 584
},
{
"epoch": 0.72,
"grad_norm": 1.1048943996429443,
"learning_rate": 9.506051622266608e-06,
"loss": 0.08,
"step": 585
},
{
"epoch": 0.73,
"grad_norm": 0.8120594024658203,
"learning_rate": 9.504361035304106e-06,
"loss": 0.0443,
"step": 586
},
{
"epoch": 0.73,
"grad_norm": 0.6603597402572632,
"learning_rate": 9.502667711038917e-06,
"loss": 0.0366,
"step": 587
},
{
"epoch": 0.73,
"grad_norm": 2.3819870948791504,
"learning_rate": 9.500971650500072e-06,
"loss": 0.0692,
"step": 588
},
{
"epoch": 0.73,
"grad_norm": 1.7831990718841553,
"learning_rate": 9.499272854718268e-06,
"loss": 0.0506,
"step": 589
},
{
"epoch": 0.73,
"grad_norm": 1.1036359071731567,
"learning_rate": 9.497571324725865e-06,
"loss": 0.0435,
"step": 590
},
{
"epoch": 0.73,
"grad_norm": 1.2589616775512695,
"learning_rate": 9.495867061556884e-06,
"loss": 0.0412,
"step": 591
},
{
"epoch": 0.73,
"grad_norm": 0.78188556432724,
"learning_rate": 9.494160066247006e-06,
"loss": 0.0534,
"step": 592
},
{
"epoch": 0.73,
"grad_norm": 0.7451815605163574,
"learning_rate": 9.492450339833573e-06,
"loss": 0.0287,
"step": 593
},
{
"epoch": 0.74,
"grad_norm": 1.3252469301223755,
"learning_rate": 9.490737883355587e-06,
"loss": 0.0334,
"step": 594
},
{
"epoch": 0.74,
"grad_norm": 0.8932815194129944,
"learning_rate": 9.48902269785371e-06,
"loss": 0.036,
"step": 595
},
{
"epoch": 0.74,
"grad_norm": 1.6676141023635864,
"learning_rate": 9.487304784370257e-06,
"loss": 0.0538,
"step": 596
},
{
"epoch": 0.74,
"grad_norm": 0.9928424954414368,
"learning_rate": 9.48558414394921e-06,
"loss": 0.0558,
"step": 597
},
{
"epoch": 0.74,
"grad_norm": 1.130738377571106,
"learning_rate": 9.4838607776362e-06,
"loss": 0.0454,
"step": 598
},
{
"epoch": 0.74,
"grad_norm": 0.8108890056610107,
"learning_rate": 9.48213468647852e-06,
"loss": 0.0265,
"step": 599
},
{
"epoch": 0.74,
"grad_norm": 1.0491758584976196,
"learning_rate": 9.480405871525114e-06,
"loss": 0.0518,
"step": 600
},
{
"epoch": 0.74,
"grad_norm": 1.0204825401306152,
"learning_rate": 9.478674333826586e-06,
"loss": 0.0339,
"step": 601
},
{
"epoch": 0.75,
"grad_norm": 1.026297926902771,
"learning_rate": 9.476940074435189e-06,
"loss": 0.0508,
"step": 602
},
{
"epoch": 0.75,
"grad_norm": 1.4111378192901611,
"learning_rate": 9.475203094404836e-06,
"loss": 0.0553,
"step": 603
},
{
"epoch": 0.75,
"grad_norm": 0.8152147531509399,
"learning_rate": 9.473463394791093e-06,
"loss": 0.0512,
"step": 604
},
{
"epoch": 0.75,
"grad_norm": 0.5428625345230103,
"learning_rate": 9.471720976651173e-06,
"loss": 0.0274,
"step": 605
},
{
"epoch": 0.75,
"grad_norm": 0.789997398853302,
"learning_rate": 9.469975841043946e-06,
"loss": 0.0456,
"step": 606
},
{
"epoch": 0.75,
"grad_norm": 2.5263166427612305,
"learning_rate": 9.468227989029929e-06,
"loss": 0.0912,
"step": 607
},
{
"epoch": 0.75,
"grad_norm": 0.9473277926445007,
"learning_rate": 9.466477421671296e-06,
"loss": 0.0445,
"step": 608
},
{
"epoch": 0.75,
"grad_norm": 0.9322047829627991,
"learning_rate": 9.464724140031866e-06,
"loss": 0.0473,
"step": 609
},
{
"epoch": 0.75,
"grad_norm": 1.0073190927505493,
"learning_rate": 9.462968145177112e-06,
"loss": 0.0506,
"step": 610
},
{
"epoch": 0.76,
"grad_norm": 0.5902945399284363,
"learning_rate": 9.461209438174148e-06,
"loss": 0.0391,
"step": 611
},
{
"epoch": 0.76,
"grad_norm": 2.0115785598754883,
"learning_rate": 9.459448020091746e-06,
"loss": 0.0614,
"step": 612
},
{
"epoch": 0.76,
"grad_norm": 1.8103097677230835,
"learning_rate": 9.457683892000318e-06,
"loss": 0.0481,
"step": 613
},
{
"epoch": 0.76,
"grad_norm": 0.718271017074585,
"learning_rate": 9.455917054971929e-06,
"loss": 0.0277,
"step": 614
},
{
"epoch": 0.76,
"grad_norm": 0.948197066783905,
"learning_rate": 9.45414751008028e-06,
"loss": 0.0424,
"step": 615
},
{
"epoch": 0.76,
"grad_norm": 1.613114356994629,
"learning_rate": 9.452375258400732e-06,
"loss": 0.0444,
"step": 616
},
{
"epoch": 0.76,
"grad_norm": 0.5611456632614136,
"learning_rate": 9.450600301010279e-06,
"loss": 0.0278,
"step": 617
},
{
"epoch": 0.76,
"grad_norm": 1.0461411476135254,
"learning_rate": 9.448822638987564e-06,
"loss": 0.062,
"step": 618
},
{
"epoch": 0.77,
"grad_norm": 1.203861951828003,
"learning_rate": 9.447042273412873e-06,
"loss": 0.0335,
"step": 619
},
{
"epoch": 0.77,
"grad_norm": 1.0347965955734253,
"learning_rate": 9.445259205368138e-06,
"loss": 0.0499,
"step": 620
},
{
"epoch": 0.77,
"grad_norm": 1.2198740243911743,
"learning_rate": 9.44347343593693e-06,
"loss": 0.0441,
"step": 621
},
{
"epoch": 0.77,
"grad_norm": 0.7504235506057739,
"learning_rate": 9.441684966204456e-06,
"loss": 0.0483,
"step": 622
},
{
"epoch": 0.77,
"grad_norm": 0.7221031188964844,
"learning_rate": 9.439893797257578e-06,
"loss": 0.0369,
"step": 623
},
{
"epoch": 0.77,
"grad_norm": 1.0137180089950562,
"learning_rate": 9.438099930184783e-06,
"loss": 0.0242,
"step": 624
},
{
"epoch": 0.77,
"grad_norm": 0.7642596364021301,
"learning_rate": 9.436303366076213e-06,
"loss": 0.0476,
"step": 625
},
{
"epoch": 0.77,
"grad_norm": 1.0482991933822632,
"learning_rate": 9.434504106023634e-06,
"loss": 0.0717,
"step": 626
},
{
"epoch": 0.78,
"grad_norm": 0.7821680903434753,
"learning_rate": 9.432702151120464e-06,
"loss": 0.0395,
"step": 627
},
{
"epoch": 0.78,
"grad_norm": 0.8012223839759827,
"learning_rate": 9.430897502461745e-06,
"loss": 0.0501,
"step": 628
},
{
"epoch": 0.78,
"grad_norm": 0.960848867893219,
"learning_rate": 9.429090161144166e-06,
"loss": 0.0194,
"step": 629
},
{
"epoch": 0.78,
"grad_norm": 0.9573109745979309,
"learning_rate": 9.427280128266049e-06,
"loss": 0.0485,
"step": 630
},
{
"epoch": 0.78,
"grad_norm": 0.6235270500183105,
"learning_rate": 9.425467404927356e-06,
"loss": 0.0354,
"step": 631
},
{
"epoch": 0.78,
"grad_norm": 1.024781346321106,
"learning_rate": 9.423651992229673e-06,
"loss": 0.0356,
"step": 632
},
{
"epoch": 0.78,
"grad_norm": 0.7387573719024658,
"learning_rate": 9.421833891276233e-06,
"loss": 0.0576,
"step": 633
},
{
"epoch": 0.78,
"grad_norm": 0.5336031913757324,
"learning_rate": 9.420013103171893e-06,
"loss": 0.0387,
"step": 634
},
{
"epoch": 0.79,
"grad_norm": 1.2542508840560913,
"learning_rate": 9.418189629023149e-06,
"loss": 0.0415,
"step": 635
},
{
"epoch": 0.79,
"grad_norm": 1.6477981805801392,
"learning_rate": 9.416363469938128e-06,
"loss": 0.0725,
"step": 636
},
{
"epoch": 0.79,
"grad_norm": 0.7093968391418457,
"learning_rate": 9.414534627026586e-06,
"loss": 0.0361,
"step": 637
},
{
"epoch": 0.79,
"grad_norm": 0.8406978845596313,
"learning_rate": 9.412703101399912e-06,
"loss": 0.0248,
"step": 638
},
{
"epoch": 0.79,
"grad_norm": 0.7647954821586609,
"learning_rate": 9.410868894171126e-06,
"loss": 0.0734,
"step": 639
},
{
"epoch": 0.79,
"grad_norm": 0.5869340300559998,
"learning_rate": 9.409032006454877e-06,
"loss": 0.0322,
"step": 640
},
{
"epoch": 0.79,
"grad_norm": 0.6841743588447571,
"learning_rate": 9.407192439367443e-06,
"loss": 0.0217,
"step": 641
},
{
"epoch": 0.79,
"grad_norm": 1.1286256313323975,
"learning_rate": 9.405350194026728e-06,
"loss": 0.0432,
"step": 642
},
{
"epoch": 0.8,
"grad_norm": 1.9575207233428955,
"learning_rate": 9.403505271552267e-06,
"loss": 0.0623,
"step": 643
},
{
"epoch": 0.8,
"grad_norm": 2.1534059047698975,
"learning_rate": 9.401657673065218e-06,
"loss": 0.0682,
"step": 644
},
{
"epoch": 0.8,
"grad_norm": 0.6419281959533691,
"learning_rate": 9.399807399688371e-06,
"loss": 0.0271,
"step": 645
},
{
"epoch": 0.8,
"grad_norm": 0.8669396638870239,
"learning_rate": 9.397954452546139e-06,
"loss": 0.0438,
"step": 646
},
{
"epoch": 0.8,
"grad_norm": 1.168561339378357,
"learning_rate": 9.396098832764555e-06,
"loss": 0.0456,
"step": 647
},
{
"epoch": 0.8,
"grad_norm": 1.2432861328125,
"learning_rate": 9.394240541471282e-06,
"loss": 0.0666,
"step": 648
},
{
"epoch": 0.8,
"grad_norm": 1.9158250093460083,
"learning_rate": 9.392379579795605e-06,
"loss": 0.0452,
"step": 649
},
{
"epoch": 0.8,
"grad_norm": 1.2606102228164673,
"learning_rate": 9.39051594886843e-06,
"loss": 0.0288,
"step": 650
},
{
"epoch": 0.81,
"grad_norm": 1.0844234228134155,
"learning_rate": 9.388649649822289e-06,
"loss": 0.0374,
"step": 651
},
{
"epoch": 0.81,
"grad_norm": 1.0901192426681519,
"learning_rate": 9.386780683791331e-06,
"loss": 0.0498,
"step": 652
},
{
"epoch": 0.81,
"grad_norm": 1.03596830368042,
"learning_rate": 9.384909051911329e-06,
"loss": 0.0544,
"step": 653
},
{
"epoch": 0.81,
"grad_norm": 0.7338258028030396,
"learning_rate": 9.383034755319673e-06,
"loss": 0.0389,
"step": 654
},
{
"epoch": 0.81,
"grad_norm": 1.973031759262085,
"learning_rate": 9.381157795155374e-06,
"loss": 0.0534,
"step": 655
},
{
"epoch": 0.81,
"grad_norm": 0.6111584305763245,
"learning_rate": 9.379278172559065e-06,
"loss": 0.0279,
"step": 656
},
{
"epoch": 0.81,
"grad_norm": 0.7228569388389587,
"learning_rate": 9.37739588867299e-06,
"loss": 0.0397,
"step": 657
},
{
"epoch": 0.81,
"grad_norm": 1.4140815734863281,
"learning_rate": 9.375510944641017e-06,
"loss": 0.0476,
"step": 658
},
{
"epoch": 0.82,
"grad_norm": 1.1325860023498535,
"learning_rate": 9.373623341608624e-06,
"loss": 0.0697,
"step": 659
},
{
"epoch": 0.82,
"grad_norm": 1.155360221862793,
"learning_rate": 9.371733080722911e-06,
"loss": 0.0493,
"step": 660
},
{
"epoch": 0.82,
"grad_norm": 1.2202762365341187,
"learning_rate": 9.36984016313259e-06,
"loss": 0.0425,
"step": 661
},
{
"epoch": 0.82,
"grad_norm": 0.9276245832443237,
"learning_rate": 9.36794458998799e-06,
"loss": 0.0324,
"step": 662
},
{
"epoch": 0.82,
"grad_norm": 0.8629313707351685,
"learning_rate": 9.366046362441047e-06,
"loss": 0.0551,
"step": 663
},
{
"epoch": 0.82,
"grad_norm": 0.3723730742931366,
"learning_rate": 9.36414548164532e-06,
"loss": 0.0157,
"step": 664
},
{
"epoch": 0.82,
"grad_norm": 0.9178370833396912,
"learning_rate": 9.36224194875597e-06,
"loss": 0.0467,
"step": 665
},
{
"epoch": 0.82,
"grad_norm": 0.7394289374351501,
"learning_rate": 9.360335764929781e-06,
"loss": 0.0303,
"step": 666
},
{
"epoch": 0.83,
"grad_norm": 0.757675290107727,
"learning_rate": 9.358426931325137e-06,
"loss": 0.0302,
"step": 667
},
{
"epoch": 0.83,
"grad_norm": 1.3911486864089966,
"learning_rate": 9.356515449102041e-06,
"loss": 0.0544,
"step": 668
},
{
"epoch": 0.83,
"grad_norm": 0.451570063829422,
"learning_rate": 9.354601319422099e-06,
"loss": 0.0207,
"step": 669
},
{
"epoch": 0.83,
"grad_norm": 0.43002304434776306,
"learning_rate": 9.352684543448532e-06,
"loss": 0.0186,
"step": 670
},
{
"epoch": 0.83,
"grad_norm": 0.37833526730537415,
"learning_rate": 9.350765122346162e-06,
"loss": 0.0146,
"step": 671
},
{
"epoch": 0.83,
"grad_norm": 0.9775627255439758,
"learning_rate": 9.348843057281423e-06,
"loss": 0.0451,
"step": 672
},
{
"epoch": 0.83,
"grad_norm": 0.626708447933197,
"learning_rate": 9.346918349422356e-06,
"loss": 0.0301,
"step": 673
},
{
"epoch": 0.83,
"grad_norm": 1.5922341346740723,
"learning_rate": 9.344990999938609e-06,
"loss": 0.0501,
"step": 674
},
{
"epoch": 0.84,
"grad_norm": 1.1948060989379883,
"learning_rate": 9.343061010001428e-06,
"loss": 0.0394,
"step": 675
},
{
"epoch": 0.84,
"grad_norm": 0.9602558016777039,
"learning_rate": 9.341128380783674e-06,
"loss": 0.0429,
"step": 676
},
{
"epoch": 0.84,
"grad_norm": 1.0513089895248413,
"learning_rate": 9.339193113459805e-06,
"loss": 0.0391,
"step": 677
},
{
"epoch": 0.84,
"grad_norm": 1.1344138383865356,
"learning_rate": 9.337255209205884e-06,
"loss": 0.0274,
"step": 678
},
{
"epoch": 0.84,
"grad_norm": 1.1134185791015625,
"learning_rate": 9.335314669199576e-06,
"loss": 0.0604,
"step": 679
},
{
"epoch": 0.84,
"grad_norm": 1.0586154460906982,
"learning_rate": 9.33337149462015e-06,
"loss": 0.0325,
"step": 680
},
{
"epoch": 0.84,
"grad_norm": 1.0996270179748535,
"learning_rate": 9.331425686648472e-06,
"loss": 0.0332,
"step": 681
},
{
"epoch": 0.84,
"grad_norm": 2.7945778369903564,
"learning_rate": 9.32947724646701e-06,
"loss": 0.0664,
"step": 682
},
{
"epoch": 0.85,
"grad_norm": 1.8699554204940796,
"learning_rate": 9.327526175259837e-06,
"loss": 0.0592,
"step": 683
},
{
"epoch": 0.85,
"grad_norm": 1.0859918594360352,
"learning_rate": 9.325572474212615e-06,
"loss": 0.0434,
"step": 684
},
{
"epoch": 0.85,
"grad_norm": 1.2848424911499023,
"learning_rate": 9.323616144512612e-06,
"loss": 0.0343,
"step": 685
},
{
"epoch": 0.85,
"grad_norm": 1.860479474067688,
"learning_rate": 9.321657187348689e-06,
"loss": 0.0581,
"step": 686
},
{
"epoch": 0.85,
"grad_norm": 1.3358099460601807,
"learning_rate": 9.319695603911306e-06,
"loss": 0.059,
"step": 687
},
{
"epoch": 0.85,
"grad_norm": 0.8692423701286316,
"learning_rate": 9.317731395392517e-06,
"loss": 0.0332,
"step": 688
},
{
"epoch": 0.85,
"grad_norm": 1.4998887777328491,
"learning_rate": 9.315764562985976e-06,
"loss": 0.0485,
"step": 689
},
{
"epoch": 0.85,
"grad_norm": 0.5280508995056152,
"learning_rate": 9.313795107886925e-06,
"loss": 0.0249,
"step": 690
},
{
"epoch": 0.86,
"grad_norm": 0.7580534219741821,
"learning_rate": 9.311823031292205e-06,
"loss": 0.0372,
"step": 691
},
{
"epoch": 0.86,
"grad_norm": 0.7582796216011047,
"learning_rate": 9.309848334400247e-06,
"loss": 0.0326,
"step": 692
},
{
"epoch": 0.86,
"grad_norm": 0.6401865482330322,
"learning_rate": 9.307871018411074e-06,
"loss": 0.0301,
"step": 693
},
{
"epoch": 0.86,
"grad_norm": 2.024916410446167,
"learning_rate": 9.305891084526306e-06,
"loss": 0.0723,
"step": 694
},
{
"epoch": 0.86,
"grad_norm": 2.180551767349243,
"learning_rate": 9.303908533949146e-06,
"loss": 0.0639,
"step": 695
},
{
"epoch": 0.86,
"grad_norm": 0.7816917896270752,
"learning_rate": 9.301923367884393e-06,
"loss": 0.0366,
"step": 696
},
{
"epoch": 0.86,
"grad_norm": 0.7270790934562683,
"learning_rate": 9.299935587538432e-06,
"loss": 0.0421,
"step": 697
},
{
"epoch": 0.86,
"grad_norm": 0.8784447312355042,
"learning_rate": 9.29794519411924e-06,
"loss": 0.043,
"step": 698
},
{
"epoch": 0.87,
"grad_norm": 0.6736301779747009,
"learning_rate": 9.29595218883638e-06,
"loss": 0.047,
"step": 699
},
{
"epoch": 0.87,
"grad_norm": 1.0458660125732422,
"learning_rate": 9.293956572900999e-06,
"loss": 0.0295,
"step": 700
},
{
"epoch": 0.87,
"grad_norm": 0.8319834470748901,
"learning_rate": 9.29195834752584e-06,
"loss": 0.0606,
"step": 701
},
{
"epoch": 0.87,
"grad_norm": 1.5236587524414062,
"learning_rate": 9.28995751392522e-06,
"loss": 0.0405,
"step": 702
},
{
"epoch": 0.87,
"grad_norm": 1.4151524305343628,
"learning_rate": 9.28795407331505e-06,
"loss": 0.0397,
"step": 703
},
{
"epoch": 0.87,
"grad_norm": 1.9959708452224731,
"learning_rate": 9.285948026912822e-06,
"loss": 0.0715,
"step": 704
},
{
"epoch": 0.87,
"grad_norm": 0.5822674632072449,
"learning_rate": 9.283939375937609e-06,
"loss": 0.0281,
"step": 705
},
{
"epoch": 0.87,
"grad_norm": 0.7008696794509888,
"learning_rate": 9.28192812161007e-06,
"loss": 0.0486,
"step": 706
},
{
"epoch": 0.88,
"grad_norm": 0.7523006796836853,
"learning_rate": 9.279914265152448e-06,
"loss": 0.0505,
"step": 707
},
{
"epoch": 0.88,
"grad_norm": 1.051295518875122,
"learning_rate": 9.277897807788562e-06,
"loss": 0.0499,
"step": 708
},
{
"epoch": 0.88,
"grad_norm": 0.8184940218925476,
"learning_rate": 9.275878750743818e-06,
"loss": 0.0422,
"step": 709
},
{
"epoch": 0.88,
"grad_norm": 1.372441291809082,
"learning_rate": 9.273857095245192e-06,
"loss": 0.0633,
"step": 710
},
{
"epoch": 0.88,
"grad_norm": 0.6757863759994507,
"learning_rate": 9.271832842521249e-06,
"loss": 0.0366,
"step": 711
},
{
"epoch": 0.88,
"grad_norm": 0.7655669450759888,
"learning_rate": 9.26980599380213e-06,
"loss": 0.0389,
"step": 712
},
{
"epoch": 0.88,
"grad_norm": 1.1087899208068848,
"learning_rate": 9.267776550319548e-06,
"loss": 0.0433,
"step": 713
},
{
"epoch": 0.88,
"grad_norm": 1.6310410499572754,
"learning_rate": 9.265744513306798e-06,
"loss": 0.0471,
"step": 714
},
{
"epoch": 0.88,
"grad_norm": 1.9184622764587402,
"learning_rate": 9.263709883998753e-06,
"loss": 0.0679,
"step": 715
},
{
"epoch": 0.89,
"grad_norm": 2.0910892486572266,
"learning_rate": 9.261672663631854e-06,
"loss": 0.0551,
"step": 716
},
{
"epoch": 0.89,
"grad_norm": 2.9525444507598877,
"learning_rate": 9.259632853444126e-06,
"loss": 0.0682,
"step": 717
},
{
"epoch": 0.89,
"grad_norm": 1.773461103439331,
"learning_rate": 9.257590454675159e-06,
"loss": 0.0441,
"step": 718
},
{
"epoch": 0.89,
"grad_norm": 0.9130051136016846,
"learning_rate": 9.255545468566119e-06,
"loss": 0.0454,
"step": 719
},
{
"epoch": 0.89,
"grad_norm": 0.34200993180274963,
"learning_rate": 9.253497896359749e-06,
"loss": 0.0119,
"step": 720
},
{
"epoch": 0.89,
"grad_norm": 1.0717602968215942,
"learning_rate": 9.251447739300356e-06,
"loss": 0.0552,
"step": 721
},
{
"epoch": 0.89,
"grad_norm": 1.0619879961013794,
"learning_rate": 9.249394998633825e-06,
"loss": 0.0568,
"step": 722
},
{
"epoch": 0.89,
"grad_norm": 0.8811701536178589,
"learning_rate": 9.247339675607606e-06,
"loss": 0.034,
"step": 723
},
{
"epoch": 0.9,
"grad_norm": 0.974205493927002,
"learning_rate": 9.24528177147072e-06,
"loss": 0.0398,
"step": 724
},
{
"epoch": 0.9,
"grad_norm": 0.8818910717964172,
"learning_rate": 9.243221287473755e-06,
"loss": 0.048,
"step": 725
},
{
"epoch": 0.9,
"grad_norm": 0.6580934524536133,
"learning_rate": 9.241158224868871e-06,
"loss": 0.042,
"step": 726
},
{
"epoch": 0.9,
"grad_norm": 1.4452764987945557,
"learning_rate": 9.23909258490979e-06,
"loss": 0.0438,
"step": 727
},
{
"epoch": 0.9,
"grad_norm": 0.6177107095718384,
"learning_rate": 9.237024368851805e-06,
"loss": 0.0434,
"step": 728
},
{
"epoch": 0.9,
"grad_norm": 0.6715316772460938,
"learning_rate": 9.23495357795177e-06,
"loss": 0.0242,
"step": 729
},
{
"epoch": 0.9,
"grad_norm": 1.8438655138015747,
"learning_rate": 9.232880213468106e-06,
"loss": 0.0421,
"step": 730
},
{
"epoch": 0.9,
"grad_norm": 1.011062741279602,
"learning_rate": 9.230804276660799e-06,
"loss": 0.0465,
"step": 731
},
{
"epoch": 0.91,
"grad_norm": 1.2409260272979736,
"learning_rate": 9.228725768791394e-06,
"loss": 0.029,
"step": 732
},
{
"epoch": 0.91,
"grad_norm": 1.2052364349365234,
"learning_rate": 9.226644691123006e-06,
"loss": 0.0465,
"step": 733
},
{
"epoch": 0.91,
"grad_norm": 0.60611891746521,
"learning_rate": 9.224561044920303e-06,
"loss": 0.0328,
"step": 734
},
{
"epoch": 0.91,
"grad_norm": 0.4640844464302063,
"learning_rate": 9.222474831449519e-06,
"loss": 0.0202,
"step": 735
},
{
"epoch": 0.91,
"grad_norm": 1.9622972011566162,
"learning_rate": 9.220386051978449e-06,
"loss": 0.0651,
"step": 736
},
{
"epoch": 0.91,
"grad_norm": 1.8986101150512695,
"learning_rate": 9.218294707776441e-06,
"loss": 0.0556,
"step": 737
},
{
"epoch": 0.91,
"grad_norm": 1.158408284187317,
"learning_rate": 9.216200800114412e-06,
"loss": 0.0368,
"step": 738
},
{
"epoch": 0.91,
"grad_norm": 0.9851293563842773,
"learning_rate": 9.214104330264826e-06,
"loss": 0.053,
"step": 739
},
{
"epoch": 0.92,
"grad_norm": 1.1018086671829224,
"learning_rate": 9.212005299501712e-06,
"loss": 0.0597,
"step": 740
},
{
"epoch": 0.92,
"grad_norm": 1.84424889087677,
"learning_rate": 9.20990370910065e-06,
"loss": 0.0497,
"step": 741
},
{
"epoch": 0.92,
"grad_norm": 1.2366299629211426,
"learning_rate": 9.207799560338779e-06,
"loss": 0.0602,
"step": 742
},
{
"epoch": 0.92,
"grad_norm": 1.1586567163467407,
"learning_rate": 9.20569285449479e-06,
"loss": 0.0316,
"step": 743
},
{
"epoch": 0.92,
"grad_norm": 0.6110067367553711,
"learning_rate": 9.20358359284893e-06,
"loss": 0.0305,
"step": 744
},
{
"epoch": 0.92,
"grad_norm": 0.6773253679275513,
"learning_rate": 9.201471776682999e-06,
"loss": 0.036,
"step": 745
},
{
"epoch": 0.92,
"grad_norm": 0.9832028150558472,
"learning_rate": 9.199357407280349e-06,
"loss": 0.0381,
"step": 746
},
{
"epoch": 0.92,
"grad_norm": 1.0233718156814575,
"learning_rate": 9.197240485925883e-06,
"loss": 0.0549,
"step": 747
},
{
"epoch": 0.93,
"grad_norm": 2.125337839126587,
"learning_rate": 9.195121013906055e-06,
"loss": 0.0776,
"step": 748
},
{
"epoch": 0.93,
"grad_norm": 1.2079508304595947,
"learning_rate": 9.19299899250887e-06,
"loss": 0.0384,
"step": 749
},
{
"epoch": 0.93,
"grad_norm": 1.0452898740768433,
"learning_rate": 9.19087442302388e-06,
"loss": 0.0387,
"step": 750
},
{
"epoch": 0.93,
"grad_norm": 0.8497399687767029,
"learning_rate": 9.18874730674219e-06,
"loss": 0.0386,
"step": 751
},
{
"epoch": 0.93,
"grad_norm": 2.1464147567749023,
"learning_rate": 9.186617644956445e-06,
"loss": 0.0725,
"step": 752
},
{
"epoch": 0.93,
"grad_norm": 0.4441956579685211,
"learning_rate": 9.184485438960846e-06,
"loss": 0.0214,
"step": 753
},
{
"epoch": 0.93,
"grad_norm": 0.818230390548706,
"learning_rate": 9.182350690051134e-06,
"loss": 0.0256,
"step": 754
},
{
"epoch": 0.93,
"grad_norm": 1.0162849426269531,
"learning_rate": 9.180213399524599e-06,
"loss": 0.0592,
"step": 755
},
{
"epoch": 0.94,
"grad_norm": 0.9444966316223145,
"learning_rate": 9.178073568680071e-06,
"loss": 0.0293,
"step": 756
},
{
"epoch": 0.94,
"grad_norm": 0.7616766691207886,
"learning_rate": 9.175931198817926e-06,
"loss": 0.0481,
"step": 757
},
{
"epoch": 0.94,
"grad_norm": 0.47808611392974854,
"learning_rate": 9.173786291240085e-06,
"loss": 0.0287,
"step": 758
},
{
"epoch": 0.94,
"grad_norm": 0.6669220328330994,
"learning_rate": 9.17163884725001e-06,
"loss": 0.0324,
"step": 759
},
{
"epoch": 0.94,
"grad_norm": 0.8807569146156311,
"learning_rate": 9.169488868152704e-06,
"loss": 0.0425,
"step": 760
},
{
"epoch": 0.94,
"grad_norm": 1.2071596384048462,
"learning_rate": 9.16733635525471e-06,
"loss": 0.046,
"step": 761
},
{
"epoch": 0.94,
"grad_norm": 1.2434258460998535,
"learning_rate": 9.165181309864108e-06,
"loss": 0.0383,
"step": 762
},
{
"epoch": 0.94,
"grad_norm": 0.7151886820793152,
"learning_rate": 9.163023733290525e-06,
"loss": 0.0381,
"step": 763
},
{
"epoch": 0.95,
"grad_norm": 0.6364666223526001,
"learning_rate": 9.16086362684512e-06,
"loss": 0.0328,
"step": 764
},
{
"epoch": 0.95,
"grad_norm": 1.2846086025238037,
"learning_rate": 9.15870099184059e-06,
"loss": 0.0317,
"step": 765
},
{
"epoch": 0.95,
"grad_norm": 1.7031409740447998,
"learning_rate": 9.15653582959117e-06,
"loss": 0.0416,
"step": 766
},
{
"epoch": 0.95,
"grad_norm": 1.8931663036346436,
"learning_rate": 9.154368141412632e-06,
"loss": 0.0544,
"step": 767
},
{
"epoch": 0.95,
"grad_norm": 0.5589671730995178,
"learning_rate": 9.152197928622278e-06,
"loss": 0.0204,
"step": 768
},
{
"epoch": 0.95,
"grad_norm": 0.7534042596817017,
"learning_rate": 9.15002519253895e-06,
"loss": 0.0291,
"step": 769
},
{
"epoch": 0.95,
"grad_norm": 0.8194689750671387,
"learning_rate": 9.147849934483019e-06,
"loss": 0.0363,
"step": 770
},
{
"epoch": 0.95,
"grad_norm": 1.4425467252731323,
"learning_rate": 9.145672155776392e-06,
"loss": 0.0583,
"step": 771
},
{
"epoch": 0.96,
"grad_norm": 1.4742876291275024,
"learning_rate": 9.143491857742505e-06,
"loss": 0.0577,
"step": 772
},
{
"epoch": 0.96,
"grad_norm": 0.5303352475166321,
"learning_rate": 9.14130904170633e-06,
"loss": 0.0311,
"step": 773
},
{
"epoch": 0.96,
"grad_norm": 0.7389684915542603,
"learning_rate": 9.13912370899436e-06,
"loss": 0.028,
"step": 774
},
{
"epoch": 0.96,
"grad_norm": 1.5198121070861816,
"learning_rate": 9.136935860934628e-06,
"loss": 0.0461,
"step": 775
},
{
"epoch": 0.96,
"grad_norm": 1.799206256866455,
"learning_rate": 9.134745498856685e-06,
"loss": 0.0478,
"step": 776
},
{
"epoch": 0.96,
"grad_norm": 1.1272491216659546,
"learning_rate": 9.13255262409162e-06,
"loss": 0.0495,
"step": 777
},
{
"epoch": 0.96,
"grad_norm": 1.0748385190963745,
"learning_rate": 9.130357237972044e-06,
"loss": 0.0388,
"step": 778
},
{
"epoch": 0.96,
"grad_norm": 0.8800269961357117,
"learning_rate": 9.128159341832092e-06,
"loss": 0.0233,
"step": 779
},
{
"epoch": 0.97,
"grad_norm": 0.6652606129646301,
"learning_rate": 9.125958937007427e-06,
"loss": 0.0401,
"step": 780
},
{
"epoch": 0.97,
"grad_norm": 0.7951803207397461,
"learning_rate": 9.123756024835237e-06,
"loss": 0.0194,
"step": 781
},
{
"epoch": 0.97,
"grad_norm": 0.6082125902175903,
"learning_rate": 9.121550606654232e-06,
"loss": 0.0221,
"step": 782
},
{
"epoch": 0.97,
"grad_norm": 1.656269907951355,
"learning_rate": 9.119342683804649e-06,
"loss": 0.0267,
"step": 783
},
{
"epoch": 0.97,
"grad_norm": 1.3084255456924438,
"learning_rate": 9.11713225762824e-06,
"loss": 0.0476,
"step": 784
},
{
"epoch": 0.97,
"grad_norm": 0.8326955437660217,
"learning_rate": 9.114919329468283e-06,
"loss": 0.0223,
"step": 785
},
{
"epoch": 0.97,
"grad_norm": 0.612882673740387,
"learning_rate": 9.112703900669577e-06,
"loss": 0.0186,
"step": 786
},
{
"epoch": 0.97,
"grad_norm": 1.0400992631912231,
"learning_rate": 9.110485972578439e-06,
"loss": 0.0494,
"step": 787
},
{
"epoch": 0.98,
"grad_norm": 0.9465930461883545,
"learning_rate": 9.108265546542705e-06,
"loss": 0.0336,
"step": 788
},
{
"epoch": 0.98,
"grad_norm": 0.8121449947357178,
"learning_rate": 9.106042623911728e-06,
"loss": 0.0392,
"step": 789
},
{
"epoch": 0.98,
"grad_norm": 1.7355393171310425,
"learning_rate": 9.103817206036383e-06,
"loss": 0.0492,
"step": 790
},
{
"epoch": 0.98,
"grad_norm": 0.5920339822769165,
"learning_rate": 9.101589294269054e-06,
"loss": 0.0354,
"step": 791
},
{
"epoch": 0.98,
"grad_norm": 1.1976126432418823,
"learning_rate": 9.099358889963643e-06,
"loss": 0.0618,
"step": 792
},
{
"epoch": 0.98,
"grad_norm": 1.0642493963241577,
"learning_rate": 9.097125994475572e-06,
"loss": 0.0555,
"step": 793
},
{
"epoch": 0.98,
"grad_norm": 1.2092516422271729,
"learning_rate": 9.09489060916177e-06,
"loss": 0.0391,
"step": 794
},
{
"epoch": 0.98,
"grad_norm": 0.67398601770401,
"learning_rate": 9.092652735380683e-06,
"loss": 0.0196,
"step": 795
},
{
"epoch": 0.99,
"grad_norm": 0.8952963948249817,
"learning_rate": 9.09041237449227e-06,
"loss": 0.0246,
"step": 796
},
{
"epoch": 0.99,
"grad_norm": 0.7937426567077637,
"learning_rate": 9.088169527857996e-06,
"loss": 0.0449,
"step": 797
},
{
"epoch": 0.99,
"grad_norm": 1.0983673334121704,
"learning_rate": 9.085924196840841e-06,
"loss": 0.0577,
"step": 798
},
{
"epoch": 0.99,
"grad_norm": 1.7625383138656616,
"learning_rate": 9.083676382805295e-06,
"loss": 0.0609,
"step": 799
},
{
"epoch": 0.99,
"grad_norm": 1.6659592390060425,
"learning_rate": 9.081426087117356e-06,
"loss": 0.0453,
"step": 800
}
],
"logging_steps": 1.0,
"max_steps": 4040,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 100,
"total_flos": 2.335555778196275e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}