{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9900990099009901, "eval_steps": 500, "global_step": 800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 73.68818664550781, "learning_rate": 1.0000000000000002e-06, "loss": 8.0388, "step": 1 }, { "epoch": 0.0, "grad_norm": 71.36270904541016, "learning_rate": 2.0000000000000003e-06, "loss": 8.0003, "step": 2 }, { "epoch": 0.0, "grad_norm": 73.16751861572266, "learning_rate": 3e-06, "loss": 7.9032, "step": 3 }, { "epoch": 0.0, "grad_norm": 74.18943786621094, "learning_rate": 4.000000000000001e-06, "loss": 7.921, "step": 4 }, { "epoch": 0.01, "grad_norm": 70.63272857666016, "learning_rate": 5e-06, "loss": 8.032, "step": 5 }, { "epoch": 0.01, "grad_norm": 64.6897964477539, "learning_rate": 6e-06, "loss": 7.68, "step": 6 }, { "epoch": 0.01, "grad_norm": 65.79997253417969, "learning_rate": 7e-06, "loss": 7.5291, "step": 7 }, { "epoch": 0.01, "grad_norm": 63.4569091796875, "learning_rate": 8.000000000000001e-06, "loss": 5.6132, "step": 8 }, { "epoch": 0.01, "grad_norm": 63.28990173339844, "learning_rate": 9e-06, "loss": 5.0102, "step": 9 }, { "epoch": 0.01, "grad_norm": 46.30258560180664, "learning_rate": 1e-05, "loss": 2.2227, "step": 10 }, { "epoch": 0.01, "grad_norm": 46.01011657714844, "learning_rate": 9.99999848074862e-06, "loss": 1.6679, "step": 11 }, { "epoch": 0.01, "grad_norm": 42.595951080322266, "learning_rate": 9.9999939229954e-06, "loss": 1.5493, "step": 12 }, { "epoch": 0.02, "grad_norm": 11.979974746704102, "learning_rate": 9.999986326743111e-06, "loss": 0.892, "step": 13 }, { "epoch": 0.02, "grad_norm": 13.096778869628906, "learning_rate": 9.99997569199637e-06, "loss": 0.9386, "step": 14 }, { "epoch": 0.02, "grad_norm": 35.61207962036133, "learning_rate": 9.99996201876164e-06, "loss": 1.3573, "step": 15 }, { "epoch": 0.02, "grad_norm": 18.184959411621094, "learning_rate": 9.999945307047228e-06, "loss": 0.9778, "step": 16 }, { "epoch": 0.02, "grad_norm": 6.461019992828369, "learning_rate": 9.99992555686329e-06, "loss": 1.0665, "step": 17 }, { "epoch": 0.02, "grad_norm": 4.743849277496338, "learning_rate": 9.99990276822183e-06, "loss": 0.5975, "step": 18 }, { "epoch": 0.02, "grad_norm": 5.654608726501465, "learning_rate": 9.999876941136697e-06, "loss": 0.856, "step": 19 }, { "epoch": 0.02, "grad_norm": 5.488308906555176, "learning_rate": 9.999848075623584e-06, "loss": 0.7874, "step": 20 }, { "epoch": 0.03, "grad_norm": 5.833119869232178, "learning_rate": 9.999816171700034e-06, "loss": 0.8777, "step": 21 }, { "epoch": 0.03, "grad_norm": 3.678900718688965, "learning_rate": 9.999781229385433e-06, "loss": 0.5888, "step": 22 }, { "epoch": 0.03, "grad_norm": 6.357454776763916, "learning_rate": 9.99974324870102e-06, "loss": 0.9263, "step": 23 }, { "epoch": 0.03, "grad_norm": 5.7684149742126465, "learning_rate": 9.99970222966987e-06, "loss": 0.7734, "step": 24 }, { "epoch": 0.03, "grad_norm": 6.77016019821167, "learning_rate": 9.999658172316915e-06, "loss": 0.7735, "step": 25 }, { "epoch": 0.03, "grad_norm": 4.0211334228515625, "learning_rate": 9.999611076668926e-06, "loss": 0.5645, "step": 26 }, { "epoch": 0.03, "grad_norm": 7.3770222663879395, "learning_rate": 9.999560942754525e-06, "loss": 1.0185, "step": 27 }, { "epoch": 0.03, "grad_norm": 4.433741569519043, "learning_rate": 9.999507770604177e-06, "loss": 0.3547, "step": 28 }, { "epoch": 0.04, "grad_norm": 6.0549492835998535, "learning_rate": 9.999451560250196e-06, "loss": 0.4961, "step": 29 }, { "epoch": 0.04, "grad_norm": 7.8142619132995605, "learning_rate": 9.999392311726738e-06, "loss": 0.4398, "step": 30 }, { "epoch": 0.04, "grad_norm": 5.785826683044434, "learning_rate": 9.999330025069812e-06, "loss": 0.6431, "step": 31 }, { "epoch": 0.04, "grad_norm": 6.010104656219482, "learning_rate": 9.999264700317268e-06, "loss": 0.6129, "step": 32 }, { "epoch": 0.04, "grad_norm": 8.289867401123047, "learning_rate": 9.999196337508804e-06, "loss": 0.3771, "step": 33 }, { "epoch": 0.04, "grad_norm": 5.59083890914917, "learning_rate": 9.999124936685965e-06, "loss": 0.3964, "step": 34 }, { "epoch": 0.04, "grad_norm": 6.018394947052002, "learning_rate": 9.99905049789214e-06, "loss": 0.4801, "step": 35 }, { "epoch": 0.04, "grad_norm": 10.878011703491211, "learning_rate": 9.998973021172564e-06, "loss": 0.2996, "step": 36 }, { "epoch": 0.05, "grad_norm": 31.676380157470703, "learning_rate": 9.998892506574325e-06, "loss": 0.5261, "step": 37 }, { "epoch": 0.05, "grad_norm": 16.133407592773438, "learning_rate": 9.998808954146347e-06, "loss": 0.3843, "step": 38 }, { "epoch": 0.05, "grad_norm": 8.785749435424805, "learning_rate": 9.998722363939407e-06, "loss": 0.2476, "step": 39 }, { "epoch": 0.05, "grad_norm": 4.326422214508057, "learning_rate": 9.998632736006124e-06, "loss": 0.2334, "step": 40 }, { "epoch": 0.05, "grad_norm": 6.171711444854736, "learning_rate": 9.998540070400966e-06, "loss": 0.1671, "step": 41 }, { "epoch": 0.05, "grad_norm": 3.5893757343292236, "learning_rate": 9.998444367180247e-06, "loss": 0.1732, "step": 42 }, { "epoch": 0.05, "grad_norm": 2.918233633041382, "learning_rate": 9.998345626402124e-06, "loss": 0.1127, "step": 43 }, { "epoch": 0.05, "grad_norm": 2.620290517807007, "learning_rate": 9.998243848126604e-06, "loss": 0.1337, "step": 44 }, { "epoch": 0.06, "grad_norm": 4.31186056137085, "learning_rate": 9.998139032415534e-06, "loss": 0.144, "step": 45 }, { "epoch": 0.06, "grad_norm": 3.399256706237793, "learning_rate": 9.998031179332618e-06, "loss": 0.0878, "step": 46 }, { "epoch": 0.06, "grad_norm": 1.658913016319275, "learning_rate": 9.997920288943388e-06, "loss": 0.0651, "step": 47 }, { "epoch": 0.06, "grad_norm": 3.0306262969970703, "learning_rate": 9.99780636131524e-06, "loss": 0.1051, "step": 48 }, { "epoch": 0.06, "grad_norm": 2.099931478500366, "learning_rate": 9.997689396517408e-06, "loss": 0.0937, "step": 49 }, { "epoch": 0.06, "grad_norm": 2.8879055976867676, "learning_rate": 9.997569394620965e-06, "loss": 0.0701, "step": 50 }, { "epoch": 0.06, "grad_norm": 1.2706345319747925, "learning_rate": 9.997446355698843e-06, "loss": 0.0559, "step": 51 }, { "epoch": 0.06, "grad_norm": 1.7181872129440308, "learning_rate": 9.99732027982581e-06, "loss": 0.06, "step": 52 }, { "epoch": 0.07, "grad_norm": 1.4794338941574097, "learning_rate": 9.997191167078479e-06, "loss": 0.0715, "step": 53 }, { "epoch": 0.07, "grad_norm": 1.40012788772583, "learning_rate": 9.99705901753532e-06, "loss": 0.0608, "step": 54 }, { "epoch": 0.07, "grad_norm": 3.034327983856201, "learning_rate": 9.996923831276632e-06, "loss": 0.0603, "step": 55 }, { "epoch": 0.07, "grad_norm": 3.1301584243774414, "learning_rate": 9.996785608384573e-06, "loss": 0.0762, "step": 56 }, { "epoch": 0.07, "grad_norm": 2.8218867778778076, "learning_rate": 9.996644348943141e-06, "loss": 0.0956, "step": 57 }, { "epoch": 0.07, "grad_norm": 1.7874306440353394, "learning_rate": 9.996500053038176e-06, "loss": 0.0693, "step": 58 }, { "epoch": 0.07, "grad_norm": 1.582387089729309, "learning_rate": 9.99635272075737e-06, "loss": 0.0679, "step": 59 }, { "epoch": 0.07, "grad_norm": 1.9699870347976685, "learning_rate": 9.996202352190256e-06, "loss": 0.0565, "step": 60 }, { "epoch": 0.08, "grad_norm": 2.0471878051757812, "learning_rate": 9.996048947428212e-06, "loss": 0.0725, "step": 61 }, { "epoch": 0.08, "grad_norm": 1.3680695295333862, "learning_rate": 9.995892506564461e-06, "loss": 0.0586, "step": 62 }, { "epoch": 0.08, "grad_norm": 4.124834060668945, "learning_rate": 9.995733029694077e-06, "loss": 0.0724, "step": 63 }, { "epoch": 0.08, "grad_norm": 2.7247884273529053, "learning_rate": 9.995570516913971e-06, "loss": 0.095, "step": 64 }, { "epoch": 0.08, "grad_norm": 5.306038856506348, "learning_rate": 9.995404968322902e-06, "loss": 0.0783, "step": 65 }, { "epoch": 0.08, "grad_norm": 1.6850618124008179, "learning_rate": 9.995236384021474e-06, "loss": 0.0602, "step": 66 }, { "epoch": 0.08, "grad_norm": 7.234889984130859, "learning_rate": 9.995064764112135e-06, "loss": 0.0852, "step": 67 }, { "epoch": 0.08, "grad_norm": 3.0967376232147217, "learning_rate": 9.994890108699182e-06, "loss": 0.0905, "step": 68 }, { "epoch": 0.09, "grad_norm": 3.0317909717559814, "learning_rate": 9.99471241788875e-06, "loss": 0.0728, "step": 69 }, { "epoch": 0.09, "grad_norm": 4.1822967529296875, "learning_rate": 9.994531691788822e-06, "loss": 0.0919, "step": 70 }, { "epoch": 0.09, "grad_norm": 2.137779474258423, "learning_rate": 9.994347930509225e-06, "loss": 0.0496, "step": 71 }, { "epoch": 0.09, "grad_norm": 1.5484669208526611, "learning_rate": 9.994161134161635e-06, "loss": 0.0696, "step": 72 }, { "epoch": 0.09, "grad_norm": 4.332581996917725, "learning_rate": 9.993971302859561e-06, "loss": 0.0769, "step": 73 }, { "epoch": 0.09, "grad_norm": 1.838725209236145, "learning_rate": 9.99377843671837e-06, "loss": 0.1011, "step": 74 }, { "epoch": 0.09, "grad_norm": 3.3594350814819336, "learning_rate": 9.993582535855265e-06, "loss": 0.075, "step": 75 }, { "epoch": 0.09, "grad_norm": 1.8417227268218994, "learning_rate": 9.993383600389294e-06, "loss": 0.0529, "step": 76 }, { "epoch": 0.1, "grad_norm": 4.550814151763916, "learning_rate": 9.993181630441352e-06, "loss": 0.1104, "step": 77 }, { "epoch": 0.1, "grad_norm": 1.729711651802063, "learning_rate": 9.992976626134171e-06, "loss": 0.0601, "step": 78 }, { "epoch": 0.1, "grad_norm": 2.1105282306671143, "learning_rate": 9.99276858759234e-06, "loss": 0.0423, "step": 79 }, { "epoch": 0.1, "grad_norm": 2.180546522140503, "learning_rate": 9.992557514942278e-06, "loss": 0.0691, "step": 80 }, { "epoch": 0.1, "grad_norm": 3.0761630535125732, "learning_rate": 9.992343408312258e-06, "loss": 0.0503, "step": 81 }, { "epoch": 0.1, "grad_norm": 0.8641157150268555, "learning_rate": 9.992126267832392e-06, "loss": 0.0425, "step": 82 }, { "epoch": 0.1, "grad_norm": 10.724833488464355, "learning_rate": 9.991906093634633e-06, "loss": 0.0603, "step": 83 }, { "epoch": 0.1, "grad_norm": 1.1756705045700073, "learning_rate": 9.991682885852784e-06, "loss": 0.0392, "step": 84 }, { "epoch": 0.11, "grad_norm": 1.1171228885650635, "learning_rate": 9.991456644622489e-06, "loss": 0.0454, "step": 85 }, { "epoch": 0.11, "grad_norm": 1.6004431247711182, "learning_rate": 9.991227370081233e-06, "loss": 0.0496, "step": 86 }, { "epoch": 0.11, "grad_norm": 3.264841318130493, "learning_rate": 9.990995062368346e-06, "loss": 0.0339, "step": 87 }, { "epoch": 0.11, "grad_norm": 2.4765560626983643, "learning_rate": 9.990759721625005e-06, "loss": 0.0698, "step": 88 }, { "epoch": 0.11, "grad_norm": 6.907183647155762, "learning_rate": 9.990521347994224e-06, "loss": 0.1026, "step": 89 }, { "epoch": 0.11, "grad_norm": 5.384580135345459, "learning_rate": 9.990279941620861e-06, "loss": 0.0664, "step": 90 }, { "epoch": 0.11, "grad_norm": 1.2060827016830444, "learning_rate": 9.990035502651624e-06, "loss": 0.0324, "step": 91 }, { "epoch": 0.11, "grad_norm": 1.0956050157546997, "learning_rate": 9.989788031235054e-06, "loss": 0.0593, "step": 92 }, { "epoch": 0.12, "grad_norm": 2.1994054317474365, "learning_rate": 9.98953752752154e-06, "loss": 0.0484, "step": 93 }, { "epoch": 0.12, "grad_norm": 3.490142583847046, "learning_rate": 9.989283991663316e-06, "loss": 0.0561, "step": 94 }, { "epoch": 0.12, "grad_norm": 4.274105072021484, "learning_rate": 9.989027423814454e-06, "loss": 0.1123, "step": 95 }, { "epoch": 0.12, "grad_norm": 3.0847527980804443, "learning_rate": 9.98876782413087e-06, "loss": 0.0606, "step": 96 }, { "epoch": 0.12, "grad_norm": 1.8111186027526855, "learning_rate": 9.988505192770324e-06, "loss": 0.0681, "step": 97 }, { "epoch": 0.12, "grad_norm": 1.2713731527328491, "learning_rate": 9.988239529892416e-06, "loss": 0.0516, "step": 98 }, { "epoch": 0.12, "grad_norm": 1.189513087272644, "learning_rate": 9.987970835658592e-06, "loss": 0.0768, "step": 99 }, { "epoch": 0.12, "grad_norm": 0.9951283931732178, "learning_rate": 9.987699110232134e-06, "loss": 0.0416, "step": 100 }, { "epoch": 0.12, "grad_norm": 1.4628676176071167, "learning_rate": 9.987424353778172e-06, "loss": 0.0751, "step": 101 }, { "epoch": 0.13, "grad_norm": 1.41041100025177, "learning_rate": 9.987146566463677e-06, "loss": 0.0681, "step": 102 }, { "epoch": 0.13, "grad_norm": 1.9383851289749146, "learning_rate": 9.986865748457457e-06, "loss": 0.1003, "step": 103 }, { "epoch": 0.13, "grad_norm": 1.1434725522994995, "learning_rate": 9.986581899930167e-06, "loss": 0.049, "step": 104 }, { "epoch": 0.13, "grad_norm": 3.613456964492798, "learning_rate": 9.986295021054302e-06, "loss": 0.0519, "step": 105 }, { "epoch": 0.13, "grad_norm": 3.5484371185302734, "learning_rate": 9.986005112004198e-06, "loss": 0.0571, "step": 106 }, { "epoch": 0.13, "grad_norm": 1.9423480033874512, "learning_rate": 9.985712172956035e-06, "loss": 0.039, "step": 107 }, { "epoch": 0.13, "grad_norm": 2.0560059547424316, "learning_rate": 9.985416204087828e-06, "loss": 0.0904, "step": 108 }, { "epoch": 0.13, "grad_norm": 6.695100784301758, "learning_rate": 9.985117205579442e-06, "loss": 0.1549, "step": 109 }, { "epoch": 0.14, "grad_norm": 2.4656105041503906, "learning_rate": 9.984815177612574e-06, "loss": 0.079, "step": 110 }, { "epoch": 0.14, "grad_norm": 1.974007487297058, "learning_rate": 9.984510120370771e-06, "loss": 0.0585, "step": 111 }, { "epoch": 0.14, "grad_norm": 1.3341798782348633, "learning_rate": 9.984202034039414e-06, "loss": 0.0585, "step": 112 }, { "epoch": 0.14, "grad_norm": 2.7250359058380127, "learning_rate": 9.983890918805727e-06, "loss": 0.0651, "step": 113 }, { "epoch": 0.14, "grad_norm": 4.140810489654541, "learning_rate": 9.983576774858776e-06, "loss": 0.0748, "step": 114 }, { "epoch": 0.14, "grad_norm": 6.119039058685303, "learning_rate": 9.983259602389469e-06, "loss": 0.0818, "step": 115 }, { "epoch": 0.14, "grad_norm": 1.3782867193222046, "learning_rate": 9.982939401590545e-06, "loss": 0.0563, "step": 116 }, { "epoch": 0.14, "grad_norm": 1.240810513496399, "learning_rate": 9.982616172656594e-06, "loss": 0.0555, "step": 117 }, { "epoch": 0.15, "grad_norm": 2.0260303020477295, "learning_rate": 9.982289915784044e-06, "loss": 0.0554, "step": 118 }, { "epoch": 0.15, "grad_norm": 2.1243703365325928, "learning_rate": 9.981960631171162e-06, "loss": 0.0584, "step": 119 }, { "epoch": 0.15, "grad_norm": 2.7996938228607178, "learning_rate": 9.98162831901805e-06, "loss": 0.0854, "step": 120 }, { "epoch": 0.15, "grad_norm": 1.3062973022460938, "learning_rate": 9.981292979526656e-06, "loss": 0.0821, "step": 121 }, { "epoch": 0.15, "grad_norm": 1.2655537128448486, "learning_rate": 9.980954612900768e-06, "loss": 0.0643, "step": 122 }, { "epoch": 0.15, "grad_norm": 4.0950798988342285, "learning_rate": 9.980613219346012e-06, "loss": 0.0994, "step": 123 }, { "epoch": 0.15, "grad_norm": 1.522292971611023, "learning_rate": 9.980268799069848e-06, "loss": 0.0369, "step": 124 }, { "epoch": 0.15, "grad_norm": 2.5451443195343018, "learning_rate": 9.979921352281585e-06, "loss": 0.0286, "step": 125 }, { "epoch": 0.16, "grad_norm": 1.8015575408935547, "learning_rate": 9.979570879192365e-06, "loss": 0.0736, "step": 126 }, { "epoch": 0.16, "grad_norm": 3.2620017528533936, "learning_rate": 9.979217380015173e-06, "loss": 0.0662, "step": 127 }, { "epoch": 0.16, "grad_norm": 0.5585700273513794, "learning_rate": 9.978860854964827e-06, "loss": 0.0248, "step": 128 }, { "epoch": 0.16, "grad_norm": 1.1841486692428589, "learning_rate": 9.978501304257991e-06, "loss": 0.0386, "step": 129 }, { "epoch": 0.16, "grad_norm": 1.1351743936538696, "learning_rate": 9.97813872811316e-06, "loss": 0.0437, "step": 130 }, { "epoch": 0.16, "grad_norm": 2.4472172260284424, "learning_rate": 9.977773126750677e-06, "loss": 0.074, "step": 131 }, { "epoch": 0.16, "grad_norm": 0.9335076808929443, "learning_rate": 9.977404500392711e-06, "loss": 0.034, "step": 132 }, { "epoch": 0.16, "grad_norm": 1.9846038818359375, "learning_rate": 9.977032849263284e-06, "loss": 0.0488, "step": 133 }, { "epoch": 0.17, "grad_norm": 1.003464698791504, "learning_rate": 9.976658173588244e-06, "loss": 0.0199, "step": 134 }, { "epoch": 0.17, "grad_norm": 1.1298803091049194, "learning_rate": 9.976280473595284e-06, "loss": 0.0507, "step": 135 }, { "epoch": 0.17, "grad_norm": 4.0241546630859375, "learning_rate": 9.975899749513928e-06, "loss": 0.097, "step": 136 }, { "epoch": 0.17, "grad_norm": 2.1224637031555176, "learning_rate": 9.975516001575549e-06, "loss": 0.0656, "step": 137 }, { "epoch": 0.17, "grad_norm": 1.3180643320083618, "learning_rate": 9.975129230013347e-06, "loss": 0.0839, "step": 138 }, { "epoch": 0.17, "grad_norm": 2.089977979660034, "learning_rate": 9.974739435062364e-06, "loss": 0.0571, "step": 139 }, { "epoch": 0.17, "grad_norm": 1.773493766784668, "learning_rate": 9.974346616959476e-06, "loss": 0.025, "step": 140 }, { "epoch": 0.17, "grad_norm": 2.1019980907440186, "learning_rate": 9.973950775943403e-06, "loss": 0.0447, "step": 141 }, { "epoch": 0.18, "grad_norm": 1.4967840909957886, "learning_rate": 9.973551912254696e-06, "loss": 0.0422, "step": 142 }, { "epoch": 0.18, "grad_norm": 1.1371103525161743, "learning_rate": 9.973150026135743e-06, "loss": 0.0648, "step": 143 }, { "epoch": 0.18, "grad_norm": 0.8660270571708679, "learning_rate": 9.972745117830774e-06, "loss": 0.0344, "step": 144 }, { "epoch": 0.18, "grad_norm": 5.05332088470459, "learning_rate": 9.972337187585848e-06, "loss": 0.1036, "step": 145 }, { "epoch": 0.18, "grad_norm": 1.1562827825546265, "learning_rate": 9.971926235648868e-06, "loss": 0.041, "step": 146 }, { "epoch": 0.18, "grad_norm": 3.426886558532715, "learning_rate": 9.971512262269568e-06, "loss": 0.127, "step": 147 }, { "epoch": 0.18, "grad_norm": 1.173113465309143, "learning_rate": 9.97109526769952e-06, "loss": 0.0525, "step": 148 }, { "epoch": 0.18, "grad_norm": 1.1487282514572144, "learning_rate": 9.970675252192133e-06, "loss": 0.052, "step": 149 }, { "epoch": 0.19, "grad_norm": 1.5633060932159424, "learning_rate": 9.970252216002647e-06, "loss": 0.0389, "step": 150 }, { "epoch": 0.19, "grad_norm": 1.445123314857483, "learning_rate": 9.969826159388145e-06, "loss": 0.0521, "step": 151 }, { "epoch": 0.19, "grad_norm": 0.8425119519233704, "learning_rate": 9.96939708260754e-06, "loss": 0.0513, "step": 152 }, { "epoch": 0.19, "grad_norm": 0.9555310606956482, "learning_rate": 9.968964985921584e-06, "loss": 0.0574, "step": 153 }, { "epoch": 0.19, "grad_norm": 1.8024086952209473, "learning_rate": 9.96852986959286e-06, "loss": 0.058, "step": 154 }, { "epoch": 0.19, "grad_norm": 1.4136022329330444, "learning_rate": 9.96809173388579e-06, "loss": 0.0402, "step": 155 }, { "epoch": 0.19, "grad_norm": 0.9865325093269348, "learning_rate": 9.96765057906663e-06, "loss": 0.0555, "step": 156 }, { "epoch": 0.19, "grad_norm": 1.3715591430664062, "learning_rate": 9.967206405403468e-06, "loss": 0.0549, "step": 157 }, { "epoch": 0.2, "grad_norm": 1.10662841796875, "learning_rate": 9.966759213166231e-06, "loss": 0.0584, "step": 158 }, { "epoch": 0.2, "grad_norm": 1.3035138845443726, "learning_rate": 9.966309002626676e-06, "loss": 0.0398, "step": 159 }, { "epoch": 0.2, "grad_norm": 2.7275445461273193, "learning_rate": 9.965855774058395e-06, "loss": 0.0583, "step": 160 }, { "epoch": 0.2, "grad_norm": 1.4070425033569336, "learning_rate": 9.965399527736819e-06, "loss": 0.0476, "step": 161 }, { "epoch": 0.2, "grad_norm": 1.2913644313812256, "learning_rate": 9.964940263939206e-06, "loss": 0.0693, "step": 162 }, { "epoch": 0.2, "grad_norm": 5.090683937072754, "learning_rate": 9.964477982944654e-06, "loss": 0.0737, "step": 163 }, { "epoch": 0.2, "grad_norm": 4.244226455688477, "learning_rate": 9.964012685034087e-06, "loss": 0.0659, "step": 164 }, { "epoch": 0.2, "grad_norm": 1.7967549562454224, "learning_rate": 9.96354437049027e-06, "loss": 0.0226, "step": 165 }, { "epoch": 0.21, "grad_norm": 1.695214033126831, "learning_rate": 9.963073039597798e-06, "loss": 0.0772, "step": 166 }, { "epoch": 0.21, "grad_norm": 2.0708000659942627, "learning_rate": 9.962598692643098e-06, "loss": 0.053, "step": 167 }, { "epoch": 0.21, "grad_norm": 2.1509592533111572, "learning_rate": 9.962121329914432e-06, "loss": 0.0714, "step": 168 }, { "epoch": 0.21, "grad_norm": 2.4323039054870605, "learning_rate": 9.961640951701892e-06, "loss": 0.0456, "step": 169 }, { "epoch": 0.21, "grad_norm": 2.304720640182495, "learning_rate": 9.961157558297404e-06, "loss": 0.0854, "step": 170 }, { "epoch": 0.21, "grad_norm": 0.8575959205627441, "learning_rate": 9.960671149994727e-06, "loss": 0.0374, "step": 171 }, { "epoch": 0.21, "grad_norm": 1.106746792793274, "learning_rate": 9.960181727089455e-06, "loss": 0.0515, "step": 172 }, { "epoch": 0.21, "grad_norm": 1.6459972858428955, "learning_rate": 9.959689289879003e-06, "loss": 0.0514, "step": 173 }, { "epoch": 0.22, "grad_norm": 1.5684750080108643, "learning_rate": 9.959193838662634e-06, "loss": 0.0669, "step": 174 }, { "epoch": 0.22, "grad_norm": 1.1011048555374146, "learning_rate": 9.958695373741428e-06, "loss": 0.0406, "step": 175 }, { "epoch": 0.22, "grad_norm": 0.9976766109466553, "learning_rate": 9.958193895418305e-06, "loss": 0.0377, "step": 176 }, { "epoch": 0.22, "grad_norm": 1.4583932161331177, "learning_rate": 9.957689403998012e-06, "loss": 0.06, "step": 177 }, { "epoch": 0.22, "grad_norm": 1.1599044799804688, "learning_rate": 9.95718189978713e-06, "loss": 0.0406, "step": 178 }, { "epoch": 0.22, "grad_norm": 0.9436582326889038, "learning_rate": 9.95667138309407e-06, "loss": 0.0361, "step": 179 }, { "epoch": 0.22, "grad_norm": 2.8169147968292236, "learning_rate": 9.956157854229072e-06, "loss": 0.0597, "step": 180 }, { "epoch": 0.22, "grad_norm": 0.9190147519111633, "learning_rate": 9.955641313504208e-06, "loss": 0.0258, "step": 181 }, { "epoch": 0.23, "grad_norm": 0.8643155694007874, "learning_rate": 9.95512176123338e-06, "loss": 0.0327, "step": 182 }, { "epoch": 0.23, "grad_norm": 1.2514710426330566, "learning_rate": 9.95459919773232e-06, "loss": 0.0723, "step": 183 }, { "epoch": 0.23, "grad_norm": 1.3103550672531128, "learning_rate": 9.954073623318593e-06, "loss": 0.0576, "step": 184 }, { "epoch": 0.23, "grad_norm": 2.092473268508911, "learning_rate": 9.953545038311587e-06, "loss": 0.0734, "step": 185 }, { "epoch": 0.23, "grad_norm": 2.5062074661254883, "learning_rate": 9.953013443032524e-06, "loss": 0.0483, "step": 186 }, { "epoch": 0.23, "grad_norm": 2.1158766746520996, "learning_rate": 9.952478837804459e-06, "loss": 0.0345, "step": 187 }, { "epoch": 0.23, "grad_norm": 2.5865800380706787, "learning_rate": 9.951941222952264e-06, "loss": 0.0557, "step": 188 }, { "epoch": 0.23, "grad_norm": 2.171496868133545, "learning_rate": 9.951400598802655e-06, "loss": 0.062, "step": 189 }, { "epoch": 0.24, "grad_norm": 0.9497528076171875, "learning_rate": 9.950856965684167e-06, "loss": 0.0365, "step": 190 }, { "epoch": 0.24, "grad_norm": 1.4575358629226685, "learning_rate": 9.950310323927165e-06, "loss": 0.0648, "step": 191 }, { "epoch": 0.24, "grad_norm": 2.8335795402526855, "learning_rate": 9.949760673863846e-06, "loss": 0.0611, "step": 192 }, { "epoch": 0.24, "grad_norm": 1.1269536018371582, "learning_rate": 9.949208015828232e-06, "loss": 0.0541, "step": 193 }, { "epoch": 0.24, "grad_norm": 0.9925274848937988, "learning_rate": 9.948652350156172e-06, "loss": 0.0275, "step": 194 }, { "epoch": 0.24, "grad_norm": 1.2717292308807373, "learning_rate": 9.948093677185345e-06, "loss": 0.041, "step": 195 }, { "epoch": 0.24, "grad_norm": 1.1867843866348267, "learning_rate": 9.947531997255256e-06, "loss": 0.0517, "step": 196 }, { "epoch": 0.24, "grad_norm": 1.1004167795181274, "learning_rate": 9.946967310707241e-06, "loss": 0.0503, "step": 197 }, { "epoch": 0.25, "grad_norm": 1.8476804494857788, "learning_rate": 9.946399617884457e-06, "loss": 0.0419, "step": 198 }, { "epoch": 0.25, "grad_norm": 1.3617258071899414, "learning_rate": 9.945828919131894e-06, "loss": 0.0273, "step": 199 }, { "epoch": 0.25, "grad_norm": 1.4114432334899902, "learning_rate": 9.945255214796366e-06, "loss": 0.0448, "step": 200 }, { "epoch": 0.25, "grad_norm": 1.4074312448501587, "learning_rate": 9.944678505226511e-06, "loss": 0.0637, "step": 201 }, { "epoch": 0.25, "grad_norm": 1.2234091758728027, "learning_rate": 9.944098790772797e-06, "loss": 0.0497, "step": 202 }, { "epoch": 0.25, "grad_norm": 1.3652763366699219, "learning_rate": 9.943516071787517e-06, "loss": 0.0555, "step": 203 }, { "epoch": 0.25, "grad_norm": 2.020076036453247, "learning_rate": 9.942930348624788e-06, "loss": 0.0488, "step": 204 }, { "epoch": 0.25, "grad_norm": 1.1463106870651245, "learning_rate": 9.942341621640558e-06, "loss": 0.0498, "step": 205 }, { "epoch": 0.25, "grad_norm": 1.1451953649520874, "learning_rate": 9.941749891192594e-06, "loss": 0.0485, "step": 206 }, { "epoch": 0.26, "grad_norm": 2.710951805114746, "learning_rate": 9.94115515764049e-06, "loss": 0.0485, "step": 207 }, { "epoch": 0.26, "grad_norm": 1.6404072046279907, "learning_rate": 9.940557421345667e-06, "loss": 0.0387, "step": 208 }, { "epoch": 0.26, "grad_norm": 1.1222543716430664, "learning_rate": 9.939956682671372e-06, "loss": 0.0586, "step": 209 }, { "epoch": 0.26, "grad_norm": 1.6379327774047852, "learning_rate": 9.939352941982671e-06, "loss": 0.068, "step": 210 }, { "epoch": 0.26, "grad_norm": 1.2636500597000122, "learning_rate": 9.938746199646458e-06, "loss": 0.0413, "step": 211 }, { "epoch": 0.26, "grad_norm": 1.1981465816497803, "learning_rate": 9.938136456031454e-06, "loss": 0.0259, "step": 212 }, { "epoch": 0.26, "grad_norm": 1.2407490015029907, "learning_rate": 9.937523711508196e-06, "loss": 0.0413, "step": 213 }, { "epoch": 0.26, "grad_norm": 1.5851786136627197, "learning_rate": 9.93690796644905e-06, "loss": 0.0452, "step": 214 }, { "epoch": 0.27, "grad_norm": 1.1833544969558716, "learning_rate": 9.936289221228207e-06, "loss": 0.0415, "step": 215 }, { "epoch": 0.27, "grad_norm": 5.073670387268066, "learning_rate": 9.935667476221678e-06, "loss": 0.1248, "step": 216 }, { "epoch": 0.27, "grad_norm": 2.5642805099487305, "learning_rate": 9.935042731807297e-06, "loss": 0.0708, "step": 217 }, { "epoch": 0.27, "grad_norm": 3.680995464324951, "learning_rate": 9.934414988364722e-06, "loss": 0.0587, "step": 218 }, { "epoch": 0.27, "grad_norm": 2.164574146270752, "learning_rate": 9.933784246275432e-06, "loss": 0.0532, "step": 219 }, { "epoch": 0.27, "grad_norm": 1.1444894075393677, "learning_rate": 9.93315050592273e-06, "loss": 0.0486, "step": 220 }, { "epoch": 0.27, "grad_norm": 0.9272328615188599, "learning_rate": 9.932513767691743e-06, "loss": 0.0465, "step": 221 }, { "epoch": 0.27, "grad_norm": 3.0213119983673096, "learning_rate": 9.931874031969411e-06, "loss": 0.0679, "step": 222 }, { "epoch": 0.28, "grad_norm": 2.7126073837280273, "learning_rate": 9.931231299144509e-06, "loss": 0.0849, "step": 223 }, { "epoch": 0.28, "grad_norm": 1.2266963720321655, "learning_rate": 9.93058556960762e-06, "loss": 0.0722, "step": 224 }, { "epoch": 0.28, "grad_norm": 2.530362844467163, "learning_rate": 9.929936843751158e-06, "loss": 0.0477, "step": 225 }, { "epoch": 0.28, "grad_norm": 2.087737798690796, "learning_rate": 9.929285121969352e-06, "loss": 0.0698, "step": 226 }, { "epoch": 0.28, "grad_norm": 1.2407419681549072, "learning_rate": 9.928630404658255e-06, "loss": 0.0501, "step": 227 }, { "epoch": 0.28, "grad_norm": 1.7187033891677856, "learning_rate": 9.927972692215739e-06, "loss": 0.0537, "step": 228 }, { "epoch": 0.28, "grad_norm": 2.143998861312866, "learning_rate": 9.927311985041495e-06, "loss": 0.0554, "step": 229 }, { "epoch": 0.28, "grad_norm": 2.8843326568603516, "learning_rate": 9.926648283537037e-06, "loss": 0.0544, "step": 230 }, { "epoch": 0.29, "grad_norm": 1.6308791637420654, "learning_rate": 9.925981588105695e-06, "loss": 0.0505, "step": 231 }, { "epoch": 0.29, "grad_norm": 1.8796863555908203, "learning_rate": 9.92531189915262e-06, "loss": 0.0537, "step": 232 }, { "epoch": 0.29, "grad_norm": 1.4090087413787842, "learning_rate": 9.924639217084783e-06, "loss": 0.0589, "step": 233 }, { "epoch": 0.29, "grad_norm": 0.9706072807312012, "learning_rate": 9.923963542310975e-06, "loss": 0.049, "step": 234 }, { "epoch": 0.29, "grad_norm": 0.9905783534049988, "learning_rate": 9.923284875241802e-06, "loss": 0.0537, "step": 235 }, { "epoch": 0.29, "grad_norm": 0.5304461717605591, "learning_rate": 9.92260321628969e-06, "loss": 0.0291, "step": 236 }, { "epoch": 0.29, "grad_norm": 1.2716902494430542, "learning_rate": 9.921918565868887e-06, "loss": 0.0652, "step": 237 }, { "epoch": 0.29, "grad_norm": 0.9943916201591492, "learning_rate": 9.921230924395449e-06, "loss": 0.0543, "step": 238 }, { "epoch": 0.3, "grad_norm": 1.3783643245697021, "learning_rate": 9.920540292287262e-06, "loss": 0.0536, "step": 239 }, { "epoch": 0.3, "grad_norm": 1.389773964881897, "learning_rate": 9.91984666996402e-06, "loss": 0.0376, "step": 240 }, { "epoch": 0.3, "grad_norm": 0.7887927293777466, "learning_rate": 9.91915005784724e-06, "loss": 0.0272, "step": 241 }, { "epoch": 0.3, "grad_norm": 1.902744174003601, "learning_rate": 9.918450456360252e-06, "loss": 0.0543, "step": 242 }, { "epoch": 0.3, "grad_norm": 0.6114033460617065, "learning_rate": 9.917747865928206e-06, "loss": 0.0262, "step": 243 }, { "epoch": 0.3, "grad_norm": 1.1496695280075073, "learning_rate": 9.917042286978064e-06, "loss": 0.0643, "step": 244 }, { "epoch": 0.3, "grad_norm": 0.8322230577468872, "learning_rate": 9.916333719938608e-06, "loss": 0.0435, "step": 245 }, { "epoch": 0.3, "grad_norm": 0.9281955361366272, "learning_rate": 9.915622165240435e-06, "loss": 0.0399, "step": 246 }, { "epoch": 0.31, "grad_norm": 0.7492028474807739, "learning_rate": 9.914907623315958e-06, "loss": 0.0367, "step": 247 }, { "epoch": 0.31, "grad_norm": 2.0944385528564453, "learning_rate": 9.914190094599403e-06, "loss": 0.0488, "step": 248 }, { "epoch": 0.31, "grad_norm": 1.0233027935028076, "learning_rate": 9.913469579526811e-06, "loss": 0.0475, "step": 249 }, { "epoch": 0.31, "grad_norm": 0.9051103591918945, "learning_rate": 9.912746078536044e-06, "loss": 0.0374, "step": 250 }, { "epoch": 0.31, "grad_norm": 0.6250872015953064, "learning_rate": 9.91201959206677e-06, "loss": 0.0236, "step": 251 }, { "epoch": 0.31, "grad_norm": 1.0147565603256226, "learning_rate": 9.911290120560477e-06, "loss": 0.0408, "step": 252 }, { "epoch": 0.31, "grad_norm": 1.8525872230529785, "learning_rate": 9.910557664460464e-06, "loss": 0.0485, "step": 253 }, { "epoch": 0.31, "grad_norm": 2.040386915206909, "learning_rate": 9.909822224211845e-06, "loss": 0.0716, "step": 254 }, { "epoch": 0.32, "grad_norm": 1.2481484413146973, "learning_rate": 9.90908380026155e-06, "loss": 0.0376, "step": 255 }, { "epoch": 0.32, "grad_norm": 2.1175787448883057, "learning_rate": 9.908342393058317e-06, "loss": 0.0657, "step": 256 }, { "epoch": 0.32, "grad_norm": 0.9903053641319275, "learning_rate": 9.907598003052701e-06, "loss": 0.0378, "step": 257 }, { "epoch": 0.32, "grad_norm": 1.7109051942825317, "learning_rate": 9.906850630697068e-06, "loss": 0.0624, "step": 258 }, { "epoch": 0.32, "grad_norm": 1.9067022800445557, "learning_rate": 9.906100276445596e-06, "loss": 0.0492, "step": 259 }, { "epoch": 0.32, "grad_norm": 0.9397685527801514, "learning_rate": 9.905346940754274e-06, "loss": 0.0147, "step": 260 }, { "epoch": 0.32, "grad_norm": 3.0456113815307617, "learning_rate": 9.90459062408091e-06, "loss": 0.0812, "step": 261 }, { "epoch": 0.32, "grad_norm": 2.6053810119628906, "learning_rate": 9.903831326885112e-06, "loss": 0.0623, "step": 262 }, { "epoch": 0.33, "grad_norm": 2.0448148250579834, "learning_rate": 9.90306904962831e-06, "loss": 0.0803, "step": 263 }, { "epoch": 0.33, "grad_norm": 1.1430933475494385, "learning_rate": 9.902303792773736e-06, "loss": 0.0305, "step": 264 }, { "epoch": 0.33, "grad_norm": 0.8864290714263916, "learning_rate": 9.90153555678644e-06, "loss": 0.0488, "step": 265 }, { "epoch": 0.33, "grad_norm": 1.6222556829452515, "learning_rate": 9.900764342133277e-06, "loss": 0.021, "step": 266 }, { "epoch": 0.33, "grad_norm": 1.0808035135269165, "learning_rate": 9.899990149282917e-06, "loss": 0.0326, "step": 267 }, { "epoch": 0.33, "grad_norm": 2.029120683670044, "learning_rate": 9.899212978705836e-06, "loss": 0.0384, "step": 268 }, { "epoch": 0.33, "grad_norm": 1.2418546676635742, "learning_rate": 9.898432830874324e-06, "loss": 0.0365, "step": 269 }, { "epoch": 0.33, "grad_norm": 1.3441228866577148, "learning_rate": 9.897649706262474e-06, "loss": 0.0692, "step": 270 }, { "epoch": 0.34, "grad_norm": 1.4092243909835815, "learning_rate": 9.896863605346191e-06, "loss": 0.0472, "step": 271 }, { "epoch": 0.34, "grad_norm": 1.3884505033493042, "learning_rate": 9.89607452860319e-06, "loss": 0.088, "step": 272 }, { "epoch": 0.34, "grad_norm": 2.6695573329925537, "learning_rate": 9.895282476512995e-06, "loss": 0.043, "step": 273 }, { "epoch": 0.34, "grad_norm": 1.7949867248535156, "learning_rate": 9.894487449556934e-06, "loss": 0.0514, "step": 274 }, { "epoch": 0.34, "grad_norm": 1.3810291290283203, "learning_rate": 9.893689448218146e-06, "loss": 0.0472, "step": 275 }, { "epoch": 0.34, "grad_norm": 1.0681228637695312, "learning_rate": 9.892888472981577e-06, "loss": 0.0389, "step": 276 }, { "epoch": 0.34, "grad_norm": 0.6548139452934265, "learning_rate": 9.89208452433398e-06, "loss": 0.0339, "step": 277 }, { "epoch": 0.34, "grad_norm": 0.8944026231765747, "learning_rate": 9.891277602763916e-06, "loss": 0.037, "step": 278 }, { "epoch": 0.35, "grad_norm": 1.7463440895080566, "learning_rate": 9.89046770876175e-06, "loss": 0.048, "step": 279 }, { "epoch": 0.35, "grad_norm": 3.2079529762268066, "learning_rate": 9.889654842819658e-06, "loss": 0.0721, "step": 280 }, { "epoch": 0.35, "grad_norm": 2.0868616104125977, "learning_rate": 9.888839005431615e-06, "loss": 0.0573, "step": 281 }, { "epoch": 0.35, "grad_norm": 1.23513662815094, "learning_rate": 9.888020197093409e-06, "loss": 0.0542, "step": 282 }, { "epoch": 0.35, "grad_norm": 0.7781217694282532, "learning_rate": 9.887198418302629e-06, "loss": 0.0386, "step": 283 }, { "epoch": 0.35, "grad_norm": 1.390410304069519, "learning_rate": 9.886373669558669e-06, "loss": 0.0338, "step": 284 }, { "epoch": 0.35, "grad_norm": 1.6135231256484985, "learning_rate": 9.885545951362733e-06, "loss": 0.0403, "step": 285 }, { "epoch": 0.35, "grad_norm": 1.1802467107772827, "learning_rate": 9.884715264217823e-06, "loss": 0.0716, "step": 286 }, { "epoch": 0.36, "grad_norm": 1.1783833503723145, "learning_rate": 9.883881608628748e-06, "loss": 0.0426, "step": 287 }, { "epoch": 0.36, "grad_norm": 0.994340181350708, "learning_rate": 9.883044985102122e-06, "loss": 0.047, "step": 288 }, { "epoch": 0.36, "grad_norm": 0.9849565625190735, "learning_rate": 9.882205394146362e-06, "loss": 0.0416, "step": 289 }, { "epoch": 0.36, "grad_norm": 1.2525103092193604, "learning_rate": 9.881362836271686e-06, "loss": 0.0672, "step": 290 }, { "epoch": 0.36, "grad_norm": 0.8505926728248596, "learning_rate": 9.880517311990118e-06, "loss": 0.0455, "step": 291 }, { "epoch": 0.36, "grad_norm": 1.3629908561706543, "learning_rate": 9.879668821815484e-06, "loss": 0.0357, "step": 292 }, { "epoch": 0.36, "grad_norm": 1.1365973949432373, "learning_rate": 9.878817366263412e-06, "loss": 0.0666, "step": 293 }, { "epoch": 0.36, "grad_norm": 1.0324252843856812, "learning_rate": 9.87796294585133e-06, "loss": 0.0449, "step": 294 }, { "epoch": 0.37, "grad_norm": 0.757729172706604, "learning_rate": 9.877105561098473e-06, "loss": 0.0248, "step": 295 }, { "epoch": 0.37, "grad_norm": 1.2894716262817383, "learning_rate": 9.87624521252587e-06, "loss": 0.0382, "step": 296 }, { "epoch": 0.37, "grad_norm": 1.5887492895126343, "learning_rate": 9.87538190065636e-06, "loss": 0.0459, "step": 297 }, { "epoch": 0.37, "grad_norm": 1.5617096424102783, "learning_rate": 9.874515626014576e-06, "loss": 0.0673, "step": 298 }, { "epoch": 0.37, "grad_norm": 2.4001352787017822, "learning_rate": 9.873646389126954e-06, "loss": 0.0937, "step": 299 }, { "epoch": 0.37, "grad_norm": 1.1498814821243286, "learning_rate": 9.872774190521727e-06, "loss": 0.0609, "step": 300 }, { "epoch": 0.37, "grad_norm": 3.620199680328369, "learning_rate": 9.871899030728932e-06, "loss": 0.078, "step": 301 }, { "epoch": 0.37, "grad_norm": 1.5257648229599, "learning_rate": 9.871020910280408e-06, "loss": 0.0456, "step": 302 }, { "epoch": 0.38, "grad_norm": 2.344609498977661, "learning_rate": 9.870139829709784e-06, "loss": 0.0579, "step": 303 }, { "epoch": 0.38, "grad_norm": 0.6787387132644653, "learning_rate": 9.869255789552496e-06, "loss": 0.036, "step": 304 }, { "epoch": 0.38, "grad_norm": 0.7965288162231445, "learning_rate": 9.868368790345777e-06, "loss": 0.0347, "step": 305 }, { "epoch": 0.38, "grad_norm": 1.3934015035629272, "learning_rate": 9.867478832628652e-06, "loss": 0.0504, "step": 306 }, { "epoch": 0.38, "grad_norm": 0.6102665662765503, "learning_rate": 9.866585916941951e-06, "loss": 0.0303, "step": 307 }, { "epoch": 0.38, "grad_norm": 0.6944254636764526, "learning_rate": 9.865690043828302e-06, "loss": 0.0389, "step": 308 }, { "epoch": 0.38, "grad_norm": 0.5572813153266907, "learning_rate": 9.864791213832125e-06, "loss": 0.0249, "step": 309 }, { "epoch": 0.38, "grad_norm": 0.9218201041221619, "learning_rate": 9.863889427499641e-06, "loss": 0.0579, "step": 310 }, { "epoch": 0.38, "grad_norm": 2.7617053985595703, "learning_rate": 9.862984685378864e-06, "loss": 0.0942, "step": 311 }, { "epoch": 0.39, "grad_norm": 2.5800890922546387, "learning_rate": 9.862076988019609e-06, "loss": 0.0705, "step": 312 }, { "epoch": 0.39, "grad_norm": 0.5009744763374329, "learning_rate": 9.86116633597348e-06, "loss": 0.0187, "step": 313 }, { "epoch": 0.39, "grad_norm": 0.8876914381980896, "learning_rate": 9.860252729793885e-06, "loss": 0.0574, "step": 314 }, { "epoch": 0.39, "grad_norm": 2.8853681087493896, "learning_rate": 9.859336170036022e-06, "loss": 0.0509, "step": 315 }, { "epoch": 0.39, "grad_norm": 3.341853141784668, "learning_rate": 9.858416657256883e-06, "loss": 0.0697, "step": 316 }, { "epoch": 0.39, "grad_norm": 1.9934710264205933, "learning_rate": 9.857494192015258e-06, "loss": 0.0531, "step": 317 }, { "epoch": 0.39, "grad_norm": 1.259093165397644, "learning_rate": 9.85656877487173e-06, "loss": 0.0349, "step": 318 }, { "epoch": 0.39, "grad_norm": 0.9945093393325806, "learning_rate": 9.855640406388673e-06, "loss": 0.0393, "step": 319 }, { "epoch": 0.4, "grad_norm": 1.5558804273605347, "learning_rate": 9.854709087130261e-06, "loss": 0.0584, "step": 320 }, { "epoch": 0.4, "grad_norm": 2.9720606803894043, "learning_rate": 9.853774817662453e-06, "loss": 0.0767, "step": 321 }, { "epoch": 0.4, "grad_norm": 0.8328733444213867, "learning_rate": 9.85283759855301e-06, "loss": 0.0312, "step": 322 }, { "epoch": 0.4, "grad_norm": 2.4241795539855957, "learning_rate": 9.851897430371475e-06, "loss": 0.0613, "step": 323 }, { "epoch": 0.4, "grad_norm": 1.2547311782836914, "learning_rate": 9.850954313689193e-06, "loss": 0.0378, "step": 324 }, { "epoch": 0.4, "grad_norm": 0.9641187191009521, "learning_rate": 9.850008249079295e-06, "loss": 0.0301, "step": 325 }, { "epoch": 0.4, "grad_norm": 3.5166923999786377, "learning_rate": 9.849059237116702e-06, "loss": 0.0651, "step": 326 }, { "epoch": 0.4, "grad_norm": 1.5394651889801025, "learning_rate": 9.848107278378136e-06, "loss": 0.0483, "step": 327 }, { "epoch": 0.41, "grad_norm": 1.9585269689559937, "learning_rate": 9.847152373442096e-06, "loss": 0.0548, "step": 328 }, { "epoch": 0.41, "grad_norm": 1.0429555177688599, "learning_rate": 9.846194522888884e-06, "loss": 0.0481, "step": 329 }, { "epoch": 0.41, "grad_norm": 1.1581437587738037, "learning_rate": 9.84523372730058e-06, "loss": 0.0603, "step": 330 }, { "epoch": 0.41, "grad_norm": 0.7063565850257874, "learning_rate": 9.844269987261066e-06, "loss": 0.0326, "step": 331 }, { "epoch": 0.41, "grad_norm": 1.5360925197601318, "learning_rate": 9.843303303356005e-06, "loss": 0.0456, "step": 332 }, { "epoch": 0.41, "grad_norm": 1.3182265758514404, "learning_rate": 9.84233367617285e-06, "loss": 0.0336, "step": 333 }, { "epoch": 0.41, "grad_norm": 0.8530195951461792, "learning_rate": 9.841361106300846e-06, "loss": 0.0375, "step": 334 }, { "epoch": 0.41, "grad_norm": 0.9681763052940369, "learning_rate": 9.840385594331022e-06, "loss": 0.0265, "step": 335 }, { "epoch": 0.42, "grad_norm": 1.2474390268325806, "learning_rate": 9.839407140856199e-06, "loss": 0.0438, "step": 336 }, { "epoch": 0.42, "grad_norm": 1.427484393119812, "learning_rate": 9.838425746470984e-06, "loss": 0.0506, "step": 337 }, { "epoch": 0.42, "grad_norm": 0.8225058317184448, "learning_rate": 9.837441411771771e-06, "loss": 0.0355, "step": 338 }, { "epoch": 0.42, "grad_norm": 0.9241979122161865, "learning_rate": 9.836454137356739e-06, "loss": 0.0386, "step": 339 }, { "epoch": 0.42, "grad_norm": 0.8418800234794617, "learning_rate": 9.835463923825854e-06, "loss": 0.0392, "step": 340 }, { "epoch": 0.42, "grad_norm": 0.9536418914794922, "learning_rate": 9.834470771780875e-06, "loss": 0.0577, "step": 341 }, { "epoch": 0.42, "grad_norm": 0.7787923216819763, "learning_rate": 9.833474681825334e-06, "loss": 0.0325, "step": 342 }, { "epoch": 0.42, "grad_norm": 2.5342555046081543, "learning_rate": 9.832475654564562e-06, "loss": 0.0413, "step": 343 }, { "epoch": 0.43, "grad_norm": 1.160288691520691, "learning_rate": 9.831473690605664e-06, "loss": 0.0609, "step": 344 }, { "epoch": 0.43, "grad_norm": 2.0293076038360596, "learning_rate": 9.830468790557536e-06, "loss": 0.0376, "step": 345 }, { "epoch": 0.43, "grad_norm": 1.1950795650482178, "learning_rate": 9.829460955030854e-06, "loss": 0.0285, "step": 346 }, { "epoch": 0.43, "grad_norm": 1.130022644996643, "learning_rate": 9.828450184638082e-06, "loss": 0.0725, "step": 347 }, { "epoch": 0.43, "grad_norm": 1.2049533128738403, "learning_rate": 9.827436479993468e-06, "loss": 0.0345, "step": 348 }, { "epoch": 0.43, "grad_norm": 1.9585927724838257, "learning_rate": 9.826419841713038e-06, "loss": 0.0539, "step": 349 }, { "epoch": 0.43, "grad_norm": 0.7200453281402588, "learning_rate": 9.825400270414602e-06, "loss": 0.0358, "step": 350 }, { "epoch": 0.43, "grad_norm": 0.9681141972541809, "learning_rate": 9.824377766717758e-06, "loss": 0.0288, "step": 351 }, { "epoch": 0.44, "grad_norm": 0.843163788318634, "learning_rate": 9.823352331243881e-06, "loss": 0.0396, "step": 352 }, { "epoch": 0.44, "grad_norm": 0.8464294075965881, "learning_rate": 9.822323964616125e-06, "loss": 0.0394, "step": 353 }, { "epoch": 0.44, "grad_norm": 0.6887583136558533, "learning_rate": 9.821292667459435e-06, "loss": 0.0295, "step": 354 }, { "epoch": 0.44, "grad_norm": 1.815610408782959, "learning_rate": 9.820258440400525e-06, "loss": 0.0372, "step": 355 }, { "epoch": 0.44, "grad_norm": 1.1596908569335938, "learning_rate": 9.8192212840679e-06, "loss": 0.0247, "step": 356 }, { "epoch": 0.44, "grad_norm": 1.0240830183029175, "learning_rate": 9.818181199091838e-06, "loss": 0.0497, "step": 357 }, { "epoch": 0.44, "grad_norm": 0.9827424883842468, "learning_rate": 9.817138186104401e-06, "loss": 0.0585, "step": 358 }, { "epoch": 0.44, "grad_norm": 0.8876912593841553, "learning_rate": 9.816092245739426e-06, "loss": 0.039, "step": 359 }, { "epoch": 0.45, "grad_norm": 1.8267855644226074, "learning_rate": 9.81504337863253e-06, "loss": 0.0393, "step": 360 }, { "epoch": 0.45, "grad_norm": 0.7727996706962585, "learning_rate": 9.813991585421118e-06, "loss": 0.0442, "step": 361 }, { "epoch": 0.45, "grad_norm": 2.0796356201171875, "learning_rate": 9.812936866744358e-06, "loss": 0.0525, "step": 362 }, { "epoch": 0.45, "grad_norm": 0.8108832836151123, "learning_rate": 9.811879223243207e-06, "loss": 0.0367, "step": 363 }, { "epoch": 0.45, "grad_norm": 0.9708784818649292, "learning_rate": 9.810818655560393e-06, "loss": 0.0436, "step": 364 }, { "epoch": 0.45, "grad_norm": 1.442888855934143, "learning_rate": 9.809755164340423e-06, "loss": 0.0432, "step": 365 }, { "epoch": 0.45, "grad_norm": 0.8913246989250183, "learning_rate": 9.808688750229584e-06, "loss": 0.046, "step": 366 }, { "epoch": 0.45, "grad_norm": 2.196491003036499, "learning_rate": 9.807619413875937e-06, "loss": 0.0466, "step": 367 }, { "epoch": 0.46, "grad_norm": 0.9138450622558594, "learning_rate": 9.806547155929315e-06, "loss": 0.0355, "step": 368 }, { "epoch": 0.46, "grad_norm": 0.3624818027019501, "learning_rate": 9.80547197704133e-06, "loss": 0.0186, "step": 369 }, { "epoch": 0.46, "grad_norm": 1.0726361274719238, "learning_rate": 9.804393877865373e-06, "loss": 0.0497, "step": 370 }, { "epoch": 0.46, "grad_norm": 0.8961818218231201, "learning_rate": 9.8033128590566e-06, "loss": 0.0356, "step": 371 }, { "epoch": 0.46, "grad_norm": 2.240262746810913, "learning_rate": 9.80222892127195e-06, "loss": 0.0794, "step": 372 }, { "epoch": 0.46, "grad_norm": 2.4816982746124268, "learning_rate": 9.801142065170132e-06, "loss": 0.0631, "step": 373 }, { "epoch": 0.46, "grad_norm": 1.1969040632247925, "learning_rate": 9.80005229141163e-06, "loss": 0.0559, "step": 374 }, { "epoch": 0.46, "grad_norm": 1.4784609079360962, "learning_rate": 9.798959600658697e-06, "loss": 0.0746, "step": 375 }, { "epoch": 0.47, "grad_norm": 0.7828866839408875, "learning_rate": 9.797863993575365e-06, "loss": 0.0396, "step": 376 }, { "epoch": 0.47, "grad_norm": 0.7891765832901001, "learning_rate": 9.796765470827435e-06, "loss": 0.0567, "step": 377 }, { "epoch": 0.47, "grad_norm": 0.7710642218589783, "learning_rate": 9.795664033082476e-06, "loss": 0.0442, "step": 378 }, { "epoch": 0.47, "grad_norm": 0.8450149297714233, "learning_rate": 9.794559681009837e-06, "loss": 0.036, "step": 379 }, { "epoch": 0.47, "grad_norm": 0.545617401599884, "learning_rate": 9.79345241528063e-06, "loss": 0.0302, "step": 380 }, { "epoch": 0.47, "grad_norm": 1.7093480825424194, "learning_rate": 9.792342236567743e-06, "loss": 0.0494, "step": 381 }, { "epoch": 0.47, "grad_norm": 0.8590899109840393, "learning_rate": 9.791229145545832e-06, "loss": 0.0389, "step": 382 }, { "epoch": 0.47, "grad_norm": 1.1689053773880005, "learning_rate": 9.790113142891323e-06, "loss": 0.0505, "step": 383 }, { "epoch": 0.48, "grad_norm": 0.6099830269813538, "learning_rate": 9.78899422928241e-06, "loss": 0.036, "step": 384 }, { "epoch": 0.48, "grad_norm": 1.2200748920440674, "learning_rate": 9.787872405399059e-06, "loss": 0.0557, "step": 385 }, { "epoch": 0.48, "grad_norm": 1.0489903688430786, "learning_rate": 9.786747671923003e-06, "loss": 0.0719, "step": 386 }, { "epoch": 0.48, "grad_norm": 1.578433871269226, "learning_rate": 9.785620029537741e-06, "loss": 0.03, "step": 387 }, { "epoch": 0.48, "grad_norm": 0.9253179430961609, "learning_rate": 9.784489478928545e-06, "loss": 0.0527, "step": 388 }, { "epoch": 0.48, "grad_norm": 0.7473218441009521, "learning_rate": 9.783356020782448e-06, "loss": 0.035, "step": 389 }, { "epoch": 0.48, "grad_norm": 1.4502854347229004, "learning_rate": 9.782219655788257e-06, "loss": 0.0423, "step": 390 }, { "epoch": 0.48, "grad_norm": 0.946733295917511, "learning_rate": 9.781080384636539e-06, "loss": 0.0413, "step": 391 }, { "epoch": 0.49, "grad_norm": 1.4826123714447021, "learning_rate": 9.77993820801963e-06, "loss": 0.0414, "step": 392 }, { "epoch": 0.49, "grad_norm": 2.0471692085266113, "learning_rate": 9.778793126631632e-06, "loss": 0.0466, "step": 393 }, { "epoch": 0.49, "grad_norm": 1.7681257724761963, "learning_rate": 9.777645141168411e-06, "loss": 0.0504, "step": 394 }, { "epoch": 0.49, "grad_norm": 0.7187155485153198, "learning_rate": 9.776494252327597e-06, "loss": 0.0447, "step": 395 }, { "epoch": 0.49, "grad_norm": 0.7922236323356628, "learning_rate": 9.775340460808589e-06, "loss": 0.0313, "step": 396 }, { "epoch": 0.49, "grad_norm": 2.724630117416382, "learning_rate": 9.774183767312545e-06, "loss": 0.0616, "step": 397 }, { "epoch": 0.49, "grad_norm": 0.47513461112976074, "learning_rate": 9.773024172542389e-06, "loss": 0.0163, "step": 398 }, { "epoch": 0.49, "grad_norm": 0.6144838333129883, "learning_rate": 9.771861677202804e-06, "loss": 0.0271, "step": 399 }, { "epoch": 0.5, "grad_norm": 1.0170230865478516, "learning_rate": 9.770696282000245e-06, "loss": 0.0438, "step": 400 }, { "epoch": 0.5, "grad_norm": 0.5385282635688782, "learning_rate": 9.76952798764292e-06, "loss": 0.0169, "step": 401 }, { "epoch": 0.5, "grad_norm": 1.6152381896972656, "learning_rate": 9.7683567948408e-06, "loss": 0.068, "step": 402 }, { "epoch": 0.5, "grad_norm": 0.9734664559364319, "learning_rate": 9.767182704305625e-06, "loss": 0.0681, "step": 403 }, { "epoch": 0.5, "grad_norm": 1.7027530670166016, "learning_rate": 9.766005716750884e-06, "loss": 0.04, "step": 404 }, { "epoch": 0.5, "grad_norm": 0.7407202124595642, "learning_rate": 9.764825832891837e-06, "loss": 0.033, "step": 405 }, { "epoch": 0.5, "grad_norm": 0.8196337223052979, "learning_rate": 9.7636430534455e-06, "loss": 0.0451, "step": 406 }, { "epoch": 0.5, "grad_norm": 2.600836753845215, "learning_rate": 9.762457379130649e-06, "loss": 0.075, "step": 407 }, { "epoch": 0.5, "grad_norm": 1.4206620454788208, "learning_rate": 9.761268810667817e-06, "loss": 0.0255, "step": 408 }, { "epoch": 0.51, "grad_norm": 0.9220699071884155, "learning_rate": 9.760077348779298e-06, "loss": 0.0564, "step": 409 }, { "epoch": 0.51, "grad_norm": 0.6927193999290466, "learning_rate": 9.758882994189145e-06, "loss": 0.0375, "step": 410 }, { "epoch": 0.51, "grad_norm": 0.9594948291778564, "learning_rate": 9.757685747623169e-06, "loss": 0.0523, "step": 411 }, { "epoch": 0.51, "grad_norm": 1.9151678085327148, "learning_rate": 9.756485609808934e-06, "loss": 0.0634, "step": 412 }, { "epoch": 0.51, "grad_norm": 1.0471961498260498, "learning_rate": 9.755282581475769e-06, "loss": 0.027, "step": 413 }, { "epoch": 0.51, "grad_norm": 1.2358285188674927, "learning_rate": 9.75407666335475e-06, "loss": 0.0705, "step": 414 }, { "epoch": 0.51, "grad_norm": 0.8452746272087097, "learning_rate": 9.752867856178719e-06, "loss": 0.0485, "step": 415 }, { "epoch": 0.51, "grad_norm": 1.2570796012878418, "learning_rate": 9.751656160682265e-06, "loss": 0.0375, "step": 416 }, { "epoch": 0.52, "grad_norm": 1.8666393756866455, "learning_rate": 9.750441577601738e-06, "loss": 0.0418, "step": 417 }, { "epoch": 0.52, "grad_norm": 0.7684221267700195, "learning_rate": 9.749224107675239e-06, "loss": 0.0477, "step": 418 }, { "epoch": 0.52, "grad_norm": 1.430303931236267, "learning_rate": 9.748003751642628e-06, "loss": 0.0389, "step": 419 }, { "epoch": 0.52, "grad_norm": 4.4301066398620605, "learning_rate": 9.746780510245512e-06, "loss": 0.0868, "step": 420 }, { "epoch": 0.52, "grad_norm": 2.655571699142456, "learning_rate": 9.74555438422726e-06, "loss": 0.0423, "step": 421 }, { "epoch": 0.52, "grad_norm": 1.7431411743164062, "learning_rate": 9.744325374332986e-06, "loss": 0.0235, "step": 422 }, { "epoch": 0.52, "grad_norm": 1.7228596210479736, "learning_rate": 9.743093481309563e-06, "loss": 0.0361, "step": 423 }, { "epoch": 0.52, "grad_norm": 0.5912590026855469, "learning_rate": 9.741858705905609e-06, "loss": 0.0254, "step": 424 }, { "epoch": 0.53, "grad_norm": 0.8103305101394653, "learning_rate": 9.740621048871501e-06, "loss": 0.0159, "step": 425 }, { "epoch": 0.53, "grad_norm": 2.466233253479004, "learning_rate": 9.739380510959365e-06, "loss": 0.0803, "step": 426 }, { "epoch": 0.53, "grad_norm": 0.5837281942367554, "learning_rate": 9.738137092923072e-06, "loss": 0.0293, "step": 427 }, { "epoch": 0.53, "grad_norm": 1.528012990951538, "learning_rate": 9.73689079551825e-06, "loss": 0.0549, "step": 428 }, { "epoch": 0.53, "grad_norm": 2.025675058364868, "learning_rate": 9.735641619502277e-06, "loss": 0.0663, "step": 429 }, { "epoch": 0.53, "grad_norm": 1.34830641746521, "learning_rate": 9.734389565634277e-06, "loss": 0.0483, "step": 430 }, { "epoch": 0.53, "grad_norm": 1.644051194190979, "learning_rate": 9.73313463467512e-06, "loss": 0.053, "step": 431 }, { "epoch": 0.53, "grad_norm": 1.6768667697906494, "learning_rate": 9.731876827387433e-06, "loss": 0.0626, "step": 432 }, { "epoch": 0.54, "grad_norm": 2.0125842094421387, "learning_rate": 9.730616144535581e-06, "loss": 0.0424, "step": 433 }, { "epoch": 0.54, "grad_norm": 4.256353378295898, "learning_rate": 9.729352586885687e-06, "loss": 0.0734, "step": 434 }, { "epoch": 0.54, "grad_norm": 3.4163427352905273, "learning_rate": 9.728086155205614e-06, "loss": 0.0544, "step": 435 }, { "epoch": 0.54, "grad_norm": 2.842038154602051, "learning_rate": 9.726816850264971e-06, "loss": 0.0465, "step": 436 }, { "epoch": 0.54, "grad_norm": 2.0849742889404297, "learning_rate": 9.725544672835118e-06, "loss": 0.0684, "step": 437 }, { "epoch": 0.54, "grad_norm": 0.677302360534668, "learning_rate": 9.724269623689158e-06, "loss": 0.0284, "step": 438 }, { "epoch": 0.54, "grad_norm": 1.040449619293213, "learning_rate": 9.722991703601936e-06, "loss": 0.0384, "step": 439 }, { "epoch": 0.54, "grad_norm": 0.6753067374229431, "learning_rate": 9.721710913350048e-06, "loss": 0.0436, "step": 440 }, { "epoch": 0.55, "grad_norm": 2.006178617477417, "learning_rate": 9.720427253711831e-06, "loss": 0.046, "step": 441 }, { "epoch": 0.55, "grad_norm": 1.1364405155181885, "learning_rate": 9.719140725467362e-06, "loss": 0.0512, "step": 442 }, { "epoch": 0.55, "grad_norm": 0.7395780086517334, "learning_rate": 9.717851329398469e-06, "loss": 0.0239, "step": 443 }, { "epoch": 0.55, "grad_norm": 1.4531809091567993, "learning_rate": 9.716559066288716e-06, "loss": 0.0505, "step": 444 }, { "epoch": 0.55, "grad_norm": 0.9090608954429626, "learning_rate": 9.715263936923413e-06, "loss": 0.0272, "step": 445 }, { "epoch": 0.55, "grad_norm": 0.9618948698043823, "learning_rate": 9.713965942089612e-06, "loss": 0.0491, "step": 446 }, { "epoch": 0.55, "grad_norm": 0.5173948407173157, "learning_rate": 9.712665082576104e-06, "loss": 0.0264, "step": 447 }, { "epoch": 0.55, "grad_norm": 0.5747056603431702, "learning_rate": 9.711361359173422e-06, "loss": 0.0231, "step": 448 }, { "epoch": 0.56, "grad_norm": 1.7778929471969604, "learning_rate": 9.710054772673839e-06, "loss": 0.0492, "step": 449 }, { "epoch": 0.56, "grad_norm": 2.290955066680908, "learning_rate": 9.708745323871369e-06, "loss": 0.0465, "step": 450 }, { "epoch": 0.56, "grad_norm": 1.1455390453338623, "learning_rate": 9.707433013561765e-06, "loss": 0.0625, "step": 451 }, { "epoch": 0.56, "grad_norm": 2.4170002937316895, "learning_rate": 9.706117842542517e-06, "loss": 0.0761, "step": 452 }, { "epoch": 0.56, "grad_norm": 1.6311193704605103, "learning_rate": 9.704799811612858e-06, "loss": 0.0736, "step": 453 }, { "epoch": 0.56, "grad_norm": 1.4031122922897339, "learning_rate": 9.703478921573753e-06, "loss": 0.0362, "step": 454 }, { "epoch": 0.56, "grad_norm": 1.10888671875, "learning_rate": 9.702155173227911e-06, "loss": 0.0468, "step": 455 }, { "epoch": 0.56, "grad_norm": 2.612172842025757, "learning_rate": 9.700828567379772e-06, "loss": 0.0709, "step": 456 }, { "epoch": 0.57, "grad_norm": 1.2346030473709106, "learning_rate": 9.699499104835514e-06, "loss": 0.0587, "step": 457 }, { "epoch": 0.57, "grad_norm": 1.7313090562820435, "learning_rate": 9.698166786403057e-06, "loss": 0.0372, "step": 458 }, { "epoch": 0.57, "grad_norm": 1.303956389427185, "learning_rate": 9.696831612892048e-06, "loss": 0.0415, "step": 459 }, { "epoch": 0.57, "grad_norm": 0.4627138674259186, "learning_rate": 9.695493585113873e-06, "loss": 0.0276, "step": 460 }, { "epoch": 0.57, "grad_norm": 0.7128018140792847, "learning_rate": 9.694152703881653e-06, "loss": 0.0265, "step": 461 }, { "epoch": 0.57, "grad_norm": 0.8362938165664673, "learning_rate": 9.69280897001024e-06, "loss": 0.0597, "step": 462 }, { "epoch": 0.57, "grad_norm": 0.9412689208984375, "learning_rate": 9.691462384316226e-06, "loss": 0.062, "step": 463 }, { "epoch": 0.57, "grad_norm": 1.3194217681884766, "learning_rate": 9.690112947617929e-06, "loss": 0.0526, "step": 464 }, { "epoch": 0.58, "grad_norm": 1.3153883218765259, "learning_rate": 9.688760660735403e-06, "loss": 0.0497, "step": 465 }, { "epoch": 0.58, "grad_norm": 1.290602684020996, "learning_rate": 9.687405524490433e-06, "loss": 0.0277, "step": 466 }, { "epoch": 0.58, "grad_norm": 0.6527288556098938, "learning_rate": 9.686047539706536e-06, "loss": 0.0353, "step": 467 }, { "epoch": 0.58, "grad_norm": 1.1408582925796509, "learning_rate": 9.684686707208962e-06, "loss": 0.0407, "step": 468 }, { "epoch": 0.58, "grad_norm": 0.5641573071479797, "learning_rate": 9.683323027824687e-06, "loss": 0.0311, "step": 469 }, { "epoch": 0.58, "grad_norm": 0.8712812066078186, "learning_rate": 9.681956502382423e-06, "loss": 0.0484, "step": 470 }, { "epoch": 0.58, "grad_norm": 1.6026149988174438, "learning_rate": 9.680587131712605e-06, "loss": 0.0697, "step": 471 }, { "epoch": 0.58, "grad_norm": 0.7954007983207703, "learning_rate": 9.6792149166474e-06, "loss": 0.0621, "step": 472 }, { "epoch": 0.59, "grad_norm": 1.8472158908843994, "learning_rate": 9.677839858020709e-06, "loss": 0.0437, "step": 473 }, { "epoch": 0.59, "grad_norm": 0.9168758988380432, "learning_rate": 9.676461956668148e-06, "loss": 0.0535, "step": 474 }, { "epoch": 0.59, "grad_norm": 1.1088653802871704, "learning_rate": 9.675081213427076e-06, "loss": 0.038, "step": 475 }, { "epoch": 0.59, "grad_norm": 0.6966286301612854, "learning_rate": 9.673697629136566e-06, "loss": 0.0304, "step": 476 }, { "epoch": 0.59, "grad_norm": 1.734716534614563, "learning_rate": 9.672311204637426e-06, "loss": 0.0705, "step": 477 }, { "epoch": 0.59, "grad_norm": 0.8543561697006226, "learning_rate": 9.670921940772186e-06, "loss": 0.0585, "step": 478 }, { "epoch": 0.59, "grad_norm": 0.6839298605918884, "learning_rate": 9.669529838385102e-06, "loss": 0.0381, "step": 479 }, { "epoch": 0.59, "grad_norm": 0.794438362121582, "learning_rate": 9.668134898322157e-06, "loss": 0.0485, "step": 480 }, { "epoch": 0.6, "grad_norm": 0.585090696811676, "learning_rate": 9.666737121431055e-06, "loss": 0.0295, "step": 481 }, { "epoch": 0.6, "grad_norm": 1.14494788646698, "learning_rate": 9.665336508561225e-06, "loss": 0.0248, "step": 482 }, { "epoch": 0.6, "grad_norm": 0.7456786632537842, "learning_rate": 9.663933060563824e-06, "loss": 0.0384, "step": 483 }, { "epoch": 0.6, "grad_norm": 1.0646755695343018, "learning_rate": 9.662526778291725e-06, "loss": 0.056, "step": 484 }, { "epoch": 0.6, "grad_norm": 0.6966055631637573, "learning_rate": 9.661117662599527e-06, "loss": 0.0279, "step": 485 }, { "epoch": 0.6, "grad_norm": 0.8128595948219299, "learning_rate": 9.659705714343551e-06, "loss": 0.0421, "step": 486 }, { "epoch": 0.6, "grad_norm": 1.1546441316604614, "learning_rate": 9.658290934381837e-06, "loss": 0.0527, "step": 487 }, { "epoch": 0.6, "grad_norm": 0.7882161736488342, "learning_rate": 9.656873323574152e-06, "loss": 0.041, "step": 488 }, { "epoch": 0.61, "grad_norm": 0.9414128065109253, "learning_rate": 9.655452882781972e-06, "loss": 0.0198, "step": 489 }, { "epoch": 0.61, "grad_norm": 1.0596210956573486, "learning_rate": 9.654029612868507e-06, "loss": 0.0606, "step": 490 }, { "epoch": 0.61, "grad_norm": 0.676780641078949, "learning_rate": 9.652603514698674e-06, "loss": 0.0232, "step": 491 }, { "epoch": 0.61, "grad_norm": 0.8404201865196228, "learning_rate": 9.651174589139115e-06, "loss": 0.0314, "step": 492 }, { "epoch": 0.61, "grad_norm": 0.47275248169898987, "learning_rate": 9.649742837058189e-06, "loss": 0.0169, "step": 493 }, { "epoch": 0.61, "grad_norm": 3.815514087677002, "learning_rate": 9.648308259325973e-06, "loss": 0.0986, "step": 494 }, { "epoch": 0.61, "grad_norm": 1.271995186805725, "learning_rate": 9.646870856814259e-06, "loss": 0.0271, "step": 495 }, { "epoch": 0.61, "grad_norm": 0.6948990821838379, "learning_rate": 9.64543063039656e-06, "loss": 0.0224, "step": 496 }, { "epoch": 0.62, "grad_norm": 1.3301115036010742, "learning_rate": 9.6439875809481e-06, "loss": 0.0375, "step": 497 }, { "epoch": 0.62, "grad_norm": 0.6250678896903992, "learning_rate": 9.64254170934582e-06, "loss": 0.0184, "step": 498 }, { "epoch": 0.62, "grad_norm": 0.9256348609924316, "learning_rate": 9.641093016468381e-06, "loss": 0.0375, "step": 499 }, { "epoch": 0.62, "grad_norm": 1.3027982711791992, "learning_rate": 9.639641503196152e-06, "loss": 0.0276, "step": 500 }, { "epoch": 0.62, "grad_norm": 2.560512065887451, "learning_rate": 9.638187170411218e-06, "loss": 0.0482, "step": 501 }, { "epoch": 0.62, "grad_norm": 1.6088508367538452, "learning_rate": 9.63673001899738e-06, "loss": 0.0436, "step": 502 }, { "epoch": 0.62, "grad_norm": 1.439906358718872, "learning_rate": 9.635270049840146e-06, "loss": 0.0772, "step": 503 }, { "epoch": 0.62, "grad_norm": 1.1696199178695679, "learning_rate": 9.633807263826745e-06, "loss": 0.0388, "step": 504 }, { "epoch": 0.62, "grad_norm": 1.6363476514816284, "learning_rate": 9.632341661846107e-06, "loss": 0.0592, "step": 505 }, { "epoch": 0.63, "grad_norm": 3.1684820652008057, "learning_rate": 9.630873244788884e-06, "loss": 0.0696, "step": 506 }, { "epoch": 0.63, "grad_norm": 2.787458658218384, "learning_rate": 9.629402013547432e-06, "loss": 0.0842, "step": 507 }, { "epoch": 0.63, "grad_norm": 0.8504316806793213, "learning_rate": 9.627927969015817e-06, "loss": 0.0413, "step": 508 }, { "epoch": 0.63, "grad_norm": 0.9233881235122681, "learning_rate": 9.62645111208982e-06, "loss": 0.0315, "step": 509 }, { "epoch": 0.63, "grad_norm": 1.571606159210205, "learning_rate": 9.62497144366693e-06, "loss": 0.0716, "step": 510 }, { "epoch": 0.63, "grad_norm": 2.602965831756592, "learning_rate": 9.623488964646334e-06, "loss": 0.0526, "step": 511 }, { "epoch": 0.63, "grad_norm": 1.687855839729309, "learning_rate": 9.622003675928943e-06, "loss": 0.0517, "step": 512 }, { "epoch": 0.63, "grad_norm": 1.535513162612915, "learning_rate": 9.620515578417364e-06, "loss": 0.0368, "step": 513 }, { "epoch": 0.64, "grad_norm": 0.5331669449806213, "learning_rate": 9.619024673015916e-06, "loss": 0.0273, "step": 514 }, { "epoch": 0.64, "grad_norm": 0.7347199320793152, "learning_rate": 9.617530960630624e-06, "loss": 0.022, "step": 515 }, { "epoch": 0.64, "grad_norm": 1.8210560083389282, "learning_rate": 9.616034442169214e-06, "loss": 0.0625, "step": 516 }, { "epoch": 0.64, "grad_norm": 1.0366301536560059, "learning_rate": 9.614535118541126e-06, "loss": 0.0409, "step": 517 }, { "epoch": 0.64, "grad_norm": 0.8622118234634399, "learning_rate": 9.613032990657495e-06, "loss": 0.0529, "step": 518 }, { "epoch": 0.64, "grad_norm": 1.1612430810928345, "learning_rate": 9.61152805943117e-06, "loss": 0.0298, "step": 519 }, { "epoch": 0.64, "grad_norm": 0.6844496726989746, "learning_rate": 9.610020325776694e-06, "loss": 0.0306, "step": 520 }, { "epoch": 0.64, "grad_norm": 0.7687200307846069, "learning_rate": 9.608509790610322e-06, "loss": 0.0416, "step": 521 }, { "epoch": 0.65, "grad_norm": 0.7224605083465576, "learning_rate": 9.606996454850002e-06, "loss": 0.036, "step": 522 }, { "epoch": 0.65, "grad_norm": 0.6508851051330566, "learning_rate": 9.605480319415391e-06, "loss": 0.0368, "step": 523 }, { "epoch": 0.65, "grad_norm": 1.3081005811691284, "learning_rate": 9.603961385227848e-06, "loss": 0.0284, "step": 524 }, { "epoch": 0.65, "grad_norm": 0.5530818700790405, "learning_rate": 9.602439653210426e-06, "loss": 0.0273, "step": 525 }, { "epoch": 0.65, "grad_norm": 0.5170778036117554, "learning_rate": 9.600915124287886e-06, "loss": 0.0181, "step": 526 }, { "epoch": 0.65, "grad_norm": 0.5652095079421997, "learning_rate": 9.599387799386684e-06, "loss": 0.0213, "step": 527 }, { "epoch": 0.65, "grad_norm": 1.0414352416992188, "learning_rate": 9.597857679434974e-06, "loss": 0.0389, "step": 528 }, { "epoch": 0.65, "grad_norm": 0.6755688786506653, "learning_rate": 9.596324765362614e-06, "loss": 0.0343, "step": 529 }, { "epoch": 0.66, "grad_norm": 1.5740824937820435, "learning_rate": 9.594789058101154e-06, "loss": 0.0562, "step": 530 }, { "epoch": 0.66, "grad_norm": 1.410057544708252, "learning_rate": 9.593250558583846e-06, "loss": 0.0394, "step": 531 }, { "epoch": 0.66, "grad_norm": 1.4377081394195557, "learning_rate": 9.591709267745635e-06, "loss": 0.0255, "step": 532 }, { "epoch": 0.66, "grad_norm": 0.9751909971237183, "learning_rate": 9.590165186523166e-06, "loss": 0.0395, "step": 533 }, { "epoch": 0.66, "grad_norm": 0.8450660109519958, "learning_rate": 9.588618315854779e-06, "loss": 0.0331, "step": 534 }, { "epoch": 0.66, "grad_norm": 1.8118575811386108, "learning_rate": 9.587068656680506e-06, "loss": 0.0346, "step": 535 }, { "epoch": 0.66, "grad_norm": 0.7216983437538147, "learning_rate": 9.585516209942077e-06, "loss": 0.0242, "step": 536 }, { "epoch": 0.66, "grad_norm": 1.0194247961044312, "learning_rate": 9.583960976582914e-06, "loss": 0.0478, "step": 537 }, { "epoch": 0.67, "grad_norm": 1.1861456632614136, "learning_rate": 9.582402957548132e-06, "loss": 0.0224, "step": 538 }, { "epoch": 0.67, "grad_norm": 0.8888005614280701, "learning_rate": 9.580842153784542e-06, "loss": 0.0393, "step": 539 }, { "epoch": 0.67, "grad_norm": 1.0420960187911987, "learning_rate": 9.579278566240646e-06, "loss": 0.035, "step": 540 }, { "epoch": 0.67, "grad_norm": 0.7932503819465637, "learning_rate": 9.577712195866634e-06, "loss": 0.0361, "step": 541 }, { "epoch": 0.67, "grad_norm": 2.295933246612549, "learning_rate": 9.576143043614393e-06, "loss": 0.0798, "step": 542 }, { "epoch": 0.67, "grad_norm": 0.795536458492279, "learning_rate": 9.574571110437496e-06, "loss": 0.034, "step": 543 }, { "epoch": 0.67, "grad_norm": 1.269714117050171, "learning_rate": 9.572996397291209e-06, "loss": 0.0308, "step": 544 }, { "epoch": 0.67, "grad_norm": 0.7194578051567078, "learning_rate": 9.571418905132486e-06, "loss": 0.0303, "step": 545 }, { "epoch": 0.68, "grad_norm": 0.9299863576889038, "learning_rate": 9.569838634919968e-06, "loss": 0.0549, "step": 546 }, { "epoch": 0.68, "grad_norm": 1.1913076639175415, "learning_rate": 9.568255587613986e-06, "loss": 0.0419, "step": 547 }, { "epoch": 0.68, "grad_norm": 0.6721378564834595, "learning_rate": 9.566669764176562e-06, "loss": 0.0227, "step": 548 }, { "epoch": 0.68, "grad_norm": 0.9450292587280273, "learning_rate": 9.5650811655714e-06, "loss": 0.0272, "step": 549 }, { "epoch": 0.68, "grad_norm": 1.6691453456878662, "learning_rate": 9.56348979276389e-06, "loss": 0.0506, "step": 550 }, { "epoch": 0.68, "grad_norm": 1.0706772804260254, "learning_rate": 9.561895646721113e-06, "loss": 0.0438, "step": 551 }, { "epoch": 0.68, "grad_norm": 1.0017832517623901, "learning_rate": 9.560298728411833e-06, "loss": 0.0604, "step": 552 }, { "epoch": 0.68, "grad_norm": 1.9847087860107422, "learning_rate": 9.558699038806494e-06, "loss": 0.0827, "step": 553 }, { "epoch": 0.69, "grad_norm": 1.05272376537323, "learning_rate": 9.557096578877232e-06, "loss": 0.0315, "step": 554 }, { "epoch": 0.69, "grad_norm": 1.6529170274734497, "learning_rate": 9.555491349597862e-06, "loss": 0.0438, "step": 555 }, { "epoch": 0.69, "grad_norm": 1.5359541177749634, "learning_rate": 9.553883351943882e-06, "loss": 0.0453, "step": 556 }, { "epoch": 0.69, "grad_norm": 0.7716813087463379, "learning_rate": 9.552272586892475e-06, "loss": 0.0395, "step": 557 }, { "epoch": 0.69, "grad_norm": 1.0042527914047241, "learning_rate": 9.550659055422502e-06, "loss": 0.0524, "step": 558 }, { "epoch": 0.69, "grad_norm": 0.9220654368400574, "learning_rate": 9.549042758514505e-06, "loss": 0.052, "step": 559 }, { "epoch": 0.69, "grad_norm": 1.202533483505249, "learning_rate": 9.547423697150714e-06, "loss": 0.0315, "step": 560 }, { "epoch": 0.69, "grad_norm": 1.441113829612732, "learning_rate": 9.545801872315028e-06, "loss": 0.0406, "step": 561 }, { "epoch": 0.7, "grad_norm": 1.1032451391220093, "learning_rate": 9.544177284993035e-06, "loss": 0.0562, "step": 562 }, { "epoch": 0.7, "grad_norm": 0.613166332244873, "learning_rate": 9.542549936171994e-06, "loss": 0.0264, "step": 563 }, { "epoch": 0.7, "grad_norm": 0.6434498429298401, "learning_rate": 9.540919826840848e-06, "loss": 0.0326, "step": 564 }, { "epoch": 0.7, "grad_norm": 0.4755064845085144, "learning_rate": 9.539286957990215e-06, "loss": 0.0271, "step": 565 }, { "epoch": 0.7, "grad_norm": 0.6659818887710571, "learning_rate": 9.53765133061239e-06, "loss": 0.0493, "step": 566 }, { "epoch": 0.7, "grad_norm": 0.9639627933502197, "learning_rate": 9.536012945701345e-06, "loss": 0.0384, "step": 567 }, { "epoch": 0.7, "grad_norm": 0.8150410056114197, "learning_rate": 9.534371804252727e-06, "loss": 0.0306, "step": 568 }, { "epoch": 0.7, "grad_norm": 1.4704219102859497, "learning_rate": 9.532727907263861e-06, "loss": 0.0563, "step": 569 }, { "epoch": 0.71, "grad_norm": 0.6380606889724731, "learning_rate": 9.53108125573374e-06, "loss": 0.0183, "step": 570 }, { "epoch": 0.71, "grad_norm": 0.7984311580657959, "learning_rate": 9.529431850663036e-06, "loss": 0.0469, "step": 571 }, { "epoch": 0.71, "grad_norm": 0.8775026798248291, "learning_rate": 9.527779693054095e-06, "loss": 0.0285, "step": 572 }, { "epoch": 0.71, "grad_norm": 0.5551888346672058, "learning_rate": 9.526124783910935e-06, "loss": 0.0322, "step": 573 }, { "epoch": 0.71, "grad_norm": 1.0795842409133911, "learning_rate": 9.524467124239243e-06, "loss": 0.0478, "step": 574 }, { "epoch": 0.71, "grad_norm": 1.2850500345230103, "learning_rate": 9.52280671504638e-06, "loss": 0.0223, "step": 575 }, { "epoch": 0.71, "grad_norm": 0.5365849733352661, "learning_rate": 9.521143557341378e-06, "loss": 0.0285, "step": 576 }, { "epoch": 0.71, "grad_norm": 0.7505818605422974, "learning_rate": 9.519477652134938e-06, "loss": 0.0301, "step": 577 }, { "epoch": 0.72, "grad_norm": 0.4962819516658783, "learning_rate": 9.517809000439432e-06, "loss": 0.0299, "step": 578 }, { "epoch": 0.72, "grad_norm": 1.9355813264846802, "learning_rate": 9.516137603268903e-06, "loss": 0.0715, "step": 579 }, { "epoch": 0.72, "grad_norm": 1.3954781293869019, "learning_rate": 9.514463461639055e-06, "loss": 0.0512, "step": 580 }, { "epoch": 0.72, "grad_norm": 1.0368856191635132, "learning_rate": 9.51278657656727e-06, "loss": 0.0445, "step": 581 }, { "epoch": 0.72, "grad_norm": 0.7911268472671509, "learning_rate": 9.511106949072588e-06, "loss": 0.0475, "step": 582 }, { "epoch": 0.72, "grad_norm": 1.1066776514053345, "learning_rate": 9.509424580175724e-06, "loss": 0.049, "step": 583 }, { "epoch": 0.72, "grad_norm": 1.1990307569503784, "learning_rate": 9.507739470899048e-06, "loss": 0.0574, "step": 584 }, { "epoch": 0.72, "grad_norm": 1.1048943996429443, "learning_rate": 9.506051622266608e-06, "loss": 0.08, "step": 585 }, { "epoch": 0.73, "grad_norm": 0.8120594024658203, "learning_rate": 9.504361035304106e-06, "loss": 0.0443, "step": 586 }, { "epoch": 0.73, "grad_norm": 0.6603597402572632, "learning_rate": 9.502667711038917e-06, "loss": 0.0366, "step": 587 }, { "epoch": 0.73, "grad_norm": 2.3819870948791504, "learning_rate": 9.500971650500072e-06, "loss": 0.0692, "step": 588 }, { "epoch": 0.73, "grad_norm": 1.7831990718841553, "learning_rate": 9.499272854718268e-06, "loss": 0.0506, "step": 589 }, { "epoch": 0.73, "grad_norm": 1.1036359071731567, "learning_rate": 9.497571324725865e-06, "loss": 0.0435, "step": 590 }, { "epoch": 0.73, "grad_norm": 1.2589616775512695, "learning_rate": 9.495867061556884e-06, "loss": 0.0412, "step": 591 }, { "epoch": 0.73, "grad_norm": 0.78188556432724, "learning_rate": 9.494160066247006e-06, "loss": 0.0534, "step": 592 }, { "epoch": 0.73, "grad_norm": 0.7451815605163574, "learning_rate": 9.492450339833573e-06, "loss": 0.0287, "step": 593 }, { "epoch": 0.74, "grad_norm": 1.3252469301223755, "learning_rate": 9.490737883355587e-06, "loss": 0.0334, "step": 594 }, { "epoch": 0.74, "grad_norm": 0.8932815194129944, "learning_rate": 9.48902269785371e-06, "loss": 0.036, "step": 595 }, { "epoch": 0.74, "grad_norm": 1.6676141023635864, "learning_rate": 9.487304784370257e-06, "loss": 0.0538, "step": 596 }, { "epoch": 0.74, "grad_norm": 0.9928424954414368, "learning_rate": 9.48558414394921e-06, "loss": 0.0558, "step": 597 }, { "epoch": 0.74, "grad_norm": 1.130738377571106, "learning_rate": 9.4838607776362e-06, "loss": 0.0454, "step": 598 }, { "epoch": 0.74, "grad_norm": 0.8108890056610107, "learning_rate": 9.48213468647852e-06, "loss": 0.0265, "step": 599 }, { "epoch": 0.74, "grad_norm": 1.0491758584976196, "learning_rate": 9.480405871525114e-06, "loss": 0.0518, "step": 600 }, { "epoch": 0.74, "grad_norm": 1.0204825401306152, "learning_rate": 9.478674333826586e-06, "loss": 0.0339, "step": 601 }, { "epoch": 0.75, "grad_norm": 1.026297926902771, "learning_rate": 9.476940074435189e-06, "loss": 0.0508, "step": 602 }, { "epoch": 0.75, "grad_norm": 1.4111378192901611, "learning_rate": 9.475203094404836e-06, "loss": 0.0553, "step": 603 }, { "epoch": 0.75, "grad_norm": 0.8152147531509399, "learning_rate": 9.473463394791093e-06, "loss": 0.0512, "step": 604 }, { "epoch": 0.75, "grad_norm": 0.5428625345230103, "learning_rate": 9.471720976651173e-06, "loss": 0.0274, "step": 605 }, { "epoch": 0.75, "grad_norm": 0.789997398853302, "learning_rate": 9.469975841043946e-06, "loss": 0.0456, "step": 606 }, { "epoch": 0.75, "grad_norm": 2.5263166427612305, "learning_rate": 9.468227989029929e-06, "loss": 0.0912, "step": 607 }, { "epoch": 0.75, "grad_norm": 0.9473277926445007, "learning_rate": 9.466477421671296e-06, "loss": 0.0445, "step": 608 }, { "epoch": 0.75, "grad_norm": 0.9322047829627991, "learning_rate": 9.464724140031866e-06, "loss": 0.0473, "step": 609 }, { "epoch": 0.75, "grad_norm": 1.0073190927505493, "learning_rate": 9.462968145177112e-06, "loss": 0.0506, "step": 610 }, { "epoch": 0.76, "grad_norm": 0.5902945399284363, "learning_rate": 9.461209438174148e-06, "loss": 0.0391, "step": 611 }, { "epoch": 0.76, "grad_norm": 2.0115785598754883, "learning_rate": 9.459448020091746e-06, "loss": 0.0614, "step": 612 }, { "epoch": 0.76, "grad_norm": 1.8103097677230835, "learning_rate": 9.457683892000318e-06, "loss": 0.0481, "step": 613 }, { "epoch": 0.76, "grad_norm": 0.718271017074585, "learning_rate": 9.455917054971929e-06, "loss": 0.0277, "step": 614 }, { "epoch": 0.76, "grad_norm": 0.948197066783905, "learning_rate": 9.45414751008028e-06, "loss": 0.0424, "step": 615 }, { "epoch": 0.76, "grad_norm": 1.613114356994629, "learning_rate": 9.452375258400732e-06, "loss": 0.0444, "step": 616 }, { "epoch": 0.76, "grad_norm": 0.5611456632614136, "learning_rate": 9.450600301010279e-06, "loss": 0.0278, "step": 617 }, { "epoch": 0.76, "grad_norm": 1.0461411476135254, "learning_rate": 9.448822638987564e-06, "loss": 0.062, "step": 618 }, { "epoch": 0.77, "grad_norm": 1.203861951828003, "learning_rate": 9.447042273412873e-06, "loss": 0.0335, "step": 619 }, { "epoch": 0.77, "grad_norm": 1.0347965955734253, "learning_rate": 9.445259205368138e-06, "loss": 0.0499, "step": 620 }, { "epoch": 0.77, "grad_norm": 1.2198740243911743, "learning_rate": 9.44347343593693e-06, "loss": 0.0441, "step": 621 }, { "epoch": 0.77, "grad_norm": 0.7504235506057739, "learning_rate": 9.441684966204456e-06, "loss": 0.0483, "step": 622 }, { "epoch": 0.77, "grad_norm": 0.7221031188964844, "learning_rate": 9.439893797257578e-06, "loss": 0.0369, "step": 623 }, { "epoch": 0.77, "grad_norm": 1.0137180089950562, "learning_rate": 9.438099930184783e-06, "loss": 0.0242, "step": 624 }, { "epoch": 0.77, "grad_norm": 0.7642596364021301, "learning_rate": 9.436303366076213e-06, "loss": 0.0476, "step": 625 }, { "epoch": 0.77, "grad_norm": 1.0482991933822632, "learning_rate": 9.434504106023634e-06, "loss": 0.0717, "step": 626 }, { "epoch": 0.78, "grad_norm": 0.7821680903434753, "learning_rate": 9.432702151120464e-06, "loss": 0.0395, "step": 627 }, { "epoch": 0.78, "grad_norm": 0.8012223839759827, "learning_rate": 9.430897502461745e-06, "loss": 0.0501, "step": 628 }, { "epoch": 0.78, "grad_norm": 0.960848867893219, "learning_rate": 9.429090161144166e-06, "loss": 0.0194, "step": 629 }, { "epoch": 0.78, "grad_norm": 0.9573109745979309, "learning_rate": 9.427280128266049e-06, "loss": 0.0485, "step": 630 }, { "epoch": 0.78, "grad_norm": 0.6235270500183105, "learning_rate": 9.425467404927356e-06, "loss": 0.0354, "step": 631 }, { "epoch": 0.78, "grad_norm": 1.024781346321106, "learning_rate": 9.423651992229673e-06, "loss": 0.0356, "step": 632 }, { "epoch": 0.78, "grad_norm": 0.7387573719024658, "learning_rate": 9.421833891276233e-06, "loss": 0.0576, "step": 633 }, { "epoch": 0.78, "grad_norm": 0.5336031913757324, "learning_rate": 9.420013103171893e-06, "loss": 0.0387, "step": 634 }, { "epoch": 0.79, "grad_norm": 1.2542508840560913, "learning_rate": 9.418189629023149e-06, "loss": 0.0415, "step": 635 }, { "epoch": 0.79, "grad_norm": 1.6477981805801392, "learning_rate": 9.416363469938128e-06, "loss": 0.0725, "step": 636 }, { "epoch": 0.79, "grad_norm": 0.7093968391418457, "learning_rate": 9.414534627026586e-06, "loss": 0.0361, "step": 637 }, { "epoch": 0.79, "grad_norm": 0.8406978845596313, "learning_rate": 9.412703101399912e-06, "loss": 0.0248, "step": 638 }, { "epoch": 0.79, "grad_norm": 0.7647954821586609, "learning_rate": 9.410868894171126e-06, "loss": 0.0734, "step": 639 }, { "epoch": 0.79, "grad_norm": 0.5869340300559998, "learning_rate": 9.409032006454877e-06, "loss": 0.0322, "step": 640 }, { "epoch": 0.79, "grad_norm": 0.6841743588447571, "learning_rate": 9.407192439367443e-06, "loss": 0.0217, "step": 641 }, { "epoch": 0.79, "grad_norm": 1.1286256313323975, "learning_rate": 9.405350194026728e-06, "loss": 0.0432, "step": 642 }, { "epoch": 0.8, "grad_norm": 1.9575207233428955, "learning_rate": 9.403505271552267e-06, "loss": 0.0623, "step": 643 }, { "epoch": 0.8, "grad_norm": 2.1534059047698975, "learning_rate": 9.401657673065218e-06, "loss": 0.0682, "step": 644 }, { "epoch": 0.8, "grad_norm": 0.6419281959533691, "learning_rate": 9.399807399688371e-06, "loss": 0.0271, "step": 645 }, { "epoch": 0.8, "grad_norm": 0.8669396638870239, "learning_rate": 9.397954452546139e-06, "loss": 0.0438, "step": 646 }, { "epoch": 0.8, "grad_norm": 1.168561339378357, "learning_rate": 9.396098832764555e-06, "loss": 0.0456, "step": 647 }, { "epoch": 0.8, "grad_norm": 1.2432861328125, "learning_rate": 9.394240541471282e-06, "loss": 0.0666, "step": 648 }, { "epoch": 0.8, "grad_norm": 1.9158250093460083, "learning_rate": 9.392379579795605e-06, "loss": 0.0452, "step": 649 }, { "epoch": 0.8, "grad_norm": 1.2606102228164673, "learning_rate": 9.39051594886843e-06, "loss": 0.0288, "step": 650 }, { "epoch": 0.81, "grad_norm": 1.0844234228134155, "learning_rate": 9.388649649822289e-06, "loss": 0.0374, "step": 651 }, { "epoch": 0.81, "grad_norm": 1.0901192426681519, "learning_rate": 9.386780683791331e-06, "loss": 0.0498, "step": 652 }, { "epoch": 0.81, "grad_norm": 1.03596830368042, "learning_rate": 9.384909051911329e-06, "loss": 0.0544, "step": 653 }, { "epoch": 0.81, "grad_norm": 0.7338258028030396, "learning_rate": 9.383034755319673e-06, "loss": 0.0389, "step": 654 }, { "epoch": 0.81, "grad_norm": 1.973031759262085, "learning_rate": 9.381157795155374e-06, "loss": 0.0534, "step": 655 }, { "epoch": 0.81, "grad_norm": 0.6111584305763245, "learning_rate": 9.379278172559065e-06, "loss": 0.0279, "step": 656 }, { "epoch": 0.81, "grad_norm": 0.7228569388389587, "learning_rate": 9.37739588867299e-06, "loss": 0.0397, "step": 657 }, { "epoch": 0.81, "grad_norm": 1.4140815734863281, "learning_rate": 9.375510944641017e-06, "loss": 0.0476, "step": 658 }, { "epoch": 0.82, "grad_norm": 1.1325860023498535, "learning_rate": 9.373623341608624e-06, "loss": 0.0697, "step": 659 }, { "epoch": 0.82, "grad_norm": 1.155360221862793, "learning_rate": 9.371733080722911e-06, "loss": 0.0493, "step": 660 }, { "epoch": 0.82, "grad_norm": 1.2202762365341187, "learning_rate": 9.36984016313259e-06, "loss": 0.0425, "step": 661 }, { "epoch": 0.82, "grad_norm": 0.9276245832443237, "learning_rate": 9.36794458998799e-06, "loss": 0.0324, "step": 662 }, { "epoch": 0.82, "grad_norm": 0.8629313707351685, "learning_rate": 9.366046362441047e-06, "loss": 0.0551, "step": 663 }, { "epoch": 0.82, "grad_norm": 0.3723730742931366, "learning_rate": 9.36414548164532e-06, "loss": 0.0157, "step": 664 }, { "epoch": 0.82, "grad_norm": 0.9178370833396912, "learning_rate": 9.36224194875597e-06, "loss": 0.0467, "step": 665 }, { "epoch": 0.82, "grad_norm": 0.7394289374351501, "learning_rate": 9.360335764929781e-06, "loss": 0.0303, "step": 666 }, { "epoch": 0.83, "grad_norm": 0.757675290107727, "learning_rate": 9.358426931325137e-06, "loss": 0.0302, "step": 667 }, { "epoch": 0.83, "grad_norm": 1.3911486864089966, "learning_rate": 9.356515449102041e-06, "loss": 0.0544, "step": 668 }, { "epoch": 0.83, "grad_norm": 0.451570063829422, "learning_rate": 9.354601319422099e-06, "loss": 0.0207, "step": 669 }, { "epoch": 0.83, "grad_norm": 0.43002304434776306, "learning_rate": 9.352684543448532e-06, "loss": 0.0186, "step": 670 }, { "epoch": 0.83, "grad_norm": 0.37833526730537415, "learning_rate": 9.350765122346162e-06, "loss": 0.0146, "step": 671 }, { "epoch": 0.83, "grad_norm": 0.9775627255439758, "learning_rate": 9.348843057281423e-06, "loss": 0.0451, "step": 672 }, { "epoch": 0.83, "grad_norm": 0.626708447933197, "learning_rate": 9.346918349422356e-06, "loss": 0.0301, "step": 673 }, { "epoch": 0.83, "grad_norm": 1.5922341346740723, "learning_rate": 9.344990999938609e-06, "loss": 0.0501, "step": 674 }, { "epoch": 0.84, "grad_norm": 1.1948060989379883, "learning_rate": 9.343061010001428e-06, "loss": 0.0394, "step": 675 }, { "epoch": 0.84, "grad_norm": 0.9602558016777039, "learning_rate": 9.341128380783674e-06, "loss": 0.0429, "step": 676 }, { "epoch": 0.84, "grad_norm": 1.0513089895248413, "learning_rate": 9.339193113459805e-06, "loss": 0.0391, "step": 677 }, { "epoch": 0.84, "grad_norm": 1.1344138383865356, "learning_rate": 9.337255209205884e-06, "loss": 0.0274, "step": 678 }, { "epoch": 0.84, "grad_norm": 1.1134185791015625, "learning_rate": 9.335314669199576e-06, "loss": 0.0604, "step": 679 }, { "epoch": 0.84, "grad_norm": 1.0586154460906982, "learning_rate": 9.33337149462015e-06, "loss": 0.0325, "step": 680 }, { "epoch": 0.84, "grad_norm": 1.0996270179748535, "learning_rate": 9.331425686648472e-06, "loss": 0.0332, "step": 681 }, { "epoch": 0.84, "grad_norm": 2.7945778369903564, "learning_rate": 9.32947724646701e-06, "loss": 0.0664, "step": 682 }, { "epoch": 0.85, "grad_norm": 1.8699554204940796, "learning_rate": 9.327526175259837e-06, "loss": 0.0592, "step": 683 }, { "epoch": 0.85, "grad_norm": 1.0859918594360352, "learning_rate": 9.325572474212615e-06, "loss": 0.0434, "step": 684 }, { "epoch": 0.85, "grad_norm": 1.2848424911499023, "learning_rate": 9.323616144512612e-06, "loss": 0.0343, "step": 685 }, { "epoch": 0.85, "grad_norm": 1.860479474067688, "learning_rate": 9.321657187348689e-06, "loss": 0.0581, "step": 686 }, { "epoch": 0.85, "grad_norm": 1.3358099460601807, "learning_rate": 9.319695603911306e-06, "loss": 0.059, "step": 687 }, { "epoch": 0.85, "grad_norm": 0.8692423701286316, "learning_rate": 9.317731395392517e-06, "loss": 0.0332, "step": 688 }, { "epoch": 0.85, "grad_norm": 1.4998887777328491, "learning_rate": 9.315764562985976e-06, "loss": 0.0485, "step": 689 }, { "epoch": 0.85, "grad_norm": 0.5280508995056152, "learning_rate": 9.313795107886925e-06, "loss": 0.0249, "step": 690 }, { "epoch": 0.86, "grad_norm": 0.7580534219741821, "learning_rate": 9.311823031292205e-06, "loss": 0.0372, "step": 691 }, { "epoch": 0.86, "grad_norm": 0.7582796216011047, "learning_rate": 9.309848334400247e-06, "loss": 0.0326, "step": 692 }, { "epoch": 0.86, "grad_norm": 0.6401865482330322, "learning_rate": 9.307871018411074e-06, "loss": 0.0301, "step": 693 }, { "epoch": 0.86, "grad_norm": 2.024916410446167, "learning_rate": 9.305891084526306e-06, "loss": 0.0723, "step": 694 }, { "epoch": 0.86, "grad_norm": 2.180551767349243, "learning_rate": 9.303908533949146e-06, "loss": 0.0639, "step": 695 }, { "epoch": 0.86, "grad_norm": 0.7816917896270752, "learning_rate": 9.301923367884393e-06, "loss": 0.0366, "step": 696 }, { "epoch": 0.86, "grad_norm": 0.7270790934562683, "learning_rate": 9.299935587538432e-06, "loss": 0.0421, "step": 697 }, { "epoch": 0.86, "grad_norm": 0.8784447312355042, "learning_rate": 9.29794519411924e-06, "loss": 0.043, "step": 698 }, { "epoch": 0.87, "grad_norm": 0.6736301779747009, "learning_rate": 9.29595218883638e-06, "loss": 0.047, "step": 699 }, { "epoch": 0.87, "grad_norm": 1.0458660125732422, "learning_rate": 9.293956572900999e-06, "loss": 0.0295, "step": 700 }, { "epoch": 0.87, "grad_norm": 0.8319834470748901, "learning_rate": 9.29195834752584e-06, "loss": 0.0606, "step": 701 }, { "epoch": 0.87, "grad_norm": 1.5236587524414062, "learning_rate": 9.28995751392522e-06, "loss": 0.0405, "step": 702 }, { "epoch": 0.87, "grad_norm": 1.4151524305343628, "learning_rate": 9.28795407331505e-06, "loss": 0.0397, "step": 703 }, { "epoch": 0.87, "grad_norm": 1.9959708452224731, "learning_rate": 9.285948026912822e-06, "loss": 0.0715, "step": 704 }, { "epoch": 0.87, "grad_norm": 0.5822674632072449, "learning_rate": 9.283939375937609e-06, "loss": 0.0281, "step": 705 }, { "epoch": 0.87, "grad_norm": 0.7008696794509888, "learning_rate": 9.28192812161007e-06, "loss": 0.0486, "step": 706 }, { "epoch": 0.88, "grad_norm": 0.7523006796836853, "learning_rate": 9.279914265152448e-06, "loss": 0.0505, "step": 707 }, { "epoch": 0.88, "grad_norm": 1.051295518875122, "learning_rate": 9.277897807788562e-06, "loss": 0.0499, "step": 708 }, { "epoch": 0.88, "grad_norm": 0.8184940218925476, "learning_rate": 9.275878750743818e-06, "loss": 0.0422, "step": 709 }, { "epoch": 0.88, "grad_norm": 1.372441291809082, "learning_rate": 9.273857095245192e-06, "loss": 0.0633, "step": 710 }, { "epoch": 0.88, "grad_norm": 0.6757863759994507, "learning_rate": 9.271832842521249e-06, "loss": 0.0366, "step": 711 }, { "epoch": 0.88, "grad_norm": 0.7655669450759888, "learning_rate": 9.26980599380213e-06, "loss": 0.0389, "step": 712 }, { "epoch": 0.88, "grad_norm": 1.1087899208068848, "learning_rate": 9.267776550319548e-06, "loss": 0.0433, "step": 713 }, { "epoch": 0.88, "grad_norm": 1.6310410499572754, "learning_rate": 9.265744513306798e-06, "loss": 0.0471, "step": 714 }, { "epoch": 0.88, "grad_norm": 1.9184622764587402, "learning_rate": 9.263709883998753e-06, "loss": 0.0679, "step": 715 }, { "epoch": 0.89, "grad_norm": 2.0910892486572266, "learning_rate": 9.261672663631854e-06, "loss": 0.0551, "step": 716 }, { "epoch": 0.89, "grad_norm": 2.9525444507598877, "learning_rate": 9.259632853444126e-06, "loss": 0.0682, "step": 717 }, { "epoch": 0.89, "grad_norm": 1.773461103439331, "learning_rate": 9.257590454675159e-06, "loss": 0.0441, "step": 718 }, { "epoch": 0.89, "grad_norm": 0.9130051136016846, "learning_rate": 9.255545468566119e-06, "loss": 0.0454, "step": 719 }, { "epoch": 0.89, "grad_norm": 0.34200993180274963, "learning_rate": 9.253497896359749e-06, "loss": 0.0119, "step": 720 }, { "epoch": 0.89, "grad_norm": 1.0717602968215942, "learning_rate": 9.251447739300356e-06, "loss": 0.0552, "step": 721 }, { "epoch": 0.89, "grad_norm": 1.0619879961013794, "learning_rate": 9.249394998633825e-06, "loss": 0.0568, "step": 722 }, { "epoch": 0.89, "grad_norm": 0.8811701536178589, "learning_rate": 9.247339675607606e-06, "loss": 0.034, "step": 723 }, { "epoch": 0.9, "grad_norm": 0.974205493927002, "learning_rate": 9.24528177147072e-06, "loss": 0.0398, "step": 724 }, { "epoch": 0.9, "grad_norm": 0.8818910717964172, "learning_rate": 9.243221287473755e-06, "loss": 0.048, "step": 725 }, { "epoch": 0.9, "grad_norm": 0.6580934524536133, "learning_rate": 9.241158224868871e-06, "loss": 0.042, "step": 726 }, { "epoch": 0.9, "grad_norm": 1.4452764987945557, "learning_rate": 9.23909258490979e-06, "loss": 0.0438, "step": 727 }, { "epoch": 0.9, "grad_norm": 0.6177107095718384, "learning_rate": 9.237024368851805e-06, "loss": 0.0434, "step": 728 }, { "epoch": 0.9, "grad_norm": 0.6715316772460938, "learning_rate": 9.23495357795177e-06, "loss": 0.0242, "step": 729 }, { "epoch": 0.9, "grad_norm": 1.8438655138015747, "learning_rate": 9.232880213468106e-06, "loss": 0.0421, "step": 730 }, { "epoch": 0.9, "grad_norm": 1.011062741279602, "learning_rate": 9.230804276660799e-06, "loss": 0.0465, "step": 731 }, { "epoch": 0.91, "grad_norm": 1.2409260272979736, "learning_rate": 9.228725768791394e-06, "loss": 0.029, "step": 732 }, { "epoch": 0.91, "grad_norm": 1.2052364349365234, "learning_rate": 9.226644691123006e-06, "loss": 0.0465, "step": 733 }, { "epoch": 0.91, "grad_norm": 0.60611891746521, "learning_rate": 9.224561044920303e-06, "loss": 0.0328, "step": 734 }, { "epoch": 0.91, "grad_norm": 0.4640844464302063, "learning_rate": 9.222474831449519e-06, "loss": 0.0202, "step": 735 }, { "epoch": 0.91, "grad_norm": 1.9622972011566162, "learning_rate": 9.220386051978449e-06, "loss": 0.0651, "step": 736 }, { "epoch": 0.91, "grad_norm": 1.8986101150512695, "learning_rate": 9.218294707776441e-06, "loss": 0.0556, "step": 737 }, { "epoch": 0.91, "grad_norm": 1.158408284187317, "learning_rate": 9.216200800114412e-06, "loss": 0.0368, "step": 738 }, { "epoch": 0.91, "grad_norm": 0.9851293563842773, "learning_rate": 9.214104330264826e-06, "loss": 0.053, "step": 739 }, { "epoch": 0.92, "grad_norm": 1.1018086671829224, "learning_rate": 9.212005299501712e-06, "loss": 0.0597, "step": 740 }, { "epoch": 0.92, "grad_norm": 1.84424889087677, "learning_rate": 9.20990370910065e-06, "loss": 0.0497, "step": 741 }, { "epoch": 0.92, "grad_norm": 1.2366299629211426, "learning_rate": 9.207799560338779e-06, "loss": 0.0602, "step": 742 }, { "epoch": 0.92, "grad_norm": 1.1586567163467407, "learning_rate": 9.20569285449479e-06, "loss": 0.0316, "step": 743 }, { "epoch": 0.92, "grad_norm": 0.6110067367553711, "learning_rate": 9.20358359284893e-06, "loss": 0.0305, "step": 744 }, { "epoch": 0.92, "grad_norm": 0.6773253679275513, "learning_rate": 9.201471776682999e-06, "loss": 0.036, "step": 745 }, { "epoch": 0.92, "grad_norm": 0.9832028150558472, "learning_rate": 9.199357407280349e-06, "loss": 0.0381, "step": 746 }, { "epoch": 0.92, "grad_norm": 1.0233718156814575, "learning_rate": 9.197240485925883e-06, "loss": 0.0549, "step": 747 }, { "epoch": 0.93, "grad_norm": 2.125337839126587, "learning_rate": 9.195121013906055e-06, "loss": 0.0776, "step": 748 }, { "epoch": 0.93, "grad_norm": 1.2079508304595947, "learning_rate": 9.19299899250887e-06, "loss": 0.0384, "step": 749 }, { "epoch": 0.93, "grad_norm": 1.0452898740768433, "learning_rate": 9.19087442302388e-06, "loss": 0.0387, "step": 750 }, { "epoch": 0.93, "grad_norm": 0.8497399687767029, "learning_rate": 9.18874730674219e-06, "loss": 0.0386, "step": 751 }, { "epoch": 0.93, "grad_norm": 2.1464147567749023, "learning_rate": 9.186617644956445e-06, "loss": 0.0725, "step": 752 }, { "epoch": 0.93, "grad_norm": 0.4441956579685211, "learning_rate": 9.184485438960846e-06, "loss": 0.0214, "step": 753 }, { "epoch": 0.93, "grad_norm": 0.818230390548706, "learning_rate": 9.182350690051134e-06, "loss": 0.0256, "step": 754 }, { "epoch": 0.93, "grad_norm": 1.0162849426269531, "learning_rate": 9.180213399524599e-06, "loss": 0.0592, "step": 755 }, { "epoch": 0.94, "grad_norm": 0.9444966316223145, "learning_rate": 9.178073568680071e-06, "loss": 0.0293, "step": 756 }, { "epoch": 0.94, "grad_norm": 0.7616766691207886, "learning_rate": 9.175931198817926e-06, "loss": 0.0481, "step": 757 }, { "epoch": 0.94, "grad_norm": 0.47808611392974854, "learning_rate": 9.173786291240085e-06, "loss": 0.0287, "step": 758 }, { "epoch": 0.94, "grad_norm": 0.6669220328330994, "learning_rate": 9.17163884725001e-06, "loss": 0.0324, "step": 759 }, { "epoch": 0.94, "grad_norm": 0.8807569146156311, "learning_rate": 9.169488868152704e-06, "loss": 0.0425, "step": 760 }, { "epoch": 0.94, "grad_norm": 1.2071596384048462, "learning_rate": 9.16733635525471e-06, "loss": 0.046, "step": 761 }, { "epoch": 0.94, "grad_norm": 1.2434258460998535, "learning_rate": 9.165181309864108e-06, "loss": 0.0383, "step": 762 }, { "epoch": 0.94, "grad_norm": 0.7151886820793152, "learning_rate": 9.163023733290525e-06, "loss": 0.0381, "step": 763 }, { "epoch": 0.95, "grad_norm": 0.6364666223526001, "learning_rate": 9.16086362684512e-06, "loss": 0.0328, "step": 764 }, { "epoch": 0.95, "grad_norm": 1.2846086025238037, "learning_rate": 9.15870099184059e-06, "loss": 0.0317, "step": 765 }, { "epoch": 0.95, "grad_norm": 1.7031409740447998, "learning_rate": 9.15653582959117e-06, "loss": 0.0416, "step": 766 }, { "epoch": 0.95, "grad_norm": 1.8931663036346436, "learning_rate": 9.154368141412632e-06, "loss": 0.0544, "step": 767 }, { "epoch": 0.95, "grad_norm": 0.5589671730995178, "learning_rate": 9.152197928622278e-06, "loss": 0.0204, "step": 768 }, { "epoch": 0.95, "grad_norm": 0.7534042596817017, "learning_rate": 9.15002519253895e-06, "loss": 0.0291, "step": 769 }, { "epoch": 0.95, "grad_norm": 0.8194689750671387, "learning_rate": 9.147849934483019e-06, "loss": 0.0363, "step": 770 }, { "epoch": 0.95, "grad_norm": 1.4425467252731323, "learning_rate": 9.145672155776392e-06, "loss": 0.0583, "step": 771 }, { "epoch": 0.96, "grad_norm": 1.4742876291275024, "learning_rate": 9.143491857742505e-06, "loss": 0.0577, "step": 772 }, { "epoch": 0.96, "grad_norm": 0.5303352475166321, "learning_rate": 9.14130904170633e-06, "loss": 0.0311, "step": 773 }, { "epoch": 0.96, "grad_norm": 0.7389684915542603, "learning_rate": 9.13912370899436e-06, "loss": 0.028, "step": 774 }, { "epoch": 0.96, "grad_norm": 1.5198121070861816, "learning_rate": 9.136935860934628e-06, "loss": 0.0461, "step": 775 }, { "epoch": 0.96, "grad_norm": 1.799206256866455, "learning_rate": 9.134745498856685e-06, "loss": 0.0478, "step": 776 }, { "epoch": 0.96, "grad_norm": 1.1272491216659546, "learning_rate": 9.13255262409162e-06, "loss": 0.0495, "step": 777 }, { "epoch": 0.96, "grad_norm": 1.0748385190963745, "learning_rate": 9.130357237972044e-06, "loss": 0.0388, "step": 778 }, { "epoch": 0.96, "grad_norm": 0.8800269961357117, "learning_rate": 9.128159341832092e-06, "loss": 0.0233, "step": 779 }, { "epoch": 0.97, "grad_norm": 0.6652606129646301, "learning_rate": 9.125958937007427e-06, "loss": 0.0401, "step": 780 }, { "epoch": 0.97, "grad_norm": 0.7951803207397461, "learning_rate": 9.123756024835237e-06, "loss": 0.0194, "step": 781 }, { "epoch": 0.97, "grad_norm": 0.6082125902175903, "learning_rate": 9.121550606654232e-06, "loss": 0.0221, "step": 782 }, { "epoch": 0.97, "grad_norm": 1.656269907951355, "learning_rate": 9.119342683804649e-06, "loss": 0.0267, "step": 783 }, { "epoch": 0.97, "grad_norm": 1.3084255456924438, "learning_rate": 9.11713225762824e-06, "loss": 0.0476, "step": 784 }, { "epoch": 0.97, "grad_norm": 0.8326955437660217, "learning_rate": 9.114919329468283e-06, "loss": 0.0223, "step": 785 }, { "epoch": 0.97, "grad_norm": 0.612882673740387, "learning_rate": 9.112703900669577e-06, "loss": 0.0186, "step": 786 }, { "epoch": 0.97, "grad_norm": 1.0400992631912231, "learning_rate": 9.110485972578439e-06, "loss": 0.0494, "step": 787 }, { "epoch": 0.98, "grad_norm": 0.9465930461883545, "learning_rate": 9.108265546542705e-06, "loss": 0.0336, "step": 788 }, { "epoch": 0.98, "grad_norm": 0.8121449947357178, "learning_rate": 9.106042623911728e-06, "loss": 0.0392, "step": 789 }, { "epoch": 0.98, "grad_norm": 1.7355393171310425, "learning_rate": 9.103817206036383e-06, "loss": 0.0492, "step": 790 }, { "epoch": 0.98, "grad_norm": 0.5920339822769165, "learning_rate": 9.101589294269054e-06, "loss": 0.0354, "step": 791 }, { "epoch": 0.98, "grad_norm": 1.1976126432418823, "learning_rate": 9.099358889963643e-06, "loss": 0.0618, "step": 792 }, { "epoch": 0.98, "grad_norm": 1.0642493963241577, "learning_rate": 9.097125994475572e-06, "loss": 0.0555, "step": 793 }, { "epoch": 0.98, "grad_norm": 1.2092516422271729, "learning_rate": 9.09489060916177e-06, "loss": 0.0391, "step": 794 }, { "epoch": 0.98, "grad_norm": 0.67398601770401, "learning_rate": 9.092652735380683e-06, "loss": 0.0196, "step": 795 }, { "epoch": 0.99, "grad_norm": 0.8952963948249817, "learning_rate": 9.09041237449227e-06, "loss": 0.0246, "step": 796 }, { "epoch": 0.99, "grad_norm": 0.7937426567077637, "learning_rate": 9.088169527857996e-06, "loss": 0.0449, "step": 797 }, { "epoch": 0.99, "grad_norm": 1.0983673334121704, "learning_rate": 9.085924196840841e-06, "loss": 0.0577, "step": 798 }, { "epoch": 0.99, "grad_norm": 1.7625383138656616, "learning_rate": 9.083676382805295e-06, "loss": 0.0609, "step": 799 }, { "epoch": 0.99, "grad_norm": 1.6659592390060425, "learning_rate": 9.081426087117356e-06, "loss": 0.0453, "step": 800 } ], "logging_steps": 1.0, "max_steps": 4040, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "total_flos": 2.335555778196275e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }