diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5621 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9900990099009901, + "eval_steps": 500, + "global_step": 800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 73.68818664550781, + "learning_rate": 1.0000000000000002e-06, + "loss": 8.0388, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 71.36270904541016, + "learning_rate": 2.0000000000000003e-06, + "loss": 8.0003, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 73.16751861572266, + "learning_rate": 3e-06, + "loss": 7.9032, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 74.18943786621094, + "learning_rate": 4.000000000000001e-06, + "loss": 7.921, + "step": 4 + }, + { + "epoch": 0.01, + "grad_norm": 70.63272857666016, + "learning_rate": 5e-06, + "loss": 8.032, + "step": 5 + }, + { + "epoch": 0.01, + "grad_norm": 64.6897964477539, + "learning_rate": 6e-06, + "loss": 7.68, + "step": 6 + }, + { + "epoch": 0.01, + "grad_norm": 65.79997253417969, + "learning_rate": 7e-06, + "loss": 7.5291, + "step": 7 + }, + { + "epoch": 0.01, + "grad_norm": 63.4569091796875, + "learning_rate": 8.000000000000001e-06, + "loss": 5.6132, + "step": 8 + }, + { + "epoch": 0.01, + "grad_norm": 63.28990173339844, + "learning_rate": 9e-06, + "loss": 5.0102, + "step": 9 + }, + { + "epoch": 0.01, + "grad_norm": 46.30258560180664, + "learning_rate": 1e-05, + "loss": 2.2227, + "step": 10 + }, + { + "epoch": 0.01, + "grad_norm": 46.01011657714844, + "learning_rate": 9.99999848074862e-06, + "loss": 1.6679, + "step": 11 + }, + { + "epoch": 0.01, + "grad_norm": 42.595951080322266, + "learning_rate": 9.9999939229954e-06, + "loss": 1.5493, + "step": 12 + }, + { + "epoch": 0.02, + "grad_norm": 11.979974746704102, + "learning_rate": 9.999986326743111e-06, + "loss": 0.892, + "step": 13 + }, + { + "epoch": 0.02, + "grad_norm": 13.096778869628906, + "learning_rate": 9.99997569199637e-06, + "loss": 0.9386, + "step": 14 + }, + { + "epoch": 0.02, + "grad_norm": 35.61207962036133, + "learning_rate": 9.99996201876164e-06, + "loss": 1.3573, + "step": 15 + }, + { + "epoch": 0.02, + "grad_norm": 18.184959411621094, + "learning_rate": 9.999945307047228e-06, + "loss": 0.9778, + "step": 16 + }, + { + "epoch": 0.02, + "grad_norm": 6.461019992828369, + "learning_rate": 9.99992555686329e-06, + "loss": 1.0665, + "step": 17 + }, + { + "epoch": 0.02, + "grad_norm": 4.743849277496338, + "learning_rate": 9.99990276822183e-06, + "loss": 0.5975, + "step": 18 + }, + { + "epoch": 0.02, + "grad_norm": 5.654608726501465, + "learning_rate": 9.999876941136697e-06, + "loss": 0.856, + "step": 19 + }, + { + "epoch": 0.02, + "grad_norm": 5.488308906555176, + "learning_rate": 9.999848075623584e-06, + "loss": 0.7874, + "step": 20 + }, + { + "epoch": 0.03, + "grad_norm": 5.833119869232178, + "learning_rate": 9.999816171700034e-06, + "loss": 0.8777, + "step": 21 + }, + { + "epoch": 0.03, + "grad_norm": 3.678900718688965, + "learning_rate": 9.999781229385433e-06, + "loss": 0.5888, + "step": 22 + }, + { + "epoch": 0.03, + "grad_norm": 6.357454776763916, + "learning_rate": 9.99974324870102e-06, + "loss": 0.9263, + "step": 23 + }, + { + "epoch": 0.03, + "grad_norm": 5.7684149742126465, + "learning_rate": 9.99970222966987e-06, + "loss": 0.7734, + "step": 24 + }, + { + "epoch": 0.03, + "grad_norm": 6.77016019821167, + "learning_rate": 9.999658172316915e-06, + "loss": 0.7735, + "step": 25 + }, + { + "epoch": 0.03, + "grad_norm": 4.0211334228515625, + "learning_rate": 9.999611076668926e-06, + "loss": 0.5645, + "step": 26 + }, + { + "epoch": 0.03, + "grad_norm": 7.3770222663879395, + "learning_rate": 9.999560942754525e-06, + "loss": 1.0185, + "step": 27 + }, + { + "epoch": 0.03, + "grad_norm": 4.433741569519043, + "learning_rate": 9.999507770604177e-06, + "loss": 0.3547, + "step": 28 + }, + { + "epoch": 0.04, + "grad_norm": 6.0549492835998535, + "learning_rate": 9.999451560250196e-06, + "loss": 0.4961, + "step": 29 + }, + { + "epoch": 0.04, + "grad_norm": 7.8142619132995605, + "learning_rate": 9.999392311726738e-06, + "loss": 0.4398, + "step": 30 + }, + { + "epoch": 0.04, + "grad_norm": 5.785826683044434, + "learning_rate": 9.999330025069812e-06, + "loss": 0.6431, + "step": 31 + }, + { + "epoch": 0.04, + "grad_norm": 6.010104656219482, + "learning_rate": 9.999264700317268e-06, + "loss": 0.6129, + "step": 32 + }, + { + "epoch": 0.04, + "grad_norm": 8.289867401123047, + "learning_rate": 9.999196337508804e-06, + "loss": 0.3771, + "step": 33 + }, + { + "epoch": 0.04, + "grad_norm": 5.59083890914917, + "learning_rate": 9.999124936685965e-06, + "loss": 0.3964, + "step": 34 + }, + { + "epoch": 0.04, + "grad_norm": 6.018394947052002, + "learning_rate": 9.99905049789214e-06, + "loss": 0.4801, + "step": 35 + }, + { + "epoch": 0.04, + "grad_norm": 10.878011703491211, + "learning_rate": 9.998973021172564e-06, + "loss": 0.2996, + "step": 36 + }, + { + "epoch": 0.05, + "grad_norm": 31.676380157470703, + "learning_rate": 9.998892506574325e-06, + "loss": 0.5261, + "step": 37 + }, + { + "epoch": 0.05, + "grad_norm": 16.133407592773438, + "learning_rate": 9.998808954146347e-06, + "loss": 0.3843, + "step": 38 + }, + { + "epoch": 0.05, + "grad_norm": 8.785749435424805, + "learning_rate": 9.998722363939407e-06, + "loss": 0.2476, + "step": 39 + }, + { + "epoch": 0.05, + "grad_norm": 4.326422214508057, + "learning_rate": 9.998632736006124e-06, + "loss": 0.2334, + "step": 40 + }, + { + "epoch": 0.05, + "grad_norm": 6.171711444854736, + "learning_rate": 9.998540070400966e-06, + "loss": 0.1671, + "step": 41 + }, + { + "epoch": 0.05, + "grad_norm": 3.5893757343292236, + "learning_rate": 9.998444367180247e-06, + "loss": 0.1732, + "step": 42 + }, + { + "epoch": 0.05, + "grad_norm": 2.918233633041382, + "learning_rate": 9.998345626402124e-06, + "loss": 0.1127, + "step": 43 + }, + { + "epoch": 0.05, + "grad_norm": 2.620290517807007, + "learning_rate": 9.998243848126604e-06, + "loss": 0.1337, + "step": 44 + }, + { + "epoch": 0.06, + "grad_norm": 4.31186056137085, + "learning_rate": 9.998139032415534e-06, + "loss": 0.144, + "step": 45 + }, + { + "epoch": 0.06, + "grad_norm": 3.399256706237793, + "learning_rate": 9.998031179332618e-06, + "loss": 0.0878, + "step": 46 + }, + { + "epoch": 0.06, + "grad_norm": 1.658913016319275, + "learning_rate": 9.997920288943388e-06, + "loss": 0.0651, + "step": 47 + }, + { + "epoch": 0.06, + "grad_norm": 3.0306262969970703, + "learning_rate": 9.99780636131524e-06, + "loss": 0.1051, + "step": 48 + }, + { + "epoch": 0.06, + "grad_norm": 2.099931478500366, + "learning_rate": 9.997689396517408e-06, + "loss": 0.0937, + "step": 49 + }, + { + "epoch": 0.06, + "grad_norm": 2.8879055976867676, + "learning_rate": 9.997569394620965e-06, + "loss": 0.0701, + "step": 50 + }, + { + "epoch": 0.06, + "grad_norm": 1.2706345319747925, + "learning_rate": 9.997446355698843e-06, + "loss": 0.0559, + "step": 51 + }, + { + "epoch": 0.06, + "grad_norm": 1.7181872129440308, + "learning_rate": 9.99732027982581e-06, + "loss": 0.06, + "step": 52 + }, + { + "epoch": 0.07, + "grad_norm": 1.4794338941574097, + "learning_rate": 9.997191167078479e-06, + "loss": 0.0715, + "step": 53 + }, + { + "epoch": 0.07, + "grad_norm": 1.40012788772583, + "learning_rate": 9.99705901753532e-06, + "loss": 0.0608, + "step": 54 + }, + { + "epoch": 0.07, + "grad_norm": 3.034327983856201, + "learning_rate": 9.996923831276632e-06, + "loss": 0.0603, + "step": 55 + }, + { + "epoch": 0.07, + "grad_norm": 3.1301584243774414, + "learning_rate": 9.996785608384573e-06, + "loss": 0.0762, + "step": 56 + }, + { + "epoch": 0.07, + "grad_norm": 2.8218867778778076, + "learning_rate": 9.996644348943141e-06, + "loss": 0.0956, + "step": 57 + }, + { + "epoch": 0.07, + "grad_norm": 1.7874306440353394, + "learning_rate": 9.996500053038176e-06, + "loss": 0.0693, + "step": 58 + }, + { + "epoch": 0.07, + "grad_norm": 1.582387089729309, + "learning_rate": 9.99635272075737e-06, + "loss": 0.0679, + "step": 59 + }, + { + "epoch": 0.07, + "grad_norm": 1.9699870347976685, + "learning_rate": 9.996202352190256e-06, + "loss": 0.0565, + "step": 60 + }, + { + "epoch": 0.08, + "grad_norm": 2.0471878051757812, + "learning_rate": 9.996048947428212e-06, + "loss": 0.0725, + "step": 61 + }, + { + "epoch": 0.08, + "grad_norm": 1.3680695295333862, + "learning_rate": 9.995892506564461e-06, + "loss": 0.0586, + "step": 62 + }, + { + "epoch": 0.08, + "grad_norm": 4.124834060668945, + "learning_rate": 9.995733029694077e-06, + "loss": 0.0724, + "step": 63 + }, + { + "epoch": 0.08, + "grad_norm": 2.7247884273529053, + "learning_rate": 9.995570516913971e-06, + "loss": 0.095, + "step": 64 + }, + { + "epoch": 0.08, + "grad_norm": 5.306038856506348, + "learning_rate": 9.995404968322902e-06, + "loss": 0.0783, + "step": 65 + }, + { + "epoch": 0.08, + "grad_norm": 1.6850618124008179, + "learning_rate": 9.995236384021474e-06, + "loss": 0.0602, + "step": 66 + }, + { + "epoch": 0.08, + "grad_norm": 7.234889984130859, + "learning_rate": 9.995064764112135e-06, + "loss": 0.0852, + "step": 67 + }, + { + "epoch": 0.08, + "grad_norm": 3.0967376232147217, + "learning_rate": 9.994890108699182e-06, + "loss": 0.0905, + "step": 68 + }, + { + "epoch": 0.09, + "grad_norm": 3.0317909717559814, + "learning_rate": 9.99471241788875e-06, + "loss": 0.0728, + "step": 69 + }, + { + "epoch": 0.09, + "grad_norm": 4.1822967529296875, + "learning_rate": 9.994531691788822e-06, + "loss": 0.0919, + "step": 70 + }, + { + "epoch": 0.09, + "grad_norm": 2.137779474258423, + "learning_rate": 9.994347930509225e-06, + "loss": 0.0496, + "step": 71 + }, + { + "epoch": 0.09, + "grad_norm": 1.5484669208526611, + "learning_rate": 9.994161134161635e-06, + "loss": 0.0696, + "step": 72 + }, + { + "epoch": 0.09, + "grad_norm": 4.332581996917725, + "learning_rate": 9.993971302859561e-06, + "loss": 0.0769, + "step": 73 + }, + { + "epoch": 0.09, + "grad_norm": 1.838725209236145, + "learning_rate": 9.99377843671837e-06, + "loss": 0.1011, + "step": 74 + }, + { + "epoch": 0.09, + "grad_norm": 3.3594350814819336, + "learning_rate": 9.993582535855265e-06, + "loss": 0.075, + "step": 75 + }, + { + "epoch": 0.09, + "grad_norm": 1.8417227268218994, + "learning_rate": 9.993383600389294e-06, + "loss": 0.0529, + "step": 76 + }, + { + "epoch": 0.1, + "grad_norm": 4.550814151763916, + "learning_rate": 9.993181630441352e-06, + "loss": 0.1104, + "step": 77 + }, + { + "epoch": 0.1, + "grad_norm": 1.729711651802063, + "learning_rate": 9.992976626134171e-06, + "loss": 0.0601, + "step": 78 + }, + { + "epoch": 0.1, + "grad_norm": 2.1105282306671143, + "learning_rate": 9.99276858759234e-06, + "loss": 0.0423, + "step": 79 + }, + { + "epoch": 0.1, + "grad_norm": 2.180546522140503, + "learning_rate": 9.992557514942278e-06, + "loss": 0.0691, + "step": 80 + }, + { + "epoch": 0.1, + "grad_norm": 3.0761630535125732, + "learning_rate": 9.992343408312258e-06, + "loss": 0.0503, + "step": 81 + }, + { + "epoch": 0.1, + "grad_norm": 0.8641157150268555, + "learning_rate": 9.992126267832392e-06, + "loss": 0.0425, + "step": 82 + }, + { + "epoch": 0.1, + "grad_norm": 10.724833488464355, + "learning_rate": 9.991906093634633e-06, + "loss": 0.0603, + "step": 83 + }, + { + "epoch": 0.1, + "grad_norm": 1.1756705045700073, + "learning_rate": 9.991682885852784e-06, + "loss": 0.0392, + "step": 84 + }, + { + "epoch": 0.11, + "grad_norm": 1.1171228885650635, + "learning_rate": 9.991456644622489e-06, + "loss": 0.0454, + "step": 85 + }, + { + "epoch": 0.11, + "grad_norm": 1.6004431247711182, + "learning_rate": 9.991227370081233e-06, + "loss": 0.0496, + "step": 86 + }, + { + "epoch": 0.11, + "grad_norm": 3.264841318130493, + "learning_rate": 9.990995062368346e-06, + "loss": 0.0339, + "step": 87 + }, + { + "epoch": 0.11, + "grad_norm": 2.4765560626983643, + "learning_rate": 9.990759721625005e-06, + "loss": 0.0698, + "step": 88 + }, + { + "epoch": 0.11, + "grad_norm": 6.907183647155762, + "learning_rate": 9.990521347994224e-06, + "loss": 0.1026, + "step": 89 + }, + { + "epoch": 0.11, + "grad_norm": 5.384580135345459, + "learning_rate": 9.990279941620861e-06, + "loss": 0.0664, + "step": 90 + }, + { + "epoch": 0.11, + "grad_norm": 1.2060827016830444, + "learning_rate": 9.990035502651624e-06, + "loss": 0.0324, + "step": 91 + }, + { + "epoch": 0.11, + "grad_norm": 1.0956050157546997, + "learning_rate": 9.989788031235054e-06, + "loss": 0.0593, + "step": 92 + }, + { + "epoch": 0.12, + "grad_norm": 2.1994054317474365, + "learning_rate": 9.98953752752154e-06, + "loss": 0.0484, + "step": 93 + }, + { + "epoch": 0.12, + "grad_norm": 3.490142583847046, + "learning_rate": 9.989283991663316e-06, + "loss": 0.0561, + "step": 94 + }, + { + "epoch": 0.12, + "grad_norm": 4.274105072021484, + "learning_rate": 9.989027423814454e-06, + "loss": 0.1123, + "step": 95 + }, + { + "epoch": 0.12, + "grad_norm": 3.0847527980804443, + "learning_rate": 9.98876782413087e-06, + "loss": 0.0606, + "step": 96 + }, + { + "epoch": 0.12, + "grad_norm": 1.8111186027526855, + "learning_rate": 9.988505192770324e-06, + "loss": 0.0681, + "step": 97 + }, + { + "epoch": 0.12, + "grad_norm": 1.2713731527328491, + "learning_rate": 9.988239529892416e-06, + "loss": 0.0516, + "step": 98 + }, + { + "epoch": 0.12, + "grad_norm": 1.189513087272644, + "learning_rate": 9.987970835658592e-06, + "loss": 0.0768, + "step": 99 + }, + { + "epoch": 0.12, + "grad_norm": 0.9951283931732178, + "learning_rate": 9.987699110232134e-06, + "loss": 0.0416, + "step": 100 + }, + { + "epoch": 0.12, + "grad_norm": 1.4628676176071167, + "learning_rate": 9.987424353778172e-06, + "loss": 0.0751, + "step": 101 + }, + { + "epoch": 0.13, + "grad_norm": 1.41041100025177, + "learning_rate": 9.987146566463677e-06, + "loss": 0.0681, + "step": 102 + }, + { + "epoch": 0.13, + "grad_norm": 1.9383851289749146, + "learning_rate": 9.986865748457457e-06, + "loss": 0.1003, + "step": 103 + }, + { + "epoch": 0.13, + "grad_norm": 1.1434725522994995, + "learning_rate": 9.986581899930167e-06, + "loss": 0.049, + "step": 104 + }, + { + "epoch": 0.13, + "grad_norm": 3.613456964492798, + "learning_rate": 9.986295021054302e-06, + "loss": 0.0519, + "step": 105 + }, + { + "epoch": 0.13, + "grad_norm": 3.5484371185302734, + "learning_rate": 9.986005112004198e-06, + "loss": 0.0571, + "step": 106 + }, + { + "epoch": 0.13, + "grad_norm": 1.9423480033874512, + "learning_rate": 9.985712172956035e-06, + "loss": 0.039, + "step": 107 + }, + { + "epoch": 0.13, + "grad_norm": 2.0560059547424316, + "learning_rate": 9.985416204087828e-06, + "loss": 0.0904, + "step": 108 + }, + { + "epoch": 0.13, + "grad_norm": 6.695100784301758, + "learning_rate": 9.985117205579442e-06, + "loss": 0.1549, + "step": 109 + }, + { + "epoch": 0.14, + "grad_norm": 2.4656105041503906, + "learning_rate": 9.984815177612574e-06, + "loss": 0.079, + "step": 110 + }, + { + "epoch": 0.14, + "grad_norm": 1.974007487297058, + "learning_rate": 9.984510120370771e-06, + "loss": 0.0585, + "step": 111 + }, + { + "epoch": 0.14, + "grad_norm": 1.3341798782348633, + "learning_rate": 9.984202034039414e-06, + "loss": 0.0585, + "step": 112 + }, + { + "epoch": 0.14, + "grad_norm": 2.7250359058380127, + "learning_rate": 9.983890918805727e-06, + "loss": 0.0651, + "step": 113 + }, + { + "epoch": 0.14, + "grad_norm": 4.140810489654541, + "learning_rate": 9.983576774858776e-06, + "loss": 0.0748, + "step": 114 + }, + { + "epoch": 0.14, + "grad_norm": 6.119039058685303, + "learning_rate": 9.983259602389469e-06, + "loss": 0.0818, + "step": 115 + }, + { + "epoch": 0.14, + "grad_norm": 1.3782867193222046, + "learning_rate": 9.982939401590545e-06, + "loss": 0.0563, + "step": 116 + }, + { + "epoch": 0.14, + "grad_norm": 1.240810513496399, + "learning_rate": 9.982616172656594e-06, + "loss": 0.0555, + "step": 117 + }, + { + "epoch": 0.15, + "grad_norm": 2.0260303020477295, + "learning_rate": 9.982289915784044e-06, + "loss": 0.0554, + "step": 118 + }, + { + "epoch": 0.15, + "grad_norm": 2.1243703365325928, + "learning_rate": 9.981960631171162e-06, + "loss": 0.0584, + "step": 119 + }, + { + "epoch": 0.15, + "grad_norm": 2.7996938228607178, + "learning_rate": 9.98162831901805e-06, + "loss": 0.0854, + "step": 120 + }, + { + "epoch": 0.15, + "grad_norm": 1.3062973022460938, + "learning_rate": 9.981292979526656e-06, + "loss": 0.0821, + "step": 121 + }, + { + "epoch": 0.15, + "grad_norm": 1.2655537128448486, + "learning_rate": 9.980954612900768e-06, + "loss": 0.0643, + "step": 122 + }, + { + "epoch": 0.15, + "grad_norm": 4.0950798988342285, + "learning_rate": 9.980613219346012e-06, + "loss": 0.0994, + "step": 123 + }, + { + "epoch": 0.15, + "grad_norm": 1.522292971611023, + "learning_rate": 9.980268799069848e-06, + "loss": 0.0369, + "step": 124 + }, + { + "epoch": 0.15, + "grad_norm": 2.5451443195343018, + "learning_rate": 9.979921352281585e-06, + "loss": 0.0286, + "step": 125 + }, + { + "epoch": 0.16, + "grad_norm": 1.8015575408935547, + "learning_rate": 9.979570879192365e-06, + "loss": 0.0736, + "step": 126 + }, + { + "epoch": 0.16, + "grad_norm": 3.2620017528533936, + "learning_rate": 9.979217380015173e-06, + "loss": 0.0662, + "step": 127 + }, + { + "epoch": 0.16, + "grad_norm": 0.5585700273513794, + "learning_rate": 9.978860854964827e-06, + "loss": 0.0248, + "step": 128 + }, + { + "epoch": 0.16, + "grad_norm": 1.1841486692428589, + "learning_rate": 9.978501304257991e-06, + "loss": 0.0386, + "step": 129 + }, + { + "epoch": 0.16, + "grad_norm": 1.1351743936538696, + "learning_rate": 9.97813872811316e-06, + "loss": 0.0437, + "step": 130 + }, + { + "epoch": 0.16, + "grad_norm": 2.4472172260284424, + "learning_rate": 9.977773126750677e-06, + "loss": 0.074, + "step": 131 + }, + { + "epoch": 0.16, + "grad_norm": 0.9335076808929443, + "learning_rate": 9.977404500392711e-06, + "loss": 0.034, + "step": 132 + }, + { + "epoch": 0.16, + "grad_norm": 1.9846038818359375, + "learning_rate": 9.977032849263284e-06, + "loss": 0.0488, + "step": 133 + }, + { + "epoch": 0.17, + "grad_norm": 1.003464698791504, + "learning_rate": 9.976658173588244e-06, + "loss": 0.0199, + "step": 134 + }, + { + "epoch": 0.17, + "grad_norm": 1.1298803091049194, + "learning_rate": 9.976280473595284e-06, + "loss": 0.0507, + "step": 135 + }, + { + "epoch": 0.17, + "grad_norm": 4.0241546630859375, + "learning_rate": 9.975899749513928e-06, + "loss": 0.097, + "step": 136 + }, + { + "epoch": 0.17, + "grad_norm": 2.1224637031555176, + "learning_rate": 9.975516001575549e-06, + "loss": 0.0656, + "step": 137 + }, + { + "epoch": 0.17, + "grad_norm": 1.3180643320083618, + "learning_rate": 9.975129230013347e-06, + "loss": 0.0839, + "step": 138 + }, + { + "epoch": 0.17, + "grad_norm": 2.089977979660034, + "learning_rate": 9.974739435062364e-06, + "loss": 0.0571, + "step": 139 + }, + { + "epoch": 0.17, + "grad_norm": 1.773493766784668, + "learning_rate": 9.974346616959476e-06, + "loss": 0.025, + "step": 140 + }, + { + "epoch": 0.17, + "grad_norm": 2.1019980907440186, + "learning_rate": 9.973950775943403e-06, + "loss": 0.0447, + "step": 141 + }, + { + "epoch": 0.18, + "grad_norm": 1.4967840909957886, + "learning_rate": 9.973551912254696e-06, + "loss": 0.0422, + "step": 142 + }, + { + "epoch": 0.18, + "grad_norm": 1.1371103525161743, + "learning_rate": 9.973150026135743e-06, + "loss": 0.0648, + "step": 143 + }, + { + "epoch": 0.18, + "grad_norm": 0.8660270571708679, + "learning_rate": 9.972745117830774e-06, + "loss": 0.0344, + "step": 144 + }, + { + "epoch": 0.18, + "grad_norm": 5.05332088470459, + "learning_rate": 9.972337187585848e-06, + "loss": 0.1036, + "step": 145 + }, + { + "epoch": 0.18, + "grad_norm": 1.1562827825546265, + "learning_rate": 9.971926235648868e-06, + "loss": 0.041, + "step": 146 + }, + { + "epoch": 0.18, + "grad_norm": 3.426886558532715, + "learning_rate": 9.971512262269568e-06, + "loss": 0.127, + "step": 147 + }, + { + "epoch": 0.18, + "grad_norm": 1.173113465309143, + "learning_rate": 9.97109526769952e-06, + "loss": 0.0525, + "step": 148 + }, + { + "epoch": 0.18, + "grad_norm": 1.1487282514572144, + "learning_rate": 9.970675252192133e-06, + "loss": 0.052, + "step": 149 + }, + { + "epoch": 0.19, + "grad_norm": 1.5633060932159424, + "learning_rate": 9.970252216002647e-06, + "loss": 0.0389, + "step": 150 + }, + { + "epoch": 0.19, + "grad_norm": 1.445123314857483, + "learning_rate": 9.969826159388145e-06, + "loss": 0.0521, + "step": 151 + }, + { + "epoch": 0.19, + "grad_norm": 0.8425119519233704, + "learning_rate": 9.96939708260754e-06, + "loss": 0.0513, + "step": 152 + }, + { + "epoch": 0.19, + "grad_norm": 0.9555310606956482, + "learning_rate": 9.968964985921584e-06, + "loss": 0.0574, + "step": 153 + }, + { + "epoch": 0.19, + "grad_norm": 1.8024086952209473, + "learning_rate": 9.96852986959286e-06, + "loss": 0.058, + "step": 154 + }, + { + "epoch": 0.19, + "grad_norm": 1.4136022329330444, + "learning_rate": 9.96809173388579e-06, + "loss": 0.0402, + "step": 155 + }, + { + "epoch": 0.19, + "grad_norm": 0.9865325093269348, + "learning_rate": 9.96765057906663e-06, + "loss": 0.0555, + "step": 156 + }, + { + "epoch": 0.19, + "grad_norm": 1.3715591430664062, + "learning_rate": 9.967206405403468e-06, + "loss": 0.0549, + "step": 157 + }, + { + "epoch": 0.2, + "grad_norm": 1.10662841796875, + "learning_rate": 9.966759213166231e-06, + "loss": 0.0584, + "step": 158 + }, + { + "epoch": 0.2, + "grad_norm": 1.3035138845443726, + "learning_rate": 9.966309002626676e-06, + "loss": 0.0398, + "step": 159 + }, + { + "epoch": 0.2, + "grad_norm": 2.7275445461273193, + "learning_rate": 9.965855774058395e-06, + "loss": 0.0583, + "step": 160 + }, + { + "epoch": 0.2, + "grad_norm": 1.4070425033569336, + "learning_rate": 9.965399527736819e-06, + "loss": 0.0476, + "step": 161 + }, + { + "epoch": 0.2, + "grad_norm": 1.2913644313812256, + "learning_rate": 9.964940263939206e-06, + "loss": 0.0693, + "step": 162 + }, + { + "epoch": 0.2, + "grad_norm": 5.090683937072754, + "learning_rate": 9.964477982944654e-06, + "loss": 0.0737, + "step": 163 + }, + { + "epoch": 0.2, + "grad_norm": 4.244226455688477, + "learning_rate": 9.964012685034087e-06, + "loss": 0.0659, + "step": 164 + }, + { + "epoch": 0.2, + "grad_norm": 1.7967549562454224, + "learning_rate": 9.96354437049027e-06, + "loss": 0.0226, + "step": 165 + }, + { + "epoch": 0.21, + "grad_norm": 1.695214033126831, + "learning_rate": 9.963073039597798e-06, + "loss": 0.0772, + "step": 166 + }, + { + "epoch": 0.21, + "grad_norm": 2.0708000659942627, + "learning_rate": 9.962598692643098e-06, + "loss": 0.053, + "step": 167 + }, + { + "epoch": 0.21, + "grad_norm": 2.1509592533111572, + "learning_rate": 9.962121329914432e-06, + "loss": 0.0714, + "step": 168 + }, + { + "epoch": 0.21, + "grad_norm": 2.4323039054870605, + "learning_rate": 9.961640951701892e-06, + "loss": 0.0456, + "step": 169 + }, + { + "epoch": 0.21, + "grad_norm": 2.304720640182495, + "learning_rate": 9.961157558297404e-06, + "loss": 0.0854, + "step": 170 + }, + { + "epoch": 0.21, + "grad_norm": 0.8575959205627441, + "learning_rate": 9.960671149994727e-06, + "loss": 0.0374, + "step": 171 + }, + { + "epoch": 0.21, + "grad_norm": 1.106746792793274, + "learning_rate": 9.960181727089455e-06, + "loss": 0.0515, + "step": 172 + }, + { + "epoch": 0.21, + "grad_norm": 1.6459972858428955, + "learning_rate": 9.959689289879003e-06, + "loss": 0.0514, + "step": 173 + }, + { + "epoch": 0.22, + "grad_norm": 1.5684750080108643, + "learning_rate": 9.959193838662634e-06, + "loss": 0.0669, + "step": 174 + }, + { + "epoch": 0.22, + "grad_norm": 1.1011048555374146, + "learning_rate": 9.958695373741428e-06, + "loss": 0.0406, + "step": 175 + }, + { + "epoch": 0.22, + "grad_norm": 0.9976766109466553, + "learning_rate": 9.958193895418305e-06, + "loss": 0.0377, + "step": 176 + }, + { + "epoch": 0.22, + "grad_norm": 1.4583932161331177, + "learning_rate": 9.957689403998012e-06, + "loss": 0.06, + "step": 177 + }, + { + "epoch": 0.22, + "grad_norm": 1.1599044799804688, + "learning_rate": 9.95718189978713e-06, + "loss": 0.0406, + "step": 178 + }, + { + "epoch": 0.22, + "grad_norm": 0.9436582326889038, + "learning_rate": 9.95667138309407e-06, + "loss": 0.0361, + "step": 179 + }, + { + "epoch": 0.22, + "grad_norm": 2.8169147968292236, + "learning_rate": 9.956157854229072e-06, + "loss": 0.0597, + "step": 180 + }, + { + "epoch": 0.22, + "grad_norm": 0.9190147519111633, + "learning_rate": 9.955641313504208e-06, + "loss": 0.0258, + "step": 181 + }, + { + "epoch": 0.23, + "grad_norm": 0.8643155694007874, + "learning_rate": 9.95512176123338e-06, + "loss": 0.0327, + "step": 182 + }, + { + "epoch": 0.23, + "grad_norm": 1.2514710426330566, + "learning_rate": 9.95459919773232e-06, + "loss": 0.0723, + "step": 183 + }, + { + "epoch": 0.23, + "grad_norm": 1.3103550672531128, + "learning_rate": 9.954073623318593e-06, + "loss": 0.0576, + "step": 184 + }, + { + "epoch": 0.23, + "grad_norm": 2.092473268508911, + "learning_rate": 9.953545038311587e-06, + "loss": 0.0734, + "step": 185 + }, + { + "epoch": 0.23, + "grad_norm": 2.5062074661254883, + "learning_rate": 9.953013443032524e-06, + "loss": 0.0483, + "step": 186 + }, + { + "epoch": 0.23, + "grad_norm": 2.1158766746520996, + "learning_rate": 9.952478837804459e-06, + "loss": 0.0345, + "step": 187 + }, + { + "epoch": 0.23, + "grad_norm": 2.5865800380706787, + "learning_rate": 9.951941222952264e-06, + "loss": 0.0557, + "step": 188 + }, + { + "epoch": 0.23, + "grad_norm": 2.171496868133545, + "learning_rate": 9.951400598802655e-06, + "loss": 0.062, + "step": 189 + }, + { + "epoch": 0.24, + "grad_norm": 0.9497528076171875, + "learning_rate": 9.950856965684167e-06, + "loss": 0.0365, + "step": 190 + }, + { + "epoch": 0.24, + "grad_norm": 1.4575358629226685, + "learning_rate": 9.950310323927165e-06, + "loss": 0.0648, + "step": 191 + }, + { + "epoch": 0.24, + "grad_norm": 2.8335795402526855, + "learning_rate": 9.949760673863846e-06, + "loss": 0.0611, + "step": 192 + }, + { + "epoch": 0.24, + "grad_norm": 1.1269536018371582, + "learning_rate": 9.949208015828232e-06, + "loss": 0.0541, + "step": 193 + }, + { + "epoch": 0.24, + "grad_norm": 0.9925274848937988, + "learning_rate": 9.948652350156172e-06, + "loss": 0.0275, + "step": 194 + }, + { + "epoch": 0.24, + "grad_norm": 1.2717292308807373, + "learning_rate": 9.948093677185345e-06, + "loss": 0.041, + "step": 195 + }, + { + "epoch": 0.24, + "grad_norm": 1.1867843866348267, + "learning_rate": 9.947531997255256e-06, + "loss": 0.0517, + "step": 196 + }, + { + "epoch": 0.24, + "grad_norm": 1.1004167795181274, + "learning_rate": 9.946967310707241e-06, + "loss": 0.0503, + "step": 197 + }, + { + "epoch": 0.25, + "grad_norm": 1.8476804494857788, + "learning_rate": 9.946399617884457e-06, + "loss": 0.0419, + "step": 198 + }, + { + "epoch": 0.25, + "grad_norm": 1.3617258071899414, + "learning_rate": 9.945828919131894e-06, + "loss": 0.0273, + "step": 199 + }, + { + "epoch": 0.25, + "grad_norm": 1.4114432334899902, + "learning_rate": 9.945255214796366e-06, + "loss": 0.0448, + "step": 200 + }, + { + "epoch": 0.25, + "grad_norm": 1.4074312448501587, + "learning_rate": 9.944678505226511e-06, + "loss": 0.0637, + "step": 201 + }, + { + "epoch": 0.25, + "grad_norm": 1.2234091758728027, + "learning_rate": 9.944098790772797e-06, + "loss": 0.0497, + "step": 202 + }, + { + "epoch": 0.25, + "grad_norm": 1.3652763366699219, + "learning_rate": 9.943516071787517e-06, + "loss": 0.0555, + "step": 203 + }, + { + "epoch": 0.25, + "grad_norm": 2.020076036453247, + "learning_rate": 9.942930348624788e-06, + "loss": 0.0488, + "step": 204 + }, + { + "epoch": 0.25, + "grad_norm": 1.1463106870651245, + "learning_rate": 9.942341621640558e-06, + "loss": 0.0498, + "step": 205 + }, + { + "epoch": 0.25, + "grad_norm": 1.1451953649520874, + "learning_rate": 9.941749891192594e-06, + "loss": 0.0485, + "step": 206 + }, + { + "epoch": 0.26, + "grad_norm": 2.710951805114746, + "learning_rate": 9.94115515764049e-06, + "loss": 0.0485, + "step": 207 + }, + { + "epoch": 0.26, + "grad_norm": 1.6404072046279907, + "learning_rate": 9.940557421345667e-06, + "loss": 0.0387, + "step": 208 + }, + { + "epoch": 0.26, + "grad_norm": 1.1222543716430664, + "learning_rate": 9.939956682671372e-06, + "loss": 0.0586, + "step": 209 + }, + { + "epoch": 0.26, + "grad_norm": 1.6379327774047852, + "learning_rate": 9.939352941982671e-06, + "loss": 0.068, + "step": 210 + }, + { + "epoch": 0.26, + "grad_norm": 1.2636500597000122, + "learning_rate": 9.938746199646458e-06, + "loss": 0.0413, + "step": 211 + }, + { + "epoch": 0.26, + "grad_norm": 1.1981465816497803, + "learning_rate": 9.938136456031454e-06, + "loss": 0.0259, + "step": 212 + }, + { + "epoch": 0.26, + "grad_norm": 1.2407490015029907, + "learning_rate": 9.937523711508196e-06, + "loss": 0.0413, + "step": 213 + }, + { + "epoch": 0.26, + "grad_norm": 1.5851786136627197, + "learning_rate": 9.93690796644905e-06, + "loss": 0.0452, + "step": 214 + }, + { + "epoch": 0.27, + "grad_norm": 1.1833544969558716, + "learning_rate": 9.936289221228207e-06, + "loss": 0.0415, + "step": 215 + }, + { + "epoch": 0.27, + "grad_norm": 5.073670387268066, + "learning_rate": 9.935667476221678e-06, + "loss": 0.1248, + "step": 216 + }, + { + "epoch": 0.27, + "grad_norm": 2.5642805099487305, + "learning_rate": 9.935042731807297e-06, + "loss": 0.0708, + "step": 217 + }, + { + "epoch": 0.27, + "grad_norm": 3.680995464324951, + "learning_rate": 9.934414988364722e-06, + "loss": 0.0587, + "step": 218 + }, + { + "epoch": 0.27, + "grad_norm": 2.164574146270752, + "learning_rate": 9.933784246275432e-06, + "loss": 0.0532, + "step": 219 + }, + { + "epoch": 0.27, + "grad_norm": 1.1444894075393677, + "learning_rate": 9.93315050592273e-06, + "loss": 0.0486, + "step": 220 + }, + { + "epoch": 0.27, + "grad_norm": 0.9272328615188599, + "learning_rate": 9.932513767691743e-06, + "loss": 0.0465, + "step": 221 + }, + { + "epoch": 0.27, + "grad_norm": 3.0213119983673096, + "learning_rate": 9.931874031969411e-06, + "loss": 0.0679, + "step": 222 + }, + { + "epoch": 0.28, + "grad_norm": 2.7126073837280273, + "learning_rate": 9.931231299144509e-06, + "loss": 0.0849, + "step": 223 + }, + { + "epoch": 0.28, + "grad_norm": 1.2266963720321655, + "learning_rate": 9.93058556960762e-06, + "loss": 0.0722, + "step": 224 + }, + { + "epoch": 0.28, + "grad_norm": 2.530362844467163, + "learning_rate": 9.929936843751158e-06, + "loss": 0.0477, + "step": 225 + }, + { + "epoch": 0.28, + "grad_norm": 2.087737798690796, + "learning_rate": 9.929285121969352e-06, + "loss": 0.0698, + "step": 226 + }, + { + "epoch": 0.28, + "grad_norm": 1.2407419681549072, + "learning_rate": 9.928630404658255e-06, + "loss": 0.0501, + "step": 227 + }, + { + "epoch": 0.28, + "grad_norm": 1.7187033891677856, + "learning_rate": 9.927972692215739e-06, + "loss": 0.0537, + "step": 228 + }, + { + "epoch": 0.28, + "grad_norm": 2.143998861312866, + "learning_rate": 9.927311985041495e-06, + "loss": 0.0554, + "step": 229 + }, + { + "epoch": 0.28, + "grad_norm": 2.8843326568603516, + "learning_rate": 9.926648283537037e-06, + "loss": 0.0544, + "step": 230 + }, + { + "epoch": 0.29, + "grad_norm": 1.6308791637420654, + "learning_rate": 9.925981588105695e-06, + "loss": 0.0505, + "step": 231 + }, + { + "epoch": 0.29, + "grad_norm": 1.8796863555908203, + "learning_rate": 9.92531189915262e-06, + "loss": 0.0537, + "step": 232 + }, + { + "epoch": 0.29, + "grad_norm": 1.4090087413787842, + "learning_rate": 9.924639217084783e-06, + "loss": 0.0589, + "step": 233 + }, + { + "epoch": 0.29, + "grad_norm": 0.9706072807312012, + "learning_rate": 9.923963542310975e-06, + "loss": 0.049, + "step": 234 + }, + { + "epoch": 0.29, + "grad_norm": 0.9905783534049988, + "learning_rate": 9.923284875241802e-06, + "loss": 0.0537, + "step": 235 + }, + { + "epoch": 0.29, + "grad_norm": 0.5304461717605591, + "learning_rate": 9.92260321628969e-06, + "loss": 0.0291, + "step": 236 + }, + { + "epoch": 0.29, + "grad_norm": 1.2716902494430542, + "learning_rate": 9.921918565868887e-06, + "loss": 0.0652, + "step": 237 + }, + { + "epoch": 0.29, + "grad_norm": 0.9943916201591492, + "learning_rate": 9.921230924395449e-06, + "loss": 0.0543, + "step": 238 + }, + { + "epoch": 0.3, + "grad_norm": 1.3783643245697021, + "learning_rate": 9.920540292287262e-06, + "loss": 0.0536, + "step": 239 + }, + { + "epoch": 0.3, + "grad_norm": 1.389773964881897, + "learning_rate": 9.91984666996402e-06, + "loss": 0.0376, + "step": 240 + }, + { + "epoch": 0.3, + "grad_norm": 0.7887927293777466, + "learning_rate": 9.91915005784724e-06, + "loss": 0.0272, + "step": 241 + }, + { + "epoch": 0.3, + "grad_norm": 1.902744174003601, + "learning_rate": 9.918450456360252e-06, + "loss": 0.0543, + "step": 242 + }, + { + "epoch": 0.3, + "grad_norm": 0.6114033460617065, + "learning_rate": 9.917747865928206e-06, + "loss": 0.0262, + "step": 243 + }, + { + "epoch": 0.3, + "grad_norm": 1.1496695280075073, + "learning_rate": 9.917042286978064e-06, + "loss": 0.0643, + "step": 244 + }, + { + "epoch": 0.3, + "grad_norm": 0.8322230577468872, + "learning_rate": 9.916333719938608e-06, + "loss": 0.0435, + "step": 245 + }, + { + "epoch": 0.3, + "grad_norm": 0.9281955361366272, + "learning_rate": 9.915622165240435e-06, + "loss": 0.0399, + "step": 246 + }, + { + "epoch": 0.31, + "grad_norm": 0.7492028474807739, + "learning_rate": 9.914907623315958e-06, + "loss": 0.0367, + "step": 247 + }, + { + "epoch": 0.31, + "grad_norm": 2.0944385528564453, + "learning_rate": 9.914190094599403e-06, + "loss": 0.0488, + "step": 248 + }, + { + "epoch": 0.31, + "grad_norm": 1.0233027935028076, + "learning_rate": 9.913469579526811e-06, + "loss": 0.0475, + "step": 249 + }, + { + "epoch": 0.31, + "grad_norm": 0.9051103591918945, + "learning_rate": 9.912746078536044e-06, + "loss": 0.0374, + "step": 250 + }, + { + "epoch": 0.31, + "grad_norm": 0.6250872015953064, + "learning_rate": 9.91201959206677e-06, + "loss": 0.0236, + "step": 251 + }, + { + "epoch": 0.31, + "grad_norm": 1.0147565603256226, + "learning_rate": 9.911290120560477e-06, + "loss": 0.0408, + "step": 252 + }, + { + "epoch": 0.31, + "grad_norm": 1.8525872230529785, + "learning_rate": 9.910557664460464e-06, + "loss": 0.0485, + "step": 253 + }, + { + "epoch": 0.31, + "grad_norm": 2.040386915206909, + "learning_rate": 9.909822224211845e-06, + "loss": 0.0716, + "step": 254 + }, + { + "epoch": 0.32, + "grad_norm": 1.2481484413146973, + "learning_rate": 9.90908380026155e-06, + "loss": 0.0376, + "step": 255 + }, + { + "epoch": 0.32, + "grad_norm": 2.1175787448883057, + "learning_rate": 9.908342393058317e-06, + "loss": 0.0657, + "step": 256 + }, + { + "epoch": 0.32, + "grad_norm": 0.9903053641319275, + "learning_rate": 9.907598003052701e-06, + "loss": 0.0378, + "step": 257 + }, + { + "epoch": 0.32, + "grad_norm": 1.7109051942825317, + "learning_rate": 9.906850630697068e-06, + "loss": 0.0624, + "step": 258 + }, + { + "epoch": 0.32, + "grad_norm": 1.9067022800445557, + "learning_rate": 9.906100276445596e-06, + "loss": 0.0492, + "step": 259 + }, + { + "epoch": 0.32, + "grad_norm": 0.9397685527801514, + "learning_rate": 9.905346940754274e-06, + "loss": 0.0147, + "step": 260 + }, + { + "epoch": 0.32, + "grad_norm": 3.0456113815307617, + "learning_rate": 9.90459062408091e-06, + "loss": 0.0812, + "step": 261 + }, + { + "epoch": 0.32, + "grad_norm": 2.6053810119628906, + "learning_rate": 9.903831326885112e-06, + "loss": 0.0623, + "step": 262 + }, + { + "epoch": 0.33, + "grad_norm": 2.0448148250579834, + "learning_rate": 9.90306904962831e-06, + "loss": 0.0803, + "step": 263 + }, + { + "epoch": 0.33, + "grad_norm": 1.1430933475494385, + "learning_rate": 9.902303792773736e-06, + "loss": 0.0305, + "step": 264 + }, + { + "epoch": 0.33, + "grad_norm": 0.8864290714263916, + "learning_rate": 9.90153555678644e-06, + "loss": 0.0488, + "step": 265 + }, + { + "epoch": 0.33, + "grad_norm": 1.6222556829452515, + "learning_rate": 9.900764342133277e-06, + "loss": 0.021, + "step": 266 + }, + { + "epoch": 0.33, + "grad_norm": 1.0808035135269165, + "learning_rate": 9.899990149282917e-06, + "loss": 0.0326, + "step": 267 + }, + { + "epoch": 0.33, + "grad_norm": 2.029120683670044, + "learning_rate": 9.899212978705836e-06, + "loss": 0.0384, + "step": 268 + }, + { + "epoch": 0.33, + "grad_norm": 1.2418546676635742, + "learning_rate": 9.898432830874324e-06, + "loss": 0.0365, + "step": 269 + }, + { + "epoch": 0.33, + "grad_norm": 1.3441228866577148, + "learning_rate": 9.897649706262474e-06, + "loss": 0.0692, + "step": 270 + }, + { + "epoch": 0.34, + "grad_norm": 1.4092243909835815, + "learning_rate": 9.896863605346191e-06, + "loss": 0.0472, + "step": 271 + }, + { + "epoch": 0.34, + "grad_norm": 1.3884505033493042, + "learning_rate": 9.89607452860319e-06, + "loss": 0.088, + "step": 272 + }, + { + "epoch": 0.34, + "grad_norm": 2.6695573329925537, + "learning_rate": 9.895282476512995e-06, + "loss": 0.043, + "step": 273 + }, + { + "epoch": 0.34, + "grad_norm": 1.7949867248535156, + "learning_rate": 9.894487449556934e-06, + "loss": 0.0514, + "step": 274 + }, + { + "epoch": 0.34, + "grad_norm": 1.3810291290283203, + "learning_rate": 9.893689448218146e-06, + "loss": 0.0472, + "step": 275 + }, + { + "epoch": 0.34, + "grad_norm": 1.0681228637695312, + "learning_rate": 9.892888472981577e-06, + "loss": 0.0389, + "step": 276 + }, + { + "epoch": 0.34, + "grad_norm": 0.6548139452934265, + "learning_rate": 9.89208452433398e-06, + "loss": 0.0339, + "step": 277 + }, + { + "epoch": 0.34, + "grad_norm": 0.8944026231765747, + "learning_rate": 9.891277602763916e-06, + "loss": 0.037, + "step": 278 + }, + { + "epoch": 0.35, + "grad_norm": 1.7463440895080566, + "learning_rate": 9.89046770876175e-06, + "loss": 0.048, + "step": 279 + }, + { + "epoch": 0.35, + "grad_norm": 3.2079529762268066, + "learning_rate": 9.889654842819658e-06, + "loss": 0.0721, + "step": 280 + }, + { + "epoch": 0.35, + "grad_norm": 2.0868616104125977, + "learning_rate": 9.888839005431615e-06, + "loss": 0.0573, + "step": 281 + }, + { + "epoch": 0.35, + "grad_norm": 1.23513662815094, + "learning_rate": 9.888020197093409e-06, + "loss": 0.0542, + "step": 282 + }, + { + "epoch": 0.35, + "grad_norm": 0.7781217694282532, + "learning_rate": 9.887198418302629e-06, + "loss": 0.0386, + "step": 283 + }, + { + "epoch": 0.35, + "grad_norm": 1.390410304069519, + "learning_rate": 9.886373669558669e-06, + "loss": 0.0338, + "step": 284 + }, + { + "epoch": 0.35, + "grad_norm": 1.6135231256484985, + "learning_rate": 9.885545951362733e-06, + "loss": 0.0403, + "step": 285 + }, + { + "epoch": 0.35, + "grad_norm": 1.1802467107772827, + "learning_rate": 9.884715264217823e-06, + "loss": 0.0716, + "step": 286 + }, + { + "epoch": 0.36, + "grad_norm": 1.1783833503723145, + "learning_rate": 9.883881608628748e-06, + "loss": 0.0426, + "step": 287 + }, + { + "epoch": 0.36, + "grad_norm": 0.994340181350708, + "learning_rate": 9.883044985102122e-06, + "loss": 0.047, + "step": 288 + }, + { + "epoch": 0.36, + "grad_norm": 0.9849565625190735, + "learning_rate": 9.882205394146362e-06, + "loss": 0.0416, + "step": 289 + }, + { + "epoch": 0.36, + "grad_norm": 1.2525103092193604, + "learning_rate": 9.881362836271686e-06, + "loss": 0.0672, + "step": 290 + }, + { + "epoch": 0.36, + "grad_norm": 0.8505926728248596, + "learning_rate": 9.880517311990118e-06, + "loss": 0.0455, + "step": 291 + }, + { + "epoch": 0.36, + "grad_norm": 1.3629908561706543, + "learning_rate": 9.879668821815484e-06, + "loss": 0.0357, + "step": 292 + }, + { + "epoch": 0.36, + "grad_norm": 1.1365973949432373, + "learning_rate": 9.878817366263412e-06, + "loss": 0.0666, + "step": 293 + }, + { + "epoch": 0.36, + "grad_norm": 1.0324252843856812, + "learning_rate": 9.87796294585133e-06, + "loss": 0.0449, + "step": 294 + }, + { + "epoch": 0.37, + "grad_norm": 0.757729172706604, + "learning_rate": 9.877105561098473e-06, + "loss": 0.0248, + "step": 295 + }, + { + "epoch": 0.37, + "grad_norm": 1.2894716262817383, + "learning_rate": 9.87624521252587e-06, + "loss": 0.0382, + "step": 296 + }, + { + "epoch": 0.37, + "grad_norm": 1.5887492895126343, + "learning_rate": 9.87538190065636e-06, + "loss": 0.0459, + "step": 297 + }, + { + "epoch": 0.37, + "grad_norm": 1.5617096424102783, + "learning_rate": 9.874515626014576e-06, + "loss": 0.0673, + "step": 298 + }, + { + "epoch": 0.37, + "grad_norm": 2.4001352787017822, + "learning_rate": 9.873646389126954e-06, + "loss": 0.0937, + "step": 299 + }, + { + "epoch": 0.37, + "grad_norm": 1.1498814821243286, + "learning_rate": 9.872774190521727e-06, + "loss": 0.0609, + "step": 300 + }, + { + "epoch": 0.37, + "grad_norm": 3.620199680328369, + "learning_rate": 9.871899030728932e-06, + "loss": 0.078, + "step": 301 + }, + { + "epoch": 0.37, + "grad_norm": 1.5257648229599, + "learning_rate": 9.871020910280408e-06, + "loss": 0.0456, + "step": 302 + }, + { + "epoch": 0.38, + "grad_norm": 2.344609498977661, + "learning_rate": 9.870139829709784e-06, + "loss": 0.0579, + "step": 303 + }, + { + "epoch": 0.38, + "grad_norm": 0.6787387132644653, + "learning_rate": 9.869255789552496e-06, + "loss": 0.036, + "step": 304 + }, + { + "epoch": 0.38, + "grad_norm": 0.7965288162231445, + "learning_rate": 9.868368790345777e-06, + "loss": 0.0347, + "step": 305 + }, + { + "epoch": 0.38, + "grad_norm": 1.3934015035629272, + "learning_rate": 9.867478832628652e-06, + "loss": 0.0504, + "step": 306 + }, + { + "epoch": 0.38, + "grad_norm": 0.6102665662765503, + "learning_rate": 9.866585916941951e-06, + "loss": 0.0303, + "step": 307 + }, + { + "epoch": 0.38, + "grad_norm": 0.6944254636764526, + "learning_rate": 9.865690043828302e-06, + "loss": 0.0389, + "step": 308 + }, + { + "epoch": 0.38, + "grad_norm": 0.5572813153266907, + "learning_rate": 9.864791213832125e-06, + "loss": 0.0249, + "step": 309 + }, + { + "epoch": 0.38, + "grad_norm": 0.9218201041221619, + "learning_rate": 9.863889427499641e-06, + "loss": 0.0579, + "step": 310 + }, + { + "epoch": 0.38, + "grad_norm": 2.7617053985595703, + "learning_rate": 9.862984685378864e-06, + "loss": 0.0942, + "step": 311 + }, + { + "epoch": 0.39, + "grad_norm": 2.5800890922546387, + "learning_rate": 9.862076988019609e-06, + "loss": 0.0705, + "step": 312 + }, + { + "epoch": 0.39, + "grad_norm": 0.5009744763374329, + "learning_rate": 9.86116633597348e-06, + "loss": 0.0187, + "step": 313 + }, + { + "epoch": 0.39, + "grad_norm": 0.8876914381980896, + "learning_rate": 9.860252729793885e-06, + "loss": 0.0574, + "step": 314 + }, + { + "epoch": 0.39, + "grad_norm": 2.8853681087493896, + "learning_rate": 9.859336170036022e-06, + "loss": 0.0509, + "step": 315 + }, + { + "epoch": 0.39, + "grad_norm": 3.341853141784668, + "learning_rate": 9.858416657256883e-06, + "loss": 0.0697, + "step": 316 + }, + { + "epoch": 0.39, + "grad_norm": 1.9934710264205933, + "learning_rate": 9.857494192015258e-06, + "loss": 0.0531, + "step": 317 + }, + { + "epoch": 0.39, + "grad_norm": 1.259093165397644, + "learning_rate": 9.85656877487173e-06, + "loss": 0.0349, + "step": 318 + }, + { + "epoch": 0.39, + "grad_norm": 0.9945093393325806, + "learning_rate": 9.855640406388673e-06, + "loss": 0.0393, + "step": 319 + }, + { + "epoch": 0.4, + "grad_norm": 1.5558804273605347, + "learning_rate": 9.854709087130261e-06, + "loss": 0.0584, + "step": 320 + }, + { + "epoch": 0.4, + "grad_norm": 2.9720606803894043, + "learning_rate": 9.853774817662453e-06, + "loss": 0.0767, + "step": 321 + }, + { + "epoch": 0.4, + "grad_norm": 0.8328733444213867, + "learning_rate": 9.85283759855301e-06, + "loss": 0.0312, + "step": 322 + }, + { + "epoch": 0.4, + "grad_norm": 2.4241795539855957, + "learning_rate": 9.851897430371475e-06, + "loss": 0.0613, + "step": 323 + }, + { + "epoch": 0.4, + "grad_norm": 1.2547311782836914, + "learning_rate": 9.850954313689193e-06, + "loss": 0.0378, + "step": 324 + }, + { + "epoch": 0.4, + "grad_norm": 0.9641187191009521, + "learning_rate": 9.850008249079295e-06, + "loss": 0.0301, + "step": 325 + }, + { + "epoch": 0.4, + "grad_norm": 3.5166923999786377, + "learning_rate": 9.849059237116702e-06, + "loss": 0.0651, + "step": 326 + }, + { + "epoch": 0.4, + "grad_norm": 1.5394651889801025, + "learning_rate": 9.848107278378136e-06, + "loss": 0.0483, + "step": 327 + }, + { + "epoch": 0.41, + "grad_norm": 1.9585269689559937, + "learning_rate": 9.847152373442096e-06, + "loss": 0.0548, + "step": 328 + }, + { + "epoch": 0.41, + "grad_norm": 1.0429555177688599, + "learning_rate": 9.846194522888884e-06, + "loss": 0.0481, + "step": 329 + }, + { + "epoch": 0.41, + "grad_norm": 1.1581437587738037, + "learning_rate": 9.84523372730058e-06, + "loss": 0.0603, + "step": 330 + }, + { + "epoch": 0.41, + "grad_norm": 0.7063565850257874, + "learning_rate": 9.844269987261066e-06, + "loss": 0.0326, + "step": 331 + }, + { + "epoch": 0.41, + "grad_norm": 1.5360925197601318, + "learning_rate": 9.843303303356005e-06, + "loss": 0.0456, + "step": 332 + }, + { + "epoch": 0.41, + "grad_norm": 1.3182265758514404, + "learning_rate": 9.84233367617285e-06, + "loss": 0.0336, + "step": 333 + }, + { + "epoch": 0.41, + "grad_norm": 0.8530195951461792, + "learning_rate": 9.841361106300846e-06, + "loss": 0.0375, + "step": 334 + }, + { + "epoch": 0.41, + "grad_norm": 0.9681763052940369, + "learning_rate": 9.840385594331022e-06, + "loss": 0.0265, + "step": 335 + }, + { + "epoch": 0.42, + "grad_norm": 1.2474390268325806, + "learning_rate": 9.839407140856199e-06, + "loss": 0.0438, + "step": 336 + }, + { + "epoch": 0.42, + "grad_norm": 1.427484393119812, + "learning_rate": 9.838425746470984e-06, + "loss": 0.0506, + "step": 337 + }, + { + "epoch": 0.42, + "grad_norm": 0.8225058317184448, + "learning_rate": 9.837441411771771e-06, + "loss": 0.0355, + "step": 338 + }, + { + "epoch": 0.42, + "grad_norm": 0.9241979122161865, + "learning_rate": 9.836454137356739e-06, + "loss": 0.0386, + "step": 339 + }, + { + "epoch": 0.42, + "grad_norm": 0.8418800234794617, + "learning_rate": 9.835463923825854e-06, + "loss": 0.0392, + "step": 340 + }, + { + "epoch": 0.42, + "grad_norm": 0.9536418914794922, + "learning_rate": 9.834470771780875e-06, + "loss": 0.0577, + "step": 341 + }, + { + "epoch": 0.42, + "grad_norm": 0.7787923216819763, + "learning_rate": 9.833474681825334e-06, + "loss": 0.0325, + "step": 342 + }, + { + "epoch": 0.42, + "grad_norm": 2.5342555046081543, + "learning_rate": 9.832475654564562e-06, + "loss": 0.0413, + "step": 343 + }, + { + "epoch": 0.43, + "grad_norm": 1.160288691520691, + "learning_rate": 9.831473690605664e-06, + "loss": 0.0609, + "step": 344 + }, + { + "epoch": 0.43, + "grad_norm": 2.0293076038360596, + "learning_rate": 9.830468790557536e-06, + "loss": 0.0376, + "step": 345 + }, + { + "epoch": 0.43, + "grad_norm": 1.1950795650482178, + "learning_rate": 9.829460955030854e-06, + "loss": 0.0285, + "step": 346 + }, + { + "epoch": 0.43, + "grad_norm": 1.130022644996643, + "learning_rate": 9.828450184638082e-06, + "loss": 0.0725, + "step": 347 + }, + { + "epoch": 0.43, + "grad_norm": 1.2049533128738403, + "learning_rate": 9.827436479993468e-06, + "loss": 0.0345, + "step": 348 + }, + { + "epoch": 0.43, + "grad_norm": 1.9585927724838257, + "learning_rate": 9.826419841713038e-06, + "loss": 0.0539, + "step": 349 + }, + { + "epoch": 0.43, + "grad_norm": 0.7200453281402588, + "learning_rate": 9.825400270414602e-06, + "loss": 0.0358, + "step": 350 + }, + { + "epoch": 0.43, + "grad_norm": 0.9681141972541809, + "learning_rate": 9.824377766717758e-06, + "loss": 0.0288, + "step": 351 + }, + { + "epoch": 0.44, + "grad_norm": 0.843163788318634, + "learning_rate": 9.823352331243881e-06, + "loss": 0.0396, + "step": 352 + }, + { + "epoch": 0.44, + "grad_norm": 0.8464294075965881, + "learning_rate": 9.822323964616125e-06, + "loss": 0.0394, + "step": 353 + }, + { + "epoch": 0.44, + "grad_norm": 0.6887583136558533, + "learning_rate": 9.821292667459435e-06, + "loss": 0.0295, + "step": 354 + }, + { + "epoch": 0.44, + "grad_norm": 1.815610408782959, + "learning_rate": 9.820258440400525e-06, + "loss": 0.0372, + "step": 355 + }, + { + "epoch": 0.44, + "grad_norm": 1.1596908569335938, + "learning_rate": 9.8192212840679e-06, + "loss": 0.0247, + "step": 356 + }, + { + "epoch": 0.44, + "grad_norm": 1.0240830183029175, + "learning_rate": 9.818181199091838e-06, + "loss": 0.0497, + "step": 357 + }, + { + "epoch": 0.44, + "grad_norm": 0.9827424883842468, + "learning_rate": 9.817138186104401e-06, + "loss": 0.0585, + "step": 358 + }, + { + "epoch": 0.44, + "grad_norm": 0.8876912593841553, + "learning_rate": 9.816092245739426e-06, + "loss": 0.039, + "step": 359 + }, + { + "epoch": 0.45, + "grad_norm": 1.8267855644226074, + "learning_rate": 9.81504337863253e-06, + "loss": 0.0393, + "step": 360 + }, + { + "epoch": 0.45, + "grad_norm": 0.7727996706962585, + "learning_rate": 9.813991585421118e-06, + "loss": 0.0442, + "step": 361 + }, + { + "epoch": 0.45, + "grad_norm": 2.0796356201171875, + "learning_rate": 9.812936866744358e-06, + "loss": 0.0525, + "step": 362 + }, + { + "epoch": 0.45, + "grad_norm": 0.8108832836151123, + "learning_rate": 9.811879223243207e-06, + "loss": 0.0367, + "step": 363 + }, + { + "epoch": 0.45, + "grad_norm": 0.9708784818649292, + "learning_rate": 9.810818655560393e-06, + "loss": 0.0436, + "step": 364 + }, + { + "epoch": 0.45, + "grad_norm": 1.442888855934143, + "learning_rate": 9.809755164340423e-06, + "loss": 0.0432, + "step": 365 + }, + { + "epoch": 0.45, + "grad_norm": 0.8913246989250183, + "learning_rate": 9.808688750229584e-06, + "loss": 0.046, + "step": 366 + }, + { + "epoch": 0.45, + "grad_norm": 2.196491003036499, + "learning_rate": 9.807619413875937e-06, + "loss": 0.0466, + "step": 367 + }, + { + "epoch": 0.46, + "grad_norm": 0.9138450622558594, + "learning_rate": 9.806547155929315e-06, + "loss": 0.0355, + "step": 368 + }, + { + "epoch": 0.46, + "grad_norm": 0.3624818027019501, + "learning_rate": 9.80547197704133e-06, + "loss": 0.0186, + "step": 369 + }, + { + "epoch": 0.46, + "grad_norm": 1.0726361274719238, + "learning_rate": 9.804393877865373e-06, + "loss": 0.0497, + "step": 370 + }, + { + "epoch": 0.46, + "grad_norm": 0.8961818218231201, + "learning_rate": 9.8033128590566e-06, + "loss": 0.0356, + "step": 371 + }, + { + "epoch": 0.46, + "grad_norm": 2.240262746810913, + "learning_rate": 9.80222892127195e-06, + "loss": 0.0794, + "step": 372 + }, + { + "epoch": 0.46, + "grad_norm": 2.4816982746124268, + "learning_rate": 9.801142065170132e-06, + "loss": 0.0631, + "step": 373 + }, + { + "epoch": 0.46, + "grad_norm": 1.1969040632247925, + "learning_rate": 9.80005229141163e-06, + "loss": 0.0559, + "step": 374 + }, + { + "epoch": 0.46, + "grad_norm": 1.4784609079360962, + "learning_rate": 9.798959600658697e-06, + "loss": 0.0746, + "step": 375 + }, + { + "epoch": 0.47, + "grad_norm": 0.7828866839408875, + "learning_rate": 9.797863993575365e-06, + "loss": 0.0396, + "step": 376 + }, + { + "epoch": 0.47, + "grad_norm": 0.7891765832901001, + "learning_rate": 9.796765470827435e-06, + "loss": 0.0567, + "step": 377 + }, + { + "epoch": 0.47, + "grad_norm": 0.7710642218589783, + "learning_rate": 9.795664033082476e-06, + "loss": 0.0442, + "step": 378 + }, + { + "epoch": 0.47, + "grad_norm": 0.8450149297714233, + "learning_rate": 9.794559681009837e-06, + "loss": 0.036, + "step": 379 + }, + { + "epoch": 0.47, + "grad_norm": 0.545617401599884, + "learning_rate": 9.79345241528063e-06, + "loss": 0.0302, + "step": 380 + }, + { + "epoch": 0.47, + "grad_norm": 1.7093480825424194, + "learning_rate": 9.792342236567743e-06, + "loss": 0.0494, + "step": 381 + }, + { + "epoch": 0.47, + "grad_norm": 0.8590899109840393, + "learning_rate": 9.791229145545832e-06, + "loss": 0.0389, + "step": 382 + }, + { + "epoch": 0.47, + "grad_norm": 1.1689053773880005, + "learning_rate": 9.790113142891323e-06, + "loss": 0.0505, + "step": 383 + }, + { + "epoch": 0.48, + "grad_norm": 0.6099830269813538, + "learning_rate": 9.78899422928241e-06, + "loss": 0.036, + "step": 384 + }, + { + "epoch": 0.48, + "grad_norm": 1.2200748920440674, + "learning_rate": 9.787872405399059e-06, + "loss": 0.0557, + "step": 385 + }, + { + "epoch": 0.48, + "grad_norm": 1.0489903688430786, + "learning_rate": 9.786747671923003e-06, + "loss": 0.0719, + "step": 386 + }, + { + "epoch": 0.48, + "grad_norm": 1.578433871269226, + "learning_rate": 9.785620029537741e-06, + "loss": 0.03, + "step": 387 + }, + { + "epoch": 0.48, + "grad_norm": 0.9253179430961609, + "learning_rate": 9.784489478928545e-06, + "loss": 0.0527, + "step": 388 + }, + { + "epoch": 0.48, + "grad_norm": 0.7473218441009521, + "learning_rate": 9.783356020782448e-06, + "loss": 0.035, + "step": 389 + }, + { + "epoch": 0.48, + "grad_norm": 1.4502854347229004, + "learning_rate": 9.782219655788257e-06, + "loss": 0.0423, + "step": 390 + }, + { + "epoch": 0.48, + "grad_norm": 0.946733295917511, + "learning_rate": 9.781080384636539e-06, + "loss": 0.0413, + "step": 391 + }, + { + "epoch": 0.49, + "grad_norm": 1.4826123714447021, + "learning_rate": 9.77993820801963e-06, + "loss": 0.0414, + "step": 392 + }, + { + "epoch": 0.49, + "grad_norm": 2.0471692085266113, + "learning_rate": 9.778793126631632e-06, + "loss": 0.0466, + "step": 393 + }, + { + "epoch": 0.49, + "grad_norm": 1.7681257724761963, + "learning_rate": 9.777645141168411e-06, + "loss": 0.0504, + "step": 394 + }, + { + "epoch": 0.49, + "grad_norm": 0.7187155485153198, + "learning_rate": 9.776494252327597e-06, + "loss": 0.0447, + "step": 395 + }, + { + "epoch": 0.49, + "grad_norm": 0.7922236323356628, + "learning_rate": 9.775340460808589e-06, + "loss": 0.0313, + "step": 396 + }, + { + "epoch": 0.49, + "grad_norm": 2.724630117416382, + "learning_rate": 9.774183767312545e-06, + "loss": 0.0616, + "step": 397 + }, + { + "epoch": 0.49, + "grad_norm": 0.47513461112976074, + "learning_rate": 9.773024172542389e-06, + "loss": 0.0163, + "step": 398 + }, + { + "epoch": 0.49, + "grad_norm": 0.6144838333129883, + "learning_rate": 9.771861677202804e-06, + "loss": 0.0271, + "step": 399 + }, + { + "epoch": 0.5, + "grad_norm": 1.0170230865478516, + "learning_rate": 9.770696282000245e-06, + "loss": 0.0438, + "step": 400 + }, + { + "epoch": 0.5, + "grad_norm": 0.5385282635688782, + "learning_rate": 9.76952798764292e-06, + "loss": 0.0169, + "step": 401 + }, + { + "epoch": 0.5, + "grad_norm": 1.6152381896972656, + "learning_rate": 9.7683567948408e-06, + "loss": 0.068, + "step": 402 + }, + { + "epoch": 0.5, + "grad_norm": 0.9734664559364319, + "learning_rate": 9.767182704305625e-06, + "loss": 0.0681, + "step": 403 + }, + { + "epoch": 0.5, + "grad_norm": 1.7027530670166016, + "learning_rate": 9.766005716750884e-06, + "loss": 0.04, + "step": 404 + }, + { + "epoch": 0.5, + "grad_norm": 0.7407202124595642, + "learning_rate": 9.764825832891837e-06, + "loss": 0.033, + "step": 405 + }, + { + "epoch": 0.5, + "grad_norm": 0.8196337223052979, + "learning_rate": 9.7636430534455e-06, + "loss": 0.0451, + "step": 406 + }, + { + "epoch": 0.5, + "grad_norm": 2.600836753845215, + "learning_rate": 9.762457379130649e-06, + "loss": 0.075, + "step": 407 + }, + { + "epoch": 0.5, + "grad_norm": 1.4206620454788208, + "learning_rate": 9.761268810667817e-06, + "loss": 0.0255, + "step": 408 + }, + { + "epoch": 0.51, + "grad_norm": 0.9220699071884155, + "learning_rate": 9.760077348779298e-06, + "loss": 0.0564, + "step": 409 + }, + { + "epoch": 0.51, + "grad_norm": 0.6927193999290466, + "learning_rate": 9.758882994189145e-06, + "loss": 0.0375, + "step": 410 + }, + { + "epoch": 0.51, + "grad_norm": 0.9594948291778564, + "learning_rate": 9.757685747623169e-06, + "loss": 0.0523, + "step": 411 + }, + { + "epoch": 0.51, + "grad_norm": 1.9151678085327148, + "learning_rate": 9.756485609808934e-06, + "loss": 0.0634, + "step": 412 + }, + { + "epoch": 0.51, + "grad_norm": 1.0471961498260498, + "learning_rate": 9.755282581475769e-06, + "loss": 0.027, + "step": 413 + }, + { + "epoch": 0.51, + "grad_norm": 1.2358285188674927, + "learning_rate": 9.75407666335475e-06, + "loss": 0.0705, + "step": 414 + }, + { + "epoch": 0.51, + "grad_norm": 0.8452746272087097, + "learning_rate": 9.752867856178719e-06, + "loss": 0.0485, + "step": 415 + }, + { + "epoch": 0.51, + "grad_norm": 1.2570796012878418, + "learning_rate": 9.751656160682265e-06, + "loss": 0.0375, + "step": 416 + }, + { + "epoch": 0.52, + "grad_norm": 1.8666393756866455, + "learning_rate": 9.750441577601738e-06, + "loss": 0.0418, + "step": 417 + }, + { + "epoch": 0.52, + "grad_norm": 0.7684221267700195, + "learning_rate": 9.749224107675239e-06, + "loss": 0.0477, + "step": 418 + }, + { + "epoch": 0.52, + "grad_norm": 1.430303931236267, + "learning_rate": 9.748003751642628e-06, + "loss": 0.0389, + "step": 419 + }, + { + "epoch": 0.52, + "grad_norm": 4.4301066398620605, + "learning_rate": 9.746780510245512e-06, + "loss": 0.0868, + "step": 420 + }, + { + "epoch": 0.52, + "grad_norm": 2.655571699142456, + "learning_rate": 9.74555438422726e-06, + "loss": 0.0423, + "step": 421 + }, + { + "epoch": 0.52, + "grad_norm": 1.7431411743164062, + "learning_rate": 9.744325374332986e-06, + "loss": 0.0235, + "step": 422 + }, + { + "epoch": 0.52, + "grad_norm": 1.7228596210479736, + "learning_rate": 9.743093481309563e-06, + "loss": 0.0361, + "step": 423 + }, + { + "epoch": 0.52, + "grad_norm": 0.5912590026855469, + "learning_rate": 9.741858705905609e-06, + "loss": 0.0254, + "step": 424 + }, + { + "epoch": 0.53, + "grad_norm": 0.8103305101394653, + "learning_rate": 9.740621048871501e-06, + "loss": 0.0159, + "step": 425 + }, + { + "epoch": 0.53, + "grad_norm": 2.466233253479004, + "learning_rate": 9.739380510959365e-06, + "loss": 0.0803, + "step": 426 + }, + { + "epoch": 0.53, + "grad_norm": 0.5837281942367554, + "learning_rate": 9.738137092923072e-06, + "loss": 0.0293, + "step": 427 + }, + { + "epoch": 0.53, + "grad_norm": 1.528012990951538, + "learning_rate": 9.73689079551825e-06, + "loss": 0.0549, + "step": 428 + }, + { + "epoch": 0.53, + "grad_norm": 2.025675058364868, + "learning_rate": 9.735641619502277e-06, + "loss": 0.0663, + "step": 429 + }, + { + "epoch": 0.53, + "grad_norm": 1.34830641746521, + "learning_rate": 9.734389565634277e-06, + "loss": 0.0483, + "step": 430 + }, + { + "epoch": 0.53, + "grad_norm": 1.644051194190979, + "learning_rate": 9.73313463467512e-06, + "loss": 0.053, + "step": 431 + }, + { + "epoch": 0.53, + "grad_norm": 1.6768667697906494, + "learning_rate": 9.731876827387433e-06, + "loss": 0.0626, + "step": 432 + }, + { + "epoch": 0.54, + "grad_norm": 2.0125842094421387, + "learning_rate": 9.730616144535581e-06, + "loss": 0.0424, + "step": 433 + }, + { + "epoch": 0.54, + "grad_norm": 4.256353378295898, + "learning_rate": 9.729352586885687e-06, + "loss": 0.0734, + "step": 434 + }, + { + "epoch": 0.54, + "grad_norm": 3.4163427352905273, + "learning_rate": 9.728086155205614e-06, + "loss": 0.0544, + "step": 435 + }, + { + "epoch": 0.54, + "grad_norm": 2.842038154602051, + "learning_rate": 9.726816850264971e-06, + "loss": 0.0465, + "step": 436 + }, + { + "epoch": 0.54, + "grad_norm": 2.0849742889404297, + "learning_rate": 9.725544672835118e-06, + "loss": 0.0684, + "step": 437 + }, + { + "epoch": 0.54, + "grad_norm": 0.677302360534668, + "learning_rate": 9.724269623689158e-06, + "loss": 0.0284, + "step": 438 + }, + { + "epoch": 0.54, + "grad_norm": 1.040449619293213, + "learning_rate": 9.722991703601936e-06, + "loss": 0.0384, + "step": 439 + }, + { + "epoch": 0.54, + "grad_norm": 0.6753067374229431, + "learning_rate": 9.721710913350048e-06, + "loss": 0.0436, + "step": 440 + }, + { + "epoch": 0.55, + "grad_norm": 2.006178617477417, + "learning_rate": 9.720427253711831e-06, + "loss": 0.046, + "step": 441 + }, + { + "epoch": 0.55, + "grad_norm": 1.1364405155181885, + "learning_rate": 9.719140725467362e-06, + "loss": 0.0512, + "step": 442 + }, + { + "epoch": 0.55, + "grad_norm": 0.7395780086517334, + "learning_rate": 9.717851329398469e-06, + "loss": 0.0239, + "step": 443 + }, + { + "epoch": 0.55, + "grad_norm": 1.4531809091567993, + "learning_rate": 9.716559066288716e-06, + "loss": 0.0505, + "step": 444 + }, + { + "epoch": 0.55, + "grad_norm": 0.9090608954429626, + "learning_rate": 9.715263936923413e-06, + "loss": 0.0272, + "step": 445 + }, + { + "epoch": 0.55, + "grad_norm": 0.9618948698043823, + "learning_rate": 9.713965942089612e-06, + "loss": 0.0491, + "step": 446 + }, + { + "epoch": 0.55, + "grad_norm": 0.5173948407173157, + "learning_rate": 9.712665082576104e-06, + "loss": 0.0264, + "step": 447 + }, + { + "epoch": 0.55, + "grad_norm": 0.5747056603431702, + "learning_rate": 9.711361359173422e-06, + "loss": 0.0231, + "step": 448 + }, + { + "epoch": 0.56, + "grad_norm": 1.7778929471969604, + "learning_rate": 9.710054772673839e-06, + "loss": 0.0492, + "step": 449 + }, + { + "epoch": 0.56, + "grad_norm": 2.290955066680908, + "learning_rate": 9.708745323871369e-06, + "loss": 0.0465, + "step": 450 + }, + { + "epoch": 0.56, + "grad_norm": 1.1455390453338623, + "learning_rate": 9.707433013561765e-06, + "loss": 0.0625, + "step": 451 + }, + { + "epoch": 0.56, + "grad_norm": 2.4170002937316895, + "learning_rate": 9.706117842542517e-06, + "loss": 0.0761, + "step": 452 + }, + { + "epoch": 0.56, + "grad_norm": 1.6311193704605103, + "learning_rate": 9.704799811612858e-06, + "loss": 0.0736, + "step": 453 + }, + { + "epoch": 0.56, + "grad_norm": 1.4031122922897339, + "learning_rate": 9.703478921573753e-06, + "loss": 0.0362, + "step": 454 + }, + { + "epoch": 0.56, + "grad_norm": 1.10888671875, + "learning_rate": 9.702155173227911e-06, + "loss": 0.0468, + "step": 455 + }, + { + "epoch": 0.56, + "grad_norm": 2.612172842025757, + "learning_rate": 9.700828567379772e-06, + "loss": 0.0709, + "step": 456 + }, + { + "epoch": 0.57, + "grad_norm": 1.2346030473709106, + "learning_rate": 9.699499104835514e-06, + "loss": 0.0587, + "step": 457 + }, + { + "epoch": 0.57, + "grad_norm": 1.7313090562820435, + "learning_rate": 9.698166786403057e-06, + "loss": 0.0372, + "step": 458 + }, + { + "epoch": 0.57, + "grad_norm": 1.303956389427185, + "learning_rate": 9.696831612892048e-06, + "loss": 0.0415, + "step": 459 + }, + { + "epoch": 0.57, + "grad_norm": 0.4627138674259186, + "learning_rate": 9.695493585113873e-06, + "loss": 0.0276, + "step": 460 + }, + { + "epoch": 0.57, + "grad_norm": 0.7128018140792847, + "learning_rate": 9.694152703881653e-06, + "loss": 0.0265, + "step": 461 + }, + { + "epoch": 0.57, + "grad_norm": 0.8362938165664673, + "learning_rate": 9.69280897001024e-06, + "loss": 0.0597, + "step": 462 + }, + { + "epoch": 0.57, + "grad_norm": 0.9412689208984375, + "learning_rate": 9.691462384316226e-06, + "loss": 0.062, + "step": 463 + }, + { + "epoch": 0.57, + "grad_norm": 1.3194217681884766, + "learning_rate": 9.690112947617929e-06, + "loss": 0.0526, + "step": 464 + }, + { + "epoch": 0.58, + "grad_norm": 1.3153883218765259, + "learning_rate": 9.688760660735403e-06, + "loss": 0.0497, + "step": 465 + }, + { + "epoch": 0.58, + "grad_norm": 1.290602684020996, + "learning_rate": 9.687405524490433e-06, + "loss": 0.0277, + "step": 466 + }, + { + "epoch": 0.58, + "grad_norm": 0.6527288556098938, + "learning_rate": 9.686047539706536e-06, + "loss": 0.0353, + "step": 467 + }, + { + "epoch": 0.58, + "grad_norm": 1.1408582925796509, + "learning_rate": 9.684686707208962e-06, + "loss": 0.0407, + "step": 468 + }, + { + "epoch": 0.58, + "grad_norm": 0.5641573071479797, + "learning_rate": 9.683323027824687e-06, + "loss": 0.0311, + "step": 469 + }, + { + "epoch": 0.58, + "grad_norm": 0.8712812066078186, + "learning_rate": 9.681956502382423e-06, + "loss": 0.0484, + "step": 470 + }, + { + "epoch": 0.58, + "grad_norm": 1.6026149988174438, + "learning_rate": 9.680587131712605e-06, + "loss": 0.0697, + "step": 471 + }, + { + "epoch": 0.58, + "grad_norm": 0.7954007983207703, + "learning_rate": 9.6792149166474e-06, + "loss": 0.0621, + "step": 472 + }, + { + "epoch": 0.59, + "grad_norm": 1.8472158908843994, + "learning_rate": 9.677839858020709e-06, + "loss": 0.0437, + "step": 473 + }, + { + "epoch": 0.59, + "grad_norm": 0.9168758988380432, + "learning_rate": 9.676461956668148e-06, + "loss": 0.0535, + "step": 474 + }, + { + "epoch": 0.59, + "grad_norm": 1.1088653802871704, + "learning_rate": 9.675081213427076e-06, + "loss": 0.038, + "step": 475 + }, + { + "epoch": 0.59, + "grad_norm": 0.6966286301612854, + "learning_rate": 9.673697629136566e-06, + "loss": 0.0304, + "step": 476 + }, + { + "epoch": 0.59, + "grad_norm": 1.734716534614563, + "learning_rate": 9.672311204637426e-06, + "loss": 0.0705, + "step": 477 + }, + { + "epoch": 0.59, + "grad_norm": 0.8543561697006226, + "learning_rate": 9.670921940772186e-06, + "loss": 0.0585, + "step": 478 + }, + { + "epoch": 0.59, + "grad_norm": 0.6839298605918884, + "learning_rate": 9.669529838385102e-06, + "loss": 0.0381, + "step": 479 + }, + { + "epoch": 0.59, + "grad_norm": 0.794438362121582, + "learning_rate": 9.668134898322157e-06, + "loss": 0.0485, + "step": 480 + }, + { + "epoch": 0.6, + "grad_norm": 0.585090696811676, + "learning_rate": 9.666737121431055e-06, + "loss": 0.0295, + "step": 481 + }, + { + "epoch": 0.6, + "grad_norm": 1.14494788646698, + "learning_rate": 9.665336508561225e-06, + "loss": 0.0248, + "step": 482 + }, + { + "epoch": 0.6, + "grad_norm": 0.7456786632537842, + "learning_rate": 9.663933060563824e-06, + "loss": 0.0384, + "step": 483 + }, + { + "epoch": 0.6, + "grad_norm": 1.0646755695343018, + "learning_rate": 9.662526778291725e-06, + "loss": 0.056, + "step": 484 + }, + { + "epoch": 0.6, + "grad_norm": 0.6966055631637573, + "learning_rate": 9.661117662599527e-06, + "loss": 0.0279, + "step": 485 + }, + { + "epoch": 0.6, + "grad_norm": 0.8128595948219299, + "learning_rate": 9.659705714343551e-06, + "loss": 0.0421, + "step": 486 + }, + { + "epoch": 0.6, + "grad_norm": 1.1546441316604614, + "learning_rate": 9.658290934381837e-06, + "loss": 0.0527, + "step": 487 + }, + { + "epoch": 0.6, + "grad_norm": 0.7882161736488342, + "learning_rate": 9.656873323574152e-06, + "loss": 0.041, + "step": 488 + }, + { + "epoch": 0.61, + "grad_norm": 0.9414128065109253, + "learning_rate": 9.655452882781972e-06, + "loss": 0.0198, + "step": 489 + }, + { + "epoch": 0.61, + "grad_norm": 1.0596210956573486, + "learning_rate": 9.654029612868507e-06, + "loss": 0.0606, + "step": 490 + }, + { + "epoch": 0.61, + "grad_norm": 0.676780641078949, + "learning_rate": 9.652603514698674e-06, + "loss": 0.0232, + "step": 491 + }, + { + "epoch": 0.61, + "grad_norm": 0.8404201865196228, + "learning_rate": 9.651174589139115e-06, + "loss": 0.0314, + "step": 492 + }, + { + "epoch": 0.61, + "grad_norm": 0.47275248169898987, + "learning_rate": 9.649742837058189e-06, + "loss": 0.0169, + "step": 493 + }, + { + "epoch": 0.61, + "grad_norm": 3.815514087677002, + "learning_rate": 9.648308259325973e-06, + "loss": 0.0986, + "step": 494 + }, + { + "epoch": 0.61, + "grad_norm": 1.271995186805725, + "learning_rate": 9.646870856814259e-06, + "loss": 0.0271, + "step": 495 + }, + { + "epoch": 0.61, + "grad_norm": 0.6948990821838379, + "learning_rate": 9.64543063039656e-06, + "loss": 0.0224, + "step": 496 + }, + { + "epoch": 0.62, + "grad_norm": 1.3301115036010742, + "learning_rate": 9.6439875809481e-06, + "loss": 0.0375, + "step": 497 + }, + { + "epoch": 0.62, + "grad_norm": 0.6250678896903992, + "learning_rate": 9.64254170934582e-06, + "loss": 0.0184, + "step": 498 + }, + { + "epoch": 0.62, + "grad_norm": 0.9256348609924316, + "learning_rate": 9.641093016468381e-06, + "loss": 0.0375, + "step": 499 + }, + { + "epoch": 0.62, + "grad_norm": 1.3027982711791992, + "learning_rate": 9.639641503196152e-06, + "loss": 0.0276, + "step": 500 + }, + { + "epoch": 0.62, + "grad_norm": 2.560512065887451, + "learning_rate": 9.638187170411218e-06, + "loss": 0.0482, + "step": 501 + }, + { + "epoch": 0.62, + "grad_norm": 1.6088508367538452, + "learning_rate": 9.63673001899738e-06, + "loss": 0.0436, + "step": 502 + }, + { + "epoch": 0.62, + "grad_norm": 1.439906358718872, + "learning_rate": 9.635270049840146e-06, + "loss": 0.0772, + "step": 503 + }, + { + "epoch": 0.62, + "grad_norm": 1.1696199178695679, + "learning_rate": 9.633807263826745e-06, + "loss": 0.0388, + "step": 504 + }, + { + "epoch": 0.62, + "grad_norm": 1.6363476514816284, + "learning_rate": 9.632341661846107e-06, + "loss": 0.0592, + "step": 505 + }, + { + "epoch": 0.63, + "grad_norm": 3.1684820652008057, + "learning_rate": 9.630873244788884e-06, + "loss": 0.0696, + "step": 506 + }, + { + "epoch": 0.63, + "grad_norm": 2.787458658218384, + "learning_rate": 9.629402013547432e-06, + "loss": 0.0842, + "step": 507 + }, + { + "epoch": 0.63, + "grad_norm": 0.8504316806793213, + "learning_rate": 9.627927969015817e-06, + "loss": 0.0413, + "step": 508 + }, + { + "epoch": 0.63, + "grad_norm": 0.9233881235122681, + "learning_rate": 9.62645111208982e-06, + "loss": 0.0315, + "step": 509 + }, + { + "epoch": 0.63, + "grad_norm": 1.571606159210205, + "learning_rate": 9.62497144366693e-06, + "loss": 0.0716, + "step": 510 + }, + { + "epoch": 0.63, + "grad_norm": 2.602965831756592, + "learning_rate": 9.623488964646334e-06, + "loss": 0.0526, + "step": 511 + }, + { + "epoch": 0.63, + "grad_norm": 1.687855839729309, + "learning_rate": 9.622003675928943e-06, + "loss": 0.0517, + "step": 512 + }, + { + "epoch": 0.63, + "grad_norm": 1.535513162612915, + "learning_rate": 9.620515578417364e-06, + "loss": 0.0368, + "step": 513 + }, + { + "epoch": 0.64, + "grad_norm": 0.5331669449806213, + "learning_rate": 9.619024673015916e-06, + "loss": 0.0273, + "step": 514 + }, + { + "epoch": 0.64, + "grad_norm": 0.7347199320793152, + "learning_rate": 9.617530960630624e-06, + "loss": 0.022, + "step": 515 + }, + { + "epoch": 0.64, + "grad_norm": 1.8210560083389282, + "learning_rate": 9.616034442169214e-06, + "loss": 0.0625, + "step": 516 + }, + { + "epoch": 0.64, + "grad_norm": 1.0366301536560059, + "learning_rate": 9.614535118541126e-06, + "loss": 0.0409, + "step": 517 + }, + { + "epoch": 0.64, + "grad_norm": 0.8622118234634399, + "learning_rate": 9.613032990657495e-06, + "loss": 0.0529, + "step": 518 + }, + { + "epoch": 0.64, + "grad_norm": 1.1612430810928345, + "learning_rate": 9.61152805943117e-06, + "loss": 0.0298, + "step": 519 + }, + { + "epoch": 0.64, + "grad_norm": 0.6844496726989746, + "learning_rate": 9.610020325776694e-06, + "loss": 0.0306, + "step": 520 + }, + { + "epoch": 0.64, + "grad_norm": 0.7687200307846069, + "learning_rate": 9.608509790610322e-06, + "loss": 0.0416, + "step": 521 + }, + { + "epoch": 0.65, + "grad_norm": 0.7224605083465576, + "learning_rate": 9.606996454850002e-06, + "loss": 0.036, + "step": 522 + }, + { + "epoch": 0.65, + "grad_norm": 0.6508851051330566, + "learning_rate": 9.605480319415391e-06, + "loss": 0.0368, + "step": 523 + }, + { + "epoch": 0.65, + "grad_norm": 1.3081005811691284, + "learning_rate": 9.603961385227848e-06, + "loss": 0.0284, + "step": 524 + }, + { + "epoch": 0.65, + "grad_norm": 0.5530818700790405, + "learning_rate": 9.602439653210426e-06, + "loss": 0.0273, + "step": 525 + }, + { + "epoch": 0.65, + "grad_norm": 0.5170778036117554, + "learning_rate": 9.600915124287886e-06, + "loss": 0.0181, + "step": 526 + }, + { + "epoch": 0.65, + "grad_norm": 0.5652095079421997, + "learning_rate": 9.599387799386684e-06, + "loss": 0.0213, + "step": 527 + }, + { + "epoch": 0.65, + "grad_norm": 1.0414352416992188, + "learning_rate": 9.597857679434974e-06, + "loss": 0.0389, + "step": 528 + }, + { + "epoch": 0.65, + "grad_norm": 0.6755688786506653, + "learning_rate": 9.596324765362614e-06, + "loss": 0.0343, + "step": 529 + }, + { + "epoch": 0.66, + "grad_norm": 1.5740824937820435, + "learning_rate": 9.594789058101154e-06, + "loss": 0.0562, + "step": 530 + }, + { + "epoch": 0.66, + "grad_norm": 1.410057544708252, + "learning_rate": 9.593250558583846e-06, + "loss": 0.0394, + "step": 531 + }, + { + "epoch": 0.66, + "grad_norm": 1.4377081394195557, + "learning_rate": 9.591709267745635e-06, + "loss": 0.0255, + "step": 532 + }, + { + "epoch": 0.66, + "grad_norm": 0.9751909971237183, + "learning_rate": 9.590165186523166e-06, + "loss": 0.0395, + "step": 533 + }, + { + "epoch": 0.66, + "grad_norm": 0.8450660109519958, + "learning_rate": 9.588618315854779e-06, + "loss": 0.0331, + "step": 534 + }, + { + "epoch": 0.66, + "grad_norm": 1.8118575811386108, + "learning_rate": 9.587068656680506e-06, + "loss": 0.0346, + "step": 535 + }, + { + "epoch": 0.66, + "grad_norm": 0.7216983437538147, + "learning_rate": 9.585516209942077e-06, + "loss": 0.0242, + "step": 536 + }, + { + "epoch": 0.66, + "grad_norm": 1.0194247961044312, + "learning_rate": 9.583960976582914e-06, + "loss": 0.0478, + "step": 537 + }, + { + "epoch": 0.67, + "grad_norm": 1.1861456632614136, + "learning_rate": 9.582402957548132e-06, + "loss": 0.0224, + "step": 538 + }, + { + "epoch": 0.67, + "grad_norm": 0.8888005614280701, + "learning_rate": 9.580842153784542e-06, + "loss": 0.0393, + "step": 539 + }, + { + "epoch": 0.67, + "grad_norm": 1.0420960187911987, + "learning_rate": 9.579278566240646e-06, + "loss": 0.035, + "step": 540 + }, + { + "epoch": 0.67, + "grad_norm": 0.7932503819465637, + "learning_rate": 9.577712195866634e-06, + "loss": 0.0361, + "step": 541 + }, + { + "epoch": 0.67, + "grad_norm": 2.295933246612549, + "learning_rate": 9.576143043614393e-06, + "loss": 0.0798, + "step": 542 + }, + { + "epoch": 0.67, + "grad_norm": 0.795536458492279, + "learning_rate": 9.574571110437496e-06, + "loss": 0.034, + "step": 543 + }, + { + "epoch": 0.67, + "grad_norm": 1.269714117050171, + "learning_rate": 9.572996397291209e-06, + "loss": 0.0308, + "step": 544 + }, + { + "epoch": 0.67, + "grad_norm": 0.7194578051567078, + "learning_rate": 9.571418905132486e-06, + "loss": 0.0303, + "step": 545 + }, + { + "epoch": 0.68, + "grad_norm": 0.9299863576889038, + "learning_rate": 9.569838634919968e-06, + "loss": 0.0549, + "step": 546 + }, + { + "epoch": 0.68, + "grad_norm": 1.1913076639175415, + "learning_rate": 9.568255587613986e-06, + "loss": 0.0419, + "step": 547 + }, + { + "epoch": 0.68, + "grad_norm": 0.6721378564834595, + "learning_rate": 9.566669764176562e-06, + "loss": 0.0227, + "step": 548 + }, + { + "epoch": 0.68, + "grad_norm": 0.9450292587280273, + "learning_rate": 9.5650811655714e-06, + "loss": 0.0272, + "step": 549 + }, + { + "epoch": 0.68, + "grad_norm": 1.6691453456878662, + "learning_rate": 9.56348979276389e-06, + "loss": 0.0506, + "step": 550 + }, + { + "epoch": 0.68, + "grad_norm": 1.0706772804260254, + "learning_rate": 9.561895646721113e-06, + "loss": 0.0438, + "step": 551 + }, + { + "epoch": 0.68, + "grad_norm": 1.0017832517623901, + "learning_rate": 9.560298728411833e-06, + "loss": 0.0604, + "step": 552 + }, + { + "epoch": 0.68, + "grad_norm": 1.9847087860107422, + "learning_rate": 9.558699038806494e-06, + "loss": 0.0827, + "step": 553 + }, + { + "epoch": 0.69, + "grad_norm": 1.05272376537323, + "learning_rate": 9.557096578877232e-06, + "loss": 0.0315, + "step": 554 + }, + { + "epoch": 0.69, + "grad_norm": 1.6529170274734497, + "learning_rate": 9.555491349597862e-06, + "loss": 0.0438, + "step": 555 + }, + { + "epoch": 0.69, + "grad_norm": 1.5359541177749634, + "learning_rate": 9.553883351943882e-06, + "loss": 0.0453, + "step": 556 + }, + { + "epoch": 0.69, + "grad_norm": 0.7716813087463379, + "learning_rate": 9.552272586892475e-06, + "loss": 0.0395, + "step": 557 + }, + { + "epoch": 0.69, + "grad_norm": 1.0042527914047241, + "learning_rate": 9.550659055422502e-06, + "loss": 0.0524, + "step": 558 + }, + { + "epoch": 0.69, + "grad_norm": 0.9220654368400574, + "learning_rate": 9.549042758514505e-06, + "loss": 0.052, + "step": 559 + }, + { + "epoch": 0.69, + "grad_norm": 1.202533483505249, + "learning_rate": 9.547423697150714e-06, + "loss": 0.0315, + "step": 560 + }, + { + "epoch": 0.69, + "grad_norm": 1.441113829612732, + "learning_rate": 9.545801872315028e-06, + "loss": 0.0406, + "step": 561 + }, + { + "epoch": 0.7, + "grad_norm": 1.1032451391220093, + "learning_rate": 9.544177284993035e-06, + "loss": 0.0562, + "step": 562 + }, + { + "epoch": 0.7, + "grad_norm": 0.613166332244873, + "learning_rate": 9.542549936171994e-06, + "loss": 0.0264, + "step": 563 + }, + { + "epoch": 0.7, + "grad_norm": 0.6434498429298401, + "learning_rate": 9.540919826840848e-06, + "loss": 0.0326, + "step": 564 + }, + { + "epoch": 0.7, + "grad_norm": 0.4755064845085144, + "learning_rate": 9.539286957990215e-06, + "loss": 0.0271, + "step": 565 + }, + { + "epoch": 0.7, + "grad_norm": 0.6659818887710571, + "learning_rate": 9.53765133061239e-06, + "loss": 0.0493, + "step": 566 + }, + { + "epoch": 0.7, + "grad_norm": 0.9639627933502197, + "learning_rate": 9.536012945701345e-06, + "loss": 0.0384, + "step": 567 + }, + { + "epoch": 0.7, + "grad_norm": 0.8150410056114197, + "learning_rate": 9.534371804252727e-06, + "loss": 0.0306, + "step": 568 + }, + { + "epoch": 0.7, + "grad_norm": 1.4704219102859497, + "learning_rate": 9.532727907263861e-06, + "loss": 0.0563, + "step": 569 + }, + { + "epoch": 0.71, + "grad_norm": 0.6380606889724731, + "learning_rate": 9.53108125573374e-06, + "loss": 0.0183, + "step": 570 + }, + { + "epoch": 0.71, + "grad_norm": 0.7984311580657959, + "learning_rate": 9.529431850663036e-06, + "loss": 0.0469, + "step": 571 + }, + { + "epoch": 0.71, + "grad_norm": 0.8775026798248291, + "learning_rate": 9.527779693054095e-06, + "loss": 0.0285, + "step": 572 + }, + { + "epoch": 0.71, + "grad_norm": 0.5551888346672058, + "learning_rate": 9.526124783910935e-06, + "loss": 0.0322, + "step": 573 + }, + { + "epoch": 0.71, + "grad_norm": 1.0795842409133911, + "learning_rate": 9.524467124239243e-06, + "loss": 0.0478, + "step": 574 + }, + { + "epoch": 0.71, + "grad_norm": 1.2850500345230103, + "learning_rate": 9.52280671504638e-06, + "loss": 0.0223, + "step": 575 + }, + { + "epoch": 0.71, + "grad_norm": 0.5365849733352661, + "learning_rate": 9.521143557341378e-06, + "loss": 0.0285, + "step": 576 + }, + { + "epoch": 0.71, + "grad_norm": 0.7505818605422974, + "learning_rate": 9.519477652134938e-06, + "loss": 0.0301, + "step": 577 + }, + { + "epoch": 0.72, + "grad_norm": 0.4962819516658783, + "learning_rate": 9.517809000439432e-06, + "loss": 0.0299, + "step": 578 + }, + { + "epoch": 0.72, + "grad_norm": 1.9355813264846802, + "learning_rate": 9.516137603268903e-06, + "loss": 0.0715, + "step": 579 + }, + { + "epoch": 0.72, + "grad_norm": 1.3954781293869019, + "learning_rate": 9.514463461639055e-06, + "loss": 0.0512, + "step": 580 + }, + { + "epoch": 0.72, + "grad_norm": 1.0368856191635132, + "learning_rate": 9.51278657656727e-06, + "loss": 0.0445, + "step": 581 + }, + { + "epoch": 0.72, + "grad_norm": 0.7911268472671509, + "learning_rate": 9.511106949072588e-06, + "loss": 0.0475, + "step": 582 + }, + { + "epoch": 0.72, + "grad_norm": 1.1066776514053345, + "learning_rate": 9.509424580175724e-06, + "loss": 0.049, + "step": 583 + }, + { + "epoch": 0.72, + "grad_norm": 1.1990307569503784, + "learning_rate": 9.507739470899048e-06, + "loss": 0.0574, + "step": 584 + }, + { + "epoch": 0.72, + "grad_norm": 1.1048943996429443, + "learning_rate": 9.506051622266608e-06, + "loss": 0.08, + "step": 585 + }, + { + "epoch": 0.73, + "grad_norm": 0.8120594024658203, + "learning_rate": 9.504361035304106e-06, + "loss": 0.0443, + "step": 586 + }, + { + "epoch": 0.73, + "grad_norm": 0.6603597402572632, + "learning_rate": 9.502667711038917e-06, + "loss": 0.0366, + "step": 587 + }, + { + "epoch": 0.73, + "grad_norm": 2.3819870948791504, + "learning_rate": 9.500971650500072e-06, + "loss": 0.0692, + "step": 588 + }, + { + "epoch": 0.73, + "grad_norm": 1.7831990718841553, + "learning_rate": 9.499272854718268e-06, + "loss": 0.0506, + "step": 589 + }, + { + "epoch": 0.73, + "grad_norm": 1.1036359071731567, + "learning_rate": 9.497571324725865e-06, + "loss": 0.0435, + "step": 590 + }, + { + "epoch": 0.73, + "grad_norm": 1.2589616775512695, + "learning_rate": 9.495867061556884e-06, + "loss": 0.0412, + "step": 591 + }, + { + "epoch": 0.73, + "grad_norm": 0.78188556432724, + "learning_rate": 9.494160066247006e-06, + "loss": 0.0534, + "step": 592 + }, + { + "epoch": 0.73, + "grad_norm": 0.7451815605163574, + "learning_rate": 9.492450339833573e-06, + "loss": 0.0287, + "step": 593 + }, + { + "epoch": 0.74, + "grad_norm": 1.3252469301223755, + "learning_rate": 9.490737883355587e-06, + "loss": 0.0334, + "step": 594 + }, + { + "epoch": 0.74, + "grad_norm": 0.8932815194129944, + "learning_rate": 9.48902269785371e-06, + "loss": 0.036, + "step": 595 + }, + { + "epoch": 0.74, + "grad_norm": 1.6676141023635864, + "learning_rate": 9.487304784370257e-06, + "loss": 0.0538, + "step": 596 + }, + { + "epoch": 0.74, + "grad_norm": 0.9928424954414368, + "learning_rate": 9.48558414394921e-06, + "loss": 0.0558, + "step": 597 + }, + { + "epoch": 0.74, + "grad_norm": 1.130738377571106, + "learning_rate": 9.4838607776362e-06, + "loss": 0.0454, + "step": 598 + }, + { + "epoch": 0.74, + "grad_norm": 0.8108890056610107, + "learning_rate": 9.48213468647852e-06, + "loss": 0.0265, + "step": 599 + }, + { + "epoch": 0.74, + "grad_norm": 1.0491758584976196, + "learning_rate": 9.480405871525114e-06, + "loss": 0.0518, + "step": 600 + }, + { + "epoch": 0.74, + "grad_norm": 1.0204825401306152, + "learning_rate": 9.478674333826586e-06, + "loss": 0.0339, + "step": 601 + }, + { + "epoch": 0.75, + "grad_norm": 1.026297926902771, + "learning_rate": 9.476940074435189e-06, + "loss": 0.0508, + "step": 602 + }, + { + "epoch": 0.75, + "grad_norm": 1.4111378192901611, + "learning_rate": 9.475203094404836e-06, + "loss": 0.0553, + "step": 603 + }, + { + "epoch": 0.75, + "grad_norm": 0.8152147531509399, + "learning_rate": 9.473463394791093e-06, + "loss": 0.0512, + "step": 604 + }, + { + "epoch": 0.75, + "grad_norm": 0.5428625345230103, + "learning_rate": 9.471720976651173e-06, + "loss": 0.0274, + "step": 605 + }, + { + "epoch": 0.75, + "grad_norm": 0.789997398853302, + "learning_rate": 9.469975841043946e-06, + "loss": 0.0456, + "step": 606 + }, + { + "epoch": 0.75, + "grad_norm": 2.5263166427612305, + "learning_rate": 9.468227989029929e-06, + "loss": 0.0912, + "step": 607 + }, + { + "epoch": 0.75, + "grad_norm": 0.9473277926445007, + "learning_rate": 9.466477421671296e-06, + "loss": 0.0445, + "step": 608 + }, + { + "epoch": 0.75, + "grad_norm": 0.9322047829627991, + "learning_rate": 9.464724140031866e-06, + "loss": 0.0473, + "step": 609 + }, + { + "epoch": 0.75, + "grad_norm": 1.0073190927505493, + "learning_rate": 9.462968145177112e-06, + "loss": 0.0506, + "step": 610 + }, + { + "epoch": 0.76, + "grad_norm": 0.5902945399284363, + "learning_rate": 9.461209438174148e-06, + "loss": 0.0391, + "step": 611 + }, + { + "epoch": 0.76, + "grad_norm": 2.0115785598754883, + "learning_rate": 9.459448020091746e-06, + "loss": 0.0614, + "step": 612 + }, + { + "epoch": 0.76, + "grad_norm": 1.8103097677230835, + "learning_rate": 9.457683892000318e-06, + "loss": 0.0481, + "step": 613 + }, + { + "epoch": 0.76, + "grad_norm": 0.718271017074585, + "learning_rate": 9.455917054971929e-06, + "loss": 0.0277, + "step": 614 + }, + { + "epoch": 0.76, + "grad_norm": 0.948197066783905, + "learning_rate": 9.45414751008028e-06, + "loss": 0.0424, + "step": 615 + }, + { + "epoch": 0.76, + "grad_norm": 1.613114356994629, + "learning_rate": 9.452375258400732e-06, + "loss": 0.0444, + "step": 616 + }, + { + "epoch": 0.76, + "grad_norm": 0.5611456632614136, + "learning_rate": 9.450600301010279e-06, + "loss": 0.0278, + "step": 617 + }, + { + "epoch": 0.76, + "grad_norm": 1.0461411476135254, + "learning_rate": 9.448822638987564e-06, + "loss": 0.062, + "step": 618 + }, + { + "epoch": 0.77, + "grad_norm": 1.203861951828003, + "learning_rate": 9.447042273412873e-06, + "loss": 0.0335, + "step": 619 + }, + { + "epoch": 0.77, + "grad_norm": 1.0347965955734253, + "learning_rate": 9.445259205368138e-06, + "loss": 0.0499, + "step": 620 + }, + { + "epoch": 0.77, + "grad_norm": 1.2198740243911743, + "learning_rate": 9.44347343593693e-06, + "loss": 0.0441, + "step": 621 + }, + { + "epoch": 0.77, + "grad_norm": 0.7504235506057739, + "learning_rate": 9.441684966204456e-06, + "loss": 0.0483, + "step": 622 + }, + { + "epoch": 0.77, + "grad_norm": 0.7221031188964844, + "learning_rate": 9.439893797257578e-06, + "loss": 0.0369, + "step": 623 + }, + { + "epoch": 0.77, + "grad_norm": 1.0137180089950562, + "learning_rate": 9.438099930184783e-06, + "loss": 0.0242, + "step": 624 + }, + { + "epoch": 0.77, + "grad_norm": 0.7642596364021301, + "learning_rate": 9.436303366076213e-06, + "loss": 0.0476, + "step": 625 + }, + { + "epoch": 0.77, + "grad_norm": 1.0482991933822632, + "learning_rate": 9.434504106023634e-06, + "loss": 0.0717, + "step": 626 + }, + { + "epoch": 0.78, + "grad_norm": 0.7821680903434753, + "learning_rate": 9.432702151120464e-06, + "loss": 0.0395, + "step": 627 + }, + { + "epoch": 0.78, + "grad_norm": 0.8012223839759827, + "learning_rate": 9.430897502461745e-06, + "loss": 0.0501, + "step": 628 + }, + { + "epoch": 0.78, + "grad_norm": 0.960848867893219, + "learning_rate": 9.429090161144166e-06, + "loss": 0.0194, + "step": 629 + }, + { + "epoch": 0.78, + "grad_norm": 0.9573109745979309, + "learning_rate": 9.427280128266049e-06, + "loss": 0.0485, + "step": 630 + }, + { + "epoch": 0.78, + "grad_norm": 0.6235270500183105, + "learning_rate": 9.425467404927356e-06, + "loss": 0.0354, + "step": 631 + }, + { + "epoch": 0.78, + "grad_norm": 1.024781346321106, + "learning_rate": 9.423651992229673e-06, + "loss": 0.0356, + "step": 632 + }, + { + "epoch": 0.78, + "grad_norm": 0.7387573719024658, + "learning_rate": 9.421833891276233e-06, + "loss": 0.0576, + "step": 633 + }, + { + "epoch": 0.78, + "grad_norm": 0.5336031913757324, + "learning_rate": 9.420013103171893e-06, + "loss": 0.0387, + "step": 634 + }, + { + "epoch": 0.79, + "grad_norm": 1.2542508840560913, + "learning_rate": 9.418189629023149e-06, + "loss": 0.0415, + "step": 635 + }, + { + "epoch": 0.79, + "grad_norm": 1.6477981805801392, + "learning_rate": 9.416363469938128e-06, + "loss": 0.0725, + "step": 636 + }, + { + "epoch": 0.79, + "grad_norm": 0.7093968391418457, + "learning_rate": 9.414534627026586e-06, + "loss": 0.0361, + "step": 637 + }, + { + "epoch": 0.79, + "grad_norm": 0.8406978845596313, + "learning_rate": 9.412703101399912e-06, + "loss": 0.0248, + "step": 638 + }, + { + "epoch": 0.79, + "grad_norm": 0.7647954821586609, + "learning_rate": 9.410868894171126e-06, + "loss": 0.0734, + "step": 639 + }, + { + "epoch": 0.79, + "grad_norm": 0.5869340300559998, + "learning_rate": 9.409032006454877e-06, + "loss": 0.0322, + "step": 640 + }, + { + "epoch": 0.79, + "grad_norm": 0.6841743588447571, + "learning_rate": 9.407192439367443e-06, + "loss": 0.0217, + "step": 641 + }, + { + "epoch": 0.79, + "grad_norm": 1.1286256313323975, + "learning_rate": 9.405350194026728e-06, + "loss": 0.0432, + "step": 642 + }, + { + "epoch": 0.8, + "grad_norm": 1.9575207233428955, + "learning_rate": 9.403505271552267e-06, + "loss": 0.0623, + "step": 643 + }, + { + "epoch": 0.8, + "grad_norm": 2.1534059047698975, + "learning_rate": 9.401657673065218e-06, + "loss": 0.0682, + "step": 644 + }, + { + "epoch": 0.8, + "grad_norm": 0.6419281959533691, + "learning_rate": 9.399807399688371e-06, + "loss": 0.0271, + "step": 645 + }, + { + "epoch": 0.8, + "grad_norm": 0.8669396638870239, + "learning_rate": 9.397954452546139e-06, + "loss": 0.0438, + "step": 646 + }, + { + "epoch": 0.8, + "grad_norm": 1.168561339378357, + "learning_rate": 9.396098832764555e-06, + "loss": 0.0456, + "step": 647 + }, + { + "epoch": 0.8, + "grad_norm": 1.2432861328125, + "learning_rate": 9.394240541471282e-06, + "loss": 0.0666, + "step": 648 + }, + { + "epoch": 0.8, + "grad_norm": 1.9158250093460083, + "learning_rate": 9.392379579795605e-06, + "loss": 0.0452, + "step": 649 + }, + { + "epoch": 0.8, + "grad_norm": 1.2606102228164673, + "learning_rate": 9.39051594886843e-06, + "loss": 0.0288, + "step": 650 + }, + { + "epoch": 0.81, + "grad_norm": 1.0844234228134155, + "learning_rate": 9.388649649822289e-06, + "loss": 0.0374, + "step": 651 + }, + { + "epoch": 0.81, + "grad_norm": 1.0901192426681519, + "learning_rate": 9.386780683791331e-06, + "loss": 0.0498, + "step": 652 + }, + { + "epoch": 0.81, + "grad_norm": 1.03596830368042, + "learning_rate": 9.384909051911329e-06, + "loss": 0.0544, + "step": 653 + }, + { + "epoch": 0.81, + "grad_norm": 0.7338258028030396, + "learning_rate": 9.383034755319673e-06, + "loss": 0.0389, + "step": 654 + }, + { + "epoch": 0.81, + "grad_norm": 1.973031759262085, + "learning_rate": 9.381157795155374e-06, + "loss": 0.0534, + "step": 655 + }, + { + "epoch": 0.81, + "grad_norm": 0.6111584305763245, + "learning_rate": 9.379278172559065e-06, + "loss": 0.0279, + "step": 656 + }, + { + "epoch": 0.81, + "grad_norm": 0.7228569388389587, + "learning_rate": 9.37739588867299e-06, + "loss": 0.0397, + "step": 657 + }, + { + "epoch": 0.81, + "grad_norm": 1.4140815734863281, + "learning_rate": 9.375510944641017e-06, + "loss": 0.0476, + "step": 658 + }, + { + "epoch": 0.82, + "grad_norm": 1.1325860023498535, + "learning_rate": 9.373623341608624e-06, + "loss": 0.0697, + "step": 659 + }, + { + "epoch": 0.82, + "grad_norm": 1.155360221862793, + "learning_rate": 9.371733080722911e-06, + "loss": 0.0493, + "step": 660 + }, + { + "epoch": 0.82, + "grad_norm": 1.2202762365341187, + "learning_rate": 9.36984016313259e-06, + "loss": 0.0425, + "step": 661 + }, + { + "epoch": 0.82, + "grad_norm": 0.9276245832443237, + "learning_rate": 9.36794458998799e-06, + "loss": 0.0324, + "step": 662 + }, + { + "epoch": 0.82, + "grad_norm": 0.8629313707351685, + "learning_rate": 9.366046362441047e-06, + "loss": 0.0551, + "step": 663 + }, + { + "epoch": 0.82, + "grad_norm": 0.3723730742931366, + "learning_rate": 9.36414548164532e-06, + "loss": 0.0157, + "step": 664 + }, + { + "epoch": 0.82, + "grad_norm": 0.9178370833396912, + "learning_rate": 9.36224194875597e-06, + "loss": 0.0467, + "step": 665 + }, + { + "epoch": 0.82, + "grad_norm": 0.7394289374351501, + "learning_rate": 9.360335764929781e-06, + "loss": 0.0303, + "step": 666 + }, + { + "epoch": 0.83, + "grad_norm": 0.757675290107727, + "learning_rate": 9.358426931325137e-06, + "loss": 0.0302, + "step": 667 + }, + { + "epoch": 0.83, + "grad_norm": 1.3911486864089966, + "learning_rate": 9.356515449102041e-06, + "loss": 0.0544, + "step": 668 + }, + { + "epoch": 0.83, + "grad_norm": 0.451570063829422, + "learning_rate": 9.354601319422099e-06, + "loss": 0.0207, + "step": 669 + }, + { + "epoch": 0.83, + "grad_norm": 0.43002304434776306, + "learning_rate": 9.352684543448532e-06, + "loss": 0.0186, + "step": 670 + }, + { + "epoch": 0.83, + "grad_norm": 0.37833526730537415, + "learning_rate": 9.350765122346162e-06, + "loss": 0.0146, + "step": 671 + }, + { + "epoch": 0.83, + "grad_norm": 0.9775627255439758, + "learning_rate": 9.348843057281423e-06, + "loss": 0.0451, + "step": 672 + }, + { + "epoch": 0.83, + "grad_norm": 0.626708447933197, + "learning_rate": 9.346918349422356e-06, + "loss": 0.0301, + "step": 673 + }, + { + "epoch": 0.83, + "grad_norm": 1.5922341346740723, + "learning_rate": 9.344990999938609e-06, + "loss": 0.0501, + "step": 674 + }, + { + "epoch": 0.84, + "grad_norm": 1.1948060989379883, + "learning_rate": 9.343061010001428e-06, + "loss": 0.0394, + "step": 675 + }, + { + "epoch": 0.84, + "grad_norm": 0.9602558016777039, + "learning_rate": 9.341128380783674e-06, + "loss": 0.0429, + "step": 676 + }, + { + "epoch": 0.84, + "grad_norm": 1.0513089895248413, + "learning_rate": 9.339193113459805e-06, + "loss": 0.0391, + "step": 677 + }, + { + "epoch": 0.84, + "grad_norm": 1.1344138383865356, + "learning_rate": 9.337255209205884e-06, + "loss": 0.0274, + "step": 678 + }, + { + "epoch": 0.84, + "grad_norm": 1.1134185791015625, + "learning_rate": 9.335314669199576e-06, + "loss": 0.0604, + "step": 679 + }, + { + "epoch": 0.84, + "grad_norm": 1.0586154460906982, + "learning_rate": 9.33337149462015e-06, + "loss": 0.0325, + "step": 680 + }, + { + "epoch": 0.84, + "grad_norm": 1.0996270179748535, + "learning_rate": 9.331425686648472e-06, + "loss": 0.0332, + "step": 681 + }, + { + "epoch": 0.84, + "grad_norm": 2.7945778369903564, + "learning_rate": 9.32947724646701e-06, + "loss": 0.0664, + "step": 682 + }, + { + "epoch": 0.85, + "grad_norm": 1.8699554204940796, + "learning_rate": 9.327526175259837e-06, + "loss": 0.0592, + "step": 683 + }, + { + "epoch": 0.85, + "grad_norm": 1.0859918594360352, + "learning_rate": 9.325572474212615e-06, + "loss": 0.0434, + "step": 684 + }, + { + "epoch": 0.85, + "grad_norm": 1.2848424911499023, + "learning_rate": 9.323616144512612e-06, + "loss": 0.0343, + "step": 685 + }, + { + "epoch": 0.85, + "grad_norm": 1.860479474067688, + "learning_rate": 9.321657187348689e-06, + "loss": 0.0581, + "step": 686 + }, + { + "epoch": 0.85, + "grad_norm": 1.3358099460601807, + "learning_rate": 9.319695603911306e-06, + "loss": 0.059, + "step": 687 + }, + { + "epoch": 0.85, + "grad_norm": 0.8692423701286316, + "learning_rate": 9.317731395392517e-06, + "loss": 0.0332, + "step": 688 + }, + { + "epoch": 0.85, + "grad_norm": 1.4998887777328491, + "learning_rate": 9.315764562985976e-06, + "loss": 0.0485, + "step": 689 + }, + { + "epoch": 0.85, + "grad_norm": 0.5280508995056152, + "learning_rate": 9.313795107886925e-06, + "loss": 0.0249, + "step": 690 + }, + { + "epoch": 0.86, + "grad_norm": 0.7580534219741821, + "learning_rate": 9.311823031292205e-06, + "loss": 0.0372, + "step": 691 + }, + { + "epoch": 0.86, + "grad_norm": 0.7582796216011047, + "learning_rate": 9.309848334400247e-06, + "loss": 0.0326, + "step": 692 + }, + { + "epoch": 0.86, + "grad_norm": 0.6401865482330322, + "learning_rate": 9.307871018411074e-06, + "loss": 0.0301, + "step": 693 + }, + { + "epoch": 0.86, + "grad_norm": 2.024916410446167, + "learning_rate": 9.305891084526306e-06, + "loss": 0.0723, + "step": 694 + }, + { + "epoch": 0.86, + "grad_norm": 2.180551767349243, + "learning_rate": 9.303908533949146e-06, + "loss": 0.0639, + "step": 695 + }, + { + "epoch": 0.86, + "grad_norm": 0.7816917896270752, + "learning_rate": 9.301923367884393e-06, + "loss": 0.0366, + "step": 696 + }, + { + "epoch": 0.86, + "grad_norm": 0.7270790934562683, + "learning_rate": 9.299935587538432e-06, + "loss": 0.0421, + "step": 697 + }, + { + "epoch": 0.86, + "grad_norm": 0.8784447312355042, + "learning_rate": 9.29794519411924e-06, + "loss": 0.043, + "step": 698 + }, + { + "epoch": 0.87, + "grad_norm": 0.6736301779747009, + "learning_rate": 9.29595218883638e-06, + "loss": 0.047, + "step": 699 + }, + { + "epoch": 0.87, + "grad_norm": 1.0458660125732422, + "learning_rate": 9.293956572900999e-06, + "loss": 0.0295, + "step": 700 + }, + { + "epoch": 0.87, + "grad_norm": 0.8319834470748901, + "learning_rate": 9.29195834752584e-06, + "loss": 0.0606, + "step": 701 + }, + { + "epoch": 0.87, + "grad_norm": 1.5236587524414062, + "learning_rate": 9.28995751392522e-06, + "loss": 0.0405, + "step": 702 + }, + { + "epoch": 0.87, + "grad_norm": 1.4151524305343628, + "learning_rate": 9.28795407331505e-06, + "loss": 0.0397, + "step": 703 + }, + { + "epoch": 0.87, + "grad_norm": 1.9959708452224731, + "learning_rate": 9.285948026912822e-06, + "loss": 0.0715, + "step": 704 + }, + { + "epoch": 0.87, + "grad_norm": 0.5822674632072449, + "learning_rate": 9.283939375937609e-06, + "loss": 0.0281, + "step": 705 + }, + { + "epoch": 0.87, + "grad_norm": 0.7008696794509888, + "learning_rate": 9.28192812161007e-06, + "loss": 0.0486, + "step": 706 + }, + { + "epoch": 0.88, + "grad_norm": 0.7523006796836853, + "learning_rate": 9.279914265152448e-06, + "loss": 0.0505, + "step": 707 + }, + { + "epoch": 0.88, + "grad_norm": 1.051295518875122, + "learning_rate": 9.277897807788562e-06, + "loss": 0.0499, + "step": 708 + }, + { + "epoch": 0.88, + "grad_norm": 0.8184940218925476, + "learning_rate": 9.275878750743818e-06, + "loss": 0.0422, + "step": 709 + }, + { + "epoch": 0.88, + "grad_norm": 1.372441291809082, + "learning_rate": 9.273857095245192e-06, + "loss": 0.0633, + "step": 710 + }, + { + "epoch": 0.88, + "grad_norm": 0.6757863759994507, + "learning_rate": 9.271832842521249e-06, + "loss": 0.0366, + "step": 711 + }, + { + "epoch": 0.88, + "grad_norm": 0.7655669450759888, + "learning_rate": 9.26980599380213e-06, + "loss": 0.0389, + "step": 712 + }, + { + "epoch": 0.88, + "grad_norm": 1.1087899208068848, + "learning_rate": 9.267776550319548e-06, + "loss": 0.0433, + "step": 713 + }, + { + "epoch": 0.88, + "grad_norm": 1.6310410499572754, + "learning_rate": 9.265744513306798e-06, + "loss": 0.0471, + "step": 714 + }, + { + "epoch": 0.88, + "grad_norm": 1.9184622764587402, + "learning_rate": 9.263709883998753e-06, + "loss": 0.0679, + "step": 715 + }, + { + "epoch": 0.89, + "grad_norm": 2.0910892486572266, + "learning_rate": 9.261672663631854e-06, + "loss": 0.0551, + "step": 716 + }, + { + "epoch": 0.89, + "grad_norm": 2.9525444507598877, + "learning_rate": 9.259632853444126e-06, + "loss": 0.0682, + "step": 717 + }, + { + "epoch": 0.89, + "grad_norm": 1.773461103439331, + "learning_rate": 9.257590454675159e-06, + "loss": 0.0441, + "step": 718 + }, + { + "epoch": 0.89, + "grad_norm": 0.9130051136016846, + "learning_rate": 9.255545468566119e-06, + "loss": 0.0454, + "step": 719 + }, + { + "epoch": 0.89, + "grad_norm": 0.34200993180274963, + "learning_rate": 9.253497896359749e-06, + "loss": 0.0119, + "step": 720 + }, + { + "epoch": 0.89, + "grad_norm": 1.0717602968215942, + "learning_rate": 9.251447739300356e-06, + "loss": 0.0552, + "step": 721 + }, + { + "epoch": 0.89, + "grad_norm": 1.0619879961013794, + "learning_rate": 9.249394998633825e-06, + "loss": 0.0568, + "step": 722 + }, + { + "epoch": 0.89, + "grad_norm": 0.8811701536178589, + "learning_rate": 9.247339675607606e-06, + "loss": 0.034, + "step": 723 + }, + { + "epoch": 0.9, + "grad_norm": 0.974205493927002, + "learning_rate": 9.24528177147072e-06, + "loss": 0.0398, + "step": 724 + }, + { + "epoch": 0.9, + "grad_norm": 0.8818910717964172, + "learning_rate": 9.243221287473755e-06, + "loss": 0.048, + "step": 725 + }, + { + "epoch": 0.9, + "grad_norm": 0.6580934524536133, + "learning_rate": 9.241158224868871e-06, + "loss": 0.042, + "step": 726 + }, + { + "epoch": 0.9, + "grad_norm": 1.4452764987945557, + "learning_rate": 9.23909258490979e-06, + "loss": 0.0438, + "step": 727 + }, + { + "epoch": 0.9, + "grad_norm": 0.6177107095718384, + "learning_rate": 9.237024368851805e-06, + "loss": 0.0434, + "step": 728 + }, + { + "epoch": 0.9, + "grad_norm": 0.6715316772460938, + "learning_rate": 9.23495357795177e-06, + "loss": 0.0242, + "step": 729 + }, + { + "epoch": 0.9, + "grad_norm": 1.8438655138015747, + "learning_rate": 9.232880213468106e-06, + "loss": 0.0421, + "step": 730 + }, + { + "epoch": 0.9, + "grad_norm": 1.011062741279602, + "learning_rate": 9.230804276660799e-06, + "loss": 0.0465, + "step": 731 + }, + { + "epoch": 0.91, + "grad_norm": 1.2409260272979736, + "learning_rate": 9.228725768791394e-06, + "loss": 0.029, + "step": 732 + }, + { + "epoch": 0.91, + "grad_norm": 1.2052364349365234, + "learning_rate": 9.226644691123006e-06, + "loss": 0.0465, + "step": 733 + }, + { + "epoch": 0.91, + "grad_norm": 0.60611891746521, + "learning_rate": 9.224561044920303e-06, + "loss": 0.0328, + "step": 734 + }, + { + "epoch": 0.91, + "grad_norm": 0.4640844464302063, + "learning_rate": 9.222474831449519e-06, + "loss": 0.0202, + "step": 735 + }, + { + "epoch": 0.91, + "grad_norm": 1.9622972011566162, + "learning_rate": 9.220386051978449e-06, + "loss": 0.0651, + "step": 736 + }, + { + "epoch": 0.91, + "grad_norm": 1.8986101150512695, + "learning_rate": 9.218294707776441e-06, + "loss": 0.0556, + "step": 737 + }, + { + "epoch": 0.91, + "grad_norm": 1.158408284187317, + "learning_rate": 9.216200800114412e-06, + "loss": 0.0368, + "step": 738 + }, + { + "epoch": 0.91, + "grad_norm": 0.9851293563842773, + "learning_rate": 9.214104330264826e-06, + "loss": 0.053, + "step": 739 + }, + { + "epoch": 0.92, + "grad_norm": 1.1018086671829224, + "learning_rate": 9.212005299501712e-06, + "loss": 0.0597, + "step": 740 + }, + { + "epoch": 0.92, + "grad_norm": 1.84424889087677, + "learning_rate": 9.20990370910065e-06, + "loss": 0.0497, + "step": 741 + }, + { + "epoch": 0.92, + "grad_norm": 1.2366299629211426, + "learning_rate": 9.207799560338779e-06, + "loss": 0.0602, + "step": 742 + }, + { + "epoch": 0.92, + "grad_norm": 1.1586567163467407, + "learning_rate": 9.20569285449479e-06, + "loss": 0.0316, + "step": 743 + }, + { + "epoch": 0.92, + "grad_norm": 0.6110067367553711, + "learning_rate": 9.20358359284893e-06, + "loss": 0.0305, + "step": 744 + }, + { + "epoch": 0.92, + "grad_norm": 0.6773253679275513, + "learning_rate": 9.201471776682999e-06, + "loss": 0.036, + "step": 745 + }, + { + "epoch": 0.92, + "grad_norm": 0.9832028150558472, + "learning_rate": 9.199357407280349e-06, + "loss": 0.0381, + "step": 746 + }, + { + "epoch": 0.92, + "grad_norm": 1.0233718156814575, + "learning_rate": 9.197240485925883e-06, + "loss": 0.0549, + "step": 747 + }, + { + "epoch": 0.93, + "grad_norm": 2.125337839126587, + "learning_rate": 9.195121013906055e-06, + "loss": 0.0776, + "step": 748 + }, + { + "epoch": 0.93, + "grad_norm": 1.2079508304595947, + "learning_rate": 9.19299899250887e-06, + "loss": 0.0384, + "step": 749 + }, + { + "epoch": 0.93, + "grad_norm": 1.0452898740768433, + "learning_rate": 9.19087442302388e-06, + "loss": 0.0387, + "step": 750 + }, + { + "epoch": 0.93, + "grad_norm": 0.8497399687767029, + "learning_rate": 9.18874730674219e-06, + "loss": 0.0386, + "step": 751 + }, + { + "epoch": 0.93, + "grad_norm": 2.1464147567749023, + "learning_rate": 9.186617644956445e-06, + "loss": 0.0725, + "step": 752 + }, + { + "epoch": 0.93, + "grad_norm": 0.4441956579685211, + "learning_rate": 9.184485438960846e-06, + "loss": 0.0214, + "step": 753 + }, + { + "epoch": 0.93, + "grad_norm": 0.818230390548706, + "learning_rate": 9.182350690051134e-06, + "loss": 0.0256, + "step": 754 + }, + { + "epoch": 0.93, + "grad_norm": 1.0162849426269531, + "learning_rate": 9.180213399524599e-06, + "loss": 0.0592, + "step": 755 + }, + { + "epoch": 0.94, + "grad_norm": 0.9444966316223145, + "learning_rate": 9.178073568680071e-06, + "loss": 0.0293, + "step": 756 + }, + { + "epoch": 0.94, + "grad_norm": 0.7616766691207886, + "learning_rate": 9.175931198817926e-06, + "loss": 0.0481, + "step": 757 + }, + { + "epoch": 0.94, + "grad_norm": 0.47808611392974854, + "learning_rate": 9.173786291240085e-06, + "loss": 0.0287, + "step": 758 + }, + { + "epoch": 0.94, + "grad_norm": 0.6669220328330994, + "learning_rate": 9.17163884725001e-06, + "loss": 0.0324, + "step": 759 + }, + { + "epoch": 0.94, + "grad_norm": 0.8807569146156311, + "learning_rate": 9.169488868152704e-06, + "loss": 0.0425, + "step": 760 + }, + { + "epoch": 0.94, + "grad_norm": 1.2071596384048462, + "learning_rate": 9.16733635525471e-06, + "loss": 0.046, + "step": 761 + }, + { + "epoch": 0.94, + "grad_norm": 1.2434258460998535, + "learning_rate": 9.165181309864108e-06, + "loss": 0.0383, + "step": 762 + }, + { + "epoch": 0.94, + "grad_norm": 0.7151886820793152, + "learning_rate": 9.163023733290525e-06, + "loss": 0.0381, + "step": 763 + }, + { + "epoch": 0.95, + "grad_norm": 0.6364666223526001, + "learning_rate": 9.16086362684512e-06, + "loss": 0.0328, + "step": 764 + }, + { + "epoch": 0.95, + "grad_norm": 1.2846086025238037, + "learning_rate": 9.15870099184059e-06, + "loss": 0.0317, + "step": 765 + }, + { + "epoch": 0.95, + "grad_norm": 1.7031409740447998, + "learning_rate": 9.15653582959117e-06, + "loss": 0.0416, + "step": 766 + }, + { + "epoch": 0.95, + "grad_norm": 1.8931663036346436, + "learning_rate": 9.154368141412632e-06, + "loss": 0.0544, + "step": 767 + }, + { + "epoch": 0.95, + "grad_norm": 0.5589671730995178, + "learning_rate": 9.152197928622278e-06, + "loss": 0.0204, + "step": 768 + }, + { + "epoch": 0.95, + "grad_norm": 0.7534042596817017, + "learning_rate": 9.15002519253895e-06, + "loss": 0.0291, + "step": 769 + }, + { + "epoch": 0.95, + "grad_norm": 0.8194689750671387, + "learning_rate": 9.147849934483019e-06, + "loss": 0.0363, + "step": 770 + }, + { + "epoch": 0.95, + "grad_norm": 1.4425467252731323, + "learning_rate": 9.145672155776392e-06, + "loss": 0.0583, + "step": 771 + }, + { + "epoch": 0.96, + "grad_norm": 1.4742876291275024, + "learning_rate": 9.143491857742505e-06, + "loss": 0.0577, + "step": 772 + }, + { + "epoch": 0.96, + "grad_norm": 0.5303352475166321, + "learning_rate": 9.14130904170633e-06, + "loss": 0.0311, + "step": 773 + }, + { + "epoch": 0.96, + "grad_norm": 0.7389684915542603, + "learning_rate": 9.13912370899436e-06, + "loss": 0.028, + "step": 774 + }, + { + "epoch": 0.96, + "grad_norm": 1.5198121070861816, + "learning_rate": 9.136935860934628e-06, + "loss": 0.0461, + "step": 775 + }, + { + "epoch": 0.96, + "grad_norm": 1.799206256866455, + "learning_rate": 9.134745498856685e-06, + "loss": 0.0478, + "step": 776 + }, + { + "epoch": 0.96, + "grad_norm": 1.1272491216659546, + "learning_rate": 9.13255262409162e-06, + "loss": 0.0495, + "step": 777 + }, + { + "epoch": 0.96, + "grad_norm": 1.0748385190963745, + "learning_rate": 9.130357237972044e-06, + "loss": 0.0388, + "step": 778 + }, + { + "epoch": 0.96, + "grad_norm": 0.8800269961357117, + "learning_rate": 9.128159341832092e-06, + "loss": 0.0233, + "step": 779 + }, + { + "epoch": 0.97, + "grad_norm": 0.6652606129646301, + "learning_rate": 9.125958937007427e-06, + "loss": 0.0401, + "step": 780 + }, + { + "epoch": 0.97, + "grad_norm": 0.7951803207397461, + "learning_rate": 9.123756024835237e-06, + "loss": 0.0194, + "step": 781 + }, + { + "epoch": 0.97, + "grad_norm": 0.6082125902175903, + "learning_rate": 9.121550606654232e-06, + "loss": 0.0221, + "step": 782 + }, + { + "epoch": 0.97, + "grad_norm": 1.656269907951355, + "learning_rate": 9.119342683804649e-06, + "loss": 0.0267, + "step": 783 + }, + { + "epoch": 0.97, + "grad_norm": 1.3084255456924438, + "learning_rate": 9.11713225762824e-06, + "loss": 0.0476, + "step": 784 + }, + { + "epoch": 0.97, + "grad_norm": 0.8326955437660217, + "learning_rate": 9.114919329468283e-06, + "loss": 0.0223, + "step": 785 + }, + { + "epoch": 0.97, + "grad_norm": 0.612882673740387, + "learning_rate": 9.112703900669577e-06, + "loss": 0.0186, + "step": 786 + }, + { + "epoch": 0.97, + "grad_norm": 1.0400992631912231, + "learning_rate": 9.110485972578439e-06, + "loss": 0.0494, + "step": 787 + }, + { + "epoch": 0.98, + "grad_norm": 0.9465930461883545, + "learning_rate": 9.108265546542705e-06, + "loss": 0.0336, + "step": 788 + }, + { + "epoch": 0.98, + "grad_norm": 0.8121449947357178, + "learning_rate": 9.106042623911728e-06, + "loss": 0.0392, + "step": 789 + }, + { + "epoch": 0.98, + "grad_norm": 1.7355393171310425, + "learning_rate": 9.103817206036383e-06, + "loss": 0.0492, + "step": 790 + }, + { + "epoch": 0.98, + "grad_norm": 0.5920339822769165, + "learning_rate": 9.101589294269054e-06, + "loss": 0.0354, + "step": 791 + }, + { + "epoch": 0.98, + "grad_norm": 1.1976126432418823, + "learning_rate": 9.099358889963643e-06, + "loss": 0.0618, + "step": 792 + }, + { + "epoch": 0.98, + "grad_norm": 1.0642493963241577, + "learning_rate": 9.097125994475572e-06, + "loss": 0.0555, + "step": 793 + }, + { + "epoch": 0.98, + "grad_norm": 1.2092516422271729, + "learning_rate": 9.09489060916177e-06, + "loss": 0.0391, + "step": 794 + }, + { + "epoch": 0.98, + "grad_norm": 0.67398601770401, + "learning_rate": 9.092652735380683e-06, + "loss": 0.0196, + "step": 795 + }, + { + "epoch": 0.99, + "grad_norm": 0.8952963948249817, + "learning_rate": 9.09041237449227e-06, + "loss": 0.0246, + "step": 796 + }, + { + "epoch": 0.99, + "grad_norm": 0.7937426567077637, + "learning_rate": 9.088169527857996e-06, + "loss": 0.0449, + "step": 797 + }, + { + "epoch": 0.99, + "grad_norm": 1.0983673334121704, + "learning_rate": 9.085924196840841e-06, + "loss": 0.0577, + "step": 798 + }, + { + "epoch": 0.99, + "grad_norm": 1.7625383138656616, + "learning_rate": 9.083676382805295e-06, + "loss": 0.0609, + "step": 799 + }, + { + "epoch": 0.99, + "grad_norm": 1.6659592390060425, + "learning_rate": 9.081426087117356e-06, + "loss": 0.0453, + "step": 800 + } + ], + "logging_steps": 1.0, + "max_steps": 4040, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 100, + "total_flos": 2.335555778196275e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}