{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997539975399754, "eval_steps": 500, "global_step": 2032, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.2452259659767151, "learning_rate": 1.639344262295082e-06, "loss": 0.9804, "step": 1 }, { "epoch": 0.0, "grad_norm": 0.2644808888435364, "learning_rate": 3.278688524590164e-06, "loss": 0.8989, "step": 2 }, { "epoch": 0.0, "grad_norm": 0.32521888613700867, "learning_rate": 4.918032786885246e-06, "loss": 0.882, "step": 3 }, { "epoch": 0.0, "grad_norm": 0.22439579665660858, "learning_rate": 6.557377049180328e-06, "loss": 0.901, "step": 4 }, { "epoch": 0.0, "grad_norm": 0.29476603865623474, "learning_rate": 8.196721311475409e-06, "loss": 0.9564, "step": 5 }, { "epoch": 0.0, "grad_norm": 0.24640238285064697, "learning_rate": 9.836065573770493e-06, "loss": 0.8826, "step": 6 }, { "epoch": 0.0, "grad_norm": 0.270263135433197, "learning_rate": 1.1475409836065575e-05, "loss": 0.8175, "step": 7 }, { "epoch": 0.0, "grad_norm": 0.23105952143669128, "learning_rate": 1.3114754098360657e-05, "loss": 0.9349, "step": 8 }, { "epoch": 0.0, "grad_norm": 0.23050780594348907, "learning_rate": 1.4754098360655739e-05, "loss": 0.9568, "step": 9 }, { "epoch": 0.0, "grad_norm": 0.24696408212184906, "learning_rate": 1.6393442622950818e-05, "loss": 0.9029, "step": 10 }, { "epoch": 0.01, "grad_norm": 0.25463977456092834, "learning_rate": 1.8032786885245903e-05, "loss": 0.9225, "step": 11 }, { "epoch": 0.01, "grad_norm": 0.2964908182621002, "learning_rate": 1.9672131147540985e-05, "loss": 0.9102, "step": 12 }, { "epoch": 0.01, "grad_norm": 0.25669577717781067, "learning_rate": 2.1311475409836064e-05, "loss": 0.9628, "step": 13 }, { "epoch": 0.01, "grad_norm": 0.20238137245178223, "learning_rate": 2.295081967213115e-05, "loss": 0.9002, "step": 14 }, { "epoch": 0.01, "grad_norm": 0.28775113821029663, "learning_rate": 2.459016393442623e-05, "loss": 0.8915, "step": 15 }, { "epoch": 0.01, "grad_norm": 0.21479631960391998, "learning_rate": 2.6229508196721314e-05, "loss": 0.7704, "step": 16 }, { "epoch": 0.01, "grad_norm": 0.2660934031009674, "learning_rate": 2.7868852459016392e-05, "loss": 0.8712, "step": 17 }, { "epoch": 0.01, "grad_norm": 0.23872637748718262, "learning_rate": 2.9508196721311478e-05, "loss": 0.7705, "step": 18 }, { "epoch": 0.01, "grad_norm": 0.31671276688575745, "learning_rate": 3.114754098360656e-05, "loss": 0.7981, "step": 19 }, { "epoch": 0.01, "grad_norm": 0.3238251507282257, "learning_rate": 3.2786885245901635e-05, "loss": 0.7499, "step": 20 }, { "epoch": 0.01, "grad_norm": 0.3827343285083771, "learning_rate": 3.442622950819672e-05, "loss": 0.7658, "step": 21 }, { "epoch": 0.01, "grad_norm": 0.44403713941574097, "learning_rate": 3.6065573770491806e-05, "loss": 0.8293, "step": 22 }, { "epoch": 0.01, "grad_norm": 0.2889109253883362, "learning_rate": 3.7704918032786885e-05, "loss": 0.6778, "step": 23 }, { "epoch": 0.01, "grad_norm": 0.3636094331741333, "learning_rate": 3.934426229508197e-05, "loss": 0.7818, "step": 24 }, { "epoch": 0.01, "grad_norm": 0.27610665559768677, "learning_rate": 4.098360655737705e-05, "loss": 0.7463, "step": 25 }, { "epoch": 0.01, "grad_norm": 0.25867345929145813, "learning_rate": 4.262295081967213e-05, "loss": 0.7244, "step": 26 }, { "epoch": 0.01, "grad_norm": 0.1986331045627594, "learning_rate": 4.426229508196721e-05, "loss": 0.72, "step": 27 }, { "epoch": 0.01, "grad_norm": 0.18041302263736725, "learning_rate": 4.59016393442623e-05, "loss": 0.6641, "step": 28 }, { "epoch": 0.01, "grad_norm": 0.17338967323303223, "learning_rate": 4.754098360655738e-05, "loss": 0.706, "step": 29 }, { "epoch": 0.01, "grad_norm": 0.1726648509502411, "learning_rate": 4.918032786885246e-05, "loss": 0.6779, "step": 30 }, { "epoch": 0.02, "grad_norm": 0.15769243240356445, "learning_rate": 5.081967213114754e-05, "loss": 0.6919, "step": 31 }, { "epoch": 0.02, "grad_norm": 0.1541094332933426, "learning_rate": 5.245901639344263e-05, "loss": 0.6375, "step": 32 }, { "epoch": 0.02, "grad_norm": 0.14119213819503784, "learning_rate": 5.409836065573771e-05, "loss": 0.6859, "step": 33 }, { "epoch": 0.02, "grad_norm": 0.19775144755840302, "learning_rate": 5.5737704918032785e-05, "loss": 0.6393, "step": 34 }, { "epoch": 0.02, "grad_norm": 0.1325417459011078, "learning_rate": 5.737704918032787e-05, "loss": 0.6174, "step": 35 }, { "epoch": 0.02, "grad_norm": 0.1452002078294754, "learning_rate": 5.9016393442622956e-05, "loss": 0.6482, "step": 36 }, { "epoch": 0.02, "grad_norm": 0.13623154163360596, "learning_rate": 6.0655737704918034e-05, "loss": 0.6479, "step": 37 }, { "epoch": 0.02, "grad_norm": 0.11944164335727692, "learning_rate": 6.229508196721313e-05, "loss": 0.5778, "step": 38 }, { "epoch": 0.02, "grad_norm": 0.1125865951180458, "learning_rate": 6.39344262295082e-05, "loss": 0.6524, "step": 39 }, { "epoch": 0.02, "grad_norm": 0.1162809506058693, "learning_rate": 6.557377049180327e-05, "loss": 0.6309, "step": 40 }, { "epoch": 0.02, "grad_norm": 0.10253722220659256, "learning_rate": 6.721311475409836e-05, "loss": 0.6405, "step": 41 }, { "epoch": 0.02, "grad_norm": 0.10489486902952194, "learning_rate": 6.885245901639344e-05, "loss": 0.5481, "step": 42 }, { "epoch": 0.02, "grad_norm": 0.12117042392492294, "learning_rate": 7.049180327868853e-05, "loss": 0.5875, "step": 43 }, { "epoch": 0.02, "grad_norm": 0.10182766616344452, "learning_rate": 7.213114754098361e-05, "loss": 0.6222, "step": 44 }, { "epoch": 0.02, "grad_norm": 0.10895183682441711, "learning_rate": 7.377049180327869e-05, "loss": 0.6127, "step": 45 }, { "epoch": 0.02, "grad_norm": 0.1138620525598526, "learning_rate": 7.540983606557377e-05, "loss": 0.6718, "step": 46 }, { "epoch": 0.02, "grad_norm": 0.10725389420986176, "learning_rate": 7.704918032786885e-05, "loss": 0.6685, "step": 47 }, { "epoch": 0.02, "grad_norm": 0.09819550067186356, "learning_rate": 7.868852459016394e-05, "loss": 0.5724, "step": 48 }, { "epoch": 0.02, "grad_norm": 0.09749134629964828, "learning_rate": 8.032786885245902e-05, "loss": 0.6499, "step": 49 }, { "epoch": 0.02, "grad_norm": 0.09728589653968811, "learning_rate": 8.19672131147541e-05, "loss": 0.5492, "step": 50 }, { "epoch": 0.03, "grad_norm": 0.11725353449583054, "learning_rate": 8.360655737704919e-05, "loss": 0.607, "step": 51 }, { "epoch": 0.03, "grad_norm": 0.10481224209070206, "learning_rate": 8.524590163934426e-05, "loss": 0.6346, "step": 52 }, { "epoch": 0.03, "grad_norm": 0.09270990639925003, "learning_rate": 8.688524590163935e-05, "loss": 0.595, "step": 53 }, { "epoch": 0.03, "grad_norm": 0.11615060269832611, "learning_rate": 8.852459016393443e-05, "loss": 0.6588, "step": 54 }, { "epoch": 0.03, "grad_norm": 0.09314726293087006, "learning_rate": 9.016393442622952e-05, "loss": 0.6268, "step": 55 }, { "epoch": 0.03, "grad_norm": 0.11429200321435928, "learning_rate": 9.18032786885246e-05, "loss": 0.5745, "step": 56 }, { "epoch": 0.03, "grad_norm": 0.08543388545513153, "learning_rate": 9.344262295081968e-05, "loss": 0.5982, "step": 57 }, { "epoch": 0.03, "grad_norm": 0.09446101635694504, "learning_rate": 9.508196721311476e-05, "loss": 0.615, "step": 58 }, { "epoch": 0.03, "grad_norm": 0.10001399368047714, "learning_rate": 9.672131147540983e-05, "loss": 0.5915, "step": 59 }, { "epoch": 0.03, "grad_norm": 0.08994739502668381, "learning_rate": 9.836065573770493e-05, "loss": 0.6082, "step": 60 }, { "epoch": 0.03, "grad_norm": 0.09548361599445343, "learning_rate": 0.0001, "loss": 0.5996, "step": 61 }, { "epoch": 0.03, "grad_norm": 0.10291488468647003, "learning_rate": 9.999993648644623e-05, "loss": 0.6083, "step": 62 }, { "epoch": 0.03, "grad_norm": 0.08348388224840164, "learning_rate": 9.999974594594624e-05, "loss": 0.5786, "step": 63 }, { "epoch": 0.03, "grad_norm": 0.0993671789765358, "learning_rate": 9.999942837898412e-05, "loss": 0.5556, "step": 64 }, { "epoch": 0.03, "grad_norm": 0.13769622147083282, "learning_rate": 9.999898378636666e-05, "loss": 0.6054, "step": 65 }, { "epoch": 0.03, "grad_norm": 0.17541350424289703, "learning_rate": 9.999841216922338e-05, "loss": 0.5628, "step": 66 }, { "epoch": 0.03, "grad_norm": 0.10783867537975311, "learning_rate": 9.999771352900648e-05, "loss": 0.6522, "step": 67 }, { "epoch": 0.03, "grad_norm": 0.09848761558532715, "learning_rate": 9.999688786749088e-05, "loss": 0.5674, "step": 68 }, { "epoch": 0.03, "grad_norm": 0.08708276599645615, "learning_rate": 9.999593518677423e-05, "loss": 0.5638, "step": 69 }, { "epoch": 0.03, "grad_norm": 0.09381628036499023, "learning_rate": 9.999485548927685e-05, "loss": 0.5981, "step": 70 }, { "epoch": 0.03, "grad_norm": 0.10708080977201462, "learning_rate": 9.999364877774174e-05, "loss": 0.594, "step": 71 }, { "epoch": 0.04, "grad_norm": 0.09192465990781784, "learning_rate": 9.999231505523463e-05, "loss": 0.5561, "step": 72 }, { "epoch": 0.04, "grad_norm": 0.10388699173927307, "learning_rate": 9.999085432514387e-05, "loss": 0.5954, "step": 73 }, { "epoch": 0.04, "grad_norm": 0.09248830378055573, "learning_rate": 9.998926659118051e-05, "loss": 0.5628, "step": 74 }, { "epoch": 0.04, "grad_norm": 0.10650835186243057, "learning_rate": 9.998755185737827e-05, "loss": 0.6306, "step": 75 }, { "epoch": 0.04, "grad_norm": 0.10891000926494598, "learning_rate": 9.998571012809351e-05, "loss": 0.6052, "step": 76 }, { "epoch": 0.04, "grad_norm": 0.0916590541601181, "learning_rate": 9.998374140800519e-05, "loss": 0.5604, "step": 77 }, { "epoch": 0.04, "grad_norm": 0.09587835520505905, "learning_rate": 9.998164570211495e-05, "loss": 0.5779, "step": 78 }, { "epoch": 0.04, "grad_norm": 0.09187649190425873, "learning_rate": 9.997942301574701e-05, "loss": 0.5386, "step": 79 }, { "epoch": 0.04, "grad_norm": 0.10820701718330383, "learning_rate": 9.99770733545482e-05, "loss": 0.6043, "step": 80 }, { "epoch": 0.04, "grad_norm": 0.08947017788887024, "learning_rate": 9.997459672448794e-05, "loss": 0.5429, "step": 81 }, { "epoch": 0.04, "grad_norm": 0.09825446456670761, "learning_rate": 9.997199313185821e-05, "loss": 0.525, "step": 82 }, { "epoch": 0.04, "grad_norm": 0.10226935148239136, "learning_rate": 9.996926258327353e-05, "loss": 0.566, "step": 83 }, { "epoch": 0.04, "grad_norm": 0.11438187956809998, "learning_rate": 9.9966405085671e-05, "loss": 0.6105, "step": 84 }, { "epoch": 0.04, "grad_norm": 0.1341792643070221, "learning_rate": 9.996342064631019e-05, "loss": 0.556, "step": 85 }, { "epoch": 0.04, "grad_norm": 0.1021265983581543, "learning_rate": 9.996030927277323e-05, "loss": 0.6074, "step": 86 }, { "epoch": 0.04, "grad_norm": 0.10654778778553009, "learning_rate": 9.995707097296465e-05, "loss": 0.5416, "step": 87 }, { "epoch": 0.04, "grad_norm": 0.11954265832901001, "learning_rate": 9.995370575511151e-05, "loss": 0.5606, "step": 88 }, { "epoch": 0.04, "grad_norm": 0.12531022727489471, "learning_rate": 9.995021362776328e-05, "loss": 0.6136, "step": 89 }, { "epoch": 0.04, "grad_norm": 0.1123749315738678, "learning_rate": 9.994659459979187e-05, "loss": 0.5912, "step": 90 }, { "epoch": 0.04, "grad_norm": 0.10114055126905441, "learning_rate": 9.994284868039156e-05, "loss": 0.5474, "step": 91 }, { "epoch": 0.05, "grad_norm": 0.0987151488661766, "learning_rate": 9.993897587907903e-05, "loss": 0.5591, "step": 92 }, { "epoch": 0.05, "grad_norm": 0.11638948321342468, "learning_rate": 9.993497620569329e-05, "loss": 0.5868, "step": 93 }, { "epoch": 0.05, "grad_norm": 0.13830219209194183, "learning_rate": 9.993084967039567e-05, "loss": 0.5944, "step": 94 }, { "epoch": 0.05, "grad_norm": 0.12190824747085571, "learning_rate": 9.99265962836698e-05, "loss": 0.5374, "step": 95 }, { "epoch": 0.05, "grad_norm": 0.11232335865497589, "learning_rate": 9.992221605632162e-05, "loss": 0.5527, "step": 96 }, { "epoch": 0.05, "grad_norm": 0.11145391315221786, "learning_rate": 9.991770899947926e-05, "loss": 0.5508, "step": 97 }, { "epoch": 0.05, "grad_norm": 0.22503943741321564, "learning_rate": 9.991307512459309e-05, "loss": 0.5629, "step": 98 }, { "epoch": 0.05, "grad_norm": 0.10737734287977219, "learning_rate": 9.990831444343565e-05, "loss": 0.5876, "step": 99 }, { "epoch": 0.05, "grad_norm": 0.09088217467069626, "learning_rate": 9.990342696810168e-05, "loss": 0.5422, "step": 100 }, { "epoch": 0.05, "grad_norm": 0.1034528911113739, "learning_rate": 9.9898412711008e-05, "loss": 0.5203, "step": 101 }, { "epoch": 0.05, "grad_norm": 0.12464069575071335, "learning_rate": 9.989327168489355e-05, "loss": 0.5728, "step": 102 }, { "epoch": 0.05, "grad_norm": 0.11202283203601837, "learning_rate": 9.988800390281931e-05, "loss": 0.5373, "step": 103 }, { "epoch": 0.05, "grad_norm": 0.10540369153022766, "learning_rate": 9.988260937816832e-05, "loss": 0.5803, "step": 104 }, { "epoch": 0.05, "grad_norm": 0.1300932765007019, "learning_rate": 9.98770881246456e-05, "loss": 0.5567, "step": 105 }, { "epoch": 0.05, "grad_norm": 0.11955401301383972, "learning_rate": 9.987144015627809e-05, "loss": 0.5321, "step": 106 }, { "epoch": 0.05, "grad_norm": 0.11116456240415573, "learning_rate": 9.986566548741473e-05, "loss": 0.5351, "step": 107 }, { "epoch": 0.05, "grad_norm": 0.10675757378339767, "learning_rate": 9.985976413272629e-05, "loss": 0.45, "step": 108 }, { "epoch": 0.05, "grad_norm": 0.12946799397468567, "learning_rate": 9.985373610720542e-05, "loss": 0.5294, "step": 109 }, { "epoch": 0.05, "grad_norm": 0.10107728093862534, "learning_rate": 9.984758142616657e-05, "loss": 0.5462, "step": 110 }, { "epoch": 0.05, "grad_norm": 0.11910764127969742, "learning_rate": 9.984130010524597e-05, "loss": 0.6047, "step": 111 }, { "epoch": 0.06, "grad_norm": 0.09973958134651184, "learning_rate": 9.983489216040158e-05, "loss": 0.5399, "step": 112 }, { "epoch": 0.06, "grad_norm": 0.12794210016727448, "learning_rate": 9.982835760791304e-05, "loss": 0.5964, "step": 113 }, { "epoch": 0.06, "grad_norm": 0.10908328741788864, "learning_rate": 9.982169646438167e-05, "loss": 0.5212, "step": 114 }, { "epoch": 0.06, "grad_norm": 0.1857626885175705, "learning_rate": 9.981490874673039e-05, "loss": 0.5316, "step": 115 }, { "epoch": 0.06, "grad_norm": 0.11631382256746292, "learning_rate": 9.980799447220368e-05, "loss": 0.6261, "step": 116 }, { "epoch": 0.06, "grad_norm": 0.11178912967443466, "learning_rate": 9.980095365836753e-05, "loss": 0.5122, "step": 117 }, { "epoch": 0.06, "grad_norm": 0.11617525666952133, "learning_rate": 9.979378632310945e-05, "loss": 0.5166, "step": 118 }, { "epoch": 0.06, "grad_norm": 0.12345264106988907, "learning_rate": 9.978649248463833e-05, "loss": 0.5641, "step": 119 }, { "epoch": 0.06, "grad_norm": 0.12372495979070663, "learning_rate": 9.97790721614845e-05, "loss": 0.5263, "step": 120 }, { "epoch": 0.06, "grad_norm": 0.292542427778244, "learning_rate": 9.977152537249958e-05, "loss": 0.4909, "step": 121 }, { "epoch": 0.06, "grad_norm": 0.122925765812397, "learning_rate": 9.976385213685652e-05, "loss": 0.4783, "step": 122 }, { "epoch": 0.06, "grad_norm": 0.12283392250537872, "learning_rate": 9.975605247404949e-05, "loss": 0.5168, "step": 123 }, { "epoch": 0.06, "grad_norm": 0.13703308999538422, "learning_rate": 9.974812640389388e-05, "loss": 0.5411, "step": 124 }, { "epoch": 0.06, "grad_norm": 0.12067611515522003, "learning_rate": 9.974007394652618e-05, "loss": 0.5356, "step": 125 }, { "epoch": 0.06, "grad_norm": 0.10998321324586868, "learning_rate": 9.973189512240401e-05, "loss": 0.5258, "step": 126 }, { "epoch": 0.06, "grad_norm": 0.11673554033041, "learning_rate": 9.972358995230604e-05, "loss": 0.513, "step": 127 }, { "epoch": 0.06, "grad_norm": 0.10632121562957764, "learning_rate": 9.971515845733185e-05, "loss": 0.5043, "step": 128 }, { "epoch": 0.06, "grad_norm": 0.13670513033866882, "learning_rate": 9.970660065890208e-05, "loss": 0.5615, "step": 129 }, { "epoch": 0.06, "grad_norm": 0.1250362992286682, "learning_rate": 9.969791657875811e-05, "loss": 0.5181, "step": 130 }, { "epoch": 0.06, "grad_norm": 0.12150248885154724, "learning_rate": 9.968910623896225e-05, "loss": 0.5615, "step": 131 }, { "epoch": 0.06, "grad_norm": 0.10510484874248505, "learning_rate": 9.968016966189752e-05, "loss": 0.5, "step": 132 }, { "epoch": 0.07, "grad_norm": 0.1107068881392479, "learning_rate": 9.96711068702677e-05, "loss": 0.4764, "step": 133 }, { "epoch": 0.07, "grad_norm": 0.1341215968132019, "learning_rate": 9.966191788709716e-05, "loss": 0.5772, "step": 134 }, { "epoch": 0.07, "grad_norm": 0.12703493237495422, "learning_rate": 9.965260273573091e-05, "loss": 0.5466, "step": 135 }, { "epoch": 0.07, "grad_norm": 0.12686589360237122, "learning_rate": 9.96431614398345e-05, "loss": 0.5362, "step": 136 }, { "epoch": 0.07, "grad_norm": 0.11782842874526978, "learning_rate": 9.963359402339392e-05, "loss": 0.4791, "step": 137 }, { "epoch": 0.07, "grad_norm": 0.1239015981554985, "learning_rate": 9.96239005107156e-05, "loss": 0.529, "step": 138 }, { "epoch": 0.07, "grad_norm": 0.12432339042425156, "learning_rate": 9.961408092642633e-05, "loss": 0.5124, "step": 139 }, { "epoch": 0.07, "grad_norm": 0.12941674888134003, "learning_rate": 9.960413529547317e-05, "loss": 0.5393, "step": 140 }, { "epoch": 0.07, "grad_norm": 0.13398966193199158, "learning_rate": 9.95940636431234e-05, "loss": 0.5697, "step": 141 }, { "epoch": 0.07, "grad_norm": 0.12811748683452606, "learning_rate": 9.95838659949645e-05, "loss": 0.4851, "step": 142 }, { "epoch": 0.07, "grad_norm": 0.11784821003675461, "learning_rate": 9.957354237690401e-05, "loss": 0.5551, "step": 143 }, { "epoch": 0.07, "grad_norm": 0.13655073940753937, "learning_rate": 9.956309281516954e-05, "loss": 0.5171, "step": 144 }, { "epoch": 0.07, "grad_norm": 0.15821956098079681, "learning_rate": 9.955251733630861e-05, "loss": 0.5612, "step": 145 }, { "epoch": 0.07, "grad_norm": 0.1337892860174179, "learning_rate": 9.954181596718869e-05, "loss": 0.464, "step": 146 }, { "epoch": 0.07, "grad_norm": 0.12777793407440186, "learning_rate": 9.953098873499704e-05, "loss": 0.5269, "step": 147 }, { "epoch": 0.07, "grad_norm": 0.12164995074272156, "learning_rate": 9.952003566724074e-05, "loss": 0.4917, "step": 148 }, { "epoch": 0.07, "grad_norm": 0.12371721118688583, "learning_rate": 9.950895679174648e-05, "loss": 0.5051, "step": 149 }, { "epoch": 0.07, "grad_norm": 0.12011121213436127, "learning_rate": 9.949775213666061e-05, "loss": 0.5593, "step": 150 }, { "epoch": 0.07, "grad_norm": 0.14063221216201782, "learning_rate": 9.948642173044905e-05, "loss": 0.5232, "step": 151 }, { "epoch": 0.07, "grad_norm": 0.11188677698373795, "learning_rate": 9.947496560189718e-05, "loss": 0.5315, "step": 152 }, { "epoch": 0.08, "grad_norm": 0.12956541776657104, "learning_rate": 9.946338378010975e-05, "loss": 0.4877, "step": 153 }, { "epoch": 0.08, "grad_norm": 0.11307179182767868, "learning_rate": 9.945167629451089e-05, "loss": 0.4575, "step": 154 }, { "epoch": 0.08, "grad_norm": 0.1328120082616806, "learning_rate": 9.943984317484395e-05, "loss": 0.5088, "step": 155 }, { "epoch": 0.08, "grad_norm": 0.1312056928873062, "learning_rate": 9.942788445117146e-05, "loss": 0.5125, "step": 156 }, { "epoch": 0.08, "grad_norm": 0.10527380555868149, "learning_rate": 9.941580015387508e-05, "loss": 0.52, "step": 157 }, { "epoch": 0.08, "grad_norm": 0.1504916548728943, "learning_rate": 9.940359031365548e-05, "loss": 0.5361, "step": 158 }, { "epoch": 0.08, "grad_norm": 0.12865802645683289, "learning_rate": 9.939125496153226e-05, "loss": 0.5274, "step": 159 }, { "epoch": 0.08, "grad_norm": 0.12269923090934753, "learning_rate": 9.93787941288439e-05, "loss": 0.5037, "step": 160 }, { "epoch": 0.08, "grad_norm": 0.1158442422747612, "learning_rate": 9.936620784724766e-05, "loss": 0.5069, "step": 161 }, { "epoch": 0.08, "grad_norm": 0.18156588077545166, "learning_rate": 9.935349614871956e-05, "loss": 0.5287, "step": 162 }, { "epoch": 0.08, "grad_norm": 0.13881808519363403, "learning_rate": 9.934065906555416e-05, "loss": 0.493, "step": 163 }, { "epoch": 0.08, "grad_norm": 0.20832446217536926, "learning_rate": 9.932769663036465e-05, "loss": 0.4932, "step": 164 }, { "epoch": 0.08, "grad_norm": 0.13180439174175262, "learning_rate": 9.931460887608261e-05, "loss": 0.4647, "step": 165 }, { "epoch": 0.08, "grad_norm": 0.14434604346752167, "learning_rate": 9.930139583595805e-05, "loss": 0.5153, "step": 166 }, { "epoch": 0.08, "grad_norm": 0.14111323654651642, "learning_rate": 9.928805754355925e-05, "loss": 0.5373, "step": 167 }, { "epoch": 0.08, "grad_norm": 0.12172537297010422, "learning_rate": 9.92745940327727e-05, "loss": 0.4863, "step": 168 }, { "epoch": 0.08, "grad_norm": 0.13874904811382294, "learning_rate": 9.926100533780303e-05, "loss": 0.5713, "step": 169 }, { "epoch": 0.08, "grad_norm": 0.12190142273902893, "learning_rate": 9.92472914931729e-05, "loss": 0.5312, "step": 170 }, { "epoch": 0.08, "grad_norm": 0.12128974497318268, "learning_rate": 9.923345253372287e-05, "loss": 0.4738, "step": 171 }, { "epoch": 0.08, "grad_norm": 0.118557408452034, "learning_rate": 9.921948849461142e-05, "loss": 0.5115, "step": 172 }, { "epoch": 0.09, "grad_norm": 0.12688080966472626, "learning_rate": 9.920539941131481e-05, "loss": 0.5325, "step": 173 }, { "epoch": 0.09, "grad_norm": 0.12694337964057922, "learning_rate": 9.91911853196269e-05, "loss": 0.4877, "step": 174 }, { "epoch": 0.09, "grad_norm": 0.12245693057775497, "learning_rate": 9.917684625565923e-05, "loss": 0.5175, "step": 175 }, { "epoch": 0.09, "grad_norm": 0.11940937489271164, "learning_rate": 9.916238225584077e-05, "loss": 0.4855, "step": 176 }, { "epoch": 0.09, "grad_norm": 0.12251555174589157, "learning_rate": 9.914779335691793e-05, "loss": 0.5249, "step": 177 }, { "epoch": 0.09, "grad_norm": 0.13697026669979095, "learning_rate": 9.913307959595444e-05, "loss": 0.4937, "step": 178 }, { "epoch": 0.09, "grad_norm": 0.11597849428653717, "learning_rate": 9.911824101033121e-05, "loss": 0.4971, "step": 179 }, { "epoch": 0.09, "grad_norm": 0.11831557005643845, "learning_rate": 9.910327763774628e-05, "loss": 0.4827, "step": 180 }, { "epoch": 0.09, "grad_norm": 0.1160125583410263, "learning_rate": 9.908818951621474e-05, "loss": 0.5047, "step": 181 }, { "epoch": 0.09, "grad_norm": 0.13269323110580444, "learning_rate": 9.907297668406863e-05, "loss": 0.4939, "step": 182 }, { "epoch": 0.09, "grad_norm": 0.14000576734542847, "learning_rate": 9.905763917995674e-05, "loss": 0.4523, "step": 183 }, { "epoch": 0.09, "grad_norm": 0.12293880432844162, "learning_rate": 9.904217704284469e-05, "loss": 0.4893, "step": 184 }, { "epoch": 0.09, "grad_norm": 0.13765645027160645, "learning_rate": 9.902659031201464e-05, "loss": 0.4922, "step": 185 }, { "epoch": 0.09, "grad_norm": 0.13345591723918915, "learning_rate": 9.90108790270654e-05, "loss": 0.5337, "step": 186 }, { "epoch": 0.09, "grad_norm": 0.15008282661437988, "learning_rate": 9.89950432279121e-05, "loss": 0.4925, "step": 187 }, { "epoch": 0.09, "grad_norm": 0.12956300377845764, "learning_rate": 9.89790829547863e-05, "loss": 0.5113, "step": 188 }, { "epoch": 0.09, "grad_norm": 0.1294013112783432, "learning_rate": 9.896299824823569e-05, "loss": 0.5074, "step": 189 }, { "epoch": 0.09, "grad_norm": 0.13247185945510864, "learning_rate": 9.89467891491242e-05, "loss": 0.5509, "step": 190 }, { "epoch": 0.09, "grad_norm": 0.12317169457674026, "learning_rate": 9.89304556986317e-05, "loss": 0.4955, "step": 191 }, { "epoch": 0.09, "grad_norm": 0.13177014887332916, "learning_rate": 9.891399793825403e-05, "loss": 0.5226, "step": 192 }, { "epoch": 0.09, "grad_norm": 0.11812160909175873, "learning_rate": 9.889741590980279e-05, "loss": 0.4945, "step": 193 }, { "epoch": 0.1, "grad_norm": 0.12916286289691925, "learning_rate": 9.888070965540534e-05, "loss": 0.4612, "step": 194 }, { "epoch": 0.1, "grad_norm": 0.12834374606609344, "learning_rate": 9.886387921750462e-05, "loss": 0.5039, "step": 195 }, { "epoch": 0.1, "grad_norm": 0.1274203211069107, "learning_rate": 9.88469246388591e-05, "loss": 0.4903, "step": 196 }, { "epoch": 0.1, "grad_norm": 0.1231379434466362, "learning_rate": 9.882984596254254e-05, "loss": 0.5083, "step": 197 }, { "epoch": 0.1, "grad_norm": 0.12806618213653564, "learning_rate": 9.88126432319441e-05, "loss": 0.5052, "step": 198 }, { "epoch": 0.1, "grad_norm": 0.12466035783290863, "learning_rate": 9.879531649076799e-05, "loss": 0.4954, "step": 199 }, { "epoch": 0.1, "grad_norm": 0.1449643224477768, "learning_rate": 9.877786578303357e-05, "loss": 0.5135, "step": 200 }, { "epoch": 0.1, "grad_norm": 0.12989725172519684, "learning_rate": 9.876029115307506e-05, "loss": 0.4917, "step": 201 }, { "epoch": 0.1, "grad_norm": 0.16021104156970978, "learning_rate": 9.874259264554158e-05, "loss": 0.5103, "step": 202 }, { "epoch": 0.1, "grad_norm": 0.13296402990818024, "learning_rate": 9.87247703053969e-05, "loss": 0.5367, "step": 203 }, { "epoch": 0.1, "grad_norm": 0.11390774697065353, "learning_rate": 9.870682417791947e-05, "loss": 0.4317, "step": 204 }, { "epoch": 0.1, "grad_norm": 0.11727535724639893, "learning_rate": 9.868875430870216e-05, "loss": 0.4592, "step": 205 }, { "epoch": 0.1, "grad_norm": 0.12636162340641022, "learning_rate": 9.867056074365222e-05, "loss": 0.4747, "step": 206 }, { "epoch": 0.1, "grad_norm": 0.14426970481872559, "learning_rate": 9.865224352899119e-05, "loss": 0.5039, "step": 207 }, { "epoch": 0.1, "grad_norm": 0.13057394325733185, "learning_rate": 9.863380271125473e-05, "loss": 0.5067, "step": 208 }, { "epoch": 0.1, "grad_norm": 0.13253356516361237, "learning_rate": 9.861523833729249e-05, "loss": 0.5081, "step": 209 }, { "epoch": 0.1, "grad_norm": 0.14303168654441833, "learning_rate": 9.859655045426807e-05, "loss": 0.4965, "step": 210 }, { "epoch": 0.1, "grad_norm": 0.15748755633831024, "learning_rate": 9.85777391096588e-05, "loss": 0.4555, "step": 211 }, { "epoch": 0.1, "grad_norm": 0.13341796398162842, "learning_rate": 9.855880435125571e-05, "loss": 0.4832, "step": 212 }, { "epoch": 0.1, "grad_norm": 0.1341037005186081, "learning_rate": 9.853974622716335e-05, "loss": 0.4789, "step": 213 }, { "epoch": 0.11, "grad_norm": 0.1436738222837448, "learning_rate": 9.85205647857997e-05, "loss": 0.5386, "step": 214 }, { "epoch": 0.11, "grad_norm": 0.17110808193683624, "learning_rate": 9.8501260075896e-05, "loss": 0.5069, "step": 215 }, { "epoch": 0.11, "grad_norm": 0.13706322014331818, "learning_rate": 9.848183214649668e-05, "loss": 0.4708, "step": 216 }, { "epoch": 0.11, "grad_norm": 0.13673759996891022, "learning_rate": 9.846228104695923e-05, "loss": 0.4903, "step": 217 }, { "epoch": 0.11, "grad_norm": 0.1322619765996933, "learning_rate": 9.844260682695402e-05, "loss": 0.5127, "step": 218 }, { "epoch": 0.11, "grad_norm": 0.1301795244216919, "learning_rate": 9.842280953646426e-05, "loss": 0.4286, "step": 219 }, { "epoch": 0.11, "grad_norm": 0.13139507174491882, "learning_rate": 9.840288922578578e-05, "loss": 0.5031, "step": 220 }, { "epoch": 0.11, "grad_norm": 0.12686261534690857, "learning_rate": 9.838284594552697e-05, "loss": 0.4885, "step": 221 }, { "epoch": 0.11, "grad_norm": 0.1369934380054474, "learning_rate": 9.836267974660866e-05, "loss": 0.4838, "step": 222 }, { "epoch": 0.11, "grad_norm": 0.11599792540073395, "learning_rate": 9.834239068026387e-05, "loss": 0.5012, "step": 223 }, { "epoch": 0.11, "grad_norm": 0.13350652158260345, "learning_rate": 9.83219787980379e-05, "loss": 0.4727, "step": 224 }, { "epoch": 0.11, "grad_norm": 0.13019968569278717, "learning_rate": 9.830144415178793e-05, "loss": 0.5038, "step": 225 }, { "epoch": 0.11, "grad_norm": 0.12527818977832794, "learning_rate": 9.828078679368313e-05, "loss": 0.5027, "step": 226 }, { "epoch": 0.11, "grad_norm": 0.14075924456119537, "learning_rate": 9.826000677620439e-05, "loss": 0.519, "step": 227 }, { "epoch": 0.11, "grad_norm": 0.14260035753250122, "learning_rate": 9.823910415214422e-05, "loss": 0.5355, "step": 228 }, { "epoch": 0.11, "grad_norm": 0.13060076534748077, "learning_rate": 9.82180789746066e-05, "loss": 0.4936, "step": 229 }, { "epoch": 0.11, "grad_norm": 0.12088575959205627, "learning_rate": 9.819693129700689e-05, "loss": 0.5063, "step": 230 }, { "epoch": 0.11, "grad_norm": 0.1378595530986786, "learning_rate": 9.817566117307167e-05, "loss": 0.5124, "step": 231 }, { "epoch": 0.11, "grad_norm": 0.12955375015735626, "learning_rate": 9.815426865683857e-05, "loss": 0.4845, "step": 232 }, { "epoch": 0.11, "grad_norm": 0.1291315257549286, "learning_rate": 9.813275380265616e-05, "loss": 0.451, "step": 233 }, { "epoch": 0.12, "grad_norm": 0.12862688302993774, "learning_rate": 9.811111666518387e-05, "loss": 0.4803, "step": 234 }, { "epoch": 0.12, "grad_norm": 0.1328982710838318, "learning_rate": 9.808935729939175e-05, "loss": 0.5266, "step": 235 }, { "epoch": 0.12, "grad_norm": 0.14852191507816315, "learning_rate": 9.806747576056039e-05, "loss": 0.5069, "step": 236 }, { "epoch": 0.12, "grad_norm": 0.13663285970687866, "learning_rate": 9.804547210428075e-05, "loss": 0.5367, "step": 237 }, { "epoch": 0.12, "grad_norm": 0.10936307162046432, "learning_rate": 9.802334638645403e-05, "loss": 0.4395, "step": 238 }, { "epoch": 0.12, "grad_norm": 0.12498772144317627, "learning_rate": 9.800109866329159e-05, "loss": 0.4601, "step": 239 }, { "epoch": 0.12, "grad_norm": 0.13159097731113434, "learning_rate": 9.797872899131468e-05, "loss": 0.4745, "step": 240 }, { "epoch": 0.12, "grad_norm": 0.12591972947120667, "learning_rate": 9.79562374273544e-05, "loss": 0.5015, "step": 241 }, { "epoch": 0.12, "grad_norm": 0.13837288320064545, "learning_rate": 9.793362402855153e-05, "loss": 0.4624, "step": 242 }, { "epoch": 0.12, "grad_norm": 0.13302898406982422, "learning_rate": 9.791088885235634e-05, "loss": 0.5118, "step": 243 }, { "epoch": 0.12, "grad_norm": 0.16310028731822968, "learning_rate": 9.788803195652851e-05, "loss": 0.5182, "step": 244 }, { "epoch": 0.12, "grad_norm": 0.13349036872386932, "learning_rate": 9.786505339913696e-05, "loss": 0.5144, "step": 245 }, { "epoch": 0.12, "grad_norm": 0.13739198446273804, "learning_rate": 9.784195323855969e-05, "loss": 0.5119, "step": 246 }, { "epoch": 0.12, "grad_norm": 0.13266289234161377, "learning_rate": 9.78187315334836e-05, "loss": 0.5032, "step": 247 }, { "epoch": 0.12, "grad_norm": 0.11945129930973053, "learning_rate": 9.779538834290443e-05, "loss": 0.4259, "step": 248 }, { "epoch": 0.12, "grad_norm": 0.11742910742759705, "learning_rate": 9.777192372612654e-05, "loss": 0.4803, "step": 249 }, { "epoch": 0.12, "grad_norm": 0.12939251959323883, "learning_rate": 9.774833774276278e-05, "loss": 0.4612, "step": 250 }, { "epoch": 0.12, "grad_norm": 0.12772531807422638, "learning_rate": 9.77246304527343e-05, "loss": 0.431, "step": 251 }, { "epoch": 0.12, "grad_norm": 0.12651309370994568, "learning_rate": 9.770080191627053e-05, "loss": 0.4862, "step": 252 }, { "epoch": 0.12, "grad_norm": 0.14725381135940552, "learning_rate": 9.767685219390883e-05, "loss": 0.5286, "step": 253 }, { "epoch": 0.12, "grad_norm": 0.1481163203716278, "learning_rate": 9.76527813464945e-05, "loss": 0.5495, "step": 254 }, { "epoch": 0.13, "grad_norm": 0.14147397875785828, "learning_rate": 9.762858943518052e-05, "loss": 0.4974, "step": 255 }, { "epoch": 0.13, "grad_norm": 0.12927831709384918, "learning_rate": 9.76042765214275e-05, "loss": 0.4558, "step": 256 }, { "epoch": 0.13, "grad_norm": 0.1445082128047943, "learning_rate": 9.757984266700336e-05, "loss": 0.4933, "step": 257 }, { "epoch": 0.13, "grad_norm": 0.12772341072559357, "learning_rate": 9.755528793398339e-05, "loss": 0.4761, "step": 258 }, { "epoch": 0.13, "grad_norm": 0.15157930552959442, "learning_rate": 9.753061238474991e-05, "loss": 0.5134, "step": 259 }, { "epoch": 0.13, "grad_norm": 0.13625264167785645, "learning_rate": 9.750581608199218e-05, "loss": 0.4504, "step": 260 }, { "epoch": 0.13, "grad_norm": 0.13586267828941345, "learning_rate": 9.748089908870627e-05, "loss": 0.4685, "step": 261 }, { "epoch": 0.13, "grad_norm": 0.1695537567138672, "learning_rate": 9.745586146819484e-05, "loss": 0.5154, "step": 262 }, { "epoch": 0.13, "grad_norm": 0.15165534615516663, "learning_rate": 9.743070328406703e-05, "loss": 0.4896, "step": 263 }, { "epoch": 0.13, "grad_norm": 0.15566733479499817, "learning_rate": 9.740542460023825e-05, "loss": 0.4959, "step": 264 }, { "epoch": 0.13, "grad_norm": 0.15872296690940857, "learning_rate": 9.738002548093008e-05, "loss": 0.5174, "step": 265 }, { "epoch": 0.13, "grad_norm": 0.1408284306526184, "learning_rate": 9.735450599067005e-05, "loss": 0.52, "step": 266 }, { "epoch": 0.13, "grad_norm": 0.16078899800777435, "learning_rate": 9.732886619429149e-05, "loss": 0.5216, "step": 267 }, { "epoch": 0.13, "grad_norm": 0.13478024303913116, "learning_rate": 9.730310615693338e-05, "loss": 0.4572, "step": 268 }, { "epoch": 0.13, "grad_norm": 0.15019844472408295, "learning_rate": 9.727722594404021e-05, "loss": 0.5133, "step": 269 }, { "epoch": 0.13, "grad_norm": 0.13954737782478333, "learning_rate": 9.725122562136173e-05, "loss": 0.4767, "step": 270 }, { "epoch": 0.13, "grad_norm": 0.13896162807941437, "learning_rate": 9.722510525495286e-05, "loss": 0.4901, "step": 271 }, { "epoch": 0.13, "grad_norm": 0.1498170793056488, "learning_rate": 9.719886491117348e-05, "loss": 0.5052, "step": 272 }, { "epoch": 0.13, "grad_norm": 0.1316058188676834, "learning_rate": 9.71725046566883e-05, "loss": 0.5089, "step": 273 }, { "epoch": 0.13, "grad_norm": 0.15046940743923187, "learning_rate": 9.714602455846666e-05, "loss": 0.5145, "step": 274 }, { "epoch": 0.14, "grad_norm": 0.11596712470054626, "learning_rate": 9.711942468378238e-05, "loss": 0.4336, "step": 275 }, { "epoch": 0.14, "grad_norm": 0.15053020417690277, "learning_rate": 9.709270510021354e-05, "loss": 0.5189, "step": 276 }, { "epoch": 0.14, "grad_norm": 0.13374291360378265, "learning_rate": 9.706586587564237e-05, "loss": 0.4974, "step": 277 }, { "epoch": 0.14, "grad_norm": 0.1439104527235031, "learning_rate": 9.703890707825505e-05, "loss": 0.4876, "step": 278 }, { "epoch": 0.14, "grad_norm": 0.1169486939907074, "learning_rate": 9.701182877654153e-05, "loss": 0.468, "step": 279 }, { "epoch": 0.14, "grad_norm": 0.1369236260652542, "learning_rate": 9.698463103929542e-05, "loss": 0.4662, "step": 280 }, { "epoch": 0.14, "grad_norm": 0.11922533065080643, "learning_rate": 9.69573139356137e-05, "loss": 0.4664, "step": 281 }, { "epoch": 0.14, "grad_norm": 0.1641329973936081, "learning_rate": 9.69298775348966e-05, "loss": 0.5112, "step": 282 }, { "epoch": 0.14, "grad_norm": 0.13259337842464447, "learning_rate": 9.690232190684747e-05, "loss": 0.4908, "step": 283 }, { "epoch": 0.14, "grad_norm": 0.1512133777141571, "learning_rate": 9.687464712147253e-05, "loss": 0.4511, "step": 284 }, { "epoch": 0.14, "grad_norm": 0.1290777325630188, "learning_rate": 9.684685324908077e-05, "loss": 0.4995, "step": 285 }, { "epoch": 0.14, "grad_norm": 0.11606963723897934, "learning_rate": 9.681894036028365e-05, "loss": 0.4535, "step": 286 }, { "epoch": 0.14, "grad_norm": 0.12128835916519165, "learning_rate": 9.679090852599508e-05, "loss": 0.4544, "step": 287 }, { "epoch": 0.14, "grad_norm": 0.12262247502803802, "learning_rate": 9.676275781743111e-05, "loss": 0.4982, "step": 288 }, { "epoch": 0.14, "grad_norm": 0.12359766662120819, "learning_rate": 9.673448830610978e-05, "loss": 0.5105, "step": 289 }, { "epoch": 0.14, "grad_norm": 0.12350254505872726, "learning_rate": 9.6706100063851e-05, "loss": 0.4717, "step": 290 }, { "epoch": 0.14, "grad_norm": 0.11593758314847946, "learning_rate": 9.66775931627763e-05, "loss": 0.4251, "step": 291 }, { "epoch": 0.14, "grad_norm": 0.10588888078927994, "learning_rate": 9.664896767530862e-05, "loss": 0.4145, "step": 292 }, { "epoch": 0.14, "grad_norm": 0.12794917821884155, "learning_rate": 9.662022367417225e-05, "loss": 0.4427, "step": 293 }, { "epoch": 0.14, "grad_norm": 0.1299968808889389, "learning_rate": 9.659136123239255e-05, "loss": 0.5018, "step": 294 }, { "epoch": 0.15, "grad_norm": 0.14009764790534973, "learning_rate": 9.656238042329574e-05, "loss": 0.4562, "step": 295 }, { "epoch": 0.15, "grad_norm": 0.1477385014295578, "learning_rate": 9.65332813205088e-05, "loss": 0.4693, "step": 296 }, { "epoch": 0.15, "grad_norm": 0.1205335333943367, "learning_rate": 9.650406399795923e-05, "loss": 0.4531, "step": 297 }, { "epoch": 0.15, "grad_norm": 0.1287442445755005, "learning_rate": 9.647472852987487e-05, "loss": 0.5161, "step": 298 }, { "epoch": 0.15, "grad_norm": 0.12642082571983337, "learning_rate": 9.644527499078371e-05, "loss": 0.5101, "step": 299 }, { "epoch": 0.15, "grad_norm": 0.13632547855377197, "learning_rate": 9.64157034555137e-05, "loss": 0.4756, "step": 300 }, { "epoch": 0.15, "grad_norm": 0.13950622081756592, "learning_rate": 9.638601399919259e-05, "loss": 0.5162, "step": 301 }, { "epoch": 0.15, "grad_norm": 0.14247675240039825, "learning_rate": 9.635620669724768e-05, "loss": 0.5051, "step": 302 }, { "epoch": 0.15, "grad_norm": 0.13259443640708923, "learning_rate": 9.632628162540568e-05, "loss": 0.4759, "step": 303 }, { "epoch": 0.15, "grad_norm": 0.12039908766746521, "learning_rate": 9.62962388596925e-05, "loss": 0.4733, "step": 304 }, { "epoch": 0.15, "grad_norm": 0.15673649311065674, "learning_rate": 9.626607847643305e-05, "loss": 0.4839, "step": 305 }, { "epoch": 0.15, "grad_norm": 0.14569158852100372, "learning_rate": 9.623580055225105e-05, "loss": 0.5274, "step": 306 }, { "epoch": 0.15, "grad_norm": 0.12974193692207336, "learning_rate": 9.620540516406886e-05, "loss": 0.4364, "step": 307 }, { "epoch": 0.15, "grad_norm": 0.14151649177074432, "learning_rate": 9.617489238910722e-05, "loss": 0.5053, "step": 308 }, { "epoch": 0.15, "grad_norm": 0.12406913936138153, "learning_rate": 9.614426230488514e-05, "loss": 0.4653, "step": 309 }, { "epoch": 0.15, "grad_norm": 0.1319883167743683, "learning_rate": 9.611351498921964e-05, "loss": 0.4623, "step": 310 }, { "epoch": 0.15, "grad_norm": 0.13151374459266663, "learning_rate": 9.608265052022556e-05, "loss": 0.4813, "step": 311 }, { "epoch": 0.15, "grad_norm": 0.14187724888324738, "learning_rate": 9.605166897631538e-05, "loss": 0.4907, "step": 312 }, { "epoch": 0.15, "grad_norm": 0.1293659657239914, "learning_rate": 9.602057043619903e-05, "loss": 0.4659, "step": 313 }, { "epoch": 0.15, "grad_norm": 0.12012505531311035, "learning_rate": 9.598935497888368e-05, "loss": 0.4718, "step": 314 }, { "epoch": 0.15, "grad_norm": 0.12818792462348938, "learning_rate": 9.595802268367348e-05, "loss": 0.4339, "step": 315 }, { "epoch": 0.16, "grad_norm": 0.1206803172826767, "learning_rate": 9.592657363016946e-05, "loss": 0.4326, "step": 316 }, { "epoch": 0.16, "grad_norm": 0.13416646420955658, "learning_rate": 9.589500789826927e-05, "loss": 0.4468, "step": 317 }, { "epoch": 0.16, "grad_norm": 0.13810916244983673, "learning_rate": 9.586332556816699e-05, "loss": 0.5001, "step": 318 }, { "epoch": 0.16, "grad_norm": 0.18333862721920013, "learning_rate": 9.583152672035289e-05, "loss": 0.455, "step": 319 }, { "epoch": 0.16, "grad_norm": 0.1514430195093155, "learning_rate": 9.579961143561332e-05, "loss": 0.5076, "step": 320 }, { "epoch": 0.16, "grad_norm": 0.12491629272699356, "learning_rate": 9.576757979503037e-05, "loss": 0.4427, "step": 321 }, { "epoch": 0.16, "grad_norm": 0.14590425789356232, "learning_rate": 9.57354318799818e-05, "loss": 0.476, "step": 322 }, { "epoch": 0.16, "grad_norm": 0.12693631649017334, "learning_rate": 9.57031677721407e-05, "loss": 0.4437, "step": 323 }, { "epoch": 0.16, "grad_norm": 0.149683877825737, "learning_rate": 9.567078755347546e-05, "loss": 0.5248, "step": 324 }, { "epoch": 0.16, "grad_norm": 0.1543707698583603, "learning_rate": 9.563829130624935e-05, "loss": 0.4565, "step": 325 }, { "epoch": 0.16, "grad_norm": 0.1438606083393097, "learning_rate": 9.560567911302045e-05, "loss": 0.4971, "step": 326 }, { "epoch": 0.16, "grad_norm": 0.13084115087985992, "learning_rate": 9.557295105664144e-05, "loss": 0.4843, "step": 327 }, { "epoch": 0.16, "grad_norm": 0.1604880690574646, "learning_rate": 9.55401072202593e-05, "loss": 0.4602, "step": 328 }, { "epoch": 0.16, "grad_norm": 0.12488158047199249, "learning_rate": 9.550714768731521e-05, "loss": 0.4462, "step": 329 }, { "epoch": 0.16, "grad_norm": 0.14189974963665009, "learning_rate": 9.547407254154421e-05, "loss": 0.5134, "step": 330 }, { "epoch": 0.16, "grad_norm": 0.13690772652626038, "learning_rate": 9.544088186697515e-05, "loss": 0.4586, "step": 331 }, { "epoch": 0.16, "grad_norm": 0.15091572701931, "learning_rate": 9.54075757479303e-05, "loss": 0.5009, "step": 332 }, { "epoch": 0.16, "grad_norm": 0.151514932513237, "learning_rate": 9.53741542690253e-05, "loss": 0.4971, "step": 333 }, { "epoch": 0.16, "grad_norm": 0.16063284873962402, "learning_rate": 9.534061751516877e-05, "loss": 0.5006, "step": 334 }, { "epoch": 0.16, "grad_norm": 0.1446666270494461, "learning_rate": 9.530696557156229e-05, "loss": 0.4569, "step": 335 }, { "epoch": 0.17, "grad_norm": 0.15189188718795776, "learning_rate": 9.527319852370002e-05, "loss": 0.4891, "step": 336 }, { "epoch": 0.17, "grad_norm": 0.12402113527059555, "learning_rate": 9.523931645736858e-05, "loss": 0.4523, "step": 337 }, { "epoch": 0.17, "grad_norm": 0.13852664828300476, "learning_rate": 9.520531945864679e-05, "loss": 0.4892, "step": 338 }, { "epoch": 0.17, "grad_norm": 0.14422492682933807, "learning_rate": 9.517120761390545e-05, "loss": 0.5191, "step": 339 }, { "epoch": 0.17, "grad_norm": 0.13101577758789062, "learning_rate": 9.513698100980715e-05, "loss": 0.4828, "step": 340 }, { "epoch": 0.17, "grad_norm": 0.13587409257888794, "learning_rate": 9.5102639733306e-05, "loss": 0.4736, "step": 341 }, { "epoch": 0.17, "grad_norm": 0.14891067147254944, "learning_rate": 9.506818387164747e-05, "loss": 0.472, "step": 342 }, { "epoch": 0.17, "grad_norm": 0.1796894520521164, "learning_rate": 9.503361351236814e-05, "loss": 0.4728, "step": 343 }, { "epoch": 0.17, "grad_norm": 0.1393360197544098, "learning_rate": 9.499892874329546e-05, "loss": 0.4669, "step": 344 }, { "epoch": 0.17, "grad_norm": 0.13324418663978577, "learning_rate": 9.496412965254754e-05, "loss": 0.4198, "step": 345 }, { "epoch": 0.17, "grad_norm": 0.15685345232486725, "learning_rate": 9.492921632853294e-05, "loss": 0.4847, "step": 346 }, { "epoch": 0.17, "grad_norm": 0.13309939205646515, "learning_rate": 9.489418885995043e-05, "loss": 0.4863, "step": 347 }, { "epoch": 0.17, "grad_norm": 0.12411566078662872, "learning_rate": 9.485904733578879e-05, "loss": 0.4532, "step": 348 }, { "epoch": 0.17, "grad_norm": 0.1404196321964264, "learning_rate": 9.48237918453265e-05, "loss": 0.4284, "step": 349 }, { "epoch": 0.17, "grad_norm": 0.1394430249929428, "learning_rate": 9.478842247813166e-05, "loss": 0.4704, "step": 350 }, { "epoch": 0.17, "grad_norm": 0.12812605500221252, "learning_rate": 9.475293932406162e-05, "loss": 0.4837, "step": 351 }, { "epoch": 0.17, "grad_norm": 0.14801132678985596, "learning_rate": 9.471734247326285e-05, "loss": 0.4364, "step": 352 }, { "epoch": 0.17, "grad_norm": 0.15099765360355377, "learning_rate": 9.468163201617062e-05, "loss": 0.4866, "step": 353 }, { "epoch": 0.17, "grad_norm": 0.1889578253030777, "learning_rate": 9.464580804350887e-05, "loss": 0.4613, "step": 354 }, { "epoch": 0.17, "grad_norm": 0.1622733771800995, "learning_rate": 9.46098706462899e-05, "loss": 0.5143, "step": 355 }, { "epoch": 0.18, "grad_norm": 0.1502552181482315, "learning_rate": 9.457381991581418e-05, "loss": 0.4968, "step": 356 }, { "epoch": 0.18, "grad_norm": 0.13201425969600677, "learning_rate": 9.453765594367013e-05, "loss": 0.4488, "step": 357 }, { "epoch": 0.18, "grad_norm": 0.13599631190299988, "learning_rate": 9.450137882173384e-05, "loss": 0.4939, "step": 358 }, { "epoch": 0.18, "grad_norm": 0.13342833518981934, "learning_rate": 9.446498864216886e-05, "loss": 0.4576, "step": 359 }, { "epoch": 0.18, "grad_norm": 0.12389455735683441, "learning_rate": 9.442848549742598e-05, "loss": 0.5024, "step": 360 }, { "epoch": 0.18, "grad_norm": 0.14464333653450012, "learning_rate": 9.439186948024297e-05, "loss": 0.5098, "step": 361 }, { "epoch": 0.18, "grad_norm": 0.12361609935760498, "learning_rate": 9.435514068364437e-05, "loss": 0.4586, "step": 362 }, { "epoch": 0.18, "grad_norm": 0.13467039167881012, "learning_rate": 9.431829920094125e-05, "loss": 0.4792, "step": 363 }, { "epoch": 0.18, "grad_norm": 0.14349080622196198, "learning_rate": 9.42813451257309e-05, "loss": 0.48, "step": 364 }, { "epoch": 0.18, "grad_norm": 0.125464528799057, "learning_rate": 9.424427855189677e-05, "loss": 0.4389, "step": 365 }, { "epoch": 0.18, "grad_norm": 0.14141547679901123, "learning_rate": 9.420709957360803e-05, "loss": 0.5266, "step": 366 }, { "epoch": 0.18, "grad_norm": 0.12343312799930573, "learning_rate": 9.416980828531943e-05, "loss": 0.4669, "step": 367 }, { "epoch": 0.18, "grad_norm": 0.13704504072666168, "learning_rate": 9.413240478177106e-05, "loss": 0.4507, "step": 368 }, { "epoch": 0.18, "grad_norm": 0.12257388234138489, "learning_rate": 9.409488915798812e-05, "loss": 0.4265, "step": 369 }, { "epoch": 0.18, "grad_norm": 0.1389954388141632, "learning_rate": 9.40572615092806e-05, "loss": 0.4769, "step": 370 }, { "epoch": 0.18, "grad_norm": 0.1359740048646927, "learning_rate": 9.401952193124315e-05, "loss": 0.5128, "step": 371 }, { "epoch": 0.18, "grad_norm": 0.13114336133003235, "learning_rate": 9.398167051975474e-05, "loss": 0.4817, "step": 372 }, { "epoch": 0.18, "grad_norm": 0.15740089118480682, "learning_rate": 9.394370737097853e-05, "loss": 0.4978, "step": 373 }, { "epoch": 0.18, "grad_norm": 0.14335806667804718, "learning_rate": 9.390563258136143e-05, "loss": 0.5029, "step": 374 }, { "epoch": 0.18, "grad_norm": 0.14401783049106598, "learning_rate": 9.386744624763409e-05, "loss": 0.4973, "step": 375 }, { "epoch": 0.18, "grad_norm": 0.15383604168891907, "learning_rate": 9.382914846681047e-05, "loss": 0.4871, "step": 376 }, { "epoch": 0.19, "grad_norm": 0.14398829638957977, "learning_rate": 9.379073933618774e-05, "loss": 0.4861, "step": 377 }, { "epoch": 0.19, "grad_norm": 0.1463446319103241, "learning_rate": 9.375221895334587e-05, "loss": 0.4639, "step": 378 }, { "epoch": 0.19, "grad_norm": 0.13288019597530365, "learning_rate": 9.371358741614755e-05, "loss": 0.4623, "step": 379 }, { "epoch": 0.19, "grad_norm": 0.1346786618232727, "learning_rate": 9.36748448227378e-05, "loss": 0.4572, "step": 380 }, { "epoch": 0.19, "grad_norm": 0.14892414212226868, "learning_rate": 9.363599127154383e-05, "loss": 0.4583, "step": 381 }, { "epoch": 0.19, "grad_norm": 0.15430600941181183, "learning_rate": 9.359702686127474e-05, "loss": 0.4501, "step": 382 }, { "epoch": 0.19, "grad_norm": 0.21581362187862396, "learning_rate": 9.355795169092122e-05, "loss": 0.439, "step": 383 }, { "epoch": 0.19, "grad_norm": 0.15140816569328308, "learning_rate": 9.351876585975541e-05, "loss": 0.4868, "step": 384 }, { "epoch": 0.19, "grad_norm": 0.12807786464691162, "learning_rate": 9.347946946733055e-05, "loss": 0.45, "step": 385 }, { "epoch": 0.19, "grad_norm": 0.15461334586143494, "learning_rate": 9.34400626134808e-05, "loss": 0.4832, "step": 386 }, { "epoch": 0.19, "grad_norm": 0.14168666303157806, "learning_rate": 9.340054539832094e-05, "loss": 0.4863, "step": 387 }, { "epoch": 0.19, "grad_norm": 0.13007405400276184, "learning_rate": 9.336091792224608e-05, "loss": 0.4894, "step": 388 }, { "epoch": 0.19, "grad_norm": 0.11937201768159866, "learning_rate": 9.332118028593154e-05, "loss": 0.4022, "step": 389 }, { "epoch": 0.19, "grad_norm": 0.1424238383769989, "learning_rate": 9.328133259033243e-05, "loss": 0.4741, "step": 390 }, { "epoch": 0.19, "grad_norm": 0.12847980856895447, "learning_rate": 9.324137493668352e-05, "loss": 0.4543, "step": 391 }, { "epoch": 0.19, "grad_norm": 0.12196256965398788, "learning_rate": 9.32013074264989e-05, "loss": 0.4345, "step": 392 }, { "epoch": 0.19, "grad_norm": 0.13215775787830353, "learning_rate": 9.316113016157176e-05, "loss": 0.4941, "step": 393 }, { "epoch": 0.19, "grad_norm": 0.13898774981498718, "learning_rate": 9.312084324397416e-05, "loss": 0.4441, "step": 394 }, { "epoch": 0.19, "grad_norm": 0.12405597418546677, "learning_rate": 9.30804467760567e-05, "loss": 0.4687, "step": 395 }, { "epoch": 0.19, "grad_norm": 0.1518356204032898, "learning_rate": 9.30399408604483e-05, "loss": 0.494, "step": 396 }, { "epoch": 0.2, "grad_norm": 0.14700981974601746, "learning_rate": 9.299932560005596e-05, "loss": 0.4776, "step": 397 }, { "epoch": 0.2, "grad_norm": 0.13325655460357666, "learning_rate": 9.295860109806446e-05, "loss": 0.4603, "step": 398 }, { "epoch": 0.2, "grad_norm": 0.1520041823387146, "learning_rate": 9.29177674579361e-05, "loss": 0.4815, "step": 399 }, { "epoch": 0.2, "grad_norm": 0.1286095827817917, "learning_rate": 9.287682478341047e-05, "loss": 0.4358, "step": 400 }, { "epoch": 0.2, "grad_norm": 0.13595926761627197, "learning_rate": 9.283577317850419e-05, "loss": 0.4751, "step": 401 }, { "epoch": 0.2, "grad_norm": 0.1161298006772995, "learning_rate": 9.279461274751053e-05, "loss": 0.4397, "step": 402 }, { "epoch": 0.2, "grad_norm": 0.13984490931034088, "learning_rate": 9.275334359499936e-05, "loss": 0.4853, "step": 403 }, { "epoch": 0.2, "grad_norm": 0.11953715980052948, "learning_rate": 9.271196582581668e-05, "loss": 0.4206, "step": 404 }, { "epoch": 0.2, "grad_norm": 0.14303675293922424, "learning_rate": 9.267047954508446e-05, "loss": 0.429, "step": 405 }, { "epoch": 0.2, "grad_norm": 0.12968996167182922, "learning_rate": 9.262888485820031e-05, "loss": 0.4808, "step": 406 }, { "epoch": 0.2, "grad_norm": 0.13139332830905914, "learning_rate": 9.258718187083735e-05, "loss": 0.4673, "step": 407 }, { "epoch": 0.2, "grad_norm": 0.12537018954753876, "learning_rate": 9.254537068894371e-05, "loss": 0.4362, "step": 408 }, { "epoch": 0.2, "grad_norm": 0.12897247076034546, "learning_rate": 9.250345141874252e-05, "loss": 0.4372, "step": 409 }, { "epoch": 0.2, "grad_norm": 0.11587245762348175, "learning_rate": 9.24614241667314e-05, "loss": 0.4149, "step": 410 }, { "epoch": 0.2, "grad_norm": 0.12651309370994568, "learning_rate": 9.24192890396824e-05, "loss": 0.4719, "step": 411 }, { "epoch": 0.2, "grad_norm": 0.13252316415309906, "learning_rate": 9.237704614464156e-05, "loss": 0.4595, "step": 412 }, { "epoch": 0.2, "grad_norm": 0.1344020962715149, "learning_rate": 9.233469558892876e-05, "loss": 0.4945, "step": 413 }, { "epoch": 0.2, "grad_norm": 0.12167735397815704, "learning_rate": 9.229223748013732e-05, "loss": 0.4189, "step": 414 }, { "epoch": 0.2, "grad_norm": 0.14204931259155273, "learning_rate": 9.22496719261339e-05, "loss": 0.4455, "step": 415 }, { "epoch": 0.2, "grad_norm": 0.14234411716461182, "learning_rate": 9.220699903505808e-05, "loss": 0.4501, "step": 416 }, { "epoch": 0.21, "grad_norm": 0.1279536634683609, "learning_rate": 9.216421891532214e-05, "loss": 0.4882, "step": 417 }, { "epoch": 0.21, "grad_norm": 0.13576358556747437, "learning_rate": 9.212133167561076e-05, "loss": 0.487, "step": 418 }, { "epoch": 0.21, "grad_norm": 0.13314619660377502, "learning_rate": 9.207833742488078e-05, "loss": 0.4651, "step": 419 }, { "epoch": 0.21, "grad_norm": 0.1400623321533203, "learning_rate": 9.203523627236092e-05, "loss": 0.4641, "step": 420 }, { "epoch": 0.21, "grad_norm": 0.12888535857200623, "learning_rate": 9.19920283275515e-05, "loss": 0.4067, "step": 421 }, { "epoch": 0.21, "grad_norm": 0.12729351222515106, "learning_rate": 9.194871370022406e-05, "loss": 0.4503, "step": 422 }, { "epoch": 0.21, "grad_norm": 0.12670859694480896, "learning_rate": 9.190529250042129e-05, "loss": 0.4282, "step": 423 }, { "epoch": 0.21, "grad_norm": 0.11206643283367157, "learning_rate": 9.186176483845655e-05, "loss": 0.4086, "step": 424 }, { "epoch": 0.21, "grad_norm": 0.13397963345050812, "learning_rate": 9.181813082491371e-05, "loss": 0.487, "step": 425 }, { "epoch": 0.21, "grad_norm": 0.14213979244232178, "learning_rate": 9.177439057064683e-05, "loss": 0.4807, "step": 426 }, { "epoch": 0.21, "grad_norm": 0.13667424023151398, "learning_rate": 9.173054418677986e-05, "loss": 0.4288, "step": 427 }, { "epoch": 0.21, "grad_norm": 0.13159430027008057, "learning_rate": 9.168659178470638e-05, "loss": 0.4398, "step": 428 }, { "epoch": 0.21, "grad_norm": 0.13335877656936646, "learning_rate": 9.164253347608933e-05, "loss": 0.4788, "step": 429 }, { "epoch": 0.21, "grad_norm": 0.13102643191814423, "learning_rate": 9.15983693728607e-05, "loss": 0.4536, "step": 430 }, { "epoch": 0.21, "grad_norm": 0.14166386425495148, "learning_rate": 9.155409958722124e-05, "loss": 0.4919, "step": 431 }, { "epoch": 0.21, "grad_norm": 0.13534221053123474, "learning_rate": 9.150972423164024e-05, "loss": 0.4501, "step": 432 }, { "epoch": 0.21, "grad_norm": 0.13583782315254211, "learning_rate": 9.146524341885512e-05, "loss": 0.4629, "step": 433 }, { "epoch": 0.21, "grad_norm": 0.15323059260845184, "learning_rate": 9.14206572618713e-05, "loss": 0.4859, "step": 434 }, { "epoch": 0.21, "grad_norm": 0.14903734624385834, "learning_rate": 9.137596587396176e-05, "loss": 0.5226, "step": 435 }, { "epoch": 0.21, "grad_norm": 0.1502879559993744, "learning_rate": 9.133116936866687e-05, "loss": 0.4522, "step": 436 }, { "epoch": 0.22, "grad_norm": 0.11704117804765701, "learning_rate": 9.128626785979404e-05, "loss": 0.4375, "step": 437 }, { "epoch": 0.22, "grad_norm": 0.13761398196220398, "learning_rate": 9.124126146141742e-05, "loss": 0.4837, "step": 438 }, { "epoch": 0.22, "grad_norm": 0.12360789626836777, "learning_rate": 9.119615028787771e-05, "loss": 0.4227, "step": 439 }, { "epoch": 0.22, "grad_norm": 0.1279977411031723, "learning_rate": 9.115093445378172e-05, "loss": 0.4718, "step": 440 }, { "epoch": 0.22, "grad_norm": 0.1237136647105217, "learning_rate": 9.110561407400218e-05, "loss": 0.4332, "step": 441 }, { "epoch": 0.22, "grad_norm": 0.12261021137237549, "learning_rate": 9.106018926367743e-05, "loss": 0.4466, "step": 442 }, { "epoch": 0.22, "grad_norm": 0.14952373504638672, "learning_rate": 9.101466013821111e-05, "loss": 0.4859, "step": 443 }, { "epoch": 0.22, "grad_norm": 0.11799508333206177, "learning_rate": 9.09690268132719e-05, "loss": 0.3931, "step": 444 }, { "epoch": 0.22, "grad_norm": 0.16621364653110504, "learning_rate": 9.092328940479318e-05, "loss": 0.4853, "step": 445 }, { "epoch": 0.22, "grad_norm": 0.14257293939590454, "learning_rate": 9.087744802897274e-05, "loss": 0.4665, "step": 446 }, { "epoch": 0.22, "grad_norm": 0.14346961677074432, "learning_rate": 9.083150280227255e-05, "loss": 0.4555, "step": 447 }, { "epoch": 0.22, "grad_norm": 0.15263685584068298, "learning_rate": 9.078545384141839e-05, "loss": 0.4864, "step": 448 }, { "epoch": 0.22, "grad_norm": 0.12736818194389343, "learning_rate": 9.073930126339959e-05, "loss": 0.4591, "step": 449 }, { "epoch": 0.22, "grad_norm": 0.12816786766052246, "learning_rate": 9.069304518546872e-05, "loss": 0.4404, "step": 450 }, { "epoch": 0.22, "grad_norm": 0.12346238642930984, "learning_rate": 9.064668572514127e-05, "loss": 0.4226, "step": 451 }, { "epoch": 0.22, "grad_norm": 0.14203311502933502, "learning_rate": 9.060022300019545e-05, "loss": 0.4686, "step": 452 }, { "epoch": 0.22, "grad_norm": 0.12274100631475449, "learning_rate": 9.055365712867175e-05, "loss": 0.3808, "step": 453 }, { "epoch": 0.22, "grad_norm": 0.13359345495700836, "learning_rate": 9.050698822887269e-05, "loss": 0.4524, "step": 454 }, { "epoch": 0.22, "grad_norm": 0.13781769573688507, "learning_rate": 9.046021641936264e-05, "loss": 0.4682, "step": 455 }, { "epoch": 0.22, "grad_norm": 0.1546061635017395, "learning_rate": 9.041334181896733e-05, "loss": 0.4702, "step": 456 }, { "epoch": 0.22, "grad_norm": 0.12171639502048492, "learning_rate": 9.036636454677364e-05, "loss": 0.4448, "step": 457 }, { "epoch": 0.23, "grad_norm": 0.12493584305047989, "learning_rate": 9.031928472212932e-05, "loss": 0.4694, "step": 458 }, { "epoch": 0.23, "grad_norm": 0.13549324870109558, "learning_rate": 9.027210246464267e-05, "loss": 0.4885, "step": 459 }, { "epoch": 0.23, "grad_norm": 0.13407613337039948, "learning_rate": 9.022481789418217e-05, "loss": 0.4851, "step": 460 }, { "epoch": 0.23, "grad_norm": 0.14039187133312225, "learning_rate": 9.01774311308763e-05, "loss": 0.5095, "step": 461 }, { "epoch": 0.23, "grad_norm": 0.13262192904949188, "learning_rate": 9.01299422951131e-05, "loss": 0.453, "step": 462 }, { "epoch": 0.23, "grad_norm": 0.13190452754497528, "learning_rate": 9.008235150753998e-05, "loss": 0.433, "step": 463 }, { "epoch": 0.23, "grad_norm": 0.1233881264925003, "learning_rate": 9.003465888906333e-05, "loss": 0.4136, "step": 464 }, { "epoch": 0.23, "grad_norm": 0.13878096640110016, "learning_rate": 8.998686456084823e-05, "loss": 0.4704, "step": 465 }, { "epoch": 0.23, "grad_norm": 0.15728069841861725, "learning_rate": 8.993896864431826e-05, "loss": 0.522, "step": 466 }, { "epoch": 0.23, "grad_norm": 0.11948581784963608, "learning_rate": 8.989097126115493e-05, "loss": 0.4243, "step": 467 }, { "epoch": 0.23, "grad_norm": 0.14761267602443695, "learning_rate": 8.984287253329769e-05, "loss": 0.4698, "step": 468 }, { "epoch": 0.23, "grad_norm": 0.1491837203502655, "learning_rate": 8.979467258294333e-05, "loss": 0.4696, "step": 469 }, { "epoch": 0.23, "grad_norm": 0.1259170025587082, "learning_rate": 8.974637153254588e-05, "loss": 0.4341, "step": 470 }, { "epoch": 0.23, "grad_norm": 0.12583096325397491, "learning_rate": 8.96979695048162e-05, "loss": 0.4607, "step": 471 }, { "epoch": 0.23, "grad_norm": 0.1380632370710373, "learning_rate": 8.964946662272166e-05, "loss": 0.4641, "step": 472 }, { "epoch": 0.23, "grad_norm": 0.14320193231105804, "learning_rate": 8.960086300948589e-05, "loss": 0.4897, "step": 473 }, { "epoch": 0.23, "grad_norm": 0.14502008259296417, "learning_rate": 8.955215878858842e-05, "loss": 0.494, "step": 474 }, { "epoch": 0.23, "grad_norm": 0.15373514592647552, "learning_rate": 8.950335408376437e-05, "loss": 0.4962, "step": 475 }, { "epoch": 0.23, "grad_norm": 0.1620248407125473, "learning_rate": 8.945444901900415e-05, "loss": 0.4897, "step": 476 }, { "epoch": 0.23, "grad_norm": 0.14162644743919373, "learning_rate": 8.940544371855315e-05, "loss": 0.4613, "step": 477 }, { "epoch": 0.24, "grad_norm": 0.1431538313627243, "learning_rate": 8.935633830691139e-05, "loss": 0.4137, "step": 478 }, { "epoch": 0.24, "grad_norm": 0.13714560866355896, "learning_rate": 8.930713290883324e-05, "loss": 0.4529, "step": 479 }, { "epoch": 0.24, "grad_norm": 0.12592652440071106, "learning_rate": 8.925782764932709e-05, "loss": 0.4602, "step": 480 }, { "epoch": 0.24, "grad_norm": 0.143580362200737, "learning_rate": 8.920842265365503e-05, "loss": 0.4565, "step": 481 }, { "epoch": 0.24, "grad_norm": 0.14570894837379456, "learning_rate": 8.915891804733252e-05, "loss": 0.4751, "step": 482 }, { "epoch": 0.24, "grad_norm": 0.13929927349090576, "learning_rate": 8.910931395612812e-05, "loss": 0.4631, "step": 483 }, { "epoch": 0.24, "grad_norm": 0.15324315428733826, "learning_rate": 8.90596105060631e-05, "loss": 0.5237, "step": 484 }, { "epoch": 0.24, "grad_norm": 0.1352134346961975, "learning_rate": 8.900980782341119e-05, "loss": 0.4553, "step": 485 }, { "epoch": 0.24, "grad_norm": 0.14908449351787567, "learning_rate": 8.89599060346982e-05, "loss": 0.5489, "step": 486 }, { "epoch": 0.24, "grad_norm": 0.15090306103229523, "learning_rate": 8.890990526670169e-05, "loss": 0.4575, "step": 487 }, { "epoch": 0.24, "grad_norm": 0.12587741017341614, "learning_rate": 8.885980564645075e-05, "loss": 0.4466, "step": 488 }, { "epoch": 0.24, "grad_norm": 0.1315077245235443, "learning_rate": 8.880960730122558e-05, "loss": 0.4928, "step": 489 }, { "epoch": 0.24, "grad_norm": 0.13669529557228088, "learning_rate": 8.875931035855721e-05, "loss": 0.4766, "step": 490 }, { "epoch": 0.24, "grad_norm": 0.15492258965969086, "learning_rate": 8.870891494622709e-05, "loss": 0.4602, "step": 491 }, { "epoch": 0.24, "grad_norm": 0.13165941834449768, "learning_rate": 8.865842119226693e-05, "loss": 0.4608, "step": 492 }, { "epoch": 0.24, "grad_norm": 0.12878145277500153, "learning_rate": 8.860782922495822e-05, "loss": 0.4312, "step": 493 }, { "epoch": 0.24, "grad_norm": 0.13100062310695648, "learning_rate": 8.855713917283199e-05, "loss": 0.5069, "step": 494 }, { "epoch": 0.24, "grad_norm": 0.13377130031585693, "learning_rate": 8.850635116466848e-05, "loss": 0.4773, "step": 495 }, { "epoch": 0.24, "grad_norm": 0.13485388457775116, "learning_rate": 8.845546532949673e-05, "loss": 0.416, "step": 496 }, { "epoch": 0.24, "grad_norm": 0.12871213257312775, "learning_rate": 8.840448179659435e-05, "loss": 0.4452, "step": 497 }, { "epoch": 0.25, "grad_norm": 0.1448107808828354, "learning_rate": 8.835340069548719e-05, "loss": 0.4662, "step": 498 }, { "epoch": 0.25, "grad_norm": 0.14174394309520721, "learning_rate": 8.83022221559489e-05, "loss": 0.4514, "step": 499 }, { "epoch": 0.25, "grad_norm": 0.12678150832653046, "learning_rate": 8.825094630800075e-05, "loss": 0.4283, "step": 500 }, { "epoch": 0.25, "grad_norm": 0.16120824217796326, "learning_rate": 8.819957328191117e-05, "loss": 0.4572, "step": 501 }, { "epoch": 0.25, "grad_norm": 0.15984316170215607, "learning_rate": 8.814810320819551e-05, "loss": 0.4765, "step": 502 }, { "epoch": 0.25, "grad_norm": 0.1244916245341301, "learning_rate": 8.809653621761564e-05, "loss": 0.439, "step": 503 }, { "epoch": 0.25, "grad_norm": 0.149594247341156, "learning_rate": 8.804487244117971e-05, "loss": 0.4733, "step": 504 }, { "epoch": 0.25, "grad_norm": 0.12552577257156372, "learning_rate": 8.79931120101417e-05, "loss": 0.4207, "step": 505 }, { "epoch": 0.25, "grad_norm": 0.13124561309814453, "learning_rate": 8.794125505600117e-05, "loss": 0.4349, "step": 506 }, { "epoch": 0.25, "grad_norm": 0.14597705006599426, "learning_rate": 8.788930171050289e-05, "loss": 0.4657, "step": 507 }, { "epoch": 0.25, "grad_norm": 0.15598197281360626, "learning_rate": 8.783725210563653e-05, "loss": 0.4847, "step": 508 }, { "epoch": 0.25, "grad_norm": 0.14909197390079498, "learning_rate": 8.778510637363633e-05, "loss": 0.4813, "step": 509 }, { "epoch": 0.25, "grad_norm": 0.16419531404972076, "learning_rate": 8.773286464698068e-05, "loss": 0.4726, "step": 510 }, { "epoch": 0.25, "grad_norm": 0.1523641049861908, "learning_rate": 8.76805270583919e-05, "loss": 0.4433, "step": 511 }, { "epoch": 0.25, "grad_norm": 0.15485210716724396, "learning_rate": 8.762809374083585e-05, "loss": 0.4897, "step": 512 }, { "epoch": 0.25, "grad_norm": 0.12884008884429932, "learning_rate": 8.757556482752157e-05, "loss": 0.4466, "step": 513 }, { "epoch": 0.25, "grad_norm": 0.13010382652282715, "learning_rate": 8.752294045190099e-05, "loss": 0.4597, "step": 514 }, { "epoch": 0.25, "grad_norm": 0.1605740487575531, "learning_rate": 8.747022074766856e-05, "loss": 0.5, "step": 515 }, { "epoch": 0.25, "grad_norm": 0.13687753677368164, "learning_rate": 8.741740584876091e-05, "loss": 0.4447, "step": 516 }, { "epoch": 0.25, "grad_norm": 0.13992847502231598, "learning_rate": 8.736449588935649e-05, "loss": 0.4545, "step": 517 }, { "epoch": 0.25, "grad_norm": 0.1358829289674759, "learning_rate": 8.73114910038753e-05, "loss": 0.4293, "step": 518 }, { "epoch": 0.26, "grad_norm": 0.12691998481750488, "learning_rate": 8.725839132697851e-05, "loss": 0.4437, "step": 519 }, { "epoch": 0.26, "grad_norm": 0.144168421626091, "learning_rate": 8.720519699356804e-05, "loss": 0.4425, "step": 520 }, { "epoch": 0.26, "grad_norm": 0.15920904278755188, "learning_rate": 8.715190813878637e-05, "loss": 0.4435, "step": 521 }, { "epoch": 0.26, "grad_norm": 0.13866642117500305, "learning_rate": 8.709852489801607e-05, "loss": 0.469, "step": 522 }, { "epoch": 0.26, "grad_norm": 0.14505444467067719, "learning_rate": 8.704504740687952e-05, "loss": 0.4602, "step": 523 }, { "epoch": 0.26, "grad_norm": 0.14122958481311798, "learning_rate": 8.699147580123855e-05, "loss": 0.4512, "step": 524 }, { "epoch": 0.26, "grad_norm": 0.15349215269088745, "learning_rate": 8.693781021719403e-05, "loss": 0.4383, "step": 525 }, { "epoch": 0.26, "grad_norm": 0.17053580284118652, "learning_rate": 8.68840507910857e-05, "loss": 0.4827, "step": 526 }, { "epoch": 0.26, "grad_norm": 0.13227176666259766, "learning_rate": 8.683019765949163e-05, "loss": 0.426, "step": 527 }, { "epoch": 0.26, "grad_norm": 0.15786974132061005, "learning_rate": 8.677625095922796e-05, "loss": 0.4834, "step": 528 }, { "epoch": 0.26, "grad_norm": 0.1476069837808609, "learning_rate": 8.672221082734857e-05, "loss": 0.4796, "step": 529 }, { "epoch": 0.26, "grad_norm": 0.14548389613628387, "learning_rate": 8.666807740114466e-05, "loss": 0.4495, "step": 530 }, { "epoch": 0.26, "grad_norm": 0.14326879382133484, "learning_rate": 8.661385081814453e-05, "loss": 0.4755, "step": 531 }, { "epoch": 0.26, "grad_norm": 0.15756504237651825, "learning_rate": 8.655953121611307e-05, "loss": 0.4791, "step": 532 }, { "epoch": 0.26, "grad_norm": 0.136744886636734, "learning_rate": 8.650511873305152e-05, "loss": 0.4473, "step": 533 }, { "epoch": 0.26, "grad_norm": 0.12174384295940399, "learning_rate": 8.64506135071971e-05, "loss": 0.4431, "step": 534 }, { "epoch": 0.26, "grad_norm": 0.13670197129249573, "learning_rate": 8.63960156770226e-05, "loss": 0.4527, "step": 535 }, { "epoch": 0.26, "grad_norm": 0.1331476867198944, "learning_rate": 8.634132538123616e-05, "loss": 0.4283, "step": 536 }, { "epoch": 0.26, "grad_norm": 0.14409831166267395, "learning_rate": 8.628654275878074e-05, "loss": 0.4365, "step": 537 }, { "epoch": 0.26, "grad_norm": 0.14046485722064972, "learning_rate": 8.623166794883393e-05, "loss": 0.4619, "step": 538 }, { "epoch": 0.27, "grad_norm": 0.14470967650413513, "learning_rate": 8.617670109080746e-05, "loss": 0.457, "step": 539 }, { "epoch": 0.27, "grad_norm": 0.15705853700637817, "learning_rate": 8.6121642324347e-05, "loss": 0.4471, "step": 540 }, { "epoch": 0.27, "grad_norm": 0.14535781741142273, "learning_rate": 8.606649178933163e-05, "loss": 0.4684, "step": 541 }, { "epoch": 0.27, "grad_norm": 0.14385047554969788, "learning_rate": 8.601124962587362e-05, "loss": 0.4601, "step": 542 }, { "epoch": 0.27, "grad_norm": 0.14150488376617432, "learning_rate": 8.595591597431801e-05, "loss": 0.4507, "step": 543 }, { "epoch": 0.27, "grad_norm": 0.14099669456481934, "learning_rate": 8.590049097524229e-05, "loss": 0.4683, "step": 544 }, { "epoch": 0.27, "grad_norm": 0.12593232095241547, "learning_rate": 8.584497476945599e-05, "loss": 0.4489, "step": 545 }, { "epoch": 0.27, "grad_norm": 0.13864102959632874, "learning_rate": 8.578936749800038e-05, "loss": 0.442, "step": 546 }, { "epoch": 0.27, "grad_norm": 0.14365458488464355, "learning_rate": 8.573366930214806e-05, "loss": 0.4899, "step": 547 }, { "epoch": 0.27, "grad_norm": 0.13706675171852112, "learning_rate": 8.567788032340266e-05, "loss": 0.4403, "step": 548 }, { "epoch": 0.27, "grad_norm": 0.15798896551132202, "learning_rate": 8.562200070349845e-05, "loss": 0.5023, "step": 549 }, { "epoch": 0.27, "grad_norm": 0.13478447496891022, "learning_rate": 8.556603058439991e-05, "loss": 0.437, "step": 550 }, { "epoch": 0.27, "grad_norm": 0.14750154316425323, "learning_rate": 8.550997010830154e-05, "loss": 0.4653, "step": 551 }, { "epoch": 0.27, "grad_norm": 0.12813888490200043, "learning_rate": 8.54538194176273e-05, "loss": 0.4341, "step": 552 }, { "epoch": 0.27, "grad_norm": 0.16272859275341034, "learning_rate": 8.539757865503042e-05, "loss": 0.4849, "step": 553 }, { "epoch": 0.27, "grad_norm": 0.14321710169315338, "learning_rate": 8.53412479633929e-05, "loss": 0.4437, "step": 554 }, { "epoch": 0.27, "grad_norm": 0.13166069984436035, "learning_rate": 8.528482748582525e-05, "loss": 0.4406, "step": 555 }, { "epoch": 0.27, "grad_norm": 0.13955581188201904, "learning_rate": 8.522831736566607e-05, "loss": 0.4753, "step": 556 }, { "epoch": 0.27, "grad_norm": 0.15354441106319427, "learning_rate": 8.517171774648173e-05, "loss": 0.4515, "step": 557 }, { "epoch": 0.27, "grad_norm": 0.15759271383285522, "learning_rate": 8.51150287720659e-05, "loss": 0.4943, "step": 558 }, { "epoch": 0.28, "grad_norm": 0.1502160131931305, "learning_rate": 8.505825058643933e-05, "loss": 0.4472, "step": 559 }, { "epoch": 0.28, "grad_norm": 0.14027456939220428, "learning_rate": 8.50013833338494e-05, "loss": 0.4582, "step": 560 }, { "epoch": 0.28, "grad_norm": 0.15820910036563873, "learning_rate": 8.494442715876976e-05, "loss": 0.4292, "step": 561 }, { "epoch": 0.28, "grad_norm": 0.15127217769622803, "learning_rate": 8.488738220589996e-05, "loss": 0.4344, "step": 562 }, { "epoch": 0.28, "grad_norm": 0.1514924019575119, "learning_rate": 8.483024862016513e-05, "loss": 0.5104, "step": 563 }, { "epoch": 0.28, "grad_norm": 0.14321254193782806, "learning_rate": 8.477302654671553e-05, "loss": 0.4462, "step": 564 }, { "epoch": 0.28, "grad_norm": 0.14858044683933258, "learning_rate": 8.471571613092627e-05, "loss": 0.4558, "step": 565 }, { "epoch": 0.28, "grad_norm": 0.15899378061294556, "learning_rate": 8.465831751839686e-05, "loss": 0.4736, "step": 566 }, { "epoch": 0.28, "grad_norm": 0.13204921782016754, "learning_rate": 8.46008308549509e-05, "loss": 0.4876, "step": 567 }, { "epoch": 0.28, "grad_norm": 0.1270461529493332, "learning_rate": 8.454325628663569e-05, "loss": 0.4091, "step": 568 }, { "epoch": 0.28, "grad_norm": 0.16556496918201447, "learning_rate": 8.448559395972182e-05, "loss": 0.5089, "step": 569 }, { "epoch": 0.28, "grad_norm": 0.15013235807418823, "learning_rate": 8.442784402070291e-05, "loss": 0.4695, "step": 570 }, { "epoch": 0.28, "grad_norm": 0.14579994976520538, "learning_rate": 8.437000661629506e-05, "loss": 0.4541, "step": 571 }, { "epoch": 0.28, "grad_norm": 0.14743971824645996, "learning_rate": 8.43120818934367e-05, "loss": 0.4665, "step": 572 }, { "epoch": 0.28, "grad_norm": 0.1320423185825348, "learning_rate": 8.425406999928795e-05, "loss": 0.4429, "step": 573 }, { "epoch": 0.28, "grad_norm": 0.136445552110672, "learning_rate": 8.419597108123054e-05, "loss": 0.4543, "step": 574 }, { "epoch": 0.28, "grad_norm": 0.13487732410430908, "learning_rate": 8.413778528686717e-05, "loss": 0.4518, "step": 575 }, { "epoch": 0.28, "grad_norm": 0.1340235471725464, "learning_rate": 8.407951276402135e-05, "loss": 0.4773, "step": 576 }, { "epoch": 0.28, "grad_norm": 0.11425097286701202, "learning_rate": 8.402115366073685e-05, "loss": 0.3606, "step": 577 }, { "epoch": 0.28, "grad_norm": 0.124888576567173, "learning_rate": 8.396270812527744e-05, "loss": 0.4614, "step": 578 }, { "epoch": 0.28, "grad_norm": 0.12708690762519836, "learning_rate": 8.390417630612647e-05, "loss": 0.4254, "step": 579 }, { "epoch": 0.29, "grad_norm": 0.1517501175403595, "learning_rate": 8.384555835198648e-05, "loss": 0.472, "step": 580 }, { "epoch": 0.29, "grad_norm": 0.1389685720205307, "learning_rate": 8.378685441177886e-05, "loss": 0.4289, "step": 581 }, { "epoch": 0.29, "grad_norm": 0.13276535272598267, "learning_rate": 8.372806463464347e-05, "loss": 0.4527, "step": 582 }, { "epoch": 0.29, "grad_norm": 0.1462005376815796, "learning_rate": 8.366918916993817e-05, "loss": 0.4396, "step": 583 }, { "epoch": 0.29, "grad_norm": 0.14018382132053375, "learning_rate": 8.361022816723858e-05, "loss": 0.4715, "step": 584 }, { "epoch": 0.29, "grad_norm": 0.1300714612007141, "learning_rate": 8.355118177633763e-05, "loss": 0.4596, "step": 585 }, { "epoch": 0.29, "grad_norm": 0.14050182700157166, "learning_rate": 8.349205014724515e-05, "loss": 0.4238, "step": 586 }, { "epoch": 0.29, "grad_norm": 0.1514914333820343, "learning_rate": 8.343283343018754e-05, "loss": 0.4606, "step": 587 }, { "epoch": 0.29, "grad_norm": 0.1345144361257553, "learning_rate": 8.337353177560736e-05, "loss": 0.4555, "step": 588 }, { "epoch": 0.29, "grad_norm": 0.15013709664344788, "learning_rate": 8.331414533416298e-05, "loss": 0.479, "step": 589 }, { "epoch": 0.29, "grad_norm": 0.17467045783996582, "learning_rate": 8.325467425672813e-05, "loss": 0.4342, "step": 590 }, { "epoch": 0.29, "grad_norm": 0.1799718141555786, "learning_rate": 8.31951186943916e-05, "loss": 0.4484, "step": 591 }, { "epoch": 0.29, "grad_norm": 0.1405632197856903, "learning_rate": 8.313547879845681e-05, "loss": 0.443, "step": 592 }, { "epoch": 0.29, "grad_norm": 0.13723380863666534, "learning_rate": 8.307575472044143e-05, "loss": 0.4473, "step": 593 }, { "epoch": 0.29, "grad_norm": 0.13592466711997986, "learning_rate": 8.301594661207702e-05, "loss": 0.4582, "step": 594 }, { "epoch": 0.29, "grad_norm": 0.14895528554916382, "learning_rate": 8.295605462530855e-05, "loss": 0.4518, "step": 595 }, { "epoch": 0.29, "grad_norm": 0.14703898131847382, "learning_rate": 8.289607891229417e-05, "loss": 0.4642, "step": 596 }, { "epoch": 0.29, "grad_norm": 0.13721701502799988, "learning_rate": 8.28360196254047e-05, "loss": 0.4633, "step": 597 }, { "epoch": 0.29, "grad_norm": 0.14089564979076385, "learning_rate": 8.277587691722328e-05, "loss": 0.4391, "step": 598 }, { "epoch": 0.29, "grad_norm": 0.13690243661403656, "learning_rate": 8.271565094054503e-05, "loss": 0.4341, "step": 599 }, { "epoch": 0.3, "grad_norm": 0.148747980594635, "learning_rate": 8.265534184837654e-05, "loss": 0.4289, "step": 600 }, { "epoch": 0.3, "grad_norm": 0.12950879335403442, "learning_rate": 8.259494979393563e-05, "loss": 0.4425, "step": 601 }, { "epoch": 0.3, "grad_norm": 0.12704671919345856, "learning_rate": 8.253447493065086e-05, "loss": 0.4339, "step": 602 }, { "epoch": 0.3, "grad_norm": 0.12717247009277344, "learning_rate": 8.247391741216113e-05, "loss": 0.4576, "step": 603 }, { "epoch": 0.3, "grad_norm": 0.14944474399089813, "learning_rate": 8.241327739231541e-05, "loss": 0.4436, "step": 604 }, { "epoch": 0.3, "grad_norm": 0.13703951239585876, "learning_rate": 8.235255502517223e-05, "loss": 0.4661, "step": 605 }, { "epoch": 0.3, "grad_norm": 0.18576645851135254, "learning_rate": 8.229175046499928e-05, "loss": 0.4733, "step": 606 }, { "epoch": 0.3, "grad_norm": 0.14139994978904724, "learning_rate": 8.223086386627314e-05, "loss": 0.4529, "step": 607 }, { "epoch": 0.3, "grad_norm": 0.15422146022319794, "learning_rate": 8.216989538367879e-05, "loss": 0.4684, "step": 608 }, { "epoch": 0.3, "grad_norm": 0.17433564364910126, "learning_rate": 8.21088451721092e-05, "loss": 0.4604, "step": 609 }, { "epoch": 0.3, "grad_norm": 0.16186095774173737, "learning_rate": 8.204771338666503e-05, "loss": 0.4295, "step": 610 }, { "epoch": 0.3, "grad_norm": 0.14331543445587158, "learning_rate": 8.198650018265415e-05, "loss": 0.4094, "step": 611 }, { "epoch": 0.3, "grad_norm": 0.15338410437107086, "learning_rate": 8.192520571559128e-05, "loss": 0.4515, "step": 612 }, { "epoch": 0.3, "grad_norm": 0.14384318888187408, "learning_rate": 8.18638301411976e-05, "loss": 0.4136, "step": 613 }, { "epoch": 0.3, "grad_norm": 0.1422734409570694, "learning_rate": 8.180237361540034e-05, "loss": 0.4625, "step": 614 }, { "epoch": 0.3, "grad_norm": 0.13708388805389404, "learning_rate": 8.17408362943324e-05, "loss": 0.4451, "step": 615 }, { "epoch": 0.3, "grad_norm": 0.14580971002578735, "learning_rate": 8.167921833433194e-05, "loss": 0.4499, "step": 616 }, { "epoch": 0.3, "grad_norm": 0.13817071914672852, "learning_rate": 8.1617519891942e-05, "loss": 0.401, "step": 617 }, { "epoch": 0.3, "grad_norm": 0.1476030796766281, "learning_rate": 8.155574112391002e-05, "loss": 0.4606, "step": 618 }, { "epoch": 0.3, "grad_norm": 0.14434263110160828, "learning_rate": 8.149388218718761e-05, "loss": 0.4624, "step": 619 }, { "epoch": 0.31, "grad_norm": 0.14760713279247284, "learning_rate": 8.143194323893002e-05, "loss": 0.4466, "step": 620 }, { "epoch": 0.31, "grad_norm": 0.1390756368637085, "learning_rate": 8.136992443649571e-05, "loss": 0.4554, "step": 621 }, { "epoch": 0.31, "grad_norm": 0.13889239728450775, "learning_rate": 8.13078259374461e-05, "loss": 0.3999, "step": 622 }, { "epoch": 0.31, "grad_norm": 0.14155450463294983, "learning_rate": 8.124564789954501e-05, "loss": 0.4456, "step": 623 }, { "epoch": 0.31, "grad_norm": 0.15029045939445496, "learning_rate": 8.11833904807584e-05, "loss": 0.4687, "step": 624 }, { "epoch": 0.31, "grad_norm": 0.15886496007442474, "learning_rate": 8.112105383925385e-05, "loss": 0.4972, "step": 625 }, { "epoch": 0.31, "grad_norm": 0.1314540058374405, "learning_rate": 8.105863813340024e-05, "loss": 0.438, "step": 626 }, { "epoch": 0.31, "grad_norm": 0.13515862822532654, "learning_rate": 8.099614352176727e-05, "loss": 0.4155, "step": 627 }, { "epoch": 0.31, "grad_norm": 0.13809122145175934, "learning_rate": 8.093357016312517e-05, "loss": 0.4363, "step": 628 }, { "epoch": 0.31, "grad_norm": 0.14311912655830383, "learning_rate": 8.08709182164442e-05, "loss": 0.4395, "step": 629 }, { "epoch": 0.31, "grad_norm": 0.1520572006702423, "learning_rate": 8.080818784089421e-05, "loss": 0.4713, "step": 630 }, { "epoch": 0.31, "grad_norm": 0.14011049270629883, "learning_rate": 8.074537919584443e-05, "loss": 0.4505, "step": 631 }, { "epoch": 0.31, "grad_norm": 0.13939343392848969, "learning_rate": 8.068249244086284e-05, "loss": 0.4147, "step": 632 }, { "epoch": 0.31, "grad_norm": 0.13660132884979248, "learning_rate": 8.061952773571591e-05, "loss": 0.4517, "step": 633 }, { "epoch": 0.31, "grad_norm": 0.21176087856292725, "learning_rate": 8.05564852403681e-05, "loss": 0.4418, "step": 634 }, { "epoch": 0.31, "grad_norm": 0.14301875233650208, "learning_rate": 8.049336511498153e-05, "loss": 0.4805, "step": 635 }, { "epoch": 0.31, "grad_norm": 0.12077558785676956, "learning_rate": 8.043016751991557e-05, "loss": 0.3953, "step": 636 }, { "epoch": 0.31, "grad_norm": 0.1368398219347, "learning_rate": 8.036689261572635e-05, "loss": 0.4528, "step": 637 }, { "epoch": 0.31, "grad_norm": 0.14997579157352448, "learning_rate": 8.030354056316643e-05, "loss": 0.4482, "step": 638 }, { "epoch": 0.31, "grad_norm": 0.14751151204109192, "learning_rate": 8.024011152318438e-05, "loss": 0.4962, "step": 639 }, { "epoch": 0.31, "grad_norm": 0.13927365839481354, "learning_rate": 8.017660565692435e-05, "loss": 0.4176, "step": 640 }, { "epoch": 0.32, "grad_norm": 0.14938226342201233, "learning_rate": 8.011302312572566e-05, "loss": 0.4712, "step": 641 }, { "epoch": 0.32, "grad_norm": 0.14915160834789276, "learning_rate": 8.004936409112242e-05, "loss": 0.4526, "step": 642 }, { "epoch": 0.32, "grad_norm": 0.15028274059295654, "learning_rate": 7.998562871484309e-05, "loss": 0.4522, "step": 643 }, { "epoch": 0.32, "grad_norm": 0.13898566365242004, "learning_rate": 7.992181715881007e-05, "loss": 0.4419, "step": 644 }, { "epoch": 0.32, "grad_norm": 0.13849930465221405, "learning_rate": 7.985792958513931e-05, "loss": 0.4372, "step": 645 }, { "epoch": 0.32, "grad_norm": 0.13728925585746765, "learning_rate": 7.97939661561399e-05, "loss": 0.4557, "step": 646 }, { "epoch": 0.32, "grad_norm": 0.1573091298341751, "learning_rate": 7.972992703431361e-05, "loss": 0.4774, "step": 647 }, { "epoch": 0.32, "grad_norm": 0.13723629713058472, "learning_rate": 7.966581238235453e-05, "loss": 0.4814, "step": 648 }, { "epoch": 0.32, "grad_norm": 0.14129038155078888, "learning_rate": 7.960162236314866e-05, "loss": 0.4875, "step": 649 }, { "epoch": 0.32, "grad_norm": 0.13432279229164124, "learning_rate": 7.953735713977341e-05, "loss": 0.4636, "step": 650 }, { "epoch": 0.32, "grad_norm": 0.136550173163414, "learning_rate": 7.947301687549731e-05, "loss": 0.4684, "step": 651 }, { "epoch": 0.32, "grad_norm": 0.12857531011104584, "learning_rate": 7.940860173377953e-05, "loss": 0.4389, "step": 652 }, { "epoch": 0.32, "grad_norm": 0.15144282579421997, "learning_rate": 7.934411187826942e-05, "loss": 0.4385, "step": 653 }, { "epoch": 0.32, "grad_norm": 0.15752093493938446, "learning_rate": 7.927954747280618e-05, "loss": 0.5091, "step": 654 }, { "epoch": 0.32, "grad_norm": 0.1391921043395996, "learning_rate": 7.921490868141843e-05, "loss": 0.4433, "step": 655 }, { "epoch": 0.32, "grad_norm": 0.14206765592098236, "learning_rate": 7.915019566832372e-05, "loss": 0.4674, "step": 656 }, { "epoch": 0.32, "grad_norm": 0.1416061669588089, "learning_rate": 7.908540859792822e-05, "loss": 0.4638, "step": 657 }, { "epoch": 0.32, "grad_norm": 0.1374977082014084, "learning_rate": 7.902054763482616e-05, "loss": 0.468, "step": 658 }, { "epoch": 0.32, "grad_norm": 0.1261112093925476, "learning_rate": 7.895561294379959e-05, "loss": 0.3553, "step": 659 }, { "epoch": 0.32, "grad_norm": 0.17170506715774536, "learning_rate": 7.889060468981782e-05, "loss": 0.4609, "step": 660 }, { "epoch": 0.33, "grad_norm": 0.13992102444171906, "learning_rate": 7.882552303803705e-05, "loss": 0.4893, "step": 661 }, { "epoch": 0.33, "grad_norm": 0.15339495241641998, "learning_rate": 7.876036815379999e-05, "loss": 0.4218, "step": 662 }, { "epoch": 0.33, "grad_norm": 0.1548285335302353, "learning_rate": 7.869514020263533e-05, "loss": 0.5, "step": 663 }, { "epoch": 0.33, "grad_norm": 0.13190871477127075, "learning_rate": 7.862983935025745e-05, "loss": 0.446, "step": 664 }, { "epoch": 0.33, "grad_norm": 0.14355428516864777, "learning_rate": 7.856446576256593e-05, "loss": 0.4442, "step": 665 }, { "epoch": 0.33, "grad_norm": 0.1340962052345276, "learning_rate": 7.849901960564509e-05, "loss": 0.4276, "step": 666 }, { "epoch": 0.33, "grad_norm": 0.19278065860271454, "learning_rate": 7.843350104576369e-05, "loss": 0.4664, "step": 667 }, { "epoch": 0.33, "grad_norm": 0.15378601849079132, "learning_rate": 7.836791024937437e-05, "loss": 0.4622, "step": 668 }, { "epoch": 0.33, "grad_norm": 0.13706521689891815, "learning_rate": 7.830224738311331e-05, "loss": 0.4621, "step": 669 }, { "epoch": 0.33, "grad_norm": 0.13976649940013885, "learning_rate": 7.823651261379981e-05, "loss": 0.4701, "step": 670 }, { "epoch": 0.33, "grad_norm": 0.14810733497142792, "learning_rate": 7.817070610843579e-05, "loss": 0.4612, "step": 671 }, { "epoch": 0.33, "grad_norm": 0.1367623656988144, "learning_rate": 7.810482803420549e-05, "loss": 0.4486, "step": 672 }, { "epoch": 0.33, "grad_norm": 0.14897918701171875, "learning_rate": 7.803887855847489e-05, "loss": 0.4527, "step": 673 }, { "epoch": 0.33, "grad_norm": 0.16939713060855865, "learning_rate": 7.797285784879146e-05, "loss": 0.4626, "step": 674 }, { "epoch": 0.33, "grad_norm": 0.14905552566051483, "learning_rate": 7.790676607288356e-05, "loss": 0.4573, "step": 675 }, { "epoch": 0.33, "grad_norm": 0.15478728711605072, "learning_rate": 7.784060339866015e-05, "loss": 0.4842, "step": 676 }, { "epoch": 0.33, "grad_norm": 0.13243986666202545, "learning_rate": 7.77743699942103e-05, "loss": 0.4213, "step": 677 }, { "epoch": 0.33, "grad_norm": 0.1524590104818344, "learning_rate": 7.770806602780274e-05, "loss": 0.4598, "step": 678 }, { "epoch": 0.33, "grad_norm": 0.17100092768669128, "learning_rate": 7.764169166788551e-05, "loss": 0.4784, "step": 679 }, { "epoch": 0.33, "grad_norm": 0.17295029759407043, "learning_rate": 7.757524708308545e-05, "loss": 0.4116, "step": 680 }, { "epoch": 0.34, "grad_norm": 0.1419491320848465, "learning_rate": 7.750873244220787e-05, "loss": 0.414, "step": 681 }, { "epoch": 0.34, "grad_norm": 0.13847751915454865, "learning_rate": 7.744214791423596e-05, "loss": 0.4435, "step": 682 }, { "epoch": 0.34, "grad_norm": 0.13579434156417847, "learning_rate": 7.737549366833058e-05, "loss": 0.4567, "step": 683 }, { "epoch": 0.34, "grad_norm": 0.14528582990169525, "learning_rate": 7.730876987382962e-05, "loss": 0.4268, "step": 684 }, { "epoch": 0.34, "grad_norm": 0.15323251485824585, "learning_rate": 7.724197670024768e-05, "loss": 0.4665, "step": 685 }, { "epoch": 0.34, "grad_norm": 0.1382426619529724, "learning_rate": 7.717511431727565e-05, "loss": 0.4599, "step": 686 }, { "epoch": 0.34, "grad_norm": 0.1580415964126587, "learning_rate": 7.710818289478025e-05, "loss": 0.4379, "step": 687 }, { "epoch": 0.34, "grad_norm": 0.1315174251794815, "learning_rate": 7.704118260280355e-05, "loss": 0.4347, "step": 688 }, { "epoch": 0.34, "grad_norm": 0.14556631445884705, "learning_rate": 7.697411361156262e-05, "loss": 0.4539, "step": 689 }, { "epoch": 0.34, "grad_norm": 0.16913580894470215, "learning_rate": 7.690697609144905e-05, "loss": 0.4791, "step": 690 }, { "epoch": 0.34, "grad_norm": 0.13996735215187073, "learning_rate": 7.68397702130286e-05, "loss": 0.4305, "step": 691 }, { "epoch": 0.34, "grad_norm": 0.13888368010520935, "learning_rate": 7.677249614704057e-05, "loss": 0.4456, "step": 692 }, { "epoch": 0.34, "grad_norm": 0.15303093194961548, "learning_rate": 7.670515406439758e-05, "loss": 0.4029, "step": 693 }, { "epoch": 0.34, "grad_norm": 0.1657179892063141, "learning_rate": 7.663774413618504e-05, "loss": 0.4566, "step": 694 }, { "epoch": 0.34, "grad_norm": 0.13763493299484253, "learning_rate": 7.657026653366068e-05, "loss": 0.4382, "step": 695 }, { "epoch": 0.34, "grad_norm": 0.1505671739578247, "learning_rate": 7.650272142825426e-05, "loss": 0.4447, "step": 696 }, { "epoch": 0.34, "grad_norm": 0.14574865996837616, "learning_rate": 7.64351089915669e-05, "loss": 0.4557, "step": 697 }, { "epoch": 0.34, "grad_norm": 0.1301255226135254, "learning_rate": 7.636742939537089e-05, "loss": 0.4376, "step": 698 }, { "epoch": 0.34, "grad_norm": 0.14549827575683594, "learning_rate": 7.629968281160906e-05, "loss": 0.4303, "step": 699 }, { "epoch": 0.34, "grad_norm": 0.1470489501953125, "learning_rate": 7.623186941239449e-05, "loss": 0.4739, "step": 700 }, { "epoch": 0.34, "grad_norm": 0.14301317930221558, "learning_rate": 7.616398937000999e-05, "loss": 0.4558, "step": 701 }, { "epoch": 0.35, "grad_norm": 0.1229747012257576, "learning_rate": 7.609604285690762e-05, "loss": 0.4041, "step": 702 }, { "epoch": 0.35, "grad_norm": 0.14903809130191803, "learning_rate": 7.60280300457084e-05, "loss": 0.4746, "step": 703 }, { "epoch": 0.35, "grad_norm": 0.14069899916648865, "learning_rate": 7.595995110920174e-05, "loss": 0.4702, "step": 704 }, { "epoch": 0.35, "grad_norm": 0.13392746448516846, "learning_rate": 7.589180622034504e-05, "loss": 0.4186, "step": 705 }, { "epoch": 0.35, "grad_norm": 0.13193225860595703, "learning_rate": 7.582359555226326e-05, "loss": 0.3949, "step": 706 }, { "epoch": 0.35, "grad_norm": 0.14410898089408875, "learning_rate": 7.575531927824849e-05, "loss": 0.433, "step": 707 }, { "epoch": 0.35, "grad_norm": 0.15617424249649048, "learning_rate": 7.568697757175947e-05, "loss": 0.4836, "step": 708 }, { "epoch": 0.35, "grad_norm": 0.13635756075382233, "learning_rate": 7.56185706064212e-05, "loss": 0.4279, "step": 709 }, { "epoch": 0.35, "grad_norm": 0.1503961831331253, "learning_rate": 7.555009855602442e-05, "loss": 0.4573, "step": 710 }, { "epoch": 0.35, "grad_norm": 0.17020638287067413, "learning_rate": 7.548156159452531e-05, "loss": 0.4502, "step": 711 }, { "epoch": 0.35, "grad_norm": 0.143310546875, "learning_rate": 7.541295989604488e-05, "loss": 0.465, "step": 712 }, { "epoch": 0.35, "grad_norm": 0.14423972368240356, "learning_rate": 7.534429363486866e-05, "loss": 0.4373, "step": 713 }, { "epoch": 0.35, "grad_norm": 0.14974582195281982, "learning_rate": 7.527556298544613e-05, "loss": 0.4127, "step": 714 }, { "epoch": 0.35, "grad_norm": 0.14904718101024628, "learning_rate": 7.520676812239047e-05, "loss": 0.4108, "step": 715 }, { "epoch": 0.35, "grad_norm": 0.1516634076833725, "learning_rate": 7.51379092204779e-05, "loss": 0.4643, "step": 716 }, { "epoch": 0.35, "grad_norm": 0.16152378916740417, "learning_rate": 7.506898645464733e-05, "loss": 0.462, "step": 717 }, { "epoch": 0.35, "grad_norm": 0.18015022575855255, "learning_rate": 7.500000000000001e-05, "loss": 0.4733, "step": 718 }, { "epoch": 0.35, "grad_norm": 0.1346128135919571, "learning_rate": 7.493095003179889e-05, "loss": 0.4554, "step": 719 }, { "epoch": 0.35, "grad_norm": 0.13532784581184387, "learning_rate": 7.486183672546836e-05, "loss": 0.4308, "step": 720 }, { "epoch": 0.35, "grad_norm": 0.1512995809316635, "learning_rate": 7.479266025659365e-05, "loss": 0.4601, "step": 721 }, { "epoch": 0.36, "grad_norm": 0.1438412219285965, "learning_rate": 7.472342080092054e-05, "loss": 0.4261, "step": 722 }, { "epoch": 0.36, "grad_norm": 0.15818530321121216, "learning_rate": 7.465411853435473e-05, "loss": 0.4724, "step": 723 }, { "epoch": 0.36, "grad_norm": 0.14813333749771118, "learning_rate": 7.458475363296161e-05, "loss": 0.4734, "step": 724 }, { "epoch": 0.36, "grad_norm": 0.16741804778575897, "learning_rate": 7.45153262729656e-05, "loss": 0.4415, "step": 725 }, { "epoch": 0.36, "grad_norm": 0.13209865987300873, "learning_rate": 7.444583663074982e-05, "loss": 0.4235, "step": 726 }, { "epoch": 0.36, "grad_norm": 0.15438614785671234, "learning_rate": 7.437628488285568e-05, "loss": 0.4893, "step": 727 }, { "epoch": 0.36, "grad_norm": 0.13894088566303253, "learning_rate": 7.430667120598229e-05, "loss": 0.43, "step": 728 }, { "epoch": 0.36, "grad_norm": 0.15603627264499664, "learning_rate": 7.423699577698614e-05, "loss": 0.5007, "step": 729 }, { "epoch": 0.36, "grad_norm": 0.14360208809375763, "learning_rate": 7.416725877288062e-05, "loss": 0.4091, "step": 730 }, { "epoch": 0.36, "grad_norm": 0.1359792947769165, "learning_rate": 7.409746037083548e-05, "loss": 0.4308, "step": 731 }, { "epoch": 0.36, "grad_norm": 0.14601247012615204, "learning_rate": 7.402760074817653e-05, "loss": 0.4525, "step": 732 }, { "epoch": 0.36, "grad_norm": 0.1461189091205597, "learning_rate": 7.395768008238511e-05, "loss": 0.4382, "step": 733 }, { "epoch": 0.36, "grad_norm": 0.15040484070777893, "learning_rate": 7.388769855109759e-05, "loss": 0.4121, "step": 734 }, { "epoch": 0.36, "grad_norm": 0.1626395732164383, "learning_rate": 7.381765633210497e-05, "loss": 0.4476, "step": 735 }, { "epoch": 0.36, "grad_norm": 0.1596745252609253, "learning_rate": 7.374755360335253e-05, "loss": 0.4476, "step": 736 }, { "epoch": 0.36, "grad_norm": 0.13897104561328888, "learning_rate": 7.367739054293914e-05, "loss": 0.4126, "step": 737 }, { "epoch": 0.36, "grad_norm": 0.13909079134464264, "learning_rate": 7.360716732911706e-05, "loss": 0.4302, "step": 738 }, { "epoch": 0.36, "grad_norm": 0.14715461432933807, "learning_rate": 7.353688414029132e-05, "loss": 0.436, "step": 739 }, { "epoch": 0.36, "grad_norm": 0.1493682861328125, "learning_rate": 7.346654115501929e-05, "loss": 0.4237, "step": 740 }, { "epoch": 0.36, "grad_norm": 0.1293671578168869, "learning_rate": 7.339613855201032e-05, "loss": 0.4226, "step": 741 }, { "epoch": 0.37, "grad_norm": 0.14128343760967255, "learning_rate": 7.332567651012518e-05, "loss": 0.4699, "step": 742 }, { "epoch": 0.37, "grad_norm": 0.16285786032676697, "learning_rate": 7.325515520837565e-05, "loss": 0.4959, "step": 743 }, { "epoch": 0.37, "grad_norm": 0.16623514890670776, "learning_rate": 7.318457482592409e-05, "loss": 0.4337, "step": 744 }, { "epoch": 0.37, "grad_norm": 0.15882305800914764, "learning_rate": 7.311393554208292e-05, "loss": 0.4932, "step": 745 }, { "epoch": 0.37, "grad_norm": 0.1348472237586975, "learning_rate": 7.304323753631423e-05, "loss": 0.4238, "step": 746 }, { "epoch": 0.37, "grad_norm": 0.14953245222568512, "learning_rate": 7.297248098822926e-05, "loss": 0.4481, "step": 747 }, { "epoch": 0.37, "grad_norm": 0.14031442999839783, "learning_rate": 7.290166607758804e-05, "loss": 0.4843, "step": 748 }, { "epoch": 0.37, "grad_norm": 0.1400069296360016, "learning_rate": 7.28307929842988e-05, "loss": 0.4112, "step": 749 }, { "epoch": 0.37, "grad_norm": 0.14739136397838593, "learning_rate": 7.275986188841764e-05, "loss": 0.4272, "step": 750 }, { "epoch": 0.37, "grad_norm": 0.1464834064245224, "learning_rate": 7.2688872970148e-05, "loss": 0.4459, "step": 751 }, { "epoch": 0.37, "grad_norm": 0.14527562260627747, "learning_rate": 7.26178264098402e-05, "loss": 0.4311, "step": 752 }, { "epoch": 0.37, "grad_norm": 0.1288638561964035, "learning_rate": 7.254672238799107e-05, "loss": 0.4475, "step": 753 }, { "epoch": 0.37, "grad_norm": 0.15614116191864014, "learning_rate": 7.247556108524329e-05, "loss": 0.4926, "step": 754 }, { "epoch": 0.37, "grad_norm": 0.13819308578968048, "learning_rate": 7.240434268238524e-05, "loss": 0.4679, "step": 755 }, { "epoch": 0.37, "grad_norm": 0.14042241871356964, "learning_rate": 7.233306736035022e-05, "loss": 0.4457, "step": 756 }, { "epoch": 0.37, "grad_norm": 0.14200957119464874, "learning_rate": 7.226173530021619e-05, "loss": 0.4327, "step": 757 }, { "epoch": 0.37, "grad_norm": 0.14008744060993195, "learning_rate": 7.219034668320528e-05, "loss": 0.4653, "step": 758 }, { "epoch": 0.37, "grad_norm": 0.1547507792711258, "learning_rate": 7.211890169068328e-05, "loss": 0.4496, "step": 759 }, { "epoch": 0.37, "grad_norm": 0.13638268411159515, "learning_rate": 7.204740050415917e-05, "loss": 0.3919, "step": 760 }, { "epoch": 0.37, "grad_norm": 0.13298386335372925, "learning_rate": 7.197584330528478e-05, "loss": 0.4273, "step": 761 }, { "epoch": 0.37, "grad_norm": 0.13216255605220795, "learning_rate": 7.190423027585414e-05, "loss": 0.3613, "step": 762 }, { "epoch": 0.38, "grad_norm": 0.1576545387506485, "learning_rate": 7.18325615978032e-05, "loss": 0.4643, "step": 763 }, { "epoch": 0.38, "grad_norm": 0.14647802710533142, "learning_rate": 7.176083745320925e-05, "loss": 0.4625, "step": 764 }, { "epoch": 0.38, "grad_norm": 0.14687716960906982, "learning_rate": 7.168905802429052e-05, "loss": 0.4021, "step": 765 }, { "epoch": 0.38, "grad_norm": 0.15105555951595306, "learning_rate": 7.161722349340563e-05, "loss": 0.4426, "step": 766 }, { "epoch": 0.38, "grad_norm": 0.15849849581718445, "learning_rate": 7.154533404305328e-05, "loss": 0.4804, "step": 767 }, { "epoch": 0.38, "grad_norm": 0.14763112366199493, "learning_rate": 7.147338985587162e-05, "loss": 0.4135, "step": 768 }, { "epoch": 0.38, "grad_norm": 0.153654545545578, "learning_rate": 7.14013911146379e-05, "loss": 0.4638, "step": 769 }, { "epoch": 0.38, "grad_norm": 0.1571790724992752, "learning_rate": 7.132933800226796e-05, "loss": 0.4303, "step": 770 }, { "epoch": 0.38, "grad_norm": 0.14765222370624542, "learning_rate": 7.125723070181576e-05, "loss": 0.4449, "step": 771 }, { "epoch": 0.38, "grad_norm": 0.14503902196884155, "learning_rate": 7.118506939647295e-05, "loss": 0.3839, "step": 772 }, { "epoch": 0.38, "grad_norm": 0.1527874618768692, "learning_rate": 7.111285426956835e-05, "loss": 0.4437, "step": 773 }, { "epoch": 0.38, "grad_norm": 0.12923458218574524, "learning_rate": 7.104058550456756e-05, "loss": 0.3792, "step": 774 }, { "epoch": 0.38, "grad_norm": 0.15651898086071014, "learning_rate": 7.09682632850724e-05, "loss": 0.4394, "step": 775 }, { "epoch": 0.38, "grad_norm": 0.16697047650814056, "learning_rate": 7.089588779482052e-05, "loss": 0.4223, "step": 776 }, { "epoch": 0.38, "grad_norm": 0.15254193544387817, "learning_rate": 7.082345921768491e-05, "loss": 0.426, "step": 777 }, { "epoch": 0.38, "grad_norm": 0.14779062569141388, "learning_rate": 7.075097773767343e-05, "loss": 0.4572, "step": 778 }, { "epoch": 0.38, "grad_norm": 0.14753121137619019, "learning_rate": 7.067844353892831e-05, "loss": 0.4006, "step": 779 }, { "epoch": 0.38, "grad_norm": 0.1578618288040161, "learning_rate": 7.060585680572578e-05, "loss": 0.4575, "step": 780 }, { "epoch": 0.38, "grad_norm": 0.148184135556221, "learning_rate": 7.053321772247545e-05, "loss": 0.4335, "step": 781 }, { "epoch": 0.38, "grad_norm": 0.1484382450580597, "learning_rate": 7.046052647372e-05, "loss": 0.4232, "step": 782 }, { "epoch": 0.39, "grad_norm": 0.14367879927158356, "learning_rate": 7.038778324413462e-05, "loss": 0.4067, "step": 783 }, { "epoch": 0.39, "grad_norm": 0.14896979928016663, "learning_rate": 7.031498821852653e-05, "loss": 0.4451, "step": 784 }, { "epoch": 0.39, "grad_norm": 0.1551053524017334, "learning_rate": 7.024214158183456e-05, "loss": 0.4495, "step": 785 }, { "epoch": 0.39, "grad_norm": 0.153823122382164, "learning_rate": 7.016924351912868e-05, "loss": 0.4385, "step": 786 }, { "epoch": 0.39, "grad_norm": 0.1408875733613968, "learning_rate": 7.009629421560946e-05, "loss": 0.426, "step": 787 }, { "epoch": 0.39, "grad_norm": 0.1543664187192917, "learning_rate": 7.002329385660772e-05, "loss": 0.4101, "step": 788 }, { "epoch": 0.39, "grad_norm": 0.1672615259885788, "learning_rate": 6.995024262758392e-05, "loss": 0.4246, "step": 789 }, { "epoch": 0.39, "grad_norm": 0.14263275265693665, "learning_rate": 6.98771407141278e-05, "loss": 0.427, "step": 790 }, { "epoch": 0.39, "grad_norm": 0.15519587695598602, "learning_rate": 6.980398830195785e-05, "loss": 0.4744, "step": 791 }, { "epoch": 0.39, "grad_norm": 0.13808229565620422, "learning_rate": 6.973078557692085e-05, "loss": 0.4541, "step": 792 }, { "epoch": 0.39, "grad_norm": 0.16556517779827118, "learning_rate": 6.965753272499141e-05, "loss": 0.4624, "step": 793 }, { "epoch": 0.39, "grad_norm": 0.1323944628238678, "learning_rate": 6.958422993227151e-05, "loss": 0.4373, "step": 794 }, { "epoch": 0.39, "grad_norm": 0.14971445500850677, "learning_rate": 6.951087738498994e-05, "loss": 0.4561, "step": 795 }, { "epoch": 0.39, "grad_norm": 0.1549205482006073, "learning_rate": 6.943747526950198e-05, "loss": 0.4329, "step": 796 }, { "epoch": 0.39, "grad_norm": 0.14359986782073975, "learning_rate": 6.936402377228879e-05, "loss": 0.4143, "step": 797 }, { "epoch": 0.39, "grad_norm": 0.15592825412750244, "learning_rate": 6.929052307995698e-05, "loss": 0.4226, "step": 798 }, { "epoch": 0.39, "grad_norm": 0.15207093954086304, "learning_rate": 6.921697337923817e-05, "loss": 0.4609, "step": 799 }, { "epoch": 0.39, "grad_norm": 0.1452329307794571, "learning_rate": 6.914337485698845e-05, "loss": 0.4505, "step": 800 }, { "epoch": 0.39, "grad_norm": 0.13710728287696838, "learning_rate": 6.906972770018802e-05, "loss": 0.4408, "step": 801 }, { "epoch": 0.39, "grad_norm": 0.14112155139446259, "learning_rate": 6.899603209594052e-05, "loss": 0.4722, "step": 802 }, { "epoch": 0.4, "grad_norm": 0.14787013828754425, "learning_rate": 6.89222882314728e-05, "loss": 0.4089, "step": 803 }, { "epoch": 0.4, "grad_norm": 0.1732758730649948, "learning_rate": 6.884849629413421e-05, "loss": 0.4405, "step": 804 }, { "epoch": 0.4, "grad_norm": 0.16073428094387054, "learning_rate": 6.877465647139632e-05, "loss": 0.4777, "step": 805 }, { "epoch": 0.4, "grad_norm": 0.14995670318603516, "learning_rate": 6.870076895085227e-05, "loss": 0.4288, "step": 806 }, { "epoch": 0.4, "grad_norm": 0.1596534103155136, "learning_rate": 6.862683392021644e-05, "loss": 0.4389, "step": 807 }, { "epoch": 0.4, "grad_norm": 0.16381408274173737, "learning_rate": 6.855285156732389e-05, "loss": 0.4363, "step": 808 }, { "epoch": 0.4, "grad_norm": 0.15220500528812408, "learning_rate": 6.847882208012991e-05, "loss": 0.4065, "step": 809 }, { "epoch": 0.4, "grad_norm": 0.14427056908607483, "learning_rate": 6.840474564670953e-05, "loss": 0.4084, "step": 810 }, { "epoch": 0.4, "grad_norm": 0.14557424187660217, "learning_rate": 6.833062245525705e-05, "loss": 0.4433, "step": 811 }, { "epoch": 0.4, "grad_norm": 0.16300426423549652, "learning_rate": 6.825645269408556e-05, "loss": 0.473, "step": 812 }, { "epoch": 0.4, "grad_norm": 0.14467796683311462, "learning_rate": 6.818223655162646e-05, "loss": 0.4014, "step": 813 }, { "epoch": 0.4, "grad_norm": 0.1565333902835846, "learning_rate": 6.810797421642901e-05, "loss": 0.426, "step": 814 }, { "epoch": 0.4, "grad_norm": 0.15642747282981873, "learning_rate": 6.803366587715978e-05, "loss": 0.4523, "step": 815 }, { "epoch": 0.4, "grad_norm": 0.16147111356258392, "learning_rate": 6.795931172260224e-05, "loss": 0.4122, "step": 816 }, { "epoch": 0.4, "grad_norm": 0.15971477329730988, "learning_rate": 6.78849119416563e-05, "loss": 0.4234, "step": 817 }, { "epoch": 0.4, "grad_norm": 0.17470532655715942, "learning_rate": 6.781046672333765e-05, "loss": 0.4255, "step": 818 }, { "epoch": 0.4, "grad_norm": 0.17106963694095612, "learning_rate": 6.773597625677757e-05, "loss": 0.4338, "step": 819 }, { "epoch": 0.4, "grad_norm": 0.157113179564476, "learning_rate": 6.766144073122223e-05, "loss": 0.4241, "step": 820 }, { "epoch": 0.4, "grad_norm": 0.1626725047826767, "learning_rate": 6.758686033603225e-05, "loss": 0.4308, "step": 821 }, { "epoch": 0.4, "grad_norm": 0.16659627854824066, "learning_rate": 6.751223526068227e-05, "loss": 0.4444, "step": 822 }, { "epoch": 0.4, "grad_norm": 0.172677680850029, "learning_rate": 6.743756569476047e-05, "loss": 0.4407, "step": 823 }, { "epoch": 0.41, "grad_norm": 0.1640663594007492, "learning_rate": 6.736285182796798e-05, "loss": 0.4098, "step": 824 }, { "epoch": 0.41, "grad_norm": 0.1459806263446808, "learning_rate": 6.728809385011856e-05, "loss": 0.4109, "step": 825 }, { "epoch": 0.41, "grad_norm": 0.15432342886924744, "learning_rate": 6.721329195113801e-05, "loss": 0.4086, "step": 826 }, { "epoch": 0.41, "grad_norm": 0.14531828463077545, "learning_rate": 6.713844632106369e-05, "loss": 0.4282, "step": 827 }, { "epoch": 0.41, "grad_norm": 0.16352249681949615, "learning_rate": 6.706355715004408e-05, "loss": 0.4896, "step": 828 }, { "epoch": 0.41, "grad_norm": 0.17216593027114868, "learning_rate": 6.698862462833829e-05, "loss": 0.4791, "step": 829 }, { "epoch": 0.41, "grad_norm": 0.16557826101779938, "learning_rate": 6.691364894631554e-05, "loss": 0.413, "step": 830 }, { "epoch": 0.41, "grad_norm": 0.15508468449115753, "learning_rate": 6.683863029445469e-05, "loss": 0.4248, "step": 831 }, { "epoch": 0.41, "grad_norm": 0.17772945761680603, "learning_rate": 6.676356886334383e-05, "loss": 0.4614, "step": 832 }, { "epoch": 0.41, "grad_norm": 0.15081843733787537, "learning_rate": 6.668846484367964e-05, "loss": 0.4316, "step": 833 }, { "epoch": 0.41, "grad_norm": 0.16790862381458282, "learning_rate": 6.66133184262671e-05, "loss": 0.4364, "step": 834 }, { "epoch": 0.41, "grad_norm": 0.14499300718307495, "learning_rate": 6.653812980201882e-05, "loss": 0.4499, "step": 835 }, { "epoch": 0.41, "grad_norm": 0.1447850912809372, "learning_rate": 6.646289916195467e-05, "loss": 0.428, "step": 836 }, { "epoch": 0.41, "grad_norm": 0.15877428650856018, "learning_rate": 6.638762669720125e-05, "loss": 0.457, "step": 837 }, { "epoch": 0.41, "grad_norm": 0.1455867439508438, "learning_rate": 6.631231259899146e-05, "loss": 0.4588, "step": 838 }, { "epoch": 0.41, "grad_norm": 0.15158191323280334, "learning_rate": 6.623695705866394e-05, "loss": 0.471, "step": 839 }, { "epoch": 0.41, "grad_norm": 0.17149366438388824, "learning_rate": 6.616156026766259e-05, "loss": 0.4931, "step": 840 }, { "epoch": 0.41, "grad_norm": 0.1557389795780182, "learning_rate": 6.608612241753614e-05, "loss": 0.4669, "step": 841 }, { "epoch": 0.41, "grad_norm": 0.1694549322128296, "learning_rate": 6.601064369993766e-05, "loss": 0.4715, "step": 842 }, { "epoch": 0.41, "grad_norm": 0.15540090203285217, "learning_rate": 6.593512430662398e-05, "loss": 0.4753, "step": 843 }, { "epoch": 0.42, "grad_norm": 0.15908129513263702, "learning_rate": 6.585956442945532e-05, "loss": 0.4538, "step": 844 }, { "epoch": 0.42, "grad_norm": 0.16982631385326385, "learning_rate": 6.57839642603947e-05, "loss": 0.4307, "step": 845 }, { "epoch": 0.42, "grad_norm": 0.1509215086698532, "learning_rate": 6.570832399150757e-05, "loss": 0.4734, "step": 846 }, { "epoch": 0.42, "grad_norm": 0.14199353754520416, "learning_rate": 6.563264381496123e-05, "loss": 0.4347, "step": 847 }, { "epoch": 0.42, "grad_norm": 0.1764611452817917, "learning_rate": 6.555692392302434e-05, "loss": 0.4659, "step": 848 }, { "epoch": 0.42, "grad_norm": 0.14893242716789246, "learning_rate": 6.548116450806649e-05, "loss": 0.4495, "step": 849 }, { "epoch": 0.42, "grad_norm": 0.1434950828552246, "learning_rate": 6.540536576255763e-05, "loss": 0.4237, "step": 850 }, { "epoch": 0.42, "grad_norm": 0.16458651423454285, "learning_rate": 6.532952787906771e-05, "loss": 0.4558, "step": 851 }, { "epoch": 0.42, "grad_norm": 0.15058903396129608, "learning_rate": 6.525365105026605e-05, "loss": 0.479, "step": 852 }, { "epoch": 0.42, "grad_norm": 0.1442369669675827, "learning_rate": 6.517773546892093e-05, "loss": 0.4661, "step": 853 }, { "epoch": 0.42, "grad_norm": 0.15443040430545807, "learning_rate": 6.51017813278991e-05, "loss": 0.4535, "step": 854 }, { "epoch": 0.42, "grad_norm": 0.15055908262729645, "learning_rate": 6.502578882016523e-05, "loss": 0.4402, "step": 855 }, { "epoch": 0.42, "grad_norm": 0.14989763498306274, "learning_rate": 6.494975813878152e-05, "loss": 0.4436, "step": 856 }, { "epoch": 0.42, "grad_norm": 0.1594366729259491, "learning_rate": 6.48736894769071e-05, "loss": 0.4633, "step": 857 }, { "epoch": 0.42, "grad_norm": 0.15897324681282043, "learning_rate": 6.479758302779764e-05, "loss": 0.4283, "step": 858 }, { "epoch": 0.42, "grad_norm": 0.16383293271064758, "learning_rate": 6.472143898480474e-05, "loss": 0.4507, "step": 859 }, { "epoch": 0.42, "grad_norm": 0.14080683887004852, "learning_rate": 6.464525754137557e-05, "loss": 0.4263, "step": 860 }, { "epoch": 0.42, "grad_norm": 0.1480110138654709, "learning_rate": 6.45690388910523e-05, "loss": 0.4205, "step": 861 }, { "epoch": 0.42, "grad_norm": 0.183593288064003, "learning_rate": 6.449278322747163e-05, "loss": 0.4776, "step": 862 }, { "epoch": 0.42, "grad_norm": 0.17068292200565338, "learning_rate": 6.441649074436428e-05, "loss": 0.4166, "step": 863 }, { "epoch": 0.43, "grad_norm": 0.15425418317317963, "learning_rate": 6.434016163555452e-05, "loss": 0.4439, "step": 864 }, { "epoch": 0.43, "grad_norm": 0.147231325507164, "learning_rate": 6.426379609495967e-05, "loss": 0.4192, "step": 865 }, { "epoch": 0.43, "grad_norm": 0.1655813753604889, "learning_rate": 6.41873943165896e-05, "loss": 0.4935, "step": 866 }, { "epoch": 0.43, "grad_norm": 0.17077775299549103, "learning_rate": 6.411095649454625e-05, "loss": 0.4201, "step": 867 }, { "epoch": 0.43, "grad_norm": 0.1619710475206375, "learning_rate": 6.403448282302313e-05, "loss": 0.4319, "step": 868 }, { "epoch": 0.43, "grad_norm": 0.15145759284496307, "learning_rate": 6.395797349630483e-05, "loss": 0.424, "step": 869 }, { "epoch": 0.43, "grad_norm": 0.1487855166196823, "learning_rate": 6.388142870876652e-05, "loss": 0.4506, "step": 870 }, { "epoch": 0.43, "grad_norm": 0.16325931251049042, "learning_rate": 6.380484865487347e-05, "loss": 0.4375, "step": 871 }, { "epoch": 0.43, "grad_norm": 0.15605948865413666, "learning_rate": 6.372823352918049e-05, "loss": 0.4219, "step": 872 }, { "epoch": 0.43, "grad_norm": 0.1474214494228363, "learning_rate": 6.365158352633156e-05, "loss": 0.408, "step": 873 }, { "epoch": 0.43, "grad_norm": 0.14674103260040283, "learning_rate": 6.357489884105927e-05, "loss": 0.4274, "step": 874 }, { "epoch": 0.43, "grad_norm": 0.16598211228847504, "learning_rate": 6.349817966818428e-05, "loss": 0.469, "step": 875 }, { "epoch": 0.43, "grad_norm": 0.14640849828720093, "learning_rate": 6.342142620261487e-05, "loss": 0.445, "step": 876 }, { "epoch": 0.43, "grad_norm": 0.16052457690238953, "learning_rate": 6.334463863934645e-05, "loss": 0.4409, "step": 877 }, { "epoch": 0.43, "grad_norm": 0.1524929404258728, "learning_rate": 6.326781717346109e-05, "loss": 0.4326, "step": 878 }, { "epoch": 0.43, "grad_norm": 0.14336341619491577, "learning_rate": 6.319096200012693e-05, "loss": 0.4171, "step": 879 }, { "epoch": 0.43, "grad_norm": 0.15154996514320374, "learning_rate": 6.311407331459781e-05, "loss": 0.4853, "step": 880 }, { "epoch": 0.43, "grad_norm": 0.17360936105251312, "learning_rate": 6.303715131221264e-05, "loss": 0.4961, "step": 881 }, { "epoch": 0.43, "grad_norm": 0.1641799658536911, "learning_rate": 6.296019618839505e-05, "loss": 0.4812, "step": 882 }, { "epoch": 0.43, "grad_norm": 0.1534123718738556, "learning_rate": 6.288320813865274e-05, "loss": 0.4516, "step": 883 }, { "epoch": 0.43, "grad_norm": 0.16091935336589813, "learning_rate": 6.280618735857712e-05, "loss": 0.4553, "step": 884 }, { "epoch": 0.44, "grad_norm": 0.15887701511383057, "learning_rate": 6.272913404384269e-05, "loss": 0.4265, "step": 885 }, { "epoch": 0.44, "grad_norm": 0.1367524415254593, "learning_rate": 6.265204839020671e-05, "loss": 0.3835, "step": 886 }, { "epoch": 0.44, "grad_norm": 0.15519799292087555, "learning_rate": 6.257493059350848e-05, "loss": 0.3845, "step": 887 }, { "epoch": 0.44, "grad_norm": 0.1732037216424942, "learning_rate": 6.249778084966902e-05, "loss": 0.4345, "step": 888 }, { "epoch": 0.44, "grad_norm": 0.177877739071846, "learning_rate": 6.242059935469051e-05, "loss": 0.4673, "step": 889 }, { "epoch": 0.44, "grad_norm": 0.15576790273189545, "learning_rate": 6.234338630465582e-05, "loss": 0.446, "step": 890 }, { "epoch": 0.44, "grad_norm": 0.153909370303154, "learning_rate": 6.22661418957279e-05, "loss": 0.4306, "step": 891 }, { "epoch": 0.44, "grad_norm": 0.14537866413593292, "learning_rate": 6.218886632414948e-05, "loss": 0.4211, "step": 892 }, { "epoch": 0.44, "grad_norm": 0.16007272899150848, "learning_rate": 6.211155978624239e-05, "loss": 0.4071, "step": 893 }, { "epoch": 0.44, "grad_norm": 0.1608007699251175, "learning_rate": 6.203422247840714e-05, "loss": 0.4556, "step": 894 }, { "epoch": 0.44, "grad_norm": 0.15606263279914856, "learning_rate": 6.195685459712243e-05, "loss": 0.3957, "step": 895 }, { "epoch": 0.44, "grad_norm": 0.15933269262313843, "learning_rate": 6.187945633894461e-05, "loss": 0.4687, "step": 896 }, { "epoch": 0.44, "grad_norm": 0.1741895079612732, "learning_rate": 6.180202790050724e-05, "loss": 0.439, "step": 897 }, { "epoch": 0.44, "grad_norm": 0.17336581647396088, "learning_rate": 6.172456947852049e-05, "loss": 0.4401, "step": 898 }, { "epoch": 0.44, "grad_norm": 0.16295744478702545, "learning_rate": 6.164708126977081e-05, "loss": 0.4412, "step": 899 }, { "epoch": 0.44, "grad_norm": 0.1655481606721878, "learning_rate": 6.156956347112019e-05, "loss": 0.451, "step": 900 }, { "epoch": 0.44, "grad_norm": 0.17833803594112396, "learning_rate": 6.149201627950593e-05, "loss": 0.4575, "step": 901 }, { "epoch": 0.44, "grad_norm": 0.165806844830513, "learning_rate": 6.141443989193988e-05, "loss": 0.4275, "step": 902 }, { "epoch": 0.44, "grad_norm": 0.16207672655582428, "learning_rate": 6.133683450550818e-05, "loss": 0.4547, "step": 903 }, { "epoch": 0.44, "grad_norm": 0.15352056920528412, "learning_rate": 6.125920031737054e-05, "loss": 0.4378, "step": 904 }, { "epoch": 0.45, "grad_norm": 0.17437709867954254, "learning_rate": 6.11815375247599e-05, "loss": 0.4742, "step": 905 }, { "epoch": 0.45, "grad_norm": 0.152421236038208, "learning_rate": 6.110384632498187e-05, "loss": 0.4387, "step": 906 }, { "epoch": 0.45, "grad_norm": 0.16105760633945465, "learning_rate": 6.1026126915414215e-05, "loss": 0.468, "step": 907 }, { "epoch": 0.45, "grad_norm": 0.15451014041900635, "learning_rate": 6.0948379493506355e-05, "loss": 0.423, "step": 908 }, { "epoch": 0.45, "grad_norm": 0.15793992578983307, "learning_rate": 6.087060425677891e-05, "loss": 0.4328, "step": 909 }, { "epoch": 0.45, "grad_norm": 0.14242716133594513, "learning_rate": 6.079280140282313e-05, "loss": 0.4172, "step": 910 }, { "epoch": 0.45, "grad_norm": 0.17085228860378265, "learning_rate": 6.071497112930047e-05, "loss": 0.4409, "step": 911 }, { "epoch": 0.45, "grad_norm": 0.16285860538482666, "learning_rate": 6.0637113633942e-05, "loss": 0.443, "step": 912 }, { "epoch": 0.45, "grad_norm": 0.1640520989894867, "learning_rate": 6.055922911454798e-05, "loss": 0.4243, "step": 913 }, { "epoch": 0.45, "grad_norm": 0.15653076767921448, "learning_rate": 6.04813177689873e-05, "loss": 0.4128, "step": 914 }, { "epoch": 0.45, "grad_norm": 0.16285474598407745, "learning_rate": 6.040337979519703e-05, "loss": 0.4429, "step": 915 }, { "epoch": 0.45, "grad_norm": 0.1596003919839859, "learning_rate": 6.032541539118187e-05, "loss": 0.4498, "step": 916 }, { "epoch": 0.45, "grad_norm": 0.167347252368927, "learning_rate": 6.0247424755013694e-05, "loss": 0.4647, "step": 917 }, { "epoch": 0.45, "grad_norm": 0.16328398883342743, "learning_rate": 6.016940808483097e-05, "loss": 0.4638, "step": 918 }, { "epoch": 0.45, "grad_norm": 0.15470579266548157, "learning_rate": 6.0091365578838365e-05, "loss": 0.4472, "step": 919 }, { "epoch": 0.45, "grad_norm": 0.15434837341308594, "learning_rate": 6.001329743530614e-05, "loss": 0.4362, "step": 920 }, { "epoch": 0.45, "grad_norm": 0.1569792926311493, "learning_rate": 5.99352038525697e-05, "loss": 0.4197, "step": 921 }, { "epoch": 0.45, "grad_norm": 0.16532695293426514, "learning_rate": 5.985708502902909e-05, "loss": 0.4565, "step": 922 }, { "epoch": 0.45, "grad_norm": 0.15995478630065918, "learning_rate": 5.9778941163148497e-05, "loss": 0.4428, "step": 923 }, { "epoch": 0.45, "grad_norm": 0.15316393971443176, "learning_rate": 5.970077245345566e-05, "loss": 0.4403, "step": 924 }, { "epoch": 0.46, "grad_norm": 0.16436900198459625, "learning_rate": 5.96225790985415e-05, "loss": 0.4337, "step": 925 }, { "epoch": 0.46, "grad_norm": 0.15238617360591888, "learning_rate": 5.9544361297059556e-05, "loss": 0.4153, "step": 926 }, { "epoch": 0.46, "grad_norm": 0.1665799915790558, "learning_rate": 5.946611924772542e-05, "loss": 0.4756, "step": 927 }, { "epoch": 0.46, "grad_norm": 0.16179361939430237, "learning_rate": 5.938785314931633e-05, "loss": 0.4232, "step": 928 }, { "epoch": 0.46, "grad_norm": 0.1443018615245819, "learning_rate": 5.9309563200670593e-05, "loss": 0.4668, "step": 929 }, { "epoch": 0.46, "grad_norm": 0.1784832626581192, "learning_rate": 5.9231249600687154e-05, "loss": 0.4767, "step": 930 }, { "epoch": 0.46, "grad_norm": 0.16211380064487457, "learning_rate": 5.9152912548324976e-05, "loss": 0.4262, "step": 931 }, { "epoch": 0.46, "grad_norm": 0.1452757865190506, "learning_rate": 5.907455224260268e-05, "loss": 0.4424, "step": 932 }, { "epoch": 0.46, "grad_norm": 0.1694449633359909, "learning_rate": 5.899616888259789e-05, "loss": 0.4319, "step": 933 }, { "epoch": 0.46, "grad_norm": 0.15538029372692108, "learning_rate": 5.891776266744686e-05, "loss": 0.4404, "step": 934 }, { "epoch": 0.46, "grad_norm": 0.14633725583553314, "learning_rate": 5.8839333796343874e-05, "loss": 0.4184, "step": 935 }, { "epoch": 0.46, "grad_norm": 0.15739810466766357, "learning_rate": 5.87608824685408e-05, "loss": 0.4189, "step": 936 }, { "epoch": 0.46, "grad_norm": 0.1564759463071823, "learning_rate": 5.868240888334653e-05, "loss": 0.4415, "step": 937 }, { "epoch": 0.46, "grad_norm": 0.16821657121181488, "learning_rate": 5.860391324012652e-05, "loss": 0.3961, "step": 938 }, { "epoch": 0.46, "grad_norm": 0.16510997712612152, "learning_rate": 5.8525395738302235e-05, "loss": 0.4219, "step": 939 }, { "epoch": 0.46, "grad_norm": 0.1607184112071991, "learning_rate": 5.844685657735075e-05, "loss": 0.455, "step": 940 }, { "epoch": 0.46, "grad_norm": 0.1897452175617218, "learning_rate": 5.836829595680406e-05, "loss": 0.4384, "step": 941 }, { "epoch": 0.46, "grad_norm": 0.16906006634235382, "learning_rate": 5.828971407624877e-05, "loss": 0.4286, "step": 942 }, { "epoch": 0.46, "grad_norm": 0.1656024307012558, "learning_rate": 5.8211111135325445e-05, "loss": 0.4464, "step": 943 }, { "epoch": 0.46, "grad_norm": 0.16191771626472473, "learning_rate": 5.813248733372818e-05, "loss": 0.4643, "step": 944 }, { "epoch": 0.46, "grad_norm": 0.1608259677886963, "learning_rate": 5.805384287120403e-05, "loss": 0.4227, "step": 945 }, { "epoch": 0.47, "grad_norm": 0.14724114537239075, "learning_rate": 5.797517794755261e-05, "loss": 0.4555, "step": 946 }, { "epoch": 0.47, "grad_norm": 0.1574697345495224, "learning_rate": 5.789649276262542e-05, "loss": 0.3946, "step": 947 }, { "epoch": 0.47, "grad_norm": 0.15362043678760529, "learning_rate": 5.781778751632555e-05, "loss": 0.4196, "step": 948 }, { "epoch": 0.47, "grad_norm": 0.16778577864170074, "learning_rate": 5.773906240860694e-05, "loss": 0.4314, "step": 949 }, { "epoch": 0.47, "grad_norm": 0.1526278704404831, "learning_rate": 5.7660317639474084e-05, "loss": 0.4394, "step": 950 }, { "epoch": 0.47, "grad_norm": 0.16373859345912933, "learning_rate": 5.758155340898137e-05, "loss": 0.4313, "step": 951 }, { "epoch": 0.47, "grad_norm": 0.15492920577526093, "learning_rate": 5.7502769917232635e-05, "loss": 0.4668, "step": 952 }, { "epoch": 0.47, "grad_norm": 0.16833825409412384, "learning_rate": 5.742396736438069e-05, "loss": 0.4395, "step": 953 }, { "epoch": 0.47, "grad_norm": 0.1713876575231552, "learning_rate": 5.734514595062672e-05, "loss": 0.3795, "step": 954 }, { "epoch": 0.47, "grad_norm": 0.16916221380233765, "learning_rate": 5.7266305876219864e-05, "loss": 0.4523, "step": 955 }, { "epoch": 0.47, "grad_norm": 0.1733112335205078, "learning_rate": 5.7187447341456636e-05, "loss": 0.4276, "step": 956 }, { "epoch": 0.47, "grad_norm": 0.16729463636875153, "learning_rate": 5.710857054668049e-05, "loss": 0.4327, "step": 957 }, { "epoch": 0.47, "grad_norm": 0.15378537774085999, "learning_rate": 5.7029675692281206e-05, "loss": 0.4185, "step": 958 }, { "epoch": 0.47, "grad_norm": 0.15887102484703064, "learning_rate": 5.6950762978694535e-05, "loss": 0.4378, "step": 959 }, { "epoch": 0.47, "grad_norm": 0.150125652551651, "learning_rate": 5.687183260640153e-05, "loss": 0.4173, "step": 960 }, { "epoch": 0.47, "grad_norm": 0.1464613974094391, "learning_rate": 5.679288477592815e-05, "loss": 0.4483, "step": 961 }, { "epoch": 0.47, "grad_norm": 0.1606331318616867, "learning_rate": 5.671391968784463e-05, "loss": 0.4225, "step": 962 }, { "epoch": 0.47, "grad_norm": 0.1531604379415512, "learning_rate": 5.663493754276518e-05, "loss": 0.3824, "step": 963 }, { "epoch": 0.47, "grad_norm": 0.18071341514587402, "learning_rate": 5.655593854134721e-05, "loss": 0.4597, "step": 964 }, { "epoch": 0.47, "grad_norm": 0.18037347495555878, "learning_rate": 5.647692288429104e-05, "loss": 0.4448, "step": 965 }, { "epoch": 0.48, "grad_norm": 0.17007885873317719, "learning_rate": 5.639789077233927e-05, "loss": 0.4405, "step": 966 }, { "epoch": 0.48, "grad_norm": 0.15510225296020508, "learning_rate": 5.631884240627632e-05, "loss": 0.4254, "step": 967 }, { "epoch": 0.48, "grad_norm": 0.14798195660114288, "learning_rate": 5.623977798692788e-05, "loss": 0.426, "step": 968 }, { "epoch": 0.48, "grad_norm": 0.1494758278131485, "learning_rate": 5.6160697715160445e-05, "loss": 0.3757, "step": 969 }, { "epoch": 0.48, "grad_norm": 0.16347622871398926, "learning_rate": 5.608160179188079e-05, "loss": 0.4585, "step": 970 }, { "epoch": 0.48, "grad_norm": 0.16009631752967834, "learning_rate": 5.600249041803545e-05, "loss": 0.4539, "step": 971 }, { "epoch": 0.48, "grad_norm": 0.14668318629264832, "learning_rate": 5.592336379461017e-05, "loss": 0.4215, "step": 972 }, { "epoch": 0.48, "grad_norm": 0.1720777153968811, "learning_rate": 5.584422212262951e-05, "loss": 0.4383, "step": 973 }, { "epoch": 0.48, "grad_norm": 0.17379383742809296, "learning_rate": 5.576506560315619e-05, "loss": 0.4572, "step": 974 }, { "epoch": 0.48, "grad_norm": 0.18097186088562012, "learning_rate": 5.568589443729071e-05, "loss": 0.4477, "step": 975 }, { "epoch": 0.48, "grad_norm": 0.1491256207227707, "learning_rate": 5.560670882617073e-05, "loss": 0.4035, "step": 976 }, { "epoch": 0.48, "grad_norm": 0.1577547788619995, "learning_rate": 5.5527508970970654e-05, "loss": 0.3851, "step": 977 }, { "epoch": 0.48, "grad_norm": 0.14161686599254608, "learning_rate": 5.544829507290106e-05, "loss": 0.379, "step": 978 }, { "epoch": 0.48, "grad_norm": 0.1505327969789505, "learning_rate": 5.536906733320816e-05, "loss": 0.3859, "step": 979 }, { "epoch": 0.48, "grad_norm": 0.16661107540130615, "learning_rate": 5.52898259531734e-05, "loss": 0.4432, "step": 980 }, { "epoch": 0.48, "grad_norm": 0.1739940345287323, "learning_rate": 5.5210571134112824e-05, "loss": 0.4897, "step": 981 }, { "epoch": 0.48, "grad_norm": 0.16543810069561005, "learning_rate": 5.513130307737666e-05, "loss": 0.444, "step": 982 }, { "epoch": 0.48, "grad_norm": 0.17440205812454224, "learning_rate": 5.505202198434872e-05, "loss": 0.4349, "step": 983 }, { "epoch": 0.48, "grad_norm": 0.17263653874397278, "learning_rate": 5.4972728056446e-05, "loss": 0.4382, "step": 984 }, { "epoch": 0.48, "grad_norm": 0.1637726128101349, "learning_rate": 5.4893421495118035e-05, "loss": 0.4222, "step": 985 }, { "epoch": 0.49, "grad_norm": 0.1486929953098297, "learning_rate": 5.4814102501846496e-05, "loss": 0.4291, "step": 986 }, { "epoch": 0.49, "grad_norm": 0.1591012328863144, "learning_rate": 5.473477127814464e-05, "loss": 0.4286, "step": 987 }, { "epoch": 0.49, "grad_norm": 0.16511763632297516, "learning_rate": 5.465542802555677e-05, "loss": 0.4358, "step": 988 }, { "epoch": 0.49, "grad_norm": 0.17221887409687042, "learning_rate": 5.457607294565776e-05, "loss": 0.425, "step": 989 }, { "epoch": 0.49, "grad_norm": 0.1773669421672821, "learning_rate": 5.4496706240052565e-05, "loss": 0.4204, "step": 990 }, { "epoch": 0.49, "grad_norm": 0.18131878972053528, "learning_rate": 5.44173281103756e-05, "loss": 0.4356, "step": 991 }, { "epoch": 0.49, "grad_norm": 0.15839515626430511, "learning_rate": 5.4337938758290394e-05, "loss": 0.4086, "step": 992 }, { "epoch": 0.49, "grad_norm": 0.16947250068187714, "learning_rate": 5.425853838548891e-05, "loss": 0.4515, "step": 993 }, { "epoch": 0.49, "grad_norm": 0.21210332214832306, "learning_rate": 5.417912719369116e-05, "loss": 0.4432, "step": 994 }, { "epoch": 0.49, "grad_norm": 0.17386673390865326, "learning_rate": 5.4099705384644614e-05, "loss": 0.4517, "step": 995 }, { "epoch": 0.49, "grad_norm": 0.15087562799453735, "learning_rate": 5.402027316012375e-05, "loss": 0.4427, "step": 996 }, { "epoch": 0.49, "grad_norm": 0.16693668067455292, "learning_rate": 5.394083072192944e-05, "loss": 0.4503, "step": 997 }, { "epoch": 0.49, "grad_norm": 0.16983500123023987, "learning_rate": 5.386137827188858e-05, "loss": 0.4565, "step": 998 }, { "epoch": 0.49, "grad_norm": 0.15319420397281647, "learning_rate": 5.378191601185345e-05, "loss": 0.451, "step": 999 }, { "epoch": 0.49, "grad_norm": 0.1558017134666443, "learning_rate": 5.370244414370129e-05, "loss": 0.4477, "step": 1000 }, { "epoch": 0.49, "grad_norm": 0.1685212254524231, "learning_rate": 5.362296286933371e-05, "loss": 0.4857, "step": 1001 }, { "epoch": 0.49, "grad_norm": 0.16392169892787933, "learning_rate": 5.354347239067625e-05, "loss": 0.4621, "step": 1002 }, { "epoch": 0.49, "grad_norm": 0.1676805168390274, "learning_rate": 5.346397290967783e-05, "loss": 0.4579, "step": 1003 }, { "epoch": 0.49, "grad_norm": 0.14531049132347107, "learning_rate": 5.338446462831021e-05, "loss": 0.3652, "step": 1004 }, { "epoch": 0.49, "grad_norm": 0.15470877289772034, "learning_rate": 5.330494774856755e-05, "loss": 0.4243, "step": 1005 }, { "epoch": 0.49, "grad_norm": 0.1716100126504898, "learning_rate": 5.3225422472465824e-05, "loss": 0.4625, "step": 1006 }, { "epoch": 0.5, "grad_norm": 0.15028026700019836, "learning_rate": 5.314588900204235e-05, "loss": 0.4015, "step": 1007 }, { "epoch": 0.5, "grad_norm": 0.15226700901985168, "learning_rate": 5.306634753935527e-05, "loss": 0.3911, "step": 1008 }, { "epoch": 0.5, "grad_norm": 0.1710689216852188, "learning_rate": 5.298679828648301e-05, "loss": 0.4167, "step": 1009 }, { "epoch": 0.5, "grad_norm": 0.19081401824951172, "learning_rate": 5.290724144552379e-05, "loss": 0.4356, "step": 1010 }, { "epoch": 0.5, "grad_norm": 0.16916236281394958, "learning_rate": 5.282767721859516e-05, "loss": 0.4583, "step": 1011 }, { "epoch": 0.5, "grad_norm": 0.17091983556747437, "learning_rate": 5.274810580783335e-05, "loss": 0.4627, "step": 1012 }, { "epoch": 0.5, "grad_norm": 0.15171006321907043, "learning_rate": 5.266852741539291e-05, "loss": 0.3841, "step": 1013 }, { "epoch": 0.5, "grad_norm": 0.1597701907157898, "learning_rate": 5.258894224344608e-05, "loss": 0.4686, "step": 1014 }, { "epoch": 0.5, "grad_norm": 0.14917601644992828, "learning_rate": 5.2509350494182365e-05, "loss": 0.4098, "step": 1015 }, { "epoch": 0.5, "grad_norm": 0.16674911975860596, "learning_rate": 5.242975236980793e-05, "loss": 0.4459, "step": 1016 }, { "epoch": 0.5, "grad_norm": 0.14154627919197083, "learning_rate": 5.2350148072545204e-05, "loss": 0.3895, "step": 1017 }, { "epoch": 0.5, "grad_norm": 0.16500206291675568, "learning_rate": 5.2270537804632214e-05, "loss": 0.4097, "step": 1018 }, { "epoch": 0.5, "grad_norm": 0.1625814586877823, "learning_rate": 5.219092176832224e-05, "loss": 0.4403, "step": 1019 }, { "epoch": 0.5, "grad_norm": 0.148121640086174, "learning_rate": 5.211130016588316e-05, "loss": 0.4475, "step": 1020 }, { "epoch": 0.5, "grad_norm": 0.16178402304649353, "learning_rate": 5.203167319959702e-05, "loss": 0.4058, "step": 1021 }, { "epoch": 0.5, "grad_norm": 0.16992266476154327, "learning_rate": 5.195204107175946e-05, "loss": 0.4568, "step": 1022 }, { "epoch": 0.5, "grad_norm": 0.17596228420734406, "learning_rate": 5.1872403984679294e-05, "loss": 0.4701, "step": 1023 }, { "epoch": 0.5, "grad_norm": 0.14288471639156342, "learning_rate": 5.1792762140677874e-05, "loss": 0.434, "step": 1024 }, { "epoch": 0.5, "grad_norm": 0.14827826619148254, "learning_rate": 5.171311574208868e-05, "loss": 0.4332, "step": 1025 }, { "epoch": 0.5, "grad_norm": 0.1783694177865982, "learning_rate": 5.163346499125672e-05, "loss": 0.4394, "step": 1026 }, { "epoch": 0.51, "grad_norm": 0.157792329788208, "learning_rate": 5.15538100905381e-05, "loss": 0.4169, "step": 1027 }, { "epoch": 0.51, "grad_norm": 0.1556575745344162, "learning_rate": 5.147415124229944e-05, "loss": 0.4062, "step": 1028 }, { "epoch": 0.51, "grad_norm": 0.1650574505329132, "learning_rate": 5.139448864891743e-05, "loss": 0.465, "step": 1029 }, { "epoch": 0.51, "grad_norm": 0.16151168942451477, "learning_rate": 5.1314822512778214e-05, "loss": 0.4482, "step": 1030 }, { "epoch": 0.51, "grad_norm": 0.157331645488739, "learning_rate": 5.123515303627698e-05, "loss": 0.43, "step": 1031 }, { "epoch": 0.51, "grad_norm": 0.1596032977104187, "learning_rate": 5.1155480421817414e-05, "loss": 0.4042, "step": 1032 }, { "epoch": 0.51, "grad_norm": 0.16871720552444458, "learning_rate": 5.1075804871811115e-05, "loss": 0.4472, "step": 1033 }, { "epoch": 0.51, "grad_norm": 0.1590101569890976, "learning_rate": 5.099612658867721e-05, "loss": 0.4323, "step": 1034 }, { "epoch": 0.51, "grad_norm": 0.14821843802928925, "learning_rate": 5.0916445774841694e-05, "loss": 0.4068, "step": 1035 }, { "epoch": 0.51, "grad_norm": 0.16836895048618317, "learning_rate": 5.083676263273708e-05, "loss": 0.4326, "step": 1036 }, { "epoch": 0.51, "grad_norm": 0.17555101215839386, "learning_rate": 5.075707736480171e-05, "loss": 0.4476, "step": 1037 }, { "epoch": 0.51, "grad_norm": 0.18806350231170654, "learning_rate": 5.067739017347939e-05, "loss": 0.4698, "step": 1038 }, { "epoch": 0.51, "grad_norm": 0.16602127254009247, "learning_rate": 5.0597701261218775e-05, "loss": 0.3916, "step": 1039 }, { "epoch": 0.51, "grad_norm": 0.16455860435962677, "learning_rate": 5.0518010830472925e-05, "loss": 0.405, "step": 1040 }, { "epoch": 0.51, "grad_norm": 0.1661609560251236, "learning_rate": 5.0438319083698714e-05, "loss": 0.405, "step": 1041 }, { "epoch": 0.51, "grad_norm": 0.1853649616241455, "learning_rate": 5.035862622335641e-05, "loss": 0.4232, "step": 1042 }, { "epoch": 0.51, "grad_norm": 0.18783442676067352, "learning_rate": 5.027893245190904e-05, "loss": 0.4341, "step": 1043 }, { "epoch": 0.51, "grad_norm": 0.18433260917663574, "learning_rate": 5.0199237971822056e-05, "loss": 0.4669, "step": 1044 }, { "epoch": 0.51, "grad_norm": 0.16763673722743988, "learning_rate": 5.011954298556257e-05, "loss": 0.4155, "step": 1045 }, { "epoch": 0.51, "grad_norm": 0.17437323927879333, "learning_rate": 5.003984769559912e-05, "loss": 0.4494, "step": 1046 }, { "epoch": 0.52, "grad_norm": 0.18701590597629547, "learning_rate": 4.9960152304400905e-05, "loss": 0.4717, "step": 1047 }, { "epoch": 0.52, "grad_norm": 0.16341975331306458, "learning_rate": 4.988045701443743e-05, "loss": 0.4199, "step": 1048 }, { "epoch": 0.52, "grad_norm": 0.16152223944664001, "learning_rate": 4.980076202817797e-05, "loss": 0.4423, "step": 1049 }, { "epoch": 0.52, "grad_norm": 0.16297519207000732, "learning_rate": 4.972106754809096e-05, "loss": 0.423, "step": 1050 }, { "epoch": 0.52, "grad_norm": 0.17564581334590912, "learning_rate": 4.9641373776643616e-05, "loss": 0.4355, "step": 1051 }, { "epoch": 0.52, "grad_norm": 0.17856962978839874, "learning_rate": 4.95616809163013e-05, "loss": 0.4416, "step": 1052 }, { "epoch": 0.52, "grad_norm": 0.16722428798675537, "learning_rate": 4.94819891695271e-05, "loss": 0.4506, "step": 1053 }, { "epoch": 0.52, "grad_norm": 0.16458894312381744, "learning_rate": 4.9402298738781236e-05, "loss": 0.4618, "step": 1054 }, { "epoch": 0.52, "grad_norm": 0.1548207551240921, "learning_rate": 4.9322609826520634e-05, "loss": 0.426, "step": 1055 }, { "epoch": 0.52, "grad_norm": 0.17390599846839905, "learning_rate": 4.9242922635198304e-05, "loss": 0.4639, "step": 1056 }, { "epoch": 0.52, "grad_norm": 0.19197556376457214, "learning_rate": 4.916323736726295e-05, "loss": 0.4175, "step": 1057 }, { "epoch": 0.52, "grad_norm": 0.164363831281662, "learning_rate": 4.908355422515832e-05, "loss": 0.4401, "step": 1058 }, { "epoch": 0.52, "grad_norm": 0.17498661577701569, "learning_rate": 4.900387341132282e-05, "loss": 0.4203, "step": 1059 }, { "epoch": 0.52, "grad_norm": 0.1608424037694931, "learning_rate": 4.892419512818889e-05, "loss": 0.4386, "step": 1060 }, { "epoch": 0.52, "grad_norm": 0.16727614402770996, "learning_rate": 4.8844519578182604e-05, "loss": 0.4382, "step": 1061 }, { "epoch": 0.52, "grad_norm": 0.1503480225801468, "learning_rate": 4.876484696372303e-05, "loss": 0.4281, "step": 1062 }, { "epoch": 0.52, "grad_norm": 0.16535915434360504, "learning_rate": 4.8685177487221804e-05, "loss": 0.4382, "step": 1063 }, { "epoch": 0.52, "grad_norm": 0.16789019107818604, "learning_rate": 4.860551135108259e-05, "loss": 0.4269, "step": 1064 }, { "epoch": 0.52, "grad_norm": 0.18806110322475433, "learning_rate": 4.852584875770058e-05, "loss": 0.4825, "step": 1065 }, { "epoch": 0.52, "grad_norm": 0.17548543214797974, "learning_rate": 4.844618990946191e-05, "loss": 0.466, "step": 1066 }, { "epoch": 0.52, "grad_norm": 0.1700591742992401, "learning_rate": 4.8366535008743305e-05, "loss": 0.4231, "step": 1067 }, { "epoch": 0.53, "grad_norm": 0.17217998206615448, "learning_rate": 4.828688425791134e-05, "loss": 0.4122, "step": 1068 }, { "epoch": 0.53, "grad_norm": 0.1419622004032135, "learning_rate": 4.8207237859322144e-05, "loss": 0.3765, "step": 1069 }, { "epoch": 0.53, "grad_norm": 0.17649981379508972, "learning_rate": 4.812759601532071e-05, "loss": 0.4874, "step": 1070 }, { "epoch": 0.53, "grad_norm": 0.1802574098110199, "learning_rate": 4.804795892824056e-05, "loss": 0.4545, "step": 1071 }, { "epoch": 0.53, "grad_norm": 0.16465520858764648, "learning_rate": 4.7968326800402995e-05, "loss": 0.4209, "step": 1072 }, { "epoch": 0.53, "grad_norm": 0.1775420755147934, "learning_rate": 4.7888699834116865e-05, "loss": 0.4328, "step": 1073 }, { "epoch": 0.53, "grad_norm": 0.15878228843212128, "learning_rate": 4.7809078231677764e-05, "loss": 0.3507, "step": 1074 }, { "epoch": 0.53, "grad_norm": 0.16620752215385437, "learning_rate": 4.772946219536781e-05, "loss": 0.4397, "step": 1075 }, { "epoch": 0.53, "grad_norm": 0.17258423566818237, "learning_rate": 4.7649851927454814e-05, "loss": 0.4448, "step": 1076 }, { "epoch": 0.53, "grad_norm": 0.15681672096252441, "learning_rate": 4.757024763019209e-05, "loss": 0.3954, "step": 1077 }, { "epoch": 0.53, "grad_norm": 0.15691447257995605, "learning_rate": 4.749064950581765e-05, "loss": 0.4451, "step": 1078 }, { "epoch": 0.53, "grad_norm": 0.16387434303760529, "learning_rate": 4.7411057756553945e-05, "loss": 0.4268, "step": 1079 }, { "epoch": 0.53, "grad_norm": 0.15970174968242645, "learning_rate": 4.73314725846071e-05, "loss": 0.4099, "step": 1080 }, { "epoch": 0.53, "grad_norm": 0.179684117436409, "learning_rate": 4.7251894192166654e-05, "loss": 0.437, "step": 1081 }, { "epoch": 0.53, "grad_norm": 0.1771576851606369, "learning_rate": 4.717232278140485e-05, "loss": 0.4209, "step": 1082 }, { "epoch": 0.53, "grad_norm": 0.195830836892128, "learning_rate": 4.709275855447621e-05, "loss": 0.4434, "step": 1083 }, { "epoch": 0.53, "grad_norm": 0.19978076219558716, "learning_rate": 4.7013201713517e-05, "loss": 0.4612, "step": 1084 }, { "epoch": 0.53, "grad_norm": 0.17799879610538483, "learning_rate": 4.693365246064475e-05, "loss": 0.4277, "step": 1085 }, { "epoch": 0.53, "grad_norm": 0.173565074801445, "learning_rate": 4.6854110997957654e-05, "loss": 0.425, "step": 1086 }, { "epoch": 0.53, "grad_norm": 0.17417378723621368, "learning_rate": 4.6774577527534195e-05, "loss": 0.4122, "step": 1087 }, { "epoch": 0.54, "grad_norm": 0.186448872089386, "learning_rate": 4.6695052251432455e-05, "loss": 0.443, "step": 1088 }, { "epoch": 0.54, "grad_norm": 0.17259927093982697, "learning_rate": 4.66155353716898e-05, "loss": 0.417, "step": 1089 }, { "epoch": 0.54, "grad_norm": 0.18993909657001495, "learning_rate": 4.653602709032218e-05, "loss": 0.4823, "step": 1090 }, { "epoch": 0.54, "grad_norm": 0.17361067235469818, "learning_rate": 4.645652760932376e-05, "loss": 0.4091, "step": 1091 }, { "epoch": 0.54, "grad_norm": 0.18706758320331573, "learning_rate": 4.63770371306663e-05, "loss": 0.4741, "step": 1092 }, { "epoch": 0.54, "grad_norm": 0.16696308553218842, "learning_rate": 4.629755585629873e-05, "loss": 0.4056, "step": 1093 }, { "epoch": 0.54, "grad_norm": 0.16241039335727692, "learning_rate": 4.621808398814656e-05, "loss": 0.454, "step": 1094 }, { "epoch": 0.54, "grad_norm": 0.18459995090961456, "learning_rate": 4.613862172811144e-05, "loss": 0.4407, "step": 1095 }, { "epoch": 0.54, "grad_norm": 0.1706986278295517, "learning_rate": 4.605916927807057e-05, "loss": 0.3929, "step": 1096 }, { "epoch": 0.54, "grad_norm": 0.17266102135181427, "learning_rate": 4.597972683987628e-05, "loss": 0.4725, "step": 1097 }, { "epoch": 0.54, "grad_norm": 0.16350972652435303, "learning_rate": 4.590029461535539e-05, "loss": 0.4022, "step": 1098 }, { "epoch": 0.54, "grad_norm": 0.16346822679042816, "learning_rate": 4.5820872806308854e-05, "loss": 0.4047, "step": 1099 }, { "epoch": 0.54, "grad_norm": 0.1691911220550537, "learning_rate": 4.57414616145111e-05, "loss": 0.4223, "step": 1100 }, { "epoch": 0.54, "grad_norm": 0.20355074107646942, "learning_rate": 4.566206124170963e-05, "loss": 0.4883, "step": 1101 }, { "epoch": 0.54, "grad_norm": 0.16469252109527588, "learning_rate": 4.5582671889624414e-05, "loss": 0.4165, "step": 1102 }, { "epoch": 0.54, "grad_norm": 0.207878977060318, "learning_rate": 4.550329375994746e-05, "loss": 0.4596, "step": 1103 }, { "epoch": 0.54, "grad_norm": 0.2029222846031189, "learning_rate": 4.542392705434225e-05, "loss": 0.4337, "step": 1104 }, { "epoch": 0.54, "grad_norm": 0.17208030819892883, "learning_rate": 4.534457197444325e-05, "loss": 0.3824, "step": 1105 }, { "epoch": 0.54, "grad_norm": 0.2177746742963791, "learning_rate": 4.5265228721855374e-05, "loss": 0.4108, "step": 1106 }, { "epoch": 0.54, "grad_norm": 0.16845650970935822, "learning_rate": 4.5185897498153516e-05, "loss": 0.4428, "step": 1107 }, { "epoch": 0.55, "grad_norm": 0.18296177685260773, "learning_rate": 4.510657850488198e-05, "loss": 0.4316, "step": 1108 }, { "epoch": 0.55, "grad_norm": 0.1718183308839798, "learning_rate": 4.502727194355402e-05, "loss": 0.4269, "step": 1109 }, { "epoch": 0.55, "grad_norm": 0.17277958989143372, "learning_rate": 4.494797801565129e-05, "loss": 0.4044, "step": 1110 }, { "epoch": 0.55, "grad_norm": 0.16493889689445496, "learning_rate": 4.486869692262337e-05, "loss": 0.4355, "step": 1111 }, { "epoch": 0.55, "grad_norm": 0.17258916795253754, "learning_rate": 4.478942886588719e-05, "loss": 0.4346, "step": 1112 }, { "epoch": 0.55, "grad_norm": 0.16142186522483826, "learning_rate": 4.4710174046826624e-05, "loss": 0.4109, "step": 1113 }, { "epoch": 0.55, "grad_norm": 0.13949555158615112, "learning_rate": 4.463093266679185e-05, "loss": 0.4077, "step": 1114 }, { "epoch": 0.55, "grad_norm": 0.17025431990623474, "learning_rate": 4.455170492709897e-05, "loss": 0.4288, "step": 1115 }, { "epoch": 0.55, "grad_norm": 0.14815957844257355, "learning_rate": 4.447249102902936e-05, "loss": 0.4048, "step": 1116 }, { "epoch": 0.55, "grad_norm": 0.18268819153308868, "learning_rate": 4.439329117382929e-05, "loss": 0.4291, "step": 1117 }, { "epoch": 0.55, "grad_norm": 0.15350164473056793, "learning_rate": 4.4314105562709307e-05, "loss": 0.41, "step": 1118 }, { "epoch": 0.55, "grad_norm": 0.16633598506450653, "learning_rate": 4.423493439684384e-05, "loss": 0.4251, "step": 1119 }, { "epoch": 0.55, "grad_norm": 0.19753322005271912, "learning_rate": 4.415577787737051e-05, "loss": 0.4584, "step": 1120 }, { "epoch": 0.55, "grad_norm": 0.15714702010154724, "learning_rate": 4.407663620538985e-05, "loss": 0.4209, "step": 1121 }, { "epoch": 0.55, "grad_norm": 0.16098181903362274, "learning_rate": 4.399750958196456e-05, "loss": 0.3841, "step": 1122 }, { "epoch": 0.55, "grad_norm": 0.18159626424312592, "learning_rate": 4.3918398208119225e-05, "loss": 0.441, "step": 1123 }, { "epoch": 0.55, "grad_norm": 0.19437649846076965, "learning_rate": 4.383930228483956e-05, "loss": 0.4316, "step": 1124 }, { "epoch": 0.55, "grad_norm": 0.1582677811384201, "learning_rate": 4.376022201307215e-05, "loss": 0.4152, "step": 1125 }, { "epoch": 0.55, "grad_norm": 0.1664888709783554, "learning_rate": 4.36811575937237e-05, "loss": 0.4246, "step": 1126 }, { "epoch": 0.55, "grad_norm": 0.20477506518363953, "learning_rate": 4.360210922766076e-05, "loss": 0.4244, "step": 1127 }, { "epoch": 0.55, "grad_norm": 0.1682632565498352, "learning_rate": 4.352307711570897e-05, "loss": 0.4094, "step": 1128 }, { "epoch": 0.56, "grad_norm": 0.19064323604106903, "learning_rate": 4.3444061458652816e-05, "loss": 0.4257, "step": 1129 }, { "epoch": 0.56, "grad_norm": 0.1711409091949463, "learning_rate": 4.336506245723484e-05, "loss": 0.4253, "step": 1130 }, { "epoch": 0.56, "grad_norm": 0.16075028479099274, "learning_rate": 4.328608031215539e-05, "loss": 0.4526, "step": 1131 }, { "epoch": 0.56, "grad_norm": 0.18054410815238953, "learning_rate": 4.320711522407187e-05, "loss": 0.4519, "step": 1132 }, { "epoch": 0.56, "grad_norm": 0.16165342926979065, "learning_rate": 4.312816739359848e-05, "loss": 0.4125, "step": 1133 }, { "epoch": 0.56, "grad_norm": 0.17259930074214935, "learning_rate": 4.304923702130547e-05, "loss": 0.4445, "step": 1134 }, { "epoch": 0.56, "grad_norm": 0.16946721076965332, "learning_rate": 4.29703243077188e-05, "loss": 0.4354, "step": 1135 }, { "epoch": 0.56, "grad_norm": 0.18125861883163452, "learning_rate": 4.289142945331953e-05, "loss": 0.477, "step": 1136 }, { "epoch": 0.56, "grad_norm": 0.18720857799053192, "learning_rate": 4.281255265854338e-05, "loss": 0.3933, "step": 1137 }, { "epoch": 0.56, "grad_norm": 0.14627858996391296, "learning_rate": 4.273369412378015e-05, "loss": 0.418, "step": 1138 }, { "epoch": 0.56, "grad_norm": 0.17631866037845612, "learning_rate": 4.265485404937329e-05, "loss": 0.4263, "step": 1139 }, { "epoch": 0.56, "grad_norm": 0.2008868157863617, "learning_rate": 4.257603263561932e-05, "loss": 0.4554, "step": 1140 }, { "epoch": 0.56, "grad_norm": 0.19984133541584015, "learning_rate": 4.249723008276737e-05, "loss": 0.4642, "step": 1141 }, { "epoch": 0.56, "grad_norm": 0.1811922788619995, "learning_rate": 4.241844659101865e-05, "loss": 0.4415, "step": 1142 }, { "epoch": 0.56, "grad_norm": 0.16772067546844482, "learning_rate": 4.2339682360525935e-05, "loss": 0.4356, "step": 1143 }, { "epoch": 0.56, "grad_norm": 0.1641172468662262, "learning_rate": 4.2260937591393066e-05, "loss": 0.4211, "step": 1144 }, { "epoch": 0.56, "grad_norm": 0.1808784306049347, "learning_rate": 4.218221248367448e-05, "loss": 0.4682, "step": 1145 }, { "epoch": 0.56, "grad_norm": 0.19729982316493988, "learning_rate": 4.210350723737459e-05, "loss": 0.4441, "step": 1146 }, { "epoch": 0.56, "grad_norm": 0.1638176590204239, "learning_rate": 4.2024822052447424e-05, "loss": 0.3695, "step": 1147 }, { "epoch": 0.56, "grad_norm": 0.18886907398700714, "learning_rate": 4.1946157128795985e-05, "loss": 0.4309, "step": 1148 }, { "epoch": 0.57, "grad_norm": 0.16525466740131378, "learning_rate": 4.1867512666271845e-05, "loss": 0.4, "step": 1149 }, { "epoch": 0.57, "grad_norm": 0.18296821415424347, "learning_rate": 4.1788888864674566e-05, "loss": 0.4547, "step": 1150 }, { "epoch": 0.57, "grad_norm": 0.17641517519950867, "learning_rate": 4.171028592375125e-05, "loss": 0.4198, "step": 1151 }, { "epoch": 0.57, "grad_norm": 0.17311687767505646, "learning_rate": 4.163170404319595e-05, "loss": 0.4089, "step": 1152 }, { "epoch": 0.57, "grad_norm": 0.18426398932933807, "learning_rate": 4.155314342264928e-05, "loss": 0.4387, "step": 1153 }, { "epoch": 0.57, "grad_norm": 0.18487504124641418, "learning_rate": 4.147460426169777e-05, "loss": 0.4736, "step": 1154 }, { "epoch": 0.57, "grad_norm": 0.18331630527973175, "learning_rate": 4.139608675987351e-05, "loss": 0.4166, "step": 1155 }, { "epoch": 0.57, "grad_norm": 0.19390788674354553, "learning_rate": 4.131759111665349e-05, "loss": 0.4663, "step": 1156 }, { "epoch": 0.57, "grad_norm": 0.18704354763031006, "learning_rate": 4.123911753145922e-05, "loss": 0.4472, "step": 1157 }, { "epoch": 0.57, "grad_norm": 0.18491679430007935, "learning_rate": 4.116066620365614e-05, "loss": 0.435, "step": 1158 }, { "epoch": 0.57, "grad_norm": 0.16120894253253937, "learning_rate": 4.108223733255316e-05, "loss": 0.3592, "step": 1159 }, { "epoch": 0.57, "grad_norm": 0.176873579621315, "learning_rate": 4.100383111740213e-05, "loss": 0.4218, "step": 1160 }, { "epoch": 0.57, "grad_norm": 0.1785566657781601, "learning_rate": 4.092544775739735e-05, "loss": 0.4142, "step": 1161 }, { "epoch": 0.57, "grad_norm": 0.1803351491689682, "learning_rate": 4.0847087451675035e-05, "loss": 0.4186, "step": 1162 }, { "epoch": 0.57, "grad_norm": 0.19252926111221313, "learning_rate": 4.076875039931287e-05, "loss": 0.4745, "step": 1163 }, { "epoch": 0.57, "grad_norm": 0.1582832783460617, "learning_rate": 4.069043679932942e-05, "loss": 0.4343, "step": 1164 }, { "epoch": 0.57, "grad_norm": 0.1714107096195221, "learning_rate": 4.0612146850683696e-05, "loss": 0.4461, "step": 1165 }, { "epoch": 0.57, "grad_norm": 0.16712558269500732, "learning_rate": 4.053388075227459e-05, "loss": 0.3922, "step": 1166 }, { "epoch": 0.57, "grad_norm": 0.16691477596759796, "learning_rate": 4.045563870294047e-05, "loss": 0.3903, "step": 1167 }, { "epoch": 0.57, "grad_norm": 0.1882498562335968, "learning_rate": 4.0377420901458506e-05, "loss": 0.4649, "step": 1168 }, { "epoch": 0.58, "grad_norm": 0.1542661190032959, "learning_rate": 4.0299227546544366e-05, "loss": 0.4121, "step": 1169 }, { "epoch": 0.58, "grad_norm": 0.17766988277435303, "learning_rate": 4.022105883685152e-05, "loss": 0.4399, "step": 1170 }, { "epoch": 0.58, "grad_norm": 0.1890285462141037, "learning_rate": 4.0142914970970926e-05, "loss": 0.4034, "step": 1171 }, { "epoch": 0.58, "grad_norm": 0.1727905422449112, "learning_rate": 4.0064796147430305e-05, "loss": 0.3959, "step": 1172 }, { "epoch": 0.58, "grad_norm": 0.16325156390666962, "learning_rate": 3.998670256469388e-05, "loss": 0.3838, "step": 1173 }, { "epoch": 0.58, "grad_norm": 0.20756226778030396, "learning_rate": 3.990863442116164e-05, "loss": 0.433, "step": 1174 }, { "epoch": 0.58, "grad_norm": 0.18255798518657684, "learning_rate": 3.983059191516905e-05, "loss": 0.4189, "step": 1175 }, { "epoch": 0.58, "grad_norm": 0.22535844147205353, "learning_rate": 3.975257524498631e-05, "loss": 0.4714, "step": 1176 }, { "epoch": 0.58, "grad_norm": 0.19689707458019257, "learning_rate": 3.967458460881814e-05, "loss": 0.4306, "step": 1177 }, { "epoch": 0.58, "grad_norm": 0.20398923754692078, "learning_rate": 3.959662020480297e-05, "loss": 0.4166, "step": 1178 }, { "epoch": 0.58, "grad_norm": 0.16215121746063232, "learning_rate": 3.951868223101272e-05, "loss": 0.4098, "step": 1179 }, { "epoch": 0.58, "grad_norm": 0.19960317015647888, "learning_rate": 3.9440770885452025e-05, "loss": 0.4766, "step": 1180 }, { "epoch": 0.58, "grad_norm": 0.18014560639858246, "learning_rate": 3.9362886366058025e-05, "loss": 0.4372, "step": 1181 }, { "epoch": 0.58, "grad_norm": 0.1776171177625656, "learning_rate": 3.928502887069953e-05, "loss": 0.4265, "step": 1182 }, { "epoch": 0.58, "grad_norm": 0.17940515279769897, "learning_rate": 3.9207198597176884e-05, "loss": 0.375, "step": 1183 }, { "epoch": 0.58, "grad_norm": 0.18429002165794373, "learning_rate": 3.9129395743221096e-05, "loss": 0.4224, "step": 1184 }, { "epoch": 0.58, "grad_norm": 0.16344821453094482, "learning_rate": 3.9051620506493656e-05, "loss": 0.4194, "step": 1185 }, { "epoch": 0.58, "grad_norm": 0.16867202520370483, "learning_rate": 3.89738730845858e-05, "loss": 0.4028, "step": 1186 }, { "epoch": 0.58, "grad_norm": 0.1708584427833557, "learning_rate": 3.889615367501814e-05, "loss": 0.4047, "step": 1187 }, { "epoch": 0.58, "grad_norm": 0.17418964207172394, "learning_rate": 3.8818462475240104e-05, "loss": 0.4219, "step": 1188 }, { "epoch": 0.58, "grad_norm": 0.16301923990249634, "learning_rate": 3.874079968262948e-05, "loss": 0.4485, "step": 1189 }, { "epoch": 0.59, "grad_norm": 0.17896440625190735, "learning_rate": 3.8663165494491835e-05, "loss": 0.4576, "step": 1190 }, { "epoch": 0.59, "grad_norm": 0.175236776471138, "learning_rate": 3.858556010806013e-05, "loss": 0.4137, "step": 1191 }, { "epoch": 0.59, "grad_norm": 0.191704660654068, "learning_rate": 3.850798372049409e-05, "loss": 0.4398, "step": 1192 }, { "epoch": 0.59, "grad_norm": 0.17973364889621735, "learning_rate": 3.843043652887982e-05, "loss": 0.4355, "step": 1193 }, { "epoch": 0.59, "grad_norm": 0.1684478372335434, "learning_rate": 3.835291873022922e-05, "loss": 0.4126, "step": 1194 }, { "epoch": 0.59, "grad_norm": 0.1709199845790863, "learning_rate": 3.827543052147952e-05, "loss": 0.4184, "step": 1195 }, { "epoch": 0.59, "grad_norm": 0.1854672133922577, "learning_rate": 3.819797209949279e-05, "loss": 0.4634, "step": 1196 }, { "epoch": 0.59, "grad_norm": 0.17530451714992523, "learning_rate": 3.8120543661055405e-05, "loss": 0.4349, "step": 1197 }, { "epoch": 0.59, "grad_norm": 0.16253620386123657, "learning_rate": 3.804314540287759e-05, "loss": 0.4372, "step": 1198 }, { "epoch": 0.59, "grad_norm": 0.19519898295402527, "learning_rate": 3.796577752159288e-05, "loss": 0.4686, "step": 1199 }, { "epoch": 0.59, "grad_norm": 0.17145699262619019, "learning_rate": 3.7888440213757625e-05, "loss": 0.4081, "step": 1200 }, { "epoch": 0.59, "grad_norm": 0.16318893432617188, "learning_rate": 3.781113367585053e-05, "loss": 0.3999, "step": 1201 }, { "epoch": 0.59, "grad_norm": 0.16164809465408325, "learning_rate": 3.77338581042721e-05, "loss": 0.4274, "step": 1202 }, { "epoch": 0.59, "grad_norm": 0.19859689474105835, "learning_rate": 3.7656613695344204e-05, "loss": 0.4136, "step": 1203 }, { "epoch": 0.59, "grad_norm": 0.18018724024295807, "learning_rate": 3.75794006453095e-05, "loss": 0.4595, "step": 1204 }, { "epoch": 0.59, "grad_norm": 0.1565493941307068, "learning_rate": 3.7502219150331e-05, "loss": 0.3719, "step": 1205 }, { "epoch": 0.59, "grad_norm": 0.18528230488300323, "learning_rate": 3.742506940649154e-05, "loss": 0.4594, "step": 1206 }, { "epoch": 0.59, "grad_norm": 0.19387869536876678, "learning_rate": 3.734795160979331e-05, "loss": 0.4485, "step": 1207 }, { "epoch": 0.59, "grad_norm": 0.16848109662532806, "learning_rate": 3.7270865956157307e-05, "loss": 0.4262, "step": 1208 }, { "epoch": 0.59, "grad_norm": 0.18520793318748474, "learning_rate": 3.719381264142291e-05, "loss": 0.4299, "step": 1209 }, { "epoch": 0.6, "grad_norm": 0.1625683754682541, "learning_rate": 3.7116791861347274e-05, "loss": 0.3951, "step": 1210 }, { "epoch": 0.6, "grad_norm": 0.1870507448911667, "learning_rate": 3.703980381160497e-05, "loss": 0.4186, "step": 1211 }, { "epoch": 0.6, "grad_norm": 0.18271136283874512, "learning_rate": 3.696284868778737e-05, "loss": 0.4274, "step": 1212 }, { "epoch": 0.6, "grad_norm": 0.18494969606399536, "learning_rate": 3.688592668540221e-05, "loss": 0.4078, "step": 1213 }, { "epoch": 0.6, "grad_norm": 0.19201305508613586, "learning_rate": 3.680903799987308e-05, "loss": 0.4214, "step": 1214 }, { "epoch": 0.6, "grad_norm": 0.18444941937923431, "learning_rate": 3.673218282653893e-05, "loss": 0.4081, "step": 1215 }, { "epoch": 0.6, "grad_norm": 0.1725461781024933, "learning_rate": 3.665536136065356e-05, "loss": 0.4223, "step": 1216 }, { "epoch": 0.6, "grad_norm": 0.1817905306816101, "learning_rate": 3.657857379738515e-05, "loss": 0.4102, "step": 1217 }, { "epoch": 0.6, "grad_norm": 0.16917766630649567, "learning_rate": 3.6501820331815736e-05, "loss": 0.4372, "step": 1218 }, { "epoch": 0.6, "grad_norm": 0.2047378122806549, "learning_rate": 3.6425101158940746e-05, "loss": 0.4595, "step": 1219 }, { "epoch": 0.6, "grad_norm": 0.16528359055519104, "learning_rate": 3.6348416473668444e-05, "loss": 0.4185, "step": 1220 }, { "epoch": 0.6, "grad_norm": 0.18230435252189636, "learning_rate": 3.627176647081954e-05, "loss": 0.4112, "step": 1221 }, { "epoch": 0.6, "grad_norm": 0.15391410887241364, "learning_rate": 3.619515134512656e-05, "loss": 0.3569, "step": 1222 }, { "epoch": 0.6, "grad_norm": 0.18343640863895416, "learning_rate": 3.6118571291233506e-05, "loss": 0.4652, "step": 1223 }, { "epoch": 0.6, "grad_norm": 0.17084887623786926, "learning_rate": 3.604202650369517e-05, "loss": 0.4209, "step": 1224 }, { "epoch": 0.6, "grad_norm": 0.2104005217552185, "learning_rate": 3.596551717697689e-05, "loss": 0.4609, "step": 1225 }, { "epoch": 0.6, "grad_norm": 0.1988956481218338, "learning_rate": 3.588904350545376e-05, "loss": 0.4758, "step": 1226 }, { "epoch": 0.6, "grad_norm": 0.18209640681743622, "learning_rate": 3.5812605683410424e-05, "loss": 0.4358, "step": 1227 }, { "epoch": 0.6, "grad_norm": 0.17746426165103912, "learning_rate": 3.573620390504034e-05, "loss": 0.4382, "step": 1228 }, { "epoch": 0.6, "grad_norm": 0.17043355107307434, "learning_rate": 3.5659838364445505e-05, "loss": 0.419, "step": 1229 }, { "epoch": 0.61, "grad_norm": 0.16547374427318573, "learning_rate": 3.558350925563573e-05, "loss": 0.4154, "step": 1230 }, { "epoch": 0.61, "grad_norm": 0.18820108473300934, "learning_rate": 3.550721677252839e-05, "loss": 0.44, "step": 1231 }, { "epoch": 0.61, "grad_norm": 0.18086692690849304, "learning_rate": 3.54309611089477e-05, "loss": 0.4157, "step": 1232 }, { "epoch": 0.61, "grad_norm": 0.16555728018283844, "learning_rate": 3.535474245862445e-05, "loss": 0.3924, "step": 1233 }, { "epoch": 0.61, "grad_norm": 0.17524179816246033, "learning_rate": 3.5278561015195274e-05, "loss": 0.4687, "step": 1234 }, { "epoch": 0.61, "grad_norm": 0.16479650139808655, "learning_rate": 3.520241697220239e-05, "loss": 0.4229, "step": 1235 }, { "epoch": 0.61, "grad_norm": 0.18819870054721832, "learning_rate": 3.51263105230929e-05, "loss": 0.45, "step": 1236 }, { "epoch": 0.61, "grad_norm": 0.20367495715618134, "learning_rate": 3.505024186121849e-05, "loss": 0.4405, "step": 1237 }, { "epoch": 0.61, "grad_norm": 0.16043953597545624, "learning_rate": 3.497421117983477e-05, "loss": 0.4197, "step": 1238 }, { "epoch": 0.61, "grad_norm": 0.19539308547973633, "learning_rate": 3.489821867210091e-05, "loss": 0.4478, "step": 1239 }, { "epoch": 0.61, "grad_norm": 0.1795402467250824, "learning_rate": 3.4822264531079075e-05, "loss": 0.4336, "step": 1240 }, { "epoch": 0.61, "grad_norm": 0.19555340707302094, "learning_rate": 3.4746348949733965e-05, "loss": 0.4445, "step": 1241 }, { "epoch": 0.61, "grad_norm": 0.17491547763347626, "learning_rate": 3.467047212093229e-05, "loss": 0.4162, "step": 1242 }, { "epoch": 0.61, "grad_norm": 0.1661037653684616, "learning_rate": 3.459463423744238e-05, "loss": 0.4316, "step": 1243 }, { "epoch": 0.61, "grad_norm": 0.16778497397899628, "learning_rate": 3.451883549193353e-05, "loss": 0.4673, "step": 1244 }, { "epoch": 0.61, "grad_norm": 0.1624813973903656, "learning_rate": 3.444307607697567e-05, "loss": 0.3876, "step": 1245 }, { "epoch": 0.61, "grad_norm": 0.18184678256511688, "learning_rate": 3.436735618503877e-05, "loss": 0.4317, "step": 1246 }, { "epoch": 0.61, "grad_norm": 0.157467320561409, "learning_rate": 3.4291676008492424e-05, "loss": 0.3875, "step": 1247 }, { "epoch": 0.61, "grad_norm": 0.18366743624210358, "learning_rate": 3.42160357396053e-05, "loss": 0.4545, "step": 1248 }, { "epoch": 0.61, "grad_norm": 0.16325421631336212, "learning_rate": 3.41404355705447e-05, "loss": 0.3805, "step": 1249 }, { "epoch": 0.62, "grad_norm": 0.200888991355896, "learning_rate": 3.406487569337603e-05, "loss": 0.3979, "step": 1250 }, { "epoch": 0.62, "grad_norm": 0.19040879607200623, "learning_rate": 3.398935630006236e-05, "loss": 0.4499, "step": 1251 }, { "epoch": 0.62, "grad_norm": 0.1920650452375412, "learning_rate": 3.391387758246386e-05, "loss": 0.4438, "step": 1252 }, { "epoch": 0.62, "grad_norm": 0.19058239459991455, "learning_rate": 3.3838439732337427e-05, "loss": 0.405, "step": 1253 }, { "epoch": 0.62, "grad_norm": 0.18155650794506073, "learning_rate": 3.3763042941336076e-05, "loss": 0.4203, "step": 1254 }, { "epoch": 0.62, "grad_norm": 0.20571352541446686, "learning_rate": 3.368768740100855e-05, "loss": 0.4765, "step": 1255 }, { "epoch": 0.62, "grad_norm": 0.16188322007656097, "learning_rate": 3.3612373302798764e-05, "loss": 0.3551, "step": 1256 }, { "epoch": 0.62, "grad_norm": 0.17868348956108093, "learning_rate": 3.353710083804535e-05, "loss": 0.4506, "step": 1257 }, { "epoch": 0.62, "grad_norm": 0.16932377219200134, "learning_rate": 3.34618701979812e-05, "loss": 0.3789, "step": 1258 }, { "epoch": 0.62, "grad_norm": 0.1802414208650589, "learning_rate": 3.3386681573732927e-05, "loss": 0.4335, "step": 1259 }, { "epoch": 0.62, "grad_norm": 0.19814319908618927, "learning_rate": 3.331153515632037e-05, "loss": 0.4414, "step": 1260 }, { "epoch": 0.62, "grad_norm": 0.17975422739982605, "learning_rate": 3.32364311366562e-05, "loss": 0.4437, "step": 1261 }, { "epoch": 0.62, "grad_norm": 0.1839234083890915, "learning_rate": 3.316136970554532e-05, "loss": 0.3947, "step": 1262 }, { "epoch": 0.62, "grad_norm": 0.1721470206975937, "learning_rate": 3.308635105368448e-05, "loss": 0.3749, "step": 1263 }, { "epoch": 0.62, "grad_norm": 0.1624041050672531, "learning_rate": 3.3011375371661725e-05, "loss": 0.4146, "step": 1264 }, { "epoch": 0.62, "grad_norm": 0.171004518866539, "learning_rate": 3.293644284995593e-05, "loss": 0.4127, "step": 1265 }, { "epoch": 0.62, "grad_norm": 0.18826109170913696, "learning_rate": 3.286155367893632e-05, "loss": 0.4161, "step": 1266 }, { "epoch": 0.62, "grad_norm": 0.17312119901180267, "learning_rate": 3.2786708048862e-05, "loss": 0.4063, "step": 1267 }, { "epoch": 0.62, "grad_norm": 0.18811482191085815, "learning_rate": 3.271190614988144e-05, "loss": 0.4425, "step": 1268 }, { "epoch": 0.62, "grad_norm": 0.19477587938308716, "learning_rate": 3.263714817203204e-05, "loss": 0.4435, "step": 1269 }, { "epoch": 0.62, "grad_norm": 0.19289782643318176, "learning_rate": 3.2562434305239556e-05, "loss": 0.4498, "step": 1270 }, { "epoch": 0.63, "grad_norm": 0.1867329627275467, "learning_rate": 3.248776473931774e-05, "loss": 0.4572, "step": 1271 }, { "epoch": 0.63, "grad_norm": 0.2070753425359726, "learning_rate": 3.2413139663967765e-05, "loss": 0.4315, "step": 1272 }, { "epoch": 0.63, "grad_norm": 0.19125168025493622, "learning_rate": 3.233855926877779e-05, "loss": 0.4213, "step": 1273 }, { "epoch": 0.63, "grad_norm": 0.18593250215053558, "learning_rate": 3.226402374322244e-05, "loss": 0.4056, "step": 1274 }, { "epoch": 0.63, "grad_norm": 0.1907048225402832, "learning_rate": 3.2189533276662364e-05, "loss": 0.4323, "step": 1275 }, { "epoch": 0.63, "grad_norm": 0.18620069324970245, "learning_rate": 3.2115088058343725e-05, "loss": 0.4044, "step": 1276 }, { "epoch": 0.63, "grad_norm": 0.18708837032318115, "learning_rate": 3.204068827739777e-05, "loss": 0.4044, "step": 1277 }, { "epoch": 0.63, "grad_norm": 0.22097015380859375, "learning_rate": 3.196633412284023e-05, "loss": 0.434, "step": 1278 }, { "epoch": 0.63, "grad_norm": 0.18286707997322083, "learning_rate": 3.189202578357101e-05, "loss": 0.4111, "step": 1279 }, { "epoch": 0.63, "grad_norm": 0.19943729043006897, "learning_rate": 3.181776344837355e-05, "loss": 0.4109, "step": 1280 }, { "epoch": 0.63, "grad_norm": 0.17524567246437073, "learning_rate": 3.174354730591447e-05, "loss": 0.4287, "step": 1281 }, { "epoch": 0.63, "grad_norm": 0.19784198701381683, "learning_rate": 3.1669377544742964e-05, "loss": 0.4371, "step": 1282 }, { "epoch": 0.63, "grad_norm": 0.19207602739334106, "learning_rate": 3.1595254353290496e-05, "loss": 0.4362, "step": 1283 }, { "epoch": 0.63, "grad_norm": 0.17561034858226776, "learning_rate": 3.1521177919870106e-05, "loss": 0.4027, "step": 1284 }, { "epoch": 0.63, "grad_norm": 0.19479133188724518, "learning_rate": 3.144714843267613e-05, "loss": 0.4442, "step": 1285 }, { "epoch": 0.63, "grad_norm": 0.1794152855873108, "learning_rate": 3.137316607978357e-05, "loss": 0.4471, "step": 1286 }, { "epoch": 0.63, "grad_norm": 0.20211893320083618, "learning_rate": 3.129923104914776e-05, "loss": 0.4228, "step": 1287 }, { "epoch": 0.63, "grad_norm": 0.17944130301475525, "learning_rate": 3.1225343528603696e-05, "loss": 0.4076, "step": 1288 }, { "epoch": 0.63, "grad_norm": 0.15724264085292816, "learning_rate": 3.11515037058658e-05, "loss": 0.3962, "step": 1289 }, { "epoch": 0.63, "grad_norm": 0.18198132514953613, "learning_rate": 3.107771176852721e-05, "loss": 0.4316, "step": 1290 }, { "epoch": 0.64, "grad_norm": 0.18715420365333557, "learning_rate": 3.100396790405948e-05, "loss": 0.4577, "step": 1291 }, { "epoch": 0.64, "grad_norm": 0.16927407681941986, "learning_rate": 3.0930272299811995e-05, "loss": 0.4191, "step": 1292 }, { "epoch": 0.64, "grad_norm": 0.17603392899036407, "learning_rate": 3.085662514301155e-05, "loss": 0.3946, "step": 1293 }, { "epoch": 0.64, "grad_norm": 0.18856900930404663, "learning_rate": 3.078302662076185e-05, "loss": 0.4419, "step": 1294 }, { "epoch": 0.64, "grad_norm": 0.173095703125, "learning_rate": 3.0709476920043036e-05, "loss": 0.4054, "step": 1295 }, { "epoch": 0.64, "grad_norm": 0.16694311797618866, "learning_rate": 3.063597622771122e-05, "loss": 0.4185, "step": 1296 }, { "epoch": 0.64, "grad_norm": 0.1691720187664032, "learning_rate": 3.056252473049802e-05, "loss": 0.4341, "step": 1297 }, { "epoch": 0.64, "grad_norm": 0.1720270961523056, "learning_rate": 3.0489122615010056e-05, "loss": 0.4173, "step": 1298 }, { "epoch": 0.64, "grad_norm": 0.218688502907753, "learning_rate": 3.041577006772851e-05, "loss": 0.4547, "step": 1299 }, { "epoch": 0.64, "grad_norm": 0.17873355746269226, "learning_rate": 3.0342467275008592e-05, "loss": 0.3693, "step": 1300 }, { "epoch": 0.64, "grad_norm": 0.1664518117904663, "learning_rate": 3.026921442307916e-05, "loss": 0.398, "step": 1301 }, { "epoch": 0.64, "grad_norm": 0.16966193914413452, "learning_rate": 3.019601169804216e-05, "loss": 0.4173, "step": 1302 }, { "epoch": 0.64, "grad_norm": 0.19999423623085022, "learning_rate": 3.0122859285872214e-05, "loss": 0.4462, "step": 1303 }, { "epoch": 0.64, "grad_norm": 0.17895002663135529, "learning_rate": 3.0049757372416088e-05, "loss": 0.4097, "step": 1304 }, { "epoch": 0.64, "grad_norm": 0.1781972497701645, "learning_rate": 2.9976706143392297e-05, "loss": 0.4489, "step": 1305 }, { "epoch": 0.64, "grad_norm": 0.17591695487499237, "learning_rate": 2.9903705784390544e-05, "loss": 0.4069, "step": 1306 }, { "epoch": 0.64, "grad_norm": 0.16532135009765625, "learning_rate": 2.983075648087134e-05, "loss": 0.4172, "step": 1307 }, { "epoch": 0.64, "grad_norm": 0.17733897268772125, "learning_rate": 2.975785841816545e-05, "loss": 0.4251, "step": 1308 }, { "epoch": 0.64, "grad_norm": 0.16544422507286072, "learning_rate": 2.9685011781473493e-05, "loss": 0.3824, "step": 1309 }, { "epoch": 0.64, "grad_norm": 0.2064037173986435, "learning_rate": 2.961221675586539e-05, "loss": 0.4454, "step": 1310 }, { "epoch": 0.65, "grad_norm": 0.19144149124622345, "learning_rate": 2.9539473526280005e-05, "loss": 0.4379, "step": 1311 }, { "epoch": 0.65, "grad_norm": 0.1825249046087265, "learning_rate": 2.9466782277524553e-05, "loss": 0.3801, "step": 1312 }, { "epoch": 0.65, "grad_norm": 0.18263846635818481, "learning_rate": 2.9394143194274238e-05, "loss": 0.382, "step": 1313 }, { "epoch": 0.65, "grad_norm": 0.20798428356647491, "learning_rate": 2.9321556461071692e-05, "loss": 0.4135, "step": 1314 }, { "epoch": 0.65, "grad_norm": 0.1747855246067047, "learning_rate": 2.9249022262326586e-05, "loss": 0.3823, "step": 1315 }, { "epoch": 0.65, "grad_norm": 0.20395706593990326, "learning_rate": 2.9176540782315097e-05, "loss": 0.4363, "step": 1316 }, { "epoch": 0.65, "grad_norm": 0.21141205728054047, "learning_rate": 2.9104112205179495e-05, "loss": 0.4542, "step": 1317 }, { "epoch": 0.65, "grad_norm": 0.1827576905488968, "learning_rate": 2.9031736714927605e-05, "loss": 0.42, "step": 1318 }, { "epoch": 0.65, "grad_norm": 0.17097045481204987, "learning_rate": 2.895941449543245e-05, "loss": 0.3725, "step": 1319 }, { "epoch": 0.65, "grad_norm": 0.18988263607025146, "learning_rate": 2.888714573043166e-05, "loss": 0.4356, "step": 1320 }, { "epoch": 0.65, "grad_norm": 0.14741677045822144, "learning_rate": 2.8814930603527068e-05, "loss": 0.3911, "step": 1321 }, { "epoch": 0.65, "grad_norm": 0.18934138119220734, "learning_rate": 2.8742769298184248e-05, "loss": 0.4137, "step": 1322 }, { "epoch": 0.65, "grad_norm": 0.19723200798034668, "learning_rate": 2.867066199773205e-05, "loss": 0.4221, "step": 1323 }, { "epoch": 0.65, "grad_norm": 0.19483500719070435, "learning_rate": 2.8598608885362123e-05, "loss": 0.4255, "step": 1324 }, { "epoch": 0.65, "grad_norm": 0.20747026801109314, "learning_rate": 2.852661014412841e-05, "loss": 0.3986, "step": 1325 }, { "epoch": 0.65, "grad_norm": 0.16653136909008026, "learning_rate": 2.8454665956946736e-05, "loss": 0.4133, "step": 1326 }, { "epoch": 0.65, "grad_norm": 0.17636097967624664, "learning_rate": 2.8382776506594383e-05, "loss": 0.4157, "step": 1327 }, { "epoch": 0.65, "grad_norm": 0.20152291655540466, "learning_rate": 2.831094197570951e-05, "loss": 0.4532, "step": 1328 }, { "epoch": 0.65, "grad_norm": 0.2007521092891693, "learning_rate": 2.8239162546790764e-05, "loss": 0.4215, "step": 1329 }, { "epoch": 0.65, "grad_norm": 0.2032855898141861, "learning_rate": 2.8167438402196805e-05, "loss": 0.4166, "step": 1330 }, { "epoch": 0.65, "grad_norm": 0.19417956471443176, "learning_rate": 2.809576972414587e-05, "loss": 0.4142, "step": 1331 }, { "epoch": 0.66, "grad_norm": 0.1658124029636383, "learning_rate": 2.8024156694715242e-05, "loss": 0.4357, "step": 1332 }, { "epoch": 0.66, "grad_norm": 0.1953446865081787, "learning_rate": 2.7952599495840847e-05, "loss": 0.438, "step": 1333 }, { "epoch": 0.66, "grad_norm": 0.17377813160419464, "learning_rate": 2.7881098309316735e-05, "loss": 0.4105, "step": 1334 }, { "epoch": 0.66, "grad_norm": 0.20540669560432434, "learning_rate": 2.7809653316794727e-05, "loss": 0.438, "step": 1335 }, { "epoch": 0.66, "grad_norm": 0.1718394011259079, "learning_rate": 2.773826469978382e-05, "loss": 0.4238, "step": 1336 }, { "epoch": 0.66, "grad_norm": 0.20961636304855347, "learning_rate": 2.766693263964981e-05, "loss": 0.4624, "step": 1337 }, { "epoch": 0.66, "grad_norm": 0.18207523226737976, "learning_rate": 2.7595657317614776e-05, "loss": 0.431, "step": 1338 }, { "epoch": 0.66, "grad_norm": 0.1988602876663208, "learning_rate": 2.7524438914756712e-05, "loss": 0.4244, "step": 1339 }, { "epoch": 0.66, "grad_norm": 0.18528982996940613, "learning_rate": 2.7453277612008966e-05, "loss": 0.3973, "step": 1340 }, { "epoch": 0.66, "grad_norm": 0.1695837676525116, "learning_rate": 2.738217359015981e-05, "loss": 0.3827, "step": 1341 }, { "epoch": 0.66, "grad_norm": 0.19106817245483398, "learning_rate": 2.7311127029852007e-05, "loss": 0.4131, "step": 1342 }, { "epoch": 0.66, "grad_norm": 0.1801414042711258, "learning_rate": 2.724013811158237e-05, "loss": 0.3711, "step": 1343 }, { "epoch": 0.66, "grad_norm": 0.17489413917064667, "learning_rate": 2.7169207015701216e-05, "loss": 0.4233, "step": 1344 }, { "epoch": 0.66, "grad_norm": 0.1956557184457779, "learning_rate": 2.709833392241199e-05, "loss": 0.4166, "step": 1345 }, { "epoch": 0.66, "grad_norm": 0.16916592419147491, "learning_rate": 2.702751901177074e-05, "loss": 0.4201, "step": 1346 }, { "epoch": 0.66, "grad_norm": 0.18853600323200226, "learning_rate": 2.6956762463685787e-05, "loss": 0.434, "step": 1347 }, { "epoch": 0.66, "grad_norm": 0.1811629980802536, "learning_rate": 2.6886064457917092e-05, "loss": 0.419, "step": 1348 }, { "epoch": 0.66, "grad_norm": 0.20379959046840668, "learning_rate": 2.6815425174075936e-05, "loss": 0.4232, "step": 1349 }, { "epoch": 0.66, "grad_norm": 0.18091918528079987, "learning_rate": 2.6744844791624357e-05, "loss": 0.3816, "step": 1350 }, { "epoch": 0.66, "grad_norm": 0.1842348575592041, "learning_rate": 2.6674323489874843e-05, "loss": 0.4252, "step": 1351 }, { "epoch": 0.67, "grad_norm": 0.18005922436714172, "learning_rate": 2.6603861447989703e-05, "loss": 0.414, "step": 1352 }, { "epoch": 0.67, "grad_norm": 0.18748770654201508, "learning_rate": 2.6533458844980737e-05, "loss": 0.4105, "step": 1353 }, { "epoch": 0.67, "grad_norm": 0.19216570258140564, "learning_rate": 2.64631158597087e-05, "loss": 0.4255, "step": 1354 }, { "epoch": 0.67, "grad_norm": 0.17764945328235626, "learning_rate": 2.639283267088295e-05, "loss": 0.4641, "step": 1355 }, { "epoch": 0.67, "grad_norm": 0.19968628883361816, "learning_rate": 2.6322609457060854e-05, "loss": 0.4305, "step": 1356 }, { "epoch": 0.67, "grad_norm": 0.16937702894210815, "learning_rate": 2.62524463966475e-05, "loss": 0.3906, "step": 1357 }, { "epoch": 0.67, "grad_norm": 0.1761816143989563, "learning_rate": 2.6182343667895036e-05, "loss": 0.3933, "step": 1358 }, { "epoch": 0.67, "grad_norm": 0.20705078542232513, "learning_rate": 2.6112301448902444e-05, "loss": 0.4201, "step": 1359 }, { "epoch": 0.67, "grad_norm": 0.18533958494663239, "learning_rate": 2.60423199176149e-05, "loss": 0.392, "step": 1360 }, { "epoch": 0.67, "grad_norm": 0.17876093089580536, "learning_rate": 2.5972399251823488e-05, "loss": 0.4185, "step": 1361 }, { "epoch": 0.67, "grad_norm": 0.17652106285095215, "learning_rate": 2.5902539629164528e-05, "loss": 0.4319, "step": 1362 }, { "epoch": 0.67, "grad_norm": 0.18993395566940308, "learning_rate": 2.5832741227119406e-05, "loss": 0.4257, "step": 1363 }, { "epoch": 0.67, "grad_norm": 0.17801856994628906, "learning_rate": 2.5763004223013853e-05, "loss": 0.4036, "step": 1364 }, { "epoch": 0.67, "grad_norm": 0.17437198758125305, "learning_rate": 2.5693328794017735e-05, "loss": 0.4232, "step": 1365 }, { "epoch": 0.67, "grad_norm": 0.18640847504138947, "learning_rate": 2.5623715117144336e-05, "loss": 0.4388, "step": 1366 }, { "epoch": 0.67, "grad_norm": 0.1812058389186859, "learning_rate": 2.5554163369250195e-05, "loss": 0.4357, "step": 1367 }, { "epoch": 0.67, "grad_norm": 0.1772056221961975, "learning_rate": 2.5484673727034413e-05, "loss": 0.4278, "step": 1368 }, { "epoch": 0.67, "grad_norm": 0.19270136952400208, "learning_rate": 2.5415246367038403e-05, "loss": 0.3965, "step": 1369 }, { "epoch": 0.67, "grad_norm": 0.19726654887199402, "learning_rate": 2.5345881465645272e-05, "loss": 0.4111, "step": 1370 }, { "epoch": 0.67, "grad_norm": 0.19174014031887054, "learning_rate": 2.5276579199079486e-05, "loss": 0.4532, "step": 1371 }, { "epoch": 0.68, "grad_norm": 0.19043917953968048, "learning_rate": 2.5207339743406345e-05, "loss": 0.4063, "step": 1372 }, { "epoch": 0.68, "grad_norm": 0.17165736854076385, "learning_rate": 2.5138163274531657e-05, "loss": 0.4245, "step": 1373 }, { "epoch": 0.68, "grad_norm": 0.1753571480512619, "learning_rate": 2.506904996820112e-05, "loss": 0.4222, "step": 1374 }, { "epoch": 0.68, "grad_norm": 0.17958104610443115, "learning_rate": 2.500000000000001e-05, "loss": 0.4005, "step": 1375 }, { "epoch": 0.68, "grad_norm": 0.1918800324201584, "learning_rate": 2.4931013545352667e-05, "loss": 0.435, "step": 1376 }, { "epoch": 0.68, "grad_norm": 0.19200588762760162, "learning_rate": 2.4862090779522117e-05, "loss": 0.4536, "step": 1377 }, { "epoch": 0.68, "grad_norm": 0.18893642723560333, "learning_rate": 2.4793231877609546e-05, "loss": 0.3967, "step": 1378 }, { "epoch": 0.68, "grad_norm": 0.21066032350063324, "learning_rate": 2.472443701455388e-05, "loss": 0.4657, "step": 1379 }, { "epoch": 0.68, "grad_norm": 0.1749165952205658, "learning_rate": 2.4655706365131358e-05, "loss": 0.3818, "step": 1380 }, { "epoch": 0.68, "grad_norm": 0.1712990552186966, "learning_rate": 2.4587040103955134e-05, "loss": 0.4419, "step": 1381 }, { "epoch": 0.68, "grad_norm": 0.1927119791507721, "learning_rate": 2.451843840547471e-05, "loss": 0.4042, "step": 1382 }, { "epoch": 0.68, "grad_norm": 0.20715966820716858, "learning_rate": 2.44499014439756e-05, "loss": 0.4056, "step": 1383 }, { "epoch": 0.68, "grad_norm": 0.17762576043605804, "learning_rate": 2.438142939357882e-05, "loss": 0.3857, "step": 1384 }, { "epoch": 0.68, "grad_norm": 0.19470296800136566, "learning_rate": 2.4313022428240545e-05, "loss": 0.4079, "step": 1385 }, { "epoch": 0.68, "grad_norm": 0.1810818612575531, "learning_rate": 2.4244680721751523e-05, "loss": 0.4395, "step": 1386 }, { "epoch": 0.68, "grad_norm": 0.17147907614707947, "learning_rate": 2.4176404447736756e-05, "loss": 0.4228, "step": 1387 }, { "epoch": 0.68, "grad_norm": 0.18902720510959625, "learning_rate": 2.4108193779654964e-05, "loss": 0.3888, "step": 1388 }, { "epoch": 0.68, "grad_norm": 0.19417132437229156, "learning_rate": 2.4040048890798267e-05, "loss": 0.3887, "step": 1389 }, { "epoch": 0.68, "grad_norm": 0.17712099850177765, "learning_rate": 2.3971969954291607e-05, "loss": 0.376, "step": 1390 }, { "epoch": 0.68, "grad_norm": 0.19583719968795776, "learning_rate": 2.39039571430924e-05, "loss": 0.3996, "step": 1391 }, { "epoch": 0.68, "grad_norm": 0.2079552561044693, "learning_rate": 2.3836010629990025e-05, "loss": 0.4679, "step": 1392 }, { "epoch": 0.69, "grad_norm": 0.23367395997047424, "learning_rate": 2.3768130587605515e-05, "loss": 0.4435, "step": 1393 }, { "epoch": 0.69, "grad_norm": 0.1940685510635376, "learning_rate": 2.3700317188390953e-05, "loss": 0.4231, "step": 1394 }, { "epoch": 0.69, "grad_norm": 0.18245066702365875, "learning_rate": 2.363257060462914e-05, "loss": 0.4029, "step": 1395 }, { "epoch": 0.69, "grad_norm": 0.19449882209300995, "learning_rate": 2.3564891008433105e-05, "loss": 0.4145, "step": 1396 }, { "epoch": 0.69, "grad_norm": 0.2096615880727768, "learning_rate": 2.349727857174576e-05, "loss": 0.4311, "step": 1397 }, { "epoch": 0.69, "grad_norm": 0.19228814542293549, "learning_rate": 2.3429733466339326e-05, "loss": 0.3962, "step": 1398 }, { "epoch": 0.69, "grad_norm": 0.18391267955303192, "learning_rate": 2.3362255863814993e-05, "loss": 0.4307, "step": 1399 }, { "epoch": 0.69, "grad_norm": 0.19370320439338684, "learning_rate": 2.329484593560243e-05, "loss": 0.3929, "step": 1400 }, { "epoch": 0.69, "grad_norm": 0.18368737399578094, "learning_rate": 2.3227503852959453e-05, "loss": 0.4321, "step": 1401 }, { "epoch": 0.69, "grad_norm": 0.18322281539440155, "learning_rate": 2.3160229786971428e-05, "loss": 0.433, "step": 1402 }, { "epoch": 0.69, "grad_norm": 0.17966963350772858, "learning_rate": 2.3093023908550958e-05, "loss": 0.4004, "step": 1403 }, { "epoch": 0.69, "grad_norm": 0.19327202439308167, "learning_rate": 2.3025886388437394e-05, "loss": 0.4301, "step": 1404 }, { "epoch": 0.69, "grad_norm": 0.19294461607933044, "learning_rate": 2.2958817397196474e-05, "loss": 0.4145, "step": 1405 }, { "epoch": 0.69, "grad_norm": 0.17913855612277985, "learning_rate": 2.289181710521976e-05, "loss": 0.3977, "step": 1406 }, { "epoch": 0.69, "grad_norm": 0.205645352602005, "learning_rate": 2.2824885682724366e-05, "loss": 0.4359, "step": 1407 }, { "epoch": 0.69, "grad_norm": 0.16316623985767365, "learning_rate": 2.275802329975233e-05, "loss": 0.399, "step": 1408 }, { "epoch": 0.69, "grad_norm": 0.1836337000131607, "learning_rate": 2.2691230126170405e-05, "loss": 0.445, "step": 1409 }, { "epoch": 0.69, "grad_norm": 0.18437521159648895, "learning_rate": 2.262450633166942e-05, "loss": 0.4116, "step": 1410 }, { "epoch": 0.69, "grad_norm": 0.18072322010993958, "learning_rate": 2.2557852085764053e-05, "loss": 0.4627, "step": 1411 }, { "epoch": 0.69, "grad_norm": 0.17010323703289032, "learning_rate": 2.249126755779215e-05, "loss": 0.4128, "step": 1412 }, { "epoch": 0.7, "grad_norm": 0.18738055229187012, "learning_rate": 2.242475291691456e-05, "loss": 0.3972, "step": 1413 }, { "epoch": 0.7, "grad_norm": 0.18066906929016113, "learning_rate": 2.2358308332114498e-05, "loss": 0.4311, "step": 1414 }, { "epoch": 0.7, "grad_norm": 0.22322335839271545, "learning_rate": 2.2291933972197287e-05, "loss": 0.4619, "step": 1415 }, { "epoch": 0.7, "grad_norm": 0.19198822975158691, "learning_rate": 2.2225630005789716e-05, "loss": 0.4402, "step": 1416 }, { "epoch": 0.7, "grad_norm": 0.20767426490783691, "learning_rate": 2.215939660133986e-05, "loss": 0.4333, "step": 1417 }, { "epoch": 0.7, "grad_norm": 0.18728384375572205, "learning_rate": 2.2093233927116437e-05, "loss": 0.3812, "step": 1418 }, { "epoch": 0.7, "grad_norm": 0.18642625212669373, "learning_rate": 2.2027142151208564e-05, "loss": 0.4457, "step": 1419 }, { "epoch": 0.7, "grad_norm": 0.17548121511936188, "learning_rate": 2.1961121441525112e-05, "loss": 0.4241, "step": 1420 }, { "epoch": 0.7, "grad_norm": 0.21922212839126587, "learning_rate": 2.189517196579453e-05, "loss": 0.4434, "step": 1421 }, { "epoch": 0.7, "grad_norm": 0.19005966186523438, "learning_rate": 2.1829293891564212e-05, "loss": 0.4545, "step": 1422 }, { "epoch": 0.7, "grad_norm": 0.2232292741537094, "learning_rate": 2.1763487386200204e-05, "loss": 0.4359, "step": 1423 }, { "epoch": 0.7, "grad_norm": 0.20814676582813263, "learning_rate": 2.1697752616886702e-05, "loss": 0.4127, "step": 1424 }, { "epoch": 0.7, "grad_norm": 0.22278355062007904, "learning_rate": 2.1632089750625652e-05, "loss": 0.4294, "step": 1425 }, { "epoch": 0.7, "grad_norm": 0.18155193328857422, "learning_rate": 2.1566498954236312e-05, "loss": 0.4215, "step": 1426 }, { "epoch": 0.7, "grad_norm": 0.21246926486492157, "learning_rate": 2.150098039435491e-05, "loss": 0.4657, "step": 1427 }, { "epoch": 0.7, "grad_norm": 0.2263745814561844, "learning_rate": 2.1435534237434084e-05, "loss": 0.4449, "step": 1428 }, { "epoch": 0.7, "grad_norm": 0.1953396499156952, "learning_rate": 2.137016064974256e-05, "loss": 0.4067, "step": 1429 }, { "epoch": 0.7, "grad_norm": 0.2048516720533371, "learning_rate": 2.1304859797364674e-05, "loss": 0.394, "step": 1430 }, { "epoch": 0.7, "grad_norm": 0.190232515335083, "learning_rate": 2.1239631846200026e-05, "loss": 0.4274, "step": 1431 }, { "epoch": 0.7, "grad_norm": 0.18894876539707184, "learning_rate": 2.1174476961962958e-05, "loss": 0.4249, "step": 1432 }, { "epoch": 0.71, "grad_norm": 0.18762405216693878, "learning_rate": 2.11093953101822e-05, "loss": 0.425, "step": 1433 }, { "epoch": 0.71, "grad_norm": 0.1875537484884262, "learning_rate": 2.104438705620042e-05, "loss": 0.4045, "step": 1434 }, { "epoch": 0.71, "grad_norm": 0.19858813285827637, "learning_rate": 2.097945236517385e-05, "loss": 0.4064, "step": 1435 }, { "epoch": 0.71, "grad_norm": 0.1966381072998047, "learning_rate": 2.091459140207181e-05, "loss": 0.4349, "step": 1436 }, { "epoch": 0.71, "grad_norm": 0.19883474707603455, "learning_rate": 2.08498043316763e-05, "loss": 0.3972, "step": 1437 }, { "epoch": 0.71, "grad_norm": 0.19201971590518951, "learning_rate": 2.0785091318581577e-05, "loss": 0.3862, "step": 1438 }, { "epoch": 0.71, "grad_norm": 0.17760716378688812, "learning_rate": 2.072045252719383e-05, "loss": 0.3512, "step": 1439 }, { "epoch": 0.71, "grad_norm": 0.18705005943775177, "learning_rate": 2.0655888121730605e-05, "loss": 0.3795, "step": 1440 }, { "epoch": 0.71, "grad_norm": 0.18435943126678467, "learning_rate": 2.0591398266220502e-05, "loss": 0.4065, "step": 1441 }, { "epoch": 0.71, "grad_norm": 0.1907874047756195, "learning_rate": 2.052698312450269e-05, "loss": 0.4257, "step": 1442 }, { "epoch": 0.71, "grad_norm": 0.18680541217327118, "learning_rate": 2.04626428602266e-05, "loss": 0.423, "step": 1443 }, { "epoch": 0.71, "grad_norm": 0.1979174166917801, "learning_rate": 2.0398377636851362e-05, "loss": 0.4206, "step": 1444 }, { "epoch": 0.71, "grad_norm": 0.21498000621795654, "learning_rate": 2.0334187617645485e-05, "loss": 0.4162, "step": 1445 }, { "epoch": 0.71, "grad_norm": 0.22416259348392487, "learning_rate": 2.0270072965686394e-05, "loss": 0.3952, "step": 1446 }, { "epoch": 0.71, "grad_norm": 0.1878574788570404, "learning_rate": 2.020603384386011e-05, "loss": 0.3742, "step": 1447 }, { "epoch": 0.71, "grad_norm": 0.18884602189064026, "learning_rate": 2.0142070414860704e-05, "loss": 0.4287, "step": 1448 }, { "epoch": 0.71, "grad_norm": 0.19438043236732483, "learning_rate": 2.0078182841189952e-05, "loss": 0.4188, "step": 1449 }, { "epoch": 0.71, "grad_norm": 0.2290455400943756, "learning_rate": 2.0014371285156918e-05, "loss": 0.4036, "step": 1450 }, { "epoch": 0.71, "grad_norm": 0.17741759121418, "learning_rate": 1.995063590887759e-05, "loss": 0.4285, "step": 1451 }, { "epoch": 0.71, "grad_norm": 0.1831721067428589, "learning_rate": 1.9886976874274356e-05, "loss": 0.4355, "step": 1452 }, { "epoch": 0.71, "grad_norm": 0.18776841461658478, "learning_rate": 1.982339434307567e-05, "loss": 0.4095, "step": 1453 }, { "epoch": 0.72, "grad_norm": 0.16547493636608124, "learning_rate": 1.9759888476815623e-05, "loss": 0.4165, "step": 1454 }, { "epoch": 0.72, "grad_norm": 0.1630392074584961, "learning_rate": 1.969645943683358e-05, "loss": 0.3837, "step": 1455 }, { "epoch": 0.72, "grad_norm": 0.19616207480430603, "learning_rate": 1.963310738427367e-05, "loss": 0.416, "step": 1456 }, { "epoch": 0.72, "grad_norm": 0.1911313235759735, "learning_rate": 1.9569832480084455e-05, "loss": 0.423, "step": 1457 }, { "epoch": 0.72, "grad_norm": 0.20524878799915314, "learning_rate": 1.9506634885018475e-05, "loss": 0.4375, "step": 1458 }, { "epoch": 0.72, "grad_norm": 0.1963445395231247, "learning_rate": 1.944351475963192e-05, "loss": 0.4109, "step": 1459 }, { "epoch": 0.72, "grad_norm": 0.21990934014320374, "learning_rate": 1.93804722642841e-05, "loss": 0.459, "step": 1460 }, { "epoch": 0.72, "grad_norm": 0.19196054339408875, "learning_rate": 1.9317507559137184e-05, "loss": 0.4425, "step": 1461 }, { "epoch": 0.72, "grad_norm": 0.18231140077114105, "learning_rate": 1.9254620804155575e-05, "loss": 0.4195, "step": 1462 }, { "epoch": 0.72, "grad_norm": 0.20360532402992249, "learning_rate": 1.91918121591058e-05, "loss": 0.4785, "step": 1463 }, { "epoch": 0.72, "grad_norm": 0.18171778321266174, "learning_rate": 1.9129081783555815e-05, "loss": 0.4001, "step": 1464 }, { "epoch": 0.72, "grad_norm": 0.19582213461399078, "learning_rate": 1.9066429836874844e-05, "loss": 0.4034, "step": 1465 }, { "epoch": 0.72, "grad_norm": 0.1977054327726364, "learning_rate": 1.9003856478232728e-05, "loss": 0.3996, "step": 1466 }, { "epoch": 0.72, "grad_norm": 0.21055199205875397, "learning_rate": 1.8941361866599776e-05, "loss": 0.422, "step": 1467 }, { "epoch": 0.72, "grad_norm": 0.1965697854757309, "learning_rate": 1.8878946160746147e-05, "loss": 0.44, "step": 1468 }, { "epoch": 0.72, "grad_norm": 0.19527587294578552, "learning_rate": 1.881660951924162e-05, "loss": 0.4547, "step": 1469 }, { "epoch": 0.72, "grad_norm": 0.19270171225070953, "learning_rate": 1.8754352100454997e-05, "loss": 0.3911, "step": 1470 }, { "epoch": 0.72, "grad_norm": 0.19116497039794922, "learning_rate": 1.8692174062553924e-05, "loss": 0.4314, "step": 1471 }, { "epoch": 0.72, "grad_norm": 0.19386743009090424, "learning_rate": 1.8630075563504296e-05, "loss": 0.4582, "step": 1472 }, { "epoch": 0.72, "grad_norm": 0.17313340306282043, "learning_rate": 1.8568056761069995e-05, "loss": 0.4138, "step": 1473 }, { "epoch": 0.73, "grad_norm": 0.18318858742713928, "learning_rate": 1.850611781281239e-05, "loss": 0.3887, "step": 1474 }, { "epoch": 0.73, "grad_norm": 0.1934928297996521, "learning_rate": 1.8444258876089993e-05, "loss": 0.4121, "step": 1475 }, { "epoch": 0.73, "grad_norm": 0.18863897025585175, "learning_rate": 1.8382480108058013e-05, "loss": 0.3878, "step": 1476 }, { "epoch": 0.73, "grad_norm": 0.20230375230312347, "learning_rate": 1.8320781665668062e-05, "loss": 0.4053, "step": 1477 }, { "epoch": 0.73, "grad_norm": 0.21132275462150574, "learning_rate": 1.825916370566761e-05, "loss": 0.4247, "step": 1478 }, { "epoch": 0.73, "grad_norm": 0.17700020968914032, "learning_rate": 1.8197626384599675e-05, "loss": 0.4054, "step": 1479 }, { "epoch": 0.73, "grad_norm": 0.19625122845172882, "learning_rate": 1.8136169858802405e-05, "loss": 0.3816, "step": 1480 }, { "epoch": 0.73, "grad_norm": 0.16137881577014923, "learning_rate": 1.807479428440873e-05, "loss": 0.4114, "step": 1481 }, { "epoch": 0.73, "grad_norm": 0.19946132600307465, "learning_rate": 1.8013499817345862e-05, "loss": 0.4255, "step": 1482 }, { "epoch": 0.73, "grad_norm": 0.19783736765384674, "learning_rate": 1.7952286613334984e-05, "loss": 0.4113, "step": 1483 }, { "epoch": 0.73, "grad_norm": 0.18271303176879883, "learning_rate": 1.78911548278908e-05, "loss": 0.4388, "step": 1484 }, { "epoch": 0.73, "grad_norm": 0.18126699328422546, "learning_rate": 1.7830104616321218e-05, "loss": 0.4476, "step": 1485 }, { "epoch": 0.73, "grad_norm": 0.2011980414390564, "learning_rate": 1.7769136133726866e-05, "loss": 0.4298, "step": 1486 }, { "epoch": 0.73, "grad_norm": 0.21266260743141174, "learning_rate": 1.7708249535000736e-05, "loss": 0.4063, "step": 1487 }, { "epoch": 0.73, "grad_norm": 0.18916939198970795, "learning_rate": 1.7647444974827783e-05, "loss": 0.4035, "step": 1488 }, { "epoch": 0.73, "grad_norm": 0.20462746918201447, "learning_rate": 1.758672260768459e-05, "loss": 0.4327, "step": 1489 }, { "epoch": 0.73, "grad_norm": 0.1832674890756607, "learning_rate": 1.7526082587838875e-05, "loss": 0.4211, "step": 1490 }, { "epoch": 0.73, "grad_norm": 0.20940318703651428, "learning_rate": 1.746552506934917e-05, "loss": 0.4145, "step": 1491 }, { "epoch": 0.73, "grad_norm": 0.17925870418548584, "learning_rate": 1.7405050206064373e-05, "loss": 0.3675, "step": 1492 }, { "epoch": 0.73, "grad_norm": 0.20548883080482483, "learning_rate": 1.7344658151623467e-05, "loss": 0.438, "step": 1493 }, { "epoch": 0.74, "grad_norm": 0.20585589110851288, "learning_rate": 1.7284349059454985e-05, "loss": 0.4254, "step": 1494 }, { "epoch": 0.74, "grad_norm": 0.22267809510231018, "learning_rate": 1.722412308277673e-05, "loss": 0.4433, "step": 1495 }, { "epoch": 0.74, "grad_norm": 0.1930231750011444, "learning_rate": 1.7163980374595312e-05, "loss": 0.4156, "step": 1496 }, { "epoch": 0.74, "grad_norm": 0.20308440923690796, "learning_rate": 1.710392108770585e-05, "loss": 0.4085, "step": 1497 }, { "epoch": 0.74, "grad_norm": 0.1920638382434845, "learning_rate": 1.7043945374691473e-05, "loss": 0.4407, "step": 1498 }, { "epoch": 0.74, "grad_norm": 0.17781995236873627, "learning_rate": 1.6984053387923014e-05, "loss": 0.3879, "step": 1499 }, { "epoch": 0.74, "grad_norm": 0.20482394099235535, "learning_rate": 1.6924245279558575e-05, "loss": 0.4045, "step": 1500 }, { "epoch": 0.74, "grad_norm": 0.19784508645534515, "learning_rate": 1.6864521201543197e-05, "loss": 0.4121, "step": 1501 }, { "epoch": 0.74, "grad_norm": 0.21278789639472961, "learning_rate": 1.680488130560842e-05, "loss": 0.428, "step": 1502 }, { "epoch": 0.74, "grad_norm": 0.20508059859275818, "learning_rate": 1.67453257432719e-05, "loss": 0.4595, "step": 1503 }, { "epoch": 0.74, "grad_norm": 0.22627341747283936, "learning_rate": 1.6685854665837038e-05, "loss": 0.4698, "step": 1504 }, { "epoch": 0.74, "grad_norm": 0.19423756003379822, "learning_rate": 1.6626468224392648e-05, "loss": 0.3916, "step": 1505 }, { "epoch": 0.74, "grad_norm": 0.18366330862045288, "learning_rate": 1.6567166569812477e-05, "loss": 0.4155, "step": 1506 }, { "epoch": 0.74, "grad_norm": 0.20893968641757965, "learning_rate": 1.6507949852754867e-05, "loss": 0.4443, "step": 1507 }, { "epoch": 0.74, "grad_norm": 0.2163964807987213, "learning_rate": 1.644881822366237e-05, "loss": 0.4276, "step": 1508 }, { "epoch": 0.74, "grad_norm": 0.22802579402923584, "learning_rate": 1.6389771832761426e-05, "loss": 0.4479, "step": 1509 }, { "epoch": 0.74, "grad_norm": 0.21351584792137146, "learning_rate": 1.6330810830061833e-05, "loss": 0.4155, "step": 1510 }, { "epoch": 0.74, "grad_norm": 0.19640874862670898, "learning_rate": 1.627193536535656e-05, "loss": 0.4411, "step": 1511 }, { "epoch": 0.74, "grad_norm": 0.1958571821451187, "learning_rate": 1.6213145588221145e-05, "loss": 0.422, "step": 1512 }, { "epoch": 0.74, "grad_norm": 0.19867700338363647, "learning_rate": 1.6154441648013535e-05, "loss": 0.4141, "step": 1513 }, { "epoch": 0.74, "grad_norm": 0.18235549330711365, "learning_rate": 1.6095823693873536e-05, "loss": 0.3897, "step": 1514 }, { "epoch": 0.75, "grad_norm": 0.20947769284248352, "learning_rate": 1.603729187472258e-05, "loss": 0.4342, "step": 1515 }, { "epoch": 0.75, "grad_norm": 0.19586513936519623, "learning_rate": 1.597884633926316e-05, "loss": 0.4069, "step": 1516 }, { "epoch": 0.75, "grad_norm": 0.18264049291610718, "learning_rate": 1.592048723597866e-05, "loss": 0.4273, "step": 1517 }, { "epoch": 0.75, "grad_norm": 0.1969510167837143, "learning_rate": 1.5862214713132824e-05, "loss": 0.3994, "step": 1518 }, { "epoch": 0.75, "grad_norm": 0.20635266602039337, "learning_rate": 1.5804028918769485e-05, "loss": 0.461, "step": 1519 }, { "epoch": 0.75, "grad_norm": 0.18621830642223358, "learning_rate": 1.5745930000712056e-05, "loss": 0.4146, "step": 1520 }, { "epoch": 0.75, "grad_norm": 0.1895444393157959, "learning_rate": 1.5687918106563326e-05, "loss": 0.4099, "step": 1521 }, { "epoch": 0.75, "grad_norm": 0.1961425095796585, "learning_rate": 1.5629993383704934e-05, "loss": 0.4297, "step": 1522 }, { "epoch": 0.75, "grad_norm": 0.20220254361629486, "learning_rate": 1.557215597929712e-05, "loss": 0.3991, "step": 1523 }, { "epoch": 0.75, "grad_norm": 0.21826310455799103, "learning_rate": 1.5514406040278183e-05, "loss": 0.4172, "step": 1524 }, { "epoch": 0.75, "grad_norm": 0.17745064198970795, "learning_rate": 1.5456743713364336e-05, "loss": 0.3877, "step": 1525 }, { "epoch": 0.75, "grad_norm": 0.18183733522891998, "learning_rate": 1.539916914504911e-05, "loss": 0.4096, "step": 1526 }, { "epoch": 0.75, "grad_norm": 0.21207475662231445, "learning_rate": 1.5341682481603154e-05, "loss": 0.4136, "step": 1527 }, { "epoch": 0.75, "grad_norm": 0.20768669247627258, "learning_rate": 1.528428386907375e-05, "loss": 0.4179, "step": 1528 }, { "epoch": 0.75, "grad_norm": 0.21358920633792877, "learning_rate": 1.5226973453284487e-05, "loss": 0.422, "step": 1529 }, { "epoch": 0.75, "grad_norm": 0.211592897772789, "learning_rate": 1.5169751379834879e-05, "loss": 0.4482, "step": 1530 }, { "epoch": 0.75, "grad_norm": 0.2099764049053192, "learning_rate": 1.5112617794100047e-05, "loss": 0.3908, "step": 1531 }, { "epoch": 0.75, "grad_norm": 0.194295272231102, "learning_rate": 1.5055572841230254e-05, "loss": 0.4107, "step": 1532 }, { "epoch": 0.75, "grad_norm": 0.19561241567134857, "learning_rate": 1.4998616666150616e-05, "loss": 0.4225, "step": 1533 }, { "epoch": 0.75, "grad_norm": 0.2192692905664444, "learning_rate": 1.4941749413560674e-05, "loss": 0.4055, "step": 1534 }, { "epoch": 0.76, "grad_norm": 0.19740304350852966, "learning_rate": 1.4884971227934108e-05, "loss": 0.4321, "step": 1535 }, { "epoch": 0.76, "grad_norm": 0.21323628723621368, "learning_rate": 1.4828282253518288e-05, "loss": 0.4527, "step": 1536 }, { "epoch": 0.76, "grad_norm": 0.19307342171669006, "learning_rate": 1.4771682634333933e-05, "loss": 0.3876, "step": 1537 }, { "epoch": 0.76, "grad_norm": 0.22197125852108002, "learning_rate": 1.4715172514174746e-05, "loss": 0.4723, "step": 1538 }, { "epoch": 0.76, "grad_norm": 0.19538268446922302, "learning_rate": 1.465875203660711e-05, "loss": 0.3936, "step": 1539 }, { "epoch": 0.76, "grad_norm": 0.22010764479637146, "learning_rate": 1.4602421344969603e-05, "loss": 0.4184, "step": 1540 }, { "epoch": 0.76, "grad_norm": 0.19435614347457886, "learning_rate": 1.454618058237272e-05, "loss": 0.4038, "step": 1541 }, { "epoch": 0.76, "grad_norm": 0.20033884048461914, "learning_rate": 1.4490029891698476e-05, "loss": 0.3895, "step": 1542 }, { "epoch": 0.76, "grad_norm": 0.2040463387966156, "learning_rate": 1.4433969415600102e-05, "loss": 0.4542, "step": 1543 }, { "epoch": 0.76, "grad_norm": 0.23028813302516937, "learning_rate": 1.4377999296501582e-05, "loss": 0.4216, "step": 1544 }, { "epoch": 0.76, "grad_norm": 0.20009548962116241, "learning_rate": 1.4322119676597357e-05, "loss": 0.4555, "step": 1545 }, { "epoch": 0.76, "grad_norm": 0.18588322401046753, "learning_rate": 1.4266330697851954e-05, "loss": 0.4124, "step": 1546 }, { "epoch": 0.76, "grad_norm": 0.18526224792003632, "learning_rate": 1.4210632501999643e-05, "loss": 0.4463, "step": 1547 }, { "epoch": 0.76, "grad_norm": 0.1698131561279297, "learning_rate": 1.4155025230544033e-05, "loss": 0.3849, "step": 1548 }, { "epoch": 0.76, "grad_norm": 0.2071213722229004, "learning_rate": 1.4099509024757734e-05, "loss": 0.4381, "step": 1549 }, { "epoch": 0.76, "grad_norm": 0.20984041690826416, "learning_rate": 1.4044084025681998e-05, "loss": 0.3934, "step": 1550 }, { "epoch": 0.76, "grad_norm": 0.22569172084331512, "learning_rate": 1.3988750374126396e-05, "loss": 0.4046, "step": 1551 }, { "epoch": 0.76, "grad_norm": 0.18658342957496643, "learning_rate": 1.3933508210668388e-05, "loss": 0.3739, "step": 1552 }, { "epoch": 0.76, "grad_norm": 0.22320377826690674, "learning_rate": 1.3878357675653025e-05, "loss": 0.4141, "step": 1553 }, { "epoch": 0.76, "grad_norm": 0.24126341938972473, "learning_rate": 1.3823298909192545e-05, "loss": 0.418, "step": 1554 }, { "epoch": 0.77, "grad_norm": 0.18668067455291748, "learning_rate": 1.3768332051166088e-05, "loss": 0.4109, "step": 1555 }, { "epoch": 0.77, "grad_norm": 0.19640204310417175, "learning_rate": 1.3713457241219273e-05, "loss": 0.4154, "step": 1556 }, { "epoch": 0.77, "grad_norm": 0.1853923350572586, "learning_rate": 1.3658674618763862e-05, "loss": 0.3991, "step": 1557 }, { "epoch": 0.77, "grad_norm": 0.19131822884082794, "learning_rate": 1.3603984322977403e-05, "loss": 0.4526, "step": 1558 }, { "epoch": 0.77, "grad_norm": 0.22523625195026398, "learning_rate": 1.3549386492802923e-05, "loss": 0.4051, "step": 1559 }, { "epoch": 0.77, "grad_norm": 0.1803130954504013, "learning_rate": 1.3494881266948495e-05, "loss": 0.3987, "step": 1560 }, { "epoch": 0.77, "grad_norm": 0.21042081713676453, "learning_rate": 1.344046878388695e-05, "loss": 0.3962, "step": 1561 }, { "epoch": 0.77, "grad_norm": 0.18720225989818573, "learning_rate": 1.3386149181855478e-05, "loss": 0.4313, "step": 1562 }, { "epoch": 0.77, "grad_norm": 0.1856386512517929, "learning_rate": 1.333192259885534e-05, "loss": 0.3752, "step": 1563 }, { "epoch": 0.77, "grad_norm": 0.21580949425697327, "learning_rate": 1.3277789172651439e-05, "loss": 0.4487, "step": 1564 }, { "epoch": 0.77, "grad_norm": 0.19767288863658905, "learning_rate": 1.322374904077206e-05, "loss": 0.4164, "step": 1565 }, { "epoch": 0.77, "grad_norm": 0.19402769207954407, "learning_rate": 1.3169802340508374e-05, "loss": 0.4446, "step": 1566 }, { "epoch": 0.77, "grad_norm": 0.19632939994335175, "learning_rate": 1.3115949208914302e-05, "loss": 0.419, "step": 1567 }, { "epoch": 0.77, "grad_norm": 0.21267423033714294, "learning_rate": 1.306218978280596e-05, "loss": 0.4025, "step": 1568 }, { "epoch": 0.77, "grad_norm": 0.19962793588638306, "learning_rate": 1.3008524198761485e-05, "loss": 0.4194, "step": 1569 }, { "epoch": 0.77, "grad_norm": 0.20233914256095886, "learning_rate": 1.295495259312049e-05, "loss": 0.4305, "step": 1570 }, { "epoch": 0.77, "grad_norm": 0.199266254901886, "learning_rate": 1.2901475101983939e-05, "loss": 0.4034, "step": 1571 }, { "epoch": 0.77, "grad_norm": 0.17649458348751068, "learning_rate": 1.2848091861213634e-05, "loss": 0.4105, "step": 1572 }, { "epoch": 0.77, "grad_norm": 0.19638273119926453, "learning_rate": 1.2794803006431982e-05, "loss": 0.3573, "step": 1573 }, { "epoch": 0.77, "grad_norm": 0.2222113460302353, "learning_rate": 1.274160867302151e-05, "loss": 0.3922, "step": 1574 }, { "epoch": 0.77, "grad_norm": 0.19756613671779633, "learning_rate": 1.268850899612471e-05, "loss": 0.4207, "step": 1575 }, { "epoch": 0.78, "grad_norm": 0.1981205940246582, "learning_rate": 1.263550411064352e-05, "loss": 0.4324, "step": 1576 }, { "epoch": 0.78, "grad_norm": 0.19727161526679993, "learning_rate": 1.2582594151239107e-05, "loss": 0.4009, "step": 1577 }, { "epoch": 0.78, "grad_norm": 0.21921490132808685, "learning_rate": 1.2529779252331453e-05, "loss": 0.4316, "step": 1578 }, { "epoch": 0.78, "grad_norm": 0.1991022378206253, "learning_rate": 1.2477059548099019e-05, "loss": 0.3681, "step": 1579 }, { "epoch": 0.78, "grad_norm": 0.1817418783903122, "learning_rate": 1.2424435172478432e-05, "loss": 0.4019, "step": 1580 }, { "epoch": 0.78, "grad_norm": 0.2050340175628662, "learning_rate": 1.2371906259164168e-05, "loss": 0.474, "step": 1581 }, { "epoch": 0.78, "grad_norm": 0.23674575984477997, "learning_rate": 1.2319472941608118e-05, "loss": 0.4192, "step": 1582 }, { "epoch": 0.78, "grad_norm": 0.1980106383562088, "learning_rate": 1.2267135353019343e-05, "loss": 0.3918, "step": 1583 }, { "epoch": 0.78, "grad_norm": 0.20271557569503784, "learning_rate": 1.2214893626363683e-05, "loss": 0.4323, "step": 1584 }, { "epoch": 0.78, "grad_norm": 0.20678327977657318, "learning_rate": 1.216274789436347e-05, "loss": 0.3806, "step": 1585 }, { "epoch": 0.78, "grad_norm": 0.21985599398612976, "learning_rate": 1.2110698289497119e-05, "loss": 0.3916, "step": 1586 }, { "epoch": 0.78, "grad_norm": 0.21409113705158234, "learning_rate": 1.2058744943998845e-05, "loss": 0.4053, "step": 1587 }, { "epoch": 0.78, "grad_norm": 0.2017294317483902, "learning_rate": 1.20068879898583e-05, "loss": 0.4123, "step": 1588 }, { "epoch": 0.78, "grad_norm": 0.21247157454490662, "learning_rate": 1.1955127558820294e-05, "loss": 0.4299, "step": 1589 }, { "epoch": 0.78, "grad_norm": 0.19242793321609497, "learning_rate": 1.1903463782384361e-05, "loss": 0.4461, "step": 1590 }, { "epoch": 0.78, "grad_norm": 0.20927371084690094, "learning_rate": 1.1851896791804507e-05, "loss": 0.3923, "step": 1591 }, { "epoch": 0.78, "grad_norm": 0.206283301115036, "learning_rate": 1.1800426718088836e-05, "loss": 0.4253, "step": 1592 }, { "epoch": 0.78, "grad_norm": 0.20108522474765778, "learning_rate": 1.1749053691999257e-05, "loss": 0.4125, "step": 1593 }, { "epoch": 0.78, "grad_norm": 0.1970703899860382, "learning_rate": 1.1697777844051105e-05, "loss": 0.4565, "step": 1594 }, { "epoch": 0.78, "grad_norm": 0.236130028963089, "learning_rate": 1.1646599304512829e-05, "loss": 0.4149, "step": 1595 }, { "epoch": 0.79, "grad_norm": 0.2055957317352295, "learning_rate": 1.1595518203405647e-05, "loss": 0.4014, "step": 1596 }, { "epoch": 0.79, "grad_norm": 0.22740332782268524, "learning_rate": 1.1544534670503282e-05, "loss": 0.4384, "step": 1597 }, { "epoch": 0.79, "grad_norm": 0.23274151980876923, "learning_rate": 1.1493648835331539e-05, "loss": 0.4371, "step": 1598 }, { "epoch": 0.79, "grad_norm": 0.20822468400001526, "learning_rate": 1.1442860827168018e-05, "loss": 0.4093, "step": 1599 }, { "epoch": 0.79, "grad_norm": 0.20117636024951935, "learning_rate": 1.1392170775041788e-05, "loss": 0.384, "step": 1600 }, { "epoch": 0.79, "grad_norm": 0.201768696308136, "learning_rate": 1.1341578807733088e-05, "loss": 0.4042, "step": 1601 }, { "epoch": 0.79, "grad_norm": 0.22420474886894226, "learning_rate": 1.1291085053772926e-05, "loss": 0.4508, "step": 1602 }, { "epoch": 0.79, "grad_norm": 0.19843405485153198, "learning_rate": 1.124068964144282e-05, "loss": 0.4361, "step": 1603 }, { "epoch": 0.79, "grad_norm": 0.21102271974086761, "learning_rate": 1.1190392698774416e-05, "loss": 0.4149, "step": 1604 }, { "epoch": 0.79, "grad_norm": 0.19546155631542206, "learning_rate": 1.1140194353549254e-05, "loss": 0.3899, "step": 1605 }, { "epoch": 0.79, "grad_norm": 0.2032841295003891, "learning_rate": 1.1090094733298328e-05, "loss": 0.4338, "step": 1606 }, { "epoch": 0.79, "grad_norm": 0.18995386362075806, "learning_rate": 1.1040093965301834e-05, "loss": 0.3995, "step": 1607 }, { "epoch": 0.79, "grad_norm": 0.20747630298137665, "learning_rate": 1.0990192176588816e-05, "loss": 0.4382, "step": 1608 }, { "epoch": 0.79, "grad_norm": 0.2156132608652115, "learning_rate": 1.0940389493936903e-05, "loss": 0.4424, "step": 1609 }, { "epoch": 0.79, "grad_norm": 0.19035573303699493, "learning_rate": 1.0890686043871894e-05, "loss": 0.3911, "step": 1610 }, { "epoch": 0.79, "grad_norm": 0.19414740800857544, "learning_rate": 1.0841081952667498e-05, "loss": 0.4067, "step": 1611 }, { "epoch": 0.79, "grad_norm": 0.23628096282482147, "learning_rate": 1.0791577346344988e-05, "loss": 0.4403, "step": 1612 }, { "epoch": 0.79, "grad_norm": 0.21353869140148163, "learning_rate": 1.0742172350672925e-05, "loss": 0.4962, "step": 1613 }, { "epoch": 0.79, "grad_norm": 0.19328925013542175, "learning_rate": 1.0692867091166765e-05, "loss": 0.4073, "step": 1614 }, { "epoch": 0.79, "grad_norm": 0.18696925044059753, "learning_rate": 1.064366169308863e-05, "loss": 0.3934, "step": 1615 }, { "epoch": 0.8, "grad_norm": 0.2156519591808319, "learning_rate": 1.0594556281446861e-05, "loss": 0.4573, "step": 1616 }, { "epoch": 0.8, "grad_norm": 0.20224152505397797, "learning_rate": 1.0545550980995856e-05, "loss": 0.4156, "step": 1617 }, { "epoch": 0.8, "grad_norm": 0.18363499641418457, "learning_rate": 1.049664591623563e-05, "loss": 0.4009, "step": 1618 }, { "epoch": 0.8, "grad_norm": 0.17196479439735413, "learning_rate": 1.0447841211411603e-05, "loss": 0.3839, "step": 1619 }, { "epoch": 0.8, "grad_norm": 0.1958489716053009, "learning_rate": 1.0399136990514118e-05, "loss": 0.3911, "step": 1620 }, { "epoch": 0.8, "grad_norm": 0.17815430462360382, "learning_rate": 1.0350533377278353e-05, "loss": 0.4006, "step": 1621 }, { "epoch": 0.8, "grad_norm": 0.17372673749923706, "learning_rate": 1.0302030495183812e-05, "loss": 0.3969, "step": 1622 }, { "epoch": 0.8, "grad_norm": 0.20404615998268127, "learning_rate": 1.025362846745414e-05, "loss": 0.4186, "step": 1623 }, { "epoch": 0.8, "grad_norm": 0.22321657836437225, "learning_rate": 1.0205327417056681e-05, "loss": 0.427, "step": 1624 }, { "epoch": 0.8, "grad_norm": 0.19635595381259918, "learning_rate": 1.0157127466702327e-05, "loss": 0.4195, "step": 1625 }, { "epoch": 0.8, "grad_norm": 0.18007998168468475, "learning_rate": 1.0109028738845062e-05, "loss": 0.3716, "step": 1626 }, { "epoch": 0.8, "grad_norm": 0.22507210075855255, "learning_rate": 1.0061031355681766e-05, "loss": 0.4509, "step": 1627 }, { "epoch": 0.8, "grad_norm": 0.20726299285888672, "learning_rate": 1.0013135439151767e-05, "loss": 0.4221, "step": 1628 }, { "epoch": 0.8, "grad_norm": 0.21522113680839539, "learning_rate": 9.965341110936693e-06, "loss": 0.4196, "step": 1629 }, { "epoch": 0.8, "grad_norm": 0.21569006145000458, "learning_rate": 9.91764849246003e-06, "loss": 0.4401, "step": 1630 }, { "epoch": 0.8, "grad_norm": 0.228597953915596, "learning_rate": 9.870057704886908e-06, "loss": 0.4367, "step": 1631 }, { "epoch": 0.8, "grad_norm": 0.2067096084356308, "learning_rate": 9.822568869123711e-06, "loss": 0.4094, "step": 1632 }, { "epoch": 0.8, "grad_norm": 0.21547719836235046, "learning_rate": 9.775182105817837e-06, "loss": 0.3878, "step": 1633 }, { "epoch": 0.8, "grad_norm": 0.20170579850673676, "learning_rate": 9.72789753535734e-06, "loss": 0.4673, "step": 1634 }, { "epoch": 0.8, "grad_norm": 0.20141136646270752, "learning_rate": 9.680715277870683e-06, "loss": 0.4253, "step": 1635 }, { "epoch": 0.8, "grad_norm": 0.21639539301395416, "learning_rate": 9.633635453226376e-06, "loss": 0.4375, "step": 1636 }, { "epoch": 0.81, "grad_norm": 0.18588784337043762, "learning_rate": 9.586658181032692e-06, "loss": 0.4233, "step": 1637 }, { "epoch": 0.81, "grad_norm": 0.1993391066789627, "learning_rate": 9.539783580637363e-06, "loss": 0.424, "step": 1638 }, { "epoch": 0.81, "grad_norm": 0.19244639575481415, "learning_rate": 9.493011771127308e-06, "loss": 0.403, "step": 1639 }, { "epoch": 0.81, "grad_norm": 0.19878752529621124, "learning_rate": 9.446342871328273e-06, "loss": 0.3974, "step": 1640 }, { "epoch": 0.81, "grad_norm": 0.22190791368484497, "learning_rate": 9.399776999804566e-06, "loss": 0.405, "step": 1641 }, { "epoch": 0.81, "grad_norm": 0.2131974846124649, "learning_rate": 9.353314274858726e-06, "loss": 0.4352, "step": 1642 }, { "epoch": 0.81, "grad_norm": 0.20800502598285675, "learning_rate": 9.306954814531294e-06, "loss": 0.4331, "step": 1643 }, { "epoch": 0.81, "grad_norm": 0.21112945675849915, "learning_rate": 9.26069873660042e-06, "loss": 0.4444, "step": 1644 }, { "epoch": 0.81, "grad_norm": 0.2381008416414261, "learning_rate": 9.214546158581622e-06, "loss": 0.4602, "step": 1645 }, { "epoch": 0.81, "grad_norm": 0.2168436199426651, "learning_rate": 9.16849719772745e-06, "loss": 0.4041, "step": 1646 }, { "epoch": 0.81, "grad_norm": 0.21405412256717682, "learning_rate": 9.122551971027265e-06, "loss": 0.4511, "step": 1647 }, { "epoch": 0.81, "grad_norm": 0.2069162279367447, "learning_rate": 9.076710595206833e-06, "loss": 0.4131, "step": 1648 }, { "epoch": 0.81, "grad_norm": 0.19047938287258148, "learning_rate": 9.030973186728108e-06, "loss": 0.4097, "step": 1649 }, { "epoch": 0.81, "grad_norm": 0.23237194120883942, "learning_rate": 8.985339861788883e-06, "loss": 0.4136, "step": 1650 }, { "epoch": 0.81, "grad_norm": 0.22656477987766266, "learning_rate": 8.939810736322574e-06, "loss": 0.4418, "step": 1651 }, { "epoch": 0.81, "grad_norm": 0.19432346522808075, "learning_rate": 8.894385925997827e-06, "loss": 0.4165, "step": 1652 }, { "epoch": 0.81, "grad_norm": 0.23096157610416412, "learning_rate": 8.849065546218294e-06, "loss": 0.4032, "step": 1653 }, { "epoch": 0.81, "grad_norm": 0.20077307522296906, "learning_rate": 8.803849712122292e-06, "loss": 0.3853, "step": 1654 }, { "epoch": 0.81, "grad_norm": 0.22590836882591248, "learning_rate": 8.758738538582578e-06, "loss": 0.4328, "step": 1655 }, { "epoch": 0.81, "grad_norm": 0.22454734146595, "learning_rate": 8.713732140205977e-06, "loss": 0.4331, "step": 1656 }, { "epoch": 0.82, "grad_norm": 0.25102072954177856, "learning_rate": 8.668830631333147e-06, "loss": 0.4591, "step": 1657 }, { "epoch": 0.82, "grad_norm": 0.2081625610589981, "learning_rate": 8.624034126038249e-06, "loss": 0.4177, "step": 1658 }, { "epoch": 0.82, "grad_norm": 0.17783699929714203, "learning_rate": 8.579342738128714e-06, "loss": 0.3821, "step": 1659 }, { "epoch": 0.82, "grad_norm": 0.20047414302825928, "learning_rate": 8.534756581144893e-06, "loss": 0.4631, "step": 1660 }, { "epoch": 0.82, "grad_norm": 0.2046467512845993, "learning_rate": 8.490275768359784e-06, "loss": 0.4152, "step": 1661 }, { "epoch": 0.82, "grad_norm": 0.19442375004291534, "learning_rate": 8.445900412778767e-06, "loss": 0.4235, "step": 1662 }, { "epoch": 0.82, "grad_norm": 0.20529651641845703, "learning_rate": 8.401630627139318e-06, "loss": 0.4121, "step": 1663 }, { "epoch": 0.82, "grad_norm": 0.19125999510288239, "learning_rate": 8.357466523910689e-06, "loss": 0.411, "step": 1664 }, { "epoch": 0.82, "grad_norm": 0.22517508268356323, "learning_rate": 8.313408215293645e-06, "loss": 0.4125, "step": 1665 }, { "epoch": 0.82, "grad_norm": 0.2182742953300476, "learning_rate": 8.269455813220156e-06, "loss": 0.4242, "step": 1666 }, { "epoch": 0.82, "grad_norm": 0.19459427893161774, "learning_rate": 8.225609429353187e-06, "loss": 0.425, "step": 1667 }, { "epoch": 0.82, "grad_norm": 0.2188401073217392, "learning_rate": 8.181869175086293e-06, "loss": 0.4039, "step": 1668 }, { "epoch": 0.82, "grad_norm": 0.20811998844146729, "learning_rate": 8.138235161543468e-06, "loss": 0.4276, "step": 1669 }, { "epoch": 0.82, "grad_norm": 0.25418028235435486, "learning_rate": 8.094707499578724e-06, "loss": 0.4114, "step": 1670 }, { "epoch": 0.82, "grad_norm": 0.19322514533996582, "learning_rate": 8.051286299775951e-06, "loss": 0.4134, "step": 1671 }, { "epoch": 0.82, "grad_norm": 0.20260502398014069, "learning_rate": 8.00797167244851e-06, "loss": 0.3922, "step": 1672 }, { "epoch": 0.82, "grad_norm": 0.2086922973394394, "learning_rate": 7.964763727639085e-06, "loss": 0.4276, "step": 1673 }, { "epoch": 0.82, "grad_norm": 0.21455061435699463, "learning_rate": 7.92166257511922e-06, "loss": 0.4327, "step": 1674 }, { "epoch": 0.82, "grad_norm": 0.23166830837726593, "learning_rate": 7.878668324389255e-06, "loss": 0.414, "step": 1675 }, { "epoch": 0.82, "grad_norm": 0.21797077357769012, "learning_rate": 7.835781084677862e-06, "loss": 0.425, "step": 1676 }, { "epoch": 0.83, "grad_norm": 0.17600877583026886, "learning_rate": 7.793000964941932e-06, "loss": 0.4109, "step": 1677 }, { "epoch": 0.83, "grad_norm": 0.2215367704629898, "learning_rate": 7.750328073866104e-06, "loss": 0.4442, "step": 1678 }, { "epoch": 0.83, "grad_norm": 0.20055784285068512, "learning_rate": 7.707762519862687e-06, "loss": 0.3938, "step": 1679 }, { "epoch": 0.83, "grad_norm": 0.196663960814476, "learning_rate": 7.665304411071256e-06, "loss": 0.4138, "step": 1680 }, { "epoch": 0.83, "grad_norm": 0.20250043272972107, "learning_rate": 7.6229538553584556e-06, "loss": 0.3965, "step": 1681 }, { "epoch": 0.83, "grad_norm": 0.19539731740951538, "learning_rate": 7.580710960317605e-06, "loss": 0.3796, "step": 1682 }, { "epoch": 0.83, "grad_norm": 0.20999492704868317, "learning_rate": 7.538575833268602e-06, "loss": 0.4057, "step": 1683 }, { "epoch": 0.83, "grad_norm": 0.21114909648895264, "learning_rate": 7.49654858125749e-06, "loss": 0.425, "step": 1684 }, { "epoch": 0.83, "grad_norm": 0.17071469128131866, "learning_rate": 7.454629311056288e-06, "loss": 0.3949, "step": 1685 }, { "epoch": 0.83, "grad_norm": 0.20629732310771942, "learning_rate": 7.412818129162669e-06, "loss": 0.4012, "step": 1686 }, { "epoch": 0.83, "grad_norm": 0.19257041811943054, "learning_rate": 7.371115141799695e-06, "loss": 0.3842, "step": 1687 }, { "epoch": 0.83, "grad_norm": 0.22959087789058685, "learning_rate": 7.329520454915556e-06, "loss": 0.4628, "step": 1688 }, { "epoch": 0.83, "grad_norm": 0.21770425140857697, "learning_rate": 7.288034174183328e-06, "loss": 0.4469, "step": 1689 }, { "epoch": 0.83, "grad_norm": 0.2069375216960907, "learning_rate": 7.246656405000646e-06, "loss": 0.413, "step": 1690 }, { "epoch": 0.83, "grad_norm": 0.19819796085357666, "learning_rate": 7.20538725248947e-06, "loss": 0.4418, "step": 1691 }, { "epoch": 0.83, "grad_norm": 0.20461313426494598, "learning_rate": 7.164226821495823e-06, "loss": 0.3783, "step": 1692 }, { "epoch": 0.83, "grad_norm": 0.2109365314245224, "learning_rate": 7.123175216589528e-06, "loss": 0.3649, "step": 1693 }, { "epoch": 0.83, "grad_norm": 0.18167844414710999, "learning_rate": 7.0822325420639106e-06, "loss": 0.3735, "step": 1694 }, { "epoch": 0.83, "grad_norm": 0.20528644323349, "learning_rate": 7.041398901935558e-06, "loss": 0.4411, "step": 1695 }, { "epoch": 0.83, "grad_norm": 0.20338168740272522, "learning_rate": 7.000674399944046e-06, "loss": 0.4135, "step": 1696 }, { "epoch": 0.83, "grad_norm": 0.18965357542037964, "learning_rate": 6.960059139551705e-06, "loss": 0.4013, "step": 1697 }, { "epoch": 0.84, "grad_norm": 0.18433687090873718, "learning_rate": 6.919553223943314e-06, "loss": 0.4407, "step": 1698 }, { "epoch": 0.84, "grad_norm": 0.22077429294586182, "learning_rate": 6.879156756025851e-06, "loss": 0.4245, "step": 1699 }, { "epoch": 0.84, "grad_norm": 0.20843379199504852, "learning_rate": 6.838869838428241e-06, "loss": 0.4419, "step": 1700 }, { "epoch": 0.84, "grad_norm": 0.2087080329656601, "learning_rate": 6.798692573501114e-06, "loss": 0.4118, "step": 1701 }, { "epoch": 0.84, "grad_norm": 0.19805902242660522, "learning_rate": 6.758625063316493e-06, "loss": 0.4247, "step": 1702 }, { "epoch": 0.84, "grad_norm": 0.18337005376815796, "learning_rate": 6.718667409667578e-06, "loss": 0.4364, "step": 1703 }, { "epoch": 0.84, "grad_norm": 0.19436267018318176, "learning_rate": 6.678819714068463e-06, "loss": 0.4268, "step": 1704 }, { "epoch": 0.84, "grad_norm": 0.20808185636997223, "learning_rate": 6.639082077753922e-06, "loss": 0.4124, "step": 1705 }, { "epoch": 0.84, "grad_norm": 0.22889494895935059, "learning_rate": 6.599454601679078e-06, "loss": 0.4111, "step": 1706 }, { "epoch": 0.84, "grad_norm": 0.22176896035671234, "learning_rate": 6.55993738651921e-06, "loss": 0.4282, "step": 1707 }, { "epoch": 0.84, "grad_norm": 0.2038697600364685, "learning_rate": 6.52053053266945e-06, "loss": 0.3985, "step": 1708 }, { "epoch": 0.84, "grad_norm": 0.18443606793880463, "learning_rate": 6.4812341402446075e-06, "loss": 0.3635, "step": 1709 }, { "epoch": 0.84, "grad_norm": 0.18501728773117065, "learning_rate": 6.442048309078796e-06, "loss": 0.4007, "step": 1710 }, { "epoch": 0.84, "grad_norm": 0.19907592236995697, "learning_rate": 6.402973138725282e-06, "loss": 0.427, "step": 1711 }, { "epoch": 0.84, "grad_norm": 0.20524227619171143, "learning_rate": 6.364008728456172e-06, "loss": 0.4186, "step": 1712 }, { "epoch": 0.84, "grad_norm": 0.20032066106796265, "learning_rate": 6.32515517726221e-06, "loss": 0.4378, "step": 1713 }, { "epoch": 0.84, "grad_norm": 0.1688099503517151, "learning_rate": 6.286412583852469e-06, "loss": 0.4071, "step": 1714 }, { "epoch": 0.84, "grad_norm": 0.21957676112651825, "learning_rate": 6.247781046654144e-06, "loss": 0.408, "step": 1715 }, { "epoch": 0.84, "grad_norm": 0.20641595125198364, "learning_rate": 6.209260663812272e-06, "loss": 0.41, "step": 1716 }, { "epoch": 0.84, "grad_norm": 0.21435695886611938, "learning_rate": 6.170851533189537e-06, "loss": 0.4504, "step": 1717 }, { "epoch": 0.85, "grad_norm": 0.18814608454704285, "learning_rate": 6.132553752365927e-06, "loss": 0.3976, "step": 1718 }, { "epoch": 0.85, "grad_norm": 0.18613781034946442, "learning_rate": 6.0943674186385856e-06, "loss": 0.4299, "step": 1719 }, { "epoch": 0.85, "grad_norm": 0.17699775099754333, "learning_rate": 6.056292629021482e-06, "loss": 0.4248, "step": 1720 }, { "epoch": 0.85, "grad_norm": 0.19636863470077515, "learning_rate": 6.018329480245255e-06, "loss": 0.3951, "step": 1721 }, { "epoch": 0.85, "grad_norm": 0.1924283504486084, "learning_rate": 5.9804780687568504e-06, "loss": 0.4354, "step": 1722 }, { "epoch": 0.85, "grad_norm": 0.193400040268898, "learning_rate": 5.9427384907194185e-06, "loss": 0.4049, "step": 1723 }, { "epoch": 0.85, "grad_norm": 0.1960735321044922, "learning_rate": 5.905110842011896e-06, "loss": 0.3902, "step": 1724 }, { "epoch": 0.85, "grad_norm": 0.2234296351671219, "learning_rate": 5.867595218228949e-06, "loss": 0.4135, "step": 1725 }, { "epoch": 0.85, "grad_norm": 0.21450071036815643, "learning_rate": 5.830191714680577e-06, "loss": 0.4394, "step": 1726 }, { "epoch": 0.85, "grad_norm": 0.20071625709533691, "learning_rate": 5.79290042639199e-06, "loss": 0.4066, "step": 1727 }, { "epoch": 0.85, "grad_norm": 0.190434992313385, "learning_rate": 5.755721448103229e-06, "loss": 0.4134, "step": 1728 }, { "epoch": 0.85, "grad_norm": 0.22027884423732758, "learning_rate": 5.718654874269103e-06, "loss": 0.4503, "step": 1729 }, { "epoch": 0.85, "grad_norm": 0.2222197949886322, "learning_rate": 5.68170079905877e-06, "loss": 0.3886, "step": 1730 }, { "epoch": 0.85, "grad_norm": 0.21269673109054565, "learning_rate": 5.64485931635565e-06, "loss": 0.4055, "step": 1731 }, { "epoch": 0.85, "grad_norm": 0.18928168714046478, "learning_rate": 5.608130519757043e-06, "loss": 0.4098, "step": 1732 }, { "epoch": 0.85, "grad_norm": 0.21093791723251343, "learning_rate": 5.571514502574038e-06, "loss": 0.4044, "step": 1733 }, { "epoch": 0.85, "grad_norm": 0.19369980692863464, "learning_rate": 5.535011357831149e-06, "loss": 0.3939, "step": 1734 }, { "epoch": 0.85, "grad_norm": 0.2005825638771057, "learning_rate": 5.498621178266167e-06, "loss": 0.3476, "step": 1735 }, { "epoch": 0.85, "grad_norm": 0.2121814787387848, "learning_rate": 5.462344056329877e-06, "loss": 0.3923, "step": 1736 }, { "epoch": 0.85, "grad_norm": 0.21452876925468445, "learning_rate": 5.426180084185828e-06, "loss": 0.4095, "step": 1737 }, { "epoch": 0.86, "grad_norm": 0.2058766633272171, "learning_rate": 5.39012935371011e-06, "loss": 0.4076, "step": 1738 }, { "epoch": 0.86, "grad_norm": 0.21767227351665497, "learning_rate": 5.354191956491145e-06, "loss": 0.4303, "step": 1739 }, { "epoch": 0.86, "grad_norm": 0.22206884622573853, "learning_rate": 5.318367983829392e-06, "loss": 0.4116, "step": 1740 }, { "epoch": 0.86, "grad_norm": 0.20146076381206512, "learning_rate": 5.2826575267371615e-06, "loss": 0.4318, "step": 1741 }, { "epoch": 0.86, "grad_norm": 0.22497648000717163, "learning_rate": 5.247060675938375e-06, "loss": 0.4225, "step": 1742 }, { "epoch": 0.86, "grad_norm": 0.20824171602725983, "learning_rate": 5.211577521868339e-06, "loss": 0.4411, "step": 1743 }, { "epoch": 0.86, "grad_norm": 0.21089524030685425, "learning_rate": 5.176208154673501e-06, "loss": 0.4113, "step": 1744 }, { "epoch": 0.86, "grad_norm": 0.22562706470489502, "learning_rate": 5.1409526642112295e-06, "loss": 0.4234, "step": 1745 }, { "epoch": 0.86, "grad_norm": 0.2116924375295639, "learning_rate": 5.105811140049571e-06, "loss": 0.423, "step": 1746 }, { "epoch": 0.86, "grad_norm": 0.20538924634456635, "learning_rate": 5.070783671467066e-06, "loss": 0.3786, "step": 1747 }, { "epoch": 0.86, "grad_norm": 0.1939808875322342, "learning_rate": 5.03587034745247e-06, "loss": 0.3889, "step": 1748 }, { "epoch": 0.86, "grad_norm": 0.2072398066520691, "learning_rate": 5.001071256704554e-06, "loss": 0.4092, "step": 1749 }, { "epoch": 0.86, "grad_norm": 0.20914949476718903, "learning_rate": 4.966386487631863e-06, "loss": 0.4126, "step": 1750 }, { "epoch": 0.86, "grad_norm": 0.2222181260585785, "learning_rate": 4.931816128352534e-06, "loss": 0.3818, "step": 1751 }, { "epoch": 0.86, "grad_norm": 0.2109532505273819, "learning_rate": 4.897360266694012e-06, "loss": 0.4043, "step": 1752 }, { "epoch": 0.86, "grad_norm": 0.20766288042068481, "learning_rate": 4.863018990192869e-06, "loss": 0.429, "step": 1753 }, { "epoch": 0.86, "grad_norm": 0.20251628756523132, "learning_rate": 4.828792386094555e-06, "loss": 0.3968, "step": 1754 }, { "epoch": 0.86, "grad_norm": 0.21159225702285767, "learning_rate": 4.794680541353214e-06, "loss": 0.4247, "step": 1755 }, { "epoch": 0.86, "grad_norm": 0.18154466152191162, "learning_rate": 4.760683542631422e-06, "loss": 0.4116, "step": 1756 }, { "epoch": 0.86, "grad_norm": 0.19249317049980164, "learning_rate": 4.726801476299991e-06, "loss": 0.4169, "step": 1757 }, { "epoch": 0.86, "grad_norm": 0.22602523863315582, "learning_rate": 4.69303442843772e-06, "loss": 0.43, "step": 1758 }, { "epoch": 0.87, "grad_norm": 0.17798416316509247, "learning_rate": 4.659382484831238e-06, "loss": 0.348, "step": 1759 }, { "epoch": 0.87, "grad_norm": 0.23558557033538818, "learning_rate": 4.62584573097472e-06, "loss": 0.4557, "step": 1760 }, { "epoch": 0.87, "grad_norm": 0.1950879544019699, "learning_rate": 4.592424252069705e-06, "loss": 0.3808, "step": 1761 }, { "epoch": 0.87, "grad_norm": 0.23806942999362946, "learning_rate": 4.559118133024853e-06, "loss": 0.3994, "step": 1762 }, { "epoch": 0.87, "grad_norm": 0.18817107379436493, "learning_rate": 4.525927458455786e-06, "loss": 0.3837, "step": 1763 }, { "epoch": 0.87, "grad_norm": 0.22326554358005524, "learning_rate": 4.492852312684803e-06, "loss": 0.4446, "step": 1764 }, { "epoch": 0.87, "grad_norm": 0.19444304704666138, "learning_rate": 4.4598927797407045e-06, "loss": 0.4041, "step": 1765 }, { "epoch": 0.87, "grad_norm": 0.24679523706436157, "learning_rate": 4.427048943358564e-06, "loss": 0.4213, "step": 1766 }, { "epoch": 0.87, "grad_norm": 0.182652547955513, "learning_rate": 4.394320886979552e-06, "loss": 0.3934, "step": 1767 }, { "epoch": 0.87, "grad_norm": 0.19980712234973907, "learning_rate": 4.361708693750665e-06, "loss": 0.3721, "step": 1768 }, { "epoch": 0.87, "grad_norm": 0.2574414908885956, "learning_rate": 4.329212446524555e-06, "loss": 0.4639, "step": 1769 }, { "epoch": 0.87, "grad_norm": 0.22576430439949036, "learning_rate": 4.296832227859299e-06, "loss": 0.4354, "step": 1770 }, { "epoch": 0.87, "grad_norm": 0.19671516120433807, "learning_rate": 4.26456812001822e-06, "loss": 0.4279, "step": 1771 }, { "epoch": 0.87, "grad_norm": 0.20316365361213684, "learning_rate": 4.232420204969634e-06, "loss": 0.4311, "step": 1772 }, { "epoch": 0.87, "grad_norm": 0.2197170853614807, "learning_rate": 4.200388564386698e-06, "loss": 0.4408, "step": 1773 }, { "epoch": 0.87, "grad_norm": 0.21952112019062042, "learning_rate": 4.168473279647111e-06, "loss": 0.3688, "step": 1774 }, { "epoch": 0.87, "grad_norm": 0.24765317142009735, "learning_rate": 4.136674431833021e-06, "loss": 0.407, "step": 1775 }, { "epoch": 0.87, "grad_norm": 0.20826472342014313, "learning_rate": 4.104992101730726e-06, "loss": 0.3957, "step": 1776 }, { "epoch": 0.87, "grad_norm": 0.19602957367897034, "learning_rate": 4.073426369830552e-06, "loss": 0.4016, "step": 1777 }, { "epoch": 0.87, "grad_norm": 0.21990837156772614, "learning_rate": 4.0419773163265285e-06, "loss": 0.4225, "step": 1778 }, { "epoch": 0.88, "grad_norm": 0.2168082445859909, "learning_rate": 4.010645021116332e-06, "loss": 0.4281, "step": 1779 }, { "epoch": 0.88, "grad_norm": 0.21174605190753937, "learning_rate": 3.979429563800968e-06, "loss": 0.3627, "step": 1780 }, { "epoch": 0.88, "grad_norm": 0.2117747962474823, "learning_rate": 3.948331023684637e-06, "loss": 0.4178, "step": 1781 }, { "epoch": 0.88, "grad_norm": 0.2376181036233902, "learning_rate": 3.917349479774457e-06, "loss": 0.4415, "step": 1782 }, { "epoch": 0.88, "grad_norm": 0.19416551291942596, "learning_rate": 3.886485010780378e-06, "loss": 0.3898, "step": 1783 }, { "epoch": 0.88, "grad_norm": 0.22622522711753845, "learning_rate": 3.855737695114864e-06, "loss": 0.4389, "step": 1784 }, { "epoch": 0.88, "grad_norm": 0.20256483554840088, "learning_rate": 3.825107610892798e-06, "loss": 0.4132, "step": 1785 }, { "epoch": 0.88, "grad_norm": 0.20474128425121307, "learning_rate": 3.7945948359311535e-06, "loss": 0.4044, "step": 1786 }, { "epoch": 0.88, "grad_norm": 0.2127659022808075, "learning_rate": 3.764199447748962e-06, "loss": 0.4347, "step": 1787 }, { "epoch": 0.88, "grad_norm": 0.22377413511276245, "learning_rate": 3.733921523566958e-06, "loss": 0.4221, "step": 1788 }, { "epoch": 0.88, "grad_norm": 0.22696468234062195, "learning_rate": 3.70376114030751e-06, "loss": 0.4246, "step": 1789 }, { "epoch": 0.88, "grad_norm": 0.20068852603435516, "learning_rate": 3.6737183745943317e-06, "loss": 0.4709, "step": 1790 }, { "epoch": 0.88, "grad_norm": 0.23992674052715302, "learning_rate": 3.64379330275233e-06, "loss": 0.4222, "step": 1791 }, { "epoch": 0.88, "grad_norm": 0.21016018092632294, "learning_rate": 3.6139860008074122e-06, "loss": 0.4386, "step": 1792 }, { "epoch": 0.88, "grad_norm": 0.22421510517597198, "learning_rate": 3.5842965444862984e-06, "loss": 0.4252, "step": 1793 }, { "epoch": 0.88, "grad_norm": 0.1979125738143921, "learning_rate": 3.5547250092162975e-06, "loss": 0.4143, "step": 1794 }, { "epoch": 0.88, "grad_norm": 0.2162698358297348, "learning_rate": 3.5252714701251356e-06, "loss": 0.4005, "step": 1795 }, { "epoch": 0.88, "grad_norm": 0.198076069355011, "learning_rate": 3.495936002040767e-06, "loss": 0.4041, "step": 1796 }, { "epoch": 0.88, "grad_norm": 0.2078656107187271, "learning_rate": 3.4667186794912044e-06, "loss": 0.4232, "step": 1797 }, { "epoch": 0.88, "grad_norm": 0.24045872688293457, "learning_rate": 3.437619576704271e-06, "loss": 0.4404, "step": 1798 }, { "epoch": 0.89, "grad_norm": 0.20662428438663483, "learning_rate": 3.4086387676074636e-06, "loss": 0.434, "step": 1799 }, { "epoch": 0.89, "grad_norm": 0.21099458634853363, "learning_rate": 3.379776325827749e-06, "loss": 0.4116, "step": 1800 }, { "epoch": 0.89, "grad_norm": 0.2138308882713318, "learning_rate": 3.3510323246913887e-06, "loss": 0.4662, "step": 1801 }, { "epoch": 0.89, "grad_norm": 0.2190125435590744, "learning_rate": 3.3224068372237237e-06, "loss": 0.4122, "step": 1802 }, { "epoch": 0.89, "grad_norm": 0.20099395513534546, "learning_rate": 3.2938999361490085e-06, "loss": 0.4027, "step": 1803 }, { "epoch": 0.89, "grad_norm": 0.20788200199604034, "learning_rate": 3.2655116938902162e-06, "loss": 0.4048, "step": 1804 }, { "epoch": 0.89, "grad_norm": 0.21974097192287445, "learning_rate": 3.2372421825689004e-06, "loss": 0.4498, "step": 1805 }, { "epoch": 0.89, "grad_norm": 0.18858005106449127, "learning_rate": 3.209091474004922e-06, "loss": 0.3987, "step": 1806 }, { "epoch": 0.89, "grad_norm": 0.20863588154315948, "learning_rate": 3.181059639716355e-06, "loss": 0.3771, "step": 1807 }, { "epoch": 0.89, "grad_norm": 0.21534626185894012, "learning_rate": 3.1531467509192505e-06, "loss": 0.4149, "step": 1808 }, { "epoch": 0.89, "grad_norm": 0.21677905321121216, "learning_rate": 3.1253528785274823e-06, "loss": 0.4211, "step": 1809 }, { "epoch": 0.89, "grad_norm": 0.20993918180465698, "learning_rate": 3.0976780931525507e-06, "loss": 0.4274, "step": 1810 }, { "epoch": 0.89, "grad_norm": 0.2098807841539383, "learning_rate": 3.07012246510342e-06, "loss": 0.3968, "step": 1811 }, { "epoch": 0.89, "grad_norm": 0.18863803148269653, "learning_rate": 3.0426860643863186e-06, "loss": 0.4093, "step": 1812 }, { "epoch": 0.89, "grad_norm": 0.2022509127855301, "learning_rate": 3.0153689607045845e-06, "loss": 0.3958, "step": 1813 }, { "epoch": 0.89, "grad_norm": 0.20496173202991486, "learning_rate": 2.9881712234584692e-06, "loss": 0.4623, "step": 1814 }, { "epoch": 0.89, "grad_norm": 0.2346290647983551, "learning_rate": 2.9610929217449735e-06, "loss": 0.4348, "step": 1815 }, { "epoch": 0.89, "grad_norm": 0.22074253857135773, "learning_rate": 2.934134124357646e-06, "loss": 0.457, "step": 1816 }, { "epoch": 0.89, "grad_norm": 0.21187560260295868, "learning_rate": 2.907294899786478e-06, "loss": 0.4019, "step": 1817 }, { "epoch": 0.89, "grad_norm": 0.19057275354862213, "learning_rate": 2.880575316217632e-06, "loss": 0.4073, "step": 1818 }, { "epoch": 0.89, "grad_norm": 0.18932707607746124, "learning_rate": 2.853975441533341e-06, "loss": 0.4046, "step": 1819 }, { "epoch": 0.9, "grad_norm": 0.2132713496685028, "learning_rate": 2.827495343311698e-06, "loss": 0.4237, "step": 1820 }, { "epoch": 0.9, "grad_norm": 0.21885187923908234, "learning_rate": 2.801135088826529e-06, "loss": 0.4338, "step": 1821 }, { "epoch": 0.9, "grad_norm": 0.2400888204574585, "learning_rate": 2.774894745047152e-06, "loss": 0.4169, "step": 1822 }, { "epoch": 0.9, "grad_norm": 0.18380342423915863, "learning_rate": 2.7487743786382803e-06, "loss": 0.3968, "step": 1823 }, { "epoch": 0.9, "grad_norm": 0.218204066157341, "learning_rate": 2.7227740559597925e-06, "loss": 0.4397, "step": 1824 }, { "epoch": 0.9, "grad_norm": 0.21851587295532227, "learning_rate": 2.6968938430666168e-06, "loss": 0.4183, "step": 1825 }, { "epoch": 0.9, "grad_norm": 0.2244994044303894, "learning_rate": 2.6711338057085145e-06, "loss": 0.4237, "step": 1826 }, { "epoch": 0.9, "grad_norm": 0.19670161604881287, "learning_rate": 2.6454940093299686e-06, "loss": 0.4054, "step": 1827 }, { "epoch": 0.9, "grad_norm": 0.20466399192810059, "learning_rate": 2.619974519069929e-06, "loss": 0.4223, "step": 1828 }, { "epoch": 0.9, "grad_norm": 0.2283145785331726, "learning_rate": 2.594575399761756e-06, "loss": 0.4366, "step": 1829 }, { "epoch": 0.9, "grad_norm": 0.23291389644145966, "learning_rate": 2.5692967159329786e-06, "loss": 0.4369, "step": 1830 }, { "epoch": 0.9, "grad_norm": 0.22148671746253967, "learning_rate": 2.5441385318051735e-06, "loss": 0.4106, "step": 1831 }, { "epoch": 0.9, "grad_norm": 0.21913298964500427, "learning_rate": 2.519100911293737e-06, "loss": 0.4521, "step": 1832 }, { "epoch": 0.9, "grad_norm": 0.20981213450431824, "learning_rate": 2.4941839180078254e-06, "loss": 0.4356, "step": 1833 }, { "epoch": 0.9, "grad_norm": 0.232079416513443, "learning_rate": 2.469387615250096e-06, "loss": 0.415, "step": 1834 }, { "epoch": 0.9, "grad_norm": 0.21712473034858704, "learning_rate": 2.444712066016619e-06, "loss": 0.4459, "step": 1835 }, { "epoch": 0.9, "grad_norm": 0.21690692007541656, "learning_rate": 2.420157332996642e-06, "loss": 0.4407, "step": 1836 }, { "epoch": 0.9, "grad_norm": 0.20435580611228943, "learning_rate": 2.39572347857252e-06, "loss": 0.4029, "step": 1837 }, { "epoch": 0.9, "grad_norm": 0.21136578917503357, "learning_rate": 2.371410564819476e-06, "loss": 0.4143, "step": 1838 }, { "epoch": 0.9, "grad_norm": 0.18793676793575287, "learning_rate": 2.3472186535055063e-06, "loss": 0.3763, "step": 1839 }, { "epoch": 0.91, "grad_norm": 0.22332410514354706, "learning_rate": 2.3231478060911694e-06, "loss": 0.4012, "step": 1840 }, { "epoch": 0.91, "grad_norm": 0.2000952661037445, "learning_rate": 2.2991980837294757e-06, "loss": 0.411, "step": 1841 }, { "epoch": 0.91, "grad_norm": 0.20654362440109253, "learning_rate": 2.275369547265693e-06, "loss": 0.4015, "step": 1842 }, { "epoch": 0.91, "grad_norm": 0.20377500355243683, "learning_rate": 2.2516622572372414e-06, "loss": 0.3734, "step": 1843 }, { "epoch": 0.91, "grad_norm": 0.23251742124557495, "learning_rate": 2.228076273873475e-06, "loss": 0.421, "step": 1844 }, { "epoch": 0.91, "grad_norm": 0.19338220357894897, "learning_rate": 2.2046116570955846e-06, "loss": 0.3826, "step": 1845 }, { "epoch": 0.91, "grad_norm": 0.20749856531620026, "learning_rate": 2.1812684665164063e-06, "loss": 0.374, "step": 1846 }, { "epoch": 0.91, "grad_norm": 0.20768998563289642, "learning_rate": 2.158046761440324e-06, "loss": 0.4215, "step": 1847 }, { "epoch": 0.91, "grad_norm": 0.24220119416713715, "learning_rate": 2.1349466008630416e-06, "loss": 0.4264, "step": 1848 }, { "epoch": 0.91, "grad_norm": 0.2018035650253296, "learning_rate": 2.1119680434714904e-06, "loss": 0.406, "step": 1849 }, { "epoch": 0.91, "grad_norm": 0.20518916845321655, "learning_rate": 2.0891111476436632e-06, "loss": 0.4162, "step": 1850 }, { "epoch": 0.91, "grad_norm": 0.22521735727787018, "learning_rate": 2.066375971448481e-06, "loss": 0.4379, "step": 1851 }, { "epoch": 0.91, "grad_norm": 0.2196858823299408, "learning_rate": 2.043762572645602e-06, "loss": 0.4497, "step": 1852 }, { "epoch": 0.91, "grad_norm": 0.21082384884357452, "learning_rate": 2.021271008685327e-06, "loss": 0.387, "step": 1853 }, { "epoch": 0.91, "grad_norm": 0.19944769144058228, "learning_rate": 1.998901336708414e-06, "loss": 0.4178, "step": 1854 }, { "epoch": 0.91, "grad_norm": 0.1943204402923584, "learning_rate": 1.97665361354597e-06, "loss": 0.4129, "step": 1855 }, { "epoch": 0.91, "grad_norm": 0.22640955448150635, "learning_rate": 1.954527895719266e-06, "loss": 0.4233, "step": 1856 }, { "epoch": 0.91, "grad_norm": 0.23088344931602478, "learning_rate": 1.9325242394396203e-06, "loss": 0.4308, "step": 1857 }, { "epoch": 0.91, "grad_norm": 0.21118809282779694, "learning_rate": 1.9106427006082504e-06, "loss": 0.3965, "step": 1858 }, { "epoch": 0.91, "grad_norm": 0.2244696021080017, "learning_rate": 1.8888833348161272e-06, "loss": 0.439, "step": 1859 }, { "epoch": 0.92, "grad_norm": 0.21009711921215057, "learning_rate": 1.8672461973438414e-06, "loss": 0.358, "step": 1860 }, { "epoch": 0.92, "grad_norm": 0.21541482210159302, "learning_rate": 1.8457313431614498e-06, "loss": 0.4005, "step": 1861 }, { "epoch": 0.92, "grad_norm": 0.23341244459152222, "learning_rate": 1.8243388269283401e-06, "loss": 0.4536, "step": 1862 }, { "epoch": 0.92, "grad_norm": 0.21500873565673828, "learning_rate": 1.8030687029931105e-06, "loss": 0.4349, "step": 1863 }, { "epoch": 0.92, "grad_norm": 0.1953534036874771, "learning_rate": 1.7819210253934072e-06, "loss": 0.3977, "step": 1864 }, { "epoch": 0.92, "grad_norm": 0.23788581788539886, "learning_rate": 1.7608958478557868e-06, "loss": 0.4471, "step": 1865 }, { "epoch": 0.92, "grad_norm": 0.19605985283851624, "learning_rate": 1.739993223795605e-06, "loss": 0.4098, "step": 1866 }, { "epoch": 0.92, "grad_norm": 0.20600788295269012, "learning_rate": 1.7192132063168664e-06, "loss": 0.4282, "step": 1867 }, { "epoch": 0.92, "grad_norm": 0.20914702117443085, "learning_rate": 1.6985558482120745e-06, "loss": 0.3947, "step": 1868 }, { "epoch": 0.92, "grad_norm": 0.24296888709068298, "learning_rate": 1.6780212019621155e-06, "loss": 0.3932, "step": 1869 }, { "epoch": 0.92, "grad_norm": 0.21534322202205658, "learning_rate": 1.6576093197361253e-06, "loss": 0.3876, "step": 1870 }, { "epoch": 0.92, "grad_norm": 0.23175913095474243, "learning_rate": 1.6373202533913556e-06, "loss": 0.4302, "step": 1871 }, { "epoch": 0.92, "grad_norm": 0.20790866017341614, "learning_rate": 1.6171540544730301e-06, "loss": 0.4298, "step": 1872 }, { "epoch": 0.92, "grad_norm": 0.206355482339859, "learning_rate": 1.5971107742142333e-06, "loss": 0.4089, "step": 1873 }, { "epoch": 0.92, "grad_norm": 0.21617279946804047, "learning_rate": 1.5771904635357493e-06, "loss": 0.4242, "step": 1874 }, { "epoch": 0.92, "grad_norm": 0.20635119080543518, "learning_rate": 1.5573931730459846e-06, "loss": 0.3744, "step": 1875 }, { "epoch": 0.92, "grad_norm": 0.21111814677715302, "learning_rate": 1.5377189530407786e-06, "loss": 0.4025, "step": 1876 }, { "epoch": 0.92, "grad_norm": 0.17844848334789276, "learning_rate": 1.5181678535033272e-06, "loss": 0.3665, "step": 1877 }, { "epoch": 0.92, "grad_norm": 0.20188473165035248, "learning_rate": 1.498739924104009e-06, "loss": 0.4397, "step": 1878 }, { "epoch": 0.92, "grad_norm": 0.25785887241363525, "learning_rate": 1.4794352142003087e-06, "loss": 0.4429, "step": 1879 }, { "epoch": 0.92, "grad_norm": 0.20419232547283173, "learning_rate": 1.460253772836645e-06, "loss": 0.3934, "step": 1880 }, { "epoch": 0.93, "grad_norm": 0.23154211044311523, "learning_rate": 1.4411956487442925e-06, "loss": 0.4482, "step": 1881 }, { "epoch": 0.93, "grad_norm": 0.23342986404895782, "learning_rate": 1.4222608903412039e-06, "loss": 0.3789, "step": 1882 }, { "epoch": 0.93, "grad_norm": 0.21670278906822205, "learning_rate": 1.403449545731944e-06, "loss": 0.4069, "step": 1883 }, { "epoch": 0.93, "grad_norm": 0.2121937870979309, "learning_rate": 1.3847616627075121e-06, "loss": 0.4244, "step": 1884 }, { "epoch": 0.93, "grad_norm": 0.20774179697036743, "learning_rate": 1.3661972887452857e-06, "loss": 0.4126, "step": 1885 }, { "epoch": 0.93, "grad_norm": 0.21411091089248657, "learning_rate": 1.3477564710088098e-06, "loss": 0.4108, "step": 1886 }, { "epoch": 0.93, "grad_norm": 0.19670739769935608, "learning_rate": 1.329439256347781e-06, "loss": 0.4057, "step": 1887 }, { "epoch": 0.93, "grad_norm": 0.1967586725950241, "learning_rate": 1.3112456912978465e-06, "loss": 0.4375, "step": 1888 }, { "epoch": 0.93, "grad_norm": 0.21294692158699036, "learning_rate": 1.2931758220805335e-06, "loss": 0.4318, "step": 1889 }, { "epoch": 0.93, "grad_norm": 0.20194266736507416, "learning_rate": 1.2752296946030917e-06, "loss": 0.4192, "step": 1890 }, { "epoch": 0.93, "grad_norm": 0.21286322176456451, "learning_rate": 1.2574073544584286e-06, "loss": 0.3845, "step": 1891 }, { "epoch": 0.93, "grad_norm": 0.19377005100250244, "learning_rate": 1.2397088469249418e-06, "loss": 0.4353, "step": 1892 }, { "epoch": 0.93, "grad_norm": 0.21963383257389069, "learning_rate": 1.2221342169664419e-06, "loss": 0.4158, "step": 1893 }, { "epoch": 0.93, "grad_norm": 0.25539687275886536, "learning_rate": 1.204683509232013e-06, "loss": 0.4591, "step": 1894 }, { "epoch": 0.93, "grad_norm": 0.21926568448543549, "learning_rate": 1.1873567680559139e-06, "loss": 0.4309, "step": 1895 }, { "epoch": 0.93, "grad_norm": 0.2100962996482849, "learning_rate": 1.1701540374574605e-06, "loss": 0.3965, "step": 1896 }, { "epoch": 0.93, "grad_norm": 0.23248746991157532, "learning_rate": 1.153075361140915e-06, "loss": 0.4304, "step": 1897 }, { "epoch": 0.93, "grad_norm": 0.22966471314430237, "learning_rate": 1.1361207824953813e-06, "loss": 0.3939, "step": 1898 }, { "epoch": 0.93, "grad_norm": 0.19890917837619781, "learning_rate": 1.1192903445946757e-06, "loss": 0.4151, "step": 1899 }, { "epoch": 0.93, "grad_norm": 0.20696093142032623, "learning_rate": 1.1025840901972229e-06, "loss": 0.4495, "step": 1900 }, { "epoch": 0.94, "grad_norm": 0.20510539412498474, "learning_rate": 1.0860020617459887e-06, "loss": 0.3969, "step": 1901 }, { "epoch": 0.94, "grad_norm": 0.19739936292171478, "learning_rate": 1.069544301368308e-06, "loss": 0.4295, "step": 1902 }, { "epoch": 0.94, "grad_norm": 0.22097568213939667, "learning_rate": 1.0532108508758076e-06, "loss": 0.4541, "step": 1903 }, { "epoch": 0.94, "grad_norm": 0.23417547345161438, "learning_rate": 1.037001751764305e-06, "loss": 0.4225, "step": 1904 }, { "epoch": 0.94, "grad_norm": 0.1932479292154312, "learning_rate": 1.0209170452137095e-06, "loss": 0.4375, "step": 1905 }, { "epoch": 0.94, "grad_norm": 0.20545832812786102, "learning_rate": 1.004956772087895e-06, "loss": 0.4068, "step": 1906 }, { "epoch": 0.94, "grad_norm": 0.2106175422668457, "learning_rate": 9.891209729346096e-07, "loss": 0.4526, "step": 1907 }, { "epoch": 0.94, "grad_norm": 0.20827437937259674, "learning_rate": 9.734096879853549e-07, "loss": 0.355, "step": 1908 }, { "epoch": 0.94, "grad_norm": 0.23807892203330994, "learning_rate": 9.578229571553299e-07, "loss": 0.4236, "step": 1909 }, { "epoch": 0.94, "grad_norm": 0.2260412573814392, "learning_rate": 9.423608200432699e-07, "loss": 0.4524, "step": 1910 }, { "epoch": 0.94, "grad_norm": 0.20360590517520905, "learning_rate": 9.270233159313912e-07, "loss": 0.4077, "step": 1911 }, { "epoch": 0.94, "grad_norm": 0.21435077488422394, "learning_rate": 9.118104837852637e-07, "loss": 0.4351, "step": 1912 }, { "epoch": 0.94, "grad_norm": 0.2177480012178421, "learning_rate": 8.967223622537324e-07, "loss": 0.4486, "step": 1913 }, { "epoch": 0.94, "grad_norm": 0.20079147815704346, "learning_rate": 8.817589896688127e-07, "loss": 0.4005, "step": 1914 }, { "epoch": 0.94, "grad_norm": 0.22776076197624207, "learning_rate": 8.669204040455736e-07, "loss": 0.4278, "step": 1915 }, { "epoch": 0.94, "grad_norm": 0.21716420352458954, "learning_rate": 8.522066430820707e-07, "loss": 0.4449, "step": 1916 }, { "epoch": 0.94, "grad_norm": 0.25123047828674316, "learning_rate": 8.376177441592359e-07, "loss": 0.4383, "step": 1917 }, { "epoch": 0.94, "grad_norm": 0.21649956703186035, "learning_rate": 8.231537443407821e-07, "loss": 0.4004, "step": 1918 }, { "epoch": 0.94, "grad_norm": 0.1996014565229416, "learning_rate": 8.088146803731044e-07, "loss": 0.4222, "step": 1919 }, { "epoch": 0.94, "grad_norm": 0.19229093194007874, "learning_rate": 7.946005886852015e-07, "loss": 0.3757, "step": 1920 }, { "epoch": 0.95, "grad_norm": 0.21812762320041656, "learning_rate": 7.805115053885758e-07, "loss": 0.4072, "step": 1921 }, { "epoch": 0.95, "grad_norm": 0.195990189909935, "learning_rate": 7.665474662771399e-07, "loss": 0.4176, "step": 1922 }, { "epoch": 0.95, "grad_norm": 0.20929014682769775, "learning_rate": 7.527085068271212e-07, "loss": 0.3875, "step": 1923 }, { "epoch": 0.95, "grad_norm": 0.24352213740348816, "learning_rate": 7.389946621969679e-07, "loss": 0.4048, "step": 1924 }, { "epoch": 0.95, "grad_norm": 0.18062089383602142, "learning_rate": 7.254059672272995e-07, "loss": 0.3772, "step": 1925 }, { "epoch": 0.95, "grad_norm": 0.2000730037689209, "learning_rate": 7.119424564407562e-07, "loss": 0.4388, "step": 1926 }, { "epoch": 0.95, "grad_norm": 0.21490830183029175, "learning_rate": 6.986041640419605e-07, "loss": 0.4414, "step": 1927 }, { "epoch": 0.95, "grad_norm": 0.22425851225852966, "learning_rate": 6.853911239173949e-07, "loss": 0.402, "step": 1928 }, { "epoch": 0.95, "grad_norm": 0.19869260489940643, "learning_rate": 6.723033696353631e-07, "loss": 0.42, "step": 1929 }, { "epoch": 0.95, "grad_norm": 0.20910996198654175, "learning_rate": 6.593409344458402e-07, "loss": 0.4247, "step": 1930 }, { "epoch": 0.95, "grad_norm": 0.2204931080341339, "learning_rate": 6.465038512804555e-07, "loss": 0.4374, "step": 1931 }, { "epoch": 0.95, "grad_norm": 0.20224407315254211, "learning_rate": 6.337921527523382e-07, "loss": 0.4027, "step": 1932 }, { "epoch": 0.95, "grad_norm": 0.2094656080007553, "learning_rate": 6.212058711561164e-07, "loss": 0.4298, "step": 1933 }, { "epoch": 0.95, "grad_norm": 0.21481162309646606, "learning_rate": 6.087450384677506e-07, "loss": 0.4292, "step": 1934 }, { "epoch": 0.95, "grad_norm": 0.1952059119939804, "learning_rate": 5.96409686344529e-07, "loss": 0.3938, "step": 1935 }, { "epoch": 0.95, "grad_norm": 0.21672658622264862, "learning_rate": 5.841998461249165e-07, "loss": 0.4495, "step": 1936 }, { "epoch": 0.95, "grad_norm": 0.20482760667800903, "learning_rate": 5.72115548828539e-07, "loss": 0.4151, "step": 1937 }, { "epoch": 0.95, "grad_norm": 0.20232464373111725, "learning_rate": 5.601568251560552e-07, "loss": 0.3991, "step": 1938 }, { "epoch": 0.95, "grad_norm": 0.20103326439857483, "learning_rate": 5.483237054891177e-07, "loss": 0.4188, "step": 1939 }, { "epoch": 0.95, "grad_norm": 0.2143714278936386, "learning_rate": 5.366162198902514e-07, "loss": 0.4385, "step": 1940 }, { "epoch": 0.95, "grad_norm": 0.21240565180778503, "learning_rate": 5.250343981028305e-07, "loss": 0.3786, "step": 1941 }, { "epoch": 0.96, "grad_norm": 0.2266232669353485, "learning_rate": 5.135782695509461e-07, "loss": 0.4041, "step": 1942 }, { "epoch": 0.96, "grad_norm": 0.21604247391223907, "learning_rate": 5.022478633393946e-07, "loss": 0.3931, "step": 1943 }, { "epoch": 0.96, "grad_norm": 0.1978873312473297, "learning_rate": 4.910432082535387e-07, "loss": 0.4048, "step": 1944 }, { "epoch": 0.96, "grad_norm": 0.20706962049007416, "learning_rate": 4.799643327592751e-07, "loss": 0.4001, "step": 1945 }, { "epoch": 0.96, "grad_norm": 0.24617184698581696, "learning_rate": 4.6901126500295543e-07, "loss": 0.4089, "step": 1946 }, { "epoch": 0.96, "grad_norm": 0.19703038036823273, "learning_rate": 4.5818403281131516e-07, "loss": 0.4137, "step": 1947 }, { "epoch": 0.96, "grad_norm": 0.20901229977607727, "learning_rate": 4.4748266369139536e-07, "loss": 0.4072, "step": 1948 }, { "epoch": 0.96, "grad_norm": 0.1970798671245575, "learning_rate": 4.3690718483047066e-07, "loss": 0.4179, "step": 1949 }, { "epoch": 0.96, "grad_norm": 0.2121417224407196, "learning_rate": 4.264576230959827e-07, "loss": 0.4123, "step": 1950 }, { "epoch": 0.96, "grad_norm": 0.21521428227424622, "learning_rate": 4.1613400503550114e-07, "loss": 0.3872, "step": 1951 }, { "epoch": 0.96, "grad_norm": 0.2255493551492691, "learning_rate": 4.059363568766017e-07, "loss": 0.433, "step": 1952 }, { "epoch": 0.96, "grad_norm": 0.23160548508167267, "learning_rate": 3.958647045268382e-07, "loss": 0.4584, "step": 1953 }, { "epoch": 0.96, "grad_norm": 0.2127949744462967, "learning_rate": 3.859190735736762e-07, "loss": 0.4204, "step": 1954 }, { "epoch": 0.96, "grad_norm": 0.21475207805633545, "learning_rate": 3.7609948928440384e-07, "loss": 0.4446, "step": 1955 }, { "epoch": 0.96, "grad_norm": 0.2191481739282608, "learning_rate": 3.664059766060879e-07, "loss": 0.4232, "step": 1956 }, { "epoch": 0.96, "grad_norm": 0.2431909739971161, "learning_rate": 3.568385601655122e-07, "loss": 0.4344, "step": 1957 }, { "epoch": 0.96, "grad_norm": 0.20131605863571167, "learning_rate": 3.473972642690948e-07, "loss": 0.3974, "step": 1958 }, { "epoch": 0.96, "grad_norm": 0.20530828833580017, "learning_rate": 3.380821129028489e-07, "loss": 0.4002, "step": 1959 }, { "epoch": 0.96, "grad_norm": 0.19925428926944733, "learning_rate": 3.288931297323161e-07, "loss": 0.425, "step": 1960 }, { "epoch": 0.96, "grad_norm": 0.21212442219257355, "learning_rate": 3.1983033810248366e-07, "loss": 0.3851, "step": 1961 }, { "epoch": 0.97, "grad_norm": 0.23202063143253326, "learning_rate": 3.10893761037756e-07, "loss": 0.442, "step": 1962 }, { "epoch": 0.97, "grad_norm": 0.22275333106517792, "learning_rate": 3.020834212418944e-07, "loss": 0.4368, "step": 1963 }, { "epoch": 0.97, "grad_norm": 0.21868956089019775, "learning_rate": 2.93399341097933e-07, "loss": 0.4224, "step": 1964 }, { "epoch": 0.97, "grad_norm": 0.20419920980930328, "learning_rate": 2.848415426681406e-07, "loss": 0.4303, "step": 1965 }, { "epoch": 0.97, "grad_norm": 0.19343627989292145, "learning_rate": 2.7641004769397016e-07, "loss": 0.3734, "step": 1966 }, { "epoch": 0.97, "grad_norm": 0.20459556579589844, "learning_rate": 2.681048775959816e-07, "loss": 0.4067, "step": 1967 }, { "epoch": 0.97, "grad_norm": 0.2031210958957672, "learning_rate": 2.5992605347382457e-07, "loss": 0.4358, "step": 1968 }, { "epoch": 0.97, "grad_norm": 0.2015312910079956, "learning_rate": 2.5187359610612805e-07, "loss": 0.418, "step": 1969 }, { "epoch": 0.97, "grad_norm": 0.20226195454597473, "learning_rate": 2.4394752595051087e-07, "loss": 0.4122, "step": 1970 }, { "epoch": 0.97, "grad_norm": 0.22146867215633392, "learning_rate": 2.3614786314348768e-07, "loss": 0.4111, "step": 1971 }, { "epoch": 0.97, "grad_norm": 0.20249901711940765, "learning_rate": 2.2847462750043015e-07, "loss": 0.3917, "step": 1972 }, { "epoch": 0.97, "grad_norm": 0.21513429284095764, "learning_rate": 2.209278385155167e-07, "loss": 0.4, "step": 1973 }, { "epoch": 0.97, "grad_norm": 0.2143140286207199, "learning_rate": 2.1350751536167724e-07, "loss": 0.4089, "step": 1974 }, { "epoch": 0.97, "grad_norm": 0.22656406462192535, "learning_rate": 2.062136768905598e-07, "loss": 0.4433, "step": 1975 }, { "epoch": 0.97, "grad_norm": 0.19329681992530823, "learning_rate": 1.9904634163247504e-07, "loss": 0.407, "step": 1976 }, { "epoch": 0.97, "grad_norm": 0.19425331056118011, "learning_rate": 1.920055277963295e-07, "loss": 0.4145, "step": 1977 }, { "epoch": 0.97, "grad_norm": 0.2015434354543686, "learning_rate": 1.850912532696092e-07, "loss": 0.4085, "step": 1978 }, { "epoch": 0.97, "grad_norm": 0.23681402206420898, "learning_rate": 1.7830353561832935e-07, "loss": 0.4164, "step": 1979 }, { "epoch": 0.97, "grad_norm": 0.22124652564525604, "learning_rate": 1.7164239208696253e-07, "loss": 0.4015, "step": 1980 }, { "epoch": 0.97, "grad_norm": 0.2552355229854584, "learning_rate": 1.651078395984329e-07, "loss": 0.3997, "step": 1981 }, { "epoch": 0.98, "grad_norm": 0.21645213663578033, "learning_rate": 1.5869989475403302e-07, "loss": 0.4486, "step": 1982 }, { "epoch": 0.98, "grad_norm": 0.22153499722480774, "learning_rate": 1.5241857383343494e-07, "loss": 0.4368, "step": 1983 }, { "epoch": 0.98, "grad_norm": 0.22287209331989288, "learning_rate": 1.4626389279458474e-07, "loss": 0.4144, "step": 1984 }, { "epoch": 0.98, "grad_norm": 0.23654805123806, "learning_rate": 1.4023586727371362e-07, "loss": 0.4129, "step": 1985 }, { "epoch": 0.98, "grad_norm": 0.20596343278884888, "learning_rate": 1.3433451258527684e-07, "loss": 0.4072, "step": 1986 }, { "epoch": 0.98, "grad_norm": 0.2418130338191986, "learning_rate": 1.2855984372191487e-07, "loss": 0.4493, "step": 1987 }, { "epoch": 0.98, "grad_norm": 0.20759262144565582, "learning_rate": 1.229118753544145e-07, "loss": 0.4357, "step": 1988 }, { "epoch": 0.98, "grad_norm": 0.23979327082633972, "learning_rate": 1.1739062183168115e-07, "loss": 0.4082, "step": 1989 }, { "epoch": 0.98, "grad_norm": 0.21632802486419678, "learning_rate": 1.1199609718068882e-07, "loss": 0.4039, "step": 1990 }, { "epoch": 0.98, "grad_norm": 0.19885508716106415, "learning_rate": 1.0672831510645242e-07, "loss": 0.3501, "step": 1991 }, { "epoch": 0.98, "grad_norm": 0.19692523777484894, "learning_rate": 1.0158728899199444e-07, "loss": 0.4083, "step": 1992 }, { "epoch": 0.98, "grad_norm": 0.19487325847148895, "learning_rate": 9.657303189832268e-08, "loss": 0.395, "step": 1993 }, { "epoch": 0.98, "grad_norm": 0.20713594555854797, "learning_rate": 9.168555656434708e-08, "loss": 0.4229, "step": 1994 }, { "epoch": 0.98, "grad_norm": 0.23070469498634338, "learning_rate": 8.692487540691852e-08, "loss": 0.4109, "step": 1995 }, { "epoch": 0.98, "grad_norm": 0.23503658175468445, "learning_rate": 8.229100052074556e-08, "loss": 0.4148, "step": 1996 }, { "epoch": 0.98, "grad_norm": 0.21484753489494324, "learning_rate": 7.778394367838338e-08, "loss": 0.4509, "step": 1997 }, { "epoch": 0.98, "grad_norm": 0.2131231129169464, "learning_rate": 7.340371633019483e-08, "loss": 0.4187, "step": 1998 }, { "epoch": 0.98, "grad_norm": 0.21797029674053192, "learning_rate": 6.915032960433942e-08, "loss": 0.4285, "step": 1999 }, { "epoch": 0.98, "grad_norm": 0.18551306426525116, "learning_rate": 6.502379430671778e-08, "loss": 0.391, "step": 2000 }, { "epoch": 0.98, "grad_norm": 0.19592125713825226, "learning_rate": 6.102412092097165e-08, "loss": 0.3658, "step": 2001 }, { "epoch": 0.98, "grad_norm": 0.19075609743595123, "learning_rate": 5.715131960843945e-08, "loss": 0.3631, "step": 2002 }, { "epoch": 0.99, "grad_norm": 0.2520363926887512, "learning_rate": 5.3405400208134116e-08, "loss": 0.4321, "step": 2003 }, { "epoch": 0.99, "grad_norm": 0.20800505578517914, "learning_rate": 4.978637223672089e-08, "loss": 0.3852, "step": 2004 }, { "epoch": 0.99, "grad_norm": 0.19880563020706177, "learning_rate": 4.6294244888500645e-08, "loss": 0.4004, "step": 2005 }, { "epoch": 0.99, "grad_norm": 0.20241095125675201, "learning_rate": 4.29290270353655e-08, "loss": 0.4232, "step": 2006 }, { "epoch": 0.99, "grad_norm": 0.22910137474536896, "learning_rate": 3.969072722678768e-08, "loss": 0.4132, "step": 2007 }, { "epoch": 0.99, "grad_norm": 0.21551550924777985, "learning_rate": 3.657935368980847e-08, "loss": 0.421, "step": 2008 }, { "epoch": 0.99, "grad_norm": 0.19579172134399414, "learning_rate": 3.359491432900486e-08, "loss": 0.3925, "step": 2009 }, { "epoch": 0.99, "grad_norm": 0.22210298478603363, "learning_rate": 3.07374167264729e-08, "loss": 0.4123, "step": 2010 }, { "epoch": 0.99, "grad_norm": 0.22647666931152344, "learning_rate": 2.8006868141805533e-08, "loss": 0.4303, "step": 2011 }, { "epoch": 0.99, "grad_norm": 0.23893052339553833, "learning_rate": 2.5403275512064783e-08, "loss": 0.453, "step": 2012 }, { "epoch": 0.99, "grad_norm": 0.21037045121192932, "learning_rate": 2.2926645451804006e-08, "loss": 0.4186, "step": 2013 }, { "epoch": 0.99, "grad_norm": 0.1928960680961609, "learning_rate": 2.0576984252995703e-08, "loss": 0.3769, "step": 2014 }, { "epoch": 0.99, "grad_norm": 0.2320747822523117, "learning_rate": 1.835429788505927e-08, "loss": 0.4496, "step": 2015 }, { "epoch": 0.99, "grad_norm": 0.1975114792585373, "learning_rate": 1.6258591994816608e-08, "loss": 0.3534, "step": 2016 }, { "epoch": 0.99, "grad_norm": 0.20322668552398682, "learning_rate": 1.4289871906503216e-08, "loss": 0.3917, "step": 2017 }, { "epoch": 0.99, "grad_norm": 0.17660529911518097, "learning_rate": 1.244814262172933e-08, "loss": 0.3703, "step": 2018 }, { "epoch": 0.99, "grad_norm": 0.22094283998012543, "learning_rate": 1.0733408819491031e-08, "loss": 0.4015, "step": 2019 }, { "epoch": 0.99, "grad_norm": 0.20329605042934418, "learning_rate": 9.145674856136932e-09, "loss": 0.3713, "step": 2020 }, { "epoch": 0.99, "grad_norm": 0.21092285215854645, "learning_rate": 7.684944765379286e-09, "loss": 0.4011, "step": 2021 }, { "epoch": 0.99, "grad_norm": 0.20429110527038574, "learning_rate": 6.35122225826068e-09, "loss": 0.377, "step": 2022 }, { "epoch": 1.0, "grad_norm": 0.2415814995765686, "learning_rate": 5.144510723154028e-09, "loss": 0.4835, "step": 2023 }, { "epoch": 1.0, "grad_norm": 0.2091236412525177, "learning_rate": 4.064813225768127e-09, "loss": 0.3957, "step": 2024 }, { "epoch": 1.0, "grad_norm": 0.21922674775123596, "learning_rate": 3.112132509119903e-09, "loss": 0.4395, "step": 2025 }, { "epoch": 1.0, "grad_norm": 0.19109301269054413, "learning_rate": 2.2864709935288554e-09, "loss": 0.3949, "step": 2026 }, { "epoch": 1.0, "grad_norm": 0.24152390658855438, "learning_rate": 1.587830776628163e-09, "loss": 0.4189, "step": 2027 }, { "epoch": 1.0, "grad_norm": 0.21924638748168945, "learning_rate": 1.0162136333424776e-09, "loss": 0.4215, "step": 2028 }, { "epoch": 1.0, "grad_norm": 0.20084549486637115, "learning_rate": 5.716210158879243e-10, "loss": 0.4175, "step": 2029 }, { "epoch": 1.0, "grad_norm": 0.21729256212711334, "learning_rate": 2.5405405377210145e-10, "loss": 0.422, "step": 2030 }, { "epoch": 1.0, "grad_norm": 0.22051146626472473, "learning_rate": 6.351355378297896e-11, "loss": 0.4497, "step": 2031 }, { "epoch": 1.0, "grad_norm": 0.22746571898460388, "learning_rate": 0.0, "loss": 0.3918, "step": 2032 }, { "epoch": 1.0, "step": 2032, "total_flos": 4.621675955323142e+18, "train_loss": 0.4509266271717905, "train_runtime": 24826.4279, "train_samples_per_second": 10.479, "train_steps_per_second": 0.082 } ], "logging_steps": 1.0, "max_steps": 2032, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 4.621675955323142e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }