diff --git "a/irish_llama31_8b_data_v3/checkpoint-1900/trainer_state.json" "b/irish_llama31_8b_data_v3/checkpoint-1900/trainer_state.json" new file mode 100644--- /dev/null +++ "b/irish_llama31_8b_data_v3/checkpoint-1900/trainer_state.json" @@ -0,0 +1,6690 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9720534629404617, + "eval_steps": 500, + "global_step": 1900, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005116070857581378, + "grad_norm": 5.084794521331787, + "learning_rate": 5.115089514066497e-07, + "loss": 2.9408, + "step": 1 + }, + { + "epoch": 0.0010232141715162755, + "grad_norm": 5.157843112945557, + "learning_rate": 1.0230179028132994e-06, + "loss": 3.0401, + "step": 2 + }, + { + "epoch": 0.002046428343032551, + "grad_norm": 5.386075973510742, + "learning_rate": 2.0460358056265987e-06, + "loss": 3.071, + "step": 4 + }, + { + "epoch": 0.0030696425145488263, + "grad_norm": 5.142333984375, + "learning_rate": 3.069053708439898e-06, + "loss": 3.1039, + "step": 6 + }, + { + "epoch": 0.004092856686065102, + "grad_norm": 3.101259231567383, + "learning_rate": 4.092071611253197e-06, + "loss": 2.8181, + "step": 8 + }, + { + "epoch": 0.005116070857581378, + "grad_norm": 2.207404375076294, + "learning_rate": 5.1150895140664966e-06, + "loss": 2.5627, + "step": 10 + }, + { + "epoch": 0.006139285029097653, + "grad_norm": 1.8845449686050415, + "learning_rate": 6.138107416879796e-06, + "loss": 2.6209, + "step": 12 + }, + { + "epoch": 0.007162499200613928, + "grad_norm": 2.1659703254699707, + "learning_rate": 7.161125319693095e-06, + "loss": 2.5467, + "step": 14 + }, + { + "epoch": 0.008185713372130204, + "grad_norm": 1.6377224922180176, + "learning_rate": 8.184143222506395e-06, + "loss": 2.5057, + "step": 16 + }, + { + "epoch": 0.009208927543646479, + "grad_norm": 1.1535893678665161, + "learning_rate": 9.207161125319694e-06, + "loss": 2.538, + "step": 18 + }, + { + "epoch": 0.010232141715162756, + "grad_norm": 1.0513185262680054, + "learning_rate": 1.0230179028132993e-05, + "loss": 2.5247, + "step": 20 + }, + { + "epoch": 0.01125535588667903, + "grad_norm": 0.8281764388084412, + "learning_rate": 1.1253196930946292e-05, + "loss": 2.4595, + "step": 22 + }, + { + "epoch": 0.012278570058195305, + "grad_norm": 0.5129208564758301, + "learning_rate": 1.2276214833759591e-05, + "loss": 2.3742, + "step": 24 + }, + { + "epoch": 0.013301784229711582, + "grad_norm": 0.46613597869873047, + "learning_rate": 1.3299232736572892e-05, + "loss": 2.4564, + "step": 26 + }, + { + "epoch": 0.014324998401227857, + "grad_norm": 0.354717493057251, + "learning_rate": 1.432225063938619e-05, + "loss": 2.3467, + "step": 28 + }, + { + "epoch": 0.015348212572744133, + "grad_norm": 0.3325178623199463, + "learning_rate": 1.534526854219949e-05, + "loss": 2.3978, + "step": 30 + }, + { + "epoch": 0.016371426744260408, + "grad_norm": 0.32920145988464355, + "learning_rate": 1.636828644501279e-05, + "loss": 2.2522, + "step": 32 + }, + { + "epoch": 0.017394640915776683, + "grad_norm": 0.25466033816337585, + "learning_rate": 1.739130434782609e-05, + "loss": 2.243, + "step": 34 + }, + { + "epoch": 0.018417855087292958, + "grad_norm": 0.35631808638572693, + "learning_rate": 1.8414322250639388e-05, + "loss": 2.2527, + "step": 36 + }, + { + "epoch": 0.019441069258809233, + "grad_norm": 0.23582319915294647, + "learning_rate": 1.9437340153452684e-05, + "loss": 2.1452, + "step": 38 + }, + { + "epoch": 0.02046428343032551, + "grad_norm": 0.2491885870695114, + "learning_rate": 2.0460358056265986e-05, + "loss": 2.1778, + "step": 40 + }, + { + "epoch": 0.021487497601841786, + "grad_norm": 0.2993784546852112, + "learning_rate": 2.1483375959079285e-05, + "loss": 2.1006, + "step": 42 + }, + { + "epoch": 0.02251071177335806, + "grad_norm": 0.21940283477306366, + "learning_rate": 2.2506393861892585e-05, + "loss": 2.1752, + "step": 44 + }, + { + "epoch": 0.023533925944874336, + "grad_norm": 0.15252649784088135, + "learning_rate": 2.3529411764705884e-05, + "loss": 2.1295, + "step": 46 + }, + { + "epoch": 0.02455714011639061, + "grad_norm": 0.19182737171649933, + "learning_rate": 2.4552429667519183e-05, + "loss": 2.1181, + "step": 48 + }, + { + "epoch": 0.02558035428790689, + "grad_norm": 0.19416701793670654, + "learning_rate": 2.5575447570332482e-05, + "loss": 2.0953, + "step": 50 + }, + { + "epoch": 0.026603568459423164, + "grad_norm": 0.12562625110149384, + "learning_rate": 2.6598465473145784e-05, + "loss": 2.0856, + "step": 52 + }, + { + "epoch": 0.02762678263093944, + "grad_norm": 0.13417182862758636, + "learning_rate": 2.7621483375959077e-05, + "loss": 2.0948, + "step": 54 + }, + { + "epoch": 0.028649996802455713, + "grad_norm": 0.10808593034744263, + "learning_rate": 2.864450127877238e-05, + "loss": 2.0541, + "step": 56 + }, + { + "epoch": 0.02967321097397199, + "grad_norm": 0.14162665605545044, + "learning_rate": 2.966751918158568e-05, + "loss": 2.0756, + "step": 58 + }, + { + "epoch": 0.030696425145488267, + "grad_norm": 0.10216689854860306, + "learning_rate": 3.069053708439898e-05, + "loss": 2.0502, + "step": 60 + }, + { + "epoch": 0.03171963931700454, + "grad_norm": 0.0772320106625557, + "learning_rate": 3.171355498721228e-05, + "loss": 2.0598, + "step": 62 + }, + { + "epoch": 0.032742853488520816, + "grad_norm": 0.07200902700424194, + "learning_rate": 3.273657289002558e-05, + "loss": 2.0416, + "step": 64 + }, + { + "epoch": 0.03376606766003709, + "grad_norm": 0.07764917612075806, + "learning_rate": 3.375959079283887e-05, + "loss": 2.04, + "step": 66 + }, + { + "epoch": 0.034789281831553366, + "grad_norm": 0.07703404128551483, + "learning_rate": 3.478260869565218e-05, + "loss": 2.0426, + "step": 68 + }, + { + "epoch": 0.03581249600306964, + "grad_norm": 0.05096273496747017, + "learning_rate": 3.580562659846548e-05, + "loss": 2.0264, + "step": 70 + }, + { + "epoch": 0.036835710174585916, + "grad_norm": 0.07172555476427078, + "learning_rate": 3.6828644501278776e-05, + "loss": 1.9799, + "step": 72 + }, + { + "epoch": 0.03785892434610219, + "grad_norm": 0.05563480406999588, + "learning_rate": 3.7851662404092075e-05, + "loss": 1.9922, + "step": 74 + }, + { + "epoch": 0.038882138517618466, + "grad_norm": 0.04726962745189667, + "learning_rate": 3.887468030690537e-05, + "loss": 1.9826, + "step": 76 + }, + { + "epoch": 0.03990535268913475, + "grad_norm": 0.040130794048309326, + "learning_rate": 3.989769820971867e-05, + "loss": 1.9693, + "step": 78 + }, + { + "epoch": 0.04092856686065102, + "grad_norm": 0.051317401230335236, + "learning_rate": 4.092071611253197e-05, + "loss": 1.9454, + "step": 80 + }, + { + "epoch": 0.0419517810321673, + "grad_norm": 0.03843973949551582, + "learning_rate": 4.194373401534527e-05, + "loss": 1.9535, + "step": 82 + }, + { + "epoch": 0.04297499520368357, + "grad_norm": 0.04338320344686508, + "learning_rate": 4.296675191815857e-05, + "loss": 1.9017, + "step": 84 + }, + { + "epoch": 0.04399820937519985, + "grad_norm": 0.0422111339867115, + "learning_rate": 4.398976982097187e-05, + "loss": 1.9806, + "step": 86 + }, + { + "epoch": 0.04502142354671612, + "grad_norm": 0.043594423681497574, + "learning_rate": 4.501278772378517e-05, + "loss": 1.9809, + "step": 88 + }, + { + "epoch": 0.0460446377182324, + "grad_norm": 0.050932493060827255, + "learning_rate": 4.603580562659847e-05, + "loss": 2.002, + "step": 90 + }, + { + "epoch": 0.04706785188974867, + "grad_norm": 0.039923008531332016, + "learning_rate": 4.705882352941177e-05, + "loss": 1.9898, + "step": 92 + }, + { + "epoch": 0.048091066061264946, + "grad_norm": 0.04199720919132233, + "learning_rate": 4.8081841432225067e-05, + "loss": 1.9375, + "step": 94 + }, + { + "epoch": 0.04911428023278122, + "grad_norm": 0.03885011374950409, + "learning_rate": 4.9104859335038366e-05, + "loss": 1.9594, + "step": 96 + }, + { + "epoch": 0.0501374944042975, + "grad_norm": 0.04459952563047409, + "learning_rate": 5.0127877237851665e-05, + "loss": 1.9327, + "step": 98 + }, + { + "epoch": 0.05116070857581378, + "grad_norm": 0.04154925048351288, + "learning_rate": 5.1150895140664964e-05, + "loss": 1.9385, + "step": 100 + }, + { + "epoch": 0.05218392274733005, + "grad_norm": 0.04149138927459717, + "learning_rate": 5.217391304347826e-05, + "loss": 1.9251, + "step": 102 + }, + { + "epoch": 0.05320713691884633, + "grad_norm": 0.05338102579116821, + "learning_rate": 5.319693094629157e-05, + "loss": 1.9211, + "step": 104 + }, + { + "epoch": 0.0542303510903626, + "grad_norm": 0.04964439943432808, + "learning_rate": 5.421994884910486e-05, + "loss": 1.8863, + "step": 106 + }, + { + "epoch": 0.05525356526187888, + "grad_norm": 0.040731314569711685, + "learning_rate": 5.5242966751918154e-05, + "loss": 1.9002, + "step": 108 + }, + { + "epoch": 0.05627677943339515, + "grad_norm": 0.05813027173280716, + "learning_rate": 5.626598465473146e-05, + "loss": 1.8944, + "step": 110 + }, + { + "epoch": 0.05729999360491143, + "grad_norm": 0.04966093972325325, + "learning_rate": 5.728900255754476e-05, + "loss": 1.898, + "step": 112 + }, + { + "epoch": 0.0583232077764277, + "grad_norm": 0.050573479384183884, + "learning_rate": 5.8312020460358065e-05, + "loss": 1.8778, + "step": 114 + }, + { + "epoch": 0.05934642194794398, + "grad_norm": 0.05025520175695419, + "learning_rate": 5.933503836317136e-05, + "loss": 1.9044, + "step": 116 + }, + { + "epoch": 0.06036963611946025, + "grad_norm": 0.05153055489063263, + "learning_rate": 6.035805626598465e-05, + "loss": 1.9045, + "step": 118 + }, + { + "epoch": 0.06139285029097653, + "grad_norm": 0.051311247050762177, + "learning_rate": 6.138107416879796e-05, + "loss": 1.9077, + "step": 120 + }, + { + "epoch": 0.06241606446249281, + "grad_norm": 0.05084897577762604, + "learning_rate": 6.240409207161125e-05, + "loss": 1.8538, + "step": 122 + }, + { + "epoch": 0.06343927863400908, + "grad_norm": 0.05961287021636963, + "learning_rate": 6.342710997442456e-05, + "loss": 1.8792, + "step": 124 + }, + { + "epoch": 0.06446249280552535, + "grad_norm": 0.05775010585784912, + "learning_rate": 6.445012787723786e-05, + "loss": 1.8587, + "step": 126 + }, + { + "epoch": 0.06548570697704163, + "grad_norm": 0.09344275295734406, + "learning_rate": 6.547314578005116e-05, + "loss": 1.8454, + "step": 128 + }, + { + "epoch": 0.0665089211485579, + "grad_norm": 0.0748172476887703, + "learning_rate": 6.649616368286446e-05, + "loss": 1.8998, + "step": 130 + }, + { + "epoch": 0.06753213532007418, + "grad_norm": 0.07188538461923599, + "learning_rate": 6.751918158567774e-05, + "loss": 1.8219, + "step": 132 + }, + { + "epoch": 0.06855534949159046, + "grad_norm": 0.05799673870205879, + "learning_rate": 6.854219948849106e-05, + "loss": 1.8549, + "step": 134 + }, + { + "epoch": 0.06957856366310673, + "grad_norm": 0.07886774092912674, + "learning_rate": 6.956521739130436e-05, + "loss": 1.8885, + "step": 136 + }, + { + "epoch": 0.07060177783462301, + "grad_norm": 0.0599171444773674, + "learning_rate": 7.058823529411765e-05, + "loss": 1.829, + "step": 138 + }, + { + "epoch": 0.07162499200613928, + "grad_norm": 0.07810111343860626, + "learning_rate": 7.161125319693095e-05, + "loss": 1.8878, + "step": 140 + }, + { + "epoch": 0.07264820617765556, + "grad_norm": 0.062123704701662064, + "learning_rate": 7.263427109974424e-05, + "loss": 1.8633, + "step": 142 + }, + { + "epoch": 0.07367142034917183, + "grad_norm": 0.08402098715305328, + "learning_rate": 7.365728900255755e-05, + "loss": 1.8377, + "step": 144 + }, + { + "epoch": 0.07469463452068811, + "grad_norm": 0.06189502775669098, + "learning_rate": 7.468030690537085e-05, + "loss": 1.8683, + "step": 146 + }, + { + "epoch": 0.07571784869220438, + "grad_norm": 0.07368986308574677, + "learning_rate": 7.570332480818415e-05, + "loss": 1.8636, + "step": 148 + }, + { + "epoch": 0.07674106286372066, + "grad_norm": 0.06430894136428833, + "learning_rate": 7.672634271099745e-05, + "loss": 1.8341, + "step": 150 + }, + { + "epoch": 0.07776427703523693, + "grad_norm": 0.05924483761191368, + "learning_rate": 7.774936061381073e-05, + "loss": 1.9151, + "step": 152 + }, + { + "epoch": 0.07878749120675321, + "grad_norm": 0.06166929751634598, + "learning_rate": 7.877237851662405e-05, + "loss": 1.8306, + "step": 154 + }, + { + "epoch": 0.0798107053782695, + "grad_norm": 0.07514499127864838, + "learning_rate": 7.979539641943735e-05, + "loss": 1.8572, + "step": 156 + }, + { + "epoch": 0.08083391954978576, + "grad_norm": 0.06925056874752045, + "learning_rate": 8.081841432225065e-05, + "loss": 1.8449, + "step": 158 + }, + { + "epoch": 0.08185713372130204, + "grad_norm": 0.08889607340097427, + "learning_rate": 8.184143222506395e-05, + "loss": 1.8217, + "step": 160 + }, + { + "epoch": 0.08288034789281831, + "grad_norm": 0.11205849796533585, + "learning_rate": 8.286445012787724e-05, + "loss": 1.7859, + "step": 162 + }, + { + "epoch": 0.0839035620643346, + "grad_norm": 0.13293609023094177, + "learning_rate": 8.388746803069054e-05, + "loss": 1.8245, + "step": 164 + }, + { + "epoch": 0.08492677623585086, + "grad_norm": 0.14082959294319153, + "learning_rate": 8.491048593350384e-05, + "loss": 1.8077, + "step": 166 + }, + { + "epoch": 0.08594999040736714, + "grad_norm": 0.0726478174328804, + "learning_rate": 8.593350383631714e-05, + "loss": 1.8081, + "step": 168 + }, + { + "epoch": 0.08697320457888341, + "grad_norm": 0.21175715327262878, + "learning_rate": 8.695652173913044e-05, + "loss": 1.8289, + "step": 170 + }, + { + "epoch": 0.0879964187503997, + "grad_norm": 0.19227363169193268, + "learning_rate": 8.797953964194374e-05, + "loss": 1.8092, + "step": 172 + }, + { + "epoch": 0.08901963292191598, + "grad_norm": 0.13788004219532013, + "learning_rate": 8.900255754475704e-05, + "loss": 1.7986, + "step": 174 + }, + { + "epoch": 0.09004284709343224, + "grad_norm": 0.09351494908332825, + "learning_rate": 9.002557544757034e-05, + "loss": 1.8077, + "step": 176 + }, + { + "epoch": 0.09106606126494853, + "grad_norm": 0.09681002050638199, + "learning_rate": 9.104859335038364e-05, + "loss": 1.794, + "step": 178 + }, + { + "epoch": 0.0920892754364648, + "grad_norm": 0.061654381453990936, + "learning_rate": 9.207161125319694e-05, + "loss": 1.7935, + "step": 180 + }, + { + "epoch": 0.09311248960798107, + "grad_norm": 0.06282493472099304, + "learning_rate": 9.309462915601024e-05, + "loss": 1.7758, + "step": 182 + }, + { + "epoch": 0.09413570377949734, + "grad_norm": 0.08118202537298203, + "learning_rate": 9.411764705882353e-05, + "loss": 1.8209, + "step": 184 + }, + { + "epoch": 0.09515891795101362, + "grad_norm": 0.0755864828824997, + "learning_rate": 9.514066496163683e-05, + "loss": 1.7672, + "step": 186 + }, + { + "epoch": 0.09618213212252989, + "grad_norm": 0.07810387760400772, + "learning_rate": 9.616368286445013e-05, + "loss": 1.7655, + "step": 188 + }, + { + "epoch": 0.09720534629404617, + "grad_norm": 0.08016899228096008, + "learning_rate": 9.718670076726343e-05, + "loss": 1.7818, + "step": 190 + }, + { + "epoch": 0.09822856046556244, + "grad_norm": 0.07527964562177658, + "learning_rate": 9.820971867007673e-05, + "loss": 1.7386, + "step": 192 + }, + { + "epoch": 0.09925177463707872, + "grad_norm": 0.08135760575532913, + "learning_rate": 9.923273657289003e-05, + "loss": 1.7678, + "step": 194 + }, + { + "epoch": 0.100274988808595, + "grad_norm": 0.06465744972229004, + "learning_rate": 0.00010025575447570333, + "loss": 1.8469, + "step": 196 + }, + { + "epoch": 0.10129820298011127, + "grad_norm": 0.0678311362862587, + "learning_rate": 0.00010127877237851664, + "loss": 1.7856, + "step": 198 + }, + { + "epoch": 0.10232141715162756, + "grad_norm": 0.06425610929727554, + "learning_rate": 0.00010230179028132993, + "loss": 1.7542, + "step": 200 + }, + { + "epoch": 0.10334463132314382, + "grad_norm": 0.06820003688335419, + "learning_rate": 0.00010332480818414323, + "loss": 1.783, + "step": 202 + }, + { + "epoch": 0.1043678454946601, + "grad_norm": 0.0690922886133194, + "learning_rate": 0.00010434782608695653, + "loss": 1.7612, + "step": 204 + }, + { + "epoch": 0.10539105966617637, + "grad_norm": 0.06488107144832611, + "learning_rate": 0.00010537084398976983, + "loss": 1.7648, + "step": 206 + }, + { + "epoch": 0.10641427383769266, + "grad_norm": 0.08278009295463562, + "learning_rate": 0.00010639386189258314, + "loss": 1.7661, + "step": 208 + }, + { + "epoch": 0.10743748800920892, + "grad_norm": 0.08722035586833954, + "learning_rate": 0.00010741687979539642, + "loss": 1.7578, + "step": 210 + }, + { + "epoch": 0.1084607021807252, + "grad_norm": 0.0737011507153511, + "learning_rate": 0.00010843989769820972, + "loss": 1.7381, + "step": 212 + }, + { + "epoch": 0.10948391635224147, + "grad_norm": 0.08060843497514725, + "learning_rate": 0.00010946291560102302, + "loss": 1.7967, + "step": 214 + }, + { + "epoch": 0.11050713052375775, + "grad_norm": 0.10279374569654465, + "learning_rate": 0.00011048593350383631, + "loss": 1.7703, + "step": 216 + }, + { + "epoch": 0.11153034469527404, + "grad_norm": 0.0777791365981102, + "learning_rate": 0.00011150895140664963, + "loss": 1.8015, + "step": 218 + }, + { + "epoch": 0.1125535588667903, + "grad_norm": 0.06883997470140457, + "learning_rate": 0.00011253196930946292, + "loss": 1.7731, + "step": 220 + }, + { + "epoch": 0.11357677303830659, + "grad_norm": 0.06231442466378212, + "learning_rate": 0.00011355498721227622, + "loss": 1.8063, + "step": 222 + }, + { + "epoch": 0.11459998720982285, + "grad_norm": 0.06607846170663834, + "learning_rate": 0.00011457800511508952, + "loss": 1.7616, + "step": 224 + }, + { + "epoch": 0.11562320138133914, + "grad_norm": 0.05903138220310211, + "learning_rate": 0.0001156010230179028, + "loss": 1.7993, + "step": 226 + }, + { + "epoch": 0.1166464155528554, + "grad_norm": 0.07282232493162155, + "learning_rate": 0.00011662404092071613, + "loss": 1.7374, + "step": 228 + }, + { + "epoch": 0.11766962972437169, + "grad_norm": 0.06793032586574554, + "learning_rate": 0.00011764705882352942, + "loss": 1.7852, + "step": 230 + }, + { + "epoch": 0.11869284389588795, + "grad_norm": 0.06404048949480057, + "learning_rate": 0.00011867007672634271, + "loss": 1.775, + "step": 232 + }, + { + "epoch": 0.11971605806740424, + "grad_norm": 0.08423135429620743, + "learning_rate": 0.00011969309462915601, + "loss": 1.779, + "step": 234 + }, + { + "epoch": 0.1207392722389205, + "grad_norm": 0.0814799889922142, + "learning_rate": 0.0001207161125319693, + "loss": 1.7082, + "step": 236 + }, + { + "epoch": 0.12176248641043678, + "grad_norm": 0.08876215666532516, + "learning_rate": 0.00012173913043478263, + "loss": 1.7767, + "step": 238 + }, + { + "epoch": 0.12278570058195307, + "grad_norm": 0.07051345705986023, + "learning_rate": 0.00012276214833759592, + "loss": 1.7181, + "step": 240 + }, + { + "epoch": 0.12380891475346933, + "grad_norm": 0.07023751735687256, + "learning_rate": 0.00012378516624040922, + "loss": 1.7308, + "step": 242 + }, + { + "epoch": 0.12483212892498562, + "grad_norm": 0.0754849910736084, + "learning_rate": 0.0001248081841432225, + "loss": 1.7782, + "step": 244 + }, + { + "epoch": 0.1258553430965019, + "grad_norm": 0.07223635166883469, + "learning_rate": 0.0001258312020460358, + "loss": 1.718, + "step": 246 + }, + { + "epoch": 0.12687855726801817, + "grad_norm": 0.07007969915866852, + "learning_rate": 0.00012685421994884912, + "loss": 1.7686, + "step": 248 + }, + { + "epoch": 0.12790177143953443, + "grad_norm": 0.06361662596464157, + "learning_rate": 0.00012787723785166242, + "loss": 1.7217, + "step": 250 + }, + { + "epoch": 0.1289249856110507, + "grad_norm": 0.08723774552345276, + "learning_rate": 0.00012890025575447572, + "loss": 1.7369, + "step": 252 + }, + { + "epoch": 0.129948199782567, + "grad_norm": 0.06651702523231506, + "learning_rate": 0.000129923273657289, + "loss": 1.7163, + "step": 254 + }, + { + "epoch": 0.13097141395408327, + "grad_norm": 0.07153377681970596, + "learning_rate": 0.00013094629156010232, + "loss": 1.7168, + "step": 256 + }, + { + "epoch": 0.13199462812559953, + "grad_norm": 0.09451760351657867, + "learning_rate": 0.00013196930946291562, + "loss": 1.7182, + "step": 258 + }, + { + "epoch": 0.1330178422971158, + "grad_norm": 0.08822207897901535, + "learning_rate": 0.00013299232736572892, + "loss": 1.7483, + "step": 260 + }, + { + "epoch": 0.1340410564686321, + "grad_norm": 0.11073771119117737, + "learning_rate": 0.00013401534526854221, + "loss": 1.7087, + "step": 262 + }, + { + "epoch": 0.13506427064014837, + "grad_norm": 0.07717689871788025, + "learning_rate": 0.0001350383631713555, + "loss": 1.6943, + "step": 264 + }, + { + "epoch": 0.13608748481166463, + "grad_norm": 0.09418254345655441, + "learning_rate": 0.0001360613810741688, + "loss": 1.7084, + "step": 266 + }, + { + "epoch": 0.13711069898318093, + "grad_norm": 0.0922132208943367, + "learning_rate": 0.0001370843989769821, + "loss": 1.7526, + "step": 268 + }, + { + "epoch": 0.1381339131546972, + "grad_norm": 0.08973314613103867, + "learning_rate": 0.0001381074168797954, + "loss": 1.7049, + "step": 270 + }, + { + "epoch": 0.13915712732621346, + "grad_norm": 0.0772908478975296, + "learning_rate": 0.0001391304347826087, + "loss": 1.7444, + "step": 272 + }, + { + "epoch": 0.14018034149772973, + "grad_norm": 0.07179255038499832, + "learning_rate": 0.00014015345268542198, + "loss": 1.7309, + "step": 274 + }, + { + "epoch": 0.14120355566924603, + "grad_norm": 0.10786614567041397, + "learning_rate": 0.0001411764705882353, + "loss": 1.7413, + "step": 276 + }, + { + "epoch": 0.1422267698407623, + "grad_norm": 0.0815059244632721, + "learning_rate": 0.0001421994884910486, + "loss": 1.6895, + "step": 278 + }, + { + "epoch": 0.14324998401227856, + "grad_norm": 0.12658405303955078, + "learning_rate": 0.0001432225063938619, + "loss": 1.7013, + "step": 280 + }, + { + "epoch": 0.14427319818379483, + "grad_norm": 0.0807737335562706, + "learning_rate": 0.0001442455242966752, + "loss": 1.7378, + "step": 282 + }, + { + "epoch": 0.14529641235531113, + "grad_norm": 0.09726593643426895, + "learning_rate": 0.00014526854219948848, + "loss": 1.7143, + "step": 284 + }, + { + "epoch": 0.1463196265268274, + "grad_norm": 0.08326689153909683, + "learning_rate": 0.0001462915601023018, + "loss": 1.7395, + "step": 286 + }, + { + "epoch": 0.14734284069834366, + "grad_norm": 0.08783421665430069, + "learning_rate": 0.0001473145780051151, + "loss": 1.7466, + "step": 288 + }, + { + "epoch": 0.14836605486985996, + "grad_norm": 0.0639604702591896, + "learning_rate": 0.0001483375959079284, + "loss": 1.7019, + "step": 290 + }, + { + "epoch": 0.14938926904137623, + "grad_norm": 0.08028368651866913, + "learning_rate": 0.0001493606138107417, + "loss": 1.7134, + "step": 292 + }, + { + "epoch": 0.1504124832128925, + "grad_norm": 0.0739947184920311, + "learning_rate": 0.00015038363171355497, + "loss": 1.702, + "step": 294 + }, + { + "epoch": 0.15143569738440876, + "grad_norm": 0.07335802167654037, + "learning_rate": 0.0001514066496163683, + "loss": 1.7321, + "step": 296 + }, + { + "epoch": 0.15245891155592506, + "grad_norm": 0.07030144333839417, + "learning_rate": 0.0001524296675191816, + "loss": 1.6654, + "step": 298 + }, + { + "epoch": 0.15348212572744133, + "grad_norm": 0.07079968601465225, + "learning_rate": 0.0001534526854219949, + "loss": 1.7129, + "step": 300 + }, + { + "epoch": 0.1545053398989576, + "grad_norm": 0.06605160236358643, + "learning_rate": 0.0001544757033248082, + "loss": 1.713, + "step": 302 + }, + { + "epoch": 0.15552855407047386, + "grad_norm": 0.08417898416519165, + "learning_rate": 0.00015549872122762147, + "loss": 1.7063, + "step": 304 + }, + { + "epoch": 0.15655176824199016, + "grad_norm": 0.07255028933286667, + "learning_rate": 0.0001565217391304348, + "loss": 1.742, + "step": 306 + }, + { + "epoch": 0.15757498241350643, + "grad_norm": 0.06561743468046188, + "learning_rate": 0.0001575447570332481, + "loss": 1.6912, + "step": 308 + }, + { + "epoch": 0.1585981965850227, + "grad_norm": 0.07030262053012848, + "learning_rate": 0.0001585677749360614, + "loss": 1.7434, + "step": 310 + }, + { + "epoch": 0.159621410756539, + "grad_norm": 0.076111800968647, + "learning_rate": 0.0001595907928388747, + "loss": 1.6783, + "step": 312 + }, + { + "epoch": 0.16064462492805526, + "grad_norm": 0.06267083436250687, + "learning_rate": 0.000160613810741688, + "loss": 1.7193, + "step": 314 + }, + { + "epoch": 0.16166783909957153, + "grad_norm": 0.07638990879058838, + "learning_rate": 0.0001616368286445013, + "loss": 1.7395, + "step": 316 + }, + { + "epoch": 0.1626910532710878, + "grad_norm": 0.07447683811187744, + "learning_rate": 0.0001626598465473146, + "loss": 1.6574, + "step": 318 + }, + { + "epoch": 0.1637142674426041, + "grad_norm": 0.07413692772388458, + "learning_rate": 0.0001636828644501279, + "loss": 1.6868, + "step": 320 + }, + { + "epoch": 0.16473748161412036, + "grad_norm": 0.07566969096660614, + "learning_rate": 0.0001647058823529412, + "loss": 1.779, + "step": 322 + }, + { + "epoch": 0.16576069578563662, + "grad_norm": 0.09093326330184937, + "learning_rate": 0.0001657289002557545, + "loss": 1.6807, + "step": 324 + }, + { + "epoch": 0.16678390995715292, + "grad_norm": 0.0930614024400711, + "learning_rate": 0.0001667519181585678, + "loss": 1.7067, + "step": 326 + }, + { + "epoch": 0.1678071241286692, + "grad_norm": 0.06676892936229706, + "learning_rate": 0.0001677749360613811, + "loss": 1.6609, + "step": 328 + }, + { + "epoch": 0.16883033830018546, + "grad_norm": 0.08882534503936768, + "learning_rate": 0.00016879795396419439, + "loss": 1.6796, + "step": 330 + }, + { + "epoch": 0.16985355247170172, + "grad_norm": 0.07226958125829697, + "learning_rate": 0.00016982097186700768, + "loss": 1.7163, + "step": 332 + }, + { + "epoch": 0.17087676664321802, + "grad_norm": 0.07271122932434082, + "learning_rate": 0.00017084398976982098, + "loss": 1.7585, + "step": 334 + }, + { + "epoch": 0.1718999808147343, + "grad_norm": 0.08161617070436478, + "learning_rate": 0.00017186700767263428, + "loss": 1.6299, + "step": 336 + }, + { + "epoch": 0.17292319498625056, + "grad_norm": 0.08419859409332275, + "learning_rate": 0.00017289002557544758, + "loss": 1.6848, + "step": 338 + }, + { + "epoch": 0.17394640915776682, + "grad_norm": 0.08996909856796265, + "learning_rate": 0.00017391304347826088, + "loss": 1.6582, + "step": 340 + }, + { + "epoch": 0.17496962332928312, + "grad_norm": 0.09278981387615204, + "learning_rate": 0.00017493606138107418, + "loss": 1.7044, + "step": 342 + }, + { + "epoch": 0.1759928375007994, + "grad_norm": 0.08387704193592072, + "learning_rate": 0.00017595907928388748, + "loss": 1.6503, + "step": 344 + }, + { + "epoch": 0.17701605167231566, + "grad_norm": 0.07442387193441391, + "learning_rate": 0.00017698209718670078, + "loss": 1.7058, + "step": 346 + }, + { + "epoch": 0.17803926584383195, + "grad_norm": 0.06898263841867447, + "learning_rate": 0.00017800511508951408, + "loss": 1.6708, + "step": 348 + }, + { + "epoch": 0.17906248001534822, + "grad_norm": 0.07982076704502106, + "learning_rate": 0.00017902813299232738, + "loss": 1.6807, + "step": 350 + }, + { + "epoch": 0.1800856941868645, + "grad_norm": 0.07170634716749191, + "learning_rate": 0.00018005115089514068, + "loss": 1.6753, + "step": 352 + }, + { + "epoch": 0.18110890835838075, + "grad_norm": 0.07484789937734604, + "learning_rate": 0.00018107416879795398, + "loss": 1.6883, + "step": 354 + }, + { + "epoch": 0.18213212252989705, + "grad_norm": 0.08390472084283829, + "learning_rate": 0.00018209718670076727, + "loss": 1.6783, + "step": 356 + }, + { + "epoch": 0.18315533670141332, + "grad_norm": 0.0833701565861702, + "learning_rate": 0.00018312020460358057, + "loss": 1.6804, + "step": 358 + }, + { + "epoch": 0.1841785508729296, + "grad_norm": 0.07489979267120361, + "learning_rate": 0.00018414322250639387, + "loss": 1.6179, + "step": 360 + }, + { + "epoch": 0.18520176504444585, + "grad_norm": 0.14307746291160583, + "learning_rate": 0.00018516624040920717, + "loss": 1.6396, + "step": 362 + }, + { + "epoch": 0.18622497921596215, + "grad_norm": 0.13637496531009674, + "learning_rate": 0.00018618925831202047, + "loss": 1.6425, + "step": 364 + }, + { + "epoch": 0.18724819338747842, + "grad_norm": 0.13586537539958954, + "learning_rate": 0.00018721227621483377, + "loss": 1.6915, + "step": 366 + }, + { + "epoch": 0.18827140755899469, + "grad_norm": 0.07892754673957825, + "learning_rate": 0.00018823529411764707, + "loss": 1.6628, + "step": 368 + }, + { + "epoch": 0.18929462173051098, + "grad_norm": 0.20291955769062042, + "learning_rate": 0.00018925831202046037, + "loss": 1.6572, + "step": 370 + }, + { + "epoch": 0.19031783590202725, + "grad_norm": 0.3548440933227539, + "learning_rate": 0.00019028132992327367, + "loss": 1.6963, + "step": 372 + }, + { + "epoch": 0.19134105007354352, + "grad_norm": 0.19051846861839294, + "learning_rate": 0.00019130434782608697, + "loss": 1.6853, + "step": 374 + }, + { + "epoch": 0.19236426424505979, + "grad_norm": 0.3201465308666229, + "learning_rate": 0.00019232736572890027, + "loss": 1.6549, + "step": 376 + }, + { + "epoch": 0.19338747841657608, + "grad_norm": 0.1700785905122757, + "learning_rate": 0.00019335038363171357, + "loss": 1.658, + "step": 378 + }, + { + "epoch": 0.19441069258809235, + "grad_norm": 0.1742287576198578, + "learning_rate": 0.00019437340153452686, + "loss": 1.6644, + "step": 380 + }, + { + "epoch": 0.19543390675960862, + "grad_norm": 0.0945478230714798, + "learning_rate": 0.00019539641943734016, + "loss": 1.65, + "step": 382 + }, + { + "epoch": 0.19645712093112488, + "grad_norm": 0.06995284557342529, + "learning_rate": 0.00019641943734015346, + "loss": 1.6608, + "step": 384 + }, + { + "epoch": 0.19748033510264118, + "grad_norm": 0.07590003311634064, + "learning_rate": 0.00019744245524296676, + "loss": 1.6367, + "step": 386 + }, + { + "epoch": 0.19850354927415745, + "grad_norm": 0.09830451011657715, + "learning_rate": 0.00019846547314578006, + "loss": 1.6638, + "step": 388 + }, + { + "epoch": 0.19952676344567372, + "grad_norm": 0.10720949620008469, + "learning_rate": 0.00019948849104859336, + "loss": 1.6571, + "step": 390 + }, + { + "epoch": 0.20054997761719, + "grad_norm": 0.06915664672851562, + "learning_rate": 0.0001999999910488914, + "loss": 1.669, + "step": 392 + }, + { + "epoch": 0.20157319178870628, + "grad_norm": 0.04960264638066292, + "learning_rate": 0.00019999991944003202, + "loss": 1.6529, + "step": 394 + }, + { + "epoch": 0.20259640596022255, + "grad_norm": 0.05139967054128647, + "learning_rate": 0.00019999977622236462, + "loss": 1.6053, + "step": 396 + }, + { + "epoch": 0.20361962013173882, + "grad_norm": 0.05288904160261154, + "learning_rate": 0.0001999995613959917, + "loss": 1.6905, + "step": 398 + }, + { + "epoch": 0.2046428343032551, + "grad_norm": 0.056239306926727295, + "learning_rate": 0.00019999927496106707, + "loss": 1.6662, + "step": 400 + }, + { + "epoch": 0.20566604847477138, + "grad_norm": 0.06484871357679367, + "learning_rate": 0.0001999989169177959, + "loss": 1.6803, + "step": 402 + }, + { + "epoch": 0.20668926264628765, + "grad_norm": 0.11631152778863907, + "learning_rate": 0.00019999848726643454, + "loss": 1.6389, + "step": 404 + }, + { + "epoch": 0.20771247681780391, + "grad_norm": 0.06311234086751938, + "learning_rate": 0.00019999798600729064, + "loss": 1.7017, + "step": 406 + }, + { + "epoch": 0.2087356909893202, + "grad_norm": 0.06155601888895035, + "learning_rate": 0.00019999741314072323, + "loss": 1.7014, + "step": 408 + }, + { + "epoch": 0.20975890516083648, + "grad_norm": 0.06340397894382477, + "learning_rate": 0.00019999676866714244, + "loss": 1.6735, + "step": 410 + }, + { + "epoch": 0.21078211933235275, + "grad_norm": 0.06068040430545807, + "learning_rate": 0.00019999605258700983, + "loss": 1.6224, + "step": 412 + }, + { + "epoch": 0.21180533350386904, + "grad_norm": 0.06651381403207779, + "learning_rate": 0.00019999526490083817, + "loss": 1.6279, + "step": 414 + }, + { + "epoch": 0.2128285476753853, + "grad_norm": 0.06273658573627472, + "learning_rate": 0.00019999440560919152, + "loss": 1.6591, + "step": 416 + }, + { + "epoch": 0.21385176184690158, + "grad_norm": 0.06989671289920807, + "learning_rate": 0.00019999347471268516, + "loss": 1.6405, + "step": 418 + }, + { + "epoch": 0.21487497601841785, + "grad_norm": 0.06204582378268242, + "learning_rate": 0.00019999247221198573, + "loss": 1.6512, + "step": 420 + }, + { + "epoch": 0.21589819018993414, + "grad_norm": 0.1728357970714569, + "learning_rate": 0.00019999139810781112, + "loss": 1.6332, + "step": 422 + }, + { + "epoch": 0.2169214043614504, + "grad_norm": 0.0696343332529068, + "learning_rate": 0.00019999025240093044, + "loss": 1.6649, + "step": 424 + }, + { + "epoch": 0.21794461853296668, + "grad_norm": 0.060923777520656586, + "learning_rate": 0.00019998903509216415, + "loss": 1.6269, + "step": 426 + }, + { + "epoch": 0.21896783270448295, + "grad_norm": 0.061977677047252655, + "learning_rate": 0.00019998774618238394, + "loss": 1.6636, + "step": 428 + }, + { + "epoch": 0.21999104687599924, + "grad_norm": 0.07241713255643845, + "learning_rate": 0.0001999863856725128, + "loss": 1.643, + "step": 430 + }, + { + "epoch": 0.2210142610475155, + "grad_norm": 0.06513350456953049, + "learning_rate": 0.000199984953563525, + "loss": 1.6184, + "step": 432 + }, + { + "epoch": 0.22203747521903178, + "grad_norm": 0.06109536439180374, + "learning_rate": 0.000199983449856446, + "loss": 1.6734, + "step": 434 + }, + { + "epoch": 0.22306068939054807, + "grad_norm": 0.09125282615423203, + "learning_rate": 0.0001999818745523526, + "loss": 1.6617, + "step": 436 + }, + { + "epoch": 0.22408390356206434, + "grad_norm": 0.05963214859366417, + "learning_rate": 0.00019998022765237288, + "loss": 1.648, + "step": 438 + }, + { + "epoch": 0.2251071177335806, + "grad_norm": 0.18775390088558197, + "learning_rate": 0.00019997850915768613, + "loss": 1.6599, + "step": 440 + }, + { + "epoch": 0.22613033190509688, + "grad_norm": 0.05968334153294563, + "learning_rate": 0.00019997671906952298, + "loss": 1.6072, + "step": 442 + }, + { + "epoch": 0.22715354607661317, + "grad_norm": 0.05431201308965683, + "learning_rate": 0.0001999748573891653, + "loss": 1.6315, + "step": 444 + }, + { + "epoch": 0.22817676024812944, + "grad_norm": 0.05960986390709877, + "learning_rate": 0.00019997292411794618, + "loss": 1.6565, + "step": 446 + }, + { + "epoch": 0.2291999744196457, + "grad_norm": 0.07451862096786499, + "learning_rate": 0.00019997091925725004, + "loss": 1.6793, + "step": 448 + }, + { + "epoch": 0.23022318859116198, + "grad_norm": 0.05454723909497261, + "learning_rate": 0.0001999688428085125, + "loss": 1.6055, + "step": 450 + }, + { + "epoch": 0.23124640276267827, + "grad_norm": 0.05422728881239891, + "learning_rate": 0.00019996669477322055, + "loss": 1.6455, + "step": 452 + }, + { + "epoch": 0.23226961693419454, + "grad_norm": 0.06064201146364212, + "learning_rate": 0.00019996447515291233, + "loss": 1.5895, + "step": 454 + }, + { + "epoch": 0.2332928311057108, + "grad_norm": 0.04667961224913597, + "learning_rate": 0.0001999621839491773, + "loss": 1.652, + "step": 456 + }, + { + "epoch": 0.2343160452772271, + "grad_norm": 0.06072809919714928, + "learning_rate": 0.00019995982116365616, + "loss": 1.6073, + "step": 458 + }, + { + "epoch": 0.23533925944874337, + "grad_norm": 0.05477429926395416, + "learning_rate": 0.00019995738679804085, + "loss": 1.6412, + "step": 460 + }, + { + "epoch": 0.23636247362025964, + "grad_norm": 0.08307594060897827, + "learning_rate": 0.00019995488085407462, + "loss": 1.6396, + "step": 462 + }, + { + "epoch": 0.2373856877917759, + "grad_norm": 0.059893883764743805, + "learning_rate": 0.00019995230333355192, + "loss": 1.6426, + "step": 464 + }, + { + "epoch": 0.2384089019632922, + "grad_norm": 0.06132538989186287, + "learning_rate": 0.00019994965423831854, + "loss": 1.6133, + "step": 466 + }, + { + "epoch": 0.23943211613480847, + "grad_norm": 0.07076270133256912, + "learning_rate": 0.00019994693357027138, + "loss": 1.576, + "step": 468 + }, + { + "epoch": 0.24045533030632474, + "grad_norm": 0.06282426416873932, + "learning_rate": 0.00019994414133135877, + "loss": 1.6373, + "step": 470 + }, + { + "epoch": 0.241478544477841, + "grad_norm": 0.058667294681072235, + "learning_rate": 0.00019994127752358013, + "loss": 1.619, + "step": 472 + }, + { + "epoch": 0.2425017586493573, + "grad_norm": 0.08359505236148834, + "learning_rate": 0.00019993834214898626, + "loss": 1.6225, + "step": 474 + }, + { + "epoch": 0.24352497282087357, + "grad_norm": 0.06758000701665878, + "learning_rate": 0.00019993533520967912, + "loss": 1.5799, + "step": 476 + }, + { + "epoch": 0.24454818699238984, + "grad_norm": 0.11436283588409424, + "learning_rate": 0.0001999322567078119, + "loss": 1.6385, + "step": 478 + }, + { + "epoch": 0.24557140116390613, + "grad_norm": 0.05773819610476494, + "learning_rate": 0.00019992910664558915, + "loss": 1.6022, + "step": 480 + }, + { + "epoch": 0.2465946153354224, + "grad_norm": 0.052521176636219025, + "learning_rate": 0.00019992588502526658, + "loss": 1.6137, + "step": 482 + }, + { + "epoch": 0.24761782950693867, + "grad_norm": 0.056573059409856796, + "learning_rate": 0.00019992259184915115, + "loss": 1.6065, + "step": 484 + }, + { + "epoch": 0.24864104367845494, + "grad_norm": 0.05170164257287979, + "learning_rate": 0.00019991922711960102, + "loss": 1.6325, + "step": 486 + }, + { + "epoch": 0.24966425784997123, + "grad_norm": 0.05951111018657684, + "learning_rate": 0.00019991579083902572, + "loss": 1.6034, + "step": 488 + }, + { + "epoch": 0.2506874720214875, + "grad_norm": 0.054325833916664124, + "learning_rate": 0.00019991228300988585, + "loss": 1.6102, + "step": 490 + }, + { + "epoch": 0.2517106861930038, + "grad_norm": 0.07080011814832687, + "learning_rate": 0.0001999087036346934, + "loss": 1.6302, + "step": 492 + }, + { + "epoch": 0.25273390036452004, + "grad_norm": 0.06116727367043495, + "learning_rate": 0.00019990505271601144, + "loss": 1.6243, + "step": 494 + }, + { + "epoch": 0.25375711453603633, + "grad_norm": 0.0602283850312233, + "learning_rate": 0.0001999013302564544, + "loss": 1.6024, + "step": 496 + }, + { + "epoch": 0.2547803287075526, + "grad_norm": 0.06313999742269516, + "learning_rate": 0.0001998975362586879, + "loss": 1.6238, + "step": 498 + }, + { + "epoch": 0.25580354287906887, + "grad_norm": 0.06217190623283386, + "learning_rate": 0.00019989367072542876, + "loss": 1.6251, + "step": 500 + }, + { + "epoch": 0.25682675705058516, + "grad_norm": 0.07256064563989639, + "learning_rate": 0.00019988973365944507, + "loss": 1.5929, + "step": 502 + }, + { + "epoch": 0.2578499712221014, + "grad_norm": 0.062201980501413345, + "learning_rate": 0.00019988572506355606, + "loss": 1.5933, + "step": 504 + }, + { + "epoch": 0.2588731853936177, + "grad_norm": 0.07168910652399063, + "learning_rate": 0.00019988164494063226, + "loss": 1.6474, + "step": 506 + }, + { + "epoch": 0.259896399565134, + "grad_norm": 0.056935928761959076, + "learning_rate": 0.00019987749329359548, + "loss": 1.5992, + "step": 508 + }, + { + "epoch": 0.26091961373665024, + "grad_norm": 0.07088612020015717, + "learning_rate": 0.00019987327012541855, + "loss": 1.5952, + "step": 510 + }, + { + "epoch": 0.26194282790816653, + "grad_norm": 0.06023348495364189, + "learning_rate": 0.0001998689754391257, + "loss": 1.6064, + "step": 512 + }, + { + "epoch": 0.2629660420796828, + "grad_norm": 0.05686601996421814, + "learning_rate": 0.0001998646092377923, + "loss": 1.5992, + "step": 514 + }, + { + "epoch": 0.26398925625119907, + "grad_norm": 0.07028970122337341, + "learning_rate": 0.00019986017152454495, + "loss": 1.5835, + "step": 516 + }, + { + "epoch": 0.26501247042271536, + "grad_norm": 0.0645250454545021, + "learning_rate": 0.0001998556623025614, + "loss": 1.6055, + "step": 518 + }, + { + "epoch": 0.2660356845942316, + "grad_norm": 0.0723612904548645, + "learning_rate": 0.00019985108157507067, + "loss": 1.6248, + "step": 520 + }, + { + "epoch": 0.2670588987657479, + "grad_norm": 0.06222670525312424, + "learning_rate": 0.00019984642934535297, + "loss": 1.6411, + "step": 522 + }, + { + "epoch": 0.2680821129372642, + "grad_norm": 0.057786975055933, + "learning_rate": 0.00019984170561673976, + "loss": 1.6313, + "step": 524 + }, + { + "epoch": 0.26910532710878043, + "grad_norm": 0.061039313673973083, + "learning_rate": 0.00019983691039261357, + "loss": 1.5896, + "step": 526 + }, + { + "epoch": 0.27012854128029673, + "grad_norm": 0.04816308245062828, + "learning_rate": 0.00019983204367640824, + "loss": 1.5986, + "step": 528 + }, + { + "epoch": 0.271151755451813, + "grad_norm": 0.06095914542675018, + "learning_rate": 0.0001998271054716088, + "loss": 1.5995, + "step": 530 + }, + { + "epoch": 0.27217496962332927, + "grad_norm": 0.05422305688261986, + "learning_rate": 0.00019982209578175137, + "loss": 1.6047, + "step": 532 + }, + { + "epoch": 0.27319818379484556, + "grad_norm": 0.05381491780281067, + "learning_rate": 0.0001998170146104234, + "loss": 1.5748, + "step": 534 + }, + { + "epoch": 0.27422139796636186, + "grad_norm": 0.08168444782495499, + "learning_rate": 0.0001998118619612634, + "loss": 1.5941, + "step": 536 + }, + { + "epoch": 0.2752446121378781, + "grad_norm": 0.05323650687932968, + "learning_rate": 0.00019980663783796118, + "loss": 1.6015, + "step": 538 + }, + { + "epoch": 0.2762678263093944, + "grad_norm": 0.08093535900115967, + "learning_rate": 0.0001998013422442577, + "loss": 1.6325, + "step": 540 + }, + { + "epoch": 0.27729104048091063, + "grad_norm": 0.05909120664000511, + "learning_rate": 0.00019979597518394491, + "loss": 1.6684, + "step": 542 + }, + { + "epoch": 0.27831425465242693, + "grad_norm": 0.0684690847992897, + "learning_rate": 0.00019979053666086634, + "loss": 1.6682, + "step": 544 + }, + { + "epoch": 0.2793374688239432, + "grad_norm": 0.05854607746005058, + "learning_rate": 0.00019978502667891625, + "loss": 1.6133, + "step": 546 + }, + { + "epoch": 0.28036068299545946, + "grad_norm": 0.05019630119204521, + "learning_rate": 0.00019977944524204037, + "loss": 1.5968, + "step": 548 + }, + { + "epoch": 0.28138389716697576, + "grad_norm": 0.0662982240319252, + "learning_rate": 0.00019977379235423551, + "loss": 1.589, + "step": 550 + }, + { + "epoch": 0.28240711133849206, + "grad_norm": 0.049058698117733, + "learning_rate": 0.00019976806801954964, + "loss": 1.5979, + "step": 552 + }, + { + "epoch": 0.2834303255100083, + "grad_norm": 0.058459024876356125, + "learning_rate": 0.00019976227224208183, + "loss": 1.5813, + "step": 554 + }, + { + "epoch": 0.2844535396815246, + "grad_norm": 0.048455361276865005, + "learning_rate": 0.00019975640502598244, + "loss": 1.5652, + "step": 556 + }, + { + "epoch": 0.2854767538530409, + "grad_norm": 0.06029395014047623, + "learning_rate": 0.00019975046637545288, + "loss": 1.6166, + "step": 558 + }, + { + "epoch": 0.28649996802455713, + "grad_norm": 0.05902372673153877, + "learning_rate": 0.00019974445629474574, + "loss": 1.5955, + "step": 560 + }, + { + "epoch": 0.2875231821960734, + "grad_norm": 0.04898110404610634, + "learning_rate": 0.0001997383747881648, + "loss": 1.5554, + "step": 562 + }, + { + "epoch": 0.28854639636758966, + "grad_norm": 0.07228821516036987, + "learning_rate": 0.00019973222186006498, + "loss": 1.6178, + "step": 564 + }, + { + "epoch": 0.28956961053910596, + "grad_norm": 0.07162781804800034, + "learning_rate": 0.00019972599751485226, + "loss": 1.6128, + "step": 566 + }, + { + "epoch": 0.29059282471062226, + "grad_norm": 0.047708939760923386, + "learning_rate": 0.00019971970175698385, + "loss": 1.5776, + "step": 568 + }, + { + "epoch": 0.2916160388821385, + "grad_norm": 0.05930710583925247, + "learning_rate": 0.0001997133345909681, + "loss": 1.6095, + "step": 570 + }, + { + "epoch": 0.2926392530536548, + "grad_norm": 0.057511184364557266, + "learning_rate": 0.00019970689602136438, + "loss": 1.564, + "step": 572 + }, + { + "epoch": 0.2936624672251711, + "grad_norm": 0.0659165233373642, + "learning_rate": 0.00019970038605278338, + "loss": 1.6057, + "step": 574 + }, + { + "epoch": 0.2946856813966873, + "grad_norm": 0.0638163760304451, + "learning_rate": 0.00019969380468988677, + "loss": 1.5684, + "step": 576 + }, + { + "epoch": 0.2957088955682036, + "grad_norm": 0.0477282889187336, + "learning_rate": 0.00019968715193738738, + "loss": 1.5596, + "step": 578 + }, + { + "epoch": 0.2967321097397199, + "grad_norm": 0.055721577256917953, + "learning_rate": 0.00019968042780004917, + "loss": 1.5854, + "step": 580 + }, + { + "epoch": 0.29775532391123616, + "grad_norm": 0.05852237716317177, + "learning_rate": 0.00019967363228268724, + "loss": 1.5952, + "step": 582 + }, + { + "epoch": 0.29877853808275245, + "grad_norm": 0.04583214595913887, + "learning_rate": 0.00019966676539016779, + "loss": 1.5835, + "step": 584 + }, + { + "epoch": 0.2998017522542687, + "grad_norm": 0.052682552486658096, + "learning_rate": 0.00019965982712740808, + "loss": 1.5932, + "step": 586 + }, + { + "epoch": 0.300824966425785, + "grad_norm": 0.06101151555776596, + "learning_rate": 0.00019965281749937655, + "loss": 1.661, + "step": 588 + }, + { + "epoch": 0.3018481805973013, + "grad_norm": 0.052221182733774185, + "learning_rate": 0.0001996457365110927, + "loss": 1.5834, + "step": 590 + }, + { + "epoch": 0.3028713947688175, + "grad_norm": 0.05288353189826012, + "learning_rate": 0.00019963858416762717, + "loss": 1.561, + "step": 592 + }, + { + "epoch": 0.3038946089403338, + "grad_norm": 0.05072011053562164, + "learning_rate": 0.00019963136047410166, + "loss": 1.5542, + "step": 594 + }, + { + "epoch": 0.3049178231118501, + "grad_norm": 0.05482899025082588, + "learning_rate": 0.00019962406543568898, + "loss": 1.6568, + "step": 596 + }, + { + "epoch": 0.30594103728336636, + "grad_norm": 0.06114513427019119, + "learning_rate": 0.00019961669905761302, + "loss": 1.5619, + "step": 598 + }, + { + "epoch": 0.30696425145488265, + "grad_norm": 0.14878755807876587, + "learning_rate": 0.00019960926134514873, + "loss": 1.6222, + "step": 600 + }, + { + "epoch": 0.30798746562639895, + "grad_norm": 0.05369825288653374, + "learning_rate": 0.00019960175230362222, + "loss": 1.574, + "step": 602 + }, + { + "epoch": 0.3090106797979152, + "grad_norm": 0.04912363365292549, + "learning_rate": 0.00019959417193841063, + "loss": 1.5644, + "step": 604 + }, + { + "epoch": 0.3100338939694315, + "grad_norm": 0.055376555770635605, + "learning_rate": 0.00019958652025494212, + "loss": 1.5978, + "step": 606 + }, + { + "epoch": 0.3110571081409477, + "grad_norm": 0.054994821548461914, + "learning_rate": 0.00019957879725869602, + "loss": 1.6327, + "step": 608 + }, + { + "epoch": 0.312080322312464, + "grad_norm": 0.05939999222755432, + "learning_rate": 0.00019957100295520266, + "loss": 1.5706, + "step": 610 + }, + { + "epoch": 0.3131035364839803, + "grad_norm": 0.05616987124085426, + "learning_rate": 0.00019956313735004346, + "loss": 1.5932, + "step": 612 + }, + { + "epoch": 0.31412675065549656, + "grad_norm": 0.10900183767080307, + "learning_rate": 0.00019955520044885087, + "loss": 1.5757, + "step": 614 + }, + { + "epoch": 0.31514996482701285, + "grad_norm": 1.115419864654541, + "learning_rate": 0.00019954719225730847, + "loss": 1.666, + "step": 616 + }, + { + "epoch": 0.31617317899852915, + "grad_norm": 0.13737702369689941, + "learning_rate": 0.00019953911278115078, + "loss": 1.6406, + "step": 618 + }, + { + "epoch": 0.3171963931700454, + "grad_norm": 0.18733379244804382, + "learning_rate": 0.00019953096202616344, + "loss": 1.6465, + "step": 620 + }, + { + "epoch": 0.3182196073415617, + "grad_norm": 0.513283371925354, + "learning_rate": 0.0001995227399981831, + "loss": 1.6477, + "step": 622 + }, + { + "epoch": 0.319242821513078, + "grad_norm": 0.30918484926223755, + "learning_rate": 0.0001995144467030975, + "loss": 1.6566, + "step": 624 + }, + { + "epoch": 0.3202660356845942, + "grad_norm": 0.0951157733798027, + "learning_rate": 0.00019950608214684535, + "loss": 1.6034, + "step": 626 + }, + { + "epoch": 0.3212892498561105, + "grad_norm": 0.05696268379688263, + "learning_rate": 0.00019949764633541643, + "loss": 1.6518, + "step": 628 + }, + { + "epoch": 0.32231246402762675, + "grad_norm": 0.06777111440896988, + "learning_rate": 0.00019948913927485146, + "loss": 1.6585, + "step": 630 + }, + { + "epoch": 0.32333567819914305, + "grad_norm": 0.055656664073467255, + "learning_rate": 0.00019948056097124234, + "loss": 1.5623, + "step": 632 + }, + { + "epoch": 0.32435889237065935, + "grad_norm": 0.05220302939414978, + "learning_rate": 0.00019947191143073186, + "loss": 1.6067, + "step": 634 + }, + { + "epoch": 0.3253821065421756, + "grad_norm": 0.05276400223374367, + "learning_rate": 0.00019946319065951382, + "loss": 1.5997, + "step": 636 + }, + { + "epoch": 0.3264053207136919, + "grad_norm": 0.06689111888408661, + "learning_rate": 0.00019945439866383312, + "loss": 1.5621, + "step": 638 + }, + { + "epoch": 0.3274285348852082, + "grad_norm": 0.07574088871479034, + "learning_rate": 0.00019944553544998562, + "loss": 1.5873, + "step": 640 + }, + { + "epoch": 0.3284517490567244, + "grad_norm": 0.1480696201324463, + "learning_rate": 0.0001994366010243181, + "loss": 1.6142, + "step": 642 + }, + { + "epoch": 0.3294749632282407, + "grad_norm": 0.2425205558538437, + "learning_rate": 0.00019942759539322844, + "loss": 1.6513, + "step": 644 + }, + { + "epoch": 0.330498177399757, + "grad_norm": 0.10395582765340805, + "learning_rate": 0.00019941851856316548, + "loss": 1.6186, + "step": 646 + }, + { + "epoch": 0.33152139157127325, + "grad_norm": 0.07959388941526413, + "learning_rate": 0.000199409370540629, + "loss": 1.5954, + "step": 648 + }, + { + "epoch": 0.33254460574278955, + "grad_norm": 0.08391022682189941, + "learning_rate": 0.00019940015133216985, + "loss": 1.6359, + "step": 650 + }, + { + "epoch": 0.33356781991430584, + "grad_norm": 0.10863954573869705, + "learning_rate": 0.00019939086094438975, + "loss": 1.5591, + "step": 652 + }, + { + "epoch": 0.3345910340858221, + "grad_norm": 0.0719527155160904, + "learning_rate": 0.00019938149938394145, + "loss": 1.5536, + "step": 654 + }, + { + "epoch": 0.3356142482573384, + "grad_norm": 0.054009951651096344, + "learning_rate": 0.0001993720666575287, + "loss": 1.5925, + "step": 656 + }, + { + "epoch": 0.3366374624288546, + "grad_norm": 0.06805548816919327, + "learning_rate": 0.00019936256277190608, + "loss": 1.6079, + "step": 658 + }, + { + "epoch": 0.3376606766003709, + "grad_norm": 0.057809535413980484, + "learning_rate": 0.0001993529877338793, + "loss": 1.5569, + "step": 660 + }, + { + "epoch": 0.3386838907718872, + "grad_norm": 0.05796423181891441, + "learning_rate": 0.0001993433415503049, + "loss": 1.6148, + "step": 662 + }, + { + "epoch": 0.33970710494340345, + "grad_norm": 0.0450466088950634, + "learning_rate": 0.0001993336242280904, + "loss": 1.6024, + "step": 664 + }, + { + "epoch": 0.34073031911491974, + "grad_norm": 0.05356905981898308, + "learning_rate": 0.00019932383577419432, + "loss": 1.5696, + "step": 666 + }, + { + "epoch": 0.34175353328643604, + "grad_norm": 0.04915151000022888, + "learning_rate": 0.00019931397619562597, + "loss": 1.601, + "step": 668 + }, + { + "epoch": 0.3427767474579523, + "grad_norm": 0.2238396257162094, + "learning_rate": 0.00019930404549944574, + "loss": 1.6144, + "step": 670 + }, + { + "epoch": 0.3437999616294686, + "grad_norm": 0.07003773748874664, + "learning_rate": 0.00019929404369276488, + "loss": 1.6132, + "step": 672 + }, + { + "epoch": 0.34482317580098487, + "grad_norm": 0.07609610259532928, + "learning_rate": 0.00019928397078274555, + "loss": 1.5351, + "step": 674 + }, + { + "epoch": 0.3458463899725011, + "grad_norm": 0.057023849338293076, + "learning_rate": 0.00019927382677660088, + "loss": 1.5643, + "step": 676 + }, + { + "epoch": 0.3468696041440174, + "grad_norm": 0.0493864081799984, + "learning_rate": 0.0001992636116815948, + "loss": 1.5837, + "step": 678 + }, + { + "epoch": 0.34789281831553365, + "grad_norm": 0.05028039962053299, + "learning_rate": 0.00019925332550504234, + "loss": 1.6003, + "step": 680 + }, + { + "epoch": 0.34891603248704994, + "grad_norm": 0.050032299011945724, + "learning_rate": 0.00019924296825430925, + "loss": 1.5583, + "step": 682 + }, + { + "epoch": 0.34993924665856624, + "grad_norm": 0.04059847444295883, + "learning_rate": 0.00019923253993681225, + "loss": 1.6101, + "step": 684 + }, + { + "epoch": 0.3509624608300825, + "grad_norm": 0.045728132128715515, + "learning_rate": 0.00019922204056001895, + "loss": 1.5973, + "step": 686 + }, + { + "epoch": 0.3519856750015988, + "grad_norm": 0.04674302786588669, + "learning_rate": 0.0001992114701314478, + "loss": 1.5785, + "step": 688 + }, + { + "epoch": 0.35300888917311507, + "grad_norm": 0.04860880225896835, + "learning_rate": 0.00019920082865866818, + "loss": 1.5761, + "step": 690 + }, + { + "epoch": 0.3540321033446313, + "grad_norm": 0.04689641669392586, + "learning_rate": 0.00019919011614930035, + "loss": 1.6015, + "step": 692 + }, + { + "epoch": 0.3550553175161476, + "grad_norm": 0.04507840797305107, + "learning_rate": 0.0001991793326110154, + "loss": 1.5762, + "step": 694 + }, + { + "epoch": 0.3560785316876639, + "grad_norm": 0.04468555748462677, + "learning_rate": 0.00019916847805153526, + "loss": 1.5615, + "step": 696 + }, + { + "epoch": 0.35710174585918014, + "grad_norm": 0.07028740644454956, + "learning_rate": 0.00019915755247863285, + "loss": 1.6001, + "step": 698 + }, + { + "epoch": 0.35812496003069644, + "grad_norm": 0.03917892277240753, + "learning_rate": 0.00019914655590013176, + "loss": 1.6153, + "step": 700 + }, + { + "epoch": 0.3591481742022127, + "grad_norm": 0.06443695724010468, + "learning_rate": 0.0001991354883239066, + "loss": 1.5588, + "step": 702 + }, + { + "epoch": 0.360171388373729, + "grad_norm": 0.04684121161699295, + "learning_rate": 0.00019912434975788264, + "loss": 1.5726, + "step": 704 + }, + { + "epoch": 0.36119460254524527, + "grad_norm": 0.04538768157362938, + "learning_rate": 0.00019911314021003613, + "loss": 1.592, + "step": 706 + }, + { + "epoch": 0.3622178167167615, + "grad_norm": 0.040085602551698685, + "learning_rate": 0.0001991018596883941, + "loss": 1.577, + "step": 708 + }, + { + "epoch": 0.3632410308882778, + "grad_norm": 0.04734279587864876, + "learning_rate": 0.00019909050820103442, + "loss": 1.6194, + "step": 710 + }, + { + "epoch": 0.3642642450597941, + "grad_norm": 0.051557011902332306, + "learning_rate": 0.00019907908575608573, + "loss": 1.5776, + "step": 712 + }, + { + "epoch": 0.36528745923131034, + "grad_norm": 0.042105671018362045, + "learning_rate": 0.00019906759236172752, + "loss": 1.562, + "step": 714 + }, + { + "epoch": 0.36631067340282664, + "grad_norm": 0.04763809219002724, + "learning_rate": 0.00019905602802619007, + "loss": 1.5727, + "step": 716 + }, + { + "epoch": 0.36733388757434293, + "grad_norm": 0.05205756798386574, + "learning_rate": 0.00019904439275775452, + "loss": 1.5595, + "step": 718 + }, + { + "epoch": 0.3683571017458592, + "grad_norm": 0.04210933670401573, + "learning_rate": 0.0001990326865647527, + "loss": 1.5812, + "step": 720 + }, + { + "epoch": 0.36938031591737547, + "grad_norm": 0.04100721701979637, + "learning_rate": 0.00019902090945556728, + "loss": 1.5492, + "step": 722 + }, + { + "epoch": 0.3704035300888917, + "grad_norm": 0.04252148047089577, + "learning_rate": 0.0001990090614386318, + "loss": 1.5397, + "step": 724 + }, + { + "epoch": 0.371426744260408, + "grad_norm": 0.040999703109264374, + "learning_rate": 0.00019899714252243035, + "loss": 1.533, + "step": 726 + }, + { + "epoch": 0.3724499584319243, + "grad_norm": 0.03823763504624367, + "learning_rate": 0.00019898515271549804, + "loss": 1.5385, + "step": 728 + }, + { + "epoch": 0.37347317260344054, + "grad_norm": 0.041486915200948715, + "learning_rate": 0.0001989730920264206, + "loss": 1.5975, + "step": 730 + }, + { + "epoch": 0.37449638677495684, + "grad_norm": 0.042897533625364304, + "learning_rate": 0.00019896096046383456, + "loss": 1.574, + "step": 732 + }, + { + "epoch": 0.37551960094647313, + "grad_norm": 0.05677172914147377, + "learning_rate": 0.00019894875803642715, + "loss": 1.5564, + "step": 734 + }, + { + "epoch": 0.37654281511798937, + "grad_norm": 0.0416000559926033, + "learning_rate": 0.00019893648475293648, + "loss": 1.5982, + "step": 736 + }, + { + "epoch": 0.37756602928950567, + "grad_norm": 0.04389720410108566, + "learning_rate": 0.00019892414062215122, + "loss": 1.5661, + "step": 738 + }, + { + "epoch": 0.37858924346102196, + "grad_norm": 0.048660341650247574, + "learning_rate": 0.0001989117256529109, + "loss": 1.5554, + "step": 740 + }, + { + "epoch": 0.3796124576325382, + "grad_norm": 0.04659014940261841, + "learning_rate": 0.00019889923985410576, + "loss": 1.5932, + "step": 742 + }, + { + "epoch": 0.3806356718040545, + "grad_norm": 0.04693235456943512, + "learning_rate": 0.00019888668323467669, + "loss": 1.5985, + "step": 744 + }, + { + "epoch": 0.38165888597557074, + "grad_norm": 0.05906931310892105, + "learning_rate": 0.00019887405580361537, + "loss": 1.592, + "step": 746 + }, + { + "epoch": 0.38268210014708703, + "grad_norm": 0.0707060918211937, + "learning_rate": 0.0001988613575699642, + "loss": 1.5491, + "step": 748 + }, + { + "epoch": 0.38370531431860333, + "grad_norm": 0.0510844886302948, + "learning_rate": 0.00019884858854281613, + "loss": 1.5433, + "step": 750 + }, + { + "epoch": 0.38472852849011957, + "grad_norm": 0.058799102902412415, + "learning_rate": 0.00019883574873131503, + "loss": 1.5467, + "step": 752 + }, + { + "epoch": 0.38575174266163587, + "grad_norm": 0.04918012022972107, + "learning_rate": 0.0001988228381446553, + "loss": 1.5685, + "step": 754 + }, + { + "epoch": 0.38677495683315216, + "grad_norm": 0.044637810438871384, + "learning_rate": 0.00019880985679208207, + "loss": 1.5767, + "step": 756 + }, + { + "epoch": 0.3877981710046684, + "grad_norm": 0.052684806287288666, + "learning_rate": 0.0001987968046828911, + "loss": 1.5457, + "step": 758 + }, + { + "epoch": 0.3888213851761847, + "grad_norm": 0.045015860348939896, + "learning_rate": 0.0001987836818264289, + "loss": 1.5136, + "step": 760 + }, + { + "epoch": 0.389844599347701, + "grad_norm": 0.0538019984960556, + "learning_rate": 0.0001987704882320926, + "loss": 1.5673, + "step": 762 + }, + { + "epoch": 0.39086781351921723, + "grad_norm": 0.04201149195432663, + "learning_rate": 0.00019875722390932997, + "loss": 1.5559, + "step": 764 + }, + { + "epoch": 0.39189102769073353, + "grad_norm": 0.04188109561800957, + "learning_rate": 0.00019874388886763944, + "loss": 1.4982, + "step": 766 + }, + { + "epoch": 0.39291424186224977, + "grad_norm": 0.0503980815410614, + "learning_rate": 0.00019873048311657007, + "loss": 1.5018, + "step": 768 + }, + { + "epoch": 0.39393745603376606, + "grad_norm": 0.04854050651192665, + "learning_rate": 0.0001987170066657216, + "loss": 1.5331, + "step": 770 + }, + { + "epoch": 0.39496067020528236, + "grad_norm": 0.04634295031428337, + "learning_rate": 0.00019870345952474437, + "loss": 1.5304, + "step": 772 + }, + { + "epoch": 0.3959838843767986, + "grad_norm": 0.04464833438396454, + "learning_rate": 0.0001986898417033393, + "loss": 1.5518, + "step": 774 + }, + { + "epoch": 0.3970070985483149, + "grad_norm": 0.04434438794851303, + "learning_rate": 0.00019867615321125795, + "loss": 1.5372, + "step": 776 + }, + { + "epoch": 0.3980303127198312, + "grad_norm": 0.04564082249999046, + "learning_rate": 0.00019866239405830248, + "loss": 1.5373, + "step": 778 + }, + { + "epoch": 0.39905352689134743, + "grad_norm": 0.042439211159944534, + "learning_rate": 0.00019864856425432574, + "loss": 1.5682, + "step": 780 + }, + { + "epoch": 0.4000767410628637, + "grad_norm": 0.051853910088539124, + "learning_rate": 0.00019863466380923105, + "loss": 1.5408, + "step": 782 + }, + { + "epoch": 0.40109995523438, + "grad_norm": 0.04109041020274162, + "learning_rate": 0.00019862069273297232, + "loss": 1.5557, + "step": 784 + }, + { + "epoch": 0.40212316940589626, + "grad_norm": 0.04249493032693863, + "learning_rate": 0.00019860665103555415, + "loss": 1.5723, + "step": 786 + }, + { + "epoch": 0.40314638357741256, + "grad_norm": 0.041393015533685684, + "learning_rate": 0.0001985925387270316, + "loss": 1.6034, + "step": 788 + }, + { + "epoch": 0.4041695977489288, + "grad_norm": 0.03967997431755066, + "learning_rate": 0.00019857835581751037, + "loss": 1.5252, + "step": 790 + }, + { + "epoch": 0.4051928119204451, + "grad_norm": 0.0383961945772171, + "learning_rate": 0.00019856410231714662, + "loss": 1.5718, + "step": 792 + }, + { + "epoch": 0.4062160260919614, + "grad_norm": 0.04732939228415489, + "learning_rate": 0.00019854977823614717, + "loss": 1.5473, + "step": 794 + }, + { + "epoch": 0.40723924026347763, + "grad_norm": 0.04425951838493347, + "learning_rate": 0.00019853538358476932, + "loss": 1.5976, + "step": 796 + }, + { + "epoch": 0.4082624544349939, + "grad_norm": 0.041833970695734024, + "learning_rate": 0.0001985209183733209, + "loss": 1.6024, + "step": 798 + }, + { + "epoch": 0.4092856686065102, + "grad_norm": 0.04387862607836723, + "learning_rate": 0.0001985063826121603, + "loss": 1.5384, + "step": 800 + }, + { + "epoch": 0.41030888277802646, + "grad_norm": 0.04852529242634773, + "learning_rate": 0.00019849177631169643, + "loss": 1.5485, + "step": 802 + }, + { + "epoch": 0.41133209694954276, + "grad_norm": 0.04267437756061554, + "learning_rate": 0.00019847709948238865, + "loss": 1.5186, + "step": 804 + }, + { + "epoch": 0.41235531112105905, + "grad_norm": 0.04403737559914589, + "learning_rate": 0.00019846235213474692, + "loss": 1.5374, + "step": 806 + }, + { + "epoch": 0.4133785252925753, + "grad_norm": 0.04668973386287689, + "learning_rate": 0.00019844753427933164, + "loss": 1.5209, + "step": 808 + }, + { + "epoch": 0.4144017394640916, + "grad_norm": 0.045447513461112976, + "learning_rate": 0.00019843264592675367, + "loss": 1.5888, + "step": 810 + }, + { + "epoch": 0.41542495363560783, + "grad_norm": 0.04239337146282196, + "learning_rate": 0.00019841768708767438, + "loss": 1.5866, + "step": 812 + }, + { + "epoch": 0.4164481678071241, + "grad_norm": 0.04571668431162834, + "learning_rate": 0.0001984026577728057, + "loss": 1.5134, + "step": 814 + }, + { + "epoch": 0.4174713819786404, + "grad_norm": 0.041478246450424194, + "learning_rate": 0.00019838755799290994, + "loss": 1.5555, + "step": 816 + }, + { + "epoch": 0.41849459615015666, + "grad_norm": 0.04084784537553787, + "learning_rate": 0.00019837238775879983, + "loss": 1.5847, + "step": 818 + }, + { + "epoch": 0.41951781032167296, + "grad_norm": 0.0393175333738327, + "learning_rate": 0.00019835714708133862, + "loss": 1.5377, + "step": 820 + }, + { + "epoch": 0.42054102449318925, + "grad_norm": 0.03987790644168854, + "learning_rate": 0.00019834183597143996, + "loss": 1.5604, + "step": 822 + }, + { + "epoch": 0.4215642386647055, + "grad_norm": 0.04945560172200203, + "learning_rate": 0.00019832645444006804, + "loss": 1.5239, + "step": 824 + }, + { + "epoch": 0.4225874528362218, + "grad_norm": 0.042219970375299454, + "learning_rate": 0.00019831100249823733, + "loss": 1.5435, + "step": 826 + }, + { + "epoch": 0.4236106670077381, + "grad_norm": 0.06793594360351562, + "learning_rate": 0.00019829548015701283, + "loss": 1.5204, + "step": 828 + }, + { + "epoch": 0.4246338811792543, + "grad_norm": 0.04633813723921776, + "learning_rate": 0.00019827988742750988, + "loss": 1.5494, + "step": 830 + }, + { + "epoch": 0.4256570953507706, + "grad_norm": 0.041469499468803406, + "learning_rate": 0.0001982642243208943, + "loss": 1.5549, + "step": 832 + }, + { + "epoch": 0.42668030952228686, + "grad_norm": 0.039512719959020615, + "learning_rate": 0.0001982484908483822, + "loss": 1.5614, + "step": 834 + }, + { + "epoch": 0.42770352369380316, + "grad_norm": 0.04240869730710983, + "learning_rate": 0.0001982326870212402, + "loss": 1.5597, + "step": 836 + }, + { + "epoch": 0.42872673786531945, + "grad_norm": 0.04469761997461319, + "learning_rate": 0.00019821681285078522, + "loss": 1.575, + "step": 838 + }, + { + "epoch": 0.4297499520368357, + "grad_norm": 0.05203311890363693, + "learning_rate": 0.00019820086834838456, + "loss": 1.5144, + "step": 840 + }, + { + "epoch": 0.430773166208352, + "grad_norm": 0.046044569462537766, + "learning_rate": 0.00019818485352545592, + "loss": 1.5328, + "step": 842 + }, + { + "epoch": 0.4317963803798683, + "grad_norm": 0.05522793158888817, + "learning_rate": 0.00019816876839346735, + "loss": 1.5266, + "step": 844 + }, + { + "epoch": 0.4328195945513845, + "grad_norm": 0.04644525796175003, + "learning_rate": 0.00019815261296393715, + "loss": 1.5682, + "step": 846 + }, + { + "epoch": 0.4338428087229008, + "grad_norm": 0.06290300190448761, + "learning_rate": 0.00019813638724843413, + "loss": 1.5643, + "step": 848 + }, + { + "epoch": 0.4348660228944171, + "grad_norm": 0.050486985594034195, + "learning_rate": 0.00019812009125857728, + "loss": 1.5491, + "step": 850 + }, + { + "epoch": 0.43588923706593335, + "grad_norm": 0.05234065279364586, + "learning_rate": 0.000198103725006036, + "loss": 1.5718, + "step": 852 + }, + { + "epoch": 0.43691245123744965, + "grad_norm": 0.05265431106090546, + "learning_rate": 0.00019808728850253, + "loss": 1.56, + "step": 854 + }, + { + "epoch": 0.4379356654089659, + "grad_norm": 0.04220706969499588, + "learning_rate": 0.00019807078175982924, + "loss": 1.551, + "step": 856 + }, + { + "epoch": 0.4389588795804822, + "grad_norm": 0.042153794318437576, + "learning_rate": 0.00019805420478975403, + "loss": 1.5793, + "step": 858 + }, + { + "epoch": 0.4399820937519985, + "grad_norm": 0.04063679277896881, + "learning_rate": 0.00019803755760417494, + "loss": 1.5404, + "step": 860 + }, + { + "epoch": 0.4410053079235147, + "grad_norm": 0.04740441218018532, + "learning_rate": 0.0001980208402150128, + "loss": 1.526, + "step": 862 + }, + { + "epoch": 0.442028522095031, + "grad_norm": 0.04050862789154053, + "learning_rate": 0.0001980040526342388, + "loss": 1.5357, + "step": 864 + }, + { + "epoch": 0.4430517362665473, + "grad_norm": 0.050952885299921036, + "learning_rate": 0.00019798719487387428, + "loss": 1.5102, + "step": 866 + }, + { + "epoch": 0.44407495043806355, + "grad_norm": 0.048501502722501755, + "learning_rate": 0.00019797026694599098, + "loss": 1.5637, + "step": 868 + }, + { + "epoch": 0.44509816460957985, + "grad_norm": 0.03910909220576286, + "learning_rate": 0.0001979532688627107, + "loss": 1.5367, + "step": 870 + }, + { + "epoch": 0.44612137878109615, + "grad_norm": 0.05638305842876434, + "learning_rate": 0.0001979362006362056, + "loss": 1.5282, + "step": 872 + }, + { + "epoch": 0.4471445929526124, + "grad_norm": 0.05307792127132416, + "learning_rate": 0.00019791906227869808, + "loss": 1.5467, + "step": 874 + }, + { + "epoch": 0.4481678071241287, + "grad_norm": 0.04324028640985489, + "learning_rate": 0.0001979018538024607, + "loss": 1.5711, + "step": 876 + }, + { + "epoch": 0.4491910212956449, + "grad_norm": 0.03858278691768646, + "learning_rate": 0.00019788457521981623, + "loss": 1.5561, + "step": 878 + }, + { + "epoch": 0.4502142354671612, + "grad_norm": 0.043761543929576874, + "learning_rate": 0.00019786722654313772, + "loss": 1.5187, + "step": 880 + }, + { + "epoch": 0.4512374496386775, + "grad_norm": 0.08969100564718246, + "learning_rate": 0.00019784980778484834, + "loss": 1.5486, + "step": 882 + }, + { + "epoch": 0.45226066381019375, + "grad_norm": 0.04808567091822624, + "learning_rate": 0.00019783231895742143, + "loss": 1.5164, + "step": 884 + }, + { + "epoch": 0.45328387798171005, + "grad_norm": 0.04110665246844292, + "learning_rate": 0.00019781476007338058, + "loss": 1.5177, + "step": 886 + }, + { + "epoch": 0.45430709215322634, + "grad_norm": 0.050568196922540665, + "learning_rate": 0.00019779713114529947, + "loss": 1.5265, + "step": 888 + }, + { + "epoch": 0.4553303063247426, + "grad_norm": 0.04753986746072769, + "learning_rate": 0.00019777943218580207, + "loss": 1.5304, + "step": 890 + }, + { + "epoch": 0.4563535204962589, + "grad_norm": 0.05155970901250839, + "learning_rate": 0.00019776166320756227, + "loss": 1.566, + "step": 892 + }, + { + "epoch": 0.4573767346677752, + "grad_norm": 0.048765815794467926, + "learning_rate": 0.00019774382422330433, + "loss": 1.5276, + "step": 894 + }, + { + "epoch": 0.4583999488392914, + "grad_norm": 0.16882531344890594, + "learning_rate": 0.0001977259152458025, + "loss": 1.5074, + "step": 896 + }, + { + "epoch": 0.4594231630108077, + "grad_norm": 0.04014374688267708, + "learning_rate": 0.00019770793628788122, + "loss": 1.5262, + "step": 898 + }, + { + "epoch": 0.46044637718232395, + "grad_norm": 0.04874645173549652, + "learning_rate": 0.000197689887362415, + "loss": 1.5158, + "step": 900 + }, + { + "epoch": 0.46146959135384025, + "grad_norm": 0.049459170550107956, + "learning_rate": 0.00019767176848232846, + "loss": 1.5449, + "step": 902 + }, + { + "epoch": 0.46249280552535654, + "grad_norm": 0.04516777768731117, + "learning_rate": 0.00019765357966059638, + "loss": 1.5722, + "step": 904 + }, + { + "epoch": 0.4635160196968728, + "grad_norm": 0.04243026673793793, + "learning_rate": 0.00019763532091024352, + "loss": 1.5562, + "step": 906 + }, + { + "epoch": 0.4645392338683891, + "grad_norm": 0.04713771492242813, + "learning_rate": 0.00019761699224434475, + "loss": 1.5425, + "step": 908 + }, + { + "epoch": 0.4655624480399054, + "grad_norm": 0.0495879128575325, + "learning_rate": 0.0001975985936760251, + "loss": 1.5517, + "step": 910 + }, + { + "epoch": 0.4665856622114216, + "grad_norm": 0.037338342517614365, + "learning_rate": 0.00019758012521845948, + "loss": 1.5923, + "step": 912 + }, + { + "epoch": 0.4676088763829379, + "grad_norm": 0.044082753360271454, + "learning_rate": 0.000197561586884873, + "loss": 1.5582, + "step": 914 + }, + { + "epoch": 0.4686320905544542, + "grad_norm": 0.045763563364744186, + "learning_rate": 0.00019754297868854073, + "loss": 1.5435, + "step": 916 + }, + { + "epoch": 0.46965530472597045, + "grad_norm": 0.04221731796860695, + "learning_rate": 0.00019752430064278777, + "loss": 1.5365, + "step": 918 + }, + { + "epoch": 0.47067851889748674, + "grad_norm": 0.04800180345773697, + "learning_rate": 0.0001975055527609893, + "loss": 1.5534, + "step": 920 + }, + { + "epoch": 0.471701733069003, + "grad_norm": 0.05618242546916008, + "learning_rate": 0.00019748673505657046, + "loss": 1.5568, + "step": 922 + }, + { + "epoch": 0.4727249472405193, + "grad_norm": 0.04696999117732048, + "learning_rate": 0.00019746784754300637, + "loss": 1.5249, + "step": 924 + }, + { + "epoch": 0.4737481614120356, + "grad_norm": 0.041852448135614395, + "learning_rate": 0.00019744889023382215, + "loss": 1.5415, + "step": 926 + }, + { + "epoch": 0.4747713755835518, + "grad_norm": 0.04743418097496033, + "learning_rate": 0.00019742986314259299, + "loss": 1.5633, + "step": 928 + }, + { + "epoch": 0.4757945897550681, + "grad_norm": 0.04543265700340271, + "learning_rate": 0.00019741076628294386, + "loss": 1.5261, + "step": 930 + }, + { + "epoch": 0.4768178039265844, + "grad_norm": 0.04992993175983429, + "learning_rate": 0.00019739159966854992, + "loss": 1.5175, + "step": 932 + }, + { + "epoch": 0.47784101809810064, + "grad_norm": 0.05793948844075203, + "learning_rate": 0.00019737236331313608, + "loss": 1.59, + "step": 934 + }, + { + "epoch": 0.47886423226961694, + "grad_norm": 0.051816169172525406, + "learning_rate": 0.00019735305723047732, + "loss": 1.5008, + "step": 936 + }, + { + "epoch": 0.47988744644113324, + "grad_norm": 0.04754515737295151, + "learning_rate": 0.0001973336814343985, + "loss": 1.4773, + "step": 938 + }, + { + "epoch": 0.4809106606126495, + "grad_norm": 0.0393076054751873, + "learning_rate": 0.0001973142359387744, + "loss": 1.5568, + "step": 940 + }, + { + "epoch": 0.48193387478416577, + "grad_norm": 0.04164562746882439, + "learning_rate": 0.00019729472075752974, + "loss": 1.5319, + "step": 942 + }, + { + "epoch": 0.482957088955682, + "grad_norm": 0.04371575266122818, + "learning_rate": 0.00019727513590463906, + "loss": 1.5571, + "step": 944 + }, + { + "epoch": 0.4839803031271983, + "grad_norm": 0.0573207251727581, + "learning_rate": 0.00019725548139412692, + "loss": 1.5372, + "step": 946 + }, + { + "epoch": 0.4850035172987146, + "grad_norm": 0.04900820180773735, + "learning_rate": 0.00019723575724006767, + "loss": 1.5327, + "step": 948 + }, + { + "epoch": 0.48602673147023084, + "grad_norm": 0.039241593331098557, + "learning_rate": 0.00019721596345658552, + "loss": 1.5438, + "step": 950 + }, + { + "epoch": 0.48704994564174714, + "grad_norm": 0.043952930718660355, + "learning_rate": 0.00019719610005785465, + "loss": 1.5577, + "step": 952 + }, + { + "epoch": 0.48807315981326344, + "grad_norm": 0.038709525018930435, + "learning_rate": 0.0001971761670580989, + "loss": 1.5527, + "step": 954 + }, + { + "epoch": 0.4890963739847797, + "grad_norm": 0.03867029398679733, + "learning_rate": 0.0001971561644715922, + "loss": 1.5329, + "step": 956 + }, + { + "epoch": 0.49011958815629597, + "grad_norm": 0.0413273349404335, + "learning_rate": 0.00019713609231265805, + "loss": 1.5415, + "step": 958 + }, + { + "epoch": 0.49114280232781227, + "grad_norm": 0.03651106357574463, + "learning_rate": 0.00019711595059566998, + "loss": 1.5596, + "step": 960 + }, + { + "epoch": 0.4921660164993285, + "grad_norm": 0.03891696035861969, + "learning_rate": 0.0001970957393350512, + "loss": 1.5452, + "step": 962 + }, + { + "epoch": 0.4931892306708448, + "grad_norm": 0.03818392753601074, + "learning_rate": 0.0001970754585452748, + "loss": 1.5821, + "step": 964 + }, + { + "epoch": 0.49421244484236104, + "grad_norm": 0.03790618106722832, + "learning_rate": 0.0001970551082408636, + "loss": 1.5456, + "step": 966 + }, + { + "epoch": 0.49523565901387734, + "grad_norm": 0.043467581272125244, + "learning_rate": 0.00019703468843639024, + "loss": 1.4916, + "step": 968 + }, + { + "epoch": 0.49625887318539363, + "grad_norm": 0.03895978257060051, + "learning_rate": 0.0001970141991464771, + "loss": 1.5529, + "step": 970 + }, + { + "epoch": 0.4972820873569099, + "grad_norm": 0.03736645728349686, + "learning_rate": 0.0001969936403857963, + "loss": 1.5243, + "step": 972 + }, + { + "epoch": 0.49830530152842617, + "grad_norm": 0.03589653596282005, + "learning_rate": 0.0001969730121690698, + "loss": 1.5418, + "step": 974 + }, + { + "epoch": 0.49932851569994247, + "grad_norm": 0.03768768534064293, + "learning_rate": 0.00019695231451106912, + "loss": 1.5114, + "step": 976 + }, + { + "epoch": 0.5003517298714587, + "grad_norm": 0.04931550845503807, + "learning_rate": 0.00019693154742661575, + "loss": 1.564, + "step": 978 + }, + { + "epoch": 0.501374944042975, + "grad_norm": 0.04325348883867264, + "learning_rate": 0.0001969107109305807, + "loss": 1.5092, + "step": 980 + }, + { + "epoch": 0.5023981582144913, + "grad_norm": 0.03987947851419449, + "learning_rate": 0.00019688980503788475, + "loss": 1.5222, + "step": 982 + }, + { + "epoch": 0.5034213723860076, + "grad_norm": 0.04482003673911095, + "learning_rate": 0.00019686882976349836, + "loss": 1.517, + "step": 984 + }, + { + "epoch": 0.5044445865575238, + "grad_norm": 0.04025088995695114, + "learning_rate": 0.00019684778512244172, + "loss": 1.5188, + "step": 986 + }, + { + "epoch": 0.5054678007290401, + "grad_norm": 0.04705490544438362, + "learning_rate": 0.00019682667112978463, + "loss": 1.5266, + "step": 988 + }, + { + "epoch": 0.5064910149005564, + "grad_norm": 0.0493633933365345, + "learning_rate": 0.0001968054878006466, + "loss": 1.5079, + "step": 990 + }, + { + "epoch": 0.5075142290720727, + "grad_norm": 0.04063592851161957, + "learning_rate": 0.00019678423515019674, + "loss": 1.5169, + "step": 992 + }, + { + "epoch": 0.508537443243589, + "grad_norm": 0.04962534457445145, + "learning_rate": 0.00019676291319365387, + "loss": 1.5219, + "step": 994 + }, + { + "epoch": 0.5095606574151051, + "grad_norm": 0.03995488956570625, + "learning_rate": 0.00019674152194628638, + "loss": 1.5397, + "step": 996 + }, + { + "epoch": 0.5105838715866214, + "grad_norm": 0.04593009501695633, + "learning_rate": 0.00019672006142341234, + "loss": 1.5616, + "step": 998 + }, + { + "epoch": 0.5116070857581377, + "grad_norm": 0.04215447977185249, + "learning_rate": 0.00019669853164039933, + "loss": 1.5425, + "step": 1000 + }, + { + "epoch": 0.512630299929654, + "grad_norm": 0.043728407472372055, + "learning_rate": 0.0001966769326126646, + "loss": 1.5044, + "step": 1002 + }, + { + "epoch": 0.5136535141011703, + "grad_norm": 0.04384353384375572, + "learning_rate": 0.00019665526435567497, + "loss": 1.5734, + "step": 1004 + }, + { + "epoch": 0.5146767282726866, + "grad_norm": 0.04542085528373718, + "learning_rate": 0.00019663352688494684, + "loss": 1.5023, + "step": 1006 + }, + { + "epoch": 0.5156999424442028, + "grad_norm": 0.05727483332157135, + "learning_rate": 0.0001966117202160462, + "loss": 1.5668, + "step": 1008 + }, + { + "epoch": 0.5167231566157191, + "grad_norm": 0.055995501577854156, + "learning_rate": 0.0001965898443645885, + "loss": 1.5533, + "step": 1010 + }, + { + "epoch": 0.5177463707872354, + "grad_norm": 0.04521145299077034, + "learning_rate": 0.00019656789934623881, + "loss": 1.5196, + "step": 1012 + }, + { + "epoch": 0.5187695849587517, + "grad_norm": 0.040051352232694626, + "learning_rate": 0.0001965458851767117, + "loss": 1.5293, + "step": 1014 + }, + { + "epoch": 0.519792799130268, + "grad_norm": 0.04483609274029732, + "learning_rate": 0.00019652380187177126, + "loss": 1.5028, + "step": 1016 + }, + { + "epoch": 0.5208160133017842, + "grad_norm": 0.04116397351026535, + "learning_rate": 0.00019650164944723115, + "loss": 1.5272, + "step": 1018 + }, + { + "epoch": 0.5218392274733005, + "grad_norm": 0.04803440347313881, + "learning_rate": 0.00019647942791895445, + "loss": 1.525, + "step": 1020 + }, + { + "epoch": 0.5228624416448168, + "grad_norm": 0.05390439182519913, + "learning_rate": 0.00019645713730285366, + "loss": 1.5446, + "step": 1022 + }, + { + "epoch": 0.5238856558163331, + "grad_norm": 0.04475432634353638, + "learning_rate": 0.00019643477761489096, + "loss": 1.5213, + "step": 1024 + }, + { + "epoch": 0.5249088699878494, + "grad_norm": 0.04424989968538284, + "learning_rate": 0.00019641234887107778, + "loss": 1.4888, + "step": 1026 + }, + { + "epoch": 0.5259320841593657, + "grad_norm": 0.049827560782432556, + "learning_rate": 0.00019638985108747515, + "loss": 1.5555, + "step": 1028 + }, + { + "epoch": 0.5269552983308818, + "grad_norm": 0.04092090204358101, + "learning_rate": 0.0001963672842801934, + "loss": 1.4815, + "step": 1030 + }, + { + "epoch": 0.5279785125023981, + "grad_norm": 0.052185434848070145, + "learning_rate": 0.00019634464846539246, + "loss": 1.5657, + "step": 1032 + }, + { + "epoch": 0.5290017266739144, + "grad_norm": 0.04300570487976074, + "learning_rate": 0.00019632194365928153, + "loss": 1.5259, + "step": 1034 + }, + { + "epoch": 0.5300249408454307, + "grad_norm": 0.04205292835831642, + "learning_rate": 0.00019629916987811926, + "loss": 1.527, + "step": 1036 + }, + { + "epoch": 0.531048155016947, + "grad_norm": 0.06136661395430565, + "learning_rate": 0.00019627632713821368, + "loss": 1.5541, + "step": 1038 + }, + { + "epoch": 0.5320713691884632, + "grad_norm": 0.03824898600578308, + "learning_rate": 0.00019625341545592226, + "loss": 1.5496, + "step": 1040 + }, + { + "epoch": 0.5330945833599795, + "grad_norm": 0.041780851781368256, + "learning_rate": 0.0001962304348476518, + "loss": 1.5283, + "step": 1042 + }, + { + "epoch": 0.5341177975314958, + "grad_norm": 0.04486005753278732, + "learning_rate": 0.0001962073853298584, + "loss": 1.5312, + "step": 1044 + }, + { + "epoch": 0.5351410117030121, + "grad_norm": 0.041384853422641754, + "learning_rate": 0.00019618426691904762, + "loss": 1.5011, + "step": 1046 + }, + { + "epoch": 0.5361642258745284, + "grad_norm": 0.0440378412604332, + "learning_rate": 0.00019616107963177425, + "loss": 1.4855, + "step": 1048 + }, + { + "epoch": 0.5371874400460447, + "grad_norm": 0.052033115178346634, + "learning_rate": 0.00019613782348464244, + "loss": 1.4811, + "step": 1050 + }, + { + "epoch": 0.5382106542175609, + "grad_norm": 0.04121650755405426, + "learning_rate": 0.00019611449849430565, + "loss": 1.5653, + "step": 1052 + }, + { + "epoch": 0.5392338683890772, + "grad_norm": 0.04445752128958702, + "learning_rate": 0.00019609110467746666, + "loss": 1.5098, + "step": 1054 + }, + { + "epoch": 0.5402570825605935, + "grad_norm": 0.06591064482927322, + "learning_rate": 0.00019606764205087757, + "loss": 1.5304, + "step": 1056 + }, + { + "epoch": 0.5412802967321098, + "grad_norm": 0.05301080271601677, + "learning_rate": 0.0001960441106313396, + "loss": 1.4871, + "step": 1058 + }, + { + "epoch": 0.542303510903626, + "grad_norm": 0.040986523032188416, + "learning_rate": 0.0001960205104357034, + "loss": 1.5195, + "step": 1060 + }, + { + "epoch": 0.5433267250751422, + "grad_norm": 0.03562408685684204, + "learning_rate": 0.00019599684148086878, + "loss": 1.5384, + "step": 1062 + }, + { + "epoch": 0.5443499392466585, + "grad_norm": 0.04383963719010353, + "learning_rate": 0.00019597310378378476, + "loss": 1.4988, + "step": 1064 + }, + { + "epoch": 0.5453731534181748, + "grad_norm": 0.06702277064323425, + "learning_rate": 0.00019594929736144976, + "loss": 1.4897, + "step": 1066 + }, + { + "epoch": 0.5463963675896911, + "grad_norm": 0.0414276085793972, + "learning_rate": 0.00019592542223091118, + "loss": 1.5049, + "step": 1068 + }, + { + "epoch": 0.5474195817612074, + "grad_norm": 0.0432027168571949, + "learning_rate": 0.00019590147840926577, + "loss": 1.4686, + "step": 1070 + }, + { + "epoch": 0.5484427959327237, + "grad_norm": 0.044036637991666794, + "learning_rate": 0.00019587746591365941, + "loss": 1.5082, + "step": 1072 + }, + { + "epoch": 0.5494660101042399, + "grad_norm": 0.04510560259222984, + "learning_rate": 0.0001958533847612872, + "loss": 1.5213, + "step": 1074 + }, + { + "epoch": 0.5504892242757562, + "grad_norm": 0.04027169942855835, + "learning_rate": 0.00019582923496939337, + "loss": 1.4952, + "step": 1076 + }, + { + "epoch": 0.5515124384472725, + "grad_norm": 0.08312036097049713, + "learning_rate": 0.00019580501655527133, + "loss": 1.512, + "step": 1078 + }, + { + "epoch": 0.5525356526187888, + "grad_norm": 0.04634568840265274, + "learning_rate": 0.00019578072953626357, + "loss": 1.5248, + "step": 1080 + }, + { + "epoch": 0.5535588667903051, + "grad_norm": 0.044149454683065414, + "learning_rate": 0.00019575637392976178, + "loss": 1.4911, + "step": 1082 + }, + { + "epoch": 0.5545820809618213, + "grad_norm": 0.04358943551778793, + "learning_rate": 0.00019573194975320673, + "loss": 1.5427, + "step": 1084 + }, + { + "epoch": 0.5556052951333376, + "grad_norm": 0.038042690604925156, + "learning_rate": 0.0001957074570240883, + "loss": 1.5032, + "step": 1086 + }, + { + "epoch": 0.5566285093048539, + "grad_norm": 0.04171706736087799, + "learning_rate": 0.00019568289575994544, + "loss": 1.493, + "step": 1088 + }, + { + "epoch": 0.5576517234763702, + "grad_norm": 0.04037075862288475, + "learning_rate": 0.0001956582659783662, + "loss": 1.5334, + "step": 1090 + }, + { + "epoch": 0.5586749376478864, + "grad_norm": 0.036902882158756256, + "learning_rate": 0.0001956335676969877, + "loss": 1.5093, + "step": 1092 + }, + { + "epoch": 0.5596981518194027, + "grad_norm": 0.04198329523205757, + "learning_rate": 0.00019560880093349607, + "loss": 1.5069, + "step": 1094 + }, + { + "epoch": 0.5607213659909189, + "grad_norm": 0.034086357802152634, + "learning_rate": 0.0001955839657056265, + "loss": 1.5101, + "step": 1096 + }, + { + "epoch": 0.5617445801624352, + "grad_norm": 0.03502487763762474, + "learning_rate": 0.0001955590620311633, + "loss": 1.5305, + "step": 1098 + }, + { + "epoch": 0.5627677943339515, + "grad_norm": 0.03580254316329956, + "learning_rate": 0.00019553408992793964, + "loss": 1.4984, + "step": 1100 + }, + { + "epoch": 0.5637910085054678, + "grad_norm": 0.0441250242292881, + "learning_rate": 0.00019550904941383773, + "loss": 1.4956, + "step": 1102 + }, + { + "epoch": 0.5648142226769841, + "grad_norm": 0.039550572633743286, + "learning_rate": 0.00019548394050678883, + "loss": 1.5041, + "step": 1104 + }, + { + "epoch": 0.5658374368485003, + "grad_norm": 0.03674033284187317, + "learning_rate": 0.0001954587632247732, + "loss": 1.4694, + "step": 1106 + }, + { + "epoch": 0.5668606510200166, + "grad_norm": 0.03579515963792801, + "learning_rate": 0.00019543351758581994, + "loss": 1.4789, + "step": 1108 + }, + { + "epoch": 0.5678838651915329, + "grad_norm": 0.04077816754579544, + "learning_rate": 0.0001954082036080072, + "loss": 1.5221, + "step": 1110 + }, + { + "epoch": 0.5689070793630492, + "grad_norm": 0.03694437816739082, + "learning_rate": 0.00019538282130946198, + "loss": 1.5273, + "step": 1112 + }, + { + "epoch": 0.5699302935345655, + "grad_norm": 0.03998146578669548, + "learning_rate": 0.00019535737070836028, + "loss": 1.5426, + "step": 1114 + }, + { + "epoch": 0.5709535077060818, + "grad_norm": 0.03823567554354668, + "learning_rate": 0.00019533185182292703, + "loss": 1.5264, + "step": 1116 + }, + { + "epoch": 0.571976721877598, + "grad_norm": 0.03891613706946373, + "learning_rate": 0.000195306264671436, + "loss": 1.5194, + "step": 1118 + }, + { + "epoch": 0.5729999360491143, + "grad_norm": 0.035352472215890884, + "learning_rate": 0.0001952806092722098, + "loss": 1.5049, + "step": 1120 + }, + { + "epoch": 0.5740231502206306, + "grad_norm": 0.03947431594133377, + "learning_rate": 0.00019525488564362003, + "loss": 1.5562, + "step": 1122 + }, + { + "epoch": 0.5750463643921468, + "grad_norm": 0.0398818701505661, + "learning_rate": 0.00019522909380408705, + "loss": 1.5216, + "step": 1124 + }, + { + "epoch": 0.5760695785636631, + "grad_norm": 0.03842191398143768, + "learning_rate": 0.00019520323377208017, + "loss": 1.5461, + "step": 1126 + }, + { + "epoch": 0.5770927927351793, + "grad_norm": 0.03299557417631149, + "learning_rate": 0.00019517730556611738, + "loss": 1.4988, + "step": 1128 + }, + { + "epoch": 0.5781160069066956, + "grad_norm": 0.032452985644340515, + "learning_rate": 0.00019515130920476562, + "loss": 1.4837, + "step": 1130 + }, + { + "epoch": 0.5791392210782119, + "grad_norm": 0.03567085042595863, + "learning_rate": 0.00019512524470664057, + "loss": 1.5081, + "step": 1132 + }, + { + "epoch": 0.5801624352497282, + "grad_norm": 0.04303791746497154, + "learning_rate": 0.00019509911209040676, + "loss": 1.517, + "step": 1134 + }, + { + "epoch": 0.5811856494212445, + "grad_norm": 0.040586575865745544, + "learning_rate": 0.00019507291137477742, + "loss": 1.5494, + "step": 1136 + }, + { + "epoch": 0.5822088635927608, + "grad_norm": 0.038383904844522476, + "learning_rate": 0.0001950466425785146, + "loss": 1.4641, + "step": 1138 + }, + { + "epoch": 0.583232077764277, + "grad_norm": 0.0484977550804615, + "learning_rate": 0.0001950203057204291, + "loss": 1.4838, + "step": 1140 + }, + { + "epoch": 0.5842552919357933, + "grad_norm": 0.03300706669688225, + "learning_rate": 0.00019499390081938046, + "loss": 1.4935, + "step": 1142 + }, + { + "epoch": 0.5852785061073096, + "grad_norm": 0.041923582553863525, + "learning_rate": 0.00019496742789427683, + "loss": 1.484, + "step": 1144 + }, + { + "epoch": 0.5863017202788259, + "grad_norm": 0.04476374387741089, + "learning_rate": 0.00019494088696407532, + "loss": 1.5222, + "step": 1146 + }, + { + "epoch": 0.5873249344503422, + "grad_norm": 0.039443958550691605, + "learning_rate": 0.00019491427804778147, + "loss": 1.4899, + "step": 1148 + }, + { + "epoch": 0.5883481486218584, + "grad_norm": 0.0458071269094944, + "learning_rate": 0.00019488760116444966, + "loss": 1.5006, + "step": 1150 + }, + { + "epoch": 0.5893713627933747, + "grad_norm": 0.04912669211626053, + "learning_rate": 0.00019486085633318293, + "loss": 1.5193, + "step": 1152 + }, + { + "epoch": 0.590394576964891, + "grad_norm": 0.05331273376941681, + "learning_rate": 0.00019483404357313293, + "loss": 1.5115, + "step": 1154 + }, + { + "epoch": 0.5914177911364072, + "grad_norm": 0.04301870986819267, + "learning_rate": 0.00019480716290349995, + "loss": 1.4997, + "step": 1156 + }, + { + "epoch": 0.5924410053079235, + "grad_norm": 0.042690206319093704, + "learning_rate": 0.00019478021434353297, + "loss": 1.5014, + "step": 1158 + }, + { + "epoch": 0.5934642194794398, + "grad_norm": 0.045416899025440216, + "learning_rate": 0.00019475319791252956, + "loss": 1.5287, + "step": 1160 + }, + { + "epoch": 0.594487433650956, + "grad_norm": 0.04627612978219986, + "learning_rate": 0.0001947261136298358, + "loss": 1.5238, + "step": 1162 + }, + { + "epoch": 0.5955106478224723, + "grad_norm": 0.0443304218351841, + "learning_rate": 0.00019469896151484654, + "loss": 1.4956, + "step": 1164 + }, + { + "epoch": 0.5965338619939886, + "grad_norm": 0.042293716222047806, + "learning_rate": 0.00019467174158700504, + "loss": 1.4962, + "step": 1166 + }, + { + "epoch": 0.5975570761655049, + "grad_norm": 0.035955190658569336, + "learning_rate": 0.0001946444538658032, + "loss": 1.4799, + "step": 1168 + }, + { + "epoch": 0.5985802903370212, + "grad_norm": 0.04025396704673767, + "learning_rate": 0.00019461709837078145, + "loss": 1.489, + "step": 1170 + }, + { + "epoch": 0.5996035045085374, + "grad_norm": 0.057371869683265686, + "learning_rate": 0.0001945896751215287, + "loss": 1.4872, + "step": 1172 + }, + { + "epoch": 0.6006267186800537, + "grad_norm": 0.05806579813361168, + "learning_rate": 0.0001945621841376825, + "loss": 1.5153, + "step": 1174 + }, + { + "epoch": 0.60164993285157, + "grad_norm": 0.03980225697159767, + "learning_rate": 0.00019453462543892882, + "loss": 1.5093, + "step": 1176 + }, + { + "epoch": 0.6026731470230863, + "grad_norm": 0.041456956416368484, + "learning_rate": 0.0001945069990450021, + "loss": 1.5115, + "step": 1178 + }, + { + "epoch": 0.6036963611946026, + "grad_norm": 0.03392681106925011, + "learning_rate": 0.00019447930497568528, + "loss": 1.4863, + "step": 1180 + }, + { + "epoch": 0.6047195753661189, + "grad_norm": 0.03312285616993904, + "learning_rate": 0.0001944515432508098, + "loss": 1.5321, + "step": 1182 + }, + { + "epoch": 0.605742789537635, + "grad_norm": 0.03741718456149101, + "learning_rate": 0.00019442371389025552, + "loss": 1.4874, + "step": 1184 + }, + { + "epoch": 0.6067660037091513, + "grad_norm": 0.03954221308231354, + "learning_rate": 0.00019439581691395067, + "loss": 1.5014, + "step": 1186 + }, + { + "epoch": 0.6077892178806676, + "grad_norm": 0.03756248950958252, + "learning_rate": 0.00019436785234187205, + "loss": 1.522, + "step": 1188 + }, + { + "epoch": 0.6088124320521839, + "grad_norm": 0.03895876556634903, + "learning_rate": 0.00019433982019404473, + "loss": 1.5546, + "step": 1190 + }, + { + "epoch": 0.6098356462237002, + "grad_norm": 0.038288913667201996, + "learning_rate": 0.0001943117204905422, + "loss": 1.4859, + "step": 1192 + }, + { + "epoch": 0.6108588603952164, + "grad_norm": 0.034622881561517715, + "learning_rate": 0.00019428355325148633, + "loss": 1.5246, + "step": 1194 + }, + { + "epoch": 0.6118820745667327, + "grad_norm": 0.04585454985499382, + "learning_rate": 0.0001942553184970474, + "loss": 1.5001, + "step": 1196 + }, + { + "epoch": 0.612905288738249, + "grad_norm": 0.03685140982270241, + "learning_rate": 0.00019422701624744395, + "loss": 1.5114, + "step": 1198 + }, + { + "epoch": 0.6139285029097653, + "grad_norm": 0.033848248422145844, + "learning_rate": 0.00019419864652294296, + "loss": 1.5047, + "step": 1200 + }, + { + "epoch": 0.6149517170812816, + "grad_norm": 0.03485368937253952, + "learning_rate": 0.00019417020934385962, + "loss": 1.5412, + "step": 1202 + }, + { + "epoch": 0.6159749312527979, + "grad_norm": 0.03737105429172516, + "learning_rate": 0.00019414170473055746, + "loss": 1.5014, + "step": 1204 + }, + { + "epoch": 0.6169981454243141, + "grad_norm": 0.0417652502655983, + "learning_rate": 0.00019411313270344837, + "loss": 1.4963, + "step": 1206 + }, + { + "epoch": 0.6180213595958304, + "grad_norm": 0.037758734077215195, + "learning_rate": 0.0001940844932829924, + "loss": 1.4935, + "step": 1208 + }, + { + "epoch": 0.6190445737673467, + "grad_norm": 0.03808191418647766, + "learning_rate": 0.00019405578648969796, + "loss": 1.5181, + "step": 1210 + }, + { + "epoch": 0.620067787938863, + "grad_norm": 0.03454340249300003, + "learning_rate": 0.00019402701234412162, + "loss": 1.493, + "step": 1212 + }, + { + "epoch": 0.6210910021103793, + "grad_norm": 0.03708413615822792, + "learning_rate": 0.00019399817086686826, + "loss": 1.4987, + "step": 1214 + }, + { + "epoch": 0.6221142162818954, + "grad_norm": 0.046957071870565414, + "learning_rate": 0.00019396926207859084, + "loss": 1.473, + "step": 1216 + }, + { + "epoch": 0.6231374304534117, + "grad_norm": 0.03893362358212471, + "learning_rate": 0.00019394028599999073, + "loss": 1.4915, + "step": 1218 + }, + { + "epoch": 0.624160644624928, + "grad_norm": 0.04247049614787102, + "learning_rate": 0.0001939112426518173, + "loss": 1.5384, + "step": 1220 + }, + { + "epoch": 0.6251838587964443, + "grad_norm": 0.036440882831811905, + "learning_rate": 0.00019388213205486822, + "loss": 1.5124, + "step": 1222 + }, + { + "epoch": 0.6262070729679606, + "grad_norm": 0.037374429404735565, + "learning_rate": 0.00019385295422998921, + "loss": 1.5244, + "step": 1224 + }, + { + "epoch": 0.6272302871394769, + "grad_norm": 0.0383899062871933, + "learning_rate": 0.00019382370919807419, + "loss": 1.5078, + "step": 1226 + }, + { + "epoch": 0.6282535013109931, + "grad_norm": 0.03726350888609886, + "learning_rate": 0.0001937943969800652, + "loss": 1.4968, + "step": 1228 + }, + { + "epoch": 0.6292767154825094, + "grad_norm": 0.037606336176395416, + "learning_rate": 0.0001937650175969524, + "loss": 1.4735, + "step": 1230 + }, + { + "epoch": 0.6302999296540257, + "grad_norm": 0.03583415970206261, + "learning_rate": 0.000193735571069774, + "loss": 1.4872, + "step": 1232 + }, + { + "epoch": 0.631323143825542, + "grad_norm": 0.029802750796079636, + "learning_rate": 0.00019370605741961635, + "loss": 1.5037, + "step": 1234 + }, + { + "epoch": 0.6323463579970583, + "grad_norm": 0.037094760686159134, + "learning_rate": 0.00019367647666761385, + "loss": 1.518, + "step": 1236 + }, + { + "epoch": 0.6333695721685745, + "grad_norm": 0.03802032023668289, + "learning_rate": 0.00019364682883494893, + "loss": 1.4997, + "step": 1238 + }, + { + "epoch": 0.6343927863400908, + "grad_norm": 0.03934174031019211, + "learning_rate": 0.00019361711394285202, + "loss": 1.5033, + "step": 1240 + }, + { + "epoch": 0.6354160005116071, + "grad_norm": 0.03484318405389786, + "learning_rate": 0.00019358733201260169, + "loss": 1.5068, + "step": 1242 + }, + { + "epoch": 0.6364392146831234, + "grad_norm": 0.03633354604244232, + "learning_rate": 0.00019355748306552442, + "loss": 1.5462, + "step": 1244 + }, + { + "epoch": 0.6374624288546397, + "grad_norm": 0.05548425391316414, + "learning_rate": 0.00019352756712299468, + "loss": 1.5036, + "step": 1246 + }, + { + "epoch": 0.638485643026156, + "grad_norm": 0.032225679606199265, + "learning_rate": 0.00019349758420643493, + "loss": 1.5026, + "step": 1248 + }, + { + "epoch": 0.6395088571976721, + "grad_norm": 0.03236972540616989, + "learning_rate": 0.00019346753433731564, + "loss": 1.5199, + "step": 1250 + }, + { + "epoch": 0.6405320713691884, + "grad_norm": 0.03576046973466873, + "learning_rate": 0.00019343741753715516, + "loss": 1.5146, + "step": 1252 + }, + { + "epoch": 0.6415552855407047, + "grad_norm": 0.04308708757162094, + "learning_rate": 0.00019340723382751978, + "loss": 1.5, + "step": 1254 + }, + { + "epoch": 0.642578499712221, + "grad_norm": 0.035895735025405884, + "learning_rate": 0.0001933769832300237, + "loss": 1.5043, + "step": 1256 + }, + { + "epoch": 0.6436017138837373, + "grad_norm": 0.03789574280381203, + "learning_rate": 0.00019334666576632906, + "loss": 1.4935, + "step": 1258 + }, + { + "epoch": 0.6446249280552535, + "grad_norm": 0.03609545901417732, + "learning_rate": 0.00019331628145814587, + "loss": 1.5296, + "step": 1260 + }, + { + "epoch": 0.6456481422267698, + "grad_norm": 0.0432671383023262, + "learning_rate": 0.00019328583032723193, + "loss": 1.5045, + "step": 1262 + }, + { + "epoch": 0.6466713563982861, + "grad_norm": 0.038937125355005264, + "learning_rate": 0.000193255312395393, + "loss": 1.4801, + "step": 1264 + }, + { + "epoch": 0.6476945705698024, + "grad_norm": 0.03925538435578346, + "learning_rate": 0.00019322472768448258, + "loss": 1.4903, + "step": 1266 + }, + { + "epoch": 0.6487177847413187, + "grad_norm": 0.03581652417778969, + "learning_rate": 0.00019319407621640208, + "loss": 1.471, + "step": 1268 + }, + { + "epoch": 0.649740998912835, + "grad_norm": 0.03643723577260971, + "learning_rate": 0.00019316335801310063, + "loss": 1.5019, + "step": 1270 + }, + { + "epoch": 0.6507642130843512, + "grad_norm": 0.03839946910738945, + "learning_rate": 0.0001931325730965752, + "loss": 1.5148, + "step": 1272 + }, + { + "epoch": 0.6517874272558675, + "grad_norm": 0.04306597262620926, + "learning_rate": 0.00019310172148887054, + "loss": 1.472, + "step": 1274 + }, + { + "epoch": 0.6528106414273838, + "grad_norm": 0.069839708507061, + "learning_rate": 0.00019307080321207912, + "loss": 1.521, + "step": 1276 + }, + { + "epoch": 0.6538338555989001, + "grad_norm": 0.05618079751729965, + "learning_rate": 0.00019303981828834113, + "loss": 1.5019, + "step": 1278 + }, + { + "epoch": 0.6548570697704164, + "grad_norm": 0.04359296336770058, + "learning_rate": 0.00019300876673984462, + "loss": 1.4676, + "step": 1280 + }, + { + "epoch": 0.6558802839419325, + "grad_norm": 0.038589805364608765, + "learning_rate": 0.00019297764858882514, + "loss": 1.4791, + "step": 1282 + }, + { + "epoch": 0.6569034981134488, + "grad_norm": 0.0316338986158371, + "learning_rate": 0.00019294646385756612, + "loss": 1.4824, + "step": 1284 + }, + { + "epoch": 0.6579267122849651, + "grad_norm": 0.03457920625805855, + "learning_rate": 0.00019291521256839858, + "loss": 1.4946, + "step": 1286 + }, + { + "epoch": 0.6589499264564814, + "grad_norm": 0.04637923464179039, + "learning_rate": 0.00019288389474370117, + "loss": 1.5049, + "step": 1288 + }, + { + "epoch": 0.6599731406279977, + "grad_norm": 0.05314064025878906, + "learning_rate": 0.0001928525104059003, + "loss": 1.5021, + "step": 1290 + }, + { + "epoch": 0.660996354799514, + "grad_norm": 0.041335079818964005, + "learning_rate": 0.00019282105957746986, + "loss": 1.4869, + "step": 1292 + }, + { + "epoch": 0.6620195689710302, + "grad_norm": 0.040912263095378876, + "learning_rate": 0.00019278954228093146, + "loss": 1.5168, + "step": 1294 + }, + { + "epoch": 0.6630427831425465, + "grad_norm": 0.037110935896635056, + "learning_rate": 0.00019275795853885433, + "loss": 1.4973, + "step": 1296 + }, + { + "epoch": 0.6640659973140628, + "grad_norm": 0.035204846411943436, + "learning_rate": 0.00019272630837385518, + "loss": 1.5062, + "step": 1298 + }, + { + "epoch": 0.6650892114855791, + "grad_norm": 0.0464470274746418, + "learning_rate": 0.0001926945918085983, + "loss": 1.5412, + "step": 1300 + }, + { + "epoch": 0.6661124256570954, + "grad_norm": 0.033444374799728394, + "learning_rate": 0.00019266280886579565, + "loss": 1.4799, + "step": 1302 + }, + { + "epoch": 0.6671356398286117, + "grad_norm": 0.036789704114198685, + "learning_rate": 0.0001926309595682066, + "loss": 1.5604, + "step": 1304 + }, + { + "epoch": 0.6681588540001279, + "grad_norm": 0.03726235032081604, + "learning_rate": 0.00019259904393863802, + "loss": 1.5054, + "step": 1306 + }, + { + "epoch": 0.6691820681716442, + "grad_norm": 0.03499661013484001, + "learning_rate": 0.00019256706199994442, + "loss": 1.5039, + "step": 1308 + }, + { + "epoch": 0.6702052823431605, + "grad_norm": 0.037414226680994034, + "learning_rate": 0.00019253501377502764, + "loss": 1.4952, + "step": 1310 + }, + { + "epoch": 0.6712284965146768, + "grad_norm": 0.041186489164829254, + "learning_rate": 0.00019250289928683705, + "loss": 1.519, + "step": 1312 + }, + { + "epoch": 0.672251710686193, + "grad_norm": 0.050159044563770294, + "learning_rate": 0.0001924707185583695, + "loss": 1.5112, + "step": 1314 + }, + { + "epoch": 0.6732749248577092, + "grad_norm": 0.05124843865633011, + "learning_rate": 0.0001924384716126692, + "loss": 1.4897, + "step": 1316 + }, + { + "epoch": 0.6742981390292255, + "grad_norm": 0.03580416738986969, + "learning_rate": 0.00019240615847282788, + "loss": 1.4739, + "step": 1318 + }, + { + "epoch": 0.6753213532007418, + "grad_norm": 0.03572642430663109, + "learning_rate": 0.00019237377916198458, + "loss": 1.4735, + "step": 1320 + }, + { + "epoch": 0.6763445673722581, + "grad_norm": 0.04381095990538597, + "learning_rate": 0.00019234133370332578, + "loss": 1.4817, + "step": 1322 + }, + { + "epoch": 0.6773677815437744, + "grad_norm": 0.03948042169213295, + "learning_rate": 0.00019230882212008528, + "loss": 1.5288, + "step": 1324 + }, + { + "epoch": 0.6783909957152907, + "grad_norm": 0.04092205688357353, + "learning_rate": 0.00019227624443554425, + "loss": 1.503, + "step": 1326 + }, + { + "epoch": 0.6794142098868069, + "grad_norm": 0.0372740812599659, + "learning_rate": 0.0001922436006730312, + "loss": 1.5186, + "step": 1328 + }, + { + "epoch": 0.6804374240583232, + "grad_norm": 0.03410439193248749, + "learning_rate": 0.00019221089085592202, + "loss": 1.5104, + "step": 1330 + }, + { + "epoch": 0.6814606382298395, + "grad_norm": 0.04406609386205673, + "learning_rate": 0.00019217811500763977, + "loss": 1.497, + "step": 1332 + }, + { + "epoch": 0.6824838524013558, + "grad_norm": 0.04020300507545471, + "learning_rate": 0.00019214527315165487, + "loss": 1.4589, + "step": 1334 + }, + { + "epoch": 0.6835070665728721, + "grad_norm": 0.03552987799048424, + "learning_rate": 0.000192112365311485, + "loss": 1.4938, + "step": 1336 + }, + { + "epoch": 0.6845302807443883, + "grad_norm": 0.035595186054706573, + "learning_rate": 0.00019207939151069515, + "loss": 1.4664, + "step": 1338 + }, + { + "epoch": 0.6855534949159046, + "grad_norm": 0.030798960477113724, + "learning_rate": 0.00019204635177289743, + "loss": 1.4786, + "step": 1340 + }, + { + "epoch": 0.6865767090874209, + "grad_norm": 0.03413120657205582, + "learning_rate": 0.00019201324612175123, + "loss": 1.5409, + "step": 1342 + }, + { + "epoch": 0.6875999232589372, + "grad_norm": 0.03786253184080124, + "learning_rate": 0.0001919800745809631, + "loss": 1.4725, + "step": 1344 + }, + { + "epoch": 0.6886231374304534, + "grad_norm": 0.0414445661008358, + "learning_rate": 0.00019194683717428687, + "loss": 1.4993, + "step": 1346 + }, + { + "epoch": 0.6896463516019697, + "grad_norm": 0.0378003790974617, + "learning_rate": 0.00019191353392552344, + "loss": 1.5225, + "step": 1348 + }, + { + "epoch": 0.6906695657734859, + "grad_norm": 0.0343095101416111, + "learning_rate": 0.0001918801648585209, + "loss": 1.4671, + "step": 1350 + }, + { + "epoch": 0.6916927799450022, + "grad_norm": 0.03458075597882271, + "learning_rate": 0.0001918467299971744, + "loss": 1.4843, + "step": 1352 + }, + { + "epoch": 0.6927159941165185, + "grad_norm": 0.03243357688188553, + "learning_rate": 0.00019181322936542635, + "loss": 1.494, + "step": 1354 + }, + { + "epoch": 0.6937392082880348, + "grad_norm": 0.03002413548529148, + "learning_rate": 0.00019177966298726613, + "loss": 1.5046, + "step": 1356 + }, + { + "epoch": 0.6947624224595511, + "grad_norm": 0.031211066991090775, + "learning_rate": 0.00019174603088673026, + "loss": 1.4664, + "step": 1358 + }, + { + "epoch": 0.6957856366310673, + "grad_norm": 0.03740109130740166, + "learning_rate": 0.00019171233308790225, + "loss": 1.4394, + "step": 1360 + }, + { + "epoch": 0.6968088508025836, + "grad_norm": 0.03566642478108406, + "learning_rate": 0.0001916785696149128, + "loss": 1.4935, + "step": 1362 + }, + { + "epoch": 0.6978320649740999, + "grad_norm": 0.033135462552309036, + "learning_rate": 0.00019164474049193948, + "loss": 1.5171, + "step": 1364 + }, + { + "epoch": 0.6988552791456162, + "grad_norm": 0.03240213543176651, + "learning_rate": 0.00019161084574320696, + "loss": 1.4644, + "step": 1366 + }, + { + "epoch": 0.6998784933171325, + "grad_norm": 0.0337255634367466, + "learning_rate": 0.0001915768853929869, + "loss": 1.4739, + "step": 1368 + }, + { + "epoch": 0.7009017074886488, + "grad_norm": 0.033216070383787155, + "learning_rate": 0.00019154285946559792, + "loss": 1.4691, + "step": 1370 + }, + { + "epoch": 0.701924921660165, + "grad_norm": 0.03151748329401016, + "learning_rate": 0.0001915087679854056, + "loss": 1.4882, + "step": 1372 + }, + { + "epoch": 0.7029481358316813, + "grad_norm": 0.03065643645823002, + "learning_rate": 0.00019147461097682246, + "loss": 1.4608, + "step": 1374 + }, + { + "epoch": 0.7039713500031975, + "grad_norm": 0.0341670848429203, + "learning_rate": 0.0001914403884643079, + "loss": 1.4714, + "step": 1376 + }, + { + "epoch": 0.7049945641747138, + "grad_norm": 0.035825930535793304, + "learning_rate": 0.00019140610047236833, + "loss": 1.4752, + "step": 1378 + }, + { + "epoch": 0.7060177783462301, + "grad_norm": 0.042743559926748276, + "learning_rate": 0.00019137174702555697, + "loss": 1.5077, + "step": 1380 + }, + { + "epoch": 0.7070409925177463, + "grad_norm": 0.03980020061135292, + "learning_rate": 0.00019133732814847397, + "loss": 1.4813, + "step": 1382 + }, + { + "epoch": 0.7080642066892626, + "grad_norm": 0.03854946047067642, + "learning_rate": 0.00019130284386576624, + "loss": 1.4623, + "step": 1384 + }, + { + "epoch": 0.7090874208607789, + "grad_norm": 0.037254948168992996, + "learning_rate": 0.00019126829420212764, + "loss": 1.5247, + "step": 1386 + }, + { + "epoch": 0.7101106350322952, + "grad_norm": 0.047802574932575226, + "learning_rate": 0.00019123367918229874, + "loss": 1.4989, + "step": 1388 + }, + { + "epoch": 0.7111338492038115, + "grad_norm": 0.039889827370643616, + "learning_rate": 0.000191198998831067, + "loss": 1.4727, + "step": 1390 + }, + { + "epoch": 0.7121570633753278, + "grad_norm": 0.03746683895587921, + "learning_rate": 0.0001911642531732666, + "loss": 1.4929, + "step": 1392 + }, + { + "epoch": 0.713180277546844, + "grad_norm": 0.04323015734553337, + "learning_rate": 0.00019112944223377855, + "loss": 1.4989, + "step": 1394 + }, + { + "epoch": 0.7142034917183603, + "grad_norm": 0.04086681455373764, + "learning_rate": 0.0001910945660375305, + "loss": 1.4884, + "step": 1396 + }, + { + "epoch": 0.7152267058898766, + "grad_norm": 0.03528650477528572, + "learning_rate": 0.00019105962460949698, + "loss": 1.4932, + "step": 1398 + }, + { + "epoch": 0.7162499200613929, + "grad_norm": 0.041061852127313614, + "learning_rate": 0.00019102461797469912, + "loss": 1.5063, + "step": 1400 + }, + { + "epoch": 0.7172731342329092, + "grad_norm": 0.033481474965810776, + "learning_rate": 0.00019098954615820476, + "loss": 1.4825, + "step": 1402 + }, + { + "epoch": 0.7182963484044254, + "grad_norm": 0.03925000876188278, + "learning_rate": 0.00019095440918512842, + "loss": 1.513, + "step": 1404 + }, + { + "epoch": 0.7193195625759417, + "grad_norm": 0.03856325149536133, + "learning_rate": 0.0001909192070806313, + "loss": 1.4907, + "step": 1406 + }, + { + "epoch": 0.720342776747458, + "grad_norm": 0.03494630753993988, + "learning_rate": 0.00019088393986992124, + "loss": 1.4604, + "step": 1408 + }, + { + "epoch": 0.7213659909189742, + "grad_norm": 0.03931909799575806, + "learning_rate": 0.00019084860757825268, + "loss": 1.4905, + "step": 1410 + }, + { + "epoch": 0.7223892050904905, + "grad_norm": 0.03644140437245369, + "learning_rate": 0.00019081321023092668, + "loss": 1.49, + "step": 1412 + }, + { + "epoch": 0.7234124192620068, + "grad_norm": 0.03480161353945732, + "learning_rate": 0.00019077774785329087, + "loss": 1.5301, + "step": 1414 + }, + { + "epoch": 0.724435633433523, + "grad_norm": 0.03516329079866409, + "learning_rate": 0.00019074222047073947, + "loss": 1.4801, + "step": 1416 + }, + { + "epoch": 0.7254588476050393, + "grad_norm": 0.03371971845626831, + "learning_rate": 0.00019070662810871322, + "loss": 1.4724, + "step": 1418 + }, + { + "epoch": 0.7264820617765556, + "grad_norm": 0.034337956458330154, + "learning_rate": 0.00019067097079269942, + "loss": 1.4726, + "step": 1420 + }, + { + "epoch": 0.7275052759480719, + "grad_norm": 0.0360429473221302, + "learning_rate": 0.00019063524854823186, + "loss": 1.4856, + "step": 1422 + }, + { + "epoch": 0.7285284901195882, + "grad_norm": 0.03850055858492851, + "learning_rate": 0.0001905994614008908, + "loss": 1.5022, + "step": 1424 + }, + { + "epoch": 0.7295517042911044, + "grad_norm": 0.03869333118200302, + "learning_rate": 0.0001905636093763031, + "loss": 1.4949, + "step": 1426 + }, + { + "epoch": 0.7305749184626207, + "grad_norm": 0.03506360575556755, + "learning_rate": 0.0001905276925001419, + "loss": 1.4617, + "step": 1428 + }, + { + "epoch": 0.731598132634137, + "grad_norm": 0.033819831907749176, + "learning_rate": 0.00019049171079812692, + "loss": 1.4698, + "step": 1430 + }, + { + "epoch": 0.7326213468056533, + "grad_norm": 0.03606401011347771, + "learning_rate": 0.00019045566429602424, + "loss": 1.5038, + "step": 1432 + }, + { + "epoch": 0.7336445609771696, + "grad_norm": 0.04196172207593918, + "learning_rate": 0.00019041955301964632, + "loss": 1.5142, + "step": 1434 + }, + { + "epoch": 0.7346677751486859, + "grad_norm": 0.03859662637114525, + "learning_rate": 0.00019038337699485208, + "loss": 1.5072, + "step": 1436 + }, + { + "epoch": 0.735690989320202, + "grad_norm": 0.036224085837602615, + "learning_rate": 0.00019034713624754672, + "loss": 1.5033, + "step": 1438 + }, + { + "epoch": 0.7367142034917183, + "grad_norm": 0.04655170813202858, + "learning_rate": 0.00019031083080368183, + "loss": 1.5255, + "step": 1440 + }, + { + "epoch": 0.7377374176632346, + "grad_norm": 0.040406614542007446, + "learning_rate": 0.0001902744606892554, + "loss": 1.5199, + "step": 1442 + }, + { + "epoch": 0.7387606318347509, + "grad_norm": 0.03488042950630188, + "learning_rate": 0.00019023802593031154, + "loss": 1.5127, + "step": 1444 + }, + { + "epoch": 0.7397838460062672, + "grad_norm": 0.031517501920461655, + "learning_rate": 0.00019020152655294085, + "loss": 1.4726, + "step": 1446 + }, + { + "epoch": 0.7408070601777834, + "grad_norm": 0.0331415981054306, + "learning_rate": 0.0001901649625832801, + "loss": 1.473, + "step": 1448 + }, + { + "epoch": 0.7418302743492997, + "grad_norm": 0.03110121190547943, + "learning_rate": 0.00019012833404751235, + "loss": 1.4693, + "step": 1450 + }, + { + "epoch": 0.742853488520816, + "grad_norm": 0.03500855341553688, + "learning_rate": 0.00019009164097186684, + "loss": 1.4962, + "step": 1452 + }, + { + "epoch": 0.7438767026923323, + "grad_norm": 0.03449893742799759, + "learning_rate": 0.0001900548833826191, + "loss": 1.4938, + "step": 1454 + }, + { + "epoch": 0.7448999168638486, + "grad_norm": 0.03199852257966995, + "learning_rate": 0.0001900180613060908, + "loss": 1.4905, + "step": 1456 + }, + { + "epoch": 0.7459231310353649, + "grad_norm": 0.03547672927379608, + "learning_rate": 0.00018998117476864984, + "loss": 1.4495, + "step": 1458 + }, + { + "epoch": 0.7469463452068811, + "grad_norm": 0.03338061273097992, + "learning_rate": 0.00018994422379671016, + "loss": 1.4895, + "step": 1460 + }, + { + "epoch": 0.7479695593783974, + "grad_norm": 0.036238085478544235, + "learning_rate": 0.00018990720841673207, + "loss": 1.5382, + "step": 1462 + }, + { + "epoch": 0.7489927735499137, + "grad_norm": 0.03941986709833145, + "learning_rate": 0.0001898701286552218, + "loss": 1.4917, + "step": 1464 + }, + { + "epoch": 0.75001598772143, + "grad_norm": 0.03612781688570976, + "learning_rate": 0.0001898329845387317, + "loss": 1.4856, + "step": 1466 + }, + { + "epoch": 0.7510392018929463, + "grad_norm": 0.035338182002305984, + "learning_rate": 0.00018979577609386033, + "loss": 1.4787, + "step": 1468 + }, + { + "epoch": 0.7520624160644624, + "grad_norm": 0.035387344658374786, + "learning_rate": 0.0001897585033472522, + "loss": 1.489, + "step": 1470 + }, + { + "epoch": 0.7530856302359787, + "grad_norm": 0.033865489065647125, + "learning_rate": 0.00018972116632559786, + "loss": 1.4958, + "step": 1472 + }, + { + "epoch": 0.754108844407495, + "grad_norm": 0.03240435943007469, + "learning_rate": 0.000189683765055634, + "loss": 1.48, + "step": 1474 + }, + { + "epoch": 0.7551320585790113, + "grad_norm": 0.0325872041285038, + "learning_rate": 0.0001896462995641432, + "loss": 1.4685, + "step": 1476 + }, + { + "epoch": 0.7561552727505276, + "grad_norm": 0.030261578038334846, + "learning_rate": 0.00018960876987795413, + "loss": 1.4985, + "step": 1478 + }, + { + "epoch": 0.7571784869220439, + "grad_norm": 0.034684158861637115, + "learning_rate": 0.0001895711760239413, + "loss": 1.4869, + "step": 1480 + }, + { + "epoch": 0.7582017010935601, + "grad_norm": 0.03360000252723694, + "learning_rate": 0.00018953351802902525, + "loss": 1.5089, + "step": 1482 + }, + { + "epoch": 0.7592249152650764, + "grad_norm": 0.03356654942035675, + "learning_rate": 0.0001894957959201725, + "loss": 1.5119, + "step": 1484 + }, + { + "epoch": 0.7602481294365927, + "grad_norm": 0.035596925765275955, + "learning_rate": 0.00018945800972439538, + "loss": 1.5242, + "step": 1486 + }, + { + "epoch": 0.761271343608109, + "grad_norm": 0.03309349715709686, + "learning_rate": 0.00018942015946875215, + "loss": 1.519, + "step": 1488 + }, + { + "epoch": 0.7622945577796253, + "grad_norm": 0.03727027401328087, + "learning_rate": 0.00018938224518034698, + "loss": 1.4651, + "step": 1490 + }, + { + "epoch": 0.7633177719511415, + "grad_norm": 0.03802427276968956, + "learning_rate": 0.00018934426688632986, + "loss": 1.4584, + "step": 1492 + }, + { + "epoch": 0.7643409861226578, + "grad_norm": 0.03257981687784195, + "learning_rate": 0.00018930622461389655, + "loss": 1.4622, + "step": 1494 + }, + { + "epoch": 0.7653642002941741, + "grad_norm": 0.03339976444840431, + "learning_rate": 0.00018926811839028876, + "loss": 1.4486, + "step": 1496 + }, + { + "epoch": 0.7663874144656904, + "grad_norm": 0.03176839277148247, + "learning_rate": 0.00018922994824279395, + "loss": 1.478, + "step": 1498 + }, + { + "epoch": 0.7674106286372067, + "grad_norm": 0.03458357974886894, + "learning_rate": 0.00018919171419874524, + "loss": 1.5167, + "step": 1500 + }, + { + "epoch": 0.768433842808723, + "grad_norm": 0.037736013531684875, + "learning_rate": 0.00018915341628552166, + "loss": 1.5323, + "step": 1502 + }, + { + "epoch": 0.7694570569802391, + "grad_norm": 0.03360259160399437, + "learning_rate": 0.00018911505453054786, + "loss": 1.469, + "step": 1504 + }, + { + "epoch": 0.7704802711517554, + "grad_norm": 0.03466862440109253, + "learning_rate": 0.00018907662896129433, + "loss": 1.5173, + "step": 1506 + }, + { + "epoch": 0.7715034853232717, + "grad_norm": 0.036147862672805786, + "learning_rate": 0.00018903813960527714, + "loss": 1.4801, + "step": 1508 + }, + { + "epoch": 0.772526699494788, + "grad_norm": 0.03919236734509468, + "learning_rate": 0.0001889995864900581, + "loss": 1.479, + "step": 1510 + }, + { + "epoch": 0.7735499136663043, + "grad_norm": 0.03543972223997116, + "learning_rate": 0.0001889609696432446, + "loss": 1.4771, + "step": 1512 + }, + { + "epoch": 0.7745731278378205, + "grad_norm": 0.04238108918070793, + "learning_rate": 0.00018892228909248978, + "loss": 1.4936, + "step": 1514 + }, + { + "epoch": 0.7755963420093368, + "grad_norm": 0.035696953535079956, + "learning_rate": 0.00018888354486549237, + "loss": 1.49, + "step": 1516 + }, + { + "epoch": 0.7766195561808531, + "grad_norm": 0.04000556096434593, + "learning_rate": 0.00018884473698999661, + "loss": 1.5206, + "step": 1518 + }, + { + "epoch": 0.7776427703523694, + "grad_norm": 0.06562638282775879, + "learning_rate": 0.0001888058654937924, + "loss": 1.4672, + "step": 1520 + }, + { + "epoch": 0.7786659845238857, + "grad_norm": 0.03467231243848801, + "learning_rate": 0.00018876693040471517, + "loss": 1.5033, + "step": 1522 + }, + { + "epoch": 0.779689198695402, + "grad_norm": 0.03708554431796074, + "learning_rate": 0.00018872793175064593, + "loss": 1.4606, + "step": 1524 + }, + { + "epoch": 0.7807124128669182, + "grad_norm": 0.039738163352012634, + "learning_rate": 0.00018868886955951115, + "loss": 1.4506, + "step": 1526 + }, + { + "epoch": 0.7817356270384345, + "grad_norm": 0.036794066429138184, + "learning_rate": 0.00018864974385928283, + "loss": 1.516, + "step": 1528 + }, + { + "epoch": 0.7827588412099508, + "grad_norm": 0.037196848541498184, + "learning_rate": 0.0001886105546779784, + "loss": 1.5051, + "step": 1530 + }, + { + "epoch": 0.7837820553814671, + "grad_norm": 0.03867275267839432, + "learning_rate": 0.00018857130204366084, + "loss": 1.5015, + "step": 1532 + }, + { + "epoch": 0.7848052695529834, + "grad_norm": 0.03784462809562683, + "learning_rate": 0.00018853198598443852, + "loss": 1.4713, + "step": 1534 + }, + { + "epoch": 0.7858284837244995, + "grad_norm": 0.04151632636785507, + "learning_rate": 0.00018849260652846519, + "loss": 1.4671, + "step": 1536 + }, + { + "epoch": 0.7868516978960158, + "grad_norm": 0.04655742272734642, + "learning_rate": 0.00018845316370394005, + "loss": 1.4751, + "step": 1538 + }, + { + "epoch": 0.7878749120675321, + "grad_norm": 0.037444863468408585, + "learning_rate": 0.00018841365753910765, + "loss": 1.5155, + "step": 1540 + }, + { + "epoch": 0.7888981262390484, + "grad_norm": 0.04184754192829132, + "learning_rate": 0.0001883740880622579, + "loss": 1.4717, + "step": 1542 + }, + { + "epoch": 0.7899213404105647, + "grad_norm": 0.042664580047130585, + "learning_rate": 0.00018833445530172605, + "loss": 1.5221, + "step": 1544 + }, + { + "epoch": 0.790944554582081, + "grad_norm": 0.05149197578430176, + "learning_rate": 0.00018829475928589271, + "loss": 1.4861, + "step": 1546 + }, + { + "epoch": 0.7919677687535972, + "grad_norm": 0.04174793139100075, + "learning_rate": 0.0001882550000431837, + "loss": 1.4887, + "step": 1548 + }, + { + "epoch": 0.7929909829251135, + "grad_norm": 0.03560099005699158, + "learning_rate": 0.0001882151776020702, + "loss": 1.5099, + "step": 1550 + }, + { + "epoch": 0.7940141970966298, + "grad_norm": 0.049874622374773026, + "learning_rate": 0.0001881752919910686, + "loss": 1.4835, + "step": 1552 + }, + { + "epoch": 0.7950374112681461, + "grad_norm": 0.04354040324687958, + "learning_rate": 0.0001881353432387405, + "loss": 1.4778, + "step": 1554 + }, + { + "epoch": 0.7960606254396624, + "grad_norm": 0.04164579510688782, + "learning_rate": 0.0001880953313736928, + "loss": 1.4968, + "step": 1556 + }, + { + "epoch": 0.7970838396111786, + "grad_norm": 0.034870538860559464, + "learning_rate": 0.0001880552564245775, + "loss": 1.4628, + "step": 1558 + }, + { + "epoch": 0.7981070537826949, + "grad_norm": 0.034135766327381134, + "learning_rate": 0.00018801511842009183, + "loss": 1.4836, + "step": 1560 + }, + { + "epoch": 0.7991302679542112, + "grad_norm": 0.03587375581264496, + "learning_rate": 0.00018797491738897816, + "loss": 1.4636, + "step": 1562 + }, + { + "epoch": 0.8001534821257275, + "grad_norm": 0.03559894114732742, + "learning_rate": 0.000187934653360024, + "loss": 1.4874, + "step": 1564 + }, + { + "epoch": 0.8011766962972438, + "grad_norm": 0.05410682037472725, + "learning_rate": 0.00018789432636206197, + "loss": 1.4701, + "step": 1566 + }, + { + "epoch": 0.80219991046876, + "grad_norm": 0.046682942658662796, + "learning_rate": 0.00018785393642396976, + "loss": 1.4993, + "step": 1568 + }, + { + "epoch": 0.8032231246402762, + "grad_norm": 0.03647172451019287, + "learning_rate": 0.00018781348357467013, + "loss": 1.5053, + "step": 1570 + }, + { + "epoch": 0.8042463388117925, + "grad_norm": 0.035208649933338165, + "learning_rate": 0.00018777296784313095, + "loss": 1.5099, + "step": 1572 + }, + { + "epoch": 0.8052695529833088, + "grad_norm": 0.03541814163327217, + "learning_rate": 0.00018773238925836507, + "loss": 1.5027, + "step": 1574 + }, + { + "epoch": 0.8062927671548251, + "grad_norm": 0.04706384614109993, + "learning_rate": 0.0001876917478494303, + "loss": 1.5111, + "step": 1576 + }, + { + "epoch": 0.8073159813263414, + "grad_norm": 0.042128194123506546, + "learning_rate": 0.00018765104364542955, + "loss": 1.4832, + "step": 1578 + }, + { + "epoch": 0.8083391954978576, + "grad_norm": 0.033496059477329254, + "learning_rate": 0.00018761027667551063, + "loss": 1.49, + "step": 1580 + }, + { + "epoch": 0.8093624096693739, + "grad_norm": 0.036655962467193604, + "learning_rate": 0.0001875694469688663, + "loss": 1.4835, + "step": 1582 + }, + { + "epoch": 0.8103856238408902, + "grad_norm": 0.036248572170734406, + "learning_rate": 0.0001875285545547342, + "loss": 1.5025, + "step": 1584 + }, + { + "epoch": 0.8114088380124065, + "grad_norm": 0.040282152593135834, + "learning_rate": 0.000187487599462397, + "loss": 1.4776, + "step": 1586 + }, + { + "epoch": 0.8124320521839228, + "grad_norm": 0.03675289452075958, + "learning_rate": 0.00018744658172118215, + "loss": 1.5036, + "step": 1588 + }, + { + "epoch": 0.8134552663554391, + "grad_norm": 0.03431113436818123, + "learning_rate": 0.00018740550136046196, + "loss": 1.4701, + "step": 1590 + }, + { + "epoch": 0.8144784805269553, + "grad_norm": 0.03184695914387703, + "learning_rate": 0.00018736435840965366, + "loss": 1.473, + "step": 1592 + }, + { + "epoch": 0.8155016946984716, + "grad_norm": 0.031748853623867035, + "learning_rate": 0.00018732315289821921, + "loss": 1.5039, + "step": 1594 + }, + { + "epoch": 0.8165249088699879, + "grad_norm": 0.034614481031894684, + "learning_rate": 0.00018728188485566544, + "loss": 1.4664, + "step": 1596 + }, + { + "epoch": 0.8175481230415041, + "grad_norm": 0.0308011993765831, + "learning_rate": 0.0001872405543115439, + "loss": 1.4719, + "step": 1598 + }, + { + "epoch": 0.8185713372130204, + "grad_norm": 0.031010661274194717, + "learning_rate": 0.00018719916129545093, + "loss": 1.4841, + "step": 1600 + }, + { + "epoch": 0.8195945513845366, + "grad_norm": 0.03110615722835064, + "learning_rate": 0.0001871577058370276, + "loss": 1.4878, + "step": 1602 + }, + { + "epoch": 0.8206177655560529, + "grad_norm": 0.030799025669693947, + "learning_rate": 0.00018711618796595972, + "loss": 1.4391, + "step": 1604 + }, + { + "epoch": 0.8216409797275692, + "grad_norm": 0.029373083263635635, + "learning_rate": 0.00018707460771197774, + "loss": 1.5265, + "step": 1606 + }, + { + "epoch": 0.8226641938990855, + "grad_norm": 0.03043638914823532, + "learning_rate": 0.0001870329651048568, + "loss": 1.5027, + "step": 1608 + }, + { + "epoch": 0.8236874080706018, + "grad_norm": 0.0337023101747036, + "learning_rate": 0.00018699126017441672, + "loss": 1.4793, + "step": 1610 + }, + { + "epoch": 0.8247106222421181, + "grad_norm": 0.03439760580658913, + "learning_rate": 0.0001869494929505219, + "loss": 1.4764, + "step": 1612 + }, + { + "epoch": 0.8257338364136343, + "grad_norm": 0.03283720836043358, + "learning_rate": 0.00018690766346308145, + "loss": 1.4829, + "step": 1614 + }, + { + "epoch": 0.8267570505851506, + "grad_norm": 0.030338643118739128, + "learning_rate": 0.00018686577174204885, + "loss": 1.4587, + "step": 1616 + }, + { + "epoch": 0.8277802647566669, + "grad_norm": 0.03556302934885025, + "learning_rate": 0.00018682381781742245, + "loss": 1.4924, + "step": 1618 + }, + { + "epoch": 0.8288034789281832, + "grad_norm": 0.032113250344991684, + "learning_rate": 0.00018678180171924485, + "loss": 1.4875, + "step": 1620 + }, + { + "epoch": 0.8298266930996995, + "grad_norm": 0.1559678167104721, + "learning_rate": 0.00018673972347760338, + "loss": 1.5009, + "step": 1622 + }, + { + "epoch": 0.8308499072712157, + "grad_norm": 0.06492070108652115, + "learning_rate": 0.00018669758312262976, + "loss": 1.4632, + "step": 1624 + }, + { + "epoch": 0.831873121442732, + "grad_norm": 0.05882725864648819, + "learning_rate": 0.00018665538068450023, + "loss": 1.472, + "step": 1626 + }, + { + "epoch": 0.8328963356142483, + "grad_norm": 0.03860605135560036, + "learning_rate": 0.00018661311619343546, + "loss": 1.4662, + "step": 1628 + }, + { + "epoch": 0.8339195497857645, + "grad_norm": 0.04597290977835655, + "learning_rate": 0.00018657078967970062, + "loss": 1.4706, + "step": 1630 + }, + { + "epoch": 0.8349427639572808, + "grad_norm": 0.04754943400621414, + "learning_rate": 0.00018652840117360517, + "loss": 1.475, + "step": 1632 + }, + { + "epoch": 0.8359659781287971, + "grad_norm": 0.03354303911328316, + "learning_rate": 0.0001864859507055031, + "loss": 1.5133, + "step": 1634 + }, + { + "epoch": 0.8369891923003133, + "grad_norm": 0.042201388627290726, + "learning_rate": 0.0001864434383057927, + "loss": 1.5125, + "step": 1636 + }, + { + "epoch": 0.8380124064718296, + "grad_norm": 0.0343627855181694, + "learning_rate": 0.00018640086400491658, + "loss": 1.4811, + "step": 1638 + }, + { + "epoch": 0.8390356206433459, + "grad_norm": 0.03558426350355148, + "learning_rate": 0.00018635822783336174, + "loss": 1.5171, + "step": 1640 + }, + { + "epoch": 0.8400588348148622, + "grad_norm": 0.03267373517155647, + "learning_rate": 0.00018631552982165944, + "loss": 1.4758, + "step": 1642 + }, + { + "epoch": 0.8410820489863785, + "grad_norm": 0.03015967085957527, + "learning_rate": 0.00018627277000038533, + "loss": 1.4501, + "step": 1644 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 0.03152506798505783, + "learning_rate": 0.0001862299484001591, + "loss": 1.4625, + "step": 1646 + }, + { + "epoch": 0.843128477329411, + "grad_norm": 0.03820090368390083, + "learning_rate": 0.0001861870650516449, + "loss": 1.5065, + "step": 1648 + }, + { + "epoch": 0.8441516915009273, + "grad_norm": 0.030817920342087746, + "learning_rate": 0.000186144119985551, + "loss": 1.4814, + "step": 1650 + }, + { + "epoch": 0.8451749056724436, + "grad_norm": 0.03546105697751045, + "learning_rate": 0.00018610111323262986, + "loss": 1.4554, + "step": 1652 + }, + { + "epoch": 0.8461981198439599, + "grad_norm": 0.033546384423971176, + "learning_rate": 0.00018605804482367807, + "loss": 1.4379, + "step": 1654 + }, + { + "epoch": 0.8472213340154762, + "grad_norm": 0.035938508808612823, + "learning_rate": 0.00018601491478953657, + "loss": 1.4931, + "step": 1656 + }, + { + "epoch": 0.8482445481869924, + "grad_norm": 0.03531987965106964, + "learning_rate": 0.00018597172316109015, + "loss": 1.4483, + "step": 1658 + }, + { + "epoch": 0.8492677623585086, + "grad_norm": 0.03041314333677292, + "learning_rate": 0.00018592846996926793, + "loss": 1.4541, + "step": 1660 + }, + { + "epoch": 0.850290976530025, + "grad_norm": 0.03549192473292351, + "learning_rate": 0.00018588515524504295, + "loss": 1.4615, + "step": 1662 + }, + { + "epoch": 0.8513141907015412, + "grad_norm": 0.03376925736665726, + "learning_rate": 0.0001858417790194325, + "loss": 1.4722, + "step": 1664 + }, + { + "epoch": 0.8523374048730575, + "grad_norm": 0.03313841298222542, + "learning_rate": 0.00018579834132349772, + "loss": 1.4791, + "step": 1666 + }, + { + "epoch": 0.8533606190445737, + "grad_norm": 0.033985435962677, + "learning_rate": 0.00018575484218834388, + "loss": 1.4443, + "step": 1668 + }, + { + "epoch": 0.85438383321609, + "grad_norm": 0.032460469752550125, + "learning_rate": 0.00018571128164512023, + "loss": 1.4988, + "step": 1670 + }, + { + "epoch": 0.8554070473876063, + "grad_norm": 0.03272455185651779, + "learning_rate": 0.00018566765972501993, + "loss": 1.4659, + "step": 1672 + }, + { + "epoch": 0.8564302615591226, + "grad_norm": 0.031708747148513794, + "learning_rate": 0.0001856239764592802, + "loss": 1.5007, + "step": 1674 + }, + { + "epoch": 0.8574534757306389, + "grad_norm": 0.034189220517873764, + "learning_rate": 0.0001855802318791821, + "loss": 1.4423, + "step": 1676 + }, + { + "epoch": 0.8584766899021552, + "grad_norm": 0.03221631050109863, + "learning_rate": 0.00018553642601605068, + "loss": 1.4701, + "step": 1678 + }, + { + "epoch": 0.8594999040736714, + "grad_norm": 0.029117561876773834, + "learning_rate": 0.00018549255890125475, + "loss": 1.4769, + "step": 1680 + }, + { + "epoch": 0.8605231182451877, + "grad_norm": 0.029596133157610893, + "learning_rate": 0.00018544863056620708, + "loss": 1.4635, + "step": 1682 + }, + { + "epoch": 0.861546332416704, + "grad_norm": 0.030032752081751823, + "learning_rate": 0.00018540464104236425, + "loss": 1.4991, + "step": 1684 + }, + { + "epoch": 0.8625695465882203, + "grad_norm": 0.03227202966809273, + "learning_rate": 0.00018536059036122667, + "loss": 1.4608, + "step": 1686 + }, + { + "epoch": 0.8635927607597366, + "grad_norm": 0.03331397473812103, + "learning_rate": 0.0001853164785543385, + "loss": 1.4958, + "step": 1688 + }, + { + "epoch": 0.8646159749312528, + "grad_norm": 0.033648762851953506, + "learning_rate": 0.00018527230565328778, + "loss": 1.4949, + "step": 1690 + }, + { + "epoch": 0.865639189102769, + "grad_norm": 0.03504339978098869, + "learning_rate": 0.00018522807168970616, + "loss": 1.439, + "step": 1692 + }, + { + "epoch": 0.8666624032742853, + "grad_norm": 0.034829430282115936, + "learning_rate": 0.0001851837766952691, + "loss": 1.5001, + "step": 1694 + }, + { + "epoch": 0.8676856174458016, + "grad_norm": 0.03803844377398491, + "learning_rate": 0.0001851394207016957, + "loss": 1.4905, + "step": 1696 + }, + { + "epoch": 0.8687088316173179, + "grad_norm": 0.0394139364361763, + "learning_rate": 0.00018509500374074884, + "loss": 1.4537, + "step": 1698 + }, + { + "epoch": 0.8697320457888342, + "grad_norm": 0.039348065853118896, + "learning_rate": 0.000185050525844235, + "loss": 1.4865, + "step": 1700 + }, + { + "epoch": 0.8707552599603504, + "grad_norm": 0.03650161996483803, + "learning_rate": 0.00018500598704400428, + "loss": 1.4658, + "step": 1702 + }, + { + "epoch": 0.8717784741318667, + "grad_norm": 0.03312232345342636, + "learning_rate": 0.00018496138737195036, + "loss": 1.477, + "step": 1704 + }, + { + "epoch": 0.872801688303383, + "grad_norm": 0.031243184581398964, + "learning_rate": 0.00018491672686001066, + "loss": 1.4983, + "step": 1706 + }, + { + "epoch": 0.8738249024748993, + "grad_norm": 0.03666044771671295, + "learning_rate": 0.00018487200554016602, + "loss": 1.4606, + "step": 1708 + }, + { + "epoch": 0.8748481166464156, + "grad_norm": 0.035856928676366806, + "learning_rate": 0.00018482722344444086, + "loss": 1.4808, + "step": 1710 + }, + { + "epoch": 0.8758713308179318, + "grad_norm": 0.03538081422448158, + "learning_rate": 0.00018478238060490312, + "loss": 1.4734, + "step": 1712 + }, + { + "epoch": 0.8768945449894481, + "grad_norm": 0.02917349338531494, + "learning_rate": 0.00018473747705366426, + "loss": 1.4947, + "step": 1714 + }, + { + "epoch": 0.8779177591609644, + "grad_norm": 0.035214658826589584, + "learning_rate": 0.0001846925128228792, + "loss": 1.4773, + "step": 1716 + }, + { + "epoch": 0.8789409733324807, + "grad_norm": 0.03703998774290085, + "learning_rate": 0.00018464748794474634, + "loss": 1.4704, + "step": 1718 + }, + { + "epoch": 0.879964187503997, + "grad_norm": 0.03480003774166107, + "learning_rate": 0.0001846024024515075, + "loss": 1.4723, + "step": 1720 + }, + { + "epoch": 0.8809874016755133, + "grad_norm": 0.04090346395969391, + "learning_rate": 0.00018455725637544785, + "loss": 1.4525, + "step": 1722 + }, + { + "epoch": 0.8820106158470294, + "grad_norm": 0.042412955313920975, + "learning_rate": 0.00018451204974889596, + "loss": 1.4418, + "step": 1724 + }, + { + "epoch": 0.8830338300185457, + "grad_norm": 0.03738129511475563, + "learning_rate": 0.00018446678260422385, + "loss": 1.4747, + "step": 1726 + }, + { + "epoch": 0.884057044190062, + "grad_norm": 0.03728758171200752, + "learning_rate": 0.00018442145497384673, + "loss": 1.5007, + "step": 1728 + }, + { + "epoch": 0.8850802583615783, + "grad_norm": 0.038157109171152115, + "learning_rate": 0.0001843760668902233, + "loss": 1.4937, + "step": 1730 + }, + { + "epoch": 0.8861034725330946, + "grad_norm": 0.03238663077354431, + "learning_rate": 0.00018433061838585534, + "loss": 1.4631, + "step": 1732 + }, + { + "epoch": 0.8871266867046108, + "grad_norm": 0.03741516172885895, + "learning_rate": 0.0001842851094932881, + "loss": 1.4887, + "step": 1734 + }, + { + "epoch": 0.8881499008761271, + "grad_norm": 0.03934532031416893, + "learning_rate": 0.00018423954024510996, + "loss": 1.4208, + "step": 1736 + }, + { + "epoch": 0.8891731150476434, + "grad_norm": 0.03238905593752861, + "learning_rate": 0.00018419391067395248, + "loss": 1.4587, + "step": 1738 + }, + { + "epoch": 0.8901963292191597, + "grad_norm": 0.039086490869522095, + "learning_rate": 0.00018414822081249058, + "loss": 1.4545, + "step": 1740 + }, + { + "epoch": 0.891219543390676, + "grad_norm": 0.0370473712682724, + "learning_rate": 0.00018410247069344218, + "loss": 1.4473, + "step": 1742 + }, + { + "epoch": 0.8922427575621923, + "grad_norm": 0.034061599522829056, + "learning_rate": 0.00018405666034956844, + "loss": 1.4831, + "step": 1744 + }, + { + "epoch": 0.8932659717337085, + "grad_norm": 0.0363328754901886, + "learning_rate": 0.00018401078981367363, + "loss": 1.4729, + "step": 1746 + }, + { + "epoch": 0.8942891859052248, + "grad_norm": 0.035310424864292145, + "learning_rate": 0.00018396485911860512, + "loss": 1.518, + "step": 1748 + }, + { + "epoch": 0.8953124000767411, + "grad_norm": 0.03476149961352348, + "learning_rate": 0.00018391886829725334, + "loss": 1.4611, + "step": 1750 + }, + { + "epoch": 0.8963356142482574, + "grad_norm": 0.03310383856296539, + "learning_rate": 0.00018387281738255185, + "loss": 1.4746, + "step": 1752 + }, + { + "epoch": 0.8973588284197737, + "grad_norm": 0.0307275652885437, + "learning_rate": 0.00018382670640747714, + "loss": 1.4697, + "step": 1754 + }, + { + "epoch": 0.8983820425912898, + "grad_norm": 0.028024040162563324, + "learning_rate": 0.00018378053540504873, + "loss": 1.4608, + "step": 1756 + }, + { + "epoch": 0.8994052567628061, + "grad_norm": 0.029499476775527, + "learning_rate": 0.00018373430440832923, + "loss": 1.4614, + "step": 1758 + }, + { + "epoch": 0.9004284709343224, + "grad_norm": 0.033067066222429276, + "learning_rate": 0.0001836880134504241, + "loss": 1.479, + "step": 1760 + }, + { + "epoch": 0.9014516851058387, + "grad_norm": 0.03787175565958023, + "learning_rate": 0.00018364166256448173, + "loss": 1.4712, + "step": 1762 + }, + { + "epoch": 0.902474899277355, + "grad_norm": 0.02690064162015915, + "learning_rate": 0.0001835952517836935, + "loss": 1.4673, + "step": 1764 + }, + { + "epoch": 0.9034981134488713, + "grad_norm": 0.026671042665839195, + "learning_rate": 0.00018354878114129367, + "loss": 1.4561, + "step": 1766 + }, + { + "epoch": 0.9045213276203875, + "grad_norm": 0.03277120366692543, + "learning_rate": 0.00018350225067055925, + "loss": 1.4879, + "step": 1768 + }, + { + "epoch": 0.9055445417919038, + "grad_norm": 0.03682045266032219, + "learning_rate": 0.00018345566040481028, + "loss": 1.467, + "step": 1770 + }, + { + "epoch": 0.9065677559634201, + "grad_norm": 0.027602965012192726, + "learning_rate": 0.0001834090103774095, + "loss": 1.4514, + "step": 1772 + }, + { + "epoch": 0.9075909701349364, + "grad_norm": 0.03043595515191555, + "learning_rate": 0.00018336230062176244, + "loss": 1.4835, + "step": 1774 + }, + { + "epoch": 0.9086141843064527, + "grad_norm": 0.030672984197735786, + "learning_rate": 0.0001833155311713174, + "loss": 1.492, + "step": 1776 + }, + { + "epoch": 0.9096373984779689, + "grad_norm": 0.032694920897483826, + "learning_rate": 0.00018326870205956553, + "loss": 1.475, + "step": 1778 + }, + { + "epoch": 0.9106606126494852, + "grad_norm": 0.031511466950178146, + "learning_rate": 0.00018322181332004056, + "loss": 1.4457, + "step": 1780 + }, + { + "epoch": 0.9116838268210015, + "grad_norm": 0.03155050054192543, + "learning_rate": 0.00018317486498631899, + "loss": 1.5165, + "step": 1782 + }, + { + "epoch": 0.9127070409925178, + "grad_norm": 0.03132548928260803, + "learning_rate": 0.00018312785709202002, + "loss": 1.5171, + "step": 1784 + }, + { + "epoch": 0.913730255164034, + "grad_norm": 0.036277156323194504, + "learning_rate": 0.00018308078967080546, + "loss": 1.4726, + "step": 1786 + }, + { + "epoch": 0.9147534693355504, + "grad_norm": 0.029615385457873344, + "learning_rate": 0.00018303366275637976, + "loss": 1.448, + "step": 1788 + }, + { + "epoch": 0.9157766835070665, + "grad_norm": 0.029571905732154846, + "learning_rate": 0.00018298647638248996, + "loss": 1.4629, + "step": 1790 + }, + { + "epoch": 0.9167998976785828, + "grad_norm": 0.028433986008167267, + "learning_rate": 0.0001829392305829257, + "loss": 1.474, + "step": 1792 + }, + { + "epoch": 0.9178231118500991, + "grad_norm": 0.034186169505119324, + "learning_rate": 0.0001828919253915191, + "loss": 1.4828, + "step": 1794 + }, + { + "epoch": 0.9188463260216154, + "grad_norm": 0.03323967382311821, + "learning_rate": 0.00018284456084214496, + "loss": 1.4883, + "step": 1796 + }, + { + "epoch": 0.9198695401931317, + "grad_norm": 0.03627438098192215, + "learning_rate": 0.00018279713696872047, + "loss": 1.4505, + "step": 1798 + }, + { + "epoch": 0.9208927543646479, + "grad_norm": 0.037414826452732086, + "learning_rate": 0.0001827496538052053, + "loss": 1.5153, + "step": 1800 + }, + { + "epoch": 0.9219159685361642, + "grad_norm": 0.036538898944854736, + "learning_rate": 0.00018270211138560162, + "loss": 1.4565, + "step": 1802 + }, + { + "epoch": 0.9229391827076805, + "grad_norm": 0.034286949783563614, + "learning_rate": 0.00018265450974395403, + "loss": 1.4596, + "step": 1804 + }, + { + "epoch": 0.9239623968791968, + "grad_norm": 0.03332148864865303, + "learning_rate": 0.0001826068489143495, + "loss": 1.4452, + "step": 1806 + }, + { + "epoch": 0.9249856110507131, + "grad_norm": 0.030349107459187508, + "learning_rate": 0.00018255912893091743, + "loss": 1.4937, + "step": 1808 + }, + { + "epoch": 0.9260088252222294, + "grad_norm": 0.030373625457286835, + "learning_rate": 0.00018251134982782952, + "loss": 1.4774, + "step": 1810 + }, + { + "epoch": 0.9270320393937456, + "grad_norm": 0.03661259636282921, + "learning_rate": 0.00018246351163929991, + "loss": 1.4694, + "step": 1812 + }, + { + "epoch": 0.9280552535652619, + "grad_norm": 0.036550264805555344, + "learning_rate": 0.00018241561439958495, + "loss": 1.4944, + "step": 1814 + }, + { + "epoch": 0.9290784677367782, + "grad_norm": 0.03492378070950508, + "learning_rate": 0.0001823676581429833, + "loss": 1.445, + "step": 1816 + }, + { + "epoch": 0.9301016819082945, + "grad_norm": 0.03306609019637108, + "learning_rate": 0.0001823196429038359, + "loss": 1.4222, + "step": 1818 + }, + { + "epoch": 0.9311248960798107, + "grad_norm": 0.03200085088610649, + "learning_rate": 0.0001822715687165259, + "loss": 1.467, + "step": 1820 + }, + { + "epoch": 0.9321481102513269, + "grad_norm": 0.036335378885269165, + "learning_rate": 0.00018222343561547874, + "loss": 1.4693, + "step": 1822 + }, + { + "epoch": 0.9331713244228432, + "grad_norm": 0.039753127843141556, + "learning_rate": 0.00018217524363516193, + "loss": 1.4594, + "step": 1824 + }, + { + "epoch": 0.9341945385943595, + "grad_norm": 0.03748109191656113, + "learning_rate": 0.0001821269928100852, + "loss": 1.5014, + "step": 1826 + }, + { + "epoch": 0.9352177527658758, + "grad_norm": 0.04106932878494263, + "learning_rate": 0.00018207868317480046, + "loss": 1.4823, + "step": 1828 + }, + { + "epoch": 0.9362409669373921, + "grad_norm": 0.032248884439468384, + "learning_rate": 0.00018203031476390167, + "loss": 1.4697, + "step": 1830 + }, + { + "epoch": 0.9372641811089084, + "grad_norm": 0.047158315777778625, + "learning_rate": 0.00018198188761202487, + "loss": 1.5449, + "step": 1832 + }, + { + "epoch": 0.9382873952804246, + "grad_norm": 0.03881628066301346, + "learning_rate": 0.00018193340175384824, + "loss": 1.5129, + "step": 1834 + }, + { + "epoch": 0.9393106094519409, + "grad_norm": 0.038932789117097855, + "learning_rate": 0.00018188485722409197, + "loss": 1.4508, + "step": 1836 + }, + { + "epoch": 0.9403338236234572, + "grad_norm": 0.042171675711870193, + "learning_rate": 0.00018183625405751816, + "loss": 1.4976, + "step": 1838 + }, + { + "epoch": 0.9413570377949735, + "grad_norm": 0.03824607655405998, + "learning_rate": 0.00018178759228893108, + "loss": 1.4759, + "step": 1840 + }, + { + "epoch": 0.9423802519664898, + "grad_norm": 0.0380014143884182, + "learning_rate": 0.0001817388719531768, + "loss": 1.4765, + "step": 1842 + }, + { + "epoch": 0.943403466138006, + "grad_norm": 0.03372355177998543, + "learning_rate": 0.00018169009308514344, + "loss": 1.4724, + "step": 1844 + }, + { + "epoch": 0.9444266803095223, + "grad_norm": 0.03503812104463577, + "learning_rate": 0.00018164125571976098, + "loss": 1.4537, + "step": 1846 + }, + { + "epoch": 0.9454498944810386, + "grad_norm": 0.03842812776565552, + "learning_rate": 0.00018159235989200132, + "loss": 1.4747, + "step": 1848 + }, + { + "epoch": 0.9464731086525549, + "grad_norm": 0.03686497360467911, + "learning_rate": 0.0001815434056368782, + "loss": 1.4433, + "step": 1850 + }, + { + "epoch": 0.9474963228240711, + "grad_norm": 0.03216801956295967, + "learning_rate": 0.00018149439298944717, + "loss": 1.4628, + "step": 1852 + }, + { + "epoch": 0.9485195369955874, + "grad_norm": 0.04245101660490036, + "learning_rate": 0.0001814453219848057, + "loss": 1.5411, + "step": 1854 + }, + { + "epoch": 0.9495427511671036, + "grad_norm": 0.041708942502737045, + "learning_rate": 0.0001813961926580929, + "loss": 1.4828, + "step": 1856 + }, + { + "epoch": 0.9505659653386199, + "grad_norm": 0.038249559700489044, + "learning_rate": 0.0001813470050444898, + "loss": 1.4633, + "step": 1858 + }, + { + "epoch": 0.9515891795101362, + "grad_norm": 0.03623546287417412, + "learning_rate": 0.00018129775917921905, + "loss": 1.4644, + "step": 1860 + }, + { + "epoch": 0.9526123936816525, + "grad_norm": 0.03886585682630539, + "learning_rate": 0.00018124845509754505, + "loss": 1.4642, + "step": 1862 + }, + { + "epoch": 0.9536356078531688, + "grad_norm": 0.03367486968636513, + "learning_rate": 0.00018119909283477394, + "loss": 1.4577, + "step": 1864 + }, + { + "epoch": 0.954658822024685, + "grad_norm": 0.034619078040122986, + "learning_rate": 0.00018114967242625343, + "loss": 1.4424, + "step": 1866 + }, + { + "epoch": 0.9556820361962013, + "grad_norm": 0.036260370165109634, + "learning_rate": 0.00018110019390737292, + "loss": 1.4749, + "step": 1868 + }, + { + "epoch": 0.9567052503677176, + "grad_norm": 0.037158943712711334, + "learning_rate": 0.00018105065731356343, + "loss": 1.4185, + "step": 1870 + }, + { + "epoch": 0.9577284645392339, + "grad_norm": 0.03858686238527298, + "learning_rate": 0.00018100106268029755, + "loss": 1.5027, + "step": 1872 + }, + { + "epoch": 0.9587516787107502, + "grad_norm": 0.03699406236410141, + "learning_rate": 0.00018095141004308943, + "loss": 1.4283, + "step": 1874 + }, + { + "epoch": 0.9597748928822665, + "grad_norm": 0.030941152945160866, + "learning_rate": 0.00018090169943749476, + "loss": 1.4729, + "step": 1876 + }, + { + "epoch": 0.9607981070537827, + "grad_norm": 0.03944398835301399, + "learning_rate": 0.00018085193089911075, + "loss": 1.4636, + "step": 1878 + }, + { + "epoch": 0.961821321225299, + "grad_norm": 0.03944871574640274, + "learning_rate": 0.00018080210446357606, + "loss": 1.4458, + "step": 1880 + }, + { + "epoch": 0.9628445353968152, + "grad_norm": 0.042511675506830215, + "learning_rate": 0.00018075222016657088, + "loss": 1.4868, + "step": 1882 + }, + { + "epoch": 0.9638677495683315, + "grad_norm": 0.036067429929971695, + "learning_rate": 0.00018070227804381674, + "loss": 1.4681, + "step": 1884 + }, + { + "epoch": 0.9648909637398478, + "grad_norm": 0.030013304203748703, + "learning_rate": 0.00018065227813107666, + "loss": 1.5088, + "step": 1886 + }, + { + "epoch": 0.965914177911364, + "grad_norm": 0.030714694410562515, + "learning_rate": 0.000180602220464155, + "loss": 1.4443, + "step": 1888 + }, + { + "epoch": 0.9669373920828803, + "grad_norm": 0.03553122654557228, + "learning_rate": 0.0001805521050788975, + "loss": 1.4667, + "step": 1890 + }, + { + "epoch": 0.9679606062543966, + "grad_norm": 0.032518330961465836, + "learning_rate": 0.0001805019320111912, + "loss": 1.4756, + "step": 1892 + }, + { + "epoch": 0.9689838204259129, + "grad_norm": 0.032445941120386124, + "learning_rate": 0.0001804517012969644, + "loss": 1.474, + "step": 1894 + }, + { + "epoch": 0.9700070345974292, + "grad_norm": 0.03390254080295563, + "learning_rate": 0.00018040141297218695, + "loss": 1.4477, + "step": 1896 + }, + { + "epoch": 0.9710302487689455, + "grad_norm": 0.02915276773273945, + "learning_rate": 0.00018035106707286954, + "loss": 1.4784, + "step": 1898 + }, + { + "epoch": 0.9720534629404617, + "grad_norm": 0.028000080958008766, + "learning_rate": 0.00018030066363506437, + "loss": 1.45, + "step": 1900 + } + ], + "logging_steps": 2, + "max_steps": 7816, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7.236992921365381e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}