diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -3,7126 +3,9317 @@ "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, - "global_step": 1016, + "global_step": 1329, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.000984251968503937, - "grad_norm": 24.054025650024414, - "learning_rate": 1.999995219404616e-05, - "loss": 3.8221, + "epoch": 0.0007524454477050414, + "grad_norm": 0.006495347712188959, + "learning_rate": 1.999997206043851e-05, + "loss": 0.0004, "step": 1 }, { - "epoch": 0.001968503937007874, - "grad_norm": 25.52571678161621, - "learning_rate": 1.9999808776641724e-05, - "loss": 2.257, + "epoch": 0.0015048908954100827, + "grad_norm": 0.05232545733451843, + "learning_rate": 1.9999888241910165e-05, + "loss": 0.0017, "step": 2 }, { - "epoch": 0.002952755905511811, - "grad_norm": 14.151020050048828, - "learning_rate": 1.9999569749157934e-05, - "loss": 1.6475, + "epoch": 0.002257336343115124, + "grad_norm": 0.016717378050088882, + "learning_rate": 1.999974854488333e-05, + "loss": 0.0009, "step": 3 }, { - "epoch": 0.003937007874015748, - "grad_norm": 9.001044273376465, - "learning_rate": 1.999923511388017e-05, - "loss": 0.364, + "epoch": 0.0030097817908201654, + "grad_norm": 0.51008540391922, + "learning_rate": 1.9999552970138628e-05, + "loss": 0.0233, "step": 4 }, { - "epoch": 0.004921259842519685, - "grad_norm": 5.057156562805176, - "learning_rate": 1.999880487400795e-05, - "loss": 0.1114, + "epoch": 0.003762227238525207, + "grad_norm": 3.963065682910383e-05, + "learning_rate": 1.999930151876891e-05, + "loss": 0.0, "step": 5 }, { - "epoch": 0.005905511811023622, - "grad_norm": 0.33928874135017395, - "learning_rate": 1.9998279033654883e-05, - "loss": 0.0094, + "epoch": 0.004514672686230248, + "grad_norm": 8.221525604312774e-06, + "learning_rate": 1.9998994192179256e-05, + "loss": 0.0, "step": 6 }, { - "epoch": 0.006889763779527559, - "grad_norm": 0.08265714347362518, - "learning_rate": 1.999765759784862e-05, - "loss": 0.0023, + "epoch": 0.005267118133935289, + "grad_norm": 0.019136814400553703, + "learning_rate": 1.999863099208699e-05, + "loss": 0.0005, "step": 7 }, { - "epoch": 0.007874015748031496, - "grad_norm": 0.7710925936698914, - "learning_rate": 1.999694057253083e-05, - "loss": 0.0141, + "epoch": 0.006019563581640331, + "grad_norm": 0.0003651017614174634, + "learning_rate": 1.9998211920521646e-05, + "loss": 0.0, "step": 8 }, { - "epoch": 0.008858267716535433, - "grad_norm": 9.648259162902832, - "learning_rate": 1.9996127964557136e-05, - "loss": 0.0457, + "epoch": 0.006772009029345372, + "grad_norm": 0.0008514254004694521, + "learning_rate": 1.9997736979824944e-05, + "loss": 0.0, "step": 9 }, { - "epoch": 0.00984251968503937, - "grad_norm": 0.016596384346485138, - "learning_rate": 1.999521978169703e-05, - "loss": 0.0006, + "epoch": 0.007524454477050414, + "grad_norm": 0.005437616258859634, + "learning_rate": 1.9997206172650826e-05, + "loss": 0.0002, "step": 10 }, { - "epoch": 0.010826771653543307, - "grad_norm": 14.048759460449219, - "learning_rate": 1.9994216032633824e-05, - "loss": 0.4977, + "epoch": 0.008276899924755455, + "grad_norm": 0.5016034245491028, + "learning_rate": 1.9996619501965385e-05, + "loss": 0.0063, "step": 11 }, { - "epoch": 0.011811023622047244, - "grad_norm": 0.008794532157480717, - "learning_rate": 1.9993116726964554e-05, - "loss": 0.0003, + "epoch": 0.009029345372460496, + "grad_norm": 0.0040865750052034855, + "learning_rate": 1.9995976971046896e-05, + "loss": 0.0001, "step": 12 }, { - "epoch": 0.012795275590551181, - "grad_norm": 3.7268006801605225, - "learning_rate": 1.9991921875199894e-05, - "loss": 0.6703, + "epoch": 0.009781790820165538, + "grad_norm": 0.06221418455243111, + "learning_rate": 1.9995278583485755e-05, + "loss": 0.0013, "step": 13 }, { - "epoch": 0.013779527559055118, - "grad_norm": 0.012454725801944733, - "learning_rate": 1.9990631488764044e-05, - "loss": 0.0004, + "epoch": 0.010534236267870579, + "grad_norm": 0.34792619943618774, + "learning_rate": 1.9994524343184494e-05, + "loss": 0.0127, "step": 14 }, { - "epoch": 0.014763779527559055, - "grad_norm": 0.022320333868265152, - "learning_rate": 1.9989245579994638e-05, - "loss": 0.0006, + "epoch": 0.011286681715575621, + "grad_norm": 0.0, + "learning_rate": 1.999371425435775e-05, + "loss": 0.0, "step": 15 }, { - "epoch": 0.015748031496062992, - "grad_norm": 0.05911239981651306, - "learning_rate": 1.9987764162142615e-05, - "loss": 0.0015, + "epoch": 0.012039127163280662, + "grad_norm": 7.092195510864258, + "learning_rate": 1.9992848321532213e-05, + "loss": 0.5353, "step": 16 }, { - "epoch": 0.01673228346456693, - "grad_norm": 0.17896273732185364, - "learning_rate": 1.998618724937209e-05, - "loss": 0.0038, + "epoch": 0.012791572610985704, + "grad_norm": 0.0006019376451149583, + "learning_rate": 1.9991926549546653e-05, + "loss": 0.0, "step": 17 }, { - "epoch": 0.017716535433070866, - "grad_norm": 0.034781314432621, - "learning_rate": 1.9984514856760233e-05, - "loss": 0.0012, + "epoch": 0.013544018058690745, + "grad_norm": 0.026538817211985588, + "learning_rate": 1.9990948943551843e-05, + "loss": 0.0006, "step": 18 }, { - "epoch": 0.018700787401574805, - "grad_norm": 22.77977180480957, - "learning_rate": 1.9982747000297108e-05, - "loss": 0.0736, + "epoch": 0.014296463506395787, + "grad_norm": 0.0013464675284922123, + "learning_rate": 1.998991550901056e-05, + "loss": 0.0, "step": 19 }, { - "epoch": 0.01968503937007874, - "grad_norm": 0.021774308755993843, - "learning_rate": 1.998088369688552e-05, - "loss": 0.0005, + "epoch": 0.015048908954100828, + "grad_norm": 5.049379825592041, + "learning_rate": 1.998882625169755e-05, + "loss": 0.2596, "step": 20 }, { - "epoch": 0.02066929133858268, - "grad_norm": 0.03547021374106407, - "learning_rate": 1.9978924964340876e-05, - "loss": 0.001, + "epoch": 0.01580135440180587, + "grad_norm": 0.0, + "learning_rate": 1.9987681177699486e-05, + "loss": 0.0, "step": 21 }, { - "epoch": 0.021653543307086614, - "grad_norm": 0.022672317922115326, - "learning_rate": 1.997687082139099e-05, - "loss": 0.0006, + "epoch": 0.01655379984951091, + "grad_norm": 0.0007531401934102178, + "learning_rate": 1.9986480293414938e-05, + "loss": 0.0, "step": 22 }, { - "epoch": 0.022637795275590553, - "grad_norm": 4.798707962036133, - "learning_rate": 1.9974721287675914e-05, - "loss": 0.1171, + "epoch": 0.01730624529721595, + "grad_norm": 1.847428560256958, + "learning_rate": 1.9985223605554346e-05, + "loss": 0.1395, "step": 23 }, { - "epoch": 0.023622047244094488, - "grad_norm": 1.9526541233062744, - "learning_rate": 1.9972476383747748e-05, - "loss": 0.0262, + "epoch": 0.01805869074492099, + "grad_norm": 0.013212837278842926, + "learning_rate": 1.998391112113997e-05, + "loss": 0.0001, "step": 24 }, { - "epoch": 0.024606299212598427, - "grad_norm": 0.042920470237731934, - "learning_rate": 1.997013613107045e-05, - "loss": 0.0014, + "epoch": 0.018811136192626036, + "grad_norm": 0.41415101289749146, + "learning_rate": 1.9982542847505858e-05, + "loss": 0.0105, "step": 25 }, { - "epoch": 0.025590551181102362, - "grad_norm": 0.027491770684719086, - "learning_rate": 1.996770055201962e-05, - "loss": 0.0009, + "epoch": 0.019563581640331076, + "grad_norm": 0.0014141664141789079, + "learning_rate": 1.99811187922978e-05, + "loss": 0.0001, "step": 26 }, { - "epoch": 0.0265748031496063, - "grad_norm": 4.108683109283447, - "learning_rate": 1.9965169669882293e-05, - "loss": 0.1578, + "epoch": 0.020316027088036117, + "grad_norm": 0.002521197311580181, + "learning_rate": 1.9979638963473294e-05, + "loss": 0.0001, "step": 27 }, { - "epoch": 0.027559055118110236, - "grad_norm": 0.03151802718639374, - "learning_rate": 1.9962543508856722e-05, - "loss": 0.001, + "epoch": 0.021068472535741158, + "grad_norm": 4.3378071784973145, + "learning_rate": 1.9978103369301495e-05, + "loss": 0.3246, "step": 28 }, { - "epoch": 0.028543307086614175, - "grad_norm": 0.06828831881284714, - "learning_rate": 1.9959822094052124e-05, - "loss": 0.0021, + "epoch": 0.0218209179834462, + "grad_norm": 4.08248852181714e-05, + "learning_rate": 1.997651201836317e-05, + "loss": 0.0, "step": 29 }, { - "epoch": 0.02952755905511811, - "grad_norm": 2.8961188793182373, - "learning_rate": 1.9957005451488476e-05, - "loss": 0.0227, + "epoch": 0.022573363431151242, + "grad_norm": 0.006794753950089216, + "learning_rate": 1.9974864919550642e-05, + "loss": 0.0003, "step": 30 }, { - "epoch": 0.03051181102362205, - "grad_norm": 0.3329976201057434, - "learning_rate": 1.9954093608096225e-05, - "loss": 0.0049, + "epoch": 0.023325808878856283, + "grad_norm": 0.05969928577542305, + "learning_rate": 1.9973162082067762e-05, + "loss": 0.0011, "step": 31 }, { - "epoch": 0.031496062992125984, - "grad_norm": 0.01458835694938898, - "learning_rate": 1.995108659171607e-05, - "loss": 0.0006, + "epoch": 0.024078254326561323, + "grad_norm": 0.07203614711761475, + "learning_rate": 1.9971403515429833e-05, + "loss": 0.002, "step": 32 }, { - "epoch": 0.03248031496062992, - "grad_norm": 0.23636917769908905, - "learning_rate": 1.9947984431098658e-05, - "loss": 0.0031, + "epoch": 0.024830699774266364, + "grad_norm": 0.0011654079426079988, + "learning_rate": 1.996958922946357e-05, + "loss": 0.0001, "step": 33 }, { - "epoch": 0.03346456692913386, - "grad_norm": 11.23897933959961, - "learning_rate": 1.9944787155904346e-05, - "loss": 0.4427, + "epoch": 0.025583145221971408, + "grad_norm": 0.002619029488414526, + "learning_rate": 1.9967719234307044e-05, + "loss": 0.0001, "step": 34 }, { - "epoch": 0.0344488188976378, - "grad_norm": 0.009200596250593662, - "learning_rate": 1.9941494796702892e-05, - "loss": 0.0004, + "epoch": 0.02633559066967645, + "grad_norm": 0.48534148931503296, + "learning_rate": 1.9965793540409628e-05, + "loss": 0.0088, "step": 35 }, { - "epoch": 0.03543307086614173, - "grad_norm": 0.01176590658724308, - "learning_rate": 1.9938107384973165e-05, - "loss": 0.0004, + "epoch": 0.02708803611738149, + "grad_norm": 0.00325326737947762, + "learning_rate": 1.9963812158531926e-05, + "loss": 0.0001, "step": 36 }, { - "epoch": 0.03641732283464567, - "grad_norm": 0.004416824784129858, - "learning_rate": 1.9934624953102858e-05, - "loss": 0.0002, + "epoch": 0.02784048156508653, + "grad_norm": 0.008502104319632053, + "learning_rate": 1.9961775099745727e-05, + "loss": 0.0003, "step": 37 }, { - "epoch": 0.03740157480314961, - "grad_norm": 0.005047894548624754, - "learning_rate": 1.993104753438817e-05, - "loss": 0.0002, + "epoch": 0.028592927012791574, + "grad_norm": 0.9508514404296875, + "learning_rate": 1.995968237543394e-05, + "loss": 0.114, "step": 38 }, { - "epoch": 0.038385826771653545, - "grad_norm": 0.02448008581995964, - "learning_rate": 1.9927375163033483e-05, - "loss": 0.0008, + "epoch": 0.029345372460496615, + "grad_norm": 1.4725682735443115, + "learning_rate": 1.9957533997290524e-05, + "loss": 0.096, "step": 39 }, { - "epoch": 0.03937007874015748, - "grad_norm": 0.1712186634540558, - "learning_rate": 1.992360787415103e-05, - "loss": 0.0029, + "epoch": 0.030097817908201655, + "grad_norm": 0.1169065311551094, + "learning_rate": 1.9955329977320422e-05, + "loss": 0.0018, "step": 40 }, { - "epoch": 0.040354330708661415, - "grad_norm": 8.718997955322266, - "learning_rate": 1.9919745703760592e-05, - "loss": 0.2108, + "epoch": 0.030850263355906696, + "grad_norm": 0.054003071039915085, + "learning_rate": 1.9953070327839513e-05, + "loss": 0.0012, "step": 41 }, { - "epoch": 0.04133858267716536, - "grad_norm": 0.4471137225627899, - "learning_rate": 1.9915788688789107e-05, - "loss": 0.0076, + "epoch": 0.03160270880361174, + "grad_norm": 0.0011868480360135436, + "learning_rate": 1.9950755061474513e-05, + "loss": 0.0001, "step": 42 }, { - "epoch": 0.04232283464566929, - "grad_norm": 0.013411296531558037, - "learning_rate": 1.9911736867070358e-05, - "loss": 0.0005, + "epoch": 0.03235515425131678, + "grad_norm": 0.27812469005584717, + "learning_rate": 1.9948384191162932e-05, + "loss": 0.0074, "step": 43 }, { - "epoch": 0.04330708661417323, - "grad_norm": 0.02578163705766201, - "learning_rate": 1.9907590277344582e-05, - "loss": 0.0006, + "epoch": 0.03310759969902182, + "grad_norm": 2.0421926975250244, + "learning_rate": 1.994595773015298e-05, + "loss": 0.2558, "step": 44 }, { - "epoch": 0.04429133858267716, - "grad_norm": 0.0920349657535553, - "learning_rate": 1.9903348959258112e-05, - "loss": 0.0017, + "epoch": 0.033860045146726865, + "grad_norm": 0.0036680996417999268, + "learning_rate": 1.9943475692003514e-05, + "loss": 0.0001, "step": 45 }, { - "epoch": 0.045275590551181105, - "grad_norm": 0.01718003675341606, - "learning_rate": 1.9899012953363002e-05, - "loss": 0.0007, + "epoch": 0.0346124905944319, + "grad_norm": 0.0018471762305125594, + "learning_rate": 1.994093809058394e-05, + "loss": 0.0001, "step": 46 }, { - "epoch": 0.04625984251968504, - "grad_norm": 0.020548053085803986, - "learning_rate": 1.9894582301116633e-05, - "loss": 0.0007, + "epoch": 0.035364936042136946, + "grad_norm": 2.0573832988739014, + "learning_rate": 1.9938344940074162e-05, + "loss": 0.2316, "step": 47 }, { - "epoch": 0.047244094488188976, - "grad_norm": 0.14374031126499176, - "learning_rate": 1.9890057044881308e-05, - "loss": 0.0028, + "epoch": 0.03611738148984198, + "grad_norm": 0.02541675977408886, + "learning_rate": 1.9935696254964468e-05, + "loss": 0.0009, "step": 48 }, { - "epoch": 0.04822834645669291, - "grad_norm": 19.97130584716797, - "learning_rate": 1.9885437227923876e-05, - "loss": 0.075, + "epoch": 0.03686982693754703, + "grad_norm": 0.039695482701063156, + "learning_rate": 1.9932992050055478e-05, + "loss": 0.001, "step": 49 }, { - "epoch": 0.04921259842519685, - "grad_norm": 0.001329949707724154, - "learning_rate": 1.9880722894415284e-05, + "epoch": 0.03762227238525207, + "grad_norm": 0.0008929629693739116, + "learning_rate": 1.993023234045806e-05, "loss": 0.0001, "step": 50 }, { - "epoch": 0.05019685039370079, - "grad_norm": 0.764119029045105, - "learning_rate": 1.987591408943017e-05, - "loss": 0.006, + "epoch": 0.03837471783295711, + "grad_norm": 0.0004025468078907579, + "learning_rate": 1.992741714159322e-05, + "loss": 0.0, "step": 51 }, { - "epoch": 0.051181102362204724, - "grad_norm": 7.61911678314209, - "learning_rate": 1.9871010858946443e-05, - "loss": 0.7777, + "epoch": 0.03912716328066215, + "grad_norm": 0.012651098892092705, + "learning_rate": 1.992454646919205e-05, + "loss": 0.0006, "step": 52 }, { - "epoch": 0.05216535433070866, - "grad_norm": 3.337543249130249, - "learning_rate": 1.986601324984482e-05, - "loss": 0.0308, + "epoch": 0.0398796087283672, + "grad_norm": 5.611824035644531, + "learning_rate": 1.9921620339295612e-05, + "loss": 0.0996, "step": 53 }, { - "epoch": 0.0531496062992126, - "grad_norm": 0.015905100852251053, - "learning_rate": 1.9860921309908395e-05, - "loss": 0.0005, + "epoch": 0.040632054176072234, + "grad_norm": 0.024261457845568657, + "learning_rate": 1.9918638768254865e-05, + "loss": 0.0008, "step": 54 }, { - "epoch": 0.054133858267716536, - "grad_norm": 0.3135690689086914, - "learning_rate": 1.9855735087822185e-05, - "loss": 0.0088, + "epoch": 0.04138449962377728, + "grad_norm": 0.04541151970624924, + "learning_rate": 1.9915601772730562e-05, + "loss": 0.0013, "step": 55 }, { - "epoch": 0.05511811023622047, - "grad_norm": 0.14215576648712158, - "learning_rate": 1.9850454633172632e-05, - "loss": 0.0025, + "epoch": 0.042136945071482315, + "grad_norm": 0.0012648458359763026, + "learning_rate": 1.9912509369693172e-05, + "loss": 0.0001, "step": 56 }, { - "epoch": 0.05610236220472441, - "grad_norm": 0.03396113961935043, - "learning_rate": 1.984507999644719e-05, - "loss": 0.0006, + "epoch": 0.04288939051918736, + "grad_norm": 0.009661543183028698, + "learning_rate": 1.990936157642277e-05, + "loss": 0.0004, "step": 57 }, { - "epoch": 0.05708661417322835, - "grad_norm": 1.2952784299850464, - "learning_rate": 1.9839611229033774e-05, - "loss": 0.0304, + "epoch": 0.0436418359668924, + "grad_norm": 0.00047065879334695637, + "learning_rate": 1.990615841050895e-05, + "loss": 0.0, "step": 58 }, { - "epoch": 0.058070866141732284, - "grad_norm": 0.005510976072400808, - "learning_rate": 1.9834048383220312e-05, - "loss": 0.0002, + "epoch": 0.04439428141459744, + "grad_norm": 2.226644992828369, + "learning_rate": 1.990289988985072e-05, + "loss": 0.0725, "step": 59 }, { - "epoch": 0.05905511811023622, - "grad_norm": 0.009561412036418915, - "learning_rate": 1.982839151219424e-05, + "epoch": 0.045146726862302484, + "grad_norm": 0.007543186657130718, + "learning_rate": 1.9899586032656407e-05, "loss": 0.0003, "step": 60 }, { - "epoch": 0.060039370078740155, - "grad_norm": 2.7667431831359863, - "learning_rate": 1.9822640670041976e-05, - "loss": 0.0178, + "epoch": 0.04589917231000752, + "grad_norm": 0.3191293478012085, + "learning_rate": 1.9896216857443563e-05, + "loss": 0.0149, "step": 61 }, { - "epoch": 0.0610236220472441, - "grad_norm": 7.660120487213135, - "learning_rate": 1.9816795911748422e-05, - "loss": 0.1271, + "epoch": 0.046651617757712566, + "grad_norm": 0.0007462618523277342, + "learning_rate": 1.989279238303883e-05, + "loss": 0.0, "step": 62 }, { - "epoch": 0.06200787401574803, - "grad_norm": 0.022357389330863953, - "learning_rate": 1.981085729319643e-05, - "loss": 0.0008, + "epoch": 0.04740406320541761, + "grad_norm": 0.1068795919418335, + "learning_rate": 1.9889312628577887e-05, + "loss": 0.0024, "step": 63 }, { - "epoch": 0.06299212598425197, - "grad_norm": 0.004426524043083191, - "learning_rate": 1.9804824871166254e-05, + "epoch": 0.04815650865312265, + "grad_norm": 0.003478952683508396, + "learning_rate": 1.9885777613505278e-05, "loss": 0.0002, "step": 64 }, { - "epoch": 0.0639763779527559, - "grad_norm": 0.004661270417273045, - "learning_rate": 1.9798698703335043e-05, - "loss": 0.0002, + "epoch": 0.04890895410082769, + "grad_norm": 0.002991155255585909, + "learning_rate": 1.988218735757437e-05, + "loss": 0.0001, "step": 65 }, { - "epoch": 0.06496062992125984, - "grad_norm": 0.07409872114658356, - "learning_rate": 1.979247884827625e-05, - "loss": 0.0017, + "epoch": 0.04966139954853273, + "grad_norm": 0.003038755152374506, + "learning_rate": 1.98785418808472e-05, + "loss": 0.0001, "step": 66 }, { - "epoch": 0.06594488188976377, - "grad_norm": 0.8511402606964111, - "learning_rate": 1.9786165365459102e-05, - "loss": 0.0095, + "epoch": 0.05041384499623777, + "grad_norm": 1.9981627464294434, + "learning_rate": 1.987484120369436e-05, + "loss": 0.5126, "step": 67 }, { - "epoch": 0.06692913385826772, - "grad_norm": 0.012232961133122444, - "learning_rate": 1.9779758315248006e-05, - "loss": 0.0004, + "epoch": 0.051166290443942816, + "grad_norm": 0.1265110969543457, + "learning_rate": 1.9871085346794922e-05, + "loss": 0.0043, "step": 68 }, { - "epoch": 0.06791338582677166, - "grad_norm": 0.004477099049836397, - "learning_rate": 1.9773257758901993e-05, - "loss": 0.0002, + "epoch": 0.05191873589164785, + "grad_norm": 0.5709707736968994, + "learning_rate": 1.9867274331136276e-05, + "loss": 0.1178, "step": 69 }, { - "epoch": 0.0688976377952756, - "grad_norm": 0.07645303755998611, - "learning_rate": 1.9766663758574122e-05, - "loss": 0.0013, + "epoch": 0.0526711813393529, + "grad_norm": 0.17010557651519775, + "learning_rate": 1.986340817801405e-05, + "loss": 0.0031, "step": 70 }, { - "epoch": 0.06988188976377953, - "grad_norm": 0.003697085427120328, - "learning_rate": 1.9759976377310892e-05, - "loss": 0.0002, + "epoch": 0.05342362678705794, + "grad_norm": 0.1551416665315628, + "learning_rate": 1.985948690903196e-05, + "loss": 0.0035, "step": 71 }, { - "epoch": 0.07086614173228346, - "grad_norm": 24.469940185546875, - "learning_rate": 1.975319567905163e-05, - "loss": 0.3354, + "epoch": 0.05417607223476298, + "grad_norm": 0.17882609367370605, + "learning_rate": 1.9855510546101725e-05, + "loss": 0.0066, "step": 72 }, { - "epoch": 0.0718503937007874, - "grad_norm": 0.0019476768793538213, - "learning_rate": 1.974632172862788e-05, - "loss": 0.0001, + "epoch": 0.05492851768246802, + "grad_norm": 0.014555556699633598, + "learning_rate": 1.9851479111442902e-05, + "loss": 0.0008, "step": 73 }, { - "epoch": 0.07283464566929133, - "grad_norm": 0.001231226953677833, - "learning_rate": 1.9739354591762798e-05, - "loss": 0.0001, + "epoch": 0.05568096313017306, + "grad_norm": 0.03164775297045708, + "learning_rate": 1.98473926275828e-05, + "loss": 0.0014, "step": 74 }, { - "epoch": 0.07381889763779527, - "grad_norm": 5.479063510894775, - "learning_rate": 1.9732294335070507e-05, - "loss": 0.1605, + "epoch": 0.056433408577878104, + "grad_norm": 1.0504040718078613, + "learning_rate": 1.984325111735633e-05, + "loss": 0.047, "step": 75 }, { - "epoch": 0.07480314960629922, - "grad_norm": 2.7482311725616455, - "learning_rate": 1.9725141026055473e-05, - "loss": 0.542, + "epoch": 0.05718585402558315, + "grad_norm": 0.06992053985595703, + "learning_rate": 1.9839054603905887e-05, + "loss": 0.0025, "step": 76 }, { - "epoch": 0.07578740157480315, - "grad_norm": 0.007556090131402016, - "learning_rate": 1.971789473311184e-05, - "loss": 0.0003, + "epoch": 0.057938299473288185, + "grad_norm": 3.3065266609191895, + "learning_rate": 1.9834803110681223e-05, + "loss": 0.039, "step": 77 }, { - "epoch": 0.07677165354330709, - "grad_norm": 0.0038557201623916626, - "learning_rate": 1.9710555525522802e-05, - "loss": 0.0002, + "epoch": 0.05869074492099323, + "grad_norm": 0.04477689415216446, + "learning_rate": 1.983049666143931e-05, + "loss": 0.0017, "step": 78 }, { - "epoch": 0.07775590551181102, - "grad_norm": 0.010755794122815132, - "learning_rate": 1.970312347345992e-05, - "loss": 0.0005, + "epoch": 0.059443190368698266, + "grad_norm": 0.027361512184143066, + "learning_rate": 1.9826135280244204e-05, + "loss": 0.0011, "step": 79 }, { - "epoch": 0.07874015748031496, - "grad_norm": 3.593060255050659, - "learning_rate": 1.9695598647982467e-05, - "loss": 0.1224, + "epoch": 0.06019563581640331, + "grad_norm": 0.2297053188085556, + "learning_rate": 1.9821718991466925e-05, + "loss": 0.0076, "step": 80 }, { - "epoch": 0.0797244094488189, - "grad_norm": 0.02859203703701496, - "learning_rate": 1.9687981121036732e-05, - "loss": 0.0011, + "epoch": 0.060948081264108354, + "grad_norm": 0.015010291710495949, + "learning_rate": 1.9817247819785303e-05, + "loss": 0.0007, "step": 81 }, { - "epoch": 0.08070866141732283, - "grad_norm": 0.022516317665576935, - "learning_rate": 1.9680270965455343e-05, - "loss": 0.0006, + "epoch": 0.06170052671181339, + "grad_norm": 1.8525534868240356, + "learning_rate": 1.981272179018386e-05, + "loss": 0.3579, "step": 82 }, { - "epoch": 0.08169291338582677, - "grad_norm": 2.1842687129974365, - "learning_rate": 1.9672468254956562e-05, - "loss": 0.3053, + "epoch": 0.062452972159518436, + "grad_norm": 0.02152411825954914, + "learning_rate": 1.9808140927953644e-05, + "loss": 0.0006, "step": 83 }, { - "epoch": 0.08267716535433071, - "grad_norm": 0.11225120723247528, - "learning_rate": 1.9664573064143604e-05, - "loss": 0.0045, + "epoch": 0.06320541760722348, + "grad_norm": 0.0057389941066503525, + "learning_rate": 1.9803505258692117e-05, + "loss": 0.0004, "step": 84 }, { - "epoch": 0.08366141732283465, - "grad_norm": 0.05971734598278999, - "learning_rate": 1.965658546850389e-05, - "loss": 0.0021, + "epoch": 0.06395786305492852, + "grad_norm": 0.021787824109196663, + "learning_rate": 1.9798814808302992e-05, + "loss": 0.0008, "step": 85 }, { - "epoch": 0.08464566929133858, - "grad_norm": 0.0755971372127533, - "learning_rate": 1.9648505544408343e-05, - "loss": 0.0026, + "epoch": 0.06471030850263355, + "grad_norm": 1.793501853942871, + "learning_rate": 1.9794069602996093e-05, + "loss": 0.1386, "step": 86 }, { - "epoch": 0.08562992125984252, - "grad_norm": 0.8131499886512756, - "learning_rate": 1.9640333369110662e-05, - "loss": 0.0358, + "epoch": 0.0654627539503386, + "grad_norm": 0.9608948826789856, + "learning_rate": 1.9789269669287212e-05, + "loss": 0.126, "step": 87 }, { - "epoch": 0.08661417322834646, - "grad_norm": 0.10997315496206284, - "learning_rate": 1.9632069020746574e-05, - "loss": 0.0031, + "epoch": 0.06621519939804364, + "grad_norm": 0.01728960871696472, + "learning_rate": 1.9784415033997955e-05, + "loss": 0.0008, "step": 88 }, { - "epoch": 0.08759842519685039, - "grad_norm": 0.1209130510687828, - "learning_rate": 1.9623712578333086e-05, - "loss": 0.0036, + "epoch": 0.06696764484574869, + "grad_norm": 0.0034939961042255163, + "learning_rate": 1.9779505724255602e-05, + "loss": 0.0002, "step": 89 }, { - "epoch": 0.08858267716535433, - "grad_norm": 3.4418578147888184, - "learning_rate": 1.9615264121767742e-05, - "loss": 0.0424, + "epoch": 0.06772009029345373, + "grad_norm": 0.27871963381767273, + "learning_rate": 1.9774541767492942e-05, + "loss": 0.1145, "step": 90 }, { - "epoch": 0.08956692913385826, - "grad_norm": 0.050817012786865234, - "learning_rate": 1.9606723731827846e-05, - "loss": 0.0017, + "epoch": 0.06847253574115876, + "grad_norm": 0.020328784361481667, + "learning_rate": 1.9769523191448136e-05, + "loss": 0.0009, "step": 91 }, { - "epoch": 0.09055118110236221, - "grad_norm": 0.047979529947042465, - "learning_rate": 1.9598091490169696e-05, - "loss": 0.0015, + "epoch": 0.0692249811888638, + "grad_norm": 0.003004940692335367, + "learning_rate": 1.976445002416454e-05, + "loss": 0.0002, "step": 92 }, { - "epoch": 0.09153543307086615, - "grad_norm": 0.14766374230384827, - "learning_rate": 1.95893674793278e-05, - "loss": 0.0064, + "epoch": 0.06997742663656885, + "grad_norm": 0.051723234355449677, + "learning_rate": 1.975932229399057e-05, + "loss": 0.0021, "step": 93 }, { - "epoch": 0.09251968503937008, - "grad_norm": 0.12434018403291702, - "learning_rate": 1.958055178271409e-05, - "loss": 0.0064, + "epoch": 0.07072987208427389, + "grad_norm": 0.02858145534992218, + "learning_rate": 1.975414002957953e-05, + "loss": 0.0011, "step": 94 }, { - "epoch": 0.09350393700787402, - "grad_norm": 1.18305504322052, - "learning_rate": 1.9571644484617122e-05, - "loss": 0.0412, + "epoch": 0.07148231753197894, + "grad_norm": 0.076414555311203, + "learning_rate": 1.9748903259889466e-05, + "loss": 0.0024, "step": 95 }, { - "epoch": 0.09448818897637795, - "grad_norm": 1.9955068826675415, - "learning_rate": 1.9562645670201278e-05, - "loss": 0.0681, + "epoch": 0.07223476297968397, + "grad_norm": 1.1670242547988892, + "learning_rate": 1.9743612014182982e-05, + "loss": 0.0559, "step": 96 }, { - "epoch": 0.09547244094488189, - "grad_norm": 5.675886154174805, - "learning_rate": 1.9553555425505933e-05, - "loss": 0.1765, + "epoch": 0.07298720842738901, + "grad_norm": 0.05960305780172348, + "learning_rate": 1.9738266322027094e-05, + "loss": 0.0026, "step": 97 }, { - "epoch": 0.09645669291338582, - "grad_norm": 0.037112608551979065, - "learning_rate": 1.954437383744465e-05, - "loss": 0.0011, + "epoch": 0.07373965387509406, + "grad_norm": 0.4573524296283722, + "learning_rate": 1.9732866213293066e-05, + "loss": 0.082, "step": 98 }, { - "epoch": 0.09744094488188976, - "grad_norm": 0.001966514391824603, - "learning_rate": 1.9535100993804352e-05, - "loss": 0.0001, + "epoch": 0.0744920993227991, + "grad_norm": 0.02990127168595791, + "learning_rate": 1.972741171815623e-05, + "loss": 0.0011, "step": 99 }, { - "epoch": 0.0984251968503937, - "grad_norm": 21.267513275146484, - "learning_rate": 1.9525736983244458e-05, - "loss": 0.3253, + "epoch": 0.07524454477050414, + "grad_norm": 1.6312720775604248, + "learning_rate": 1.9721902867095828e-05, + "loss": 0.1075, "step": 100 }, { - "epoch": 0.09940944881889764, - "grad_norm": 0.006219896487891674, - "learning_rate": 1.9516281895296064e-05, - "loss": 0.0002, + "epoch": 0.07599699021820917, + "grad_norm": 0.3260236978530884, + "learning_rate": 1.9716339690894834e-05, + "loss": 0.0788, "step": 101 }, { - "epoch": 0.10039370078740158, - "grad_norm": 3.1399686336517334, - "learning_rate": 1.9506735820361065e-05, - "loss": 0.1277, + "epoch": 0.07674943566591422, + "grad_norm": 0.0010308397468179464, + "learning_rate": 1.9710722220639785e-05, + "loss": 0.0001, "step": 102 }, { - "epoch": 0.10137795275590551, - "grad_norm": 0.55072021484375, - "learning_rate": 1.949709884971131e-05, - "loss": 0.0058, + "epoch": 0.07750188111361926, + "grad_norm": 0.04024788737297058, + "learning_rate": 1.9705050487720618e-05, + "loss": 0.0016, "step": 103 }, { - "epoch": 0.10236220472440945, - "grad_norm": 0.2464011013507843, - "learning_rate": 1.948737107548771e-05, - "loss": 0.0032, + "epoch": 0.0782543265613243, + "grad_norm": 5.523307800292969, + "learning_rate": 1.969932452383048e-05, + "loss": 0.3277, "step": 104 }, { - "epoch": 0.10334645669291338, - "grad_norm": 0.220285564661026, - "learning_rate": 1.9477552590699375e-05, - "loss": 0.003, + "epoch": 0.07900677200902935, + "grad_norm": 2.359933853149414, + "learning_rate": 1.9693544360965548e-05, + "loss": 0.189, "step": 105 }, { - "epoch": 0.10433070866141732, - "grad_norm": 0.3101298213005066, - "learning_rate": 1.9467643489222704e-05, - "loss": 0.0032, + "epoch": 0.0797592174567344, + "grad_norm": 0.6973968744277954, + "learning_rate": 1.9687710031424873e-05, + "loss": 0.0215, "step": 106 }, { - "epoch": 0.10531496062992125, - "grad_norm": 0.005104558076709509, - "learning_rate": 1.945764386580051e-05, - "loss": 0.0001, + "epoch": 0.08051166290443942, + "grad_norm": 0.01043291948735714, + "learning_rate": 1.9681821567810172e-05, + "loss": 0.0004, "step": 107 }, { - "epoch": 0.1062992125984252, - "grad_norm": 0.13238918781280518, - "learning_rate": 1.94475538160411e-05, - "loss": 0.0017, + "epoch": 0.08126410835214447, + "grad_norm": 0.0018412582576274872, + "learning_rate": 1.9675879003025668e-05, + "loss": 0.0001, "step": 108 }, { - "epoch": 0.10728346456692914, - "grad_norm": 0.001642601448111236, - "learning_rate": 1.9437373436417366e-05, - "loss": 0.0, + "epoch": 0.08201655379984951, + "grad_norm": 0.6676415205001831, + "learning_rate": 1.9669882370277885e-05, + "loss": 0.1539, "step": 109 }, { - "epoch": 0.10826771653543307, - "grad_norm": 3.8352627754211426, - "learning_rate": 1.9427102824265858e-05, - "loss": 0.4077, + "epoch": 0.08276899924755456, + "grad_norm": 0.006418765056878328, + "learning_rate": 1.9663831703075488e-05, + "loss": 0.0004, "step": 110 }, { - "epoch": 0.10925196850393701, - "grad_norm": 0.005100015085190535, - "learning_rate": 1.941674207778586e-05, - "loss": 0.0001, + "epoch": 0.0835214446952596, + "grad_norm": 0.0075980802066624165, + "learning_rate": 1.9657727035229066e-05, + "loss": 0.0004, "step": 111 }, { - "epoch": 0.11023622047244094, - "grad_norm": 0.1013755053281784, - "learning_rate": 1.940629129603844e-05, - "loss": 0.0014, + "epoch": 0.08427389014296463, + "grad_norm": 0.004072375129908323, + "learning_rate": 1.9651568400850976e-05, + "loss": 0.0001, "step": 112 }, { - "epoch": 0.11122047244094488, - "grad_norm": 0.1667880415916443, - "learning_rate": 1.9395750578945523e-05, - "loss": 0.0049, + "epoch": 0.08502633559066967, + "grad_norm": 0.6514503359794617, + "learning_rate": 1.964535583435512e-05, + "loss": 0.0636, "step": 113 }, { - "epoch": 0.11220472440944881, - "grad_norm": 3.229433536529541, - "learning_rate": 1.9385120027288914e-05, - "loss": 0.4105, + "epoch": 0.08577878103837472, + "grad_norm": 0.6835861802101135, + "learning_rate": 1.9639089370456784e-05, + "loss": 0.1035, "step": 114 }, { - "epoch": 0.11318897637795275, - "grad_norm": 0.002785398392006755, - "learning_rate": 1.937439974270934e-05, - "loss": 0.0001, + "epoch": 0.08653122648607976, + "grad_norm": 0.18389393389225006, + "learning_rate": 1.963276904417241e-05, + "loss": 0.0048, "step": 115 }, { - "epoch": 0.1141732283464567, - "grad_norm": 2.9435739517211914, - "learning_rate": 1.9363589827705494e-05, - "loss": 0.2322, + "epoch": 0.0872836719337848, + "grad_norm": 0.002048420486971736, + "learning_rate": 1.962639489081943e-05, + "loss": 0.0001, "step": 116 }, { - "epoch": 0.11515748031496063, - "grad_norm": 0.09882983565330505, - "learning_rate": 1.935269038563303e-05, - "loss": 0.0021, + "epoch": 0.08803611738148984, + "grad_norm": 0.010052050463855267, + "learning_rate": 1.9619966946016054e-05, + "loss": 0.0005, "step": 117 }, { - "epoch": 0.11614173228346457, - "grad_norm": 0.0068194130435585976, - "learning_rate": 1.93417015207036e-05, - "loss": 0.0003, + "epoch": 0.08878856282919488, + "grad_norm": 0.100750632584095, + "learning_rate": 1.9613485245681073e-05, + "loss": 0.0027, "step": 118 }, { - "epoch": 0.1171259842519685, - "grad_norm": 1.5461909770965576, - "learning_rate": 1.9330623337983832e-05, - "loss": 0.0139, + "epoch": 0.08954100827689992, + "grad_norm": 0.04654483497142792, + "learning_rate": 1.960694982603366e-05, + "loss": 0.0019, "step": 119 }, { - "epoch": 0.11811023622047244, - "grad_norm": 0.016029296442866325, - "learning_rate": 1.9319455943394347e-05, - "loss": 0.0003, + "epoch": 0.09029345372460497, + "grad_norm": 0.01746903918683529, + "learning_rate": 1.960036072359317e-05, + "loss": 0.0006, "step": 120 }, { - "epoch": 0.11909448818897637, - "grad_norm": 0.008099294267594814, - "learning_rate": 1.9308199443708733e-05, - "loss": 0.0002, + "epoch": 0.09104589917231001, + "grad_norm": 0.05034410208463669, + "learning_rate": 1.9593717975178924e-05, + "loss": 0.002, "step": 121 }, { - "epoch": 0.12007874015748031, - "grad_norm": 0.028314432129263878, - "learning_rate": 1.9296853946552532e-05, - "loss": 0.0004, + "epoch": 0.09179834462001504, + "grad_norm": 3.37355637550354, + "learning_rate": 1.958702161791002e-05, + "loss": 0.5772, "step": 122 }, { - "epoch": 0.12106299212598425, - "grad_norm": 1.141755223274231, - "learning_rate": 1.9285419560402208e-05, - "loss": 0.0325, + "epoch": 0.09255079006772009, + "grad_norm": 0.0004869294061791152, + "learning_rate": 1.958027168920512e-05, + "loss": 0.0, "step": 123 }, { - "epoch": 0.1220472440944882, - "grad_norm": 0.0045331381261348724, - "learning_rate": 1.9273896394584103e-05, - "loss": 0.0002, + "epoch": 0.09330323551542513, + "grad_norm": 0.013796951621770859, + "learning_rate": 1.9573468226782224e-05, + "loss": 0.0006, "step": 124 }, { - "epoch": 0.12303149606299213, - "grad_norm": 0.13167083263397217, - "learning_rate": 1.9262284559273412e-05, - "loss": 0.0035, + "epoch": 0.09405568096313018, + "grad_norm": 0.020256061106920242, + "learning_rate": 1.956661126865849e-05, + "loss": 0.0007, "step": 125 }, { - "epoch": 0.12401574803149606, - "grad_norm": 0.02842235378921032, - "learning_rate": 1.9250584165493102e-05, - "loss": 0.0012, + "epoch": 0.09480812641083522, + "grad_norm": 0.4556258022785187, + "learning_rate": 1.9559700853149997e-05, + "loss": 0.007, "step": 126 }, { - "epoch": 0.125, - "grad_norm": 0.025816919282078743, - "learning_rate": 1.9238795325112867e-05, - "loss": 0.0005, + "epoch": 0.09556057185854025, + "grad_norm": 1.9277138710021973, + "learning_rate": 1.9552737018871543e-05, + "loss": 0.0393, "step": 127 }, { - "epoch": 0.12598425196850394, - "grad_norm": 5.089158058166504, - "learning_rate": 1.9226918150848067e-05, - "loss": 0.1138, + "epoch": 0.0963130173062453, + "grad_norm": 0.0008740609628148377, + "learning_rate": 1.954571980473642e-05, + "loss": 0.0001, "step": 128 }, { - "epoch": 0.12696850393700787, - "grad_norm": 4.072977542877197, - "learning_rate": 1.9214952756258625e-05, - "loss": 0.1086, + "epoch": 0.09706546275395034, + "grad_norm": 0.03682279586791992, + "learning_rate": 1.953864924995621e-05, + "loss": 0.0015, "step": 129 }, { - "epoch": 0.1279527559055118, - "grad_norm": 0.029234636574983597, - "learning_rate": 1.9202899255747967e-05, - "loss": 0.0015, + "epoch": 0.09781790820165538, + "grad_norm": 0.03125663846731186, + "learning_rate": 1.9531525394040546e-05, + "loss": 0.0008, "step": 130 }, { - "epoch": 0.12893700787401574, - "grad_norm": 0.7469910383224487, - "learning_rate": 1.9190757764561905e-05, - "loss": 0.0133, + "epoch": 0.09857035364936043, + "grad_norm": 0.0102821821346879, + "learning_rate": 1.9524348276796913e-05, + "loss": 0.0004, "step": 131 }, { - "epoch": 0.12992125984251968, - "grad_norm": 3.471719741821289, - "learning_rate": 1.9178528398787553e-05, - "loss": 0.1318, + "epoch": 0.09932279909706546, + "grad_norm": 0.07433848083019257, + "learning_rate": 1.9517117938330414e-05, + "loss": 0.0029, "step": 132 }, { - "epoch": 0.1309055118110236, - "grad_norm": 0.909016489982605, - "learning_rate": 1.916621127535221e-05, - "loss": 0.0145, + "epoch": 0.1000752445447705, + "grad_norm": 0.31897610425949097, + "learning_rate": 1.9509834419043544e-05, + "loss": 0.0816, "step": 133 }, { - "epoch": 0.13188976377952755, - "grad_norm": 0.0073660132475197315, - "learning_rate": 1.9153806512022248e-05, - "loss": 0.0004, + "epoch": 0.10082768999247554, + "grad_norm": 0.042768437415361404, + "learning_rate": 1.9502497759635973e-05, + "loss": 0.0017, "step": 134 }, { - "epoch": 0.1328740157480315, - "grad_norm": 0.06388348340988159, - "learning_rate": 1.914131422740197e-05, - "loss": 0.0008, + "epoch": 0.10158013544018059, + "grad_norm": 0.27461105585098267, + "learning_rate": 1.9495108001104312e-05, + "loss": 0.0972, "step": 135 }, { - "epoch": 0.13385826771653545, - "grad_norm": 0.276848167181015, - "learning_rate": 1.9128734540932494e-05, - "loss": 0.0064, + "epoch": 0.10233258088788563, + "grad_norm": 0.012465243227779865, + "learning_rate": 1.9487665184741878e-05, + "loss": 0.0006, "step": 136 }, { - "epoch": 0.13484251968503938, - "grad_norm": 1.6468451023101807, - "learning_rate": 1.9116067572890603e-05, - "loss": 0.155, + "epoch": 0.10308502633559068, + "grad_norm": 0.042917292565107346, + "learning_rate": 1.9480169352138473e-05, + "loss": 0.0014, "step": 137 }, { - "epoch": 0.13582677165354332, - "grad_norm": 0.0815034881234169, - "learning_rate": 1.9103313444387595e-05, - "loss": 0.0022, + "epoch": 0.1038374717832957, + "grad_norm": 0.06652813404798508, + "learning_rate": 1.9472620545180165e-05, + "loss": 0.0018, "step": 138 }, { - "epoch": 0.13681102362204725, - "grad_norm": 2.26296067237854, - "learning_rate": 1.9090472277368124e-05, - "loss": 0.0633, + "epoch": 0.10458991723100075, + "grad_norm": 3.4145336151123047, + "learning_rate": 1.9465018806049014e-05, + "loss": 0.0251, "step": 139 }, { - "epoch": 0.1377952755905512, - "grad_norm": 0.006262979470193386, - "learning_rate": 1.907754419460904e-05, - "loss": 0.0002, + "epoch": 0.1053423626787058, + "grad_norm": 0.5462217926979065, + "learning_rate": 1.9457364177222877e-05, + "loss": 0.1197, "step": 140 }, { - "epoch": 0.13877952755905512, - "grad_norm": 0.015910493209958076, - "learning_rate": 1.9064529319718206e-05, - "loss": 0.0005, + "epoch": 0.10609480812641084, + "grad_norm": 0.00809057243168354, + "learning_rate": 1.9449656701475147e-05, + "loss": 0.0004, "step": 141 }, { - "epoch": 0.13976377952755906, - "grad_norm": 0.000633381016086787, - "learning_rate": 1.9051427777133328e-05, - "loss": 0.0001, + "epoch": 0.10684725357411588, + "grad_norm": 0.03647633641958237, + "learning_rate": 1.944189642187452e-05, + "loss": 0.0008, "step": 142 }, { - "epoch": 0.140748031496063, - "grad_norm": 0.06496637314558029, - "learning_rate": 1.903823969212075e-05, - "loss": 0.0027, + "epoch": 0.10759969902182091, + "grad_norm": 0.007750968914479017, + "learning_rate": 1.9434083381784764e-05, + "loss": 0.0002, "step": 143 }, { - "epoch": 0.14173228346456693, - "grad_norm": 0.011212184093892574, - "learning_rate": 1.9024965190774262e-05, - "loss": 0.0003, + "epoch": 0.10835214446952596, + "grad_norm": 0.002081293612718582, + "learning_rate": 1.9426217624864456e-05, + "loss": 0.0001, "step": 144 }, { - "epoch": 0.14271653543307086, - "grad_norm": 0.04856955260038376, - "learning_rate": 1.901160440001392e-05, - "loss": 0.0018, + "epoch": 0.109104589917231, + "grad_norm": 2.682119131088257, + "learning_rate": 1.9418299195066755e-05, + "loss": 0.1876, "step": 145 }, { - "epoch": 0.1437007874015748, - "grad_norm": 0.0017084332648664713, - "learning_rate": 1.899815744758478e-05, - "loss": 0.0001, + "epoch": 0.10985703536493605, + "grad_norm": 0.21694301068782806, + "learning_rate": 1.9410328136639163e-05, + "loss": 0.0035, "step": 146 }, { - "epoch": 0.14468503937007873, - "grad_norm": 0.05552929639816284, - "learning_rate": 1.8984624462055724e-05, - "loss": 0.0009, + "epoch": 0.11060948081264109, + "grad_norm": 0.020226623862981796, + "learning_rate": 1.940230449412324e-05, + "loss": 0.0006, "step": 147 }, { - "epoch": 0.14566929133858267, - "grad_norm": 0.06473974883556366, - "learning_rate": 1.8971005572818213e-05, - "loss": 0.0013, + "epoch": 0.11136192626034612, + "grad_norm": 0.006655423901975155, + "learning_rate": 1.939422831235441e-05, + "loss": 0.0002, "step": 148 }, { - "epoch": 0.1466535433070866, - "grad_norm": 0.0019852975383400917, - "learning_rate": 1.895730091008504e-05, - "loss": 0.0001, + "epoch": 0.11211437170805116, + "grad_norm": 0.011194628663361073, + "learning_rate": 1.938609963646166e-05, + "loss": 0.0005, "step": 149 }, { - "epoch": 0.14763779527559054, - "grad_norm": 0.007983777672052383, - "learning_rate": 1.8943510604889094e-05, - "loss": 0.0003, + "epoch": 0.11286681715575621, + "grad_norm": 1.6491260528564453, + "learning_rate": 1.9377918511867318e-05, + "loss": 0.1711, "step": 150 }, { - "epoch": 0.1486220472440945, - "grad_norm": 0.0025301447603851557, - "learning_rate": 1.8929634789082123e-05, - "loss": 0.0001, + "epoch": 0.11361926260346125, + "grad_norm": 0.20927202701568604, + "learning_rate": 1.9369684984286798e-05, + "loss": 0.0075, "step": 151 }, { - "epoch": 0.14960629921259844, - "grad_norm": 5.2840576171875, - "learning_rate": 1.8915673595333443e-05, - "loss": 0.0331, + "epoch": 0.1143717080511663, + "grad_norm": 0.10048650205135345, + "learning_rate": 1.9361399099728326e-05, + "loss": 0.0033, "step": 152 }, { - "epoch": 0.15059055118110237, - "grad_norm": 0.024088528007268906, - "learning_rate": 1.8901627157128697e-05, - "loss": 0.0009, + "epoch": 0.11512415349887133, + "grad_norm": 0.09928741306066513, + "learning_rate": 1.9353060904492694e-05, + "loss": 0.0038, "step": 153 }, { - "epoch": 0.1515748031496063, - "grad_norm": 7.089936256408691, - "learning_rate": 1.8887495608768557e-05, - "loss": 0.2164, + "epoch": 0.11587659894657637, + "grad_norm": 0.3446449935436249, + "learning_rate": 1.934467044517301e-05, + "loss": 0.0654, "step": 154 }, { - "epoch": 0.15255905511811024, - "grad_norm": 0.41628381609916687, - "learning_rate": 1.8873279085367454e-05, - "loss": 0.0158, + "epoch": 0.11662904439428141, + "grad_norm": 0.007841469720005989, + "learning_rate": 1.9336227768654424e-05, + "loss": 0.0004, "step": 155 }, { - "epoch": 0.15354330708661418, - "grad_norm": 0.001422123983502388, - "learning_rate": 1.8858977722852273e-05, - "loss": 0.0001, + "epoch": 0.11738148984198646, + "grad_norm": 0.24826376140117645, + "learning_rate": 1.9327732922113872e-05, + "loss": 0.006, "step": 156 }, { - "epoch": 0.1545275590551181, - "grad_norm": 0.017981093376874924, - "learning_rate": 1.8844591657961083e-05, - "loss": 0.0004, + "epoch": 0.1181339352896915, + "grad_norm": 2.012798547744751, + "learning_rate": 1.9319185953019817e-05, + "loss": 0.1098, "step": 157 }, { - "epoch": 0.15551181102362205, - "grad_norm": 5.039153575897217, - "learning_rate": 1.883012102824178e-05, - "loss": 0.2792, + "epoch": 0.11888638073739653, + "grad_norm": 0.02779470756649971, + "learning_rate": 1.9310586909131964e-05, + "loss": 0.0009, "step": 158 }, { - "epoch": 0.15649606299212598, - "grad_norm": 0.6117565631866455, - "learning_rate": 1.8815565972050827e-05, - "loss": 0.0101, + "epoch": 0.11963882618510158, + "grad_norm": 3.675586462020874, + "learning_rate": 1.930193583850102e-05, + "loss": 0.5443, "step": 159 }, { - "epoch": 0.15748031496062992, - "grad_norm": 0.003265405772253871, - "learning_rate": 1.8800926628551884e-05, - "loss": 0.0002, + "epoch": 0.12039127163280662, + "grad_norm": 0.009048126637935638, + "learning_rate": 1.9293232789468403e-05, + "loss": 0.0005, "step": 160 }, { - "epoch": 0.15846456692913385, - "grad_norm": 0.01255448441952467, - "learning_rate": 1.8786203137714514e-05, - "loss": 0.0003, + "epoch": 0.12114371708051166, + "grad_norm": 0.00870896689593792, + "learning_rate": 1.9284477810666e-05, + "loss": 0.0004, "step": 161 }, { - "epoch": 0.1594488188976378, - "grad_norm": 0.05142652988433838, - "learning_rate": 1.877139564031282e-05, - "loss": 0.0015, + "epoch": 0.12189616252821671, + "grad_norm": 0.01799067109823227, + "learning_rate": 1.9275670951015854e-05, + "loss": 0.0009, "step": 162 }, { - "epoch": 0.16043307086614172, - "grad_norm": 0.5440412163734436, - "learning_rate": 1.8756504277924104e-05, - "loss": 0.0094, + "epoch": 0.12264860797592174, + "grad_norm": 0.07708598673343658, + "learning_rate": 1.9266812259729927e-05, + "loss": 0.0032, "step": 163 }, { - "epoch": 0.16141732283464566, - "grad_norm": 2.8627021312713623, - "learning_rate": 1.8741529192927528e-05, - "loss": 0.5114, + "epoch": 0.12340105342362678, + "grad_norm": 0.012997242622077465, + "learning_rate": 1.9257901786309813e-05, + "loss": 0.0003, "step": 164 }, { - "epoch": 0.1624015748031496, - "grad_norm": 0.006565171759575605, - "learning_rate": 1.8726470528502737e-05, - "loss": 0.0003, + "epoch": 0.12415349887133183, + "grad_norm": 0.08036967366933823, + "learning_rate": 1.9248939580546453e-05, + "loss": 0.0021, "step": 165 }, { - "epoch": 0.16338582677165353, - "grad_norm": 0.021097129210829735, - "learning_rate": 1.8711328428628492e-05, - "loss": 0.0006, + "epoch": 0.12490594431903687, + "grad_norm": 0.08968926221132278, + "learning_rate": 1.9239925692519867e-05, + "loss": 0.0036, "step": 166 }, { - "epoch": 0.1643700787401575, - "grad_norm": 0.05373897776007652, - "learning_rate": 1.8696103038081297e-05, - "loss": 0.0024, + "epoch": 0.1256583897667419, + "grad_norm": 0.10857436060905457, + "learning_rate": 1.923086017259887e-05, + "loss": 0.0045, "step": 167 }, { - "epoch": 0.16535433070866143, - "grad_norm": 0.02692520059645176, - "learning_rate": 1.8680794502434018e-05, - "loss": 0.0012, + "epoch": 0.12641083521444696, + "grad_norm": 0.33034518361091614, + "learning_rate": 1.9221743071440792e-05, + "loss": 0.012, "step": 168 }, { - "epoch": 0.16633858267716536, - "grad_norm": 3.5850143432617188, - "learning_rate": 1.8665402968054485e-05, - "loss": 0.0908, + "epoch": 0.127163280662152, + "grad_norm": 0.020488563925027847, + "learning_rate": 1.92125744399912e-05, + "loss": 0.0011, "step": 169 }, { - "epoch": 0.1673228346456693, - "grad_norm": 1.8069084882736206, - "learning_rate": 1.8649928582104097e-05, - "loss": 0.0431, + "epoch": 0.12791572610985705, + "grad_norm": 1.1725200414657593, + "learning_rate": 1.9203354329483593e-05, + "loss": 0.2263, "step": 170 }, { - "epoch": 0.16830708661417323, - "grad_norm": 0.02186671830713749, - "learning_rate": 1.86343714925364e-05, - "loss": 0.001, + "epoch": 0.12866817155756208, + "grad_norm": 0.009000587277114391, + "learning_rate": 1.9194082791439146e-05, + "loss": 0.0005, "step": 171 }, { - "epoch": 0.16929133858267717, - "grad_norm": 0.0052742431871593, - "learning_rate": 1.8618731848095706e-05, - "loss": 0.0003, + "epoch": 0.1294206170052671, + "grad_norm": 0.15425604581832886, + "learning_rate": 1.9184759877666403e-05, + "loss": 0.006, "step": 172 }, { - "epoch": 0.1702755905511811, - "grad_norm": 3.9548587799072266, - "learning_rate": 1.8603009798315633e-05, - "loss": 0.1289, + "epoch": 0.13017306245297217, + "grad_norm": 0.04585198312997818, + "learning_rate": 1.917538564026098e-05, + "loss": 0.0011, "step": 173 }, { - "epoch": 0.17125984251968504, - "grad_norm": 2.617983818054199, - "learning_rate": 1.8587205493517703e-05, - "loss": 0.1487, + "epoch": 0.1309255079006772, + "grad_norm": 0.012005002237856388, + "learning_rate": 1.9165960131605304e-05, + "loss": 0.0007, "step": 174 }, { - "epoch": 0.17224409448818898, - "grad_norm": 2.227367401123047, - "learning_rate": 1.8571319084809884e-05, - "loss": 0.1262, + "epoch": 0.13167795334838225, + "grad_norm": 0.04149067774415016, + "learning_rate": 1.915648340436828e-05, + "loss": 0.0017, "step": 175 }, { - "epoch": 0.1732283464566929, - "grad_norm": 2.0675241947174072, - "learning_rate": 1.855535072408516e-05, - "loss": 0.2527, + "epoch": 0.13243039879608728, + "grad_norm": 0.13023491203784943, + "learning_rate": 1.9146955511505035e-05, + "loss": 0.0047, "step": 176 }, { - "epoch": 0.17421259842519685, - "grad_norm": 1.5445764064788818, - "learning_rate": 1.853930056402008e-05, - "loss": 0.0511, + "epoch": 0.13318284424379231, + "grad_norm": 1.0637022256851196, + "learning_rate": 1.91373765062566e-05, + "loss": 0.2339, "step": 177 }, { - "epoch": 0.17519685039370078, - "grad_norm": 0.09199322015047073, - "learning_rate": 1.8523168758073283e-05, - "loss": 0.0054, + "epoch": 0.13393528969149737, + "grad_norm": 0.6275419592857361, + "learning_rate": 1.9127746442149612e-05, + "loss": 0.1668, "step": 178 }, { - "epoch": 0.17618110236220472, - "grad_norm": 0.2140803039073944, - "learning_rate": 1.8506955460484042e-05, - "loss": 0.0105, + "epoch": 0.1346877351392024, + "grad_norm": 0.009538069367408752, + "learning_rate": 1.9118065372996027e-05, + "loss": 0.0006, "step": 179 }, { - "epoch": 0.17716535433070865, - "grad_norm": 0.021860981360077858, - "learning_rate": 1.849066082627079e-05, - "loss": 0.0009, + "epoch": 0.13544018058690746, + "grad_norm": 0.01008265744894743, + "learning_rate": 1.910833335289281e-05, + "loss": 0.0006, "step": 180 }, { - "epoch": 0.1781496062992126, - "grad_norm": 0.11479674279689789, - "learning_rate": 1.847428501122963e-05, - "loss": 0.0053, + "epoch": 0.1361926260346125, + "grad_norm": 0.030792739242315292, + "learning_rate": 1.9098550436221636e-05, + "loss": 0.0014, "step": 181 }, { - "epoch": 0.17913385826771652, - "grad_norm": 0.10818558186292648, - "learning_rate": 1.845782817193286e-05, - "loss": 0.0055, + "epoch": 0.13694507148231752, + "grad_norm": 0.010570832528173923, + "learning_rate": 1.9088716677648583e-05, + "loss": 0.0005, "step": 182 }, { - "epoch": 0.18011811023622049, - "grad_norm": 1.6669580936431885, - "learning_rate": 1.844129046572745e-05, - "loss": 0.0359, + "epoch": 0.13769751693002258, + "grad_norm": 0.7934188842773438, + "learning_rate": 1.9078832132123833e-05, + "loss": 0.0201, "step": 183 }, { - "epoch": 0.18110236220472442, - "grad_norm": 0.07271285355091095, - "learning_rate": 1.8424672050733577e-05, - "loss": 0.0037, + "epoch": 0.1384499623777276, + "grad_norm": 0.004436484072357416, + "learning_rate": 1.9068896854881364e-05, + "loss": 0.0002, "step": 184 }, { - "epoch": 0.18208661417322836, - "grad_norm": 1.0394022464752197, - "learning_rate": 1.8407973085843066e-05, - "loss": 0.1121, + "epoch": 0.13920240782543267, + "grad_norm": 0.025362318381667137, + "learning_rate": 1.9058910901438628e-05, + "loss": 0.0017, "step": 185 }, { - "epoch": 0.1830708661417323, - "grad_norm": 1.8798764944076538, - "learning_rate": 1.839119373071791e-05, - "loss": 0.0917, + "epoch": 0.1399548532731377, + "grad_norm": 0.007729747798293829, + "learning_rate": 1.904887432759626e-05, + "loss": 0.0004, "step": 186 }, { - "epoch": 0.18405511811023623, - "grad_norm": 0.0645948201417923, - "learning_rate": 1.8374334145788723e-05, - "loss": 0.0023, + "epoch": 0.14070729872084273, + "grad_norm": 2.8755412101745605, + "learning_rate": 1.9038787189437752e-05, + "loss": 0.1614, "step": 187 }, { - "epoch": 0.18503937007874016, - "grad_norm": 1.8646386861801147, - "learning_rate": 1.8357394492253216e-05, - "loss": 0.132, + "epoch": 0.14145974416854779, + "grad_norm": 2.1707630157470703, + "learning_rate": 1.902864954332915e-05, + "loss": 0.3061, "step": 188 }, { - "epoch": 0.1860236220472441, - "grad_norm": 0.2028031051158905, - "learning_rate": 1.8340374932074646e-05, - "loss": 0.0078, + "epoch": 0.14221218961625282, + "grad_norm": 0.5352933406829834, + "learning_rate": 1.9018461445918727e-05, + "loss": 0.1411, "step": 189 }, { - "epoch": 0.18700787401574803, - "grad_norm": 1.2554411888122559, - "learning_rate": 1.8323275627980272e-05, - "loss": 0.0794, + "epoch": 0.14296463506395787, + "grad_norm": 0.44579315185546875, + "learning_rate": 1.900822295413668e-05, + "loss": 0.1438, "step": 190 }, { - "epoch": 0.18799212598425197, - "grad_norm": 0.018183542415499687, - "learning_rate": 1.8306096743459808e-05, - "loss": 0.0008, + "epoch": 0.1437170805116629, + "grad_norm": 2.404839038848877, + "learning_rate": 1.8997934125194806e-05, + "loss": 0.2562, "step": 191 }, { - "epoch": 0.1889763779527559, - "grad_norm": 0.40241336822509766, - "learning_rate": 1.8288838442763838e-05, - "loss": 0.0147, + "epoch": 0.14446952595936793, + "grad_norm": 2.7213966846466064, + "learning_rate": 1.898759501658618e-05, + "loss": 0.1053, "step": 192 }, { - "epoch": 0.18996062992125984, - "grad_norm": 0.035246290266513824, - "learning_rate": 1.8271500890902277e-05, - "loss": 0.0017, + "epoch": 0.145221971407073, + "grad_norm": 1.2511435747146606, + "learning_rate": 1.8977205686084828e-05, + "loss": 0.1214, "step": 193 }, { - "epoch": 0.19094488188976377, - "grad_norm": 2.249130964279175, - "learning_rate": 1.825408425364276e-05, - "loss": 0.121, + "epoch": 0.14597441685477802, + "grad_norm": 0.9504565000534058, + "learning_rate": 1.8966766191745423e-05, + "loss": 0.136, "step": 194 }, { - "epoch": 0.1919291338582677, - "grad_norm": 0.7878203392028809, - "learning_rate": 1.8236588697509082e-05, - "loss": 0.0394, + "epoch": 0.14672686230248308, + "grad_norm": 0.053916994482278824, + "learning_rate": 1.895627659190294e-05, + "loss": 0.0035, "step": 195 }, { - "epoch": 0.19291338582677164, - "grad_norm": 0.18222226202487946, - "learning_rate": 1.8219014389779586e-05, - "loss": 0.0093, + "epoch": 0.1474793077501881, + "grad_norm": 0.21418577432632446, + "learning_rate": 1.8945736945172345e-05, + "loss": 0.0128, "step": 196 }, { - "epoch": 0.19389763779527558, - "grad_norm": 0.2684551477432251, - "learning_rate": 1.820136149848559e-05, - "loss": 0.0182, + "epoch": 0.14823175319789314, + "grad_norm": 0.22742077708244324, + "learning_rate": 1.8935147310448258e-05, + "loss": 0.011, "step": 197 }, { - "epoch": 0.19488188976377951, - "grad_norm": 3.1357297897338867, - "learning_rate": 1.8183630192409746e-05, - "loss": 0.1391, + "epoch": 0.1489841986455982, + "grad_norm": 0.06237643212080002, + "learning_rate": 1.8924507746904628e-05, + "loss": 0.0035, "step": 198 }, { - "epoch": 0.19586614173228348, - "grad_norm": 1.3211259841918945, - "learning_rate": 1.8165820641084458e-05, - "loss": 0.0907, + "epoch": 0.14973664409330323, + "grad_norm": 0.027896396815776825, + "learning_rate": 1.8913818313994406e-05, + "loss": 0.0017, "step": 199 }, { - "epoch": 0.1968503937007874, - "grad_norm": 2.953669309616089, - "learning_rate": 1.8147933014790245e-05, - "loss": 0.1264, + "epoch": 0.1504890895410083, + "grad_norm": 0.021885687485337257, + "learning_rate": 1.8903079071449206e-05, + "loss": 0.0013, "step": 200 }, { - "epoch": 0.19783464566929135, - "grad_norm": 0.003996099345386028, - "learning_rate": 1.8129967484554116e-05, - "loss": 0.0002, + "epoch": 0.15124153498871332, + "grad_norm": 0.004547674674540758, + "learning_rate": 1.889229007927897e-05, + "loss": 0.0003, "step": 201 }, { - "epoch": 0.19881889763779528, - "grad_norm": 0.004197091795504093, - "learning_rate": 1.8111924222147927e-05, - "loss": 0.0002, + "epoch": 0.15199398043641835, + "grad_norm": 0.08865097910165787, + "learning_rate": 1.8881451397771647e-05, + "loss": 0.0046, "step": 202 }, { - "epoch": 0.19980314960629922, - "grad_norm": 0.008141039870679379, - "learning_rate": 1.8093803400086757e-05, - "loss": 0.0003, + "epoch": 0.1527464258841234, + "grad_norm": 0.13727599382400513, + "learning_rate": 1.887056308749283e-05, + "loss": 0.0075, "step": 203 }, { - "epoch": 0.20078740157480315, - "grad_norm": 0.024663496762514114, - "learning_rate": 1.8075605191627242e-05, - "loss": 0.001, + "epoch": 0.15349887133182843, + "grad_norm": 0.6231827735900879, + "learning_rate": 1.885962520928545e-05, + "loss": 0.028, "step": 204 }, { - "epoch": 0.2017716535433071, - "grad_norm": 0.014869451522827148, - "learning_rate": 1.805732977076592e-05, - "loss": 0.0004, + "epoch": 0.1542513167795335, + "grad_norm": 0.1673537939786911, + "learning_rate": 1.88486378242694e-05, + "loss": 0.0099, "step": 205 }, { - "epoch": 0.20275590551181102, - "grad_norm": 0.0006046657217666507, - "learning_rate": 1.8038977312237583e-05, - "loss": 0.0001, + "epoch": 0.15500376222723852, + "grad_norm": 0.008311329409480095, + "learning_rate": 1.8837600993841237e-05, + "loss": 0.0005, "step": 206 }, { - "epoch": 0.20374015748031496, - "grad_norm": 1.34624445438385, - "learning_rate": 1.8020547991513583e-05, - "loss": 0.1106, + "epoch": 0.15575620767494355, + "grad_norm": 0.1961093544960022, + "learning_rate": 1.8826514779673792e-05, + "loss": 0.01, "step": 207 }, { - "epoch": 0.2047244094488189, - "grad_norm": 1.5433149337768555, - "learning_rate": 1.8002041984800173e-05, - "loss": 0.184, + "epoch": 0.1565086531226486, + "grad_norm": 0.3649809658527374, + "learning_rate": 1.881537924371586e-05, + "loss": 0.1412, "step": 208 }, { - "epoch": 0.20570866141732283, - "grad_norm": 0.0008625572081655264, - "learning_rate": 1.7983459469036813e-05, - "loss": 0.0001, + "epoch": 0.15726109857035364, + "grad_norm": 1.2528727054595947, + "learning_rate": 1.8804194448191843e-05, + "loss": 0.1621, "step": 209 }, { - "epoch": 0.20669291338582677, - "grad_norm": 0.016304846853017807, - "learning_rate": 1.796480062189448e-05, - "loss": 0.0007, + "epoch": 0.1580135440180587, + "grad_norm": 0.0017916648648679256, + "learning_rate": 1.8792960455601396e-05, + "loss": 0.0001, "step": 210 }, { - "epoch": 0.2076771653543307, - "grad_norm": 0.22865940630435944, - "learning_rate": 1.794606562177398e-05, - "loss": 0.0109, + "epoch": 0.15876598946576373, + "grad_norm": 0.2502264976501465, + "learning_rate": 1.8781677328719078e-05, + "loss": 0.01, "step": 211 }, { - "epoch": 0.20866141732283464, - "grad_norm": 0.17081470787525177, - "learning_rate": 1.792725464780421e-05, - "loss": 0.0041, + "epoch": 0.1595184349134688, + "grad_norm": 0.3349985182285309, + "learning_rate": 1.8770345130594017e-05, + "loss": 0.0182, "step": 212 }, { - "epoch": 0.20964566929133857, - "grad_norm": 2.3666884899139404, - "learning_rate": 1.7908367879840484e-05, - "loss": 0.1146, + "epoch": 0.16027088036117382, + "grad_norm": 0.007236495614051819, + "learning_rate": 1.875896392454955e-05, + "loss": 0.0005, "step": 213 }, { - "epoch": 0.2106299212598425, - "grad_norm": 0.15037871897220612, - "learning_rate": 1.78894054984628e-05, - "loss": 0.0052, + "epoch": 0.16102332580887885, + "grad_norm": 0.7724468111991882, + "learning_rate": 1.8747533774182845e-05, + "loss": 0.0307, "step": 214 }, { - "epoch": 0.21161417322834647, - "grad_norm": 0.11735182255506516, - "learning_rate": 1.78703676849741e-05, - "loss": 0.0055, + "epoch": 0.1617757712565839, + "grad_norm": 0.0025126487016677856, + "learning_rate": 1.8736054743364587e-05, + "loss": 0.0001, "step": 215 }, { - "epoch": 0.2125984251968504, - "grad_norm": 0.01386881060898304, - "learning_rate": 1.785125462139855e-05, - "loss": 0.0005, + "epoch": 0.16252821670428894, + "grad_norm": 0.1561364233493805, + "learning_rate": 1.872452689623859e-05, + "loss": 0.0034, "step": 216 }, { - "epoch": 0.21358267716535434, - "grad_norm": 0.013034471310675144, - "learning_rate": 1.7832066490479797e-05, - "loss": 0.0004, + "epoch": 0.163280662151994, + "grad_norm": 0.004739896394312382, + "learning_rate": 1.8712950297221457e-05, + "loss": 0.0003, "step": 217 }, { - "epoch": 0.21456692913385828, - "grad_norm": 0.03758182004094124, - "learning_rate": 1.7812803475679224e-05, - "loss": 0.0014, + "epoch": 0.16403310759969902, + "grad_norm": 0.5904443264007568, + "learning_rate": 1.8701325011002204e-05, + "loss": 0.1231, "step": 218 }, { - "epoch": 0.2155511811023622, - "grad_norm": 0.0030961865559220314, - "learning_rate": 1.7793465761174184e-05, + "epoch": 0.16478555304740405, + "grad_norm": 0.002162502147257328, + "learning_rate": 1.8689651102541915e-05, "loss": 0.0001, "step": 219 }, { - "epoch": 0.21653543307086615, - "grad_norm": 5.2172654250171036e-05, - "learning_rate": 1.7774053531856258e-05, - "loss": 0.0, + "epoch": 0.1655379984951091, + "grad_norm": 0.022224850952625275, + "learning_rate": 1.8677928637073367e-05, + "loss": 0.0007, "step": 220 }, { - "epoch": 0.21751968503937008, - "grad_norm": 0.2028627246618271, - "learning_rate": 1.7754566973329478e-05, - "loss": 0.0054, + "epoch": 0.16629044394281414, + "grad_norm": 0.0014565088786184788, + "learning_rate": 1.8666157680100663e-05, + "loss": 0.0001, "step": 221 }, { - "epoch": 0.21850393700787402, - "grad_norm": 3.40848970413208, - "learning_rate": 1.773500627190854e-05, - "loss": 0.2228, + "epoch": 0.1670428893905192, + "grad_norm": 0.0017851298907771707, + "learning_rate": 1.865433829739888e-05, + "loss": 0.0001, "step": 222 }, { - "epoch": 0.21948818897637795, - "grad_norm": 0.05393334478139877, - "learning_rate": 1.7715371614617044e-05, - "loss": 0.0019, + "epoch": 0.16779533483822423, + "grad_norm": 0.16844072937965393, + "learning_rate": 1.8642470555013695e-05, + "loss": 0.0023, "step": 223 }, { - "epoch": 0.2204724409448819, - "grad_norm": 2.155111312866211, - "learning_rate": 1.7695663189185703e-05, - "loss": 0.1049, + "epoch": 0.16854778028592926, + "grad_norm": 3.2930855751037598, + "learning_rate": 1.8630554519261007e-05, + "loss": 0.0823, "step": 224 }, { - "epoch": 0.22145669291338582, - "grad_norm": 4.374336242675781, - "learning_rate": 1.7675881184050518e-05, - "loss": 0.1378, + "epoch": 0.16930022573363432, + "grad_norm": 0.00976943876594305, + "learning_rate": 1.8618590256726587e-05, + "loss": 0.0006, "step": 225 }, { - "epoch": 0.22244094488188976, - "grad_norm": 0.0039710612036287785, - "learning_rate": 1.765602578835102e-05, - "loss": 0.0001, + "epoch": 0.17005267118133935, + "grad_norm": 0.3796117901802063, + "learning_rate": 1.860657783426568e-05, + "loss": 0.1048, "step": 226 }, { - "epoch": 0.2234251968503937, - "grad_norm": 0.12996165454387665, - "learning_rate": 1.7636097191928437e-05, - "loss": 0.0055, + "epoch": 0.1708051166290444, + "grad_norm": 2.5069665908813477, + "learning_rate": 1.8594517319002646e-05, + "loss": 0.3186, "step": 227 }, { - "epoch": 0.22440944881889763, - "grad_norm": 0.7774534821510315, - "learning_rate": 1.7616095585323882e-05, - "loss": 0.0247, + "epoch": 0.17155756207674944, + "grad_norm": 0.05536516755819321, + "learning_rate": 1.8582408778330588e-05, + "loss": 0.0019, "step": 228 }, { - "epoch": 0.22539370078740156, - "grad_norm": 3.976025342941284, - "learning_rate": 1.7596021159776526e-05, - "loss": 0.0942, + "epoch": 0.17231000752445447, + "grad_norm": 0.013733879663050175, + "learning_rate": 1.8570252279910975e-05, + "loss": 0.0005, "step": 229 }, { - "epoch": 0.2263779527559055, - "grad_norm": 0.14758476614952087, - "learning_rate": 1.7575874107221785e-05, - "loss": 0.0064, + "epoch": 0.17306245297215953, + "grad_norm": 0.26580435037612915, + "learning_rate": 1.8558047891673247e-05, + "loss": 0.0078, "step": 230 }, { - "epoch": 0.22736220472440946, - "grad_norm": 0.11664961278438568, - "learning_rate": 1.755565462028947e-05, - "loss": 0.0045, + "epoch": 0.17381489841986456, + "grad_norm": 0.005103581584990025, + "learning_rate": 1.854579568181446e-05, + "loss": 0.0003, "step": 231 }, { - "epoch": 0.2283464566929134, - "grad_norm": 0.008540516719222069, - "learning_rate": 1.7535362892301953e-05, - "loss": 0.0003, + "epoch": 0.1745673438675696, + "grad_norm": 0.8314829468727112, + "learning_rate": 1.8533495718798882e-05, + "loss": 0.0161, "step": 232 }, { - "epoch": 0.22933070866141733, - "grad_norm": 0.004662222228944302, - "learning_rate": 1.7514999117272323e-05, - "loss": 0.0002, + "epoch": 0.17531978931527464, + "grad_norm": 0.04428403079509735, + "learning_rate": 1.8521148071357633e-05, + "loss": 0.0012, "step": 233 }, { - "epoch": 0.23031496062992127, - "grad_norm": 0.007470139302313328, - "learning_rate": 1.749456348990251e-05, - "loss": 0.0003, + "epoch": 0.17607223476297967, + "grad_norm": 0.033717166632413864, + "learning_rate": 1.8508752808488283e-05, + "loss": 0.0014, "step": 234 }, { - "epoch": 0.2312992125984252, - "grad_norm": 1.8820122480392456, - "learning_rate": 1.7474056205581448e-05, - "loss": 0.0366, + "epoch": 0.17682468021068473, + "grad_norm": 0.03402457386255264, + "learning_rate": 1.8496309999454475e-05, + "loss": 0.0013, "step": 235 }, { - "epoch": 0.23228346456692914, - "grad_norm": 0.002202255418524146, - "learning_rate": 1.745347746038319e-05, - "loss": 0.0001, + "epoch": 0.17757712565838976, + "grad_norm": 0.062332432717084885, + "learning_rate": 1.848381971378553e-05, + "loss": 0.0019, "step": 236 }, { - "epoch": 0.23326771653543307, - "grad_norm": 1.1024023294448853, - "learning_rate": 1.7432827451065052e-05, - "loss": 0.1958, + "epoch": 0.17832957110609482, + "grad_norm": 5.1166229248046875, + "learning_rate": 1.8471282021276073e-05, + "loss": 0.1784, "step": 237 }, { - "epoch": 0.234251968503937, - "grad_norm": 0.8766173124313354, - "learning_rate": 1.741210637506571e-05, - "loss": 0.0243, + "epoch": 0.17908201655379985, + "grad_norm": 0.0013702790020033717, + "learning_rate": 1.845869699198563e-05, + "loss": 0.0001, "step": 238 }, { - "epoch": 0.23523622047244094, - "grad_norm": 0.000482661445857957, - "learning_rate": 1.7391314430503322e-05, - "loss": 0.0, + "epoch": 0.17983446200150488, + "grad_norm": 0.01880730129778385, + "learning_rate": 1.844606469623824e-05, + "loss": 0.0005, "step": 239 }, { - "epoch": 0.23622047244094488, - "grad_norm": 0.00958346389234066, - "learning_rate": 1.737045181617364e-05, - "loss": 0.0004, + "epoch": 0.18058690744920994, + "grad_norm": 1.5539264678955078, + "learning_rate": 1.8433385204622067e-05, + "loss": 0.1897, "step": 240 }, { - "epoch": 0.2372047244094488, - "grad_norm": 0.8040327429771423, - "learning_rate": 1.7349518731548095e-05, - "loss": 0.0349, + "epoch": 0.18133935289691497, + "grad_norm": 4.555483341217041, + "learning_rate": 1.842065858798899e-05, + "loss": 0.327, "step": 241 }, { - "epoch": 0.23818897637795275, - "grad_norm": 2.7319023609161377, - "learning_rate": 1.732851537677191e-05, - "loss": 0.2243, + "epoch": 0.18209179834462003, + "grad_norm": 0.014411378651857376, + "learning_rate": 1.8407884917454233e-05, + "loss": 0.0008, "step": 242 }, { - "epoch": 0.23917322834645668, - "grad_norm": 0.0035934855695813894, - "learning_rate": 1.7307441952662162e-05, - "loss": 0.0001, + "epoch": 0.18284424379232506, + "grad_norm": 0.3658387362957001, + "learning_rate": 1.8395064264395945e-05, + "loss": 0.0159, "step": 243 }, { - "epoch": 0.24015748031496062, - "grad_norm": 0.008882607333362103, - "learning_rate": 1.7286298660705877e-05, - "loss": 0.0003, + "epoch": 0.1835966892400301, + "grad_norm": 0.052855461835861206, + "learning_rate": 1.838219670045481e-05, + "loss": 0.0018, "step": 244 }, { - "epoch": 0.24114173228346455, - "grad_norm": 1.4643610715866089, - "learning_rate": 1.7265085703058103e-05, - "loss": 0.1764, + "epoch": 0.18434913468773514, + "grad_norm": 0.03438251465559006, + "learning_rate": 1.836928229753365e-05, + "loss": 0.0012, "step": 245 }, { - "epoch": 0.2421259842519685, - "grad_norm": 0.00039498109254054725, - "learning_rate": 1.724380328253998e-05, - "loss": 0.0, + "epoch": 0.18510158013544017, + "grad_norm": 0.24612069129943848, + "learning_rate": 1.835632112779701e-05, + "loss": 0.1226, "step": 246 }, { - "epoch": 0.24311023622047245, - "grad_norm": 0.7886760234832764, - "learning_rate": 1.7222451602636785e-05, - "loss": 0.0258, + "epoch": 0.18585402558314523, + "grad_norm": 0.04806346446275711, + "learning_rate": 1.8343313263670782e-05, + "loss": 0.0021, "step": 247 }, { - "epoch": 0.2440944881889764, - "grad_norm": 2.236525058746338, - "learning_rate": 1.7201030867496005e-05, - "loss": 0.1745, + "epoch": 0.18660647103085026, + "grad_norm": 0.6806573867797852, + "learning_rate": 1.8330258777841755e-05, + "loss": 0.0208, "step": 248 }, { - "epoch": 0.24507874015748032, - "grad_norm": 1.2629692554473877, - "learning_rate": 1.7179541281925378e-05, - "loss": 0.1318, + "epoch": 0.1873589164785553, + "grad_norm": 0.11799991130828857, + "learning_rate": 1.831715774325726e-05, + "loss": 0.0036, "step": 249 }, { - "epoch": 0.24606299212598426, - "grad_norm": 0.005061782896518707, - "learning_rate": 1.7157983051390926e-05, - "loss": 0.0002, + "epoch": 0.18811136192626035, + "grad_norm": 0.002671067602932453, + "learning_rate": 1.830401023312472e-05, + "loss": 0.0001, "step": 250 }, { - "epoch": 0.2470472440944882, - "grad_norm": 0.00048711083945818245, - "learning_rate": 1.7136356382015007e-05, - "loss": 0.0001, + "epoch": 0.18886380737396538, + "grad_norm": 0.2947131097316742, + "learning_rate": 1.8290816320911285e-05, + "loss": 0.1261, "step": 251 }, { - "epoch": 0.24803149606299213, - "grad_norm": 6.473639011383057, - "learning_rate": 1.711466148057433e-05, - "loss": 0.4129, + "epoch": 0.18961625282167044, + "grad_norm": 1.2162272930145264, + "learning_rate": 1.8277576080343362e-05, + "loss": 0.1268, "step": 252 }, { - "epoch": 0.24901574803149606, - "grad_norm": 0.004619178827852011, - "learning_rate": 1.709289855449799e-05, - "loss": 0.0001, + "epoch": 0.19036869826937547, + "grad_norm": 1.2172619104385376, + "learning_rate": 1.8264289585406266e-05, + "loss": 0.0297, "step": 253 }, { - "epoch": 0.25, - "grad_norm": 0.013872313313186169, - "learning_rate": 1.7071067811865477e-05, - "loss": 0.0004, + "epoch": 0.1911211437170805, + "grad_norm": 0.20552274584770203, + "learning_rate": 1.825095691034376e-05, + "loss": 0.0085, "step": 254 }, { - "epoch": 0.25098425196850394, - "grad_norm": 0.0745028704404831, - "learning_rate": 1.704916946140468e-05, - "loss": 0.0033, + "epoch": 0.19187358916478556, + "grad_norm": 0.008679857477545738, + "learning_rate": 1.8237578129657664e-05, + "loss": 0.0002, "step": 255 }, { - "epoch": 0.25196850393700787, - "grad_norm": 0.0023475021589547396, - "learning_rate": 1.7027203712489902e-05, - "loss": 0.0001, + "epoch": 0.1926260346124906, + "grad_norm": 0.0659710094332695, + "learning_rate": 1.822415331810743e-05, + "loss": 0.002, "step": 256 }, { - "epoch": 0.2529527559055118, - "grad_norm": 0.16909445822238922, - "learning_rate": 1.700517077513987e-05, - "loss": 0.0077, + "epoch": 0.19337848006019565, + "grad_norm": 0.4817584753036499, + "learning_rate": 1.821068255070973e-05, + "loss": 0.1022, "step": 257 }, { - "epoch": 0.25393700787401574, - "grad_norm": 1.422851800918579, - "learning_rate": 1.698307086001569e-05, - "loss": 0.171, + "epoch": 0.19413092550790068, + "grad_norm": 1.5975401401519775, + "learning_rate": 1.819716590273803e-05, + "loss": 0.1331, "step": 258 }, { - "epoch": 0.2549212598425197, - "grad_norm": 1.204367756843567, - "learning_rate": 1.6960904178418874e-05, - "loss": 0.1415, + "epoch": 0.1948833709556057, + "grad_norm": 0.05313832312822342, + "learning_rate": 1.818360344972217e-05, + "loss": 0.0016, "step": 259 }, { - "epoch": 0.2559055118110236, - "grad_norm": 0.32935601472854614, - "learning_rate": 1.6938670942289292e-05, - "loss": 0.0162, + "epoch": 0.19563581640331076, + "grad_norm": 0.02122344821691513, + "learning_rate": 1.8169995267447953e-05, + "loss": 0.0008, "step": 260 }, { - "epoch": 0.25688976377952755, - "grad_norm": 0.036025624722242355, - "learning_rate": 1.691637136420315e-05, - "loss": 0.0005, + "epoch": 0.1963882618510158, + "grad_norm": 0.0018245746614411473, + "learning_rate": 1.8156341431956706e-05, + "loss": 0.0001, "step": 261 }, { - "epoch": 0.2578740157480315, - "grad_norm": 0.5372671484947205, - "learning_rate": 1.689400565737098e-05, - "loss": 0.0224, + "epoch": 0.19714070729872085, + "grad_norm": 0.07786925882101059, + "learning_rate": 1.814264201954486e-05, + "loss": 0.0018, "step": 262 }, { - "epoch": 0.2588582677165354, - "grad_norm": 0.5319058299064636, - "learning_rate": 1.6871574035635562e-05, - "loss": 0.0245, + "epoch": 0.19789315274642588, + "grad_norm": 0.46468350291252136, + "learning_rate": 1.812889710676354e-05, + "loss": 0.013, "step": 263 }, { - "epoch": 0.25984251968503935, - "grad_norm": 0.015808330848813057, - "learning_rate": 1.6849076713469914e-05, - "loss": 0.0003, + "epoch": 0.1986455981941309, + "grad_norm": 0.4126303493976593, + "learning_rate": 1.811510677041811e-05, + "loss": 0.0055, "step": 264 }, { - "epoch": 0.2608267716535433, - "grad_norm": 0.1305120587348938, - "learning_rate": 1.6826513905975223e-05, - "loss": 0.0039, + "epoch": 0.19939804364183597, + "grad_norm": 0.20423245429992676, + "learning_rate": 1.8101271087567753e-05, + "loss": 0.1157, "step": 265 }, { - "epoch": 0.2618110236220472, - "grad_norm": 1.2115839719772339, - "learning_rate": 1.6803885828878798e-05, - "loss": 0.1593, + "epoch": 0.200150489089541, + "grad_norm": 0.9413779377937317, + "learning_rate": 1.8087390135525056e-05, + "loss": 0.0718, "step": 266 }, { - "epoch": 0.26279527559055116, - "grad_norm": 0.6236674785614014, - "learning_rate": 1.6781192698532e-05, - "loss": 0.0538, + "epoch": 0.20090293453724606, + "grad_norm": 0.5326927304267883, + "learning_rate": 1.8073463991855562e-05, + "loss": 0.0053, "step": 267 }, { - "epoch": 0.2637795275590551, - "grad_norm": 0.16451089084148407, - "learning_rate": 1.6758434731908178e-05, - "loss": 0.0035, + "epoch": 0.2016553799849511, + "grad_norm": 0.2141987830400467, + "learning_rate": 1.8059492734377342e-05, + "loss": 0.1024, "step": 268 }, { - "epoch": 0.26476377952755903, - "grad_norm": 2.773745536804199, - "learning_rate": 1.673561214660059e-05, - "loss": 0.1588, + "epoch": 0.20240782543265612, + "grad_norm": 4.482271194458008, + "learning_rate": 1.8045476441160552e-05, + "loss": 0.3966, "step": 269 }, { - "epoch": 0.265748031496063, - "grad_norm": 0.011931652203202248, - "learning_rate": 1.671272516082033e-05, - "loss": 0.0005, + "epoch": 0.20316027088036118, + "grad_norm": 0.001211398164741695, + "learning_rate": 1.8031415190527016e-05, + "loss": 0.0001, "step": 270 }, { - "epoch": 0.26673228346456695, - "grad_norm": 2.069915294647217, - "learning_rate": 1.6689773993394234e-05, - "loss": 0.0438, + "epoch": 0.2039127163280662, + "grad_norm": 0.06283948570489883, + "learning_rate": 1.8017309061049767e-05, + "loss": 0.0014, "step": 271 }, { - "epoch": 0.2677165354330709, - "grad_norm": 1.2944964170455933, - "learning_rate": 1.6666758863762796e-05, - "loss": 0.0732, + "epoch": 0.20466516177577126, + "grad_norm": 0.655404806137085, + "learning_rate": 1.8003158131552615e-05, + "loss": 0.0746, "step": 272 }, { - "epoch": 0.2687007874015748, - "grad_norm": 0.05713488534092903, - "learning_rate": 1.6643679991978056e-05, - "loss": 0.0019, + "epoch": 0.2054176072234763, + "grad_norm": 0.010463729500770569, + "learning_rate": 1.7988962481109716e-05, + "loss": 0.0003, "step": 273 }, { - "epoch": 0.26968503937007876, - "grad_norm": 0.009257754310965538, - "learning_rate": 1.662053759870151e-05, - "loss": 0.0004, + "epoch": 0.20617005267118135, + "grad_norm": 0.0010110132861882448, + "learning_rate": 1.7974722189045126e-05, + "loss": 0.0001, "step": 274 }, { - "epoch": 0.2706692913385827, - "grad_norm": 0.12434358894824982, - "learning_rate": 1.6597331905202e-05, - "loss": 0.0043, + "epoch": 0.20692249811888638, + "grad_norm": 0.1091262623667717, + "learning_rate": 1.7960437334932334e-05, + "loss": 0.0042, "step": 275 }, { - "epoch": 0.27165354330708663, - "grad_norm": 2.276559352874756, - "learning_rate": 1.657406313335358e-05, - "loss": 0.0906, + "epoch": 0.2076749435665914, + "grad_norm": 5.209902286529541, + "learning_rate": 1.7946107998593867e-05, + "loss": 0.0551, "step": 276 }, { - "epoch": 0.27263779527559057, - "grad_norm": 0.04692010581493378, - "learning_rate": 1.655073150563343e-05, - "loss": 0.0022, + "epoch": 0.20842738901429647, + "grad_norm": 0.00470168748870492, + "learning_rate": 1.7931734260100792e-05, + "loss": 0.0002, "step": 277 }, { - "epoch": 0.2736220472440945, - "grad_norm": 0.35286766290664673, - "learning_rate": 1.6527337245119678e-05, - "loss": 0.0199, + "epoch": 0.2091798344620015, + "grad_norm": 0.7987887263298035, + "learning_rate": 1.7917316199772296e-05, + "loss": 0.0129, "step": 278 }, { - "epoch": 0.27460629921259844, - "grad_norm": 0.047983959317207336, - "learning_rate": 1.6503880575489324e-05, - "loss": 0.0013, + "epoch": 0.20993227990970656, + "grad_norm": 0.021809808909893036, + "learning_rate": 1.7902853898175244e-05, + "loss": 0.0011, "step": 279 }, { - "epoch": 0.2755905511811024, - "grad_norm": 0.14280354976654053, - "learning_rate": 1.6480361721016053e-05, - "loss": 0.0043, + "epoch": 0.2106847253574116, + "grad_norm": 0.3490143120288849, + "learning_rate": 1.7888347436123707e-05, + "loss": 0.1188, "step": 280 }, { - "epoch": 0.2765748031496063, - "grad_norm": 1.407649278640747, - "learning_rate": 1.645678090656812e-05, - "loss": 0.0304, + "epoch": 0.21143717080511662, + "grad_norm": 0.6932041645050049, + "learning_rate": 1.7873796894678514e-05, + "loss": 0.0158, "step": 281 }, { - "epoch": 0.27755905511811024, - "grad_norm": 0.007997879758477211, - "learning_rate": 1.6433138357606198e-05, + "epoch": 0.21218961625282168, + "grad_norm": 0.006826372817158699, + "learning_rate": 1.7859202355146826e-05, "loss": 0.0003, "step": 282 }, { - "epoch": 0.2785433070866142, - "grad_norm": 0.25700461864471436, - "learning_rate": 1.64094343001812e-05, - "loss": 0.0075, + "epoch": 0.2129420617005267, + "grad_norm": 0.00620373385027051, + "learning_rate": 1.7844563899081642e-05, + "loss": 0.0002, "step": 283 }, { - "epoch": 0.2795275590551181, - "grad_norm": 0.20328575372695923, - "learning_rate": 1.6385668960932143e-05, - "loss": 0.005, + "epoch": 0.21369450714823177, + "grad_norm": 0.0037128764670342207, + "learning_rate": 1.782988160828137e-05, + "loss": 0.0002, "step": 284 }, { - "epoch": 0.28051181102362205, - "grad_norm": 5.554577350616455, - "learning_rate": 1.6361842567083975e-05, - "loss": 0.3292, + "epoch": 0.2144469525959368, + "grad_norm": 0.008649413473904133, + "learning_rate": 1.7815155564789374e-05, + "loss": 0.0002, "step": 285 }, { - "epoch": 0.281496062992126, - "grad_norm": 2.7741620540618896, - "learning_rate": 1.633795534644538e-05, - "loss": 0.5126, + "epoch": 0.21519939804364183, + "grad_norm": 0.0007761880406178534, + "learning_rate": 1.780038585089348e-05, + "loss": 0.0, "step": 286 }, { - "epoch": 0.2824803149606299, - "grad_norm": 0.9424337148666382, - "learning_rate": 1.6314007527406643e-05, - "loss": 0.0196, + "epoch": 0.21595184349134688, + "grad_norm": 8.137001037597656, + "learning_rate": 1.7785572549125566e-05, + "loss": 0.2602, "step": 287 }, { - "epoch": 0.28346456692913385, - "grad_norm": 0.34798404574394226, - "learning_rate": 1.6289999338937427e-05, - "loss": 0.0165, + "epoch": 0.21670428893905191, + "grad_norm": 0.011192454025149345, + "learning_rate": 1.7770715742261057e-05, + "loss": 0.0003, "step": 288 }, { - "epoch": 0.2844488188976378, - "grad_norm": 0.46083176136016846, - "learning_rate": 1.6265931010584603e-05, - "loss": 0.0153, + "epoch": 0.21745673438675697, + "grad_norm": 0.9992365837097168, + "learning_rate": 1.775581551331848e-05, + "loss": 0.013, "step": 289 }, { - "epoch": 0.2854330708661417, - "grad_norm": 0.5354030728340149, - "learning_rate": 1.6241802772470043e-05, - "loss": 0.0088, + "epoch": 0.218209179834462, + "grad_norm": 3.0127367973327637, + "learning_rate": 1.7740871945559022e-05, + "loss": 0.2808, "step": 290 }, { - "epoch": 0.28641732283464566, - "grad_norm": 8.248590165749192e-05, - "learning_rate": 1.6217614855288442e-05, + "epoch": 0.21896162528216703, + "grad_norm": 6.692657188978046e-05, + "learning_rate": 1.772588512248602e-05, "loss": 0.0, "step": 291 }, { - "epoch": 0.2874015748031496, - "grad_norm": 0.002541697584092617, - "learning_rate": 1.619336749030509e-05, - "loss": 0.0001, + "epoch": 0.2197140707298721, + "grad_norm": 9.9885743111372e-05, + "learning_rate": 1.771085512784453e-05, + "loss": 0.0, "step": 292 }, { - "epoch": 0.28838582677165353, - "grad_norm": 1.5113205909729004, - "learning_rate": 1.6169060909353665e-05, - "loss": 0.0184, + "epoch": 0.22046651617757712, + "grad_norm": 0.002337940037250519, + "learning_rate": 1.769578204562084e-05, + "loss": 0.0001, "step": 293 }, { - "epoch": 0.28937007874015747, - "grad_norm": 0.0003574075235519558, - "learning_rate": 1.6144695344834026e-05, - "loss": 0.0001, + "epoch": 0.22121896162528218, + "grad_norm": 4.053681373596191, + "learning_rate": 1.7680665960042016e-05, + "loss": 0.2921, "step": 294 }, { - "epoch": 0.2903543307086614, - "grad_norm": 0.0002322999353054911, - "learning_rate": 1.612027102970999e-05, - "loss": 0.0, + "epoch": 0.2219714070729872, + "grad_norm": 0.004839350003749132, + "learning_rate": 1.7665506955575417e-05, + "loss": 0.0003, "step": 295 }, { - "epoch": 0.29133858267716534, - "grad_norm": 1.984909176826477, - "learning_rate": 1.609578819750708e-05, - "loss": 0.143, + "epoch": 0.22272385252069224, + "grad_norm": 3.5069193840026855, + "learning_rate": 1.765030511692823e-05, + "loss": 0.3916, "step": 296 }, { - "epoch": 0.29232283464566927, - "grad_norm": 0.09265024214982986, - "learning_rate": 1.6071247082310337e-05, + "epoch": 0.2234762979683973, + "grad_norm": 0.10265739262104034, + "learning_rate": 1.7635060529046994e-05, "loss": 0.0018, "step": 297 }, { - "epoch": 0.2933070866141732, - "grad_norm": 0.001633710227906704, - "learning_rate": 1.604664791876204e-05, - "loss": 0.0001, + "epoch": 0.22422874341610233, + "grad_norm": 8.340572357177734, + "learning_rate": 1.7619773277117135e-05, + "loss": 0.2325, "step": 298 }, { - "epoch": 0.29429133858267714, - "grad_norm": 0.10373327136039734, - "learning_rate": 1.6021990942059486e-05, - "loss": 0.0035, + "epoch": 0.22498118886380739, + "grad_norm": 0.004961786326020956, + "learning_rate": 1.760444344656247e-05, + "loss": 0.0003, "step": 299 }, { - "epoch": 0.2952755905511811, - "grad_norm": 1.0027252435684204, - "learning_rate": 1.5997276387952733e-05, - "loss": 0.066, + "epoch": 0.22573363431151242, + "grad_norm": 0.2516036033630371, + "learning_rate": 1.758907112304475e-05, + "loss": 0.0051, "step": 300 }, { - "epoch": 0.296259842519685, - "grad_norm": 2.6707913875579834, - "learning_rate": 1.5972504492742346e-05, - "loss": 0.1572, + "epoch": 0.22648607975921745, + "grad_norm": 0.01247998047620058, + "learning_rate": 1.757365639246317e-05, + "loss": 0.0006, "step": 301 }, { - "epoch": 0.297244094488189, - "grad_norm": 0.008716387674212456, - "learning_rate": 1.594767549327714e-05, - "loss": 0.0005, + "epoch": 0.2272385252069225, + "grad_norm": 0.0020404146052896976, + "learning_rate": 1.7558199340953893e-05, + "loss": 0.0001, "step": 302 }, { - "epoch": 0.29822834645669294, - "grad_norm": 0.006882714107632637, - "learning_rate": 1.592278962695192e-05, - "loss": 0.0003, + "epoch": 0.22799097065462753, + "grad_norm": 0.3927852213382721, + "learning_rate": 1.7542700054889572e-05, + "loss": 0.0126, "step": 303 }, { - "epoch": 0.2992125984251969, - "grad_norm": 0.26246535778045654, - "learning_rate": 1.5897847131705194e-05, - "loss": 0.007, + "epoch": 0.2287434161023326, + "grad_norm": 0.012273562140762806, + "learning_rate": 1.752715862087885e-05, + "loss": 0.0003, "step": 304 }, { - "epoch": 0.3001968503937008, - "grad_norm": 2.4476964473724365, - "learning_rate": 1.5872848246016928e-05, - "loss": 0.2309, + "epoch": 0.22949586155003762, + "grad_norm": 0.0031097414903342724, + "learning_rate": 1.7511575125765902e-05, + "loss": 0.0002, "step": 305 }, { - "epoch": 0.30118110236220474, - "grad_norm": 0.002369433408603072, - "learning_rate": 1.5847793208906228e-05, - "loss": 0.0001, + "epoch": 0.23024830699774265, + "grad_norm": 0.36811593174934387, + "learning_rate": 1.7495949656629933e-05, + "loss": 0.0076, "step": 306 }, { - "epoch": 0.3021653543307087, - "grad_norm": 3.6016793251037598, - "learning_rate": 1.5822682259929086e-05, - "loss": 0.1724, + "epoch": 0.2310007524454477, + "grad_norm": 0.008364018052816391, + "learning_rate": 1.748028230078469e-05, + "loss": 0.0002, "step": 307 }, { - "epoch": 0.3031496062992126, - "grad_norm": 0.0165077093988657, - "learning_rate": 1.5797515639176077e-05, - "loss": 0.0006, + "epoch": 0.23175319789315274, + "grad_norm": 0.47602003812789917, + "learning_rate": 1.7464573145777987e-05, + "loss": 0.1192, "step": 308 }, { - "epoch": 0.30413385826771655, - "grad_norm": 0.3576982319355011, - "learning_rate": 1.577229358727006e-05, - "loss": 0.0064, + "epoch": 0.2325056433408578, + "grad_norm": 0.858063280582428, + "learning_rate": 1.7448822279391204e-05, + "loss": 0.0553, "step": 309 }, { - "epoch": 0.3051181102362205, - "grad_norm": 1.1690768003463745, - "learning_rate": 1.5747016345363885e-05, - "loss": 0.0296, + "epoch": 0.23325808878856283, + "grad_norm": 4.250735759735107, + "learning_rate": 1.7433029789638794e-05, + "loss": 0.0937, "step": 310 }, { - "epoch": 0.3061023622047244, - "grad_norm": 0.409931480884552, - "learning_rate": 1.572168415513809e-05, - "loss": 0.0216, + "epoch": 0.23401053423626786, + "grad_norm": 0.029880870133638382, + "learning_rate": 1.7417195764767816e-05, + "loss": 0.0013, "step": 311 }, { - "epoch": 0.30708661417322836, - "grad_norm": 0.0014893177431076765, - "learning_rate": 1.5696297258798573e-05, - "loss": 0.0001, + "epoch": 0.23476297968397292, + "grad_norm": 0.04074908420443535, + "learning_rate": 1.7401320293257403e-05, + "loss": 0.0021, "step": 312 }, { - "epoch": 0.3080708661417323, - "grad_norm": 5.435436248779297, - "learning_rate": 1.5670855899074292e-05, - "loss": 0.0929, + "epoch": 0.23551542513167795, + "grad_norm": 0.03922910988330841, + "learning_rate": 1.7385403463818308e-05, + "loss": 0.0012, "step": 313 }, { - "epoch": 0.3090551181102362, - "grad_norm": 3.219839572906494, - "learning_rate": 1.5645360319214946e-05, - "loss": 0.3074, + "epoch": 0.236267870579383, + "grad_norm": 0.06185416132211685, + "learning_rate": 1.7369445365392365e-05, + "loss": 0.0031, "step": 314 }, { - "epoch": 0.31003937007874016, - "grad_norm": 0.005359490867704153, - "learning_rate": 1.561981076298863e-05, - "loss": 0.0002, + "epoch": 0.23702031602708803, + "grad_norm": 6.229604721069336, + "learning_rate": 1.7353446087152038e-05, + "loss": 0.0754, "step": 315 }, { - "epoch": 0.3110236220472441, - "grad_norm": 0.0015385436126962304, - "learning_rate": 1.5594207474679533e-05, - "loss": 0.0001, + "epoch": 0.23777276147479307, + "grad_norm": 0.4178987443447113, + "learning_rate": 1.733740571849989e-05, + "loss": 0.1242, "step": 316 }, { - "epoch": 0.31200787401574803, - "grad_norm": 0.8422860503196716, - "learning_rate": 1.5568550699085574e-05, - "loss": 0.0201, + "epoch": 0.23852520692249812, + "grad_norm": 0.2695218324661255, + "learning_rate": 1.732132434906809e-05, + "loss": 0.0052, "step": 317 }, { - "epoch": 0.31299212598425197, - "grad_norm": 0.015530900098383427, - "learning_rate": 1.554284068151608e-05, - "loss": 0.0007, + "epoch": 0.23927765237020315, + "grad_norm": 0.011477818712592125, + "learning_rate": 1.730520206871792e-05, + "loss": 0.0004, "step": 318 }, { - "epoch": 0.3139763779527559, - "grad_norm": 0.004284654278308153, - "learning_rate": 1.5517077667789437e-05, - "loss": 0.0002, + "epoch": 0.2400300978179082, + "grad_norm": 0.01223902590572834, + "learning_rate": 1.728903896753927e-05, + "loss": 0.0006, "step": 319 }, { - "epoch": 0.31496062992125984, - "grad_norm": 0.0007849871763028204, - "learning_rate": 1.549126190423073e-05, - "loss": 0.0001, + "epoch": 0.24078254326561324, + "grad_norm": 0.02941061556339264, + "learning_rate": 1.7272835135850133e-05, + "loss": 0.0012, "step": 320 }, { - "epoch": 0.3159448818897638, - "grad_norm": 0.017790410667657852, - "learning_rate": 1.5465393637669395e-05, - "loss": 0.0006, + "epoch": 0.24153498871331827, + "grad_norm": 0.45937439799308777, + "learning_rate": 1.72565906641961e-05, + "loss": 0.0131, "step": 321 }, { - "epoch": 0.3169291338582677, - "grad_norm": 0.42641007900238037, - "learning_rate": 1.5439473115436872e-05, - "loss": 0.0078, + "epoch": 0.24228743416102333, + "grad_norm": 0.06633768230676651, + "learning_rate": 1.7240305643349854e-05, + "loss": 0.0012, "step": 322 }, { - "epoch": 0.31791338582677164, - "grad_norm": 2.963014602661133, - "learning_rate": 1.5413500585364213e-05, - "loss": 0.2623, + "epoch": 0.24303987960872836, + "grad_norm": 1.1109821796417236, + "learning_rate": 1.7223980164310658e-05, + "loss": 0.0369, "step": 323 }, { - "epoch": 0.3188976377952756, - "grad_norm": 0.03737296536564827, - "learning_rate": 1.5387476295779737e-05, - "loss": 0.0013, + "epoch": 0.24379232505643342, + "grad_norm": 5.301788330078125, + "learning_rate": 1.720761431830386e-05, + "loss": 0.2436, "step": 324 }, { - "epoch": 0.3198818897637795, - "grad_norm": 2.027540683746338, - "learning_rate": 1.5361400495506642e-05, - "loss": 0.086, + "epoch": 0.24454477050413845, + "grad_norm": 3.4349093437194824, + "learning_rate": 1.719120819678038e-05, + "loss": 0.3296, "step": 325 }, { - "epoch": 0.32086614173228345, - "grad_norm": 1.8849432468414307, - "learning_rate": 1.533527343386062e-05, - "loss": 0.2542, + "epoch": 0.24529721595184348, + "grad_norm": 0.13575129210948944, + "learning_rate": 1.7174761891416176e-05, + "loss": 0.0041, "step": 326 }, { - "epoch": 0.3218503937007874, - "grad_norm": 0.003224625950679183, - "learning_rate": 1.5309095360647505e-05, - "loss": 0.0002, + "epoch": 0.24604966139954854, + "grad_norm": 0.7291696667671204, + "learning_rate": 1.7158275494111763e-05, + "loss": 0.1419, "step": 327 }, { - "epoch": 0.3228346456692913, - "grad_norm": 0.9043647050857544, - "learning_rate": 1.5282866526160837e-05, - "loss": 0.0375, + "epoch": 0.24680210684725357, + "grad_norm": 0.011178013868629932, + "learning_rate": 1.7141749096991686e-05, + "loss": 0.0004, "step": 328 }, { - "epoch": 0.32381889763779526, - "grad_norm": 1.6890374422073364, - "learning_rate": 1.5256587181179514e-05, - "loss": 0.0388, + "epoch": 0.24755455229495862, + "grad_norm": 0.17662179470062256, + "learning_rate": 1.7125182792403995e-05, + "loss": 0.0028, "step": 329 }, { - "epoch": 0.3248031496062992, - "grad_norm": 2.1363799571990967, - "learning_rate": 1.5230257576965363e-05, - "loss": 0.1154, + "epoch": 0.24830699774266365, + "grad_norm": 0.8367990851402283, + "learning_rate": 1.7108576672919757e-05, + "loss": 0.0191, "step": 330 }, { - "epoch": 0.3257874015748031, - "grad_norm": 0.1475294679403305, - "learning_rate": 1.5203877965260751e-05, - "loss": 0.0063, + "epoch": 0.24905944319036868, + "grad_norm": 3.8350296020507812, + "learning_rate": 1.7091930831332507e-05, + "loss": 0.1464, "step": 331 }, { - "epoch": 0.32677165354330706, - "grad_norm": 0.8888038396835327, - "learning_rate": 1.5177448598286182e-05, - "loss": 0.0203, + "epoch": 0.24981188863807374, + "grad_norm": 0.8443143963813782, + "learning_rate": 1.7075245360657744e-05, + "loss": 0.0053, "step": 332 }, { - "epoch": 0.327755905511811, - "grad_norm": 1.6374540328979492, - "learning_rate": 1.5150969728737874e-05, - "loss": 0.0316, + "epoch": 0.2505643340857788, + "grad_norm": 0.04546971619129181, + "learning_rate": 1.705852035413242e-05, + "loss": 0.0014, "step": 333 }, { - "epoch": 0.328740157480315, - "grad_norm": 2.2455031871795654, - "learning_rate": 1.5124441609785347e-05, - "loss": 0.3561, + "epoch": 0.2513167795334838, + "grad_norm": 0.025590356439352036, + "learning_rate": 1.7041755905214404e-05, + "loss": 0.0013, "step": 334 }, { - "epoch": 0.3297244094488189, - "grad_norm": 2.401850938796997, - "learning_rate": 1.5097864495069012e-05, - "loss": 0.0581, + "epoch": 0.2520692249811889, + "grad_norm": 0.013746229000389576, + "learning_rate": 1.7024952107581965e-05, + "loss": 0.0004, "step": 335 }, { - "epoch": 0.33070866141732286, - "grad_norm": 2.0310356616973877, - "learning_rate": 1.5071238638697731e-05, - "loss": 0.1915, + "epoch": 0.2528216704288939, + "grad_norm": 0.12485899776220322, + "learning_rate": 1.700810905513325e-05, + "loss": 0.0049, "step": 336 }, { - "epoch": 0.3316929133858268, - "grad_norm": 0.08647636324167252, - "learning_rate": 1.5044564295246395e-05, - "loss": 0.0039, + "epoch": 0.25357411587659895, + "grad_norm": 0.20258508622646332, + "learning_rate": 1.699122684198576e-05, + "loss": 0.1096, "step": 337 }, { - "epoch": 0.33267716535433073, - "grad_norm": 0.4023967385292053, - "learning_rate": 1.5017841719753495e-05, - "loss": 0.0141, + "epoch": 0.254326561324304, + "grad_norm": 0.026910100132226944, + "learning_rate": 1.6974305562475825e-05, + "loss": 0.0011, "step": 338 }, { - "epoch": 0.33366141732283466, - "grad_norm": 0.057239603251218796, - "learning_rate": 1.4991071167718665e-05, - "loss": 0.0025, + "epoch": 0.255079006772009, + "grad_norm": 0.8987321853637695, + "learning_rate": 1.6957345311158066e-05, + "loss": 0.0763, "step": 339 }, { - "epoch": 0.3346456692913386, - "grad_norm": 0.6688069701194763, - "learning_rate": 1.4964252895100265e-05, - "loss": 0.0211, + "epoch": 0.2558314522197141, + "grad_norm": 0.0026313799899071455, + "learning_rate": 1.6940346182804876e-05, + "loss": 0.0001, "step": 340 }, { - "epoch": 0.33562992125984253, - "grad_norm": 1.3943629264831543, - "learning_rate": 1.4937387158312912e-05, - "loss": 0.1129, + "epoch": 0.2565838976674191, + "grad_norm": 6.234673976898193, + "learning_rate": 1.69233082724059e-05, + "loss": 0.1531, "step": 341 }, { - "epoch": 0.33661417322834647, - "grad_norm": 0.43440359830856323, - "learning_rate": 1.491047421422505e-05, - "loss": 0.0203, + "epoch": 0.25733634311512416, + "grad_norm": 0.0024953181855380535, + "learning_rate": 1.6906231675167488e-05, + "loss": 0.0002, "step": 342 }, { - "epoch": 0.3375984251968504, - "grad_norm": 0.031664639711380005, - "learning_rate": 1.488351432015646e-05, - "loss": 0.0015, + "epoch": 0.2580887885628292, + "grad_norm": 0.3546563684940338, + "learning_rate": 1.6889116486512165e-05, + "loss": 0.0036, "step": 343 }, { - "epoch": 0.33858267716535434, - "grad_norm": 0.06361010670661926, - "learning_rate": 1.4856507733875837e-05, - "loss": 0.0029, + "epoch": 0.2588412340105342, + "grad_norm": 0.0036818115040659904, + "learning_rate": 1.6871962802078103e-05, + "loss": 0.0002, "step": 344 }, { - "epoch": 0.3395669291338583, - "grad_norm": 0.14204591512680054, - "learning_rate": 1.4829454713598307e-05, - "loss": 0.0056, + "epoch": 0.2595936794582393, + "grad_norm": 0.7731168270111084, + "learning_rate": 1.6854770717718587e-05, + "loss": 0.0134, "step": 345 }, { - "epoch": 0.3405511811023622, - "grad_norm": 0.04043606296181679, - "learning_rate": 1.4802355517982956e-05, - "loss": 0.0017, + "epoch": 0.26034612490594433, + "grad_norm": 0.006896435748785734, + "learning_rate": 1.683754032950148e-05, + "loss": 0.0004, "step": 346 }, { - "epoch": 0.34153543307086615, - "grad_norm": 2.801612138748169, - "learning_rate": 1.4775210406130358e-05, - "loss": 0.1305, + "epoch": 0.26109857035364936, + "grad_norm": 0.08711463212966919, + "learning_rate": 1.6820271733708676e-05, + "loss": 0.0019, "step": 347 }, { - "epoch": 0.3425196850393701, - "grad_norm": 0.010530326515436172, - "learning_rate": 1.4748019637580116e-05, - "loss": 0.0004, + "epoch": 0.2618510158013544, + "grad_norm": 0.31446611881256104, + "learning_rate": 1.6802965026835575e-05, + "loss": 0.0066, "step": 348 }, { - "epoch": 0.343503937007874, - "grad_norm": 2.6631920337677, - "learning_rate": 1.4720783472308344e-05, - "loss": 0.2141, + "epoch": 0.2626034612490594, + "grad_norm": 0.038149867206811905, + "learning_rate": 1.6785620305590536e-05, + "loss": 0.0012, "step": 349 }, { - "epoch": 0.34448818897637795, - "grad_norm": 0.7517235279083252, - "learning_rate": 1.469350217072522e-05, - "loss": 0.0438, + "epoch": 0.2633559066967645, + "grad_norm": 0.003243704093620181, + "learning_rate": 1.676823766689434e-05, + "loss": 0.0002, "step": 350 }, { - "epoch": 0.3454724409448819, - "grad_norm": 0.12892574071884155, - "learning_rate": 1.4666175993672474e-05, - "loss": 0.0041, + "epoch": 0.26410835214446954, + "grad_norm": 0.0029318176675587893, + "learning_rate": 1.6750817207879655e-05, + "loss": 0.0001, "step": 351 }, { - "epoch": 0.3464566929133858, - "grad_norm": 0.020355578511953354, - "learning_rate": 1.4638805202420896e-05, - "loss": 0.0008, + "epoch": 0.26486079759217457, + "grad_norm": 1.8430068492889404, + "learning_rate": 1.673335902589047e-05, + "loss": 0.0449, "step": 352 }, { - "epoch": 0.34744094488188976, - "grad_norm": 0.010113202035427094, - "learning_rate": 1.4611390058667841e-05, - "loss": 0.0005, + "epoch": 0.2656132430398796, + "grad_norm": 0.00958191603422165, + "learning_rate": 1.6715863218481573e-05, + "loss": 0.0004, "step": 353 }, { - "epoch": 0.3484251968503937, - "grad_norm": 0.005961467046290636, - "learning_rate": 1.4583930824534729e-05, - "loss": 0.0002, + "epoch": 0.26636568848758463, + "grad_norm": 2.806882619857788, + "learning_rate": 1.6698329883418008e-05, + "loss": 0.1586, "step": 354 }, { - "epoch": 0.3494094488188976, - "grad_norm": 0.07392887026071548, - "learning_rate": 1.455642776256454e-05, - "loss": 0.0031, + "epoch": 0.2671181339352897, + "grad_norm": 0.0034162893425673246, + "learning_rate": 1.6680759118674512e-05, + "loss": 0.0002, "step": 355 }, { - "epoch": 0.35039370078740156, - "grad_norm": 2.6939687728881836, - "learning_rate": 1.452888113571929e-05, - "loss": 0.1334, + "epoch": 0.26787057938299474, + "grad_norm": 0.008835663087666035, + "learning_rate": 1.6663151022434978e-05, + "loss": 0.0003, "step": 356 }, { - "epoch": 0.3513779527559055, - "grad_norm": 0.07764803618192673, - "learning_rate": 1.4501291207377537e-05, - "loss": 0.0029, + "epoch": 0.2686230248306998, + "grad_norm": 0.0020498326048254967, + "learning_rate": 1.6645505693091897e-05, + "loss": 0.0001, "step": 357 }, { - "epoch": 0.35236220472440943, - "grad_norm": 0.013460423797369003, - "learning_rate": 1.447365824133185e-05, - "loss": 0.0006, + "epoch": 0.2693754702784048, + "grad_norm": 0.0011977710528299212, + "learning_rate": 1.662782322924583e-05, + "loss": 0.0001, "step": 358 }, { - "epoch": 0.35334645669291337, - "grad_norm": 0.001607846817933023, - "learning_rate": 1.4445982501786285e-05, - "loss": 0.0001, + "epoch": 0.27012791572610984, + "grad_norm": 0.013555724173784256, + "learning_rate": 1.661010372970483e-05, + "loss": 0.0004, "step": 359 }, { - "epoch": 0.3543307086614173, - "grad_norm": 0.016496863216161728, - "learning_rate": 1.4418264253353869e-05, - "loss": 0.0007, + "epoch": 0.2708803611738149, + "grad_norm": 0.0006222277879714966, + "learning_rate": 1.6592347293483908e-05, + "loss": 0.0, "step": 360 }, { - "epoch": 0.35531496062992124, - "grad_norm": 2.189188003540039, - "learning_rate": 1.4390503761054067e-05, - "loss": 0.1916, + "epoch": 0.27163280662151995, + "grad_norm": 0.008171453140676022, + "learning_rate": 1.6574554019804474e-05, + "loss": 0.0003, "step": 361 }, { - "epoch": 0.3562992125984252, - "grad_norm": 0.4954896569252014, - "learning_rate": 1.4362701290310234e-05, - "loss": 0.0182, + "epoch": 0.272385252069225, + "grad_norm": 0.004504789598286152, + "learning_rate": 1.655672400809378e-05, + "loss": 0.0001, "step": 362 }, { - "epoch": 0.3572834645669291, - "grad_norm": 0.006718991789966822, - "learning_rate": 1.4334857106947105e-05, - "loss": 0.0004, + "epoch": 0.27313769751693, + "grad_norm": 0.00137099449057132, + "learning_rate": 1.6538857357984358e-05, + "loss": 0.0001, "step": 363 }, { - "epoch": 0.35826771653543305, - "grad_norm": 2.0623433589935303, - "learning_rate": 1.4306971477188223e-05, - "loss": 0.3435, + "epoch": 0.27389014296463504, + "grad_norm": 0.06337831914424896, + "learning_rate": 1.6520954169313498e-05, + "loss": 0.0016, "step": 364 }, { - "epoch": 0.359251968503937, - "grad_norm": 0.0019281110726296902, - "learning_rate": 1.4279044667653414e-05, - "loss": 0.0001, + "epoch": 0.2746425884123401, + "grad_norm": 0.0002586379705462605, + "learning_rate": 1.6503014542122637e-05, + "loss": 0.0, "step": 365 }, { - "epoch": 0.36023622047244097, - "grad_norm": 0.02298598363995552, - "learning_rate": 1.4251076945356233e-05, - "loss": 0.0012, + "epoch": 0.27539503386004516, + "grad_norm": 0.004504516255110502, + "learning_rate": 1.6485038576656842e-05, + "loss": 0.0001, "step": 366 }, { - "epoch": 0.3612204724409449, - "grad_norm": 0.3729340434074402, - "learning_rate": 1.422306857770141e-05, - "loss": 0.0107, + "epoch": 0.2761474793077502, + "grad_norm": 0.31715670228004456, + "learning_rate": 1.646702637336423e-05, + "loss": 0.0032, "step": 367 }, { - "epoch": 0.36220472440944884, - "grad_norm": 0.28734129667282104, - "learning_rate": 1.419501983248229e-05, - "loss": 0.0119, + "epoch": 0.2768999247554552, + "grad_norm": 0.0008062532288022339, + "learning_rate": 1.6448978032895417e-05, + "loss": 0.0, "step": 368 }, { - "epoch": 0.3631889763779528, - "grad_norm": 3.690598964691162, - "learning_rate": 1.4166930977878277e-05, - "loss": 0.8193, + "epoch": 0.27765237020316025, + "grad_norm": 0.0017397012561559677, + "learning_rate": 1.6430893656102942e-05, + "loss": 0.0001, "step": 369 }, { - "epoch": 0.3641732283464567, - "grad_norm": 0.6311355829238892, - "learning_rate": 1.4138802282452269e-05, - "loss": 0.0285, + "epoch": 0.27840481565086533, + "grad_norm": 0.3345002830028534, + "learning_rate": 1.6412773344040717e-05, + "loss": 0.0042, "step": 370 }, { - "epoch": 0.36515748031496065, - "grad_norm": 0.9034423232078552, - "learning_rate": 1.4110634015148083e-05, - "loss": 0.031, + "epoch": 0.27915726109857036, + "grad_norm": 9.196252358378842e-05, + "learning_rate": 1.6394617197963462e-05, + "loss": 0.0, "step": 371 }, { - "epoch": 0.3661417322834646, - "grad_norm": 0.041711922734975815, - "learning_rate": 1.4082426445287904e-05, - "loss": 0.0015, + "epoch": 0.2799097065462754, + "grad_norm": 0.2911829352378845, + "learning_rate": 1.6376425319326125e-05, + "loss": 0.1043, "step": 372 }, { - "epoch": 0.3671259842519685, - "grad_norm": 0.04113816097378731, - "learning_rate": 1.4054179842569688e-05, - "loss": 0.0017, + "epoch": 0.2806621519939804, + "grad_norm": 0.0008261505281552672, + "learning_rate": 1.635819780978333e-05, + "loss": 0.0, "step": 373 }, { - "epoch": 0.36811023622047245, - "grad_norm": 1.0451749563217163, - "learning_rate": 1.4025894477064586e-05, - "loss": 0.0211, + "epoch": 0.28141459744168545, + "grad_norm": 0.013018965721130371, + "learning_rate": 1.6339934771188796e-05, + "loss": 0.0003, "step": 374 }, { - "epoch": 0.3690944881889764, - "grad_norm": 0.010880300775170326, - "learning_rate": 1.399757061921438e-05, - "loss": 0.0005, + "epoch": 0.28216704288939054, + "grad_norm": 0.5947379469871521, + "learning_rate": 1.6321636305594784e-05, + "loss": 0.0068, "step": 375 }, { - "epoch": 0.3700787401574803, - "grad_norm": 0.3653640151023865, - "learning_rate": 1.3969208539828873e-05, - "loss": 0.0107, + "epoch": 0.28291948833709557, + "grad_norm": 0.005715230479836464, + "learning_rate": 1.630330251525152e-05, + "loss": 0.0002, "step": 376 }, { - "epoch": 0.37106299212598426, - "grad_norm": 0.030315397307276726, - "learning_rate": 1.3940808510083321e-05, - "loss": 0.0014, + "epoch": 0.2836719337848006, + "grad_norm": 0.07143538445234299, + "learning_rate": 1.6284933502606614e-05, + "loss": 0.0009, "step": 377 }, { - "epoch": 0.3720472440944882, - "grad_norm": 0.013632961548864841, - "learning_rate": 1.3912370801515821e-05, - "loss": 0.0007, + "epoch": 0.28442437923250563, + "grad_norm": 4.414242267608643, + "learning_rate": 1.6266529370304492e-05, + "loss": 0.0671, "step": 378 }, { - "epoch": 0.37303149606299213, - "grad_norm": 5.989406585693359, - "learning_rate": 1.3883895686024738e-05, - "loss": 0.3232, + "epoch": 0.28517682468021066, + "grad_norm": 0.0007347184582613409, + "learning_rate": 1.624809022118584e-05, + "loss": 0.0, "step": 379 }, { - "epoch": 0.37401574803149606, - "grad_norm": 0.951386034488678, - "learning_rate": 1.3855383435866076e-05, - "loss": 0.1446, + "epoch": 0.28592927012791575, + "grad_norm": 0.00025847938377410173, + "learning_rate": 1.6229616158286997e-05, + "loss": 0.0, "step": 380 }, { - "epoch": 0.375, - "grad_norm": 0.06878283619880676, - "learning_rate": 1.3826834323650899e-05, - "loss": 0.0038, + "epoch": 0.2866817155756208, + "grad_norm": 12.279823303222656, + "learning_rate": 1.6211107284839417e-05, + "loss": 0.1936, "step": 381 }, { - "epoch": 0.37598425196850394, - "grad_norm": 0.8978389501571655, - "learning_rate": 1.3798248622342719e-05, - "loss": 0.0538, + "epoch": 0.2874341610233258, + "grad_norm": 0.0016370975645259023, + "learning_rate": 1.6192563704269048e-05, + "loss": 0.0001, "step": 382 }, { - "epoch": 0.37696850393700787, - "grad_norm": 1.9064515829086304, - "learning_rate": 1.3769626605254876e-05, - "loss": 0.5207, + "epoch": 0.28818660647103084, + "grad_norm": 5.603551835520193e-05, + "learning_rate": 1.6173985520195805e-05, + "loss": 0.0, "step": 383 }, { - "epoch": 0.3779527559055118, - "grad_norm": 0.45705941319465637, - "learning_rate": 1.3740968546047935e-05, - "loss": 0.028, + "epoch": 0.28893905191873587, + "grad_norm": 0.017844753339886665, + "learning_rate": 1.6155372836432944e-05, + "loss": 0.0007, "step": 384 }, { - "epoch": 0.37893700787401574, - "grad_norm": 2.8139076232910156, - "learning_rate": 1.3712274718727066e-05, - "loss": 0.1262, + "epoch": 0.28969149736644095, + "grad_norm": 0.0008964858716353774, + "learning_rate": 1.6136725756986514e-05, + "loss": 0.0, "step": 385 }, { - "epoch": 0.3799212598425197, - "grad_norm": 0.01576286181807518, - "learning_rate": 1.3683545397639433e-05, - "loss": 0.0008, + "epoch": 0.290443942814146, + "grad_norm": 0.12251462787389755, + "learning_rate": 1.6118044386054755e-05, + "loss": 0.0021, "step": 386 }, { - "epoch": 0.3809055118110236, - "grad_norm": 0.890476405620575, - "learning_rate": 1.3654780857471548e-05, - "loss": 0.1181, + "epoch": 0.291196388261851, + "grad_norm": 8.942232131958008, + "learning_rate": 1.609932882802753e-05, + "loss": 0.1369, "step": 387 }, { - "epoch": 0.38188976377952755, - "grad_norm": 1.1201790571212769, - "learning_rate": 1.362598137324667e-05, - "loss": 0.17, + "epoch": 0.29194883370955604, + "grad_norm": 0.005296614021062851, + "learning_rate": 1.6080579187485738e-05, + "loss": 0.0001, "step": 388 }, { - "epoch": 0.3828740157480315, - "grad_norm": 0.1687823086977005, - "learning_rate": 1.3597147220322164e-05, - "loss": 0.0084, + "epoch": 0.2927012791572611, + "grad_norm": 0.00021417946845758706, + "learning_rate": 1.6061795569200725e-05, + "loss": 0.0, "step": 389 }, { - "epoch": 0.3838582677165354, - "grad_norm": 0.07194636762142181, - "learning_rate": 1.3568278674386863e-05, - "loss": 0.0038, + "epoch": 0.29345372460496616, + "grad_norm": 0.0008177552372217178, + "learning_rate": 1.60429780781337e-05, + "loss": 0.0, "step": 390 }, { - "epoch": 0.38484251968503935, - "grad_norm": 0.21647506952285767, - "learning_rate": 1.3539376011458442e-05, - "loss": 0.0144, + "epoch": 0.2942061700526712, + "grad_norm": 2.832350015640259, + "learning_rate": 1.6024126819435156e-05, + "loss": 0.0632, "step": 391 }, { - "epoch": 0.3858267716535433, - "grad_norm": 0.14514599740505219, - "learning_rate": 1.3510439507880778e-05, - "loss": 0.0079, + "epoch": 0.2949586155003762, + "grad_norm": 0.007019749376922846, + "learning_rate": 1.6005241898444275e-05, + "loss": 0.0002, "step": 392 }, { - "epoch": 0.3868110236220472, - "grad_norm": 0.3828102946281433, - "learning_rate": 1.34814694403213e-05, - "loss": 0.0206, + "epoch": 0.29571106094808125, + "grad_norm": 0.49707838892936707, + "learning_rate": 1.5986323420688335e-05, + "loss": 0.093, "step": 393 }, { - "epoch": 0.38779527559055116, - "grad_norm": 0.1748681664466858, - "learning_rate": 1.3452466085768348e-05, - "loss": 0.0047, + "epoch": 0.2964635063957863, + "grad_norm": 5.089029788970947, + "learning_rate": 1.5967371491882136e-05, + "loss": 0.0233, "step": 394 }, { - "epoch": 0.3887795275590551, - "grad_norm": 0.2489398717880249, - "learning_rate": 1.3423429721528531e-05, - "loss": 0.011, + "epoch": 0.29721595184349137, + "grad_norm": 0.0013898630859330297, + "learning_rate": 1.5948386217927384e-05, + "loss": 0.0001, "step": 395 }, { - "epoch": 0.38976377952755903, - "grad_norm": 0.07907087355852127, - "learning_rate": 1.3394360625224067e-05, - "loss": 0.0044, + "epoch": 0.2979683972911964, + "grad_norm": 0.0012057372368872166, + "learning_rate": 1.592936770491214e-05, + "loss": 0.0001, "step": 396 }, { - "epoch": 0.390748031496063, - "grad_norm": 0.029289737343788147, - "learning_rate": 1.336525907479013e-05, - "loss": 0.0011, + "epoch": 0.2987208427389014, + "grad_norm": 0.0038578074891120195, + "learning_rate": 1.591031605911017e-05, + "loss": 0.0001, "step": 397 }, { - "epoch": 0.39173228346456695, - "grad_norm": 0.030908431857824326, - "learning_rate": 1.3336125348472193e-05, - "loss": 0.0014, + "epoch": 0.29947328818660646, + "grad_norm": 8.180018630810082e-05, + "learning_rate": 1.5891231386980415e-05, + "loss": 0.0, "step": 398 }, { - "epoch": 0.3927165354330709, - "grad_norm": 0.82310950756073, - "learning_rate": 1.3306959724823379e-05, - "loss": 0.0429, + "epoch": 0.3002257336343115, + "grad_norm": 0.0012178582837805152, + "learning_rate": 1.5872113795166337e-05, + "loss": 0.0, "step": 399 }, { - "epoch": 0.3937007874015748, - "grad_norm": 0.011711200699210167, - "learning_rate": 1.3277762482701769e-05, - "loss": 0.0006, + "epoch": 0.3009781790820166, + "grad_norm": 7.556064520031214e-05, + "learning_rate": 1.585296339049537e-05, + "loss": 0.0, "step": 400 }, { - "epoch": 0.39468503937007876, - "grad_norm": 0.015772564336657524, - "learning_rate": 1.3248533901267771e-05, - "loss": 0.0009, + "epoch": 0.3017306245297216, + "grad_norm": 0.0032481662929058075, + "learning_rate": 1.5833780279978293e-05, + "loss": 0.0001, "step": 401 }, { - "epoch": 0.3956692913385827, - "grad_norm": 1.4480652809143066, - "learning_rate": 1.321927425998143e-05, - "loss": 0.1008, + "epoch": 0.30248306997742663, + "grad_norm": 4.7872754294076e-06, + "learning_rate": 1.5814564570808643e-05, + "loss": 0.0, "step": 402 }, { - "epoch": 0.39665354330708663, - "grad_norm": 0.014079952612519264, - "learning_rate": 1.3189983838599751e-05, - "loss": 0.0007, + "epoch": 0.30323551542513166, + "grad_norm": 0.014581550844013691, + "learning_rate": 1.5795316370362122e-05, + "loss": 0.0003, "step": 403 }, { - "epoch": 0.39763779527559057, - "grad_norm": 0.21890543401241302, - "learning_rate": 1.3160662917174045e-05, - "loss": 0.0073, + "epoch": 0.3039879608728367, + "grad_norm": 0.3884536623954773, + "learning_rate": 1.5776035786195983e-05, + "loss": 0.1174, "step": 404 }, { - "epoch": 0.3986220472440945, - "grad_norm": 0.1788094937801361, - "learning_rate": 1.3131311776047237e-05, - "loss": 0.0079, + "epoch": 0.3047404063205418, + "grad_norm": 0.061029549688100815, + "learning_rate": 1.575672292604844e-05, + "loss": 0.0002, "step": 405 }, { - "epoch": 0.39960629921259844, - "grad_norm": 0.004129558801651001, - "learning_rate": 1.3101930695851186e-05, - "loss": 0.0002, + "epoch": 0.3054928517682468, + "grad_norm": 0.0007864750223234296, + "learning_rate": 1.5737377897838065e-05, + "loss": 0.0, "step": 406 }, { - "epoch": 0.4005905511811024, - "grad_norm": 0.24551954865455627, - "learning_rate": 1.3072519957504e-05, - "loss": 0.0118, + "epoch": 0.30624529721595184, + "grad_norm": 0.0008468715823255479, + "learning_rate": 1.5718000809663173e-05, + "loss": 0.0, "step": 407 }, { - "epoch": 0.4015748031496063, - "grad_norm": 0.22367309033870697, - "learning_rate": 1.3043079842207363e-05, - "loss": 0.0059, + "epoch": 0.30699774266365687, + "grad_norm": 6.305576243903488e-05, + "learning_rate": 1.569859176980124e-05, + "loss": 0.0, "step": 408 }, { - "epoch": 0.40255905511811024, - "grad_norm": 0.06608809530735016, - "learning_rate": 1.3013610631443832e-05, - "loss": 0.0015, + "epoch": 0.3077501881113619, + "grad_norm": 0.6824759840965271, + "learning_rate": 1.5679150886708273e-05, + "loss": 0.1164, "step": 409 }, { - "epoch": 0.4035433070866142, - "grad_norm": 0.0029334845021367073, - "learning_rate": 1.2984112606974155e-05, + "epoch": 0.308502633559067, + "grad_norm": 0.01927897147834301, + "learning_rate": 1.565967826901822e-05, "loss": 0.0002, "step": 410 }, { - "epoch": 0.4045275590551181, - "grad_norm": 1.0782864093780518, - "learning_rate": 1.2954586050834568e-05, - "loss": 0.0263, + "epoch": 0.309255079006772, + "grad_norm": 0.001680307206697762, + "learning_rate": 1.564017402554237e-05, + "loss": 0.0001, "step": 411 }, { - "epoch": 0.40551181102362205, - "grad_norm": 0.4754510223865509, - "learning_rate": 1.2925031245334112e-05, - "loss": 0.1004, + "epoch": 0.31000752445447705, + "grad_norm": 0.036916520446538925, + "learning_rate": 1.5620638265268718e-05, + "loss": 0.0005, "step": 412 }, { - "epoch": 0.406496062992126, - "grad_norm": 0.000790523539762944, - "learning_rate": 1.2895448473051912e-05, - "loss": 0.0001, + "epoch": 0.3107599699021821, + "grad_norm": 4.316399097442627, + "learning_rate": 1.560107109736138e-05, + "loss": 0.4106, "step": 413 }, { - "epoch": 0.4074803149606299, - "grad_norm": 5.1754631996154785, - "learning_rate": 1.2865838016834506e-05, - "loss": 0.0855, + "epoch": 0.3115124153498871, + "grad_norm": 0.0003387642209418118, + "learning_rate": 1.5581472631159977e-05, + "loss": 0.0, "step": 414 }, { - "epoch": 0.40846456692913385, - "grad_norm": 0.015399942174553871, - "learning_rate": 1.2836200159793114e-05, - "loss": 0.0007, + "epoch": 0.3122648607975922, + "grad_norm": 0.5591127872467041, + "learning_rate": 1.5561842976179013e-05, + "loss": 0.1023, "step": 415 }, { - "epoch": 0.4094488188976378, - "grad_norm": 0.004884008783847094, - "learning_rate": 1.2806535185300931e-05, - "loss": 0.0002, + "epoch": 0.3130173062452972, + "grad_norm": 6.071132659912109, + "learning_rate": 1.5542182242107284e-05, + "loss": 0.0244, "step": 416 }, { - "epoch": 0.4104330708661417, - "grad_norm": 2.0491995811462402, - "learning_rate": 1.2776843376990448e-05, - "loss": 0.2533, + "epoch": 0.31376975169300225, + "grad_norm": 0.0023193645756691694, + "learning_rate": 1.5522490538807248e-05, + "loss": 0.0001, "step": 417 }, { - "epoch": 0.41141732283464566, - "grad_norm": 0.07046820968389511, - "learning_rate": 1.2747125018750708e-05, - "loss": 0.0015, + "epoch": 0.3145221971407073, + "grad_norm": 0.0017903498373925686, + "learning_rate": 1.5502767976314413e-05, + "loss": 0.0001, "step": 418 }, { - "epoch": 0.4124015748031496, - "grad_norm": 3.137295722961426, - "learning_rate": 1.2717380394724597e-05, - "loss": 0.2979, + "epoch": 0.3152746425884123, + "grad_norm": 10.95837116241455, + "learning_rate": 1.5483014664836732e-05, + "loss": 0.6184, "step": 419 }, { - "epoch": 0.41338582677165353, - "grad_norm": 0.01203125063329935, - "learning_rate": 1.2687609789306144e-05, - "loss": 0.0005, + "epoch": 0.3160270880361174, + "grad_norm": 0.002409271663054824, + "learning_rate": 1.546323071475397e-05, + "loss": 0.0001, "step": 420 }, { - "epoch": 0.41437007874015747, - "grad_norm": 1.552305817604065, - "learning_rate": 1.2657813487137784e-05, - "loss": 0.1256, + "epoch": 0.31677953348382243, + "grad_norm": 7.29747480363585e-05, + "learning_rate": 1.544341623661711e-05, + "loss": 0.0, "step": 421 }, { - "epoch": 0.4153543307086614, - "grad_norm": 0.14510490000247955, - "learning_rate": 1.2627991773107651e-05, - "loss": 0.004, + "epoch": 0.31753197893152746, + "grad_norm": 2.6204562187194824, + "learning_rate": 1.5423571341147724e-05, + "loss": 0.1352, "step": 422 }, { - "epoch": 0.41633858267716534, - "grad_norm": 3.7488880157470703, - "learning_rate": 1.2598144932346837e-05, - "loss": 0.1464, + "epoch": 0.3182844243792325, + "grad_norm": 0.0007865215302444994, + "learning_rate": 1.5403696139237338e-05, + "loss": 0.0, "step": 423 }, { - "epoch": 0.41732283464566927, - "grad_norm": 0.03448360413312912, - "learning_rate": 1.2568273250226681e-05, - "loss": 0.0013, + "epoch": 0.3190368698269376, + "grad_norm": 3.1011784076690674, + "learning_rate": 1.538379074194684e-05, + "loss": 0.4246, "step": 424 }, { - "epoch": 0.4183070866141732, - "grad_norm": 0.13749021291732788, - "learning_rate": 1.2538377012356038e-05, - "loss": 0.0057, + "epoch": 0.3197893152746426, + "grad_norm": 0.000731561507564038, + "learning_rate": 1.5363855260505848e-05, + "loss": 0.0, "step": 425 }, { - "epoch": 0.41929133858267714, - "grad_norm": 0.03410489112138748, - "learning_rate": 1.2508456504578538e-05, - "loss": 0.0013, + "epoch": 0.32054176072234764, + "grad_norm": 6.430411338806152, + "learning_rate": 1.534388980631208e-05, + "loss": 0.2966, "step": 426 }, { - "epoch": 0.4202755905511811, - "grad_norm": 3.3951494693756104, - "learning_rate": 1.2478512012969864e-05, - "loss": 0.0954, + "epoch": 0.32129420617005267, + "grad_norm": 0.010142396204173565, + "learning_rate": 1.5323894490930743e-05, + "loss": 0.0004, "step": 427 }, { - "epoch": 0.421259842519685, - "grad_norm": 2.10465407371521, - "learning_rate": 1.2448543823835016e-05, - "loss": 0.1298, + "epoch": 0.3220466516177577, + "grad_norm": 0.5542206764221191, + "learning_rate": 1.5303869426093906e-05, + "loss": 0.0149, "step": 428 }, { - "epoch": 0.422244094488189, - "grad_norm": 0.315626323223114, - "learning_rate": 1.241855222370556e-05, - "loss": 0.0125, + "epoch": 0.3227990970654628, + "grad_norm": 0.35877907276153564, + "learning_rate": 1.5283814723699877e-05, + "loss": 0.0831, "step": 429 }, { - "epoch": 0.42322834645669294, - "grad_norm": 1.9896522760391235, - "learning_rate": 1.2388537499336915e-05, - "loss": 0.0819, + "epoch": 0.3235515425131678, + "grad_norm": 0.082036592066288, + "learning_rate": 1.5263730495812568e-05, + "loss": 0.0032, "step": 430 }, { - "epoch": 0.4242125984251969, - "grad_norm": 0.2132893055677414, - "learning_rate": 1.2358499937705579e-05, - "loss": 0.0075, + "epoch": 0.32430398796087284, + "grad_norm": 0.017158055678009987, + "learning_rate": 1.5243616854660894e-05, + "loss": 0.001, "step": 431 }, { - "epoch": 0.4251968503937008, - "grad_norm": 1.2012134790420532, - "learning_rate": 1.2328439826006415e-05, - "loss": 0.0332, + "epoch": 0.32505643340857787, + "grad_norm": 0.00297352927736938, + "learning_rate": 1.522347391263811e-05, + "loss": 0.0001, "step": 432 }, { - "epoch": 0.42618110236220474, - "grad_norm": 0.0016943581867963076, - "learning_rate": 1.2298357451649883e-05, + "epoch": 0.3258088788562829, + "grad_norm": 0.003190380521118641, + "learning_rate": 1.5203301782301212e-05, "loss": 0.0001, "step": 433 }, { - "epoch": 0.4271653543307087, - "grad_norm": 1.2882392406463623, - "learning_rate": 1.2268253102259302e-05, - "loss": 0.0256, + "epoch": 0.326561324303988, + "grad_norm": 0.0062565989792346954, + "learning_rate": 1.5183100576370291e-05, + "loss": 0.0003, "step": 434 }, { - "epoch": 0.4281496062992126, - "grad_norm": 0.6166675090789795, - "learning_rate": 1.2238127065668102e-05, - "loss": 0.0134, + "epoch": 0.327313769751693, + "grad_norm": 2.2504420280456543, + "learning_rate": 1.5162870407727922e-05, + "loss": 0.1951, "step": 435 }, { - "epoch": 0.42913385826771655, - "grad_norm": 0.721755862236023, - "learning_rate": 1.2207979629917061e-05, - "loss": 0.1546, + "epoch": 0.32806621519939805, + "grad_norm": 1.0565892457962036, + "learning_rate": 1.5142611389418505e-05, + "loss": 0.0199, "step": 436 }, { - "epoch": 0.4301181102362205, - "grad_norm": 0.054884858429431915, - "learning_rate": 1.2177811083251572e-05, - "loss": 0.0015, + "epoch": 0.3288186606471031, + "grad_norm": 1.0833739042282104, + "learning_rate": 1.512232363464766e-05, + "loss": 0.0284, "step": 437 }, { - "epoch": 0.4311023622047244, - "grad_norm": 0.028624525293707848, - "learning_rate": 1.2147621714118856e-05, - "loss": 0.0007, + "epoch": 0.3295711060948081, + "grad_norm": 0.32872238755226135, + "learning_rate": 1.5102007256781583e-05, + "loss": 0.0861, "step": 438 }, { - "epoch": 0.43208661417322836, - "grad_norm": 1.138548493385315, - "learning_rate": 1.2117411811165234e-05, - "loss": 0.1214, + "epoch": 0.3303235515425132, + "grad_norm": 0.08302167803049088, + "learning_rate": 1.5081662369346412e-05, + "loss": 0.0027, "step": 439 }, { - "epoch": 0.4330708661417323, - "grad_norm": 0.7347490787506104, - "learning_rate": 1.2087181663233354e-05, - "loss": 0.0185, + "epoch": 0.3310759969902182, + "grad_norm": 0.01665945164859295, + "learning_rate": 1.5061289086027593e-05, + "loss": 0.0009, "step": 440 }, { - "epoch": 0.4340551181102362, - "grad_norm": 0.025441914796829224, - "learning_rate": 1.2056931559359421e-05, - "loss": 0.0012, + "epoch": 0.33182844243792325, + "grad_norm": 0.00258431863039732, + "learning_rate": 1.5040887520669245e-05, + "loss": 0.0001, "step": 441 }, { - "epoch": 0.43503937007874016, - "grad_norm": 1.6668004989624023, - "learning_rate": 1.2026661788770453e-05, - "loss": 0.0453, + "epoch": 0.3325808878856283, + "grad_norm": 0.11993154883384705, + "learning_rate": 1.502045778727353e-05, + "loss": 0.0023, "step": 442 }, { - "epoch": 0.4360236220472441, - "grad_norm": 0.1318824738264084, - "learning_rate": 1.1996372640881502e-05, - "loss": 0.0055, + "epoch": 0.3333333333333333, + "grad_norm": 10.732891082763672, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.0681, "step": 443 }, { - "epoch": 0.43700787401574803, - "grad_norm": 0.0263349711894989, - "learning_rate": 1.1966064405292887e-05, - "loss": 0.001, + "epoch": 0.3340857787810384, + "grad_norm": 0.0034522817004472017, + "learning_rate": 1.497951427316498e-05, + "loss": 0.0002, "step": 444 }, { - "epoch": 0.43799212598425197, - "grad_norm": 0.013901818543672562, - "learning_rate": 1.193573737178743e-05, - "loss": 0.0005, + "epoch": 0.33483822422874343, + "grad_norm": 0.0015809908509254456, + "learning_rate": 1.495900072124092e-05, + "loss": 0.0001, "step": 445 }, { - "epoch": 0.4389763779527559, - "grad_norm": 0.8777344226837158, - "learning_rate": 1.1905391830327685e-05, - "loss": 0.1446, + "epoch": 0.33559066967644846, + "grad_norm": 0.006714050658047199, + "learning_rate": 1.4938459458855739e-05, + "loss": 0.0004, "step": 446 }, { - "epoch": 0.43996062992125984, - "grad_norm": 0.6416303515434265, - "learning_rate": 1.1875028071053165e-05, - "loss": 0.1305, + "epoch": 0.3363431151241535, + "grad_norm": 0.010480109602212906, + "learning_rate": 1.4917890600792215e-05, + "loss": 0.0006, "step": 447 }, { - "epoch": 0.4409448818897638, - "grad_norm": 0.16758762300014496, - "learning_rate": 1.184464638427756e-05, - "loss": 0.006, + "epoch": 0.3370955605718585, + "grad_norm": 0.06001771241426468, + "learning_rate": 1.4897294261987325e-05, + "loss": 0.0021, "step": 448 }, { - "epoch": 0.4419291338582677, - "grad_norm": 9.109311213251203e-05, - "learning_rate": 1.1814247060485977e-05, - "loss": 0.0, + "epoch": 0.3378480060195636, + "grad_norm": 0.0026993490755558014, + "learning_rate": 1.4876670557531598e-05, + "loss": 0.0001, "step": 449 }, { - "epoch": 0.44291338582677164, - "grad_norm": 8.31602155813016e-05, - "learning_rate": 1.178383039033215e-05, - "loss": 0.0, + "epoch": 0.33860045146726864, + "grad_norm": 1.938813328742981, + "learning_rate": 1.485601960266849e-05, + "loss": 0.0378, "step": 450 }, { - "epoch": 0.4438976377952756, - "grad_norm": 0.04009177163243294, - "learning_rate": 1.1753396664635658e-05, - "loss": 0.0012, + "epoch": 0.33935289691497367, + "grad_norm": 0.004080132115632296, + "learning_rate": 1.4835341512793727e-05, + "loss": 0.0002, "step": 451 }, { - "epoch": 0.4448818897637795, - "grad_norm": 0.0029926470015197992, - "learning_rate": 1.1722946174379168e-05, - "loss": 0.0001, + "epoch": 0.3401053423626787, + "grad_norm": 3.724914073944092, + "learning_rate": 1.4814636403454656e-05, + "loss": 0.1663, "step": 452 }, { - "epoch": 0.44586614173228345, - "grad_norm": 0.6937942504882812, - "learning_rate": 1.1692479210705623e-05, - "loss": 0.0421, + "epoch": 0.34085778781038373, + "grad_norm": 0.6385138034820557, + "learning_rate": 1.4793904390349618e-05, + "loss": 0.1188, "step": 453 }, { - "epoch": 0.4468503937007874, - "grad_norm": 0.0031900855246931314, - "learning_rate": 1.166199606491547e-05, - "loss": 0.0002, + "epoch": 0.3416102332580888, + "grad_norm": 4.093441963195801, + "learning_rate": 1.477314558932728e-05, + "loss": 0.0387, "step": 454 }, { - "epoch": 0.4478346456692913, - "grad_norm": 0.003432472702115774, - "learning_rate": 1.1631497028463887e-05, - "loss": 0.0001, + "epoch": 0.34236267870579384, + "grad_norm": 0.028323644772171974, + "learning_rate": 1.4752360116386002e-05, + "loss": 0.0012, "step": 455 }, { - "epoch": 0.44881889763779526, - "grad_norm": 1.6299903392791748, - "learning_rate": 1.1600982392957978e-05, - "loss": 0.1117, + "epoch": 0.3431151241534989, + "grad_norm": 0.14701040089130402, + "learning_rate": 1.4731548087673186e-05, + "loss": 0.0046, "step": 456 }, { - "epoch": 0.4498031496062992, - "grad_norm": 0.0015075228875502944, - "learning_rate": 1.1570452450153992e-05, - "loss": 0.0001, + "epoch": 0.3438675696012039, + "grad_norm": 0.22901006042957306, + "learning_rate": 1.4710709619484623e-05, + "loss": 0.0061, "step": 457 }, { - "epoch": 0.4507874015748031, - "grad_norm": 0.004224324133247137, - "learning_rate": 1.1539907491954539e-05, - "loss": 0.0001, + "epoch": 0.34462001504890893, + "grad_norm": 0.007301859557628632, + "learning_rate": 1.4689844828263846e-05, + "loss": 0.0003, "step": 458 }, { - "epoch": 0.45177165354330706, - "grad_norm": 0.0018088892102241516, - "learning_rate": 1.150934781040579e-05, - "loss": 0.0001, + "epoch": 0.345372460496614, + "grad_norm": 0.005133870989084244, + "learning_rate": 1.4668953830601473e-05, + "loss": 0.0002, "step": 459 }, { - "epoch": 0.452755905511811, - "grad_norm": 0.0003201038343831897, - "learning_rate": 1.1478773697694691e-05, - "loss": 0.0, + "epoch": 0.34612490594431905, + "grad_norm": 0.003975029569119215, + "learning_rate": 1.4648036743234573e-05, + "loss": 0.0002, "step": 460 }, { - "epoch": 0.453740157480315, - "grad_norm": 0.0004969264264218509, - "learning_rate": 1.1448185446146166e-05, - "loss": 0.0, + "epoch": 0.3468773513920241, + "grad_norm": 0.012133643962442875, + "learning_rate": 1.4627093683045997e-05, + "loss": 0.0006, "step": 461 }, { - "epoch": 0.4547244094488189, - "grad_norm": 1.309754729270935, - "learning_rate": 1.1417583348220322e-05, - "loss": 0.1656, + "epoch": 0.3476297968397291, + "grad_norm": 0.00555196451023221, + "learning_rate": 1.4606124767063721e-05, + "loss": 0.0003, "step": 462 }, { - "epoch": 0.45570866141732286, - "grad_norm": 0.4610322415828705, - "learning_rate": 1.1386967696509657e-05, - "loss": 0.01, + "epoch": 0.34838224228743414, + "grad_norm": 0.001998197054490447, + "learning_rate": 1.4585130112460214e-05, + "loss": 0.0001, "step": 463 }, { - "epoch": 0.4566929133858268, - "grad_norm": 0.15401987731456757, - "learning_rate": 1.1356338783736256e-05, - "loss": 0.0044, + "epoch": 0.3491346877351392, + "grad_norm": 0.4198460578918457, + "learning_rate": 1.4564109836551763e-05, + "loss": 0.0066, "step": 464 }, { - "epoch": 0.45767716535433073, - "grad_norm": 4.4690903450828046e-05, - "learning_rate": 1.1325696902749002e-05, - "loss": 0.0, + "epoch": 0.34988713318284426, + "grad_norm": 0.5744123458862305, + "learning_rate": 1.4543064056797826e-05, + "loss": 0.0147, "step": 465 }, { - "epoch": 0.45866141732283466, - "grad_norm": 0.1083870604634285, - "learning_rate": 1.1295042346520755e-05, - "loss": 0.003, + "epoch": 0.3506395786305493, + "grad_norm": 0.0055709234438836575, + "learning_rate": 1.4521992890800379e-05, + "loss": 0.0002, "step": 466 }, { - "epoch": 0.4596456692913386, - "grad_norm": 0.00825383048504591, - "learning_rate": 1.1264375408145582e-05, - "loss": 0.0003, + "epoch": 0.3513920240782543, + "grad_norm": 0.0019197987858206034, + "learning_rate": 1.4500896456303241e-05, + "loss": 0.0001, "step": 467 }, { - "epoch": 0.46062992125984253, - "grad_norm": 0.001960827736184001, - "learning_rate": 1.123369638083593e-05, - "loss": 0.0001, + "epoch": 0.35214446952595935, + "grad_norm": 0.8345107436180115, + "learning_rate": 1.4479774871191447e-05, + "loss": 0.1345, "step": 468 }, { - "epoch": 0.46161417322834647, - "grad_norm": 2.9676544666290283, - "learning_rate": 1.1203005557919833e-05, - "loss": 0.1622, + "epoch": 0.35289691497366443, + "grad_norm": 0.6766433715820312, + "learning_rate": 1.4458628253490555e-05, + "loss": 0.179, "step": 469 }, { - "epoch": 0.4625984251968504, - "grad_norm": 5.880873680114746, - "learning_rate": 1.11723032328381e-05, - "loss": 0.0674, + "epoch": 0.35364936042136946, + "grad_norm": 0.014497430063784122, + "learning_rate": 1.4437456721366013e-05, + "loss": 0.0007, "step": 470 }, { - "epoch": 0.46358267716535434, - "grad_norm": 5.849965572357178, - "learning_rate": 1.1141589699141517e-05, - "loss": 0.285, + "epoch": 0.3544018058690745, + "grad_norm": 0.11778023838996887, + "learning_rate": 1.4416260393122487e-05, + "loss": 0.0046, "step": 471 }, { - "epoch": 0.4645669291338583, - "grad_norm": 0.08785033971071243, - "learning_rate": 1.1110865250488047e-05, - "loss": 0.0023, + "epoch": 0.3551542513167795, + "grad_norm": 0.47652357816696167, + "learning_rate": 1.4395039387203197e-05, + "loss": 0.0069, "step": 472 }, { - "epoch": 0.4655511811023622, - "grad_norm": 1.9894014596939087, - "learning_rate": 1.108013018064e-05, - "loss": 0.0167, + "epoch": 0.35590669676448455, + "grad_norm": 0.3849281668663025, + "learning_rate": 1.4373793822189266e-05, + "loss": 0.11, "step": 473 }, { - "epoch": 0.46653543307086615, - "grad_norm": 0.03137003630399704, - "learning_rate": 1.1049384783461237e-05, - "loss": 0.0008, + "epoch": 0.35665914221218964, + "grad_norm": 2.7059948444366455, + "learning_rate": 1.4352523816799046e-05, + "loss": 0.1874, "step": 474 }, { - "epoch": 0.4675196850393701, - "grad_norm": 2.649750232696533, - "learning_rate": 1.1018629352914374e-05, - "loss": 0.2686, + "epoch": 0.35741158765989467, + "grad_norm": 0.7493427395820618, + "learning_rate": 1.4331229489887463e-05, + "loss": 0.0055, "step": 475 }, { - "epoch": 0.468503937007874, - "grad_norm": 3.7212471961975098, - "learning_rate": 1.0987864183057943e-05, - "loss": 0.3142, + "epoch": 0.3581640331075997, + "grad_norm": 0.002171468921005726, + "learning_rate": 1.430991096044535e-05, + "loss": 0.0001, "step": 476 }, { - "epoch": 0.46948818897637795, - "grad_norm": 0.008369374088943005, - "learning_rate": 1.0957089568043607e-05, - "loss": 0.0004, + "epoch": 0.35891647855530473, + "grad_norm": 12.364990234375, + "learning_rate": 1.4288568347598777e-05, + "loss": 0.0784, "step": 477 }, { - "epoch": 0.4704724409448819, - "grad_norm": 1.2758026123046875, - "learning_rate": 1.092630580211333e-05, - "loss": 0.0821, + "epoch": 0.35966892400300976, + "grad_norm": 0.022090526297688484, + "learning_rate": 1.4267201770608392e-05, + "loss": 0.0008, "step": 478 }, { - "epoch": 0.4714566929133858, - "grad_norm": 0.0016634146450087428, - "learning_rate": 1.0895513179596567e-05, - "loss": 0.0001, + "epoch": 0.36042136945071485, + "grad_norm": 1.5892497301101685, + "learning_rate": 1.4245811348868753e-05, + "loss": 0.0918, "step": 479 }, { - "epoch": 0.47244094488188976, - "grad_norm": 0.06537233293056488, - "learning_rate": 1.0864711994907457e-05, - "loss": 0.0024, + "epoch": 0.3611738148984199, + "grad_norm": 0.005540026817470789, + "learning_rate": 1.4224397201907664e-05, + "loss": 0.0003, "step": 480 }, { - "epoch": 0.4734251968503937, - "grad_norm": 0.0042429231107234955, - "learning_rate": 1.0833902542542008e-05, - "loss": 0.0002, + "epoch": 0.3619262603461249, + "grad_norm": 0.06051875278353691, + "learning_rate": 1.42029594493855e-05, + "loss": 0.0014, "step": 481 }, { - "epoch": 0.4744094488188976, - "grad_norm": 0.18925046920776367, - "learning_rate": 1.080308511707527e-05, - "loss": 0.0096, + "epoch": 0.36267870579382994, + "grad_norm": 0.011512890458106995, + "learning_rate": 1.418149821109454e-05, + "loss": 0.0005, "step": 482 }, { - "epoch": 0.47539370078740156, - "grad_norm": 0.0011809567222371697, - "learning_rate": 1.077226001315852e-05, + "epoch": 0.36343115124153497, + "grad_norm": 0.002135386224836111, + "learning_rate": 1.4160013606958303e-05, "loss": 0.0001, "step": 483 }, { - "epoch": 0.4763779527559055, - "grad_norm": 0.7972561717033386, - "learning_rate": 1.0741427525516463e-05, - "loss": 0.0205, + "epoch": 0.36418359668924005, + "grad_norm": 0.018821068108081818, + "learning_rate": 1.4138505757030869e-05, + "loss": 0.0004, "step": 484 }, { - "epoch": 0.47736220472440943, - "grad_norm": 0.0010766938794404268, - "learning_rate": 1.0710587948944395e-05, - "loss": 0.0001, + "epoch": 0.3649360421369451, + "grad_norm": 0.0008346544927917421, + "learning_rate": 1.411697478149622e-05, + "loss": 0.0, "step": 485 }, { - "epoch": 0.47834645669291337, - "grad_norm": 0.16127875447273254, - "learning_rate": 1.067974157830539e-05, - "loss": 0.0032, + "epoch": 0.3656884875846501, + "grad_norm": 0.18350642919540405, + "learning_rate": 1.409542080066756e-05, + "loss": 0.004, "step": 486 }, { - "epoch": 0.4793307086614173, - "grad_norm": 0.0004286077746655792, - "learning_rate": 1.0648888708527481e-05, - "loss": 0.0001, + "epoch": 0.36644093303235514, + "grad_norm": 2.6405723094940186, + "learning_rate": 1.4073843934986644e-05, + "loss": 0.3307, "step": 487 }, { - "epoch": 0.48031496062992124, - "grad_norm": 5.30352258682251, - "learning_rate": 1.0618029634600843e-05, - "loss": 0.3593, + "epoch": 0.3671933784800602, + "grad_norm": 0.0034803429152816534, + "learning_rate": 1.4052244305023101e-05, + "loss": 0.0001, "step": 488 }, { - "epoch": 0.4812992125984252, - "grad_norm": 0.034324873238801956, - "learning_rate": 1.0587164651574967e-05, - "loss": 0.0008, + "epoch": 0.36794582392776526, + "grad_norm": 0.006721539422869682, + "learning_rate": 1.403062203147377e-05, + "loss": 0.0003, "step": 489 }, { - "epoch": 0.4822834645669291, - "grad_norm": 0.00026891447487287223, - "learning_rate": 1.0556294054555847e-05, - "loss": 0.0, + "epoch": 0.3686982693754703, + "grad_norm": 0.6599304676055908, + "learning_rate": 1.4008977235162024e-05, + "loss": 0.0107, "step": 490 }, { - "epoch": 0.48326771653543305, - "grad_norm": 0.01480196975171566, - "learning_rate": 1.052541813870315e-05, - "loss": 0.0006, + "epoch": 0.3694507148231753, + "grad_norm": 0.05510720983147621, + "learning_rate": 1.3987310037037081e-05, + "loss": 0.0022, "step": 491 }, { - "epoch": 0.484251968503937, - "grad_norm": 0.40251630544662476, - "learning_rate": 1.0494537199227393e-05, - "loss": 0.0208, + "epoch": 0.37020316027088035, + "grad_norm": 0.0020602026488631964, + "learning_rate": 1.3965620558173345e-05, + "loss": 0.0001, "step": 492 }, { - "epoch": 0.48523622047244097, - "grad_norm": 0.13406778872013092, - "learning_rate": 1.046365153138713e-05, - "loss": 0.0036, + "epoch": 0.3709556057185854, + "grad_norm": 3.3849992752075195, + "learning_rate": 1.3943908919769724e-05, + "loss": 0.0947, "step": 493 }, { - "epoch": 0.4862204724409449, - "grad_norm": 1.7141681909561157, - "learning_rate": 1.043276143048613e-05, - "loss": 0.1129, + "epoch": 0.37170805116629047, + "grad_norm": 0.0011134468950331211, + "learning_rate": 1.3922175243148948e-05, + "loss": 0.0001, "step": 494 }, { - "epoch": 0.48720472440944884, - "grad_norm": 0.06691700220108032, - "learning_rate": 1.0401867191870534e-05, - "loss": 0.001, + "epoch": 0.3724604966139955, + "grad_norm": 0.4826012849807739, + "learning_rate": 1.3900419649756895e-05, + "loss": 0.0075, "step": 495 }, { - "epoch": 0.4881889763779528, - "grad_norm": 0.07686873525381088, - "learning_rate": 1.0370969110926052e-05, - "loss": 0.0022, + "epoch": 0.3732129420617005, + "grad_norm": 1.0771576166152954, + "learning_rate": 1.3878642261161916e-05, + "loss": 0.015, "step": 496 }, { - "epoch": 0.4891732283464567, - "grad_norm": 0.11128439009189606, - "learning_rate": 1.0340067483075135e-05, - "loss": 0.0027, + "epoch": 0.37396538750940556, + "grad_norm": 0.14827294647693634, + "learning_rate": 1.3856843199054144e-05, + "loss": 0.0036, "step": 497 }, { - "epoch": 0.49015748031496065, - "grad_norm": 0.6419368386268616, - "learning_rate": 1.0309162603774137e-05, - "loss": 0.0455, + "epoch": 0.3747178329571106, + "grad_norm": 0.0043605901300907135, + "learning_rate": 1.3835022585244829e-05, + "loss": 0.0002, "step": 498 }, { - "epoch": 0.4911417322834646, - "grad_norm": 0.2135791927576065, - "learning_rate": 1.0278254768510505e-05, - "loss": 0.0075, + "epoch": 0.37547027840481567, + "grad_norm": 0.05336622893810272, + "learning_rate": 1.3813180541665646e-05, + "loss": 0.0025, "step": 499 }, { - "epoch": 0.4921259842519685, - "grad_norm": 2.2798004150390625, - "learning_rate": 1.024734427279995e-05, - "loss": 0.0681, + "epoch": 0.3762227238525207, + "grad_norm": 0.00369238737039268, + "learning_rate": 1.3791317190368018e-05, + "loss": 0.0002, "step": 500 }, { - "epoch": 0.49311023622047245, - "grad_norm": 0.02419065684080124, - "learning_rate": 1.0216431412183619e-05, - "loss": 0.0008, + "epoch": 0.37697516930022573, + "grad_norm": 1.3321623802185059, + "learning_rate": 1.3769432653522436e-05, + "loss": 0.0112, "step": 501 }, { - "epoch": 0.4940944881889764, - "grad_norm": 0.040688831359148026, - "learning_rate": 1.0185516482225264e-05, - "loss": 0.0013, + "epoch": 0.37772761474793076, + "grad_norm": 0.0012655869359150529, + "learning_rate": 1.3747527053417776e-05, + "loss": 0.0001, "step": 502 }, { - "epoch": 0.4950787401574803, - "grad_norm": 1.6208440065383911, - "learning_rate": 1.0154599778508434e-05, - "loss": 0.231, + "epoch": 0.3784800601956358, + "grad_norm": 0.0007887445390224457, + "learning_rate": 1.3725600512460606e-05, + "loss": 0.0, "step": 503 }, { - "epoch": 0.49606299212598426, - "grad_norm": 0.25073355436325073, - "learning_rate": 1.012368159663363e-05, - "loss": 0.0118, + "epoch": 0.3792325056433409, + "grad_norm": 0.1122029721736908, + "learning_rate": 1.3703653153174513e-05, + "loss": 0.0007, "step": 504 }, { - "epoch": 0.4970472440944882, - "grad_norm": 4.3933820724487305, - "learning_rate": 1.0092762232215486e-05, - "loss": 0.4789, + "epoch": 0.3799849510910459, + "grad_norm": 2.8246965408325195, + "learning_rate": 1.3681685098199418e-05, + "loss": 0.2841, "step": 505 }, { - "epoch": 0.49803149606299213, - "grad_norm": 0.9459077715873718, - "learning_rate": 1.0061841980879941e-05, - "loss": 0.0208, + "epoch": 0.38073739653875094, + "grad_norm": 0.006240316201001406, + "learning_rate": 1.3659696470290888e-05, + "loss": 0.0002, "step": 506 }, { - "epoch": 0.49901574803149606, - "grad_norm": 0.534449577331543, - "learning_rate": 1.0030921138261422e-05, - "loss": 0.096, + "epoch": 0.38148984198645597, + "grad_norm": 0.15288379788398743, + "learning_rate": 1.3637687392319443e-05, + "loss": 0.0032, "step": 507 }, { - "epoch": 0.5, - "grad_norm": 5.268911361694336, - "learning_rate": 1e-05, - "loss": 0.0997, + "epoch": 0.382242287434161, + "grad_norm": 0.005582885816693306, + "learning_rate": 1.3615657987269882e-05, + "loss": 0.0002, "step": 508 }, { - "epoch": 0.5009842519685039, - "grad_norm": 0.0008193289977498353, - "learning_rate": 9.96907886173858e-06, - "loss": 0.0001, + "epoch": 0.3829947328818661, + "grad_norm": 0.00010864822252187878, + "learning_rate": 1.3593608378240587e-05, + "loss": 0.0, "step": 509 }, { - "epoch": 0.5019685039370079, - "grad_norm": 0.2905411124229431, - "learning_rate": 9.93815801912006e-06, - "loss": 0.0074, + "epoch": 0.3837471783295711, + "grad_norm": 5.737289905548096, + "learning_rate": 1.3571538688442843e-05, + "loss": 0.1844, "step": 510 }, { - "epoch": 0.5029527559055118, - "grad_norm": 0.0002990937209688127, - "learning_rate": 9.907237767784518e-06, - "loss": 0.0, + "epoch": 0.38449962377727614, + "grad_norm": 0.002173966495320201, + "learning_rate": 1.3549449041200138e-05, + "loss": 0.0001, "step": 511 }, { - "epoch": 0.5039370078740157, - "grad_norm": 2.6059625148773193, - "learning_rate": 9.876318403366371e-06, - "loss": 0.0668, + "epoch": 0.3852520692249812, + "grad_norm": 0.8293859958648682, + "learning_rate": 1.3527339559947483e-05, + "loss": 0.1965, "step": 512 }, { - "epoch": 0.5049212598425197, - "grad_norm": 3.3259501457214355, - "learning_rate": 9.84540022149157e-06, - "loss": 0.079, + "epoch": 0.3860045146726862, + "grad_norm": 0.0026998259127140045, + "learning_rate": 1.3505210368230723e-05, + "loss": 0.0001, "step": 513 }, { - "epoch": 0.5059055118110236, - "grad_norm": 0.002007074421271682, - "learning_rate": 9.814483517774738e-06, - "loss": 0.0001, + "epoch": 0.3867569601203913, + "grad_norm": 0.00044145798892714083, + "learning_rate": 1.3483061589705839e-05, + "loss": 0.0, "step": 514 }, { - "epoch": 0.5068897637795275, - "grad_norm": 0.0018503802129998803, - "learning_rate": 9.783568587816384e-06, + "epoch": 0.3875094055680963, + "grad_norm": 0.004763697739690542, + "learning_rate": 1.3460893348138262e-05, "loss": 0.0001, "step": 515 }, { - "epoch": 0.5078740157480315, - "grad_norm": 0.3180418312549591, - "learning_rate": 9.752655727200051e-06, - "loss": 0.0079, + "epoch": 0.38826185101580135, + "grad_norm": 9.023599704960361e-05, + "learning_rate": 1.3438705767402185e-05, + "loss": 0.0, "step": 516 }, { - "epoch": 0.5088582677165354, - "grad_norm": 0.014291474595665932, - "learning_rate": 9.721745231489499e-06, - "loss": 0.0004, + "epoch": 0.3890142964635064, + "grad_norm": 0.0630866289138794, + "learning_rate": 1.341649897147986e-05, + "loss": 0.0006, "step": 517 }, { - "epoch": 0.5098425196850394, - "grad_norm": 0.2414456307888031, - "learning_rate": 9.690837396225867e-06, - "loss": 0.0066, + "epoch": 0.3897667419112114, + "grad_norm": 0.0039035833906382322, + "learning_rate": 1.3394273084460916e-05, + "loss": 0.0002, "step": 518 }, { - "epoch": 0.5108267716535433, - "grad_norm": 0.000964588369242847, - "learning_rate": 9.659932516924866e-06, - "loss": 0.0001, + "epoch": 0.3905191873589165, + "grad_norm": 3.1299986839294434, + "learning_rate": 1.3372028230541658e-05, + "loss": 0.4068, "step": 519 }, { - "epoch": 0.5118110236220472, - "grad_norm": 0.007653846405446529, - "learning_rate": 9.62903088907395e-06, - "loss": 0.0004, + "epoch": 0.3912716328066215, + "grad_norm": 0.0035276322159916162, + "learning_rate": 1.3349764534024385e-05, + "loss": 0.0002, "step": 520 }, { - "epoch": 0.5127952755905512, - "grad_norm": 0.12027694284915924, - "learning_rate": 9.598132808129469e-06, - "loss": 0.0041, + "epoch": 0.39202407825432656, + "grad_norm": 0.5055978894233704, + "learning_rate": 1.3327482119316674e-05, + "loss": 0.0223, "step": 521 }, { - "epoch": 0.5137795275590551, - "grad_norm": 0.0007745010079815984, - "learning_rate": 9.567238569513872e-06, - "loss": 0.0001, + "epoch": 0.3927765237020316, + "grad_norm": 0.0033338728826493025, + "learning_rate": 1.330518111093071e-05, + "loss": 0.0002, "step": 522 }, { - "epoch": 0.514763779527559, - "grad_norm": 0.09125042706727982, - "learning_rate": 9.536348468612872e-06, - "loss": 0.0036, + "epoch": 0.3935289691497366, + "grad_norm": 0.06779402494430542, + "learning_rate": 1.3282861633482566e-05, + "loss": 0.003, "step": 523 }, { - "epoch": 0.515748031496063, - "grad_norm": 0.5272306799888611, - "learning_rate": 9.505462800772612e-06, - "loss": 0.0916, + "epoch": 0.3942814145974417, + "grad_norm": 0.6969091892242432, + "learning_rate": 1.3260523811691527e-05, + "loss": 0.0111, "step": 524 }, { - "epoch": 0.5167322834645669, - "grad_norm": 1.2508244514465332, - "learning_rate": 9.474581861296855e-06, - "loss": 0.1358, + "epoch": 0.39503386004514673, + "grad_norm": 0.3388632535934448, + "learning_rate": 1.3238167770379384e-05, + "loss": 0.1329, "step": 525 }, { - "epoch": 0.5177165354330708, - "grad_norm": 2.3361175060272217, - "learning_rate": 9.443705945444158e-06, - "loss": 0.159, + "epoch": 0.39578630549285176, + "grad_norm": 0.15506567060947418, + "learning_rate": 1.3215793634469733e-05, + "loss": 0.0035, "step": 526 }, { - "epoch": 0.5187007874015748, - "grad_norm": 0.00012743513798341155, - "learning_rate": 9.412835348425038e-06, - "loss": 0.0001, + "epoch": 0.3965387509405568, + "grad_norm": 3.37923002243042, + "learning_rate": 1.3193401528987286e-05, + "loss": 0.3774, "step": 527 }, { - "epoch": 0.5196850393700787, - "grad_norm": 0.003939046990126371, - "learning_rate": 9.381970365399162e-06, - "loss": 0.0003, + "epoch": 0.3972911963882618, + "grad_norm": 0.6373834609985352, + "learning_rate": 1.3170991579057163e-05, + "loss": 0.0922, "step": 528 }, { - "epoch": 0.5206692913385826, - "grad_norm": 0.004564144182950258, - "learning_rate": 9.351111291472522e-06, - "loss": 0.0001, + "epoch": 0.3980436418359669, + "grad_norm": 0.23215818405151367, + "learning_rate": 1.3148563909904195e-05, + "loss": 0.007, "step": 529 }, { - "epoch": 0.5216535433070866, - "grad_norm": 0.46711844205856323, - "learning_rate": 9.320258421694615e-06, - "loss": 0.0287, + "epoch": 0.39879608728367194, + "grad_norm": 0.000551075441762805, + "learning_rate": 1.3126118646852235e-05, + "loss": 0.0, "step": 530 }, { - "epoch": 0.5226377952755905, - "grad_norm": 0.16989010572433472, - "learning_rate": 9.289412051055608e-06, - "loss": 0.0053, + "epoch": 0.39954853273137697, + "grad_norm": 2.5601372718811035, + "learning_rate": 1.3103655915323444e-05, + "loss": 0.0924, "step": 531 }, { - "epoch": 0.5236220472440944, - "grad_norm": 0.005036798305809498, - "learning_rate": 9.25857247448354e-06, - "loss": 0.0003, + "epoch": 0.400300978179082, + "grad_norm": 0.029645578935742378, + "learning_rate": 1.3081175840837595e-05, + "loss": 0.0007, "step": 532 }, { - "epoch": 0.5246062992125984, - "grad_norm": 0.017221862450242043, - "learning_rate": 9.227739986841485e-06, - "loss": 0.0007, + "epoch": 0.40105342362678703, + "grad_norm": 0.0015663238009437919, + "learning_rate": 1.3058678549011371e-05, + "loss": 0.0001, "step": 533 }, { - "epoch": 0.5255905511811023, - "grad_norm": 0.1715071201324463, - "learning_rate": 9.196914882924737e-06, - "loss": 0.007, + "epoch": 0.4018058690744921, + "grad_norm": 7.516592025756836, + "learning_rate": 1.3036164165557667e-05, + "loss": 0.1748, "step": 534 }, { - "epoch": 0.5265748031496063, - "grad_norm": 1.0040740966796875, - "learning_rate": 9.166097457457994e-06, - "loss": 0.0229, + "epoch": 0.40255831452219715, + "grad_norm": 0.02340114861726761, + "learning_rate": 1.3013632816284885e-05, + "loss": 0.001, "step": 535 }, { - "epoch": 0.5275590551181102, - "grad_norm": 0.4975915253162384, - "learning_rate": 9.135288005092546e-06, - "loss": 0.1165, + "epoch": 0.4033107599699022, + "grad_norm": 0.04894443601369858, + "learning_rate": 1.2991084627096226e-05, + "loss": 0.0016, "step": 536 }, { - "epoch": 0.5285433070866141, - "grad_norm": 0.13898475468158722, - "learning_rate": 9.104486820403438e-06, - "loss": 0.0029, + "epoch": 0.4040632054176072, + "grad_norm": 0.053800590336322784, + "learning_rate": 1.2968519723988994e-05, + "loss": 0.0021, "step": 537 }, { - "epoch": 0.5295275590551181, - "grad_norm": 0.8463035821914673, - "learning_rate": 9.073694197886676e-06, - "loss": 0.0257, + "epoch": 0.40481565086531224, + "grad_norm": 0.01409213524311781, + "learning_rate": 1.2945938233053892e-05, + "loss": 0.0005, "step": 538 }, { - "epoch": 0.530511811023622, - "grad_norm": 0.4724947512149811, - "learning_rate": 9.042910431956398e-06, - "loss": 0.0275, + "epoch": 0.4055680963130173, + "grad_norm": 0.4299611747264862, + "learning_rate": 1.2923340280474306e-05, + "loss": 0.0091, "step": 539 }, { - "epoch": 0.531496062992126, - "grad_norm": 0.46944767236709595, - "learning_rate": 9.012135816942058e-06, - "loss": 0.0232, + "epoch": 0.40632054176072235, + "grad_norm": 3.0297110080718994, + "learning_rate": 1.2900725992525618e-05, + "loss": 0.1769, "step": 540 }, { - "epoch": 0.53248031496063, - "grad_norm": 4.7974724769592285, - "learning_rate": 8.981370647085629e-06, - "loss": 0.1758, + "epoch": 0.4070729872084274, + "grad_norm": 0.012766638770699501, + "learning_rate": 1.2878095495574484e-05, + "loss": 0.0006, "step": 541 }, { - "epoch": 0.5334645669291339, - "grad_norm": 0.0008676442666910589, - "learning_rate": 8.950615216538765e-06, - "loss": 0.0001, + "epoch": 0.4078254326561324, + "grad_norm": 0.019243692979216576, + "learning_rate": 1.285544891607813e-05, + "loss": 0.0007, "step": 542 }, { - "epoch": 0.5344488188976378, - "grad_norm": 0.009300711564719677, - "learning_rate": 8.919869819360002e-06, - "loss": 0.0003, + "epoch": 0.40857787810383744, + "grad_norm": 0.4208061099052429, + "learning_rate": 1.2832786380583664e-05, + "loss": 0.0103, "step": 543 }, { - "epoch": 0.5354330708661418, - "grad_norm": 0.5879456996917725, - "learning_rate": 8.889134749511956e-06, - "loss": 0.0102, + "epoch": 0.40933032355154253, + "grad_norm": 0.07195150852203369, + "learning_rate": 1.2810108015727345e-05, + "loss": 0.0022, "step": 544 }, { - "epoch": 0.5364173228346457, - "grad_norm": 0.0015386503655463457, - "learning_rate": 8.858410300858482e-06, - "loss": 0.0001, + "epoch": 0.41008276899924756, + "grad_norm": 1.117520809173584, + "learning_rate": 1.2787413948233885e-05, + "loss": 0.0159, "step": 545 }, { - "epoch": 0.5374015748031497, - "grad_norm": 0.009641151875257492, - "learning_rate": 8.827696767161902e-06, - "loss": 0.0004, + "epoch": 0.4108352144469526, + "grad_norm": 0.15800736844539642, + "learning_rate": 1.2764704304915743e-05, + "loss": 0.0048, "step": 546 }, { - "epoch": 0.5383858267716536, - "grad_norm": 0.6984387636184692, - "learning_rate": 8.796994442080167e-06, - "loss": 0.0141, + "epoch": 0.4115876598946576, + "grad_norm": 0.026513785123825073, + "learning_rate": 1.2741979212672418e-05, + "loss": 0.0011, "step": 547 }, { - "epoch": 0.5393700787401575, - "grad_norm": 0.9815958738327026, - "learning_rate": 8.76630361916407e-06, - "loss": 0.1687, + "epoch": 0.4123401053423627, + "grad_norm": 0.27647465467453003, + "learning_rate": 1.2719238798489725e-05, + "loss": 0.0094, "step": 548 }, { - "epoch": 0.5403543307086615, - "grad_norm": 0.13998663425445557, - "learning_rate": 8.735624591854418e-06, - "loss": 0.0064, + "epoch": 0.41309255079006774, + "grad_norm": 0.003962705843150616, + "learning_rate": 1.2696483189439113e-05, + "loss": 0.0002, "step": 549 }, { - "epoch": 0.5413385826771654, - "grad_norm": 2.1254003047943115, - "learning_rate": 8.704957653479245e-06, - "loss": 0.1258, + "epoch": 0.41384499623777277, + "grad_norm": 0.006578361615538597, + "learning_rate": 1.2673712512676923e-05, + "loss": 0.0002, "step": 550 }, { - "epoch": 0.5423228346456693, - "grad_norm": 0.3112325072288513, - "learning_rate": 8.674303097251003e-06, - "loss": 0.0117, + "epoch": 0.4145974416854778, + "grad_norm": 0.4227232038974762, + "learning_rate": 1.2650926895443705e-05, + "loss": 0.0132, "step": 551 }, { - "epoch": 0.5433070866141733, - "grad_norm": 3.896050214767456, - "learning_rate": 8.643661216263744e-06, - "loss": 0.0911, + "epoch": 0.4153498871331828, + "grad_norm": 0.1907581388950348, + "learning_rate": 1.2628126465063483e-05, + "loss": 0.0062, "step": 552 }, { - "epoch": 0.5442913385826772, - "grad_norm": 7.140151865314692e-05, - "learning_rate": 8.613032303490343e-06, - "loss": 0.0, + "epoch": 0.4161023325808879, + "grad_norm": 0.02955441363155842, + "learning_rate": 1.2605311348943066e-05, + "loss": 0.0006, "step": 553 }, { - "epoch": 0.5452755905511811, - "grad_norm": 0.0019134563626721501, - "learning_rate": 8.58241665177968e-06, - "loss": 0.0001, + "epoch": 0.41685477802859294, + "grad_norm": 0.582580029964447, + "learning_rate": 1.2582481674571325e-05, + "loss": 0.0115, "step": 554 }, { - "epoch": 0.5462598425196851, - "grad_norm": 0.0025728214532136917, - "learning_rate": 8.551814553853836e-06, - "loss": 0.0002, + "epoch": 0.417607223476298, + "grad_norm": 0.0009048896026797593, + "learning_rate": 1.2559637569518472e-05, + "loss": 0.0, "step": 555 }, { - "epoch": 0.547244094488189, - "grad_norm": 0.06524799764156342, - "learning_rate": 8.52122630230531e-06, - "loss": 0.0016, + "epoch": 0.418359668924003, + "grad_norm": 0.03770684078335762, + "learning_rate": 1.2536779161435368e-05, + "loss": 0.0006, "step": 556 }, { - "epoch": 0.5482283464566929, - "grad_norm": 0.0057241772301495075, - "learning_rate": 8.490652189594212e-06, - "loss": 0.0002, + "epoch": 0.41911211437170803, + "grad_norm": 0.25430363416671753, + "learning_rate": 1.251390657805279e-05, + "loss": 0.1052, "step": 557 }, { - "epoch": 0.5492125984251969, - "grad_norm": 0.02337772771716118, - "learning_rate": 8.460092508045465e-06, - "loss": 0.0007, + "epoch": 0.4198645598194131, + "grad_norm": 0.8613071441650391, + "learning_rate": 1.2491019947180727e-05, + "loss": 0.0129, "step": 558 }, { - "epoch": 0.5501968503937008, - "grad_norm": 0.002759635215625167, - "learning_rate": 8.429547549846011e-06, - "loss": 0.0001, + "epoch": 0.42061700526711815, + "grad_norm": 0.00920679047703743, + "learning_rate": 1.2468119396707668e-05, + "loss": 0.0003, "step": 559 }, { - "epoch": 0.5511811023622047, - "grad_norm": 0.0019592391327023506, - "learning_rate": 8.399017607042025e-06, - "loss": 0.0001, + "epoch": 0.4213694507148232, + "grad_norm": 0.49765467643737793, + "learning_rate": 1.2445205054599879e-05, + "loss": 0.0065, "step": 560 }, { - "epoch": 0.5521653543307087, - "grad_norm": 0.003367210039868951, - "learning_rate": 8.368502971536116e-06, - "loss": 0.0002, + "epoch": 0.4221218961625282, + "grad_norm": 0.006361035164445639, + "learning_rate": 1.2422277048900694e-05, + "loss": 0.0003, "step": 561 }, { - "epoch": 0.5531496062992126, - "grad_norm": 5.432099342346191, - "learning_rate": 8.338003935084531e-06, - "loss": 0.1729, + "epoch": 0.42287434161023324, + "grad_norm": 0.3172746002674103, + "learning_rate": 1.23993355077298e-05, + "loss": 0.2152, "step": 562 }, { - "epoch": 0.5541338582677166, - "grad_norm": 0.9918099641799927, - "learning_rate": 8.307520789294382e-06, - "loss": 0.1083, + "epoch": 0.4236267870579383, + "grad_norm": 0.010659880936145782, + "learning_rate": 1.237638055928251e-05, + "loss": 0.0004, "step": 563 }, { - "epoch": 0.5551181102362205, - "grad_norm": 0.006833460181951523, - "learning_rate": 8.277053825620836e-06, - "loss": 0.0003, + "epoch": 0.42437923250564336, + "grad_norm": 0.18647299706935883, + "learning_rate": 1.2353412331829073e-05, + "loss": 0.0062, "step": 564 }, { - "epoch": 0.5561023622047244, - "grad_norm": 1.165911078453064, - "learning_rate": 8.246603335364343e-06, - "loss": 0.0202, + "epoch": 0.4251316779533484, + "grad_norm": 7.033252716064453, + "learning_rate": 1.2330430953713921e-05, + "loss": 0.3394, "step": 565 }, { - "epoch": 0.5570866141732284, - "grad_norm": 0.010280168615281582, - "learning_rate": 8.216169609667854e-06, - "loss": 0.0004, + "epoch": 0.4258841234010534, + "grad_norm": 0.035930879414081573, + "learning_rate": 1.2307436553354985e-05, + "loss": 0.0009, "step": 566 }, { - "epoch": 0.5580708661417323, - "grad_norm": 6.800440311431885, - "learning_rate": 8.185752939514026e-06, - "loss": 0.3401, + "epoch": 0.42663656884875845, + "grad_norm": 0.0031391908414661884, + "learning_rate": 1.2284429259242958e-05, + "loss": 0.0001, "step": 567 }, { - "epoch": 0.5590551181102362, - "grad_norm": 0.03663322702050209, - "learning_rate": 8.155353615722442e-06, - "loss": 0.0006, + "epoch": 0.42738901429646353, + "grad_norm": 9.076556205749512, + "learning_rate": 1.2261409199940574e-05, + "loss": 0.3117, "step": 568 }, { - "epoch": 0.5600393700787402, - "grad_norm": 2.2609219551086426, - "learning_rate": 8.124971928946837e-06, - "loss": 0.0968, + "epoch": 0.42814145974416856, + "grad_norm": 0.3694950044155121, + "learning_rate": 1.2238376504081913e-05, + "loss": 0.0936, "step": 569 }, { - "epoch": 0.5610236220472441, - "grad_norm": 0.006467037834227085, - "learning_rate": 8.094608169672318e-06, - "loss": 0.0002, + "epoch": 0.4288939051918736, + "grad_norm": 0.0012056067353114486, + "learning_rate": 1.221533130037166e-05, + "loss": 0.0, "step": 570 }, { - "epoch": 0.562007874015748, - "grad_norm": 0.0010649190517142415, - "learning_rate": 8.064262628212573e-06, - "loss": 0.0001, + "epoch": 0.4296463506395786, + "grad_norm": 3.5604007244110107, + "learning_rate": 1.2192273717584386e-05, + "loss": 0.0581, "step": 571 }, { - "epoch": 0.562992125984252, - "grad_norm": 7.15801288606599e-05, - "learning_rate": 8.033935594707116e-06, - "loss": 0.0, + "epoch": 0.43039879608728365, + "grad_norm": 0.005317636765539646, + "learning_rate": 1.2169203884563846e-05, + "loss": 0.0002, "step": 572 }, { - "epoch": 0.5639763779527559, - "grad_norm": 3.9041378498077393, - "learning_rate": 8.0036273591185e-06, - "loss": 0.4006, + "epoch": 0.43115124153498874, + "grad_norm": 0.413285493850708, + "learning_rate": 1.2146121930222241e-05, + "loss": 0.0035, "step": 573 }, { - "epoch": 0.5649606299212598, - "grad_norm": 0.0007572037284262478, - "learning_rate": 7.97333821122955e-06, - "loss": 0.0001, + "epoch": 0.43190368698269377, + "grad_norm": 4.266297340393066, + "learning_rate": 1.2123027983539511e-05, + "loss": 0.1756, "step": 574 }, { - "epoch": 0.5659448818897638, - "grad_norm": 0.30688348412513733, - "learning_rate": 7.943068440640582e-06, - "loss": 0.0103, + "epoch": 0.4326561324303988, + "grad_norm": 10.099092483520508, + "learning_rate": 1.2099922173562602e-05, + "loss": 0.0599, "step": 575 }, { - "epoch": 0.5669291338582677, - "grad_norm": 3.413010835647583, - "learning_rate": 7.91281833676665e-06, - "loss": 0.2584, + "epoch": 0.43340857787810383, + "grad_norm": 0.0031585677061229944, + "learning_rate": 1.2076804629404752e-05, + "loss": 0.0001, "step": 576 }, { - "epoch": 0.5679133858267716, - "grad_norm": 0.00011805386748164892, - "learning_rate": 7.88258818883477e-06, - "loss": 0.0, + "epoch": 0.43416102332580886, + "grad_norm": 0.13065661489963531, + "learning_rate": 1.2053675480244777e-05, + "loss": 0.0047, "step": 577 }, { - "epoch": 0.5688976377952756, - "grad_norm": 2.228557586669922, - "learning_rate": 7.852378285881148e-06, - "loss": 0.7601, + "epoch": 0.43491346877351394, + "grad_norm": 8.336974143981934, + "learning_rate": 1.2030534855326326e-05, + "loss": 0.1777, "step": 578 }, { - "epoch": 0.5698818897637795, - "grad_norm": 0.09803301841020584, - "learning_rate": 7.822188916748431e-06, - "loss": 0.0029, + "epoch": 0.435665914221219, + "grad_norm": 0.007145630661398172, + "learning_rate": 1.2007382883957186e-05, + "loss": 0.0003, "step": 579 }, { - "epoch": 0.5708661417322834, - "grad_norm": 0.010421579703688622, - "learning_rate": 7.79202037008294e-06, - "loss": 0.0004, + "epoch": 0.436418359668924, + "grad_norm": 1.3916585445404053, + "learning_rate": 1.1984219695508546e-05, + "loss": 0.0053, "step": 580 }, { - "epoch": 0.5718503937007874, - "grad_norm": 0.0005483022541739047, - "learning_rate": 7.761872934331901e-06, - "loss": 0.0, + "epoch": 0.43717080511662904, + "grad_norm": 19.2365665435791, + "learning_rate": 1.1961045419414264e-05, + "loss": 0.0929, "step": 581 }, { - "epoch": 0.5728346456692913, - "grad_norm": 0.09272383153438568, - "learning_rate": 7.7317468977407e-06, - "loss": 0.0028, + "epoch": 0.43792325056433407, + "grad_norm": 1.4760831618332304e-05, + "learning_rate": 1.1937860185170164e-05, + "loss": 0.0, "step": 582 }, { - "epoch": 0.5738188976377953, - "grad_norm": 0.0010022320784628391, - "learning_rate": 7.701642548350122e-06, + "epoch": 0.43867569601203915, + "grad_norm": 0.003603702411055565, + "learning_rate": 1.1914664122333305e-05, "loss": 0.0001, "step": 583 }, { - "epoch": 0.5748031496062992, - "grad_norm": 1.8828871250152588, - "learning_rate": 7.671560173993588e-06, - "loss": 0.472, + "epoch": 0.4394281414597442, + "grad_norm": 2.7130890885018744e-05, + "learning_rate": 1.1891457360521253e-05, + "loss": 0.0, "step": 584 }, { - "epoch": 0.5757874015748031, - "grad_norm": 0.6172698140144348, - "learning_rate": 7.641500062294423e-06, - "loss": 0.0222, + "epoch": 0.4401805869074492, + "grad_norm": 0.004726966377347708, + "learning_rate": 1.1868240029411351e-05, + "loss": 0.0002, "step": 585 }, { - "epoch": 0.5767716535433071, - "grad_norm": 0.029400793835520744, - "learning_rate": 7.6114625006630885e-06, - "loss": 0.0015, + "epoch": 0.44093303235515424, + "grad_norm": 0.007764340378344059, + "learning_rate": 1.1845012258740016e-05, + "loss": 0.0002, "step": 586 }, { - "epoch": 0.577755905511811, - "grad_norm": 1.355639100074768, - "learning_rate": 7.5814477762944435e-06, - "loss": 0.0422, + "epoch": 0.44168547780285927, + "grad_norm": 0.003988485783338547, + "learning_rate": 1.182177417830199e-05, + "loss": 0.0001, "step": 587 }, { - "epoch": 0.5787401574803149, - "grad_norm": 0.15182897448539734, - "learning_rate": 7.551456176164989e-06, - "loss": 0.004, + "epoch": 0.44243792325056436, + "grad_norm": 0.013868695124983788, + "learning_rate": 1.1798525917949626e-05, + "loss": 0.0004, "step": 588 }, { - "epoch": 0.5797244094488189, - "grad_norm": 0.022366374731063843, - "learning_rate": 7.52148798703014e-06, - "loss": 0.001, + "epoch": 0.4431903686982694, + "grad_norm": 0.4925473630428314, + "learning_rate": 1.177526760759217e-05, + "loss": 0.0091, "step": 589 }, { - "epoch": 0.5807086614173228, - "grad_norm": 0.9933125376701355, - "learning_rate": 7.491543495421468e-06, - "loss": 0.0329, + "epoch": 0.4439428141459744, + "grad_norm": 0.26699769496917725, + "learning_rate": 1.1751999377195014e-05, + "loss": 0.0881, "step": 590 }, { - "epoch": 0.5816929133858267, - "grad_norm": 1.739646077156067, - "learning_rate": 7.4616229876439664e-06, - "loss": 0.444, + "epoch": 0.44469525959367945, + "grad_norm": 2.3798282146453857, + "learning_rate": 1.1728721356778994e-05, + "loss": 0.0538, "step": 591 }, { - "epoch": 0.5826771653543307, - "grad_norm": 0.44527336955070496, - "learning_rate": 7.431726749773322e-06, - "loss": 0.0193, + "epoch": 0.4454477050413845, + "grad_norm": 0.001672322629019618, + "learning_rate": 1.1705433676419644e-05, + "loss": 0.0, "step": 592 }, { - "epoch": 0.5836614173228346, - "grad_norm": 0.003679246176034212, - "learning_rate": 7.401855067653168e-06, + "epoch": 0.44620015048908956, + "grad_norm": 0.005258211866021156, + "learning_rate": 1.168213646624648e-05, "loss": 0.0002, "step": 593 }, { - "epoch": 0.5846456692913385, - "grad_norm": 4.747228622436523, - "learning_rate": 7.372008226892354e-06, - "loss": 0.0726, + "epoch": 0.4469525959367946, + "grad_norm": 0.10926241427659988, + "learning_rate": 1.1658829856442269e-05, + "loss": 0.0013, "step": 594 }, { - "epoch": 0.5856299212598425, - "grad_norm": 0.0062688374891877174, - "learning_rate": 7.342186512862219e-06, - "loss": 0.0004, + "epoch": 0.4477050413844996, + "grad_norm": 0.0057695843279361725, + "learning_rate": 1.1635513977242304e-05, + "loss": 0.0002, "step": 595 }, { - "epoch": 0.5866141732283464, - "grad_norm": 0.7860682010650635, - "learning_rate": 7.312390210693863e-06, - "loss": 0.118, + "epoch": 0.44845748683220465, + "grad_norm": 0.059808410704135895, + "learning_rate": 1.1612188958933673e-05, + "loss": 0.0007, "step": 596 }, { - "epoch": 0.5875984251968503, - "grad_norm": 0.25527361035346985, - "learning_rate": 7.282619605275409e-06, - "loss": 0.0085, + "epoch": 0.4492099322799097, + "grad_norm": 0.0013155878987163305, + "learning_rate": 1.158885493185453e-05, + "loss": 0.0, "step": 597 }, { - "epoch": 0.5885826771653543, - "grad_norm": 1.127985954284668, - "learning_rate": 7.252874981249297e-06, - "loss": 0.0263, + "epoch": 0.44996237772761477, + "grad_norm": 10.205440521240234, + "learning_rate": 1.1565512026393371e-05, + "loss": 0.2459, "step": 598 }, { - "epoch": 0.5895669291338582, - "grad_norm": 0.41452479362487793, - "learning_rate": 7.223156623009554e-06, - "loss": 0.0195, + "epoch": 0.4507148231753198, + "grad_norm": 0.0017919522942975163, + "learning_rate": 1.1542160372988312e-05, + "loss": 0.0001, "step": 599 }, { - "epoch": 0.5905511811023622, - "grad_norm": 0.0663510113954544, - "learning_rate": 7.193464814699073e-06, - "loss": 0.0036, + "epoch": 0.45146726862302483, + "grad_norm": 0.0012058173306286335, + "learning_rate": 1.1518800102126334e-05, + "loss": 0.0, "step": 600 }, { - "epoch": 0.5915354330708661, - "grad_norm": 0.010237705893814564, - "learning_rate": 7.163799840206893e-06, - "loss": 0.0007, + "epoch": 0.45221971407072986, + "grad_norm": 0.0011316149029880762, + "learning_rate": 1.149543134434259e-05, + "loss": 0.0001, "step": 601 }, { - "epoch": 0.59251968503937, - "grad_norm": 0.7353321313858032, - "learning_rate": 7.134161983165498e-06, - "loss": 0.0745, + "epoch": 0.4529721595184349, + "grad_norm": 0.03514290601015091, + "learning_rate": 1.1472054230219644e-05, + "loss": 0.001, "step": 602 }, { - "epoch": 0.593503937007874, - "grad_norm": 0.015297899022698402, - "learning_rate": 7.104551526948091e-06, - "loss": 0.001, + "epoch": 0.45372460496614, + "grad_norm": 0.742960512638092, + "learning_rate": 1.1448668890386765e-05, + "loss": 0.0066, "step": 603 }, { - "epoch": 0.594488188976378, - "grad_norm": 0.0022395369596779346, - "learning_rate": 7.07496875466589e-06, - "loss": 0.0002, + "epoch": 0.454477050413845, + "grad_norm": 3.21566104888916, + "learning_rate": 1.1425275455519176e-05, + "loss": 0.3373, "step": 604 }, { - "epoch": 0.5954724409448819, - "grad_norm": 0.021553950384259224, - "learning_rate": 7.045413949165434e-06, - "loss": 0.0012, + "epoch": 0.45522949586155004, + "grad_norm": 0.0010515082394704223, + "learning_rate": 1.140187405633734e-05, + "loss": 0.0, "step": 605 }, { - "epoch": 0.5964566929133859, - "grad_norm": 0.45530006289482117, - "learning_rate": 7.015887393025847e-06, - "loss": 0.0159, + "epoch": 0.45598194130925507, + "grad_norm": 0.431357204914093, + "learning_rate": 1.1378464823606228e-05, + "loss": 0.0023, "step": 606 }, { - "epoch": 0.5974409448818898, - "grad_norm": 0.018222475424408913, - "learning_rate": 6.986389368556168e-06, - "loss": 0.0013, + "epoch": 0.4567343867569601, + "grad_norm": 0.43157681822776794, + "learning_rate": 1.1355047888134571e-05, + "loss": 0.0708, "step": 607 }, { - "epoch": 0.5984251968503937, - "grad_norm": 0.005042280536144972, - "learning_rate": 6.9569201577926395e-06, - "loss": 0.0004, + "epoch": 0.4574868322046652, + "grad_norm": 0.0020544701255857944, + "learning_rate": 1.1331623380774156e-05, + "loss": 0.0001, "step": 608 }, { - "epoch": 0.5994094488188977, - "grad_norm": 0.04521799832582474, - "learning_rate": 6.927480042496002e-06, - "loss": 0.0029, + "epoch": 0.4582392776523702, + "grad_norm": 2.6385327146272175e-05, + "learning_rate": 1.1308191432419078e-05, + "loss": 0.0, "step": 609 }, { - "epoch": 0.6003937007874016, - "grad_norm": 0.00425010547041893, - "learning_rate": 6.898069304148816e-06, - "loss": 0.0003, + "epoch": 0.45899172310007524, + "grad_norm": 0.006066860631108284, + "learning_rate": 1.1284752174005005e-05, + "loss": 0.0002, "step": 610 }, { - "epoch": 0.6013779527559056, - "grad_norm": 0.013767623342573643, - "learning_rate": 6.868688223952764e-06, - "loss": 0.001, + "epoch": 0.4597441685477803, + "grad_norm": 0.9779205322265625, + "learning_rate": 1.1261305736508458e-05, + "loss": 0.1798, "step": 611 }, { - "epoch": 0.6023622047244095, - "grad_norm": 0.26801133155822754, - "learning_rate": 6.839337082825954e-06, - "loss": 0.0142, + "epoch": 0.4604966139954853, + "grad_norm": 0.0006280777743086219, + "learning_rate": 1.1237852250946077e-05, + "loss": 0.0, "step": 612 }, { - "epoch": 0.6033464566929134, - "grad_norm": 0.20212164521217346, - "learning_rate": 6.81001616140025e-06, - "loss": 0.0106, + "epoch": 0.4612490594431904, + "grad_norm": 1.0666285753250122, + "learning_rate": 1.1214391848373876e-05, + "loss": 0.0166, "step": 613 }, { - "epoch": 0.6043307086614174, - "grad_norm": 0.4419066309928894, - "learning_rate": 6.7807257400185745e-06, - "loss": 0.0153, + "epoch": 0.4620015048908954, + "grad_norm": 0.018536796793341637, + "learning_rate": 1.1190924659886532e-05, + "loss": 0.0006, "step": 614 }, { - "epoch": 0.6053149606299213, - "grad_norm": 1.2423938512802124, - "learning_rate": 6.75146609873223e-06, - "loss": 0.1522, + "epoch": 0.46275395033860045, + "grad_norm": 0.17641091346740723, + "learning_rate": 1.1167450816616639e-05, + "loss": 0.0047, "step": 615 }, { - "epoch": 0.6062992125984252, - "grad_norm": 3.653700590133667, - "learning_rate": 6.722237517298232e-06, - "loss": 0.1541, + "epoch": 0.4635063957863055, + "grad_norm": 0.02300580032169819, + "learning_rate": 1.1143970449733968e-05, + "loss": 0.0007, "step": 616 }, { - "epoch": 0.6072834645669292, - "grad_norm": 0.06643989682197571, - "learning_rate": 6.693040275176623e-06, - "loss": 0.0032, + "epoch": 0.4642588412340105, + "grad_norm": 0.22667698562145233, + "learning_rate": 1.112048369044475e-05, + "loss": 0.0041, "step": 617 }, { - "epoch": 0.6082677165354331, - "grad_norm": 2.289261817932129, - "learning_rate": 6.6638746515278086e-06, - "loss": 0.3004, + "epoch": 0.4650112866817156, + "grad_norm": 0.0003491557145025581, + "learning_rate": 1.1096990669990942e-05, + "loss": 0.0, "step": 618 }, { - "epoch": 0.609251968503937, - "grad_norm": 2.498094081878662, - "learning_rate": 6.634740925209873e-06, - "loss": 0.3325, + "epoch": 0.4657637321294206, + "grad_norm": 8.98726939340122e-05, + "learning_rate": 1.1073491519649475e-05, + "loss": 0.0, "step": 619 }, { - "epoch": 0.610236220472441, - "grad_norm": 0.002739843912422657, - "learning_rate": 6.605639374775934e-06, - "loss": 0.0002, + "epoch": 0.46651617757712566, + "grad_norm": 0.23045086860656738, + "learning_rate": 1.1049986370731545e-05, + "loss": 0.0034, "step": 620 }, { - "epoch": 0.6112204724409449, - "grad_norm": 0.018509741872549057, - "learning_rate": 6.576570278471471e-06, - "loss": 0.001, + "epoch": 0.4672686230248307, + "grad_norm": 7.242973327636719, + "learning_rate": 1.102647535458186e-05, + "loss": 0.3392, "step": 621 }, { - "epoch": 0.6122047244094488, - "grad_norm": 0.009204177185893059, - "learning_rate": 6.547533914231654e-06, - "loss": 0.0005, + "epoch": 0.4680210684725357, + "grad_norm": 0.00343369715847075, + "learning_rate": 1.1002958602577922e-05, + "loss": 0.0001, "step": 622 }, { - "epoch": 0.6131889763779528, - "grad_norm": 0.13494901359081268, - "learning_rate": 6.518530559678703e-06, - "loss": 0.0073, + "epoch": 0.4687735139202408, + "grad_norm": 0.001794908195734024, + "learning_rate": 1.0979436246129267e-05, + "loss": 0.0001, "step": 623 }, { - "epoch": 0.6141732283464567, - "grad_norm": 0.10958428680896759, - "learning_rate": 6.489560492119225e-06, - "loss": 0.0054, + "epoch": 0.46952595936794583, + "grad_norm": 0.040094684809446335, + "learning_rate": 1.0955908416676772e-05, + "loss": 0.0014, "step": 624 }, { - "epoch": 0.6151574803149606, - "grad_norm": 2.021435022354126, - "learning_rate": 6.460623988541559e-06, - "loss": 0.3087, + "epoch": 0.47027840481565086, + "grad_norm": 1.4908933735569008e-05, + "learning_rate": 1.093237524569188e-05, + "loss": 0.0, "step": 625 }, { - "epoch": 0.6161417322834646, - "grad_norm": 3.2822439670562744, - "learning_rate": 6.431721325613138e-06, - "loss": 0.1506, + "epoch": 0.4710308502633559, + "grad_norm": 0.007998337037861347, + "learning_rate": 1.0908836864675884e-05, + "loss": 0.0004, "step": 626 }, { - "epoch": 0.6171259842519685, - "grad_norm": 1.555001974105835, - "learning_rate": 6.40285277967784e-06, - "loss": 0.0722, + "epoch": 0.4717832957110609, + "grad_norm": 0.039706043899059296, + "learning_rate": 1.0885293405159196e-05, + "loss": 0.0013, "step": 627 }, { - "epoch": 0.6181102362204725, - "grad_norm": 0.014875966124236584, - "learning_rate": 6.374018626753331e-06, - "loss": 0.001, + "epoch": 0.472535741158766, + "grad_norm": 0.002229271689429879, + "learning_rate": 1.0861744998700603e-05, + "loss": 0.0001, "step": 628 }, { - "epoch": 0.6190944881889764, - "grad_norm": 3.05005145072937, - "learning_rate": 6.345219142528454e-06, - "loss": 0.1029, + "epoch": 0.47328818660647104, + "grad_norm": 10.943811416625977, + "learning_rate": 1.0838191776886531e-05, + "loss": 0.0575, "step": 629 }, { - "epoch": 0.6200787401574803, - "grad_norm": 0.0976070687174797, - "learning_rate": 6.316454602360569e-06, - "loss": 0.0049, + "epoch": 0.47404063205417607, + "grad_norm": 1.3392058610916138, + "learning_rate": 1.0814633871330323e-05, + "loss": 0.1315, "step": 630 }, { - "epoch": 0.6210629921259843, - "grad_norm": 0.012725242413580418, - "learning_rate": 6.2877252812729365e-06, - "loss": 0.0009, + "epoch": 0.4747930775018811, + "grad_norm": 0.21532350778579712, + "learning_rate": 1.0791071413671487e-05, + "loss": 0.0045, "step": 631 }, { - "epoch": 0.6220472440944882, - "grad_norm": 0.3721928298473358, - "learning_rate": 6.2590314539520695e-06, - "loss": 0.0146, + "epoch": 0.47554552294958613, + "grad_norm": 0.27968382835388184, + "learning_rate": 1.0767504535574971e-05, + "loss": 0.0037, "step": 632 }, { - "epoch": 0.6230314960629921, - "grad_norm": 0.007010951172560453, - "learning_rate": 6.2303733947451285e-06, - "loss": 0.0004, + "epoch": 0.4762979683972912, + "grad_norm": 0.000970702909398824, + "learning_rate": 1.0743933368730417e-05, + "loss": 0.0, "step": 633 }, { - "epoch": 0.6240157480314961, - "grad_norm": 0.7185295224189758, - "learning_rate": 6.2017513776572855e-06, - "loss": 0.0318, + "epoch": 0.47705041384499625, + "grad_norm": 0.013053186237812042, + "learning_rate": 1.0720358044851448e-05, + "loss": 0.0005, "step": 634 }, { - "epoch": 0.625, - "grad_norm": 0.019771916791796684, - "learning_rate": 6.173165676349103e-06, - "loss": 0.001, + "epoch": 0.4778028592927013, + "grad_norm": 0.01000749971717596, + "learning_rate": 1.0696778695674899e-05, + "loss": 0.0004, "step": 635 }, { - "epoch": 0.6259842519685039, - "grad_norm": 2.4763894081115723, - "learning_rate": 6.144616564133927e-06, - "loss": 0.0585, + "epoch": 0.4785553047404063, + "grad_norm": 0.007056588772684336, + "learning_rate": 1.0673195452960107e-05, + "loss": 0.0002, "step": 636 }, { - "epoch": 0.6269685039370079, - "grad_norm": 0.180698424577713, - "learning_rate": 6.116104313975267e-06, - "loss": 0.0095, + "epoch": 0.47930775018811134, + "grad_norm": 0.014011607505381107, + "learning_rate": 1.0649608448488166e-05, + "loss": 0.0004, "step": 637 }, { - "epoch": 0.6279527559055118, - "grad_norm": 0.9879865050315857, - "learning_rate": 6.0876291984841795e-06, - "loss": 0.0308, + "epoch": 0.4800601956358164, + "grad_norm": 0.014452227391302586, + "learning_rate": 1.0626017814061186e-05, + "loss": 0.0004, "step": 638 }, { - "epoch": 0.6289370078740157, - "grad_norm": 0.021894028410315514, - "learning_rate": 6.059191489916681e-06, - "loss": 0.0015, + "epoch": 0.48081264108352145, + "grad_norm": 0.10185810178518295, + "learning_rate": 1.0602423681501564e-05, + "loss": 0.0018, "step": 639 }, { - "epoch": 0.6299212598425197, - "grad_norm": 0.11342475563287735, - "learning_rate": 6.03079146017113e-06, - "loss": 0.006, + "epoch": 0.4815650865312265, + "grad_norm": 0.0015508810756728053, + "learning_rate": 1.0578826182651243e-05, + "loss": 0.0001, "step": 640 }, { - "epoch": 0.6309055118110236, - "grad_norm": 0.007467803545296192, - "learning_rate": 6.002429380785624e-06, - "loss": 0.0005, + "epoch": 0.4823175319789315, + "grad_norm": 4.458479404449463, + "learning_rate": 1.055522544937098e-05, + "loss": 0.2037, "step": 641 }, { - "epoch": 0.6318897637795275, - "grad_norm": 0.34698984026908875, - "learning_rate": 5.974105522935416e-06, - "loss": 0.1161, + "epoch": 0.48306997742663654, + "grad_norm": 0.028539393097162247, + "learning_rate": 1.0531621613539598e-05, + "loss": 0.0004, "step": 642 }, { - "epoch": 0.6328740157480315, - "grad_norm": 3.867368698120117, - "learning_rate": 5.945820157430316e-06, - "loss": 0.2631, + "epoch": 0.48382242287434163, + "grad_norm": 0.02204856462776661, + "learning_rate": 1.0508014807053261e-05, + "loss": 0.0006, "step": 643 }, { - "epoch": 0.6338582677165354, - "grad_norm": 0.01272546872496605, - "learning_rate": 5.9175735547120975e-06, - "loss": 0.0009, + "epoch": 0.48457486832204666, + "grad_norm": 0.0006058391300030053, + "learning_rate": 1.0484405161824743e-05, + "loss": 0.0, "step": 644 }, { - "epoch": 0.6348425196850394, - "grad_norm": 0.06877221912145615, - "learning_rate": 5.889365984851918e-06, - "loss": 0.0037, + "epoch": 0.4853273137697517, + "grad_norm": 0.00671573868021369, + "learning_rate": 1.0460792809782659e-05, + "loss": 0.0002, "step": 645 }, { - "epoch": 0.6358267716535433, - "grad_norm": 0.23351243138313293, - "learning_rate": 5.8611977175477355e-06, - "loss": 0.0126, + "epoch": 0.4860797592174567, + "grad_norm": 0.004197990987449884, + "learning_rate": 1.0437177882870768e-05, + "loss": 0.0001, "step": 646 }, { - "epoch": 0.6368110236220472, - "grad_norm": 0.08977934718132019, - "learning_rate": 5.833069022121727e-06, - "loss": 0.0043, + "epoch": 0.48683220466516175, + "grad_norm": 9.33966064453125, + "learning_rate": 1.0413560513047208e-05, + "loss": 0.0376, "step": 647 }, { - "epoch": 0.6377952755905512, - "grad_norm": 1.9535436630249023, - "learning_rate": 5.804980167517712e-06, - "loss": 0.1651, + "epoch": 0.48758465011286684, + "grad_norm": 0.0011327359825372696, + "learning_rate": 1.038994083228377e-05, + "loss": 0.0, "step": 648 }, { - "epoch": 0.6387795275590551, - "grad_norm": 0.013944629579782486, - "learning_rate": 5.7769314222985905e-06, - "loss": 0.001, + "epoch": 0.48833709556057187, + "grad_norm": 3.5501651763916016, + "learning_rate": 1.0366318972565162e-05, + "loss": 0.0192, "step": 649 }, { - "epoch": 0.639763779527559, - "grad_norm": 0.025774765759706497, - "learning_rate": 5.748923054643767e-06, - "loss": 0.0017, + "epoch": 0.4890895410082769, + "grad_norm": 8.738161087036133, + "learning_rate": 1.0342695065888262e-05, + "loss": 0.2822, "step": 650 }, { - "epoch": 0.640748031496063, - "grad_norm": 0.171986386179924, - "learning_rate": 5.720955332346587e-06, - "loss": 0.0109, + "epoch": 0.4898419864559819, + "grad_norm": 0.0026988936588168144, + "learning_rate": 1.031906924426139e-05, + "loss": 0.0001, "step": 651 }, { - "epoch": 0.6417322834645669, - "grad_norm": 0.04593617096543312, - "learning_rate": 5.693028522811783e-06, - "loss": 0.0031, + "epoch": 0.49059443190368696, + "grad_norm": 0.011756551451981068, + "learning_rate": 1.0295441639703563e-05, + "loss": 0.0002, "step": 652 }, { - "epoch": 0.6427165354330708, - "grad_norm": 0.006693943403661251, - "learning_rate": 5.665142893052902e-06, - "loss": 0.0005, + "epoch": 0.49134687735139204, + "grad_norm": 0.04992348700761795, + "learning_rate": 1.027181238424376e-05, + "loss": 0.0009, "step": 653 }, { - "epoch": 0.6437007874015748, - "grad_norm": 0.002994636772200465, - "learning_rate": 5.63729870968977e-06, - "loss": 0.0002, + "epoch": 0.49209932279909707, + "grad_norm": 0.17335185408592224, + "learning_rate": 1.0248181609920198e-05, + "loss": 0.0035, "step": 654 }, { - "epoch": 0.6446850393700787, - "grad_norm": 1.536815881729126, - "learning_rate": 5.609496238945939e-06, - "loss": 0.376, + "epoch": 0.4928517682468021, + "grad_norm": 0.0004414380819071084, + "learning_rate": 1.0224549448779564e-05, + "loss": 0.0, "step": 655 }, { - "epoch": 0.6456692913385826, - "grad_norm": 0.013822553679347038, - "learning_rate": 5.581735746646134e-06, - "loss": 0.0009, + "epoch": 0.49360421369450713, + "grad_norm": 1.6693496704101562, + "learning_rate": 1.0200916032876303e-05, + "loss": 0.0686, "step": 656 }, { - "epoch": 0.6466535433070866, - "grad_norm": 0.040410954505205154, - "learning_rate": 5.5540174982137185e-06, - "loss": 0.0023, + "epoch": 0.49435665914221216, + "grad_norm": 0.0005949286860413849, + "learning_rate": 1.0177281494271873e-05, + "loss": 0.0, "step": 657 }, { - "epoch": 0.6476377952755905, - "grad_norm": 0.04101261496543884, - "learning_rate": 5.526341758668158e-06, - "loss": 0.0031, + "epoch": 0.49510910458991725, + "grad_norm": 0.09166250377893448, + "learning_rate": 1.0153645965033998e-05, + "loss": 0.0029, "step": 658 }, { - "epoch": 0.6486220472440944, - "grad_norm": 0.39171817898750305, - "learning_rate": 5.498708792622469e-06, - "loss": 0.0143, + "epoch": 0.4958615500376223, + "grad_norm": 0.0018533958354964852, + "learning_rate": 1.0130009577235946e-05, + "loss": 0.0001, "step": 659 }, { - "epoch": 0.6496062992125984, - "grad_norm": 0.06820935755968094, - "learning_rate": 5.471118864280716e-06, - "loss": 0.0045, + "epoch": 0.4966139954853273, + "grad_norm": 0.004575713537633419, + "learning_rate": 1.010637246295578e-05, + "loss": 0.0002, "step": 660 }, { - "epoch": 0.6505905511811023, - "grad_norm": 0.04471202194690704, - "learning_rate": 5.443572237435466e-06, - "loss": 0.0023, + "epoch": 0.49736644093303234, + "grad_norm": 0.49716654419898987, + "learning_rate": 1.008273475427562e-05, + "loss": 0.0048, "step": 661 }, { - "epoch": 0.6515748031496063, - "grad_norm": 0.08941526710987091, - "learning_rate": 5.416069175465274e-06, - "loss": 0.0044, + "epoch": 0.49811888638073737, + "grad_norm": 1.1305954456329346, + "learning_rate": 1.0059096583280907e-05, + "loss": 0.0953, "step": 662 }, { - "epoch": 0.6525590551181102, - "grad_norm": 0.16131488978862762, - "learning_rate": 5.388609941332164e-06, - "loss": 0.0071, + "epoch": 0.49887133182844245, + "grad_norm": 0.001503349863924086, + "learning_rate": 1.0035458082059672e-05, + "loss": 0.0001, "step": 663 }, { - "epoch": 0.6535433070866141, - "grad_norm": 0.17402151226997375, - "learning_rate": 5.361194797579108e-06, - "loss": 0.011, + "epoch": 0.4996237772761475, + "grad_norm": 0.010958625003695488, + "learning_rate": 1.0011819382701784e-05, + "loss": 0.0003, "step": 664 }, { - "epoch": 0.6545275590551181, - "grad_norm": 0.1764174848794937, - "learning_rate": 5.333824006327533e-06, - "loss": 0.0112, + "epoch": 0.5003762227238525, + "grad_norm": 0.0015685193939134479, + "learning_rate": 9.98818061729822e-06, + "loss": 0.0, "step": 665 }, { - "epoch": 0.655511811023622, - "grad_norm": 0.004917513579130173, - "learning_rate": 5.306497829274785e-06, - "loss": 0.0004, + "epoch": 0.5011286681715575, + "grad_norm": 0.010029254481196404, + "learning_rate": 9.964541917940331e-06, + "loss": 0.0003, "step": 666 }, { - "epoch": 0.656496062992126, - "grad_norm": 0.01852281205356121, - "learning_rate": 5.279216527691657e-06, - "loss": 0.0013, + "epoch": 0.5018811136192626, + "grad_norm": 0.004363952670246363, + "learning_rate": 9.940903416719097e-06, + "loss": 0.0001, "step": 667 }, { - "epoch": 0.65748031496063, - "grad_norm": 0.0919354259967804, - "learning_rate": 5.2519803624198865e-06, - "loss": 0.004, + "epoch": 0.5026335590669676, + "grad_norm": 0.01587914675474167, + "learning_rate": 9.917265245724385e-06, + "loss": 0.0003, "step": 668 }, { - "epoch": 0.6584645669291339, - "grad_norm": 0.007931046187877655, - "learning_rate": 5.2247895938696404e-06, - "loss": 0.0004, + "epoch": 0.5033860045146726, + "grad_norm": 0.005167737137526274, + "learning_rate": 9.893627537044223e-06, + "loss": 0.0002, "step": 669 }, { - "epoch": 0.6594488188976378, - "grad_norm": 0.00611425656825304, - "learning_rate": 5.197644482017048e-06, - "loss": 0.0005, + "epoch": 0.5041384499623778, + "grad_norm": 8.244839668273926, + "learning_rate": 9.869990422764056e-06, + "loss": 0.1592, "step": 670 }, { - "epoch": 0.6604330708661418, - "grad_norm": 0.015278506092727184, - "learning_rate": 5.170545286401694e-06, - "loss": 0.0007, + "epoch": 0.5048908954100828, + "grad_norm": 0.0, + "learning_rate": 9.846354034966003e-06, + "loss": 0.0, "step": 671 }, { - "epoch": 0.6614173228346457, - "grad_norm": 0.056785598397254944, - "learning_rate": 5.143492266124164e-06, - "loss": 0.0035, + "epoch": 0.5056433408577878, + "grad_norm": 18.1772518157959, + "learning_rate": 9.822718505728129e-06, + "loss": 0.3513, "step": 672 }, { - "epoch": 0.6624015748031497, - "grad_norm": 2.505824565887451, - "learning_rate": 5.116485679843542e-06, - "loss": 0.1927, + "epoch": 0.5063957863054929, + "grad_norm": 0.007274544797837734, + "learning_rate": 9.7990839671237e-06, + "loss": 0.0002, "step": 673 }, { - "epoch": 0.6633858267716536, - "grad_norm": 0.026091318577528, - "learning_rate": 5.089525785774951e-06, - "loss": 0.0016, + "epoch": 0.5071482317531979, + "grad_norm": 0.31394076347351074, + "learning_rate": 9.77545055122044e-06, + "loss": 0.1283, "step": 674 }, { - "epoch": 0.6643700787401575, - "grad_norm": 0.08652155846357346, - "learning_rate": 5.062612841687084e-06, - "loss": 0.0031, + "epoch": 0.5079006772009029, + "grad_norm": 0.03855385258793831, + "learning_rate": 9.751818390079805e-06, + "loss": 0.0003, "step": 675 }, { - "epoch": 0.6653543307086615, - "grad_norm": 1.6182775497436523, - "learning_rate": 5.035747104899738e-06, - "loss": 0.1878, + "epoch": 0.508653122648608, + "grad_norm": 0.000805127143394202, + "learning_rate": 9.728187615756243e-06, + "loss": 0.0, "step": 676 }, { - "epoch": 0.6663385826771654, - "grad_norm": 0.004458173643797636, - "learning_rate": 5.008928832281339e-06, - "loss": 0.0003, + "epoch": 0.509405568096313, + "grad_norm": 0.0, + "learning_rate": 9.704558360296444e-06, + "loss": 0.0, "step": 677 }, { - "epoch": 0.6673228346456693, - "grad_norm": 0.12087676674127579, - "learning_rate": 4.982158280246508e-06, - "loss": 0.0026, + "epoch": 0.510158013544018, + "grad_norm": 1.416871190071106, + "learning_rate": 9.680930755738616e-06, + "loss": 0.016, "step": 678 }, { - "epoch": 0.6683070866141733, - "grad_norm": 1.3387526273727417, - "learning_rate": 4.955435704753605e-06, - "loss": 0.0716, + "epoch": 0.510910458991723, + "grad_norm": 0.006285065319389105, + "learning_rate": 9.657304934111742e-06, + "loss": 0.0002, "step": 679 }, { - "epoch": 0.6692913385826772, - "grad_norm": 0.08262021094560623, - "learning_rate": 4.928761361302269e-06, - "loss": 0.0045, + "epoch": 0.5116629044394282, + "grad_norm": 1.3052698373794556, + "learning_rate": 9.633681027434838e-06, + "loss": 0.1261, "step": 680 }, { - "epoch": 0.6702755905511811, - "grad_norm": 0.01581563986837864, - "learning_rate": 4.902135504930987e-06, - "loss": 0.001, + "epoch": 0.5124153498871332, + "grad_norm": 0.0009769117459654808, + "learning_rate": 9.61005916771623e-06, + "loss": 0.0, "step": 681 }, { - "epoch": 0.6712598425196851, - "grad_norm": 0.010388122871518135, - "learning_rate": 4.875558390214652e-06, - "loss": 0.0006, + "epoch": 0.5131677953348383, + "grad_norm": 0.008825716562569141, + "learning_rate": 9.586439486952796e-06, + "loss": 0.0003, "step": 682 }, { - "epoch": 0.672244094488189, - "grad_norm": 1.9765303134918213, - "learning_rate": 4.8490302712621295e-06, - "loss": 0.0772, + "epoch": 0.5139202407825433, + "grad_norm": 0.004172160290181637, + "learning_rate": 9.562822117129235e-06, + "loss": 0.0001, "step": 683 }, { - "epoch": 0.6732283464566929, - "grad_norm": 0.541053295135498, - "learning_rate": 4.8225514017138205e-06, - "loss": 0.0669, + "epoch": 0.5146726862302483, + "grad_norm": 0.03515918552875519, + "learning_rate": 9.539207190217343e-06, + "loss": 0.0009, "step": 684 }, { - "epoch": 0.6742125984251969, - "grad_norm": 0.014389094896614552, - "learning_rate": 4.796122034739251e-06, - "loss": 0.0006, + "epoch": 0.5154251316779533, + "grad_norm": 0.0004445746308192611, + "learning_rate": 9.51559483817526e-06, + "loss": 0.0, "step": 685 }, { - "epoch": 0.6751968503937008, - "grad_norm": 0.02095104567706585, - "learning_rate": 4.76974242303464e-06, - "loss": 0.0009, + "epoch": 0.5161775771256584, + "grad_norm": 0.002501540118828416, + "learning_rate": 9.491985192946742e-06, + "loss": 0.0001, "step": 686 }, { - "epoch": 0.6761811023622047, - "grad_norm": 1.7018764019012451, - "learning_rate": 4.743412818820488e-06, - "loss": 0.1365, + "epoch": 0.5169300225733634, + "grad_norm": 0.21380773186683655, + "learning_rate": 9.468378386460406e-06, + "loss": 0.0934, "step": 687 }, { - "epoch": 0.6771653543307087, - "grad_norm": 0.05205598846077919, - "learning_rate": 4.717133473839163e-06, - "loss": 0.0033, + "epoch": 0.5176824680210684, + "grad_norm": 0.005090567748993635, + "learning_rate": 9.444774550629024e-06, + "loss": 0.0002, "step": 688 }, { - "epoch": 0.6781496062992126, - "grad_norm": 0.575622022151947, - "learning_rate": 4.690904639352499e-06, - "loss": 0.0274, + "epoch": 0.5184349134687735, + "grad_norm": 0.0015086415223777294, + "learning_rate": 9.42117381734876e-06, + "loss": 0.0001, "step": 689 }, { - "epoch": 0.6791338582677166, - "grad_norm": 0.056806761771440506, - "learning_rate": 4.6647265661393806e-06, - "loss": 0.0029, + "epoch": 0.5191873589164786, + "grad_norm": 0.018775468692183495, + "learning_rate": 9.397576318498438e-06, + "loss": 0.0007, "step": 690 }, { - "epoch": 0.6801181102362205, - "grad_norm": 0.03065960854291916, - "learning_rate": 4.638599504493362e-06, - "loss": 0.0017, + "epoch": 0.5199398043641836, + "grad_norm": 0.8499560356140137, + "learning_rate": 9.373982185938815e-06, + "loss": 0.0083, "step": 691 }, { - "epoch": 0.6811023622047244, - "grad_norm": 0.005358200054615736, - "learning_rate": 4.612523704220264e-06, - "loss": 0.0003, + "epoch": 0.5206922498118887, + "grad_norm": 0.2695012092590332, + "learning_rate": 9.350391551511837e-06, + "loss": 0.0015, "step": 692 }, { - "epoch": 0.6820866141732284, - "grad_norm": 0.004167455714195967, - "learning_rate": 4.586499414635788e-06, - "loss": 0.0003, + "epoch": 0.5214446952595937, + "grad_norm": 0.06739958375692368, + "learning_rate": 9.326804547039894e-06, + "loss": 0.0009, "step": 693 }, { - "epoch": 0.6830708661417323, - "grad_norm": 0.009238578379154205, - "learning_rate": 4.56052688456313e-06, - "loss": 0.0007, + "epoch": 0.5221971407072987, + "grad_norm": 1.989675521850586, + "learning_rate": 9.303221304325103e-06, + "loss": 0.2139, "step": 694 }, { - "epoch": 0.6840551181102362, - "grad_norm": 0.024184664711356163, - "learning_rate": 4.534606362330607e-06, - "loss": 0.0014, + "epoch": 0.5229495861550038, + "grad_norm": 0.04741863161325455, + "learning_rate": 9.279641955148553e-06, + "loss": 0.0013, "step": 695 }, { - "epoch": 0.6850393700787402, - "grad_norm": 1.3939580917358398, - "learning_rate": 4.508738095769278e-06, - "loss": 0.0483, + "epoch": 0.5237020316027088, + "grad_norm": 0.060340896248817444, + "learning_rate": 9.256066631269586e-06, + "loss": 0.0008, "step": 696 }, { - "epoch": 0.6860236220472441, - "grad_norm": 0.4179919362068176, - "learning_rate": 4.482922332210569e-06, - "loss": 0.0103, + "epoch": 0.5244544770504138, + "grad_norm": 2.983630895614624, + "learning_rate": 9.232495464425034e-06, + "loss": 0.4607, "step": 697 }, { - "epoch": 0.687007874015748, - "grad_norm": 0.07760833203792572, - "learning_rate": 4.457159318483922e-06, - "loss": 0.0019, + "epoch": 0.5252069224981188, + "grad_norm": 0.0010294626699760556, + "learning_rate": 9.208928586328518e-06, + "loss": 0.0, "step": 698 }, { - "epoch": 0.687992125984252, - "grad_norm": 0.2561807632446289, - "learning_rate": 4.431449300914428e-06, - "loss": 0.016, + "epoch": 0.5259593679458239, + "grad_norm": 1.120234537665965e-05, + "learning_rate": 9.185366128669682e-06, + "loss": 0.0, "step": 699 }, { - "epoch": 0.6889763779527559, - "grad_norm": 1.393060326576233, - "learning_rate": 4.405792525320469e-06, - "loss": 0.1742, + "epoch": 0.526711813393529, + "grad_norm": 2.128415107727051, + "learning_rate": 9.161808223113469e-06, + "loss": 0.1173, "step": 700 }, { - "epoch": 0.6899606299212598, - "grad_norm": 0.9923989176750183, - "learning_rate": 4.3801892370113695e-06, - "loss": 0.0399, + "epoch": 0.527464258841234, + "grad_norm": 0.0016029436374083161, + "learning_rate": 9.138255001299402e-06, + "loss": 0.0001, "step": 701 }, { - "epoch": 0.6909448818897638, - "grad_norm": 1.022470474243164, - "learning_rate": 4.354639680785059e-06, - "loss": 0.0564, + "epoch": 0.5282167042889391, + "grad_norm": 6.54170560836792, + "learning_rate": 9.114706594840806e-06, + "loss": 0.1784, "step": 702 }, { - "epoch": 0.6919291338582677, - "grad_norm": 3.045370578765869, - "learning_rate": 4.3291441009257105e-06, - "loss": 0.1541, + "epoch": 0.5289691497366441, + "grad_norm": 0.005230502225458622, + "learning_rate": 9.091163135324119e-06, + "loss": 0.0002, "step": 703 }, { - "epoch": 0.6929133858267716, - "grad_norm": 0.241269052028656, - "learning_rate": 4.303702741201431e-06, - "loss": 0.0047, + "epoch": 0.5297215951843491, + "grad_norm": 0.028300784528255463, + "learning_rate": 9.067624754308124e-06, + "loss": 0.0004, "step": 704 }, { - "epoch": 0.6938976377952756, - "grad_norm": 1.3594764471054077, - "learning_rate": 4.278315844861912e-06, - "loss": 0.0581, + "epoch": 0.5304740406320542, + "grad_norm": 1.312003587372601e-05, + "learning_rate": 9.044091583323231e-06, + "loss": 0.0, "step": 705 }, { - "epoch": 0.6948818897637795, - "grad_norm": 0.008809342980384827, - "learning_rate": 4.252983654636115e-06, - "loss": 0.0005, + "epoch": 0.5312264860797592, + "grad_norm": 0.03969345614314079, + "learning_rate": 9.020563753870734e-06, + "loss": 0.0009, "step": 706 }, { - "epoch": 0.6958661417322834, - "grad_norm": 3.0739593505859375, - "learning_rate": 4.227706412729943e-06, - "loss": 0.1479, + "epoch": 0.5319789315274642, + "grad_norm": 29.23634910583496, + "learning_rate": 8.997041397422083e-06, + "loss": 0.2304, "step": 707 }, { - "epoch": 0.6968503937007874, - "grad_norm": 2.0128350257873535, - "learning_rate": 4.202484360823926e-06, - "loss": 0.0684, + "epoch": 0.5327313769751693, + "grad_norm": 0.08085115998983383, + "learning_rate": 8.973524645418142e-06, + "loss": 0.0021, "step": 708 }, { - "epoch": 0.6978346456692913, - "grad_norm": 0.0030911520589143038, - "learning_rate": 4.177317740070919e-06, - "loss": 0.0002, + "epoch": 0.5334838224228743, + "grad_norm": 0.001823053928092122, + "learning_rate": 8.95001362926846e-06, + "loss": 0.0001, "step": 709 }, { - "epoch": 0.6988188976377953, - "grad_norm": 3.034400701522827, - "learning_rate": 4.152206791093777e-06, - "loss": 0.0873, + "epoch": 0.5342362678705794, + "grad_norm": 0.0950389876961708, + "learning_rate": 8.926508480350525e-06, + "loss": 0.0008, "step": 710 }, { - "epoch": 0.6998031496062992, - "grad_norm": 0.015304063446819782, - "learning_rate": 4.1271517539830765e-06, - "loss": 0.0008, + "epoch": 0.5349887133182845, + "grad_norm": 0.0005177335697226226, + "learning_rate": 8.903009330009063e-06, + "loss": 0.0, "step": 711 }, { - "epoch": 0.7007874015748031, - "grad_norm": 1.9755024909973145, - "learning_rate": 4.1021528682948064e-06, - "loss": 0.3885, + "epoch": 0.5357411587659895, + "grad_norm": 0.007335309404879808, + "learning_rate": 8.879516309555252e-06, + "loss": 0.0003, "step": 712 }, { - "epoch": 0.7017716535433071, - "grad_norm": 3.4663569927215576, - "learning_rate": 4.077210373048083e-06, - "loss": 0.0675, + "epoch": 0.5364936042136945, + "grad_norm": 0.0022024691570550203, + "learning_rate": 8.856029550266036e-06, + "loss": 0.0001, "step": 713 }, { - "epoch": 0.702755905511811, - "grad_norm": 0.9540387988090515, - "learning_rate": 4.052324506722861e-06, - "loss": 0.098, + "epoch": 0.5372460496613995, + "grad_norm": 0.0016011092811822891, + "learning_rate": 8.832549183383363e-06, + "loss": 0.0001, "step": 714 }, { - "epoch": 0.7037401574803149, - "grad_norm": 0.05092444643378258, - "learning_rate": 4.0274955072576605e-06, - "loss": 0.003, + "epoch": 0.5379984951091046, + "grad_norm": 0.009527268819510937, + "learning_rate": 8.80907534011347e-06, + "loss": 0.0003, "step": 715 }, { - "epoch": 0.7047244094488189, - "grad_norm": 0.125480055809021, - "learning_rate": 4.002723612047272e-06, - "loss": 0.0036, + "epoch": 0.5387509405568096, + "grad_norm": 0.04128224030137062, + "learning_rate": 8.785608151626126e-06, + "loss": 0.0012, "step": 716 }, { - "epoch": 0.7057086614173228, - "grad_norm": 0.021513557061553, - "learning_rate": 3.978009057940518e-06, - "loss": 0.0012, + "epoch": 0.5395033860045146, + "grad_norm": 0.013059580698609352, + "learning_rate": 8.762147749053928e-06, + "loss": 0.0004, "step": 717 }, { - "epoch": 0.7066929133858267, - "grad_norm": 0.012333592399954796, - "learning_rate": 3.953352081237963e-06, - "loss": 0.0007, + "epoch": 0.5402558314522197, + "grad_norm": 0.0006950248498469591, + "learning_rate": 8.738694263491545e-06, + "loss": 0.0, "step": 718 }, { - "epoch": 0.7076771653543307, - "grad_norm": 0.07267458736896515, - "learning_rate": 3.928752917689667e-06, - "loss": 0.0035, + "epoch": 0.5410082768999247, + "grad_norm": 0.050438202917575836, + "learning_rate": 8.715247825995e-06, + "loss": 0.0014, "step": 719 }, { - "epoch": 0.7086614173228346, - "grad_norm": 0.1251276135444641, - "learning_rate": 3.904211802492922e-06, - "loss": 0.0057, + "epoch": 0.5417607223476298, + "grad_norm": 0.007887017913162708, + "learning_rate": 8.691808567580922e-06, + "loss": 0.0002, "step": 720 }, { - "epoch": 0.7096456692913385, - "grad_norm": 0.020583776757121086, - "learning_rate": 3.879728970290016e-06, - "loss": 0.001, + "epoch": 0.5425131677953349, + "grad_norm": 1.9573593139648438, + "learning_rate": 8.668376619225846e-06, + "loss": 0.1971, "step": 721 }, { - "epoch": 0.7106299212598425, - "grad_norm": 0.03735816478729248, - "learning_rate": 3.855304655165978e-06, - "loss": 0.0013, + "epoch": 0.5432656132430399, + "grad_norm": 0.005204391200095415, + "learning_rate": 8.64495211186543e-06, + "loss": 0.0002, "step": 722 }, { - "epoch": 0.7116141732283464, - "grad_norm": 3.6045660972595215, - "learning_rate": 3.8309390906463405e-06, - "loss": 0.1728, + "epoch": 0.5440180586907449, + "grad_norm": 0.10841142386198044, + "learning_rate": 8.621535176393776e-06, + "loss": 0.0026, "step": 723 }, { - "epoch": 0.7125984251968503, - "grad_norm": 0.022377314046025276, - "learning_rate": 3.8066325096949153e-06, - "loss": 0.0016, + "epoch": 0.54477050413845, + "grad_norm": 0.006886586546897888, + "learning_rate": 8.598125943662662e-06, + "loss": 0.0003, "step": 724 }, { - "epoch": 0.7135826771653543, - "grad_norm": 2.760244369506836, - "learning_rate": 3.7823851447115613e-06, - "loss": 0.2237, + "epoch": 0.545522949586155, + "grad_norm": 1.904536485671997, + "learning_rate": 8.574724544480829e-06, + "loss": 0.0237, "step": 725 }, { - "epoch": 0.7145669291338582, - "grad_norm": 0.16297118365764618, - "learning_rate": 3.7581972275299606e-06, - "loss": 0.0045, + "epoch": 0.54627539503386, + "grad_norm": 0.0, + "learning_rate": 8.551331109613238e-06, + "loss": 0.0, "step": 726 }, { - "epoch": 0.7155511811023622, - "grad_norm": 0.012333176098763943, - "learning_rate": 3.7340689894154023e-06, - "loss": 0.0005, + "epoch": 0.547027840481565, + "grad_norm": 0.048206303268671036, + "learning_rate": 8.527945769780358e-06, + "loss": 0.0006, "step": 727 }, { - "epoch": 0.7165354330708661, - "grad_norm": 0.1792801022529602, - "learning_rate": 3.710000661062578e-06, - "loss": 0.0082, + "epoch": 0.5477802859292701, + "grad_norm": 0.05634288117289543, + "learning_rate": 8.504568655657415e-06, + "loss": 0.001, "step": 728 }, { - "epoch": 0.71751968503937, - "grad_norm": 0.027290988713502884, - "learning_rate": 3.6859924725933616e-06, - "loss": 0.0013, + "epoch": 0.5485327313769752, + "grad_norm": 0.43885576725006104, + "learning_rate": 8.481199897873667e-06, + "loss": 0.1386, "step": 729 }, { - "epoch": 0.718503937007874, - "grad_norm": 0.0829373225569725, - "learning_rate": 3.6620446535546227e-06, - "loss": 0.0031, + "epoch": 0.5492851768246803, + "grad_norm": 2.3144471645355225, + "learning_rate": 8.457839627011693e-06, + "loss": 0.0281, "step": 730 }, { - "epoch": 0.719488188976378, - "grad_norm": 0.022530362010002136, - "learning_rate": 3.6381574329160275e-06, - "loss": 0.0012, + "epoch": 0.5500376222723853, + "grad_norm": 0.016497652977705002, + "learning_rate": 8.43448797360663e-06, + "loss": 0.0004, "step": 731 }, { - "epoch": 0.7204724409448819, - "grad_norm": 0.03081757389008999, - "learning_rate": 3.6143310390678544e-06, - "loss": 0.0019, + "epoch": 0.5507900677200903, + "grad_norm": 3.199171543121338, + "learning_rate": 8.411145068145474e-06, + "loss": 0.5294, "step": 732 }, { - "epoch": 0.7214566929133859, - "grad_norm": 0.2475489228963852, - "learning_rate": 3.590565699818801e-06, - "loss": 0.0076, + "epoch": 0.5515425131677953, + "grad_norm": 0.08361708372831345, + "learning_rate": 8.38781104106633e-06, + "loss": 0.0013, "step": 733 }, { - "epoch": 0.7224409448818898, - "grad_norm": 3.6537704467773438, - "learning_rate": 3.566861642393803e-06, - "loss": 0.1518, + "epoch": 0.5522949586155004, + "grad_norm": 0.011733477003872395, + "learning_rate": 8.3644860227577e-06, + "loss": 0.0004, "step": 734 }, { - "epoch": 0.7234251968503937, - "grad_norm": 0.901038646697998, - "learning_rate": 3.5432190934318787e-06, - "loss": 0.0306, + "epoch": 0.5530474040632054, + "grad_norm": 0.005462857894599438, + "learning_rate": 8.341170143557733e-06, + "loss": 0.0002, "step": 735 }, { - "epoch": 0.7244094488188977, - "grad_norm": 0.49936366081237793, - "learning_rate": 3.5196382789839477e-06, - "loss": 0.0185, + "epoch": 0.5537998495109104, + "grad_norm": 0.0030956827104091644, + "learning_rate": 8.317863533753523e-06, + "loss": 0.0001, "step": 736 }, { - "epoch": 0.7253937007874016, - "grad_norm": 0.9579805731773376, - "learning_rate": 3.496119424510678e-06, - "loss": 0.0284, + "epoch": 0.5545522949586155, + "grad_norm": 1.1541496515274048, + "learning_rate": 8.294566323580359e-06, + "loss": 0.1397, "step": 737 }, { - "epoch": 0.7263779527559056, - "grad_norm": 0.002690441906452179, - "learning_rate": 3.4726627548803205e-06, - "loss": 0.0002, + "epoch": 0.5553047404063205, + "grad_norm": 4.27673864364624, + "learning_rate": 8.27127864322101e-06, + "loss": 0.0181, "step": 738 }, { - "epoch": 0.7273622047244095, - "grad_norm": 0.00433709379285574, - "learning_rate": 3.449268494366571e-06, - "loss": 0.0003, + "epoch": 0.5560571858540256, + "grad_norm": 0.05562518537044525, + "learning_rate": 8.248000622804986e-06, + "loss": 0.0015, "step": 739 }, { - "epoch": 0.7283464566929134, - "grad_norm": 0.2354598045349121, - "learning_rate": 3.425936866646419e-06, - "loss": 0.0052, + "epoch": 0.5568096313017307, + "grad_norm": 1.1070342063903809, + "learning_rate": 8.224732392407834e-06, + "loss": 0.0464, "step": 740 }, { - "epoch": 0.7293307086614174, - "grad_norm": 0.00649993447586894, - "learning_rate": 3.402668094798003e-06, - "loss": 0.0004, + "epoch": 0.5575620767494357, + "grad_norm": 0.007482460699975491, + "learning_rate": 8.201474082050376e-06, + "loss": 0.0003, "step": 741 }, { - "epoch": 0.7303149606299213, - "grad_norm": 0.013334258459508419, - "learning_rate": 3.3794624012984913e-06, - "loss": 0.001, + "epoch": 0.5583145221971407, + "grad_norm": 0.007131911348551512, + "learning_rate": 8.178225821698013e-06, + "loss": 0.0003, "step": 742 }, { - "epoch": 0.7312992125984252, - "grad_norm": 1.2315829992294312, - "learning_rate": 3.356320008021946e-06, - "loss": 0.0165, + "epoch": 0.5590669676448458, + "grad_norm": 0.010571115650236607, + "learning_rate": 8.154987741259986e-06, + "loss": 0.0002, "step": 743 }, { - "epoch": 0.7322834645669292, - "grad_norm": 0.002481029834598303, - "learning_rate": 3.3332411362372063e-06, - "loss": 0.0001, + "epoch": 0.5598194130925508, + "grad_norm": 0.007416147738695145, + "learning_rate": 8.13175997058865e-06, + "loss": 0.0003, "step": 744 }, { - "epoch": 0.7332677165354331, - "grad_norm": 0.017066115513443947, - "learning_rate": 3.3102260066057655e-06, - "loss": 0.0008, + "epoch": 0.5605718585402558, + "grad_norm": 0.07424626499414444, + "learning_rate": 8.10854263947875e-06, + "loss": 0.002, "step": 745 }, { - "epoch": 0.734251968503937, - "grad_norm": 0.5193158984184265, - "learning_rate": 3.2872748391796736e-06, - "loss": 0.0084, + "epoch": 0.5613243039879608, + "grad_norm": 0.009504971094429493, + "learning_rate": 8.085335877666696e-06, + "loss": 0.0004, "step": 746 }, { - "epoch": 0.735236220472441, - "grad_norm": 2.111267566680908, - "learning_rate": 3.2643878533994145e-06, - "loss": 0.304, + "epoch": 0.5620767494356659, + "grad_norm": 0.019785739481449127, + "learning_rate": 8.062139814829839e-06, + "loss": 0.0006, "step": 747 }, { - "epoch": 0.7362204724409449, - "grad_norm": 2.7837791442871094, - "learning_rate": 3.2415652680918262e-06, - "loss": 0.2631, + "epoch": 0.5628291948833709, + "grad_norm": 0.002929375506937504, + "learning_rate": 8.038954580585742e-06, + "loss": 0.0001, "step": 748 }, { - "epoch": 0.7372047244094488, - "grad_norm": 0.0712798461318016, - "learning_rate": 3.218807301468003e-06, - "loss": 0.0031, + "epoch": 0.563581640331076, + "grad_norm": 3.0780787467956543, + "learning_rate": 8.015780304491457e-06, + "loss": 0.069, "step": 749 }, { - "epoch": 0.7381889763779528, - "grad_norm": 0.4613245725631714, - "learning_rate": 3.196114171121205e-06, - "loss": 0.0382, + "epoch": 0.5643340857787811, + "grad_norm": 10.312975883483887, + "learning_rate": 7.992617116042813e-06, + "loss": 0.0922, "step": 750 }, { - "epoch": 0.7391732283464567, - "grad_norm": 0.4071942865848541, - "learning_rate": 3.173486094024779e-06, - "loss": 0.0206, + "epoch": 0.5650865312264861, + "grad_norm": 0.4201790690422058, + "learning_rate": 7.969465144673674e-06, + "loss": 0.0038, "step": 751 }, { - "epoch": 0.7401574803149606, - "grad_norm": 0.005993656814098358, - "learning_rate": 3.1509232865300886e-06, - "loss": 0.0004, + "epoch": 0.5658389766741911, + "grad_norm": 0.24568204581737518, + "learning_rate": 7.946324519755225e-06, + "loss": 0.0055, "step": 752 }, { - "epoch": 0.7411417322834646, - "grad_norm": 0.0022017699666321278, - "learning_rate": 3.128425964364442e-06, - "loss": 0.0001, + "epoch": 0.5665914221218962, + "grad_norm": 0.007804738823324442, + "learning_rate": 7.92319537059525e-06, + "loss": 0.0004, "step": 753 }, { - "epoch": 0.7421259842519685, - "grad_norm": 2.6292972564697266, - "learning_rate": 3.1059943426290228e-06, - "loss": 0.1854, + "epoch": 0.5673438675696012, + "grad_norm": 0.002531126607209444, + "learning_rate": 7.900077826437402e-06, + "loss": 0.0001, "step": 754 }, { - "epoch": 0.7431102362204725, - "grad_norm": 0.6754085421562195, - "learning_rate": 3.0836286357968503e-06, - "loss": 0.0136, + "epoch": 0.5680963130173062, + "grad_norm": 0.002385746920481324, + "learning_rate": 7.876972016460492e-06, + "loss": 0.0001, "step": 755 }, { - "epoch": 0.7440944881889764, - "grad_norm": 0.27799344062805176, - "learning_rate": 3.061329057710711e-06, - "loss": 0.0101, + "epoch": 0.5688487584650113, + "grad_norm": 0.009147096425294876, + "learning_rate": 7.853878069777762e-06, + "loss": 0.0004, "step": 756 }, { - "epoch": 0.7450787401574803, - "grad_norm": 0.19252178072929382, - "learning_rate": 3.039095821581127e-06, - "loss": 0.0079, + "epoch": 0.5696012039127163, + "grad_norm": 0.4006285071372986, + "learning_rate": 7.83079611543616e-06, + "loss": 0.0046, "step": 757 }, { - "epoch": 0.7460629921259843, - "grad_norm": 0.019021619111299515, - "learning_rate": 3.0169291399843105e-06, - "loss": 0.0008, + "epoch": 0.5703536493604213, + "grad_norm": 3.700183868408203, + "learning_rate": 7.80772628241562e-06, + "loss": 0.1669, "step": 758 }, { - "epoch": 0.7470472440944882, - "grad_norm": 0.005956071428954601, - "learning_rate": 2.994829224860135e-06, - "loss": 0.0004, + "epoch": 0.5711060948081265, + "grad_norm": 0.007939686998724937, + "learning_rate": 7.784668699628345e-06, + "loss": 0.0003, "step": 759 }, { - "epoch": 0.7480314960629921, - "grad_norm": 2.226811408996582, - "learning_rate": 2.9727962875101e-06, - "loss": 0.0974, + "epoch": 0.5718585402558315, + "grad_norm": 0.001986120129004121, + "learning_rate": 7.761623495918089e-06, + "loss": 0.0001, "step": 760 }, { - "epoch": 0.7490157480314961, - "grad_norm": 0.010731425136327744, - "learning_rate": 2.950830538595325e-06, - "loss": 0.0007, + "epoch": 0.5726109857035365, + "grad_norm": 0.0024444598238915205, + "learning_rate": 7.738590800059427e-06, + "loss": 0.0001, "step": 761 }, { - "epoch": 0.75, - "grad_norm": 0.2856708765029907, - "learning_rate": 2.9289321881345257e-06, - "loss": 0.0048, + "epoch": 0.5733634311512416, + "grad_norm": 0.0029928828589618206, + "learning_rate": 7.715570740757045e-06, + "loss": 0.0001, "step": 762 }, { - "epoch": 0.7509842519685039, - "grad_norm": 2.5673253536224365, - "learning_rate": 2.9071014455020086e-06, - "loss": 0.0344, + "epoch": 0.5741158765989466, + "grad_norm": 0.02208460308611393, + "learning_rate": 7.692563446645017e-06, + "loss": 0.0009, "step": 763 }, { - "epoch": 0.7519685039370079, - "grad_norm": 3.5749475955963135, - "learning_rate": 2.8853385194256677e-06, - "loss": 0.3186, + "epoch": 0.5748683220466516, + "grad_norm": 0.02059108205139637, + "learning_rate": 7.66956904628608e-06, + "loss": 0.0005, "step": 764 }, { - "epoch": 0.7529527559055118, - "grad_norm": 0.663025438785553, - "learning_rate": 2.8636436179849923e-06, - "loss": 0.0237, + "epoch": 0.5756207674943566, + "grad_norm": 4.884920120239258, + "learning_rate": 7.64658766817093e-06, + "loss": 0.0456, "step": 765 }, { - "epoch": 0.7539370078740157, - "grad_norm": 0.5449152588844299, - "learning_rate": 2.8420169486090765e-06, - "loss": 0.0152, + "epoch": 0.5763732129420617, + "grad_norm": 0.04961561784148216, + "learning_rate": 7.623619440717493e-06, + "loss": 0.0016, "step": 766 }, { - "epoch": 0.7549212598425197, - "grad_norm": 0.02205800637602806, - "learning_rate": 2.8204587180746256e-06, - "loss": 0.0014, + "epoch": 0.5771256583897667, + "grad_norm": 0.024083776399493217, + "learning_rate": 7.600664492270206e-06, + "loss": 0.0006, "step": 767 }, { - "epoch": 0.7559055118110236, - "grad_norm": 0.5135183930397034, - "learning_rate": 2.798969132503997e-06, - "loss": 0.1016, + "epoch": 0.5778781038374717, + "grad_norm": 0.017353009432554245, + "learning_rate": 7.57772295109931e-06, + "loss": 0.0006, "step": 768 }, { - "epoch": 0.7568897637795275, - "grad_norm": 2.3159573078155518, - "learning_rate": 2.7775483973632177e-06, - "loss": 0.2155, + "epoch": 0.5786305492851769, + "grad_norm": 0.00047786516370251775, + "learning_rate": 7.554794945400122e-06, + "loss": 0.0, "step": 769 }, { - "epoch": 0.7578740157480315, - "grad_norm": 0.03194061294198036, - "learning_rate": 2.7561967174600234e-06, - "loss": 0.0012, + "epoch": 0.5793829947328819, + "grad_norm": 0.2889784276485443, + "learning_rate": 7.531880603292333e-06, + "loss": 0.0052, "step": 770 }, { - "epoch": 0.7588582677165354, - "grad_norm": 0.0891883447766304, - "learning_rate": 2.7349142969418986e-06, - "loss": 0.004, + "epoch": 0.5801354401805869, + "grad_norm": 5.133121490478516, + "learning_rate": 7.508980052819274e-06, + "loss": 0.1127, "step": 771 }, { - "epoch": 0.7598425196850394, - "grad_norm": 2.238450288772583, - "learning_rate": 2.713701339294129e-06, - "loss": 0.1679, + "epoch": 0.580887885628292, + "grad_norm": 0.06616966426372528, + "learning_rate": 7.486093421947214e-06, + "loss": 0.0023, "step": 772 }, { - "epoch": 0.7608267716535433, - "grad_norm": 2.750286817550659, - "learning_rate": 2.692558047337843e-06, - "loss": 0.2688, + "epoch": 0.581640331075997, + "grad_norm": 0.0013333017705008388, + "learning_rate": 7.463220838564635e-06, + "loss": 0.0001, "step": 773 }, { - "epoch": 0.7618110236220472, - "grad_norm": 0.013673401437699795, - "learning_rate": 2.6714846232280932e-06, - "loss": 0.0007, + "epoch": 0.582392776523702, + "grad_norm": 1.0273889303207397, + "learning_rate": 7.440362430481529e-06, + "loss": 0.08, "step": 774 }, { - "epoch": 0.7627952755905512, - "grad_norm": 5.439116954803467, - "learning_rate": 2.6504812684519053e-06, - "loss": 0.3138, + "epoch": 0.5831452219714071, + "grad_norm": 0.7515692710876465, + "learning_rate": 7.417518325428678e-06, + "loss": 0.11, "step": 775 }, { - "epoch": 0.7637795275590551, - "grad_norm": 0.18272237479686737, - "learning_rate": 2.6295481838263628e-06, - "loss": 0.0076, + "epoch": 0.5838976674191121, + "grad_norm": 3.273358743172139e-05, + "learning_rate": 7.3946886510569385e-06, + "loss": 0.0, "step": 776 }, { - "epoch": 0.764763779527559, - "grad_norm": 0.23185092210769653, - "learning_rate": 2.6086855694966795e-06, - "loss": 0.009, + "epoch": 0.5846501128668171, + "grad_norm": 3.785158634185791, + "learning_rate": 7.371873534936522e-06, + "loss": 0.448, "step": 777 }, { - "epoch": 0.765748031496063, - "grad_norm": 0.7773996591567993, - "learning_rate": 2.587893624934292e-06, - "loss": 0.0199, + "epoch": 0.5854025583145221, + "grad_norm": 0.004100090358406305, + "learning_rate": 7.349073104556301e-06, + "loss": 0.0002, "step": 778 }, { - "epoch": 0.7667322834645669, - "grad_norm": 0.04924741014838219, - "learning_rate": 2.567172548934952e-06, - "loss": 0.0023, + "epoch": 0.5861550037622273, + "grad_norm": 0.28187495470046997, + "learning_rate": 7.326287487323078e-06, + "loss": 0.0012, "step": 779 }, { - "epoch": 0.7677165354330708, - "grad_norm": 0.4784688353538513, - "learning_rate": 2.5465225396168134e-06, - "loss": 0.1216, + "epoch": 0.5869074492099323, + "grad_norm": 5.85366678237915, + "learning_rate": 7.3035168105608885e-06, + "loss": 0.0818, "step": 780 }, { - "epoch": 0.7687007874015748, - "grad_norm": 0.00776605773717165, - "learning_rate": 2.5259437944185584e-06, - "loss": 0.0004, + "epoch": 0.5876598946576373, + "grad_norm": 0.005729475524276495, + "learning_rate": 7.280761201510275e-06, + "loss": 0.0002, "step": 781 }, { - "epoch": 0.7696850393700787, - "grad_norm": 0.0053635635413229465, - "learning_rate": 2.505436510097494e-06, - "loss": 0.0003, + "epoch": 0.5884123401053424, + "grad_norm": 0.1717999428510666, + "learning_rate": 7.2580207873275865e-06, + "loss": 0.0992, "step": 782 }, { - "epoch": 0.7706692913385826, - "grad_norm": 0.15141254663467407, - "learning_rate": 2.4850008827276796e-06, - "loss": 0.0064, + "epoch": 0.5891647855530474, + "grad_norm": 0.0329020619392395, + "learning_rate": 7.235295695084259e-06, + "loss": 0.0006, "step": 783 }, { - "epoch": 0.7716535433070866, - "grad_norm": 0.01565377414226532, - "learning_rate": 2.464637107698046e-06, - "loss": 0.0009, + "epoch": 0.5899172310007524, + "grad_norm": 0.012131314724683762, + "learning_rate": 7.212586051766118e-06, + "loss": 0.0003, "step": 784 }, { - "epoch": 0.7726377952755905, - "grad_norm": 0.026474611833691597, - "learning_rate": 2.444345379710533e-06, - "loss": 0.0013, + "epoch": 0.5906696764484575, + "grad_norm": 1.1857929229736328, + "learning_rate": 7.189891984272659e-06, + "loss": 0.0395, "step": 785 }, { - "epoch": 0.7736220472440944, - "grad_norm": 0.00480277044698596, - "learning_rate": 2.4241258927782197e-06, + "epoch": 0.5914221218961625, + "grad_norm": 0.0067533692345023155, + "learning_rate": 7.16721361941634e-06, "loss": 0.0003, "step": 786 }, { - "epoch": 0.7746062992125984, - "grad_norm": 0.007867386564612389, - "learning_rate": 2.4039788402234787e-06, - "loss": 0.0005, + "epoch": 0.5921745673438675, + "grad_norm": 0.0263225045055151, + "learning_rate": 7.144551083921875e-06, + "loss": 0.001, "step": 787 }, { - "epoch": 0.7755905511811023, - "grad_norm": 1.9401617050170898, - "learning_rate": 2.3839044146761227e-06, - "loss": 0.0915, + "epoch": 0.5929270127915726, + "grad_norm": 2.086378812789917, + "learning_rate": 7.121904504425523e-06, + "loss": 0.0377, "step": 788 }, { - "epoch": 0.7765748031496063, - "grad_norm": 1.5311763286590576, - "learning_rate": 2.3639028080715653e-06, - "loss": 0.0636, + "epoch": 0.5936794582392777, + "grad_norm": 2.842104196548462, + "learning_rate": 7.0992740074743835e-06, + "loss": 0.0224, "step": 789 }, { - "epoch": 0.7775590551181102, - "grad_norm": 0.11246434599161148, - "learning_rate": 2.3439742116489827e-06, - "loss": 0.0031, + "epoch": 0.5944319036869827, + "grad_norm": 0.00352369318716228, + "learning_rate": 7.076659719525694e-06, + "loss": 0.0002, "step": 790 }, { - "epoch": 0.7785433070866141, - "grad_norm": 0.02461765520274639, - "learning_rate": 2.3241188159494855e-06, - "loss": 0.0014, + "epoch": 0.5951843491346878, + "grad_norm": 0.02450047992169857, + "learning_rate": 7.05406176694611e-06, + "loss": 0.0006, "step": 791 }, { - "epoch": 0.7795275590551181, - "grad_norm": 0.3959275782108307, - "learning_rate": 2.304336810814305e-06, - "loss": 0.0156, + "epoch": 0.5959367945823928, + "grad_norm": 0.01613771915435791, + "learning_rate": 7.031480276011007e-06, + "loss": 0.0007, "step": 792 }, { - "epoch": 0.780511811023622, - "grad_norm": 0.5118983387947083, - "learning_rate": 2.2846283853829576e-06, - "loss": 0.0456, + "epoch": 0.5966892400300978, + "grad_norm": 16.005887985229492, + "learning_rate": 7.008915372903775e-06, + "loss": 0.1847, "step": 793 }, { - "epoch": 0.781496062992126, - "grad_norm": 1.8995429277420044, - "learning_rate": 2.2649937280914614e-06, - "loss": 0.0554, + "epoch": 0.5974416854778029, + "grad_norm": 0.023932024836540222, + "learning_rate": 6.986367183715117e-06, + "loss": 0.0009, "step": 794 }, { - "epoch": 0.78248031496063, - "grad_norm": 0.19242697954177856, - "learning_rate": 2.245433026670524e-06, - "loss": 0.0121, + "epoch": 0.5981941309255079, + "grad_norm": 0.07864760607481003, + "learning_rate": 6.963835834442336e-06, + "loss": 0.0028, "step": 795 }, { - "epoch": 0.7834645669291339, - "grad_norm": 0.021557975560426712, - "learning_rate": 2.2259464681437404e-06, - "loss": 0.0012, + "epoch": 0.5989465763732129, + "grad_norm": 0.09168053418397903, + "learning_rate": 6.941321450988633e-06, + "loss": 0.0023, "step": 796 }, { - "epoch": 0.7844488188976378, - "grad_norm": 0.6313809752464294, - "learning_rate": 2.2065342388258193e-06, - "loss": 0.1019, + "epoch": 0.5996990218209179, + "grad_norm": 2.519742965698242, + "learning_rate": 6.918824159162409e-06, + "loss": 0.3657, "step": 797 }, { - "epoch": 0.7854330708661418, - "grad_norm": 0.7024702429771423, - "learning_rate": 2.1871965243207795e-06, - "loss": 0.0365, + "epoch": 0.600451467268623, + "grad_norm": 0.7624035477638245, + "learning_rate": 6.89634408467656e-06, + "loss": 0.057, "step": 798 }, { - "epoch": 0.7864173228346457, - "grad_norm": 0.024476071819663048, - "learning_rate": 2.1679335095202035e-06, - "loss": 0.0015, + "epoch": 0.6012039127163281, + "grad_norm": 3.3864188194274902, + "learning_rate": 6.873881353147766e-06, + "loss": 0.3926, "step": 799 }, { - "epoch": 0.7874015748031497, - "grad_norm": 0.3336131274700165, - "learning_rate": 2.1487453786014513e-06, - "loss": 0.0069, + "epoch": 0.6019563581640331, + "grad_norm": 1.0742292404174805, + "learning_rate": 6.851436090095807e-06, + "loss": 0.0186, "step": 800 }, { - "epoch": 0.7883858267716536, - "grad_norm": 0.3403570055961609, - "learning_rate": 2.1296323150259e-06, - "loss": 0.0099, + "epoch": 0.6027088036117382, + "grad_norm": 3.5620031356811523, + "learning_rate": 6.829008420942842e-06, + "loss": 0.0536, "step": 801 }, { - "epoch": 0.7893700787401575, - "grad_norm": 0.013246452435851097, - "learning_rate": 2.1105945015371985e-06, - "loss": 0.0009, + "epoch": 0.6034612490594432, + "grad_norm": 2.453146457672119, + "learning_rate": 6.806598471012717e-06, + "loss": 0.0645, "step": 802 }, { - "epoch": 0.7903543307086615, - "grad_norm": 0.06293017417192459, - "learning_rate": 2.0916321201595167e-06, - "loss": 0.0036, + "epoch": 0.6042136945071482, + "grad_norm": 0.018604706972837448, + "learning_rate": 6.784206365530268e-06, + "loss": 0.0006, "step": 803 }, { - "epoch": 0.7913385826771654, - "grad_norm": 0.029213231056928635, - "learning_rate": 2.072745352195794e-06, - "loss": 0.0016, + "epoch": 0.6049661399548533, + "grad_norm": 0.6108080744743347, + "learning_rate": 6.761832229620618e-06, + "loss": 0.0065, "step": 804 }, { - "epoch": 0.7923228346456693, - "grad_norm": 0.004889960866421461, - "learning_rate": 2.0539343782260247e-06, - "loss": 0.0003, + "epoch": 0.6057185854025583, + "grad_norm": 0.9389123916625977, + "learning_rate": 6.739476188308476e-06, + "loss": 0.0085, "step": 805 }, { - "epoch": 0.7933070866141733, - "grad_norm": 0.17887891829013824, - "learning_rate": 2.0351993781055192e-06, - "loss": 0.0079, + "epoch": 0.6064710308502633, + "grad_norm": 0.0035854075103998184, + "learning_rate": 6.717138366517438e-06, + "loss": 0.0002, "step": 806 }, { - "epoch": 0.7942913385826772, - "grad_norm": 0.17912250757217407, - "learning_rate": 2.016540530963188e-06, - "loss": 0.0054, + "epoch": 0.6072234762979684, + "grad_norm": 0.003520939266309142, + "learning_rate": 6.694818889069294e-06, + "loss": 0.0002, "step": 807 }, { - "epoch": 0.7952755905511811, - "grad_norm": 1.3486394882202148, - "learning_rate": 1.997958015199829e-06, - "loss": 0.0222, + "epoch": 0.6079759217456734, + "grad_norm": 0.9055103659629822, + "learning_rate": 6.672517880683332e-06, + "loss": 0.0895, "step": 808 }, { - "epoch": 0.7962598425196851, - "grad_norm": 0.08057564496994019, - "learning_rate": 1.9794520084864187e-06, - "loss": 0.0031, + "epoch": 0.6087283671933785, + "grad_norm": 0.1636822521686554, + "learning_rate": 6.6502354659756165e-06, + "loss": 0.0064, "step": 809 }, { - "epoch": 0.797244094488189, - "grad_norm": 0.020239083096385002, - "learning_rate": 1.9610226877624217e-06, - "loss": 0.0013, + "epoch": 0.6094808126410836, + "grad_norm": 4.655453681945801, + "learning_rate": 6.627971769458341e-06, + "loss": 0.1029, "step": 810 }, { - "epoch": 0.7982283464566929, - "grad_norm": 0.18024179339408875, - "learning_rate": 1.9426702292340826e-06, - "loss": 0.0064, + "epoch": 0.6102332580887886, + "grad_norm": 0.3717281222343445, + "learning_rate": 6.605726915539088e-06, + "loss": 0.2568, "step": 811 }, { - "epoch": 0.7992125984251969, - "grad_norm": 0.037026502192020416, - "learning_rate": 1.9243948083727626e-06, - "loss": 0.0026, + "epoch": 0.6109857035364936, + "grad_norm": 0.27854296565055847, + "learning_rate": 6.583501028520143e-06, + "loss": 0.107, "step": 812 }, { - "epoch": 0.8001968503937008, - "grad_norm": 0.7159590125083923, - "learning_rate": 1.9061965999132448e-06, - "loss": 0.0145, + "epoch": 0.6117381489841986, + "grad_norm": 1.342350959777832, + "learning_rate": 6.561294232597817e-06, + "loss": 0.2153, "step": 813 }, { - "epoch": 0.8011811023622047, - "grad_norm": 2.5974228382110596, - "learning_rate": 1.8880757778520742e-06, - "loss": 0.0829, + "epoch": 0.6124905944319037, + "grad_norm": 0.017684536054730415, + "learning_rate": 6.539106651861741e-06, + "loss": 0.0005, "step": 814 }, { - "epoch": 0.8021653543307087, - "grad_norm": 0.036518096923828125, - "learning_rate": 1.8700325154458864e-06, - "loss": 0.0016, + "epoch": 0.6132430398796087, + "grad_norm": 0.005973528604954481, + "learning_rate": 6.516938410294165e-06, + "loss": 0.0002, "step": 815 }, { - "epoch": 0.8031496062992126, - "grad_norm": 0.004604843910783529, - "learning_rate": 1.8520669852097573e-06, - "loss": 0.0003, + "epoch": 0.6139954853273137, + "grad_norm": 0.12207470089197159, + "learning_rate": 6.494789631769281e-06, + "loss": 0.0047, "step": 816 }, { - "epoch": 0.8041338582677166, - "grad_norm": 0.016273021697998047, - "learning_rate": 1.8341793589155444e-06, - "loss": 0.0011, + "epoch": 0.6147479307750188, + "grad_norm": 0.2868960499763489, + "learning_rate": 6.472660440052521e-06, + "loss": 0.1167, "step": 817 }, { - "epoch": 0.8051181102362205, - "grad_norm": 2.2043075561523438, - "learning_rate": 1.816369807590258e-06, - "loss": 0.2688, + "epoch": 0.6155003762227238, + "grad_norm": 0.023519231006503105, + "learning_rate": 6.450550958799868e-06, + "loss": 0.0009, "step": 818 }, { - "epoch": 0.8061023622047244, - "grad_norm": 0.012433003634214401, - "learning_rate": 1.7986385015144148e-06, - "loss": 0.0009, + "epoch": 0.6162528216704289, + "grad_norm": 0.001310076448135078, + "learning_rate": 6.428461311557159e-06, + "loss": 0.0001, "step": 819 }, { - "epoch": 0.8070866141732284, - "grad_norm": 0.01360283326357603, - "learning_rate": 1.7809856102204148e-06, - "loss": 0.0006, + "epoch": 0.617005267118134, + "grad_norm": 0.009395822882652283, + "learning_rate": 6.406391621759416e-06, + "loss": 0.0005, "step": 820 }, { - "epoch": 0.8080708661417323, - "grad_norm": 1.3764728307724, - "learning_rate": 1.7634113024909204e-06, - "loss": 0.0288, + "epoch": 0.617757712565839, + "grad_norm": 0.0022374300751835108, + "learning_rate": 6.384342012730122e-06, + "loss": 0.0001, "step": 821 }, { - "epoch": 0.8090551181102362, - "grad_norm": 0.012660632841289043, - "learning_rate": 1.7459157463572396e-06, - "loss": 0.0007, + "epoch": 0.618510158013544, + "grad_norm": 0.02684149146080017, + "learning_rate": 6.362312607680559e-06, + "loss": 0.001, "step": 822 }, { - "epoch": 0.8100393700787402, - "grad_norm": 0.012494775466620922, - "learning_rate": 1.7284991090977255e-06, - "loss": 0.0008, + "epoch": 0.6192626034612491, + "grad_norm": 0.02339085564017296, + "learning_rate": 6.3403035297091145e-06, + "loss": 0.0011, "step": 823 }, { - "epoch": 0.8110236220472441, - "grad_norm": 0.012847901321947575, - "learning_rate": 1.7111615572361628e-06, - "loss": 0.0009, + "epoch": 0.6200150489089541, + "grad_norm": 0.3724048435688019, + "learning_rate": 6.318314901800584e-06, + "loss": 0.1327, "step": 824 }, { - "epoch": 0.812007874015748, - "grad_norm": 0.09309004992246628, - "learning_rate": 1.6939032565401958e-06, - "loss": 0.0038, + "epoch": 0.6207674943566591, + "grad_norm": 0.10902447998523712, + "learning_rate": 6.29634684682549e-06, + "loss": 0.0025, "step": 825 }, { - "epoch": 0.812992125984252, - "grad_norm": 0.006478773895651102, - "learning_rate": 1.6767243720197302e-06, - "loss": 0.0004, + "epoch": 0.6215199398043642, + "grad_norm": 1.5880934000015259, + "learning_rate": 6.274399487539397e-06, + "loss": 0.09, "step": 826 }, { - "epoch": 0.8139763779527559, - "grad_norm": 1.1516567468643188, - "learning_rate": 1.6596250679253568e-06, - "loss": 0.061, + "epoch": 0.6222723852520692, + "grad_norm": 0.024421492591500282, + "learning_rate": 6.2524729465822265e-06, + "loss": 0.0008, "step": 827 }, { - "epoch": 0.8149606299212598, - "grad_norm": 0.0033175654243677855, - "learning_rate": 1.642605507746786e-06, - "loss": 0.0001, + "epoch": 0.6230248306997742, + "grad_norm": 0.3446272909641266, + "learning_rate": 6.230567346477567e-06, + "loss": 0.0032, "step": 828 }, { - "epoch": 0.8159448818897638, - "grad_norm": 2.2606759071350098, - "learning_rate": 1.6256658542112803e-06, - "loss": 0.1115, + "epoch": 0.6237772761474794, + "grad_norm": 0.007315884344279766, + "learning_rate": 6.208682809631983e-06, + "loss": 0.0002, "step": 829 }, { - "epoch": 0.8169291338582677, - "grad_norm": 1.0202720165252686, - "learning_rate": 1.6088062692820939e-06, - "loss": 0.0825, + "epoch": 0.6245297215951844, + "grad_norm": 0.3761526346206665, + "learning_rate": 6.1868194583343585e-06, + "loss": 0.1232, "step": 830 }, { - "epoch": 0.8179133858267716, - "grad_norm": 0.04197832569479942, - "learning_rate": 1.5920269141569378e-06, - "loss": 0.0015, + "epoch": 0.6252821670428894, + "grad_norm": 0.005384144373238087, + "learning_rate": 6.1649774147551755e-06, + "loss": 0.0002, "step": 831 }, { - "epoch": 0.8188976377952756, - "grad_norm": 1.844979166984558, - "learning_rate": 1.5753279492664264e-06, - "loss": 0.4162, + "epoch": 0.6260346124905944, + "grad_norm": 2.6692087650299072, + "learning_rate": 6.1431568009458596e-06, + "loss": 0.4896, "step": 832 }, { - "epoch": 0.8198818897637795, - "grad_norm": 0.28964877128601074, - "learning_rate": 1.55870953427255e-06, - "loss": 0.0157, + "epoch": 0.6267870579382995, + "grad_norm": 0.008824239484965801, + "learning_rate": 6.121357738838088e-06, + "loss": 0.0004, "step": 833 }, { - "epoch": 0.8208661417322834, - "grad_norm": 0.014946633949875832, - "learning_rate": 1.5421718280671427e-06, - "loss": 0.0007, + "epoch": 0.6275395033860045, + "grad_norm": 3.028167963027954, + "learning_rate": 6.099580350243109e-06, + "loss": 0.2251, "step": 834 }, { - "epoch": 0.8218503937007874, - "grad_norm": 2.0832362174987793, - "learning_rate": 1.5257149887703703e-06, - "loss": 0.3616, + "epoch": 0.6282919488337095, + "grad_norm": 0.011604844592511654, + "learning_rate": 6.077824756851055e-06, + "loss": 0.0005, "step": 835 }, { - "epoch": 0.8228346456692913, - "grad_norm": 3.614567756652832, - "learning_rate": 1.509339173729214e-06, - "loss": 0.1283, + "epoch": 0.6290443942814146, + "grad_norm": 3.686901092529297, + "learning_rate": 6.056091080230279e-06, + "loss": 0.1939, "step": 836 }, { - "epoch": 0.8238188976377953, - "grad_norm": 0.6143249869346619, - "learning_rate": 1.493044539515961e-06, - "loss": 0.0479, + "epoch": 0.6297968397291196, + "grad_norm": 0.012510290369391441, + "learning_rate": 6.034379441826659e-06, + "loss": 0.0006, "step": 837 }, { - "epoch": 0.8248031496062992, - "grad_norm": 0.015475532971322536, - "learning_rate": 1.4768312419267194e-06, - "loss": 0.0009, + "epoch": 0.6305492851768246, + "grad_norm": 0.014967096969485283, + "learning_rate": 6.012689962962923e-06, + "loss": 0.0006, "step": 838 }, { - "epoch": 0.8257874015748031, - "grad_norm": 0.8146891593933105, - "learning_rate": 1.460699435979922e-06, - "loss": 0.0365, + "epoch": 0.6313017306245298, + "grad_norm": 0.00533378915861249, + "learning_rate": 5.991022764837979e-06, + "loss": 0.0003, "step": 839 }, { - "epoch": 0.8267716535433071, - "grad_norm": 0.022416390478610992, - "learning_rate": 1.4446492759148411e-06, - "loss": 0.001, + "epoch": 0.6320541760722348, + "grad_norm": 0.021024620160460472, + "learning_rate": 5.969377968526231e-06, + "loss": 0.0009, "step": 840 }, { - "epoch": 0.827755905511811, - "grad_norm": 0.11237171292304993, - "learning_rate": 1.4286809151901194e-06, - "loss": 0.0067, + "epoch": 0.6328066215199398, + "grad_norm": 0.15575391054153442, + "learning_rate": 5.947755694976902e-06, + "loss": 0.0038, "step": 841 }, { - "epoch": 0.8287401574803149, - "grad_norm": 0.022552266716957092, - "learning_rate": 1.4127945064823023e-06, - "loss": 0.0013, + "epoch": 0.6335590669676449, + "grad_norm": 0.5761941075325012, + "learning_rate": 5.926156065013359e-06, + "loss": 0.0294, "step": 842 }, { - "epoch": 0.8297244094488189, - "grad_norm": 0.028985949233174324, - "learning_rate": 1.3969902016843707e-06, - "loss": 0.0016, + "epoch": 0.6343115124153499, + "grad_norm": 0.5115591287612915, + "learning_rate": 5.904579199332443e-06, + "loss": 0.081, "step": 843 }, { - "epoch": 0.8307086614173228, - "grad_norm": 0.006765976548194885, - "learning_rate": 1.381268151904298e-06, - "loss": 0.0004, + "epoch": 0.6350639578630549, + "grad_norm": 0.017021209001541138, + "learning_rate": 5.883025218503781e-06, + "loss": 0.0008, "step": 844 }, { - "epoch": 0.8316929133858267, - "grad_norm": 0.004955321084707975, - "learning_rate": 1.3656285074636022e-06, - "loss": 0.0003, + "epoch": 0.63581640331076, + "grad_norm": 0.0051472182385623455, + "learning_rate": 5.861494242969134e-06, + "loss": 0.0002, "step": 845 }, { - "epoch": 0.8326771653543307, - "grad_norm": 0.4646233022212982, - "learning_rate": 1.3500714178959084e-06, - "loss": 0.0168, + "epoch": 0.636568848758465, + "grad_norm": 0.007996303029358387, + "learning_rate": 5.839986393041701e-06, + "loss": 0.0003, "step": 846 }, { - "epoch": 0.8336614173228346, - "grad_norm": 0.07057463377714157, - "learning_rate": 1.334597031945517e-06, - "loss": 0.0033, + "epoch": 0.63732129420617, + "grad_norm": 0.0032203912269324064, + "learning_rate": 5.818501788905464e-06, + "loss": 0.0001, "step": 847 }, { - "epoch": 0.8346456692913385, - "grad_norm": 0.3868122696876526, - "learning_rate": 1.319205497565983e-06, - "loss": 0.01, + "epoch": 0.6380737396538751, + "grad_norm": 0.01805216819047928, + "learning_rate": 5.7970405506145e-06, + "loss": 0.0008, "step": 848 }, { - "epoch": 0.8356299212598425, - "grad_norm": 0.002050751354545355, - "learning_rate": 1.303896961918707e-06, - "loss": 0.0001, + "epoch": 0.6388261851015802, + "grad_norm": 0.015916842967271805, + "learning_rate": 5.775602798092335e-06, + "loss": 0.0007, "step": 849 }, { - "epoch": 0.8366141732283464, - "grad_norm": 0.5282914638519287, - "learning_rate": 1.2886715713715126e-06, - "loss": 0.0138, + "epoch": 0.6395786305492852, + "grad_norm": 0.18560563027858734, + "learning_rate": 5.754188651131246e-06, + "loss": 0.0048, "step": 850 }, { - "epoch": 0.8375984251968503, - "grad_norm": 0.010376269929111004, - "learning_rate": 1.2735294714972668e-06, - "loss": 0.0006, + "epoch": 0.6403310759969902, + "grad_norm": 0.02914322167634964, + "learning_rate": 5.732798229391613e-06, + "loss": 0.0012, "step": 851 }, { - "epoch": 0.8385826771653543, - "grad_norm": 0.020782766863703728, - "learning_rate": 1.2584708070724738e-06, - "loss": 0.0015, + "epoch": 0.6410835214446953, + "grad_norm": 0.0026829675771296024, + "learning_rate": 5.711431652401227e-06, + "loss": 0.0001, "step": 852 }, { - "epoch": 0.8395669291338582, - "grad_norm": 0.010914409533143044, - "learning_rate": 1.2434957220758982e-06, - "loss": 0.0007, + "epoch": 0.6418359668924003, + "grad_norm": 0.1754864752292633, + "learning_rate": 5.690089039554654e-06, + "loss": 0.1013, "step": 853 }, { - "epoch": 0.8405511811023622, - "grad_norm": 2.0541369915008545, - "learning_rate": 1.2286043596871843e-06, - "loss": 0.0978, + "epoch": 0.6425884123401053, + "grad_norm": 0.07116664201021194, + "learning_rate": 5.668770510112538e-06, + "loss": 0.0032, "step": 854 }, { - "epoch": 0.8415354330708661, - "grad_norm": 0.020481707528233528, - "learning_rate": 1.21379686228549e-06, - "loss": 0.0012, + "epoch": 0.6433408577878104, + "grad_norm": 0.8461427092552185, + "learning_rate": 5.6474761832009554e-06, + "loss": 0.0517, "step": 855 }, { - "epoch": 0.84251968503937, - "grad_norm": 0.01398189552128315, - "learning_rate": 1.1990733714481185e-06, - "loss": 0.0007, + "epoch": 0.6440933032355154, + "grad_norm": 0.009436607360839844, + "learning_rate": 5.626206177810735e-06, + "loss": 0.0004, "step": 856 }, { - "epoch": 0.843503937007874, - "grad_norm": 2.4280009269714355, - "learning_rate": 1.1844340279491772e-06, - "loss": 0.1008, + "epoch": 0.6448457486832204, + "grad_norm": 2.8411285877227783, + "learning_rate": 5.604960612796805e-06, + "loss": 0.3936, "step": 857 }, { - "epoch": 0.844488188976378, - "grad_norm": 0.0013540080981329083, - "learning_rate": 1.1698789717582203e-06, - "loss": 0.0001, + "epoch": 0.6455981941309256, + "grad_norm": 0.05091340094804764, + "learning_rate": 5.583739606877516e-06, + "loss": 0.0014, "step": 858 }, { - "epoch": 0.8454724409448819, - "grad_norm": 0.024509334936738014, - "learning_rate": 1.1554083420389194e-06, - "loss": 0.0011, + "epoch": 0.6463506395786306, + "grad_norm": 0.00600972305983305, + "learning_rate": 5.562543278633988e-06, + "loss": 0.0002, "step": 859 }, { - "epoch": 0.8464566929133859, - "grad_norm": 0.002505276584997773, - "learning_rate": 1.1410222771477276e-06, - "loss": 0.0002, + "epoch": 0.6471030850263356, + "grad_norm": 0.0064705973491072655, + "learning_rate": 5.541371746509448e-06, + "loss": 0.0003, "step": 860 }, { - "epoch": 0.8474409448818898, - "grad_norm": 0.25860899686813354, - "learning_rate": 1.1267209146325498e-06, - "loss": 0.0123, + "epoch": 0.6478555304740407, + "grad_norm": 5.701414585113525, + "learning_rate": 5.520225128808555e-06, + "loss": 0.0259, "step": 861 }, { - "epoch": 0.8484251968503937, - "grad_norm": 0.004326984751969576, - "learning_rate": 1.1125043912314438e-06, - "loss": 0.0002, + "epoch": 0.6486079759217457, + "grad_norm": 0.021785501390695572, + "learning_rate": 5.4991035436967585e-06, + "loss": 0.0009, "step": 862 }, { - "epoch": 0.8494094488188977, - "grad_norm": 0.0027909427881240845, - "learning_rate": 1.0983728428713024e-06, - "loss": 0.0001, + "epoch": 0.6493604213694507, + "grad_norm": 0.017925115302205086, + "learning_rate": 5.478007109199624e-06, + "loss": 0.0008, "step": 863 }, { - "epoch": 0.8503937007874016, - "grad_norm": 0.016817815601825714, - "learning_rate": 1.0843264046665558e-06, - "loss": 0.001, + "epoch": 0.6501128668171557, + "grad_norm": 0.012886104173958302, + "learning_rate": 5.456935943202177e-06, + "loss": 0.0005, "step": 864 }, { - "epoch": 0.8513779527559056, - "grad_norm": 0.840516984462738, - "learning_rate": 1.0703652109178785e-06, - "loss": 0.0378, + "epoch": 0.6508653122648608, + "grad_norm": 0.015779219567775726, + "learning_rate": 5.4358901634482404e-06, + "loss": 0.0006, "step": 865 }, { - "epoch": 0.8523622047244095, - "grad_norm": 0.004841144662350416, - "learning_rate": 1.0564893951109068e-06, - "loss": 0.0002, + "epoch": 0.6516177577125658, + "grad_norm": 0.05147948116064072, + "learning_rate": 5.4148698875397905e-06, + "loss": 0.0025, "step": 866 }, { - "epoch": 0.8533464566929134, - "grad_norm": 1.7365187406539917, - "learning_rate": 1.0426990899149658e-06, - "loss": 0.0397, + "epoch": 0.6523702031602708, + "grad_norm": 0.019827265292406082, + "learning_rate": 5.393875232936283e-06, + "loss": 0.0007, "step": 867 }, { - "epoch": 0.8543307086614174, - "grad_norm": 2.114926338195801, - "learning_rate": 1.0289944271817898e-06, - "loss": 0.4407, + "epoch": 0.653122648607976, + "grad_norm": 0.01970357820391655, + "learning_rate": 5.372906316954005e-06, + "loss": 0.0006, "step": 868 }, { - "epoch": 0.8553149606299213, - "grad_norm": 0.24757209420204163, - "learning_rate": 1.0153755379442754e-06, - "loss": 0.007, + "epoch": 0.653875094055681, + "grad_norm": 8.924382209777832, + "learning_rate": 5.351963256765426e-06, + "loss": 0.1568, "step": 869 }, { - "epoch": 0.8562992125984252, - "grad_norm": 0.48165109753608704, - "learning_rate": 1.001842552415221e-06, - "loss": 0.0181, + "epoch": 0.654627539503386, + "grad_norm": 1.7220969200134277, + "learning_rate": 5.33104616939853e-06, + "loss": 0.0397, "step": 870 }, { - "epoch": 0.8572834645669292, - "grad_norm": 1.1679885387420654, - "learning_rate": 9.883955999860816e-07, - "loss": 0.0225, + "epoch": 0.6553799849510911, + "grad_norm": 0.3469778895378113, + "learning_rate": 5.3101551717361586e-06, + "loss": 0.0112, "step": 871 }, { - "epoch": 0.8582677165354331, - "grad_norm": 0.652866780757904, - "learning_rate": 9.750348092257368e-07, - "loss": 0.0694, + "epoch": 0.6561324303987961, + "grad_norm": 0.3850734233856201, + "learning_rate": 5.2892903805153795e-06, + "loss": 0.1255, "step": 872 }, { - "epoch": 0.859251968503937, - "grad_norm": 1.0167498588562012, - "learning_rate": 9.61760307879256e-07, - "loss": 0.0223, + "epoch": 0.6568848758465011, + "grad_norm": 2.2157397270202637, + "learning_rate": 5.2684519123268155e-06, + "loss": 0.3644, "step": 873 }, { - "epoch": 0.860236220472441, - "grad_norm": 1.9887505769729614, - "learning_rate": 9.48572222866676e-07, - "loss": 0.2891, + "epoch": 0.6576373212942062, + "grad_norm": 0.011840404942631721, + "learning_rate": 5.247639883613999e-06, + "loss": 0.0005, "step": 874 }, { - "epoch": 0.8612204724409449, - "grad_norm": 0.08713765442371368, - "learning_rate": 9.354706802817948e-07, - "loss": 0.0052, + "epoch": 0.6583897667419112, + "grad_norm": 0.02047601528465748, + "learning_rate": 5.226854410672724e-06, + "loss": 0.0008, "step": 875 }, { - "epoch": 0.8622047244094488, - "grad_norm": 2.3826208114624023, - "learning_rate": 9.224558053909615e-07, - "loss": 0.1815, + "epoch": 0.6591422121896162, + "grad_norm": 4.430492401123047, + "learning_rate": 5.2060956096503854e-06, + "loss": 0.1308, "step": 876 }, { - "epoch": 0.8631889763779528, - "grad_norm": 0.20682862401008606, - "learning_rate": 9.095277226318766e-07, - "loss": 0.0068, + "epoch": 0.6598946576373212, + "grad_norm": 0.004179192706942558, + "learning_rate": 5.1853635965453495e-06, + "loss": 0.0002, "step": 877 }, { - "epoch": 0.8641732283464567, - "grad_norm": 1.689835548400879, - "learning_rate": 8.966865556124061e-07, - "loss": 0.0276, + "epoch": 0.6606471030850264, + "grad_norm": 0.3641025424003601, + "learning_rate": 5.164658487206275e-06, + "loss": 0.1212, "step": 878 }, { - "epoch": 0.8651574803149606, - "grad_norm": 0.031897708773612976, - "learning_rate": 8.839324271093974e-07, - "loss": 0.0013, + "epoch": 0.6613995485327314, + "grad_norm": 0.061624638736248016, + "learning_rate": 5.143980397331512e-06, + "loss": 0.0025, "step": 879 }, { - "epoch": 0.8661417322834646, - "grad_norm": 1.7181419134140015, - "learning_rate": 8.712654590675085e-07, - "loss": 0.0831, + "epoch": 0.6621519939804364, + "grad_norm": 5.032725811004639, + "learning_rate": 5.123329442468403e-06, + "loss": 0.035, "step": 880 }, { - "epoch": 0.8671259842519685, - "grad_norm": 0.03877853602170944, - "learning_rate": 8.586857725980325e-07, - "loss": 0.0017, + "epoch": 0.6629044394281415, + "grad_norm": 1.7214125394821167, + "learning_rate": 5.102705738012676e-06, + "loss": 0.0527, "step": 881 }, { - "epoch": 0.8681102362204725, - "grad_norm": 0.0030326335690915585, - "learning_rate": 8.461934879777545e-07, - "loss": 0.0002, + "epoch": 0.6636568848758465, + "grad_norm": 0.7142964005470276, + "learning_rate": 5.082109399207784e-06, + "loss": 0.1174, "step": 882 }, { - "epoch": 0.8690944881889764, - "grad_norm": 0.006465593818575144, - "learning_rate": 8.337887246477905e-07, - "loss": 0.0004, + "epoch": 0.6644093303235515, + "grad_norm": 0.062142811715602875, + "learning_rate": 5.061540541144265e-06, + "loss": 0.0029, "step": 883 }, { - "epoch": 0.8700787401574803, - "grad_norm": 0.004565288778394461, - "learning_rate": 8.214716012124491e-07, - "loss": 0.0002, + "epoch": 0.6651617757712566, + "grad_norm": 0.2744309902191162, + "learning_rate": 5.0409992787590845e-06, + "loss": 0.0093, "step": 884 }, { - "epoch": 0.8710629921259843, - "grad_norm": 0.055550165474414825, - "learning_rate": 8.09242235438098e-07, - "loss": 0.0024, + "epoch": 0.6659142212189616, + "grad_norm": 0.059226468205451965, + "learning_rate": 5.02048572683502e-06, + "loss": 0.0028, "step": 885 }, { - "epoch": 0.8720472440944882, - "grad_norm": 0.020595313981175423, - "learning_rate": 7.971007442520363e-07, - "loss": 0.001, + "epoch": 0.6666666666666666, + "grad_norm": 0.007528245449066162, + "learning_rate": 5.000000000000003e-06, + "loss": 0.0003, "step": 886 }, { - "epoch": 0.8730314960629921, - "grad_norm": 2.3358795642852783, - "learning_rate": 7.850472437413748e-07, - "loss": 0.0579, + "epoch": 0.6674191121143717, + "grad_norm": 2.5258829593658447, + "learning_rate": 4.979542212726474e-06, + "loss": 0.2801, "step": 887 }, { - "epoch": 0.8740157480314961, - "grad_norm": 0.6420374512672424, - "learning_rate": 7.730818491519343e-07, - "loss": 0.0258, + "epoch": 0.6681715575620768, + "grad_norm": 3.617130756378174, + "learning_rate": 4.959112479330753e-06, + "loss": 0.1343, "step": 888 }, { - "epoch": 0.875, - "grad_norm": 0.002170541090890765, - "learning_rate": 7.612046748871327e-07, - "loss": 0.0001, + "epoch": 0.6689240030097818, + "grad_norm": 0.3309485912322998, + "learning_rate": 4.93871091397241e-06, + "loss": 0.0082, "step": 889 }, { - "epoch": 0.8759842519685039, - "grad_norm": 0.006496890913695097, - "learning_rate": 7.494158345069014e-07, - "loss": 0.0004, + "epoch": 0.6696764484574869, + "grad_norm": 0.031886566430330276, + "learning_rate": 4.9183376306535904e-06, + "loss": 0.0009, "step": 890 }, { - "epoch": 0.8769685039370079, - "grad_norm": 1.0813565254211426, - "learning_rate": 7.377154407265897e-07, - "loss": 0.0292, + "epoch": 0.6704288939051919, + "grad_norm": 0.04454849287867546, + "learning_rate": 4.897992743218419e-06, + "loss": 0.0019, "step": 891 }, { - "epoch": 0.8779527559055118, - "grad_norm": 0.17985296249389648, - "learning_rate": 7.261036054158965e-07, - "loss": 0.006, + "epoch": 0.6711813393528969, + "grad_norm": 0.19051343202590942, + "learning_rate": 4.877676365352343e-06, + "loss": 0.1, "step": 892 }, { - "epoch": 0.8789370078740157, - "grad_norm": 0.0374092198908329, - "learning_rate": 7.145804395977962e-07, - "loss": 0.0014, + "epoch": 0.671933784800602, + "grad_norm": 2.1760129928588867, + "learning_rate": 4.857388610581499e-06, + "loss": 0.1941, "step": 893 }, { - "epoch": 0.8799212598425197, - "grad_norm": 0.48693081736564636, - "learning_rate": 7.031460534474699e-07, - "loss": 0.018, + "epoch": 0.672686230248307, + "grad_norm": 0.020949775353074074, + "learning_rate": 4.837129592272083e-06, + "loss": 0.0008, "step": 894 }, { - "epoch": 0.8809055118110236, - "grad_norm": 1.497611403465271, - "learning_rate": 6.918005562912689e-07, - "loss": 0.1863, + "epoch": 0.673438675696012, + "grad_norm": 0.08075495064258575, + "learning_rate": 4.81689942362971e-06, + "loss": 0.0042, "step": 895 }, { - "epoch": 0.8818897637795275, - "grad_norm": 0.003164686029776931, - "learning_rate": 6.805440566056554e-07, + "epoch": 0.674191121143717, + "grad_norm": 0.004490719176828861, + "learning_rate": 4.796698217698791e-06, "loss": 0.0002, "step": 896 }, { - "epoch": 0.8828740157480315, - "grad_norm": 0.625789999961853, - "learning_rate": 6.693766620161691e-07, - "loss": 0.0374, + "epoch": 0.6749435665914221, + "grad_norm": 0.04653012752532959, + "learning_rate": 4.776526087361896e-06, + "loss": 0.0022, "step": 897 }, { - "epoch": 0.8838582677165354, - "grad_norm": 2.6650314331054688, - "learning_rate": 6.582984792964009e-07, - "loss": 0.2928, + "epoch": 0.6756960120391272, + "grad_norm": 0.009343601763248444, + "learning_rate": 4.756383145339107e-06, + "loss": 0.0003, "step": 898 }, { - "epoch": 0.8848425196850394, - "grad_norm": 1.911970615386963, - "learning_rate": 6.473096143669699e-07, - "loss": 0.2482, + "epoch": 0.6764484574868322, + "grad_norm": 0.12858974933624268, + "learning_rate": 4.736269504187431e-06, + "loss": 0.0055, "step": 899 }, { - "epoch": 0.8858267716535433, - "grad_norm": 2.6830039024353027, - "learning_rate": 6.364101722945082e-07, - "loss": 0.1237, + "epoch": 0.6772009029345373, + "grad_norm": 0.020633621141314507, + "learning_rate": 4.716185276300126e-06, + "loss": 0.0009, "step": 900 }, { - "epoch": 0.8868110236220472, - "grad_norm": 0.30362462997436523, - "learning_rate": 6.256002572906627e-07, - "loss": 0.016, + "epoch": 0.6779533483822423, + "grad_norm": 0.023468296974897385, + "learning_rate": 4.696130573906096e-06, + "loss": 0.0009, "step": 901 }, { - "epoch": 0.8877952755905512, - "grad_norm": 5.334184646606445, - "learning_rate": 6.148799727110911e-07, - "loss": 0.5792, + "epoch": 0.6787057938299473, + "grad_norm": 1.5509250164031982, + "learning_rate": 4.676105509069263e-06, + "loss": 0.0409, "step": 902 }, { - "epoch": 0.8887795275590551, - "grad_norm": 0.00885742250829935, - "learning_rate": 6.042494210544791e-07, - "loss": 0.0006, + "epoch": 0.6794582392776524, + "grad_norm": 0.607180118560791, + "learning_rate": 4.656110193687925e-06, + "loss": 0.0688, "step": 903 }, { - "epoch": 0.889763779527559, - "grad_norm": 0.10470603406429291, - "learning_rate": 5.937087039615619e-07, - "loss": 0.0039, + "epoch": 0.6802106847253574, + "grad_norm": 0.04802005738019943, + "learning_rate": 4.636144739494156e-06, + "loss": 0.0021, "step": 904 }, { - "epoch": 0.890748031496063, - "grad_norm": 0.0013712242944166064, - "learning_rate": 5.832579222141432e-07, - "loss": 0.0001, + "epoch": 0.6809631301730624, + "grad_norm": 0.009008445776998997, + "learning_rate": 4.616209258053163e-06, + "loss": 0.0003, "step": 905 }, { - "epoch": 0.8917322834645669, - "grad_norm": 0.008078637532889843, - "learning_rate": 5.72897175734145e-07, - "loss": 0.0005, + "epoch": 0.6817155756207675, + "grad_norm": 0.02914116531610489, + "learning_rate": 4.5963038607626655e-06, + "loss": 0.0009, "step": 906 }, { - "epoch": 0.8927165354330708, - "grad_norm": 0.20869997143745422, - "learning_rate": 5.626265635826367e-07, - "loss": 0.0089, + "epoch": 0.6824680210684725, + "grad_norm": 1.7884653806686401, + "learning_rate": 4.57642865885228e-06, + "loss": 0.3394, "step": 907 }, { - "epoch": 0.8937007874015748, - "grad_norm": 0.002778980415314436, - "learning_rate": 5.524461839589012e-07, - "loss": 0.0002, + "epoch": 0.6832204665161776, + "grad_norm": 0.3303041160106659, + "learning_rate": 4.5565837633828904e-06, + "loss": 0.1329, "step": 908 }, { - "epoch": 0.8946850393700787, - "grad_norm": 0.0049267299473285675, - "learning_rate": 5.42356134199491e-07, - "loss": 0.0003, + "epoch": 0.6839729119638827, + "grad_norm": 0.06692855805158615, + "learning_rate": 4.536769285246033e-06, + "loss": 0.003, "step": 909 }, { - "epoch": 0.8956692913385826, - "grad_norm": 1.5696227550506592, - "learning_rate": 5.323565107772977e-07, - "loss": 0.1721, + "epoch": 0.6847253574115877, + "grad_norm": 0.04765620082616806, + "learning_rate": 4.516985335163274e-06, + "loss": 0.0022, "step": 910 }, { - "epoch": 0.8966535433070866, - "grad_norm": 0.014570971950888634, - "learning_rate": 5.224474093006271e-07, - "loss": 0.0009, + "epoch": 0.6854778028592927, + "grad_norm": 0.18733732402324677, + "learning_rate": 4.4972320236855916e-06, + "loss": 0.0051, "step": 911 }, { - "epoch": 0.8976377952755905, - "grad_norm": 0.010583105497062206, - "learning_rate": 5.126289245122906e-07, - "loss": 0.0007, + "epoch": 0.6862302483069977, + "grad_norm": 0.07214639335870743, + "learning_rate": 4.477509461192756e-06, + "loss": 0.0036, "step": 912 }, { - "epoch": 0.8986220472440944, - "grad_norm": 0.18873628973960876, - "learning_rate": 5.029011502886905e-07, - "loss": 0.0077, + "epoch": 0.6869826937547028, + "grad_norm": 0.23084872961044312, + "learning_rate": 4.457817757892718e-06, + "loss": 0.1103, "step": 913 }, { - "epoch": 0.8996062992125984, - "grad_norm": 0.011854403652250767, - "learning_rate": 4.932641796389348e-07, - "loss": 0.0005, + "epoch": 0.6877351392024078, + "grad_norm": 0.3813030421733856, + "learning_rate": 4.438157023820991e-06, + "loss": 0.0089, "step": 914 }, { - "epoch": 0.9005905511811023, - "grad_norm": 0.18268321454524994, - "learning_rate": 4.837181047039375e-07, - "loss": 0.0053, + "epoch": 0.6884875846501128, + "grad_norm": 0.15035146474838257, + "learning_rate": 4.4185273688400274e-06, + "loss": 0.0044, "step": 915 }, { - "epoch": 0.9015748031496063, - "grad_norm": 0.015285042114555836, - "learning_rate": 4.7426301675554285e-07, - "loss": 0.0007, + "epoch": 0.6892400300978179, + "grad_norm": 0.00839236006140709, + "learning_rate": 4.398928902638626e-06, + "loss": 0.0003, "step": 916 }, { - "epoch": 0.9025590551181102, - "grad_norm": 1.9459728002548218, - "learning_rate": 4.648990061956493e-07, - "loss": 0.1071, + "epoch": 0.6899924755455229, + "grad_norm": 0.14042149484157562, + "learning_rate": 4.379361734731289e-06, + "loss": 0.0057, "step": 917 }, { - "epoch": 0.9035433070866141, - "grad_norm": 3.810908794403076, - "learning_rate": 4.5562616255534933e-07, - "loss": 0.1662, + "epoch": 0.690744920993228, + "grad_norm": 0.3208416700363159, + "learning_rate": 4.359825974457632e-06, + "loss": 0.1172, "step": 918 }, { - "epoch": 0.9045275590551181, - "grad_norm": 0.00518028112128377, - "learning_rate": 4.464445744940715e-07, - "loss": 0.0003, + "epoch": 0.6914973664409331, + "grad_norm": 0.1413826197385788, + "learning_rate": 4.340321730981779e-06, + "loss": 0.0033, "step": 919 }, { - "epoch": 0.905511811023622, - "grad_norm": 0.6172063946723938, - "learning_rate": 4.3735432979872593e-07, - "loss": 0.0253, + "epoch": 0.6922498118886381, + "grad_norm": 0.09464599937200546, + "learning_rate": 4.32084911329173e-06, + "loss": 0.0045, "step": 920 }, { - "epoch": 0.906496062992126, - "grad_norm": 1.042099952697754, - "learning_rate": 4.283555153828789e-07, - "loss": 0.1532, + "epoch": 0.6930022573363431, + "grad_norm": 0.04123972728848457, + "learning_rate": 4.301408230198763e-06, + "loss": 0.0015, "step": 921 }, { - "epoch": 0.90748031496063, - "grad_norm": 0.012854104861617088, - "learning_rate": 4.194482172859127e-07, - "loss": 0.0009, + "epoch": 0.6937547027840482, + "grad_norm": 0.5074460506439209, + "learning_rate": 4.28199919033683e-06, + "loss": 0.0119, "step": 922 }, { - "epoch": 0.9084645669291339, - "grad_norm": 0.006141228601336479, - "learning_rate": 4.106325206722028e-07, - "loss": 0.0003, + "epoch": 0.6945071482317532, + "grad_norm": 0.052684567868709564, + "learning_rate": 4.2626221021619396e-06, + "loss": 0.0027, "step": 923 }, { - "epoch": 0.9094488188976378, - "grad_norm": 0.052238598465919495, - "learning_rate": 4.019085098303077e-07, - "loss": 0.0027, + "epoch": 0.6952595936794582, + "grad_norm": 0.016798803582787514, + "learning_rate": 4.243277073951562e-06, + "loss": 0.0008, "step": 924 }, { - "epoch": 0.9104330708661418, - "grad_norm": 0.06066101789474487, - "learning_rate": 3.932762681721569e-07, - "loss": 0.0026, + "epoch": 0.6960120391271633, + "grad_norm": 0.006241548340767622, + "learning_rate": 4.223964213804019e-06, + "loss": 0.0002, "step": 925 }, { - "epoch": 0.9114173228346457, - "grad_norm": 0.3152275085449219, - "learning_rate": 3.847358782322608e-07, - "loss": 0.0098, + "epoch": 0.6967644845748683, + "grad_norm": 3.114851474761963, + "learning_rate": 4.204683629637881e-06, + "loss": 0.32, "step": 926 }, { - "epoch": 0.9124015748031497, - "grad_norm": 1.4815139770507812, - "learning_rate": 3.762874216669166e-07, - "loss": 0.0442, + "epoch": 0.6975169300225733, + "grad_norm": 0.23592227697372437, + "learning_rate": 4.1854354291913594e-06, + "loss": 0.0053, "step": 927 }, { - "epoch": 0.9133858267716536, - "grad_norm": 0.43753549456596375, - "learning_rate": 3.679309792534291e-07, - "loss": 0.1341, + "epoch": 0.6982693754702785, + "grad_norm": 0.7205729484558105, + "learning_rate": 4.1662197200217116e-06, + "loss": 0.0056, "step": 928 }, { - "epoch": 0.9143700787401575, - "grad_norm": 0.0070118652656674385, - "learning_rate": 3.5966663088933927e-07, - "loss": 0.0004, + "epoch": 0.6990218209179835, + "grad_norm": 0.0018878192640841007, + "learning_rate": 4.147036609504633e-06, + "loss": 0.0001, "step": 929 }, { - "epoch": 0.9153543307086615, - "grad_norm": 0.013707267120480537, - "learning_rate": 3.5149445559165886e-07, - "loss": 0.0007, + "epoch": 0.6997742663656885, + "grad_norm": 0.24593710899353027, + "learning_rate": 4.1278862048336645e-06, + "loss": 0.0058, "step": 930 }, { - "epoch": 0.9163385826771654, - "grad_norm": 1.8214647769927979, - "learning_rate": 3.434145314961124e-07, - "loss": 0.3667, + "epoch": 0.7005267118133935, + "grad_norm": 0.035347383469343185, + "learning_rate": 4.108768613019588e-06, + "loss": 0.0015, "step": 931 }, { - "epoch": 0.9173228346456693, - "grad_norm": 0.018571775406599045, - "learning_rate": 3.354269358563966e-07, - "loss": 0.001, + "epoch": 0.7012791572610986, + "grad_norm": 0.15606430172920227, + "learning_rate": 4.089683940889829e-06, + "loss": 0.0069, "step": 932 }, { - "epoch": 0.9183070866141733, - "grad_norm": 0.00397480046376586, - "learning_rate": 3.2753174504343786e-07, - "loss": 0.0002, + "epoch": 0.7020316027088036, + "grad_norm": 1.3364535570144653, + "learning_rate": 4.070632295087863e-06, + "loss": 0.0344, "step": 933 }, { - "epoch": 0.9192913385826772, - "grad_norm": 1.3928825855255127, - "learning_rate": 3.197290345446613e-07, - "loss": 0.1688, + "epoch": 0.7027840481565086, + "grad_norm": 0.1990508735179901, + "learning_rate": 4.051613782072614e-06, + "loss": 0.0034, "step": 934 }, { - "epoch": 0.9202755905511811, - "grad_norm": 0.026968184858560562, - "learning_rate": 3.1201887896327055e-07, - "loss": 0.0018, + "epoch": 0.7035364936042137, + "grad_norm": 0.004956225864589214, + "learning_rate": 4.0326285081178695e-06, + "loss": 0.0002, "step": 935 }, { - "epoch": 0.9212598425196851, - "grad_norm": 0.0024406416341662407, - "learning_rate": 3.044013520175337e-07, - "loss": 0.0001, + "epoch": 0.7042889390519187, + "grad_norm": 0.09243706613779068, + "learning_rate": 4.013676579311668e-06, + "loss": 0.0032, "step": 936 }, { - "epoch": 0.922244094488189, - "grad_norm": 0.007555016782134771, - "learning_rate": 2.968765265400808e-07, - "loss": 0.0004, + "epoch": 0.7050413844996237, + "grad_norm": 0.2956998944282532, + "learning_rate": 3.994758101555729e-06, + "loss": 0.0037, "step": 937 }, { - "epoch": 0.9232283464566929, - "grad_norm": 0.005352628882974386, - "learning_rate": 2.894444744772007e-07, - "loss": 0.0003, + "epoch": 0.7057938299473289, + "grad_norm": 6.872308254241943, + "learning_rate": 3.975873180564843e-06, + "loss": 0.185, "step": 938 }, { - "epoch": 0.9242125984251969, - "grad_norm": 0.004397415556013584, - "learning_rate": 2.8210526688816297e-07, - "loss": 0.0003, + "epoch": 0.7065462753950339, + "grad_norm": 5.041158676147461, + "learning_rate": 3.957021921866301e-06, + "loss": 0.1589, "step": 939 }, { - "epoch": 0.9251968503937008, - "grad_norm": 0.1908765733242035, - "learning_rate": 2.7485897394453067e-07, - "loss": 0.0097, + "epoch": 0.7072987208427389, + "grad_norm": 0.004548253491520882, + "learning_rate": 3.938204430799278e-06, + "loss": 0.0002, "step": 940 }, { - "epoch": 0.9261811023622047, - "grad_norm": 0.0313321053981781, - "learning_rate": 2.67705664929494e-07, - "loss": 0.0019, + "epoch": 0.708051166290444, + "grad_norm": 4.161418437957764, + "learning_rate": 3.919420812514267e-06, + "loss": 0.0773, "step": 941 }, { - "epoch": 0.9271653543307087, - "grad_norm": 0.8587534427642822, - "learning_rate": 2.606454082372045e-07, - "loss": 0.0496, + "epoch": 0.708803611738149, + "grad_norm": 8.64138412475586, + "learning_rate": 3.9006711719724755e-06, + "loss": 0.3975, "step": 942 }, { - "epoch": 0.9281496062992126, - "grad_norm": 0.003788806265220046, - "learning_rate": 2.536782713721231e-07, - "loss": 0.0002, + "epoch": 0.709556057185854, + "grad_norm": 0.10942798107862473, + "learning_rate": 3.881955613945251e-06, + "loss": 0.0037, "step": 943 }, { - "epoch": 0.9291338582677166, - "grad_norm": 0.08406230807304382, - "learning_rate": 2.4680432094837394e-07, - "loss": 0.0037, + "epoch": 0.710308502633559, + "grad_norm": 0.07507241517305374, + "learning_rate": 3.8632742430134905e-06, + "loss": 0.002, "step": 944 }, { - "epoch": 0.9301181102362205, - "grad_norm": 2.2428107261657715, - "learning_rate": 2.400236226891095e-07, - "loss": 0.1937, + "epoch": 0.7110609480812641, + "grad_norm": 0.020177626982331276, + "learning_rate": 3.844627163567059e-06, + "loss": 0.0008, "step": 945 }, { - "epoch": 0.9311023622047244, - "grad_norm": 0.1266690045595169, - "learning_rate": 2.3333624142587884e-07, - "loss": 0.0051, + "epoch": 0.7118133935289691, + "grad_norm": 6.14890718460083, + "learning_rate": 3.826014479804198e-06, + "loss": 0.0767, "step": 946 }, { - "epoch": 0.9320866141732284, - "grad_norm": 0.07008988410234451, - "learning_rate": 2.2674224109800913e-07, - "loss": 0.0032, + "epoch": 0.7125658389766741, + "grad_norm": 0.330617755651474, + "learning_rate": 3.807436295730953e-06, + "loss": 0.0074, "step": 947 }, { - "epoch": 0.9330708661417323, - "grad_norm": 0.002285080263391137, - "learning_rate": 2.2024168475199615e-07, - "loss": 0.0001, + "epoch": 0.7133182844243793, + "grad_norm": 0.07356506586074829, + "learning_rate": 3.788892715160588e-06, + "loss": 0.0027, "step": 948 }, { - "epoch": 0.9340551181102362, - "grad_norm": 0.007554065436124802, - "learning_rate": 2.1383463454090037e-07, - "loss": 0.0004, + "epoch": 0.7140707298720843, + "grad_norm": 5.51183557510376, + "learning_rate": 3.7703838417130045e-06, + "loss": 0.1942, "step": 949 }, { - "epoch": 0.9350393700787402, - "grad_norm": 0.131584033370018, - "learning_rate": 2.0752115172375076e-07, - "loss": 0.0056, + "epoch": 0.7148231753197893, + "grad_norm": 0.07540974020957947, + "learning_rate": 3.7519097788141635e-06, + "loss": 0.0032, "step": 950 }, { - "epoch": 0.9360236220472441, - "grad_norm": 0.04156121239066124, - "learning_rate": 2.0130129666495856e-07, - "loss": 0.0021, + "epoch": 0.7155756207674944, + "grad_norm": 0.1256755292415619, + "learning_rate": 3.7334706296955093e-06, + "loss": 0.0027, "step": 951 }, { - "epoch": 0.937007874015748, - "grad_norm": 0.005513249896466732, - "learning_rate": 1.9517512883374667e-07, - "loss": 0.0004, + "epoch": 0.7163280662151994, + "grad_norm": 0.0014390931464731693, + "learning_rate": 3.7150664973933893e-06, + "loss": 0.0001, "step": 952 }, { - "epoch": 0.937992125984252, - "grad_norm": 0.455952525138855, - "learning_rate": 1.891427068035745e-07, - "loss": 0.0333, + "epoch": 0.7170805116629044, + "grad_norm": 0.19743457436561584, + "learning_rate": 3.6966974847484805e-06, + "loss": 0.0047, "step": 953 }, { - "epoch": 0.9389763779527559, - "grad_norm": 0.007683460135012865, - "learning_rate": 1.8320408825157843e-07, - "loss": 0.0003, + "epoch": 0.7178329571106095, + "grad_norm": 1.814272165298462, + "learning_rate": 3.6783636944052193e-06, + "loss": 0.2478, "step": 954 }, { - "epoch": 0.9399606299212598, - "grad_norm": 0.01539621688425541, - "learning_rate": 1.7735932995802563e-07, - "loss": 0.0007, + "epoch": 0.7185854025583145, + "grad_norm": 0.35399460792541504, + "learning_rate": 3.66006522881121e-06, + "loss": 0.0039, "step": 955 }, { - "epoch": 0.9409448818897638, - "grad_norm": 0.001507644890807569, - "learning_rate": 1.7160848780576334e-07, - "loss": 0.0001, + "epoch": 0.7193378480060195, + "grad_norm": 0.6553662419319153, + "learning_rate": 3.641802190216678e-06, + "loss": 0.0906, "step": 956 }, { - "epoch": 0.9419291338582677, - "grad_norm": 0.6130296587944031, - "learning_rate": 1.659516167796904e-07, - "loss": 0.0146, + "epoch": 0.7200902934537246, + "grad_norm": 0.005217390134930611, + "learning_rate": 3.623574680673879e-06, + "loss": 0.0002, "step": 957 }, { - "epoch": 0.9429133858267716, - "grad_norm": 0.7020381093025208, - "learning_rate": 1.6038877096622995e-07, - "loss": 0.0179, + "epoch": 0.7208427389014297, + "grad_norm": 0.006294188555330038, + "learning_rate": 3.605382802036538e-06, + "loss": 0.0003, "step": 958 }, { - "epoch": 0.9438976377952756, - "grad_norm": 0.9760167002677917, - "learning_rate": 1.549200035528131e-07, - "loss": 0.1525, + "epoch": 0.7215951843491347, + "grad_norm": 4.843064785003662, + "learning_rate": 3.5872266559592817e-06, + "loss": 0.3738, "step": 959 }, { - "epoch": 0.9448818897637795, - "grad_norm": 0.0758519098162651, - "learning_rate": 1.495453668273672e-07, - "loss": 0.0026, + "epoch": 0.7223476297968398, + "grad_norm": 0.020986851304769516, + "learning_rate": 3.5691063438970618e-06, + "loss": 0.0008, "step": 960 }, { - "epoch": 0.9458661417322834, - "grad_norm": 0.004223812837153673, - "learning_rate": 1.4426491217781945e-07, - "loss": 0.0002, + "epoch": 0.7231000752445448, + "grad_norm": 4.1812334060668945, + "learning_rate": 3.5510219671045875e-06, + "loss": 0.4018, "step": 961 }, { - "epoch": 0.9468503937007874, - "grad_norm": 0.10971449315547943, - "learning_rate": 1.3907869009160525e-07, - "loss": 0.0036, + "epoch": 0.7238525206922498, + "grad_norm": 0.0036870623007416725, + "learning_rate": 3.532973626635773e-06, + "loss": 0.0001, "step": 962 }, { - "epoch": 0.9478346456692913, - "grad_norm": 0.009282004088163376, - "learning_rate": 1.339867501551817e-07, - "loss": 0.0005, + "epoch": 0.7246049661399548, + "grad_norm": 0.014761660248041153, + "learning_rate": 3.5149614233431616e-06, + "loss": 0.0007, "step": 963 }, { - "epoch": 0.9488188976377953, - "grad_norm": 0.008931891061365604, - "learning_rate": 1.289891410535593e-07, - "loss": 0.0006, + "epoch": 0.7253574115876599, + "grad_norm": 0.03565015643835068, + "learning_rate": 3.4969854578773667e-06, + "loss": 0.0016, "step": 964 }, { - "epoch": 0.9498031496062992, - "grad_norm": 0.047727763652801514, - "learning_rate": 1.240859105698311e-07, - "loss": 0.0026, + "epoch": 0.7261098570353649, + "grad_norm": 0.24551992118358612, + "learning_rate": 3.479045830686506e-06, + "loss": 0.006, "step": 965 }, { - "epoch": 0.9507874015748031, - "grad_norm": 0.0067417738027870655, - "learning_rate": 1.192771055847197e-07, - "loss": 0.0004, + "epoch": 0.7268623024830699, + "grad_norm": 0.03166704997420311, + "learning_rate": 3.4611426420156422e-06, + "loss": 0.0013, "step": 966 }, { - "epoch": 0.9517716535433071, - "grad_norm": 0.0021078267600387335, - "learning_rate": 1.1456277207612554e-07, - "loss": 0.0001, + "epoch": 0.7276147479307751, + "grad_norm": 0.4878341257572174, + "learning_rate": 3.4432759919062253e-06, + "loss": 0.1401, "step": 967 }, { - "epoch": 0.952755905511811, - "grad_norm": 0.0881347581744194, - "learning_rate": 1.0994295511869257e-07, - "loss": 0.0031, + "epoch": 0.7283671933784801, + "grad_norm": 8.347463607788086, + "learning_rate": 3.4254459801955276e-06, + "loss": 0.0622, "step": 968 }, { - "epoch": 0.9537401574803149, - "grad_norm": 0.009866403415799141, - "learning_rate": 1.0541769888337217e-07, - "loss": 0.0005, + "epoch": 0.7291196388261851, + "grad_norm": 0.05901602655649185, + "learning_rate": 3.4076527065160914e-06, + "loss": 0.0025, "step": 969 }, { - "epoch": 0.9547244094488189, - "grad_norm": 2.30700421333313, - "learning_rate": 1.0098704663699998e-07, - "loss": 0.1754, + "epoch": 0.7298720842738902, + "grad_norm": 0.1788146197795868, + "learning_rate": 3.3898962702951687e-06, + "loss": 0.0094, "step": 970 }, { - "epoch": 0.9557086614173228, - "grad_norm": 0.0050672246143221855, - "learning_rate": 9.665104074188968e-08, + "epoch": 0.7306245297215952, + "grad_norm": 0.008673246949911118, + "learning_rate": 3.3721767707541696e-06, "loss": 0.0003, "step": 971 }, { - "epoch": 0.9566929133858267, - "grad_norm": 0.006644292734563351, - "learning_rate": 9.240972265541992e-08, - "loss": 0.0004, + "epoch": 0.7313769751693002, + "grad_norm": 0.05570625886321068, + "learning_rate": 3.3544943069081025e-06, + "loss": 0.0022, "step": 972 }, { - "epoch": 0.9576771653543307, - "grad_norm": 0.00149418821092695, - "learning_rate": 8.826313292964239e-08, - "loss": 0.0001, + "epoch": 0.7321294206170053, + "grad_norm": 0.04295314475893974, + "learning_rate": 3.3368489775650282e-06, + "loss": 0.0014, "step": 973 }, { - "epoch": 0.9586614173228346, - "grad_norm": 0.23594264686107635, - "learning_rate": 8.421131121089221e-08, - "loss": 0.0044, + "epoch": 0.7328818660647103, + "grad_norm": 0.025052128359675407, + "learning_rate": 3.3192408813254918e-06, + "loss": 0.0009, "step": 974 }, { - "epoch": 0.9596456692913385, - "grad_norm": 0.5788177251815796, - "learning_rate": 8.025429623940928e-08, - "loss": 0.0945, + "epoch": 0.7336343115124153, + "grad_norm": 5.636648178100586, + "learning_rate": 3.3016701165819943e-06, + "loss": 0.1106, "step": 975 }, { - "epoch": 0.9606299212598425, - "grad_norm": 0.00522016454488039, - "learning_rate": 7.639212584897082e-08, - "loss": 0.0003, + "epoch": 0.7343867569601203, + "grad_norm": 0.43065962195396423, + "learning_rate": 3.28413678151843e-06, + "loss": 0.0105, "step": 976 }, { - "epoch": 0.9616141732283464, - "grad_norm": 0.03146470710635185, - "learning_rate": 7.262483696652167e-08, - "loss": 0.0016, + "epoch": 0.7351392024078255, + "grad_norm": 0.006689665839076042, + "learning_rate": 3.2666409741095328e-06, + "loss": 0.0002, "step": 977 }, { - "epoch": 0.9625984251968503, - "grad_norm": 1.1203941106796265, - "learning_rate": 6.895246561183011e-08, - "loss": 0.0167, + "epoch": 0.7358916478555305, + "grad_norm": 0.009542127139866352, + "learning_rate": 3.2491827921203456e-06, + "loss": 0.0004, "step": 978 }, { - "epoch": 0.9635826771653543, - "grad_norm": 0.0037836511619389057, - "learning_rate": 6.537504689714147e-08, - "loss": 0.0002, + "epoch": 0.7366440933032355, + "grad_norm": 2.010205030441284, + "learning_rate": 3.231762333105661e-06, + "loss": 0.0458, "step": 979 }, { - "epoch": 0.9645669291338582, - "grad_norm": 0.013656680472195148, - "learning_rate": 6.189261502683619e-08, - "loss": 0.001, + "epoch": 0.7373965387509406, + "grad_norm": 0.015615508891642094, + "learning_rate": 3.2143796944094675e-06, + "loss": 0.0006, "step": 980 }, { - "epoch": 0.9655511811023622, - "grad_norm": 0.0019141866359859705, - "learning_rate": 5.850520329711118e-08, - "loss": 0.0001, + "epoch": 0.7381489841986456, + "grad_norm": 0.6981082558631897, + "learning_rate": 3.197034973164429e-06, + "loss": 0.0136, "step": 981 }, { - "epoch": 0.9665354330708661, - "grad_norm": 0.002077992307022214, - "learning_rate": 5.521284409565675e-08, - "loss": 0.0001, + "epoch": 0.7389014296463506, + "grad_norm": 0.6512033939361572, + "learning_rate": 3.1797282662913277e-06, + "loss": 0.0279, "step": 982 }, { - "epoch": 0.96751968503937, - "grad_norm": 2.2678544521331787, - "learning_rate": 5.201556890134463e-08, - "loss": 0.311, + "epoch": 0.7396538750940557, + "grad_norm": 1.36346435546875, + "learning_rate": 3.162459670498523e-06, + "loss": 0.1529, "step": 983 }, { - "epoch": 0.968503937007874, - "grad_norm": 0.09743795543909073, - "learning_rate": 4.8913408283934874e-08, - "loss": 0.0039, + "epoch": 0.7404063205417607, + "grad_norm": 0.0437924787402153, + "learning_rate": 3.1452292822814145e-06, + "loss": 0.0021, "step": 984 }, { - "epoch": 0.969488188976378, - "grad_norm": 0.04795536398887634, - "learning_rate": 4.5906391903776104e-08, - "loss": 0.0016, + "epoch": 0.7411587659894657, + "grad_norm": 0.21114251017570496, + "learning_rate": 3.1280371979218993e-06, + "loss": 0.0053, "step": 985 }, { - "epoch": 0.9704724409448819, - "grad_norm": 0.00522790988907218, - "learning_rate": 4.2994548511525735e-08, - "loss": 0.0003, + "epoch": 0.7419112114371708, + "grad_norm": 0.0944240614771843, + "learning_rate": 3.1108835134878367e-06, + "loss": 0.0029, "step": 986 }, { - "epoch": 0.9714566929133859, - "grad_norm": 3.8015875816345215, - "learning_rate": 4.017790594787574e-08, - "loss": 0.1488, + "epoch": 0.7426636568848759, + "grad_norm": 0.044163547456264496, + "learning_rate": 3.0937683248325133e-06, + "loss": 0.0013, "step": 987 }, { - "epoch": 0.9724409448818898, - "grad_norm": 0.0076496112160384655, - "learning_rate": 3.745649114328065e-08, - "loss": 0.0004, + "epoch": 0.7434161023325809, + "grad_norm": 0.056986529380083084, + "learning_rate": 3.0766917275941e-06, + "loss": 0.0022, "step": 988 }, { - "epoch": 0.9734251968503937, - "grad_norm": 1.182790994644165, - "learning_rate": 3.4830330117706644e-08, - "loss": 0.2066, + "epoch": 0.744168547780286, + "grad_norm": 0.039454780519008636, + "learning_rate": 3.0596538171951252e-06, + "loss": 0.0016, "step": 989 }, { - "epoch": 0.9744094488188977, - "grad_norm": 0.007166962139308453, - "learning_rate": 3.229944798038176e-08, - "loss": 0.0004, + "epoch": 0.744920993227991, + "grad_norm": 0.017716459929943085, + "learning_rate": 3.0426546888419385e-06, + "loss": 0.0006, "step": 990 }, { - "epoch": 0.9753937007874016, - "grad_norm": 0.2715499699115753, - "learning_rate": 2.986386892955162e-08, - "loss": 0.0122, + "epoch": 0.745673438675696, + "grad_norm": 2.3982226848602295, + "learning_rate": 3.025694437524177e-06, + "loss": 0.2701, "step": 991 }, { - "epoch": 0.9763779527559056, - "grad_norm": 0.020617423579096794, - "learning_rate": 2.7523616252252972e-08, - "loss": 0.0008, + "epoch": 0.746425884123401, + "grad_norm": 0.17275047302246094, + "learning_rate": 3.008773158014242e-06, + "loss": 0.0043, "step": 992 }, { - "epoch": 0.9773622047244095, - "grad_norm": 1.0068769454956055, - "learning_rate": 2.5278712324088294e-08, - "loss": 0.1884, + "epoch": 0.7471783295711061, + "grad_norm": 0.028289055451750755, + "learning_rate": 2.991890944866752e-06, + "loss": 0.0013, "step": 993 }, { - "epoch": 0.9783464566929134, - "grad_norm": 0.005086081568151712, - "learning_rate": 2.312917860901154e-08, - "loss": 0.0002, + "epoch": 0.7479307750188111, + "grad_norm": 2.991833448410034, + "learning_rate": 2.9750478924180383e-06, + "loss": 0.0494, "step": 994 }, { - "epoch": 0.9793307086614174, - "grad_norm": 0.03319551423192024, - "learning_rate": 2.1075035659124944e-08, - "loss": 0.002, + "epoch": 0.7486832204665161, + "grad_norm": 0.15955565869808197, + "learning_rate": 2.9582440947855993e-06, + "loss": 0.0054, "step": 995 }, { - "epoch": 0.9803149606299213, - "grad_norm": 0.0031260980758816004, - "learning_rate": 1.9116303114480316e-08, - "loss": 0.0002, + "epoch": 0.7494356659142212, + "grad_norm": 0.23433822393417358, + "learning_rate": 2.941479645867583e-06, + "loss": 0.0084, "step": 996 }, { - "epoch": 0.9812992125984252, - "grad_norm": 0.043803855776786804, - "learning_rate": 1.7252999702894736e-08, - "loss": 0.0016, + "epoch": 0.7501881113619263, + "grad_norm": 0.08780021965503693, + "learning_rate": 2.9247546393422566e-06, + "loss": 0.004, "step": 997 }, { - "epoch": 0.9822834645669292, - "grad_norm": 0.045091111212968826, - "learning_rate": 1.5485143239767352e-08, - "loss": 0.0022, + "epoch": 0.7509405568096313, + "grad_norm": 0.13379798829555511, + "learning_rate": 2.9080691686674977e-06, + "loss": 0.0026, "step": 998 }, { - "epoch": 0.9832677165354331, - "grad_norm": 0.009936131536960602, - "learning_rate": 1.3812750627909543e-08, + "epoch": 0.7516930022573364, + "grad_norm": 0.01313999854028225, + "learning_rate": 2.891423327080246e-06, "loss": 0.0006, "step": 999 }, { - "epoch": 0.984251968503937, - "grad_norm": 0.02666308917105198, - "learning_rate": 1.2235837857387246e-08, - "loss": 0.0015, + "epoch": 0.7524454477050414, + "grad_norm": 0.03100876323878765, + "learning_rate": 2.874817207596007e-06, + "loss": 0.0012, "step": 1000 }, { - "epoch": 0.985236220472441, - "grad_norm": 0.04674910381436348, - "learning_rate": 1.0754420005364418e-08, - "loss": 0.0024, + "epoch": 0.7531978931527464, + "grad_norm": 0.018929272890090942, + "learning_rate": 2.8582509030083184e-06, + "loss": 0.0007, "step": 1001 }, { - "epoch": 0.9862204724409449, - "grad_norm": 0.28211429715156555, - "learning_rate": 9.368511235958722e-09, - "loss": 0.0051, + "epoch": 0.7539503386004515, + "grad_norm": 4.446130752563477, + "learning_rate": 2.841724505888239e-06, + "loss": 0.2995, "step": 1002 }, { - "epoch": 0.9872047244094488, - "grad_norm": 0.003161977045238018, - "learning_rate": 8.078124800109388e-09, - "loss": 0.0002, + "epoch": 0.7547027840481565, + "grad_norm": 0.27879974246025085, + "learning_rate": 2.8252381085838266e-06, + "loss": 0.0064, "step": 1003 }, { - "epoch": 0.9881889763779528, - "grad_norm": 0.015495805069804192, - "learning_rate": 6.883273035447335e-09, - "loss": 0.0008, + "epoch": 0.7554552294958615, + "grad_norm": 0.06390407681465149, + "learning_rate": 2.8087918032196214e-06, + "loss": 0.003, "step": 1004 }, { - "epoch": 0.9891732283464567, - "grad_norm": 0.0015122792683541775, - "learning_rate": 5.783967366177478e-09, - "loss": 0.0001, + "epoch": 0.7562076749435666, + "grad_norm": 0.05781351029872894, + "learning_rate": 2.792385681696138e-06, + "loss": 0.0029, "step": 1005 }, { - "epoch": 0.9901574803149606, - "grad_norm": 0.003112461883574724, - "learning_rate": 4.7802183029710406e-09, - "loss": 0.0002, + "epoch": 0.7569601203912716, + "grad_norm": 0.035376448184251785, + "learning_rate": 2.7760198356893466e-06, + "loss": 0.0017, "step": 1006 }, { - "epoch": 0.9911417322834646, - "grad_norm": 1.892495036125183, - "learning_rate": 3.87203544286563e-09, - "loss": 0.1724, + "epoch": 0.7577125658389767, + "grad_norm": 0.024740157648921013, + "learning_rate": 2.759694356650149e-06, + "loss": 0.0008, "step": 1007 }, { - "epoch": 0.9921259842519685, - "grad_norm": 1.35779869556427, - "learning_rate": 3.0594274691686522e-09, - "loss": 0.3667, + "epoch": 0.7584650112866818, + "grad_norm": 0.5133036375045776, + "learning_rate": 2.7434093358039003e-06, + "loss": 0.0121, "step": 1008 }, { - "epoch": 0.9931102362204725, - "grad_norm": 0.36665621399879456, - "learning_rate": 2.342402151381817e-09, - "loss": 0.0112, + "epoch": 0.7592174567343868, + "grad_norm": 0.017775315791368484, + "learning_rate": 2.727164864149867e-06, + "loss": 0.0007, "step": 1009 }, { - "epoch": 0.9940944881889764, - "grad_norm": 1.6031553745269775, - "learning_rate": 1.7209663451200897e-09, - "loss": 0.2406, + "epoch": 0.7599699021820918, + "grad_norm": 0.004680894315242767, + "learning_rate": 2.7109610324607305e-06, + "loss": 0.0002, "step": 1010 }, { - "epoch": 0.9950787401574803, - "grad_norm": 0.07739420235157013, - "learning_rate": 1.1951259920495173e-09, - "loss": 0.0013, + "epoch": 0.7607223476297968, + "grad_norm": 0.01725710742175579, + "learning_rate": 2.6947979312820825e-06, + "loss": 0.0008, "step": 1011 }, { - "epoch": 0.9960629921259843, - "grad_norm": 0.05894338712096214, - "learning_rate": 7.648861198306101e-10, - "loss": 0.0022, + "epoch": 0.7614747930775019, + "grad_norm": 0.279935747385025, + "learning_rate": 2.678675650931917e-06, + "loss": 0.0039, "step": 1012 }, { - "epoch": 0.9970472440944882, - "grad_norm": 0.00805114395916462, - "learning_rate": 4.302508420694906e-10, - "loss": 0.0005, + "epoch": 0.7622272385252069, + "grad_norm": 0.14282621443271637, + "learning_rate": 2.662594281500115e-06, + "loss": 0.0036, "step": 1013 }, { - "epoch": 0.9980314960629921, - "grad_norm": 3.5414252281188965, - "learning_rate": 1.9122335827681527e-10, - "loss": 0.2594, + "epoch": 0.7629796839729119, + "grad_norm": 0.011395514011383057, + "learning_rate": 2.6465539128479646e-06, + "loss": 0.0005, "step": 1014 }, { - "epoch": 0.9990157480314961, - "grad_norm": 0.0018194133881479502, - "learning_rate": 4.780595384001885e-11, - "loss": 0.0001, + "epoch": 0.763732129420617, + "grad_norm": 0.04269295185804367, + "learning_rate": 2.630554634607637e-06, + "loss": 0.0014, "step": 1015 }, { - "epoch": 1.0, - "grad_norm": 1.380206823348999, - "learning_rate": 0.0, - "loss": 0.0395, + "epoch": 0.764484574868322, + "grad_norm": 0.012843809090554714, + "learning_rate": 2.614596536181697e-06, + "loss": 0.0006, "step": 1016 + }, + { + "epoch": 0.7652370203160271, + "grad_norm": 0.018597368150949478, + "learning_rate": 2.5986797067425972e-06, + "loss": 0.0008, + "step": 1017 + }, + { + "epoch": 0.7659894657637322, + "grad_norm": 0.019066110253334045, + "learning_rate": 2.582804235232187e-06, + "loss": 0.0008, + "step": 1018 + }, + { + "epoch": 0.7667419112114372, + "grad_norm": 0.011421947740018368, + "learning_rate": 2.566970210361208e-06, + "loss": 0.0005, + "step": 1019 + }, + { + "epoch": 0.7674943566591422, + "grad_norm": 0.016336999833583832, + "learning_rate": 2.551177720608802e-06, + "loss": 0.0007, + "step": 1020 + }, + { + "epoch": 0.7682468021068473, + "grad_norm": 1.4327224493026733, + "learning_rate": 2.5354268542220163e-06, + "loss": 0.1008, + "step": 1021 + }, + { + "epoch": 0.7689992475545523, + "grad_norm": 0.014406280592083931, + "learning_rate": 2.5197176992153125e-06, + "loss": 0.0004, + "step": 1022 + }, + { + "epoch": 0.7697516930022573, + "grad_norm": 2.0984880924224854, + "learning_rate": 2.5040503433700702e-06, + "loss": 0.4668, + "step": 1023 + }, + { + "epoch": 0.7705041384499624, + "grad_norm": 0.002694485941901803, + "learning_rate": 2.4884248742340987e-06, + "loss": 0.0001, + "step": 1024 + }, + { + "epoch": 0.7712565838976674, + "grad_norm": 0.16212724149227142, + "learning_rate": 2.472841379121154e-06, + "loss": 0.0048, + "step": 1025 + }, + { + "epoch": 0.7720090293453724, + "grad_norm": 2.619164228439331, + "learning_rate": 2.457299945110433e-06, + "loss": 0.2616, + "step": 1026 + }, + { + "epoch": 0.7727614747930776, + "grad_norm": 0.003759812330827117, + "learning_rate": 2.441800659046106e-06, + "loss": 0.0001, + "step": 1027 + }, + { + "epoch": 0.7735139202407826, + "grad_norm": 3.9366838932037354, + "learning_rate": 2.4263436075368307e-06, + "loss": 0.2472, + "step": 1028 + }, + { + "epoch": 0.7742663656884876, + "grad_norm": 2.153656005859375, + "learning_rate": 2.4109288769552518e-06, + "loss": 0.1468, + "step": 1029 + }, + { + "epoch": 0.7750188111361926, + "grad_norm": 0.3993730843067169, + "learning_rate": 2.3955565534375326e-06, + "loss": 0.1389, + "step": 1030 + }, + { + "epoch": 0.7757712565838977, + "grad_norm": 0.0949854701757431, + "learning_rate": 2.3802267228828703e-06, + "loss": 0.0027, + "step": 1031 + }, + { + "epoch": 0.7765237020316027, + "grad_norm": 1.1868627071380615, + "learning_rate": 2.3649394709530093e-06, + "loss": 0.0392, + "step": 1032 + }, + { + "epoch": 0.7772761474793077, + "grad_norm": 0.0136722382158041, + "learning_rate": 2.349694883071775e-06, + "loss": 0.0006, + "step": 1033 + }, + { + "epoch": 0.7780285929270128, + "grad_norm": 2.6067428588867188, + "learning_rate": 2.3344930444245863e-06, + "loss": 0.4598, + "step": 1034 + }, + { + "epoch": 0.7787810383747178, + "grad_norm": 0.005388299468904734, + "learning_rate": 2.3193340399579865e-06, + "loss": 0.0002, + "step": 1035 + }, + { + "epoch": 0.7795334838224228, + "grad_norm": 0.032653093338012695, + "learning_rate": 2.304217954379162e-06, + "loss": 0.0012, + "step": 1036 + }, + { + "epoch": 0.780285929270128, + "grad_norm": 0.11140411347150803, + "learning_rate": 2.2891448721554733e-06, + "loss": 0.0052, + "step": 1037 + }, + { + "epoch": 0.781038374717833, + "grad_norm": 0.6110525727272034, + "learning_rate": 2.274114877513981e-06, + "loss": 0.0083, + "step": 1038 + }, + { + "epoch": 0.781790820165538, + "grad_norm": 0.3631162643432617, + "learning_rate": 2.259128054440979e-06, + "loss": 0.0101, + "step": 1039 + }, + { + "epoch": 0.782543265613243, + "grad_norm": 0.5492317080497742, + "learning_rate": 2.2441844866815188e-06, + "loss": 0.0043, + "step": 1040 + }, + { + "epoch": 0.7832957110609481, + "grad_norm": 0.29190927743911743, + "learning_rate": 2.229284257738946e-06, + "loss": 0.0063, + "step": 1041 + }, + { + "epoch": 0.7840481565086531, + "grad_norm": 0.012482292018830776, + "learning_rate": 2.2144274508744355e-06, + "loss": 0.0005, + "step": 1042 + }, + { + "epoch": 0.7848006019563581, + "grad_norm": 0.011028854176402092, + "learning_rate": 2.199614149106519e-06, + "loss": 0.0004, + "step": 1043 + }, + { + "epoch": 0.7855530474040632, + "grad_norm": 0.11335982382297516, + "learning_rate": 2.1848444352106314e-06, + "loss": 0.0025, + "step": 1044 + }, + { + "epoch": 0.7863054928517682, + "grad_norm": 0.17785978317260742, + "learning_rate": 2.1701183917186317e-06, + "loss": 0.0053, + "step": 1045 + }, + { + "epoch": 0.7870579382994732, + "grad_norm": 0.0240810327231884, + "learning_rate": 2.155436100918363e-06, + "loss": 0.001, + "step": 1046 + }, + { + "epoch": 0.7878103837471784, + "grad_norm": 0.048680830746889114, + "learning_rate": 2.1407976448531776e-06, + "loss": 0.0021, + "step": 1047 + }, + { + "epoch": 0.7885628291948834, + "grad_norm": 0.03647977486252785, + "learning_rate": 2.126203105321487e-06, + "loss": 0.0012, + "step": 1048 + }, + { + "epoch": 0.7893152746425884, + "grad_norm": 0.012947115115821362, + "learning_rate": 2.1116525638762963e-06, + "loss": 0.0004, + "step": 1049 + }, + { + "epoch": 0.7900677200902935, + "grad_norm": 0.015302013605833054, + "learning_rate": 2.0971461018247586e-06, + "loss": 0.0007, + "step": 1050 + }, + { + "epoch": 0.7908201655379985, + "grad_norm": 0.9521199464797974, + "learning_rate": 2.082683800227705e-06, + "loss": 0.2254, + "step": 1051 + }, + { + "epoch": 0.7915726109857035, + "grad_norm": 0.011116956360638142, + "learning_rate": 2.0682657398992124e-06, + "loss": 0.0004, + "step": 1052 + }, + { + "epoch": 0.7923250564334086, + "grad_norm": 0.20475442707538605, + "learning_rate": 2.053892001406136e-06, + "loss": 0.1026, + "step": 1053 + }, + { + "epoch": 0.7930775018811136, + "grad_norm": 0.026699619367718697, + "learning_rate": 2.039562665067667e-06, + "loss": 0.0013, + "step": 1054 + }, + { + "epoch": 0.7938299473288186, + "grad_norm": 0.0033015466760843992, + "learning_rate": 2.0252778109548785e-06, + "loss": 0.0001, + "step": 1055 + }, + { + "epoch": 0.7945823927765236, + "grad_norm": 1.792168140411377, + "learning_rate": 2.0110375188902852e-06, + "loss": 0.1835, + "step": 1056 + }, + { + "epoch": 0.7953348382242288, + "grad_norm": 0.4291799068450928, + "learning_rate": 1.996841868447388e-06, + "loss": 0.1194, + "step": 1057 + }, + { + "epoch": 0.7960872836719338, + "grad_norm": 18.169370651245117, + "learning_rate": 1.9826909389502368e-06, + "loss": 0.0432, + "step": 1058 + }, + { + "epoch": 0.7968397291196389, + "grad_norm": 2.5589189529418945, + "learning_rate": 1.9685848094729853e-06, + "loss": 0.1892, + "step": 1059 + }, + { + "epoch": 0.7975921745673439, + "grad_norm": 2.7977375984191895, + "learning_rate": 1.9545235588394484e-06, + "loss": 0.1353, + "step": 1060 + }, + { + "epoch": 0.7983446200150489, + "grad_norm": 0.035498447716236115, + "learning_rate": 1.940507265622661e-06, + "loss": 0.0014, + "step": 1061 + }, + { + "epoch": 0.7990970654627539, + "grad_norm": 0.04865005239844322, + "learning_rate": 1.9265360081444385e-06, + "loss": 0.0014, + "step": 1062 + }, + { + "epoch": 0.799849510910459, + "grad_norm": 3.3623549938201904, + "learning_rate": 1.9126098644749482e-06, + "loss": 0.2956, + "step": 1063 + }, + { + "epoch": 0.800601956358164, + "grad_norm": 0.05257292836904526, + "learning_rate": 1.8987289124322517e-06, + "loss": 0.0025, + "step": 1064 + }, + { + "epoch": 0.801354401805869, + "grad_norm": 0.003795795841142535, + "learning_rate": 1.8848932295818945e-06, + "loss": 0.0001, + "step": 1065 + }, + { + "epoch": 0.8021068472535741, + "grad_norm": 0.005308138206601143, + "learning_rate": 1.8711028932364604e-06, + "loss": 0.0002, + "step": 1066 + }, + { + "epoch": 0.8028592927012792, + "grad_norm": 0.7294948697090149, + "learning_rate": 1.8573579804551367e-06, + "loss": 0.0244, + "step": 1067 + }, + { + "epoch": 0.8036117381489842, + "grad_norm": 0.005934171844273806, + "learning_rate": 1.8436585680432951e-06, + "loss": 0.0002, + "step": 1068 + }, + { + "epoch": 0.8043641835966893, + "grad_norm": 5.70277738571167, + "learning_rate": 1.8300047325520508e-06, + "loss": 0.1113, + "step": 1069 + }, + { + "epoch": 0.8051166290443943, + "grad_norm": 0.813459038734436, + "learning_rate": 1.8163965502778337e-06, + "loss": 0.0269, + "step": 1070 + }, + { + "epoch": 0.8058690744920993, + "grad_norm": 1.580662488937378, + "learning_rate": 1.802834097261975e-06, + "loss": 0.0958, + "step": 1071 + }, + { + "epoch": 0.8066215199398044, + "grad_norm": 0.013537387363612652, + "learning_rate": 1.7893174492902742e-06, + "loss": 0.0005, + "step": 1072 + }, + { + "epoch": 0.8073739653875094, + "grad_norm": 3.1393344402313232, + "learning_rate": 1.7758466818925735e-06, + "loss": 0.2465, + "step": 1073 + }, + { + "epoch": 0.8081264108352144, + "grad_norm": 0.17277193069458008, + "learning_rate": 1.7624218703423402e-06, + "loss": 0.0045, + "step": 1074 + }, + { + "epoch": 0.8088788562829194, + "grad_norm": 1.4239376783370972, + "learning_rate": 1.7490430896562439e-06, + "loss": 0.0138, + "step": 1075 + }, + { + "epoch": 0.8096313017306245, + "grad_norm": 0.3208409249782562, + "learning_rate": 1.7357104145937365e-06, + "loss": 0.1175, + "step": 1076 + }, + { + "epoch": 0.8103837471783296, + "grad_norm": 0.30378955602645874, + "learning_rate": 1.7224239196566395e-06, + "loss": 0.0059, + "step": 1077 + }, + { + "epoch": 0.8111361926260346, + "grad_norm": 0.004885723814368248, + "learning_rate": 1.7091836790887196e-06, + "loss": 0.0002, + "step": 1078 + }, + { + "epoch": 0.8118886380737397, + "grad_norm": 0.04553482308983803, + "learning_rate": 1.695989766875279e-06, + "loss": 0.0019, + "step": 1079 + }, + { + "epoch": 0.8126410835214447, + "grad_norm": 0.12680576741695404, + "learning_rate": 1.682842256742744e-06, + "loss": 0.0056, + "step": 1080 + }, + { + "epoch": 0.8133935289691497, + "grad_norm": 0.05921807140111923, + "learning_rate": 1.6697412221582477e-06, + "loss": 0.0026, + "step": 1081 + }, + { + "epoch": 0.8141459744168548, + "grad_norm": 0.03163035586476326, + "learning_rate": 1.6566867363292238e-06, + "loss": 0.0014, + "step": 1082 + }, + { + "epoch": 0.8148984198645598, + "grad_norm": 0.027834394946694374, + "learning_rate": 1.6436788722029906e-06, + "loss": 0.001, + "step": 1083 + }, + { + "epoch": 0.8156508653122648, + "grad_norm": 0.3784581422805786, + "learning_rate": 1.6307177024663534e-06, + "loss": 0.1234, + "step": 1084 + }, + { + "epoch": 0.8164033107599699, + "grad_norm": 0.09414243698120117, + "learning_rate": 1.617803299545192e-06, + "loss": 0.0038, + "step": 1085 + }, + { + "epoch": 0.8171557562076749, + "grad_norm": 1.488474726676941, + "learning_rate": 1.6049357356040584e-06, + "loss": 0.1, + "step": 1086 + }, + { + "epoch": 0.81790820165538, + "grad_norm": 0.0046142516657710075, + "learning_rate": 1.5921150825457677e-06, + "loss": 0.0002, + "step": 1087 + }, + { + "epoch": 0.8186606471030851, + "grad_norm": 0.06056587025523186, + "learning_rate": 1.579341412011014e-06, + "loss": 0.0023, + "step": 1088 + }, + { + "epoch": 0.8194130925507901, + "grad_norm": 4.677060127258301, + "learning_rate": 1.5666147953779376e-06, + "loss": 0.1317, + "step": 1089 + }, + { + "epoch": 0.8201655379984951, + "grad_norm": 2.52014422416687, + "learning_rate": 1.553935303761761e-06, + "loss": 0.503, + "step": 1090 + }, + { + "epoch": 0.8209179834462002, + "grad_norm": 0.023583421483635902, + "learning_rate": 1.5413030080143708e-06, + "loss": 0.0009, + "step": 1091 + }, + { + "epoch": 0.8216704288939052, + "grad_norm": 0.2893773317337036, + "learning_rate": 1.5287179787239282e-06, + "loss": 0.0056, + "step": 1092 + }, + { + "epoch": 0.8224228743416102, + "grad_norm": 1.8263130187988281, + "learning_rate": 1.5161802862144715e-06, + "loss": 0.0311, + "step": 1093 + }, + { + "epoch": 0.8231753197893152, + "grad_norm": 0.2679802477359772, + "learning_rate": 1.503690000545528e-06, + "loss": 0.0109, + "step": 1094 + }, + { + "epoch": 0.8239277652370203, + "grad_norm": 0.004781723953783512, + "learning_rate": 1.4912471915117189e-06, + "loss": 0.0002, + "step": 1095 + }, + { + "epoch": 0.8246802106847254, + "grad_norm": 1.211321473121643, + "learning_rate": 1.4788519286423687e-06, + "loss": 0.0369, + "step": 1096 + }, + { + "epoch": 0.8254326561324304, + "grad_norm": 0.03698015958070755, + "learning_rate": 1.46650428120112e-06, + "loss": 0.0013, + "step": 1097 + }, + { + "epoch": 0.8261851015801355, + "grad_norm": 0.23283882439136505, + "learning_rate": 1.4542043181855447e-06, + "loss": 0.1022, + "step": 1098 + }, + { + "epoch": 0.8269375470278405, + "grad_norm": 0.045804113149642944, + "learning_rate": 1.441952108326755e-06, + "loss": 0.0007, + "step": 1099 + }, + { + "epoch": 0.8276899924755455, + "grad_norm": 0.08991260826587677, + "learning_rate": 1.4297477200890275e-06, + "loss": 0.0031, + "step": 1100 + }, + { + "epoch": 0.8284424379232506, + "grad_norm": 0.059289537370204926, + "learning_rate": 1.417591221669412e-06, + "loss": 0.0013, + "step": 1101 + }, + { + "epoch": 0.8291948833709556, + "grad_norm": 0.25663039088249207, + "learning_rate": 1.4054826809973576e-06, + "loss": 0.0031, + "step": 1102 + }, + { + "epoch": 0.8299473288186606, + "grad_norm": 0.047977883368730545, + "learning_rate": 1.393422165734325e-06, + "loss": 0.0019, + "step": 1103 + }, + { + "epoch": 0.8306997742663657, + "grad_norm": 0.17930328845977783, + "learning_rate": 1.3814097432734154e-06, + "loss": 0.0026, + "step": 1104 + }, + { + "epoch": 0.8314522197140707, + "grad_norm": 0.05746970325708389, + "learning_rate": 1.3694454807389935e-06, + "loss": 0.0019, + "step": 1105 + }, + { + "epoch": 0.8322046651617758, + "grad_norm": 0.011797560378909111, + "learning_rate": 1.3575294449863063e-06, + "loss": 0.0005, + "step": 1106 + }, + { + "epoch": 0.8329571106094809, + "grad_norm": 0.00394852552562952, + "learning_rate": 1.3456617026011233e-06, + "loss": 0.0001, + "step": 1107 + }, + { + "epoch": 0.8337095560571859, + "grad_norm": 0.07486367225646973, + "learning_rate": 1.3338423198993422e-06, + "loss": 0.003, + "step": 1108 + }, + { + "epoch": 0.8344620015048909, + "grad_norm": 0.05970097705721855, + "learning_rate": 1.322071362926638e-06, + "loss": 0.0027, + "step": 1109 + }, + { + "epoch": 0.835214446952596, + "grad_norm": 0.002417666371911764, + "learning_rate": 1.3103488974580858e-06, + "loss": 0.0001, + "step": 1110 + }, + { + "epoch": 0.835966892400301, + "grad_norm": 0.029511481523513794, + "learning_rate": 1.2986749889977968e-06, + "loss": 0.0011, + "step": 1111 + }, + { + "epoch": 0.836719337848006, + "grad_norm": 2.6889474391937256, + "learning_rate": 1.2870497027785444e-06, + "loss": 0.0219, + "step": 1112 + }, + { + "epoch": 0.837471783295711, + "grad_norm": 0.015240820124745369, + "learning_rate": 1.2754731037614122e-06, + "loss": 0.0007, + "step": 1113 + }, + { + "epoch": 0.8382242287434161, + "grad_norm": 0.01888788491487503, + "learning_rate": 1.263945256635416e-06, + "loss": 0.0008, + "step": 1114 + }, + { + "epoch": 0.8389766741911211, + "grad_norm": 0.047674112021923065, + "learning_rate": 1.2524662258171605e-06, + "loss": 0.0022, + "step": 1115 + }, + { + "epoch": 0.8397291196388262, + "grad_norm": 0.07721062004566193, + "learning_rate": 1.2410360754504536e-06, + "loss": 0.0032, + "step": 1116 + }, + { + "epoch": 0.8404815650865313, + "grad_norm": 0.002638805890455842, + "learning_rate": 1.2296548694059818e-06, + "loss": 0.0001, + "step": 1117 + }, + { + "epoch": 0.8412340105342363, + "grad_norm": 2.895000457763672, + "learning_rate": 1.2183226712809238e-06, + "loss": 0.2227, + "step": 1118 + }, + { + "epoch": 0.8419864559819413, + "grad_norm": 4.1699957847595215, + "learning_rate": 1.207039544398607e-06, + "loss": 0.1277, + "step": 1119 + }, + { + "epoch": 0.8427389014296464, + "grad_norm": 0.03461211919784546, + "learning_rate": 1.195805551808158e-06, + "loss": 0.0017, + "step": 1120 + }, + { + "epoch": 0.8434913468773514, + "grad_norm": 0.011402531526982784, + "learning_rate": 1.1846207562841416e-06, + "loss": 0.0004, + "step": 1121 + }, + { + "epoch": 0.8442437923250564, + "grad_norm": 4.339677810668945, + "learning_rate": 1.1734852203262115e-06, + "loss": 0.1059, + "step": 1122 + }, + { + "epoch": 0.8449962377727614, + "grad_norm": 2.331862688064575, + "learning_rate": 1.1623990061587665e-06, + "loss": 0.3458, + "step": 1123 + }, + { + "epoch": 0.8457486832204665, + "grad_norm": 0.40041258931159973, + "learning_rate": 1.1513621757306015e-06, + "loss": 0.133, + "step": 1124 + }, + { + "epoch": 0.8465011286681715, + "grad_norm": 0.1884562075138092, + "learning_rate": 1.1403747907145546e-06, + "loss": 0.0999, + "step": 1125 + }, + { + "epoch": 0.8472535741158767, + "grad_norm": 0.0410892590880394, + "learning_rate": 1.1294369125071692e-06, + "loss": 0.0019, + "step": 1126 + }, + { + "epoch": 0.8480060195635817, + "grad_norm": 0.2528564929962158, + "learning_rate": 1.1185486022283553e-06, + "loss": 0.1256, + "step": 1127 + }, + { + "epoch": 0.8487584650112867, + "grad_norm": 0.0033206380903720856, + "learning_rate": 1.1077099207210296e-06, + "loss": 0.0001, + "step": 1128 + }, + { + "epoch": 0.8495109104589917, + "grad_norm": 0.05121954157948494, + "learning_rate": 1.0969209285507954e-06, + "loss": 0.0018, + "step": 1129 + }, + { + "epoch": 0.8502633559066968, + "grad_norm": 0.0063232192769646645, + "learning_rate": 1.0861816860055952e-06, + "loss": 0.0002, + "step": 1130 + }, + { + "epoch": 0.8510158013544018, + "grad_norm": 0.03940580040216446, + "learning_rate": 1.0754922530953737e-06, + "loss": 0.001, + "step": 1131 + }, + { + "epoch": 0.8517682468021068, + "grad_norm": 0.2049761265516281, + "learning_rate": 1.0648526895517464e-06, + "loss": 0.0064, + "step": 1132 + }, + { + "epoch": 0.8525206922498119, + "grad_norm": 0.2495325803756714, + "learning_rate": 1.0542630548276588e-06, + "loss": 0.1243, + "step": 1133 + }, + { + "epoch": 0.8532731376975169, + "grad_norm": 0.04376570135354996, + "learning_rate": 1.043723408097065e-06, + "loss": 0.0021, + "step": 1134 + }, + { + "epoch": 0.8540255831452219, + "grad_norm": 0.8723930716514587, + "learning_rate": 1.0332338082545812e-06, + "loss": 0.0126, + "step": 1135 + }, + { + "epoch": 0.8547780285929271, + "grad_norm": 0.016171878203749657, + "learning_rate": 1.0227943139151719e-06, + "loss": 0.0006, + "step": 1136 + }, + { + "epoch": 0.8555304740406321, + "grad_norm": 0.004397090524435043, + "learning_rate": 1.0124049834138205e-06, + "loss": 0.0001, + "step": 1137 + }, + { + "epoch": 0.8562829194883371, + "grad_norm": 0.10261158645153046, + "learning_rate": 1.0020658748051925e-06, + "loss": 0.0046, + "step": 1138 + }, + { + "epoch": 0.8570353649360422, + "grad_norm": 0.07801058888435364, + "learning_rate": 9.91777045863319e-07, + "loss": 0.0037, + "step": 1139 + }, + { + "epoch": 0.8577878103837472, + "grad_norm": 0.37156495451927185, + "learning_rate": 9.815385540812761e-07, + "loss": 0.1171, + "step": 1140 + }, + { + "epoch": 0.8585402558314522, + "grad_norm": 0.0733116865158081, + "learning_rate": 9.713504566708554e-07, + "loss": 0.0033, + "step": 1141 + }, + { + "epoch": 0.8592927012791572, + "grad_norm": 0.0857694074511528, + "learning_rate": 9.61212810562252e-07, + "loss": 0.0036, + "step": 1142 + }, + { + "epoch": 0.8600451467268623, + "grad_norm": 0.41983649134635925, + "learning_rate": 9.511256724037443e-07, + "loss": 0.0053, + "step": 1143 + }, + { + "epoch": 0.8607975921745673, + "grad_norm": 7.980868816375732, + "learning_rate": 9.410890985613741e-07, + "loss": 0.2031, + "step": 1144 + }, + { + "epoch": 0.8615500376222723, + "grad_norm": 3.7673096656799316, + "learning_rate": 9.311031451186381e-07, + "loss": 0.3022, + "step": 1145 + }, + { + "epoch": 0.8623024830699775, + "grad_norm": 0.028988122940063477, + "learning_rate": 9.21167867876167e-07, + "loss": 0.0011, + "step": 1146 + }, + { + "epoch": 0.8630549285176825, + "grad_norm": 2.064547300338745, + "learning_rate": 9.112833223514183e-07, + "loss": 0.0675, + "step": 1147 + }, + { + "epoch": 0.8638073739653875, + "grad_norm": 1.2910840511322021, + "learning_rate": 9.014495637783671e-07, + "loss": 0.093, + "step": 1148 + }, + { + "epoch": 0.8645598194130926, + "grad_norm": 5.712249279022217, + "learning_rate": 8.916666471071922e-07, + "loss": 0.439, + "step": 1149 + }, + { + "epoch": 0.8653122648607976, + "grad_norm": 0.04252437502145767, + "learning_rate": 8.819346270039752e-07, + "loss": 0.0019, + "step": 1150 + }, + { + "epoch": 0.8660647103085026, + "grad_norm": 0.02926601469516754, + "learning_rate": 8.722535578503899e-07, + "loss": 0.0014, + "step": 1151 + }, + { + "epoch": 0.8668171557562077, + "grad_norm": 0.3518989086151123, + "learning_rate": 8.62623493743402e-07, + "loss": 0.0112, + "step": 1152 + }, + { + "epoch": 0.8675696012039127, + "grad_norm": 3.836555242538452, + "learning_rate": 8.530444884949674e-07, + "loss": 0.0891, + "step": 1153 + }, + { + "epoch": 0.8683220466516177, + "grad_norm": 0.04374834522604942, + "learning_rate": 8.435165956317226e-07, + "loss": 0.0021, + "step": 1154 + }, + { + "epoch": 0.8690744920993227, + "grad_norm": 0.10189584642648697, + "learning_rate": 8.340398683947004e-07, + "loss": 0.0039, + "step": 1155 + }, + { + "epoch": 0.8698269375470279, + "grad_norm": 0.058472152799367905, + "learning_rate": 8.2461435973902e-07, + "loss": 0.0029, + "step": 1156 + }, + { + "epoch": 0.8705793829947329, + "grad_norm": 0.01816386543214321, + "learning_rate": 8.152401223335993e-07, + "loss": 0.0007, + "step": 1157 + }, + { + "epoch": 0.871331828442438, + "grad_norm": 13.75079345703125, + "learning_rate": 8.059172085608535e-07, + "loss": 0.1557, + "step": 1158 + }, + { + "epoch": 0.872084273890143, + "grad_norm": 0.01980912685394287, + "learning_rate": 7.966456705164094e-07, + "loss": 0.0008, + "step": 1159 + }, + { + "epoch": 0.872836719337848, + "grad_norm": 0.016307447105646133, + "learning_rate": 7.874255600088043e-07, + "loss": 0.0006, + "step": 1160 + }, + { + "epoch": 0.873589164785553, + "grad_norm": 0.11552978307008743, + "learning_rate": 7.78256928559209e-07, + "loss": 0.0054, + "step": 1161 + }, + { + "epoch": 0.8743416102332581, + "grad_norm": 0.3584482967853546, + "learning_rate": 7.69139827401132e-07, + "loss": 0.0154, + "step": 1162 + }, + { + "epoch": 0.8750940556809631, + "grad_norm": 0.07163400202989578, + "learning_rate": 7.600743074801353e-07, + "loss": 0.0036, + "step": 1163 + }, + { + "epoch": 0.8758465011286681, + "grad_norm": 0.01637900248169899, + "learning_rate": 7.510604194535487e-07, + "loss": 0.0007, + "step": 1164 + }, + { + "epoch": 0.8765989465763732, + "grad_norm": 0.017222406342625618, + "learning_rate": 7.420982136901888e-07, + "loss": 0.0007, + "step": 1165 + }, + { + "epoch": 0.8773513920240783, + "grad_norm": 0.032630909234285355, + "learning_rate": 7.331877402700737e-07, + "loss": 0.0008, + "step": 1166 + }, + { + "epoch": 0.8781038374717833, + "grad_norm": 0.1266675889492035, + "learning_rate": 7.243290489841493e-07, + "loss": 0.0061, + "step": 1167 + }, + { + "epoch": 0.8788562829194884, + "grad_norm": 0.0033598667941987514, + "learning_rate": 7.155221893340036e-07, + "loss": 0.0001, + "step": 1168 + }, + { + "epoch": 0.8796087283671934, + "grad_norm": 0.004325724206864834, + "learning_rate": 7.067672105315981e-07, + "loss": 0.0001, + "step": 1169 + }, + { + "epoch": 0.8803611738148984, + "grad_norm": 0.009192912839353085, + "learning_rate": 6.980641614989847e-07, + "loss": 0.0004, + "step": 1170 + }, + { + "epoch": 0.8811136192626035, + "grad_norm": 0.037033502012491226, + "learning_rate": 6.894130908680396e-07, + "loss": 0.0016, + "step": 1171 + }, + { + "epoch": 0.8818660647103085, + "grad_norm": 9.960691452026367, + "learning_rate": 6.808140469801872e-07, + "loss": 0.137, + "step": 1172 + }, + { + "epoch": 0.8826185101580135, + "grad_norm": 1.7104170322418213, + "learning_rate": 6.722670778861284e-07, + "loss": 0.0401, + "step": 1173 + }, + { + "epoch": 0.8833709556057185, + "grad_norm": 0.0026178702246397734, + "learning_rate": 6.637722313455774e-07, + "loss": 0.0001, + "step": 1174 + }, + { + "epoch": 0.8841234010534236, + "grad_norm": 0.03204850107431412, + "learning_rate": 6.553295548269922e-07, + "loss": 0.0011, + "step": 1175 + }, + { + "epoch": 0.8848758465011287, + "grad_norm": 0.03557371720671654, + "learning_rate": 6.469390955073073e-07, + "loss": 0.0012, + "step": 1176 + }, + { + "epoch": 0.8856282919488337, + "grad_norm": 0.3740195035934448, + "learning_rate": 6.386009002716776e-07, + "loss": 0.0097, + "step": 1177 + }, + { + "epoch": 0.8863807373965388, + "grad_norm": 0.06629537791013718, + "learning_rate": 6.303150157132044e-07, + "loss": 0.0027, + "step": 1178 + }, + { + "epoch": 0.8871331828442438, + "grad_norm": 0.0544668547809124, + "learning_rate": 6.22081488132682e-07, + "loss": 0.0024, + "step": 1179 + }, + { + "epoch": 0.8878856282919488, + "grad_norm": 3.0258607864379883, + "learning_rate": 6.139003635383433e-07, + "loss": 0.0818, + "step": 1180 + }, + { + "epoch": 0.8886380737396539, + "grad_norm": 3.636326313018799, + "learning_rate": 6.057716876455932e-07, + "loss": 0.0272, + "step": 1181 + }, + { + "epoch": 0.8893905191873589, + "grad_norm": 2.166203260421753, + "learning_rate": 5.976955058767609e-07, + "loss": 0.2443, + "step": 1182 + }, + { + "epoch": 0.8901429646350639, + "grad_norm": 0.9411613345146179, + "learning_rate": 5.896718633608412e-07, + "loss": 0.0109, + "step": 1183 + }, + { + "epoch": 0.890895410082769, + "grad_norm": 0.05662931129336357, + "learning_rate": 5.81700804933244e-07, + "loss": 0.0028, + "step": 1184 + }, + { + "epoch": 0.891647855530474, + "grad_norm": 0.005325346253812313, + "learning_rate": 5.737823751355465e-07, + "loss": 0.0002, + "step": 1185 + }, + { + "epoch": 0.8924003009781791, + "grad_norm": 0.009086296893656254, + "learning_rate": 5.659166182152387e-07, + "loss": 0.0003, + "step": 1186 + }, + { + "epoch": 0.8931527464258842, + "grad_norm": 0.05443952605128288, + "learning_rate": 5.581035781254807e-07, + "loss": 0.0023, + "step": 1187 + }, + { + "epoch": 0.8939051918735892, + "grad_norm": 0.09654070436954498, + "learning_rate": 5.503432985248558e-07, + "loss": 0.0041, + "step": 1188 + }, + { + "epoch": 0.8946576373212942, + "grad_norm": 0.022555604577064514, + "learning_rate": 5.426358227771245e-07, + "loss": 0.0008, + "step": 1189 + }, + { + "epoch": 0.8954100827689992, + "grad_norm": 0.022699879482388496, + "learning_rate": 5.349811939509874e-07, + "loss": 0.0009, + "step": 1190 + }, + { + "epoch": 0.8961625282167043, + "grad_norm": 0.013877753168344498, + "learning_rate": 5.273794548198374e-07, + "loss": 0.0005, + "step": 1191 + }, + { + "epoch": 0.8969149736644093, + "grad_norm": 0.23715998232364655, + "learning_rate": 5.198306478615278e-07, + "loss": 0.0989, + "step": 1192 + }, + { + "epoch": 0.8976674191121143, + "grad_norm": 0.010840164497494698, + "learning_rate": 5.123348152581264e-07, + "loss": 0.0005, + "step": 1193 + }, + { + "epoch": 0.8984198645598194, + "grad_norm": 0.026456279680132866, + "learning_rate": 5.048919988956913e-07, + "loss": 0.0008, + "step": 1194 + }, + { + "epoch": 0.8991723100075244, + "grad_norm": 0.0075182537548244, + "learning_rate": 4.975022403640273e-07, + "loss": 0.0002, + "step": 1195 + }, + { + "epoch": 0.8999247554552295, + "grad_norm": 0.028369707986712456, + "learning_rate": 4.901655809564543e-07, + "loss": 0.0011, + "step": 1196 + }, + { + "epoch": 0.9006772009029346, + "grad_norm": 0.029247762635350227, + "learning_rate": 4.828820616695873e-07, + "loss": 0.0012, + "step": 1197 + }, + { + "epoch": 0.9014296463506396, + "grad_norm": 0.011888640001416206, + "learning_rate": 4.7565172320308886e-07, + "loss": 0.0004, + "step": 1198 + }, + { + "epoch": 0.9021820917983446, + "grad_norm": 0.20360657572746277, + "learning_rate": 4.684746059594558e-07, + "loss": 0.0057, + "step": 1199 + }, + { + "epoch": 0.9029345372460497, + "grad_norm": 0.020525943487882614, + "learning_rate": 4.6135075004379193e-07, + "loss": 0.0005, + "step": 1200 + }, + { + "epoch": 0.9036869826937547, + "grad_norm": 5.512493133544922, + "learning_rate": 4.542801952635789e-07, + "loss": 0.0409, + "step": 1201 + }, + { + "epoch": 0.9044394281414597, + "grad_norm": 0.041713930666446686, + "learning_rate": 4.472629811284568e-07, + "loss": 0.0014, + "step": 1202 + }, + { + "epoch": 0.9051918735891648, + "grad_norm": 0.023644356057047844, + "learning_rate": 4.4029914685000176e-07, + "loss": 0.0011, + "step": 1203 + }, + { + "epoch": 0.9059443190368698, + "grad_norm": 0.009974486194550991, + "learning_rate": 4.333887313415097e-07, + "loss": 0.0004, + "step": 1204 + }, + { + "epoch": 0.9066967644845748, + "grad_norm": 10.814530372619629, + "learning_rate": 4.265317732177787e-07, + "loss": 0.0471, + "step": 1205 + }, + { + "epoch": 0.90744920993228, + "grad_norm": 0.04487287253141403, + "learning_rate": 4.1972831079488354e-07, + "loss": 0.0012, + "step": 1206 + }, + { + "epoch": 0.908201655379985, + "grad_norm": 0.304022878408432, + "learning_rate": 4.129783820899802e-07, + "loss": 0.1235, + "step": 1207 + }, + { + "epoch": 0.90895410082769, + "grad_norm": 0.08795291930437088, + "learning_rate": 4.0628202482107747e-07, + "loss": 0.004, + "step": 1208 + }, + { + "epoch": 0.909706546275395, + "grad_norm": 0.07199563086032867, + "learning_rate": 3.9963927640683243e-07, + "loss": 0.0035, + "step": 1209 + }, + { + "epoch": 0.9104589917231001, + "grad_norm": 0.021918121725320816, + "learning_rate": 3.930501739663406e-07, + "loss": 0.0009, + "step": 1210 + }, + { + "epoch": 0.9112114371708051, + "grad_norm": 0.10286161303520203, + "learning_rate": 3.865147543189296e-07, + "loss": 0.0036, + "step": 1211 + }, + { + "epoch": 0.9119638826185101, + "grad_norm": 0.25747624039649963, + "learning_rate": 3.8003305398394916e-07, + "loss": 0.1216, + "step": 1212 + }, + { + "epoch": 0.9127163280662152, + "grad_norm": 1.8191041946411133, + "learning_rate": 3.7360510918057256e-07, + "loss": 0.021, + "step": 1213 + }, + { + "epoch": 0.9134687735139202, + "grad_norm": 0.15397605299949646, + "learning_rate": 3.672309558275922e-07, + "loss": 0.0057, + "step": 1214 + }, + { + "epoch": 0.9142212189616253, + "grad_norm": 0.03293481469154358, + "learning_rate": 3.6091062954321634e-07, + "loss": 0.0016, + "step": 1215 + }, + { + "epoch": 0.9149736644093304, + "grad_norm": 0.002885388908907771, + "learning_rate": 3.5464416564487734e-07, + "loss": 0.0001, + "step": 1216 + }, + { + "epoch": 0.9157261098570354, + "grad_norm": 0.02011510170996189, + "learning_rate": 3.484315991490261e-07, + "loss": 0.0006, + "step": 1217 + }, + { + "epoch": 0.9164785553047404, + "grad_norm": 0.0348239503800869, + "learning_rate": 3.422729647709355e-07, + "loss": 0.0015, + "step": 1218 + }, + { + "epoch": 0.9172310007524455, + "grad_norm": 0.001900368370115757, + "learning_rate": 3.361682969245161e-07, + "loss": 0.0001, + "step": 1219 + }, + { + "epoch": 0.9179834462001505, + "grad_norm": 0.013635323382914066, + "learning_rate": 3.3011762972211647e-07, + "loss": 0.0006, + "step": 1220 + }, + { + "epoch": 0.9187358916478555, + "grad_norm": 0.5103453397750854, + "learning_rate": 3.241209969743353e-07, + "loss": 0.0088, + "step": 1221 + }, + { + "epoch": 0.9194883370955605, + "grad_norm": 0.03513436019420624, + "learning_rate": 3.181784321898285e-07, + "loss": 0.001, + "step": 1222 + }, + { + "epoch": 0.9202407825432656, + "grad_norm": 0.019935129210352898, + "learning_rate": 3.1228996857512795e-07, + "loss": 0.0007, + "step": 1223 + }, + { + "epoch": 0.9209932279909706, + "grad_norm": 0.0077493940480053425, + "learning_rate": 3.064556390344542e-07, + "loss": 0.0003, + "step": 1224 + }, + { + "epoch": 0.9217456734386757, + "grad_norm": 1.4610238075256348, + "learning_rate": 3.0067547616952297e-07, + "loss": 0.0216, + "step": 1225 + }, + { + "epoch": 0.9224981188863808, + "grad_norm": 0.013395492918789387, + "learning_rate": 2.949495122793833e-07, + "loss": 0.0005, + "step": 1226 + }, + { + "epoch": 0.9232505643340858, + "grad_norm": 0.022489206865429878, + "learning_rate": 2.892777793602175e-07, + "loss": 0.0009, + "step": 1227 + }, + { + "epoch": 0.9240030097817908, + "grad_norm": 0.04702683165669441, + "learning_rate": 2.836603091051704e-07, + "loss": 0.0019, + "step": 1228 + }, + { + "epoch": 0.9247554552294959, + "grad_norm": 0.34132277965545654, + "learning_rate": 2.7809713290417486e-07, + "loss": 0.012, + "step": 1229 + }, + { + "epoch": 0.9255079006772009, + "grad_norm": 1.5338892936706543, + "learning_rate": 2.7258828184377086e-07, + "loss": 0.1041, + "step": 1230 + }, + { + "epoch": 0.9262603461249059, + "grad_norm": 0.00669435178861022, + "learning_rate": 2.6713378670693455e-07, + "loss": 0.0002, + "step": 1231 + }, + { + "epoch": 0.927012791572611, + "grad_norm": 9.580220222473145, + "learning_rate": 2.617336779729063e-07, + "loss": 0.1428, + "step": 1232 + }, + { + "epoch": 0.927765237020316, + "grad_norm": 3.141953945159912, + "learning_rate": 2.563879858170215e-07, + "loss": 0.0465, + "step": 1233 + }, + { + "epoch": 0.928517682468021, + "grad_norm": 0.04021593928337097, + "learning_rate": 2.5109674011053684e-07, + "loss": 0.0016, + "step": 1234 + }, + { + "epoch": 0.9292701279157262, + "grad_norm": 0.12653590738773346, + "learning_rate": 2.458599704204712e-07, + "loss": 0.0028, + "step": 1235 + }, + { + "epoch": 0.9300225733634312, + "grad_norm": 1.136615514755249, + "learning_rate": 2.406777060094345e-07, + "loss": 0.016, + "step": 1236 + }, + { + "epoch": 0.9307750188111362, + "grad_norm": 0.0015482566086575389, + "learning_rate": 2.3554997583546402e-07, + "loss": 0.0001, + "step": 1237 + }, + { + "epoch": 0.9315274642588413, + "grad_norm": 0.025700274854898453, + "learning_rate": 2.3047680855186716e-07, + "loss": 0.0012, + "step": 1238 + }, + { + "epoch": 0.9322799097065463, + "grad_norm": 0.03950847312808037, + "learning_rate": 2.2545823250705867e-07, + "loss": 0.0008, + "step": 1239 + }, + { + "epoch": 0.9330323551542513, + "grad_norm": 0.01254583802074194, + "learning_rate": 2.2049427574439953e-07, + "loss": 0.0005, + "step": 1240 + }, + { + "epoch": 0.9337848006019563, + "grad_norm": 0.1133909747004509, + "learning_rate": 2.1558496600204703e-07, + "loss": 0.0028, + "step": 1241 + }, + { + "epoch": 0.9345372460496614, + "grad_norm": 0.04309477657079697, + "learning_rate": 2.1073033071279057e-07, + "loss": 0.0021, + "step": 1242 + }, + { + "epoch": 0.9352896914973664, + "grad_norm": 2.5412564277648926, + "learning_rate": 2.059303970039106e-07, + "loss": 0.3464, + "step": 1243 + }, + { + "epoch": 0.9360421369450714, + "grad_norm": 0.013797776773571968, + "learning_rate": 2.011851916970109e-07, + "loss": 0.0006, + "step": 1244 + }, + { + "epoch": 0.9367945823927766, + "grad_norm": 0.0062391371466219425, + "learning_rate": 1.9649474130788438e-07, + "loss": 0.0002, + "step": 1245 + }, + { + "epoch": 0.9375470278404816, + "grad_norm": 0.4302297830581665, + "learning_rate": 1.9185907204635755e-07, + "loss": 0.1256, + "step": 1246 + }, + { + "epoch": 0.9382994732881866, + "grad_norm": 0.002676435513421893, + "learning_rate": 1.8727820981614407e-07, + "loss": 0.0001, + "step": 1247 + }, + { + "epoch": 0.9390519187358917, + "grad_norm": 5.231035232543945, + "learning_rate": 1.827521802146981e-07, + "loss": 0.4477, + "step": 1248 + }, + { + "epoch": 0.9398043641835967, + "grad_norm": 1.7769052982330322, + "learning_rate": 1.7828100853307884e-07, + "loss": 0.2043, + "step": 1249 + }, + { + "epoch": 0.9405568096313017, + "grad_norm": 0.005753179080784321, + "learning_rate": 1.7386471975579854e-07, + "loss": 0.0002, + "step": 1250 + }, + { + "epoch": 0.9413092550790068, + "grad_norm": 0.04713589698076248, + "learning_rate": 1.6950333856069369e-07, + "loss": 0.0013, + "step": 1251 + }, + { + "epoch": 0.9420617005267118, + "grad_norm": 0.042459528893232346, + "learning_rate": 1.651968893187783e-07, + "loss": 0.002, + "step": 1252 + }, + { + "epoch": 0.9428141459744168, + "grad_norm": 0.1957351565361023, + "learning_rate": 1.609453960941143e-07, + "loss": 0.0071, + "step": 1253 + }, + { + "epoch": 0.9435665914221218, + "grad_norm": 0.028360065072774887, + "learning_rate": 1.567488826436725e-07, + "loss": 0.0013, + "step": 1254 + }, + { + "epoch": 0.944319036869827, + "grad_norm": 1.9565247297286987, + "learning_rate": 1.526073724172028e-07, + "loss": 0.0938, + "step": 1255 + }, + { + "epoch": 0.945071482317532, + "grad_norm": 0.023253565654158592, + "learning_rate": 1.485208885570999e-07, + "loss": 0.0009, + "step": 1256 + }, + { + "epoch": 0.945823927765237, + "grad_norm": 0.005460272543132305, + "learning_rate": 1.4448945389827772e-07, + "loss": 0.0002, + "step": 1257 + }, + { + "epoch": 0.9465763732129421, + "grad_norm": 0.07867200672626495, + "learning_rate": 1.4051309096803967e-07, + "loss": 0.0031, + "step": 1258 + }, + { + "epoch": 0.9473288186606471, + "grad_norm": 0.011308335699141026, + "learning_rate": 1.36591821985953e-07, + "loss": 0.0004, + "step": 1259 + }, + { + "epoch": 0.9480812641083521, + "grad_norm": 0.7972699999809265, + "learning_rate": 1.3272566886372572e-07, + "loss": 0.0923, + "step": 1260 + }, + { + "epoch": 0.9488337095560572, + "grad_norm": 0.01437709853053093, + "learning_rate": 1.2891465320508113e-07, + "loss": 0.0006, + "step": 1261 + }, + { + "epoch": 0.9495861550037622, + "grad_norm": 0.021911505609750748, + "learning_rate": 1.2515879630564108e-07, + "loss": 0.0009, + "step": 1262 + }, + { + "epoch": 0.9503386004514672, + "grad_norm": 0.018048502504825592, + "learning_rate": 1.2145811915280414e-07, + "loss": 0.0009, + "step": 1263 + }, + { + "epoch": 0.9510910458991723, + "grad_norm": 0.01497666072100401, + "learning_rate": 1.1781264242562984e-07, + "loss": 0.0006, + "step": 1264 + }, + { + "epoch": 0.9518434913468774, + "grad_norm": 0.01695135422050953, + "learning_rate": 1.1422238649472228e-07, + "loss": 0.0005, + "step": 1265 + }, + { + "epoch": 0.9525959367945824, + "grad_norm": 0.05780097469687462, + "learning_rate": 1.1068737142211683e-07, + "loss": 0.0023, + "step": 1266 + }, + { + "epoch": 0.9533483822422875, + "grad_norm": 0.4244280457496643, + "learning_rate": 1.072076169611691e-07, + "loss": 0.0127, + "step": 1267 + }, + { + "epoch": 0.9541008276899925, + "grad_norm": 0.28837308287620544, + "learning_rate": 1.0378314255643951e-07, + "loss": 0.0074, + "step": 1268 + }, + { + "epoch": 0.9548532731376975, + "grad_norm": 0.05638807266950607, + "learning_rate": 1.004139673435922e-07, + "loss": 0.0019, + "step": 1269 + }, + { + "epoch": 0.9556057185854026, + "grad_norm": 0.025965796783566475, + "learning_rate": 9.7100110149283e-08, + "loss": 0.0011, + "step": 1270 + }, + { + "epoch": 0.9563581640331076, + "grad_norm": 0.00950747448951006, + "learning_rate": 9.384158949105382e-08, + "loss": 0.0004, + "step": 1271 + }, + { + "epoch": 0.9571106094808126, + "grad_norm": 0.4918772280216217, + "learning_rate": 9.063842357723284e-08, + "loss": 0.0149, + "step": 1272 + }, + { + "epoch": 0.9578630549285176, + "grad_norm": 4.298508167266846, + "learning_rate": 8.749063030683125e-08, + "loss": 0.0998, + "step": 1273 + }, + { + "epoch": 0.9586155003762227, + "grad_norm": 4.235317230224609, + "learning_rate": 8.439822726943991e-08, + "loss": 0.2552, + "step": 1274 + }, + { + "epoch": 0.9593679458239278, + "grad_norm": 0.04486255720257759, + "learning_rate": 8.136123174513843e-08, + "loss": 0.0019, + "step": 1275 + }, + { + "epoch": 0.9601203912716328, + "grad_norm": 0.004140100441873074, + "learning_rate": 7.837966070438851e-08, + "loss": 0.0001, + "step": 1276 + }, + { + "epoch": 0.9608728367193379, + "grad_norm": 0.01324907224625349, + "learning_rate": 7.54535308079507e-08, + "loss": 0.0006, + "step": 1277 + }, + { + "epoch": 0.9616252821670429, + "grad_norm": 0.04146108031272888, + "learning_rate": 7.258285840677893e-08, + "loss": 0.0018, + "step": 1278 + }, + { + "epoch": 0.9623777276147479, + "grad_norm": 0.008022352121770382, + "learning_rate": 6.976765954194165e-08, + "loss": 0.0003, + "step": 1279 + }, + { + "epoch": 0.963130173062453, + "grad_norm": 0.03538546711206436, + "learning_rate": 6.700794994452198e-08, + "loss": 0.0018, + "step": 1280 + }, + { + "epoch": 0.963882618510158, + "grad_norm": 2.600939989089966, + "learning_rate": 6.430374503553439e-08, + "loss": 0.5258, + "step": 1281 + }, + { + "epoch": 0.964635063957863, + "grad_norm": 0.13569378852844238, + "learning_rate": 6.165505992584142e-08, + "loss": 0.0053, + "step": 1282 + }, + { + "epoch": 0.9653875094055681, + "grad_norm": 0.004670869559049606, + "learning_rate": 5.9061909416059385e-08, + "loss": 0.0002, + "step": 1283 + }, + { + "epoch": 0.9661399548532731, + "grad_norm": 0.014692548662424088, + "learning_rate": 5.652430799648945e-08, + "loss": 0.0005, + "step": 1284 + }, + { + "epoch": 0.9668924003009782, + "grad_norm": 0.8171061873435974, + "learning_rate": 5.404226984702221e-08, + "loss": 0.1252, + "step": 1285 + }, + { + "epoch": 0.9676448457486833, + "grad_norm": 0.018849292770028114, + "learning_rate": 5.161580883707218e-08, + "loss": 0.0007, + "step": 1286 + }, + { + "epoch": 0.9683972911963883, + "grad_norm": 3.7725510597229004, + "learning_rate": 4.924493852549006e-08, + "loss": 0.2135, + "step": 1287 + }, + { + "epoch": 0.9691497366440933, + "grad_norm": 0.005706661846488714, + "learning_rate": 4.69296721604906e-08, + "loss": 0.0002, + "step": 1288 + }, + { + "epoch": 0.9699021820917983, + "grad_norm": 1.751198649406433, + "learning_rate": 4.4670022679579314e-08, + "loss": 0.0254, + "step": 1289 + }, + { + "epoch": 0.9706546275395034, + "grad_norm": 0.015075215138494968, + "learning_rate": 4.24660027094792e-08, + "loss": 0.0006, + "step": 1290 + }, + { + "epoch": 0.9714070729872084, + "grad_norm": 0.12630032002925873, + "learning_rate": 4.0317624566060806e-08, + "loss": 0.0049, + "step": 1291 + }, + { + "epoch": 0.9721595184349134, + "grad_norm": 0.01940903812646866, + "learning_rate": 3.822490025427339e-08, + "loss": 0.0005, + "step": 1292 + }, + { + "epoch": 0.9729119638826185, + "grad_norm": 0.025583157315850258, + "learning_rate": 3.618784146807497e-08, + "loss": 0.0011, + "step": 1293 + }, + { + "epoch": 0.9736644093303235, + "grad_norm": 3.0885636806488037, + "learning_rate": 3.42064595903735e-08, + "loss": 0.3399, + "step": 1294 + }, + { + "epoch": 0.9744168547780286, + "grad_norm": 0.04877516254782677, + "learning_rate": 3.2280765692956904e-08, + "loss": 0.0022, + "step": 1295 + }, + { + "epoch": 0.9751693002257337, + "grad_norm": 2.90549635887146, + "learning_rate": 3.0410770536432047e-08, + "loss": 0.2458, + "step": 1296 + }, + { + "epoch": 0.9759217456734387, + "grad_norm": 0.039560992270708084, + "learning_rate": 2.859648457016917e-08, + "loss": 0.0017, + "step": 1297 + }, + { + "epoch": 0.9766741911211437, + "grad_norm": 0.10429071635007858, + "learning_rate": 2.6837917932238667e-08, + "loss": 0.0048, + "step": 1298 + }, + { + "epoch": 0.9774266365688488, + "grad_norm": 0.24316257238388062, + "learning_rate": 2.513508044935775e-08, + "loss": 0.0068, + "step": 1299 + }, + { + "epoch": 0.9781790820165538, + "grad_norm": 0.020967544987797737, + "learning_rate": 2.3487981636831635e-08, + "loss": 0.0006, + "step": 1300 + }, + { + "epoch": 0.9789315274642588, + "grad_norm": 0.05963461101055145, + "learning_rate": 2.189663069850578e-08, + "loss": 0.0026, + "step": 1301 + }, + { + "epoch": 0.9796839729119639, + "grad_norm": 0.022699451074004173, + "learning_rate": 2.0361036526707067e-08, + "loss": 0.001, + "step": 1302 + }, + { + "epoch": 0.9804364183596689, + "grad_norm": 2.274538278579712, + "learning_rate": 1.8881207702202696e-08, + "loss": 0.1143, + "step": 1303 + }, + { + "epoch": 0.9811888638073739, + "grad_norm": 0.12566229701042175, + "learning_rate": 1.7457152494145814e-08, + "loss": 0.0061, + "step": 1304 + }, + { + "epoch": 0.981941309255079, + "grad_norm": 0.03288928419351578, + "learning_rate": 1.6088878860032187e-08, + "loss": 0.0016, + "step": 1305 + }, + { + "epoch": 0.9826937547027841, + "grad_norm": 0.0259998869150877, + "learning_rate": 1.4776394445655818e-08, + "loss": 0.0009, + "step": 1306 + }, + { + "epoch": 0.9834462001504891, + "grad_norm": 1.0154011249542236, + "learning_rate": 1.3519706585063408e-08, + "loss": 0.0419, + "step": 1307 + }, + { + "epoch": 0.9841986455981941, + "grad_norm": 4.762252330780029, + "learning_rate": 1.231882230051662e-08, + "loss": 0.0761, + "step": 1308 + }, + { + "epoch": 0.9849510910458992, + "grad_norm": 5.905474662780762, + "learning_rate": 1.1173748302450993e-08, + "loss": 0.2441, + "step": 1309 + }, + { + "epoch": 0.9857035364936042, + "grad_norm": 4.897678852081299, + "learning_rate": 1.0084490989441531e-08, + "loss": 0.228, + "step": 1310 + }, + { + "epoch": 0.9864559819413092, + "grad_norm": 0.00369740417227149, + "learning_rate": 9.051056448160511e-09, + "loss": 0.0001, + "step": 1311 + }, + { + "epoch": 0.9872084273890143, + "grad_norm": 0.0034076508600264788, + "learning_rate": 8.07345045334973e-09, + "loss": 0.0001, + "step": 1312 + }, + { + "epoch": 0.9879608728367193, + "grad_norm": 0.007578455843031406, + "learning_rate": 7.151678467787193e-09, + "loss": 0.0003, + "step": 1313 + }, + { + "epoch": 0.9887133182844243, + "grad_norm": 0.1452741026878357, + "learning_rate": 6.285745642253816e-09, + "loss": 0.0057, + "step": 1314 + }, + { + "epoch": 0.9894657637321295, + "grad_norm": 0.007722517475485802, + "learning_rate": 5.475656815504549e-09, + "loss": 0.0003, + "step": 1315 + }, + { + "epoch": 0.9902182091798345, + "grad_norm": 0.04480676352977753, + "learning_rate": 4.721416514245069e-09, + "loss": 0.002, + "step": 1316 + }, + { + "epoch": 0.9909706546275395, + "grad_norm": 0.6687166094779968, + "learning_rate": 4.023028953106245e-09, + "loss": 0.1434, + "step": 1317 + }, + { + "epoch": 0.9917231000752446, + "grad_norm": 0.014944463968276978, + "learning_rate": 3.3804980346141547e-09, + "loss": 0.0006, + "step": 1318 + }, + { + "epoch": 0.9924755455229496, + "grad_norm": 0.01458480954170227, + "learning_rate": 2.7938273491756596e-09, + "loss": 0.0006, + "step": 1319 + }, + { + "epoch": 0.9932279909706546, + "grad_norm": 0.01680012419819832, + "learning_rate": 2.2630201750561965e-09, + "loss": 0.0008, + "step": 1320 + }, + { + "epoch": 0.9939804364183596, + "grad_norm": 0.054570991545915604, + "learning_rate": 1.7880794783575738e-09, + "loss": 0.0019, + "step": 1321 + }, + { + "epoch": 0.9947328818660647, + "grad_norm": 0.07745349407196045, + "learning_rate": 1.3690079130090905e-09, + "loss": 0.0017, + "step": 1322 + }, + { + "epoch": 0.9954853273137697, + "grad_norm": 9.53114128112793, + "learning_rate": 1.0058078207453303e-09, + "loss": 0.0771, + "step": 1323 + }, + { + "epoch": 0.9962377727614747, + "grad_norm": 0.049212612211704254, + "learning_rate": 6.984812310950606e-10, + "loss": 0.0018, + "step": 1324 + }, + { + "epoch": 0.9969902182091799, + "grad_norm": 0.004117061849683523, + "learning_rate": 4.470298613745705e-10, + "loss": 0.0002, + "step": 1325 + }, + { + "epoch": 0.9977426636568849, + "grad_norm": 0.06400679796934128, + "learning_rate": 2.514551166699075e-10, + "loss": 0.0028, + "step": 1326 + }, + { + "epoch": 0.9984951091045899, + "grad_norm": 1.3703467845916748, + "learning_rate": 1.1175808983687752e-10, + "loss": 0.0185, + "step": 1327 + }, + { + "epoch": 0.999247554552295, + "grad_norm": 0.01683868281543255, + "learning_rate": 2.793956148994248e-11, + "loss": 0.0007, + "step": 1328 + }, + { + "epoch": 1.0, + "grad_norm": 0.0036646851804107428, + "learning_rate": 0.0, + "loss": 0.0001, + "step": 1329 } ], "logging_steps": 1, - "max_steps": 1016, + "max_steps": 1329, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, @@ -7138,7 +9329,7 @@ "attributes": {} } }, - "total_flos": 3.5215451850761626e+17, + "total_flos": 4.439066769492296e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null