diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,9321 +1,2342 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 1.0, + "epoch": 1.99849510910459, "eval_steps": 500, - "global_step": 1329, + "global_step": 332, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.0007524454477050414, - "grad_norm": 0.006495347712188959, - "learning_rate": 1.999997206043851e-05, - "loss": 0.0004, + "epoch": 0.006019563581640331, + "grad_norm": 41.037254333496094, + "learning_rate": 1.4705882352941177e-06, + "loss": 4.2766, "step": 1 }, { - "epoch": 0.0015048908954100827, - "grad_norm": 0.05232545733451843, - "learning_rate": 1.9999888241910165e-05, - "loss": 0.0017, + "epoch": 0.012039127163280662, + "grad_norm": 43.19103240966797, + "learning_rate": 2.9411764705882355e-06, + "loss": 4.2655, "step": 2 }, { - "epoch": 0.002257336343115124, - "grad_norm": 0.016717378050088882, - "learning_rate": 1.999974854488333e-05, - "loss": 0.0009, + "epoch": 0.01805869074492099, + "grad_norm": 41.71216583251953, + "learning_rate": 4.411764705882353e-06, + "loss": 4.1574, "step": 3 }, { - "epoch": 0.0030097817908201654, - "grad_norm": 0.51008540391922, - "learning_rate": 1.9999552970138628e-05, - "loss": 0.0233, + "epoch": 0.024078254326561323, + "grad_norm": 51.17884063720703, + "learning_rate": 5.882352941176471e-06, + "loss": 3.3329, "step": 4 }, { - "epoch": 0.003762227238525207, - "grad_norm": 3.963065682910383e-05, - "learning_rate": 1.999930151876891e-05, - "loss": 0.0, + "epoch": 0.030097817908201655, + "grad_norm": 28.706247329711914, + "learning_rate": 7.3529411764705884e-06, + "loss": 2.0066, "step": 5 }, { - "epoch": 0.004514672686230248, - "grad_norm": 8.221525604312774e-06, - "learning_rate": 1.9998994192179256e-05, - "loss": 0.0, + "epoch": 0.03611738148984198, + "grad_norm": 19.702205657958984, + "learning_rate": 8.823529411764707e-06, + "loss": 1.2989, "step": 6 }, { - "epoch": 0.005267118133935289, - "grad_norm": 0.019136814400553703, - "learning_rate": 1.999863099208699e-05, - "loss": 0.0005, + "epoch": 0.042136945071482315, + "grad_norm": 10.646201133728027, + "learning_rate": 1.0294117647058824e-05, + "loss": 0.4697, "step": 7 }, { - "epoch": 0.006019563581640331, - "grad_norm": 0.0003651017614174634, - "learning_rate": 1.9998211920521646e-05, - "loss": 0.0, + "epoch": 0.04815650865312265, + "grad_norm": 7.015563488006592, + "learning_rate": 1.1764705882352942e-05, + "loss": 0.167, "step": 8 }, { - "epoch": 0.006772009029345372, - "grad_norm": 0.0008514254004694521, - "learning_rate": 1.9997736979824944e-05, - "loss": 0.0, + "epoch": 0.05417607223476298, + "grad_norm": 2.405210494995117, + "learning_rate": 1.323529411764706e-05, + "loss": 0.054, "step": 9 }, { - "epoch": 0.007524454477050414, - "grad_norm": 0.005437616258859634, - "learning_rate": 1.9997206172650826e-05, - "loss": 0.0002, + "epoch": 0.06019563581640331, + "grad_norm": 2.9235267639160156, + "learning_rate": 1.4705882352941177e-05, + "loss": 0.1401, "step": 10 }, { - "epoch": 0.008276899924755455, - "grad_norm": 0.5016034245491028, - "learning_rate": 1.9996619501965385e-05, - "loss": 0.0063, + "epoch": 0.06621519939804364, + "grad_norm": 3.1382505893707275, + "learning_rate": 1.6176470588235296e-05, + "loss": 0.0757, "step": 11 }, { - "epoch": 0.009029345372460496, - "grad_norm": 0.0040865750052034855, - "learning_rate": 1.9995976971046896e-05, - "loss": 0.0001, + "epoch": 0.07223476297968397, + "grad_norm": 2.7751779556274414, + "learning_rate": 1.7647058823529414e-05, + "loss": 0.089, "step": 12 }, { - "epoch": 0.009781790820165538, - "grad_norm": 0.06221418455243111, - "learning_rate": 1.9995278583485755e-05, - "loss": 0.0013, + "epoch": 0.0782543265613243, + "grad_norm": 1.7391453981399536, + "learning_rate": 1.9117647058823528e-05, + "loss": 0.11, "step": 13 }, { - "epoch": 0.010534236267870579, - "grad_norm": 0.34792619943618774, - "learning_rate": 1.9994524343184494e-05, - "loss": 0.0127, + "epoch": 0.08427389014296463, + "grad_norm": 2.010361671447754, + "learning_rate": 2.058823529411765e-05, + "loss": 0.1479, "step": 14 }, { - "epoch": 0.011286681715575621, - "grad_norm": 0.0, - "learning_rate": 1.999371425435775e-05, - "loss": 0.0, + "epoch": 0.09029345372460497, + "grad_norm": 3.3061070442199707, + "learning_rate": 2.2058823529411766e-05, + "loss": 0.0665, "step": 15 }, { - "epoch": 0.012039127163280662, - "grad_norm": 7.092195510864258, - "learning_rate": 1.9992848321532213e-05, - "loss": 0.5353, + "epoch": 0.0963130173062453, + "grad_norm": 2.8843464851379395, + "learning_rate": 2.3529411764705884e-05, + "loss": 0.074, "step": 16 }, { - "epoch": 0.012791572610985704, - "grad_norm": 0.0006019376451149583, - "learning_rate": 1.9991926549546653e-05, - "loss": 0.0, + "epoch": 0.10233258088788563, + "grad_norm": 0.8146764039993286, + "learning_rate": 2.5e-05, + "loss": 0.0315, "step": 17 }, { - "epoch": 0.013544018058690745, - "grad_norm": 0.026538817211985588, - "learning_rate": 1.9990948943551843e-05, - "loss": 0.0006, + "epoch": 0.10835214446952596, + "grad_norm": 2.45939040184021, + "learning_rate": 2.647058823529412e-05, + "loss": 0.0863, "step": 18 }, { - "epoch": 0.014296463506395787, - "grad_norm": 0.0013464675284922123, - "learning_rate": 1.998991550901056e-05, - "loss": 0.0, + "epoch": 0.1143717080511663, + "grad_norm": 2.4333105087280273, + "learning_rate": 2.7941176470588236e-05, + "loss": 0.1191, "step": 19 }, { - "epoch": 0.015048908954100828, - "grad_norm": 5.049379825592041, - "learning_rate": 1.998882625169755e-05, - "loss": 0.2596, + "epoch": 0.12039127163280662, + "grad_norm": 1.5534690618515015, + "learning_rate": 2.9411764705882354e-05, + "loss": 0.1213, "step": 20 }, { - "epoch": 0.01580135440180587, - "grad_norm": 0.0, - "learning_rate": 1.9987681177699486e-05, - "loss": 0.0, + "epoch": 0.12641083521444696, + "grad_norm": 1.2200188636779785, + "learning_rate": 3.0882352941176475e-05, + "loss": 0.0619, "step": 21 }, { - "epoch": 0.01655379984951091, - "grad_norm": 0.0007531401934102178, - "learning_rate": 1.9986480293414938e-05, - "loss": 0.0, + "epoch": 0.13243039879608728, + "grad_norm": 1.9440735578536987, + "learning_rate": 3.235294117647059e-05, + "loss": 0.1123, "step": 22 }, { - "epoch": 0.01730624529721595, - "grad_norm": 1.847428560256958, - "learning_rate": 1.9985223605554346e-05, - "loss": 0.1395, + "epoch": 0.1384499623777276, + "grad_norm": 1.5800230503082275, + "learning_rate": 3.382352941176471e-05, + "loss": 0.0655, "step": 23 }, { - "epoch": 0.01805869074492099, - "grad_norm": 0.013212837278842926, - "learning_rate": 1.998391112113997e-05, - "loss": 0.0001, + "epoch": 0.14446952595936793, + "grad_norm": 0.6280108690261841, + "learning_rate": 3.529411764705883e-05, + "loss": 0.0309, "step": 24 }, { - "epoch": 0.018811136192626036, - "grad_norm": 0.41415101289749146, - "learning_rate": 1.9982542847505858e-05, - "loss": 0.0105, + "epoch": 0.1504890895410083, + "grad_norm": 1.1837276220321655, + "learning_rate": 3.6764705882352945e-05, + "loss": 0.082, "step": 25 }, { - "epoch": 0.019563581640331076, - "grad_norm": 0.0014141664141789079, - "learning_rate": 1.99811187922978e-05, - "loss": 0.0001, + "epoch": 0.1565086531226486, + "grad_norm": 3.0979809761047363, + "learning_rate": 3.8235294117647055e-05, + "loss": 0.0754, "step": 26 }, { - "epoch": 0.020316027088036117, - "grad_norm": 0.002521197311580181, - "learning_rate": 1.9979638963473294e-05, - "loss": 0.0001, + "epoch": 0.16252821670428894, + "grad_norm": 0.919219434261322, + "learning_rate": 3.970588235294117e-05, + "loss": 0.0652, "step": 27 }, { - "epoch": 0.021068472535741158, - "grad_norm": 4.3378071784973145, - "learning_rate": 1.9978103369301495e-05, - "loss": 0.3246, + "epoch": 0.16854778028592926, + "grad_norm": 1.2674806118011475, + "learning_rate": 4.11764705882353e-05, + "loss": 0.0524, "step": 28 }, { - "epoch": 0.0218209179834462, - "grad_norm": 4.08248852181714e-05, - "learning_rate": 1.997651201836317e-05, - "loss": 0.0, + "epoch": 0.1745673438675696, + "grad_norm": 1.4973307847976685, + "learning_rate": 4.2647058823529415e-05, + "loss": 0.0739, "step": 29 }, { - "epoch": 0.022573363431151242, - "grad_norm": 0.006794753950089216, - "learning_rate": 1.9974864919550642e-05, - "loss": 0.0003, + "epoch": 0.18058690744920994, + "grad_norm": 1.3600691556930542, + "learning_rate": 4.411764705882353e-05, + "loss": 0.0932, "step": 30 }, { - "epoch": 0.023325808878856283, - "grad_norm": 0.05969928577542305, - "learning_rate": 1.9973162082067762e-05, - "loss": 0.0011, + "epoch": 0.18660647103085026, + "grad_norm": 0.6800034046173096, + "learning_rate": 4.558823529411765e-05, + "loss": 0.0525, "step": 31 }, { - "epoch": 0.024078254326561323, - "grad_norm": 0.07203614711761475, - "learning_rate": 1.9971403515429833e-05, - "loss": 0.002, + "epoch": 0.1926260346124906, + "grad_norm": 0.27061381936073303, + "learning_rate": 4.705882352941177e-05, + "loss": 0.0183, "step": 32 }, { - "epoch": 0.024830699774266364, - "grad_norm": 0.0011654079426079988, - "learning_rate": 1.996958922946357e-05, - "loss": 0.0001, + "epoch": 0.1986455981941309, + "grad_norm": 0.5821884870529175, + "learning_rate": 4.8529411764705885e-05, + "loss": 0.0342, "step": 33 }, { - "epoch": 0.025583145221971408, - "grad_norm": 0.002619029488414526, - "learning_rate": 1.9967719234307044e-05, - "loss": 0.0001, + "epoch": 0.20466516177577126, + "grad_norm": 1.5963926315307617, + "learning_rate": 5e-05, + "loss": 0.086, "step": 34 }, { - "epoch": 0.02633559066967645, - "grad_norm": 0.48534148931503296, - "learning_rate": 1.9965793540409628e-05, - "loss": 0.0088, + "epoch": 0.2106847253574116, + "grad_norm": 1.2303105592727661, + "learning_rate": 4.983221476510067e-05, + "loss": 0.1283, "step": 35 }, { - "epoch": 0.02708803611738149, - "grad_norm": 0.00325326737947762, - "learning_rate": 1.9963812158531926e-05, - "loss": 0.0001, + "epoch": 0.21670428893905191, + "grad_norm": 0.7925997376441956, + "learning_rate": 4.966442953020135e-05, + "loss": 0.0465, "step": 36 }, { - "epoch": 0.02784048156508653, - "grad_norm": 0.008502104319632053, - "learning_rate": 1.9961775099745727e-05, - "loss": 0.0003, + "epoch": 0.22272385252069224, + "grad_norm": 0.44674405455589294, + "learning_rate": 4.9496644295302015e-05, + "loss": 0.0208, "step": 37 }, { - "epoch": 0.028592927012791574, - "grad_norm": 0.9508514404296875, - "learning_rate": 1.995968237543394e-05, - "loss": 0.114, + "epoch": 0.2287434161023326, + "grad_norm": 1.129119873046875, + "learning_rate": 4.932885906040269e-05, + "loss": 0.1034, "step": 38 }, { - "epoch": 0.029345372460496615, - "grad_norm": 1.4725682735443115, - "learning_rate": 1.9957533997290524e-05, - "loss": 0.096, + "epoch": 0.23476297968397292, + "grad_norm": 0.747196614742279, + "learning_rate": 4.9161073825503354e-05, + "loss": 0.1117, "step": 39 }, { - "epoch": 0.030097817908201655, - "grad_norm": 0.1169065311551094, - "learning_rate": 1.9955329977320422e-05, - "loss": 0.0018, + "epoch": 0.24078254326561324, + "grad_norm": 1.0140711069107056, + "learning_rate": 4.8993288590604034e-05, + "loss": 0.0713, "step": 40 }, { - "epoch": 0.030850263355906696, - "grad_norm": 0.054003071039915085, - "learning_rate": 1.9953070327839513e-05, - "loss": 0.0012, + "epoch": 0.24680210684725357, + "grad_norm": 0.9150713086128235, + "learning_rate": 4.88255033557047e-05, + "loss": 0.1045, "step": 41 }, { - "epoch": 0.03160270880361174, - "grad_norm": 0.0011868480360135436, - "learning_rate": 1.9950755061474513e-05, - "loss": 0.0001, + "epoch": 0.2528216704288939, + "grad_norm": 0.7237759232521057, + "learning_rate": 4.865771812080537e-05, + "loss": 0.0399, "step": 42 }, { - "epoch": 0.03235515425131678, - "grad_norm": 0.27812469005584717, - "learning_rate": 1.9948384191162932e-05, - "loss": 0.0074, + "epoch": 0.2588412340105342, + "grad_norm": 0.4736149311065674, + "learning_rate": 4.848993288590604e-05, + "loss": 0.0283, "step": 43 }, { - "epoch": 0.03310759969902182, - "grad_norm": 2.0421926975250244, - "learning_rate": 1.994595773015298e-05, - "loss": 0.2558, + "epoch": 0.26486079759217457, + "grad_norm": 0.8596872091293335, + "learning_rate": 4.832214765100672e-05, + "loss": 0.0516, "step": 44 }, { - "epoch": 0.033860045146726865, - "grad_norm": 0.0036680996417999268, - "learning_rate": 1.9943475692003514e-05, - "loss": 0.0001, + "epoch": 0.2708803611738149, + "grad_norm": 0.8274044394493103, + "learning_rate": 4.8154362416107385e-05, + "loss": 0.0866, "step": 45 }, { - "epoch": 0.0346124905944319, - "grad_norm": 0.0018471762305125594, - "learning_rate": 1.994093809058394e-05, - "loss": 0.0001, + "epoch": 0.2768999247554552, + "grad_norm": 1.1380550861358643, + "learning_rate": 4.798657718120805e-05, + "loss": 0.0628, "step": 46 }, { - "epoch": 0.035364936042136946, - "grad_norm": 2.0573832988739014, - "learning_rate": 1.9938344940074162e-05, - "loss": 0.2316, + "epoch": 0.28291948833709557, + "grad_norm": 1.1349643468856812, + "learning_rate": 4.7818791946308725e-05, + "loss": 0.0997, "step": 47 }, { - "epoch": 0.03611738148984198, - "grad_norm": 0.02541675977408886, - "learning_rate": 1.9935696254964468e-05, - "loss": 0.0009, + "epoch": 0.28893905191873587, + "grad_norm": 1.2396087646484375, + "learning_rate": 4.76510067114094e-05, + "loss": 0.0668, "step": 48 }, { - "epoch": 0.03686982693754703, - "grad_norm": 0.039695482701063156, - "learning_rate": 1.9932992050055478e-05, - "loss": 0.001, + "epoch": 0.2949586155003762, + "grad_norm": 0.6159345507621765, + "learning_rate": 4.748322147651007e-05, + "loss": 0.0454, "step": 49 }, { - "epoch": 0.03762227238525207, - "grad_norm": 0.0008929629693739116, - "learning_rate": 1.993023234045806e-05, - "loss": 0.0001, + "epoch": 0.3009781790820166, + "grad_norm": 0.9823417663574219, + "learning_rate": 4.731543624161074e-05, + "loss": 0.0358, "step": 50 }, { - "epoch": 0.03837471783295711, - "grad_norm": 0.0004025468078907579, - "learning_rate": 1.992741714159322e-05, - "loss": 0.0, + "epoch": 0.30699774266365687, + "grad_norm": 1.3460859060287476, + "learning_rate": 4.714765100671141e-05, + "loss": 0.1146, "step": 51 }, { - "epoch": 0.03912716328066215, - "grad_norm": 0.012651098892092705, - "learning_rate": 1.992454646919205e-05, - "loss": 0.0006, + "epoch": 0.3130173062452972, + "grad_norm": 0.8716734647750854, + "learning_rate": 4.697986577181208e-05, + "loss": 0.0996, "step": 52 }, { - "epoch": 0.0398796087283672, - "grad_norm": 5.611824035644531, - "learning_rate": 1.9921620339295612e-05, - "loss": 0.0996, + "epoch": 0.3190368698269376, + "grad_norm": 0.8868650794029236, + "learning_rate": 4.6812080536912756e-05, + "loss": 0.0607, "step": 53 }, { - "epoch": 0.040632054176072234, - "grad_norm": 0.024261457845568657, - "learning_rate": 1.9918638768254865e-05, - "loss": 0.0008, + "epoch": 0.32505643340857787, + "grad_norm": 0.5762543678283691, + "learning_rate": 4.664429530201342e-05, + "loss": 0.0603, "step": 54 }, { - "epoch": 0.04138449962377728, - "grad_norm": 0.04541151970624924, - "learning_rate": 1.9915601772730562e-05, - "loss": 0.0013, + "epoch": 0.3310759969902182, + "grad_norm": 0.5473377704620361, + "learning_rate": 4.6476510067114095e-05, + "loss": 0.031, "step": 55 }, { - "epoch": 0.042136945071482315, - "grad_norm": 0.0012648458359763026, - "learning_rate": 1.9912509369693172e-05, - "loss": 0.0001, + "epoch": 0.3370955605718585, + "grad_norm": 0.4517374634742737, + "learning_rate": 4.630872483221477e-05, + "loss": 0.0318, "step": 56 }, { - "epoch": 0.04288939051918736, - "grad_norm": 0.009661543183028698, - "learning_rate": 1.990936157642277e-05, - "loss": 0.0004, + "epoch": 0.3431151241534989, + "grad_norm": 1.007686734199524, + "learning_rate": 4.6140939597315434e-05, + "loss": 0.0596, "step": 57 }, { - "epoch": 0.0436418359668924, - "grad_norm": 0.00047065879334695637, - "learning_rate": 1.990615841050895e-05, - "loss": 0.0, + "epoch": 0.3491346877351392, + "grad_norm": 0.5532180666923523, + "learning_rate": 4.597315436241611e-05, + "loss": 0.0917, "step": 58 }, { - "epoch": 0.04439428141459744, - "grad_norm": 2.226644992828369, - "learning_rate": 1.990289988985072e-05, - "loss": 0.0725, + "epoch": 0.3551542513167795, + "grad_norm": 0.6608918309211731, + "learning_rate": 4.580536912751678e-05, + "loss": 0.0768, "step": 59 }, { - "epoch": 0.045146726862302484, - "grad_norm": 0.007543186657130718, - "learning_rate": 1.9899586032656407e-05, - "loss": 0.0003, + "epoch": 0.3611738148984199, + "grad_norm": 0.9971833229064941, + "learning_rate": 4.5637583892617453e-05, + "loss": 0.0614, "step": 60 }, { - "epoch": 0.04589917231000752, - "grad_norm": 0.3191293478012085, - "learning_rate": 1.9896216857443563e-05, - "loss": 0.0149, + "epoch": 0.3671933784800602, + "grad_norm": 0.4296749532222748, + "learning_rate": 4.546979865771812e-05, + "loss": 0.0503, "step": 61 }, { - "epoch": 0.046651617757712566, - "grad_norm": 0.0007462618523277342, - "learning_rate": 1.989279238303883e-05, - "loss": 0.0, + "epoch": 0.3732129420617005, + "grad_norm": 0.5677506923675537, + "learning_rate": 4.530201342281879e-05, + "loss": 0.0644, "step": 62 }, { - "epoch": 0.04740406320541761, - "grad_norm": 0.1068795919418335, - "learning_rate": 1.9889312628577887e-05, - "loss": 0.0024, + "epoch": 0.3792325056433409, + "grad_norm": 0.6360709071159363, + "learning_rate": 4.5134228187919466e-05, + "loss": 0.0989, "step": 63 }, { - "epoch": 0.04815650865312265, - "grad_norm": 0.003478952683508396, - "learning_rate": 1.9885777613505278e-05, - "loss": 0.0002, + "epoch": 0.3852520692249812, + "grad_norm": 1.348215937614441, + "learning_rate": 4.496644295302014e-05, + "loss": 0.0967, "step": 64 }, { - "epoch": 0.04890895410082769, - "grad_norm": 0.002991155255585909, - "learning_rate": 1.988218735757437e-05, - "loss": 0.0001, + "epoch": 0.3912716328066215, + "grad_norm": 0.6315649747848511, + "learning_rate": 4.4798657718120805e-05, + "loss": 0.0489, "step": 65 }, { - "epoch": 0.04966139954853273, - "grad_norm": 0.003038755152374506, - "learning_rate": 1.98785418808472e-05, - "loss": 0.0001, + "epoch": 0.3972911963882618, + "grad_norm": 0.6150538921356201, + "learning_rate": 4.463087248322148e-05, + "loss": 0.0634, "step": 66 }, { - "epoch": 0.05041384499623777, - "grad_norm": 1.9981627464294434, - "learning_rate": 1.987484120369436e-05, - "loss": 0.5126, + "epoch": 0.4033107599699022, + "grad_norm": 0.3067854344844818, + "learning_rate": 4.446308724832215e-05, + "loss": 0.0181, "step": 67 }, { - "epoch": 0.051166290443942816, - "grad_norm": 0.1265110969543457, - "learning_rate": 1.9871085346794922e-05, - "loss": 0.0043, + "epoch": 0.40933032355154253, + "grad_norm": 0.7488537430763245, + "learning_rate": 4.4295302013422824e-05, + "loss": 0.0847, "step": 68 }, { - "epoch": 0.05191873589164785, - "grad_norm": 0.5709707736968994, - "learning_rate": 1.9867274331136276e-05, - "loss": 0.1178, + "epoch": 0.4153498871331828, + "grad_norm": 0.769372284412384, + "learning_rate": 4.412751677852349e-05, + "loss": 0.037, "step": 69 }, { - "epoch": 0.0526711813393529, - "grad_norm": 0.17010557651519775, - "learning_rate": 1.986340817801405e-05, - "loss": 0.0031, + "epoch": 0.4213694507148232, + "grad_norm": 0.9909029006958008, + "learning_rate": 4.395973154362416e-05, + "loss": 0.0417, "step": 70 }, { - "epoch": 0.05342362678705794, - "grad_norm": 0.1551416665315628, - "learning_rate": 1.985948690903196e-05, - "loss": 0.0035, + "epoch": 0.42738901429646353, + "grad_norm": 0.7407757043838501, + "learning_rate": 4.3791946308724836e-05, + "loss": 0.051, "step": 71 }, { - "epoch": 0.05417607223476298, - "grad_norm": 0.17882609367370605, - "learning_rate": 1.9855510546101725e-05, - "loss": 0.0066, + "epoch": 0.43340857787810383, + "grad_norm": 1.149268388748169, + "learning_rate": 4.36241610738255e-05, + "loss": 0.0409, "step": 72 }, { - "epoch": 0.05492851768246802, - "grad_norm": 0.014555556699633598, - "learning_rate": 1.9851479111442902e-05, - "loss": 0.0008, + "epoch": 0.4394281414597442, + "grad_norm": 0.9848851561546326, + "learning_rate": 4.3456375838926176e-05, + "loss": 0.0149, "step": 73 }, { - "epoch": 0.05568096313017306, - "grad_norm": 0.03164775297045708, - "learning_rate": 1.98473926275828e-05, - "loss": 0.0014, + "epoch": 0.4454477050413845, + "grad_norm": 1.4406760931015015, + "learning_rate": 4.328859060402685e-05, + "loss": 0.0762, "step": 74 }, { - "epoch": 0.056433408577878104, - "grad_norm": 1.0504040718078613, - "learning_rate": 1.984325111735633e-05, - "loss": 0.047, + "epoch": 0.45146726862302483, + "grad_norm": 1.003056526184082, + "learning_rate": 4.312080536912752e-05, + "loss": 0.0865, "step": 75 }, { - "epoch": 0.05718585402558315, - "grad_norm": 0.06992053985595703, - "learning_rate": 1.9839054603905887e-05, - "loss": 0.0025, + "epoch": 0.4574868322046652, + "grad_norm": 1.0864567756652832, + "learning_rate": 4.295302013422819e-05, + "loss": 0.0535, "step": 76 }, { - "epoch": 0.057938299473288185, - "grad_norm": 3.3065266609191895, - "learning_rate": 1.9834803110681223e-05, - "loss": 0.039, + "epoch": 0.4635063957863055, + "grad_norm": 1.5504230260849, + "learning_rate": 4.278523489932886e-05, + "loss": 0.0679, "step": 77 }, { - "epoch": 0.05869074492099323, - "grad_norm": 0.04477689415216446, - "learning_rate": 1.983049666143931e-05, - "loss": 0.0017, + "epoch": 0.46952595936794583, + "grad_norm": 1.389381766319275, + "learning_rate": 4.2617449664429534e-05, + "loss": 0.1011, "step": 78 }, { - "epoch": 0.059443190368698266, - "grad_norm": 0.027361512184143066, - "learning_rate": 1.9826135280244204e-05, - "loss": 0.0011, + "epoch": 0.47554552294958613, + "grad_norm": 0.06696069985628128, + "learning_rate": 4.244966442953021e-05, + "loss": 0.0017, "step": 79 }, { - "epoch": 0.06019563581640331, - "grad_norm": 0.2297053188085556, - "learning_rate": 1.9821718991466925e-05, - "loss": 0.0076, + "epoch": 0.4815650865312265, + "grad_norm": 0.8552239537239075, + "learning_rate": 4.228187919463087e-05, + "loss": 0.1052, "step": 80 }, { - "epoch": 0.060948081264108354, - "grad_norm": 0.015010291710495949, - "learning_rate": 1.9817247819785303e-05, - "loss": 0.0007, + "epoch": 0.48758465011286684, + "grad_norm": 1.8147671222686768, + "learning_rate": 4.2114093959731546e-05, + "loss": 0.0364, "step": 81 }, { - "epoch": 0.06170052671181339, - "grad_norm": 1.8525534868240356, - "learning_rate": 1.981272179018386e-05, - "loss": 0.3579, + "epoch": 0.49360421369450713, + "grad_norm": 0.7592940330505371, + "learning_rate": 4.194630872483222e-05, + "loss": 0.0373, "step": 82 }, { - "epoch": 0.062452972159518436, - "grad_norm": 0.02152411825954914, - "learning_rate": 1.9808140927953644e-05, - "loss": 0.0006, + "epoch": 0.4996237772761475, + "grad_norm": 0.7351986765861511, + "learning_rate": 4.1778523489932886e-05, + "loss": 0.1033, "step": 83 }, { - "epoch": 0.06320541760722348, - "grad_norm": 0.0057389941066503525, - "learning_rate": 1.9803505258692117e-05, - "loss": 0.0004, + "epoch": 0.5056433408577878, + "grad_norm": 0.3439026176929474, + "learning_rate": 4.161073825503356e-05, + "loss": 0.0166, "step": 84 }, { - "epoch": 0.06395786305492852, - "grad_norm": 0.021787824109196663, - "learning_rate": 1.9798814808302992e-05, - "loss": 0.0008, + "epoch": 0.5116629044394282, + "grad_norm": 0.41652995347976685, + "learning_rate": 4.144295302013423e-05, + "loss": 0.0591, "step": 85 }, { - "epoch": 0.06471030850263355, - "grad_norm": 1.793501853942871, - "learning_rate": 1.9794069602996093e-05, - "loss": 0.1386, + "epoch": 0.5176824680210684, + "grad_norm": 0.5505680441856384, + "learning_rate": 4.1275167785234905e-05, + "loss": 0.0719, "step": 86 }, { - "epoch": 0.0654627539503386, - "grad_norm": 0.9608948826789856, - "learning_rate": 1.9789269669287212e-05, - "loss": 0.126, + "epoch": 0.5237020316027088, + "grad_norm": 0.5010355114936829, + "learning_rate": 4.110738255033557e-05, + "loss": 0.0598, "step": 87 }, { - "epoch": 0.06621519939804364, - "grad_norm": 0.01728960871696472, - "learning_rate": 1.9784415033997955e-05, - "loss": 0.0008, + "epoch": 0.5297215951843491, + "grad_norm": 0.5710484385490417, + "learning_rate": 4.0939597315436244e-05, + "loss": 0.0354, "step": 88 }, { - "epoch": 0.06696764484574869, - "grad_norm": 0.0034939961042255163, - "learning_rate": 1.9779505724255602e-05, - "loss": 0.0002, + "epoch": 0.5357411587659895, + "grad_norm": 0.815994381904602, + "learning_rate": 4.077181208053692e-05, + "loss": 0.0583, "step": 89 }, { - "epoch": 0.06772009029345373, - "grad_norm": 0.27871963381767273, - "learning_rate": 1.9774541767492942e-05, - "loss": 0.1145, + "epoch": 0.5417607223476298, + "grad_norm": 0.5417527556419373, + "learning_rate": 4.060402684563759e-05, + "loss": 0.0551, "step": 90 }, { - "epoch": 0.06847253574115876, - "grad_norm": 0.020328784361481667, - "learning_rate": 1.9769523191448136e-05, - "loss": 0.0009, + "epoch": 0.5477802859292701, + "grad_norm": 0.6098296046257019, + "learning_rate": 4.0436241610738256e-05, + "loss": 0.0675, "step": 91 }, { - "epoch": 0.0692249811888638, - "grad_norm": 0.003004940692335367, - "learning_rate": 1.976445002416454e-05, - "loss": 0.0002, + "epoch": 0.5537998495109104, + "grad_norm": 1.0332790613174438, + "learning_rate": 4.026845637583892e-05, + "loss": 0.0726, "step": 92 }, { - "epoch": 0.06997742663656885, - "grad_norm": 0.051723234355449677, - "learning_rate": 1.975932229399057e-05, - "loss": 0.0021, + "epoch": 0.5598194130925508, + "grad_norm": 0.5562874674797058, + "learning_rate": 4.01006711409396e-05, + "loss": 0.0493, "step": 93 }, { - "epoch": 0.07072987208427389, - "grad_norm": 0.02858145534992218, - "learning_rate": 1.975414002957953e-05, - "loss": 0.0011, + "epoch": 0.5658389766741911, + "grad_norm": 0.8887888789176941, + "learning_rate": 3.993288590604027e-05, + "loss": 0.0924, "step": 94 }, { - "epoch": 0.07148231753197894, - "grad_norm": 0.076414555311203, - "learning_rate": 1.9748903259889466e-05, - "loss": 0.0024, + "epoch": 0.5718585402558315, + "grad_norm": 0.67585688829422, + "learning_rate": 3.976510067114094e-05, + "loss": 0.03, "step": 95 }, { - "epoch": 0.07223476297968397, - "grad_norm": 1.1670242547988892, - "learning_rate": 1.9743612014182982e-05, - "loss": 0.0559, + "epoch": 0.5778781038374717, + "grad_norm": 3.7685415744781494, + "learning_rate": 3.959731543624161e-05, + "loss": 0.0489, "step": 96 }, { - "epoch": 0.07298720842738901, - "grad_norm": 0.05960305780172348, - "learning_rate": 1.9738266322027094e-05, - "loss": 0.0026, + "epoch": 0.5838976674191121, + "grad_norm": 0.8312086462974548, + "learning_rate": 3.942953020134229e-05, + "loss": 0.071, "step": 97 }, { - "epoch": 0.07373965387509406, - "grad_norm": 0.4573524296283722, - "learning_rate": 1.9732866213293066e-05, - "loss": 0.082, + "epoch": 0.5899172310007524, + "grad_norm": 0.6857490539550781, + "learning_rate": 3.9261744966442954e-05, + "loss": 0.0166, "step": 98 }, { - "epoch": 0.0744920993227991, - "grad_norm": 0.02990127168595791, - "learning_rate": 1.972741171815623e-05, - "loss": 0.0011, + "epoch": 0.5959367945823928, + "grad_norm": 0.6559200882911682, + "learning_rate": 3.909395973154363e-05, + "loss": 0.1012, "step": 99 }, { - "epoch": 0.07524454477050414, - "grad_norm": 1.6312720775604248, - "learning_rate": 1.9721902867095828e-05, - "loss": 0.1075, + "epoch": 0.6019563581640331, + "grad_norm": 1.4492859840393066, + "learning_rate": 3.89261744966443e-05, + "loss": 0.076, "step": 100 }, { - "epoch": 0.07599699021820917, - "grad_norm": 0.3260236978530884, - "learning_rate": 1.9716339690894834e-05, - "loss": 0.0788, + "epoch": 0.6079759217456734, + "grad_norm": 0.7843137383460999, + "learning_rate": 3.875838926174497e-05, + "loss": 0.0577, "step": 101 }, { - "epoch": 0.07674943566591422, - "grad_norm": 0.0010308397468179464, - "learning_rate": 1.9710722220639785e-05, - "loss": 0.0001, + "epoch": 0.6139954853273137, + "grad_norm": 0.794602632522583, + "learning_rate": 3.859060402684564e-05, + "loss": 0.0626, "step": 102 }, { - "epoch": 0.07750188111361926, - "grad_norm": 0.04024788737297058, - "learning_rate": 1.9705050487720618e-05, - "loss": 0.0016, + "epoch": 0.6200150489089541, + "grad_norm": 1.4473546743392944, + "learning_rate": 3.8422818791946305e-05, + "loss": 0.0356, "step": 103 }, { - "epoch": 0.0782543265613243, - "grad_norm": 5.523307800292969, - "learning_rate": 1.969932452383048e-05, - "loss": 0.3277, + "epoch": 0.6260346124905944, + "grad_norm": 0.4639027416706085, + "learning_rate": 3.8255033557046985e-05, + "loss": 0.0259, "step": 104 }, { - "epoch": 0.07900677200902935, - "grad_norm": 2.359933853149414, - "learning_rate": 1.9693544360965548e-05, - "loss": 0.189, + "epoch": 0.6320541760722348, + "grad_norm": 1.1497997045516968, + "learning_rate": 3.808724832214765e-05, + "loss": 0.0516, "step": 105 }, { - "epoch": 0.0797592174567344, - "grad_norm": 0.6973968744277954, - "learning_rate": 1.9687710031424873e-05, - "loss": 0.0215, + "epoch": 0.6380737396538751, + "grad_norm": 0.327901691198349, + "learning_rate": 3.7919463087248324e-05, + "loss": 0.0405, "step": 106 }, { - "epoch": 0.08051166290443942, - "grad_norm": 0.01043291948735714, - "learning_rate": 1.9681821567810172e-05, - "loss": 0.0004, + "epoch": 0.6440933032355154, + "grad_norm": 0.4509243369102478, + "learning_rate": 3.775167785234899e-05, + "loss": 0.0892, "step": 107 }, { - "epoch": 0.08126410835214447, - "grad_norm": 0.0018412582576274872, - "learning_rate": 1.9675879003025668e-05, - "loss": 0.0001, + "epoch": 0.6501128668171557, + "grad_norm": 0.6975520849227905, + "learning_rate": 3.758389261744967e-05, + "loss": 0.0978, "step": 108 }, { - "epoch": 0.08201655379984951, - "grad_norm": 0.6676415205001831, - "learning_rate": 1.9669882370277885e-05, - "loss": 0.1539, + "epoch": 0.6561324303987961, + "grad_norm": 0.6053667664527893, + "learning_rate": 3.741610738255034e-05, + "loss": 0.0318, "step": 109 }, { - "epoch": 0.08276899924755456, - "grad_norm": 0.006418765056878328, - "learning_rate": 1.9663831703075488e-05, - "loss": 0.0004, + "epoch": 0.6621519939804364, + "grad_norm": 0.5161236524581909, + "learning_rate": 3.724832214765101e-05, + "loss": 0.0561, "step": 110 }, { - "epoch": 0.0835214446952596, - "grad_norm": 0.0075980802066624165, - "learning_rate": 1.9657727035229066e-05, - "loss": 0.0004, + "epoch": 0.6681715575620768, + "grad_norm": 0.4180920124053955, + "learning_rate": 3.7080536912751676e-05, + "loss": 0.0404, "step": 111 }, { - "epoch": 0.08427389014296463, - "grad_norm": 0.004072375129908323, - "learning_rate": 1.9651568400850976e-05, - "loss": 0.0001, + "epoch": 0.674191121143717, + "grad_norm": 0.4068116843700409, + "learning_rate": 3.6912751677852356e-05, + "loss": 0.0399, "step": 112 }, { - "epoch": 0.08502633559066967, - "grad_norm": 0.6514503359794617, - "learning_rate": 1.964535583435512e-05, - "loss": 0.0636, + "epoch": 0.6802106847253574, + "grad_norm": 0.25368958711624146, + "learning_rate": 3.674496644295302e-05, + "loss": 0.0374, "step": 113 }, { - "epoch": 0.08577878103837472, - "grad_norm": 0.6835861802101135, - "learning_rate": 1.9639089370456784e-05, - "loss": 0.1035, + "epoch": 0.6862302483069977, + "grad_norm": 0.4473256766796112, + "learning_rate": 3.6577181208053695e-05, + "loss": 0.0533, "step": 114 }, { - "epoch": 0.08653122648607976, - "grad_norm": 0.18389393389225006, - "learning_rate": 1.963276904417241e-05, - "loss": 0.0048, + "epoch": 0.6922498118886381, + "grad_norm": 0.39927905797958374, + "learning_rate": 3.640939597315436e-05, + "loss": 0.0367, "step": 115 }, { - "epoch": 0.0872836719337848, - "grad_norm": 0.002048420486971736, - "learning_rate": 1.962639489081943e-05, - "loss": 0.0001, + "epoch": 0.6982693754702785, + "grad_norm": 0.5100545883178711, + "learning_rate": 3.6241610738255034e-05, + "loss": 0.0841, "step": 116 }, { - "epoch": 0.08803611738148984, - "grad_norm": 0.010052050463855267, - "learning_rate": 1.9619966946016054e-05, - "loss": 0.0005, + "epoch": 0.7042889390519187, + "grad_norm": 1.113686203956604, + "learning_rate": 3.607382550335571e-05, + "loss": 0.0798, "step": 117 }, { - "epoch": 0.08878856282919488, - "grad_norm": 0.100750632584095, - "learning_rate": 1.9613485245681073e-05, - "loss": 0.0027, + "epoch": 0.710308502633559, + "grad_norm": 0.4927540123462677, + "learning_rate": 3.5906040268456373e-05, + "loss": 0.0201, "step": 118 }, { - "epoch": 0.08954100827689992, - "grad_norm": 0.04654483497142792, - "learning_rate": 1.960694982603366e-05, - "loss": 0.0019, + "epoch": 0.7163280662151994, + "grad_norm": 0.2962929904460907, + "learning_rate": 3.5738255033557046e-05, + "loss": 0.0413, "step": 119 }, { - "epoch": 0.09029345372460497, - "grad_norm": 0.01746903918683529, - "learning_rate": 1.960036072359317e-05, - "loss": 0.0006, + "epoch": 0.7223476297968398, + "grad_norm": 0.4307601749897003, + "learning_rate": 3.557046979865772e-05, + "loss": 0.022, "step": 120 }, { - "epoch": 0.09104589917231001, - "grad_norm": 0.05034410208463669, - "learning_rate": 1.9593717975178924e-05, - "loss": 0.002, + "epoch": 0.7283671933784801, + "grad_norm": 0.5823848247528076, + "learning_rate": 3.540268456375839e-05, + "loss": 0.0357, "step": 121 }, { - "epoch": 0.09179834462001504, - "grad_norm": 3.37355637550354, - "learning_rate": 1.958702161791002e-05, - "loss": 0.5772, + "epoch": 0.7343867569601203, + "grad_norm": 0.3515729010105133, + "learning_rate": 3.523489932885906e-05, + "loss": 0.016, "step": 122 }, { - "epoch": 0.09255079006772009, - "grad_norm": 0.0004869294061791152, - "learning_rate": 1.958027168920512e-05, - "loss": 0.0, + "epoch": 0.7404063205417607, + "grad_norm": 0.6808828115463257, + "learning_rate": 3.506711409395974e-05, + "loss": 0.0143, "step": 123 }, { - "epoch": 0.09330323551542513, - "grad_norm": 0.013796951621770859, - "learning_rate": 1.9573468226782224e-05, - "loss": 0.0006, + "epoch": 0.746425884123401, + "grad_norm": 0.7892507910728455, + "learning_rate": 3.4899328859060405e-05, + "loss": 0.0264, "step": 124 }, { - "epoch": 0.09405568096313018, - "grad_norm": 0.020256061106920242, - "learning_rate": 1.956661126865849e-05, - "loss": 0.0007, + "epoch": 0.7524454477050414, + "grad_norm": 1.1977708339691162, + "learning_rate": 3.473154362416108e-05, + "loss": 0.0229, "step": 125 }, { - "epoch": 0.09480812641083522, - "grad_norm": 0.4556258022785187, - "learning_rate": 1.9559700853149997e-05, - "loss": 0.007, + "epoch": 0.7584650112866818, + "grad_norm": 1.4794739484786987, + "learning_rate": 3.4563758389261744e-05, + "loss": 0.0504, "step": 126 }, { - "epoch": 0.09556057185854025, - "grad_norm": 1.9277138710021973, - "learning_rate": 1.9552737018871543e-05, - "loss": 0.0393, + "epoch": 0.764484574868322, + "grad_norm": 0.9014606475830078, + "learning_rate": 3.439597315436242e-05, + "loss": 0.0223, "step": 127 }, { - "epoch": 0.0963130173062453, - "grad_norm": 0.0008740609628148377, - "learning_rate": 1.954571980473642e-05, - "loss": 0.0001, + "epoch": 0.7705041384499624, + "grad_norm": 0.5018836855888367, + "learning_rate": 3.422818791946309e-05, + "loss": 0.0359, "step": 128 }, { - "epoch": 0.09706546275395034, - "grad_norm": 0.03682279586791992, - "learning_rate": 1.953864924995621e-05, - "loss": 0.0015, + "epoch": 0.7765237020316027, + "grad_norm": 1.3349637985229492, + "learning_rate": 3.4060402684563756e-05, + "loss": 0.0572, "step": 129 }, { - "epoch": 0.09781790820165538, - "grad_norm": 0.03125663846731186, - "learning_rate": 1.9531525394040546e-05, - "loss": 0.0008, + "epoch": 0.782543265613243, + "grad_norm": 1.1911202669143677, + "learning_rate": 3.389261744966443e-05, + "loss": 0.0956, "step": 130 }, { - "epoch": 0.09857035364936043, - "grad_norm": 0.0102821821346879, - "learning_rate": 1.9524348276796913e-05, - "loss": 0.0004, + "epoch": 0.7885628291948834, + "grad_norm": 2.8993449211120605, + "learning_rate": 3.37248322147651e-05, + "loss": 0.1212, "step": 131 }, { - "epoch": 0.09932279909706546, - "grad_norm": 0.07433848083019257, - "learning_rate": 1.9517117938330414e-05, - "loss": 0.0029, + "epoch": 0.7945823927765236, + "grad_norm": 0.6151400208473206, + "learning_rate": 3.3557046979865775e-05, + "loss": 0.0641, "step": 132 }, { - "epoch": 0.1000752445447705, - "grad_norm": 0.31897610425949097, - "learning_rate": 1.9509834419043544e-05, - "loss": 0.0816, + "epoch": 0.800601956358164, + "grad_norm": 1.7681182622909546, + "learning_rate": 3.338926174496644e-05, + "loss": 0.1288, "step": 133 }, { - "epoch": 0.10082768999247554, - "grad_norm": 0.042768437415361404, - "learning_rate": 1.9502497759635973e-05, - "loss": 0.0017, + "epoch": 0.8066215199398044, + "grad_norm": 1.9313393831253052, + "learning_rate": 3.3221476510067115e-05, + "loss": 0.122, "step": 134 }, { - "epoch": 0.10158013544018059, - "grad_norm": 0.27461105585098267, - "learning_rate": 1.9495108001104312e-05, - "loss": 0.0972, + "epoch": 0.8126410835214447, + "grad_norm": 0.7092230916023254, + "learning_rate": 3.305369127516779e-05, + "loss": 0.0395, "step": 135 }, { - "epoch": 0.10233258088788563, - "grad_norm": 0.012465243227779865, - "learning_rate": 1.9487665184741878e-05, - "loss": 0.0006, + "epoch": 0.8186606471030851, + "grad_norm": 0.6039671301841736, + "learning_rate": 3.288590604026846e-05, + "loss": 0.0517, "step": 136 }, { - "epoch": 0.10308502633559068, - "grad_norm": 0.042917292565107346, - "learning_rate": 1.9480169352138473e-05, - "loss": 0.0014, + "epoch": 0.8246802106847254, + "grad_norm": 0.6897003054618835, + "learning_rate": 3.271812080536913e-05, + "loss": 0.0507, "step": 137 }, { - "epoch": 0.1038374717832957, - "grad_norm": 0.06652813404798508, - "learning_rate": 1.9472620545180165e-05, - "loss": 0.0018, + "epoch": 0.8306997742663657, + "grad_norm": 0.05981295928359032, + "learning_rate": 3.25503355704698e-05, + "loss": 0.0023, "step": 138 }, { - "epoch": 0.10458991723100075, - "grad_norm": 3.4145336151123047, - "learning_rate": 1.9465018806049014e-05, - "loss": 0.0251, + "epoch": 0.836719337848006, + "grad_norm": 0.48830369114875793, + "learning_rate": 3.238255033557047e-05, + "loss": 0.0652, "step": 139 }, { - "epoch": 0.1053423626787058, - "grad_norm": 0.5462217926979065, - "learning_rate": 1.9457364177222877e-05, - "loss": 0.1197, + "epoch": 0.8427389014296464, + "grad_norm": 1.0506463050842285, + "learning_rate": 3.221476510067114e-05, + "loss": 0.0709, "step": 140 }, { - "epoch": 0.10609480812641084, - "grad_norm": 0.00809057243168354, - "learning_rate": 1.9449656701475147e-05, - "loss": 0.0004, + "epoch": 0.8487584650112867, + "grad_norm": 0.3859744668006897, + "learning_rate": 3.204697986577181e-05, + "loss": 0.0396, "step": 141 }, { - "epoch": 0.10684725357411588, - "grad_norm": 0.03647633641958237, - "learning_rate": 1.944189642187452e-05, - "loss": 0.0008, + "epoch": 0.8547780285929271, + "grad_norm": 0.768587052822113, + "learning_rate": 3.1879194630872485e-05, + "loss": 0.0599, "step": 142 }, { - "epoch": 0.10759969902182091, - "grad_norm": 0.007750968914479017, - "learning_rate": 1.9434083381784764e-05, - "loss": 0.0002, + "epoch": 0.8607975921745673, + "grad_norm": 0.6868757605552673, + "learning_rate": 3.171140939597316e-05, + "loss": 0.0373, "step": 143 }, { - "epoch": 0.10835214446952596, - "grad_norm": 0.002081293612718582, - "learning_rate": 1.9426217624864456e-05, - "loss": 0.0001, + "epoch": 0.8668171557562077, + "grad_norm": 0.7010259628295898, + "learning_rate": 3.1543624161073825e-05, + "loss": 0.0717, "step": 144 }, { - "epoch": 0.109104589917231, - "grad_norm": 2.682119131088257, - "learning_rate": 1.9418299195066755e-05, - "loss": 0.1876, + "epoch": 0.872836719337848, + "grad_norm": 0.5125268697738647, + "learning_rate": 3.13758389261745e-05, + "loss": 0.0457, "step": 145 }, { - "epoch": 0.10985703536493605, - "grad_norm": 0.21694301068782806, - "learning_rate": 1.9410328136639163e-05, - "loss": 0.0035, + "epoch": 0.8788562829194884, + "grad_norm": 0.9679777026176453, + "learning_rate": 3.120805369127517e-05, + "loss": 0.0988, "step": 146 }, { - "epoch": 0.11060948081264109, - "grad_norm": 0.020226623862981796, - "learning_rate": 1.940230449412324e-05, - "loss": 0.0006, + "epoch": 0.8848758465011287, + "grad_norm": 0.7588280439376831, + "learning_rate": 3.1040268456375844e-05, + "loss": 0.0691, "step": 147 }, { - "epoch": 0.11136192626034612, - "grad_norm": 0.006655423901975155, - "learning_rate": 1.939422831235441e-05, - "loss": 0.0002, + "epoch": 0.890895410082769, + "grad_norm": 0.4412769079208374, + "learning_rate": 3.087248322147651e-05, + "loss": 0.0243, "step": 148 }, { - "epoch": 0.11211437170805116, - "grad_norm": 0.011194628663361073, - "learning_rate": 1.938609963646166e-05, - "loss": 0.0005, + "epoch": 0.8969149736644093, + "grad_norm": 0.5840623378753662, + "learning_rate": 3.070469798657718e-05, + "loss": 0.0553, "step": 149 }, { - "epoch": 0.11286681715575621, - "grad_norm": 1.6491260528564453, - "learning_rate": 1.9377918511867318e-05, - "loss": 0.1711, + "epoch": 0.9029345372460497, + "grad_norm": 0.34683701395988464, + "learning_rate": 3.0536912751677856e-05, + "loss": 0.0568, "step": 150 }, { - "epoch": 0.11361926260346125, - "grad_norm": 0.20927202701568604, - "learning_rate": 1.9369684984286798e-05, - "loss": 0.0075, + "epoch": 0.90895410082769, + "grad_norm": 0.6545599102973938, + "learning_rate": 3.0369127516778522e-05, + "loss": 0.0523, "step": 151 }, { - "epoch": 0.1143717080511663, - "grad_norm": 0.10048650205135345, - "learning_rate": 1.9361399099728326e-05, - "loss": 0.0033, + "epoch": 0.9149736644093304, + "grad_norm": 0.3024606704711914, + "learning_rate": 3.02013422818792e-05, + "loss": 0.04, "step": 152 }, { - "epoch": 0.11512415349887133, - "grad_norm": 0.09928741306066513, - "learning_rate": 1.9353060904492694e-05, - "loss": 0.0038, + "epoch": 0.9209932279909706, + "grad_norm": 0.40984031558036804, + "learning_rate": 3.0033557046979865e-05, + "loss": 0.027, "step": 153 }, { - "epoch": 0.11587659894657637, - "grad_norm": 0.3446449935436249, - "learning_rate": 1.934467044517301e-05, - "loss": 0.0654, + "epoch": 0.927012791572611, + "grad_norm": 0.3794308602809906, + "learning_rate": 2.986577181208054e-05, + "loss": 0.0927, "step": 154 }, { - "epoch": 0.11662904439428141, - "grad_norm": 0.007841469720005989, - "learning_rate": 1.9336227768654424e-05, - "loss": 0.0004, + "epoch": 0.9330323551542513, + "grad_norm": 0.6882305145263672, + "learning_rate": 2.9697986577181207e-05, + "loss": 0.0343, "step": 155 }, { - "epoch": 0.11738148984198646, - "grad_norm": 0.24826376140117645, - "learning_rate": 1.9327732922113872e-05, - "loss": 0.006, + "epoch": 0.9390519187358917, + "grad_norm": 0.6028600335121155, + "learning_rate": 2.9530201342281884e-05, + "loss": 0.0479, "step": 156 }, { - "epoch": 0.1181339352896915, - "grad_norm": 2.012798547744751, - "learning_rate": 1.9319185953019817e-05, - "loss": 0.1098, + "epoch": 0.945071482317532, + "grad_norm": 3.9858436584472656, + "learning_rate": 2.936241610738255e-05, + "loss": 0.0709, "step": 157 }, { - "epoch": 0.11888638073739653, - "grad_norm": 0.02779470756649971, - "learning_rate": 1.9310586909131964e-05, - "loss": 0.0009, + "epoch": 0.9510910458991723, + "grad_norm": 0.193309485912323, + "learning_rate": 2.9194630872483227e-05, + "loss": 0.0098, "step": 158 }, { - "epoch": 0.11963882618510158, - "grad_norm": 3.675586462020874, - "learning_rate": 1.930193583850102e-05, - "loss": 0.5443, + "epoch": 0.9571106094808126, + "grad_norm": 0.6534713506698608, + "learning_rate": 2.9026845637583893e-05, + "loss": 0.0519, "step": 159 }, { - "epoch": 0.12039127163280662, - "grad_norm": 0.009048126637935638, - "learning_rate": 1.9293232789468403e-05, - "loss": 0.0005, + "epoch": 0.963130173062453, + "grad_norm": 0.27771925926208496, + "learning_rate": 2.885906040268457e-05, + "loss": 0.0238, "step": 160 }, { - "epoch": 0.12114371708051166, - "grad_norm": 0.00870896689593792, - "learning_rate": 1.9284477810666e-05, - "loss": 0.0004, + "epoch": 0.9691497366440933, + "grad_norm": 0.7335049510002136, + "learning_rate": 2.8691275167785235e-05, + "loss": 0.0677, "step": 161 }, { - "epoch": 0.12189616252821671, - "grad_norm": 0.01799067109823227, - "learning_rate": 1.9275670951015854e-05, - "loss": 0.0009, + "epoch": 0.9751693002257337, + "grad_norm": 1.1832383871078491, + "learning_rate": 2.8523489932885905e-05, + "loss": 0.1371, "step": 162 }, { - "epoch": 0.12264860797592174, - "grad_norm": 0.07708598673343658, - "learning_rate": 1.9266812259729927e-05, - "loss": 0.0032, + "epoch": 0.9811888638073739, + "grad_norm": 0.7754644155502319, + "learning_rate": 2.8355704697986578e-05, + "loss": 0.0619, "step": 163 }, { - "epoch": 0.12340105342362678, - "grad_norm": 0.012997242622077465, - "learning_rate": 1.9257901786309813e-05, - "loss": 0.0003, + "epoch": 0.9872084273890143, + "grad_norm": 0.6826126575469971, + "learning_rate": 2.8187919463087248e-05, + "loss": 0.0682, "step": 164 }, { - "epoch": 0.12415349887133183, - "grad_norm": 0.08036967366933823, - "learning_rate": 1.9248939580546453e-05, - "loss": 0.0021, + "epoch": 0.9932279909706546, + "grad_norm": 0.9121167659759521, + "learning_rate": 2.802013422818792e-05, + "loss": 0.0234, "step": 165 }, { - "epoch": 0.12490594431903687, - "grad_norm": 0.08968926221132278, - "learning_rate": 1.9239925692519867e-05, - "loss": 0.0036, + "epoch": 0.999247554552295, + "grad_norm": 0.6562134623527527, + "learning_rate": 2.785234899328859e-05, + "loss": 0.0615, "step": 166 }, { - "epoch": 0.1256583897667419, - "grad_norm": 0.10857436060905457, - "learning_rate": 1.923086017259887e-05, - "loss": 0.0045, + "epoch": 1.0052671181339352, + "grad_norm": 2.136812925338745, + "learning_rate": 2.7684563758389263e-05, + "loss": 0.0835, "step": 167 }, { - "epoch": 0.12641083521444696, - "grad_norm": 0.33034518361091614, - "learning_rate": 1.9221743071440792e-05, - "loss": 0.012, + "epoch": 1.0112866817155757, + "grad_norm": 0.430277019739151, + "learning_rate": 2.7516778523489933e-05, + "loss": 0.0205, "step": 168 }, { - "epoch": 0.127163280662152, - "grad_norm": 0.020488563925027847, - "learning_rate": 1.92125744399912e-05, - "loss": 0.0011, + "epoch": 1.017306245297216, + "grad_norm": 0.37437501549720764, + "learning_rate": 2.7348993288590606e-05, + "loss": 0.0147, "step": 169 }, { - "epoch": 0.12791572610985705, - "grad_norm": 1.1725200414657593, - "learning_rate": 1.9203354329483593e-05, - "loss": 0.2263, + "epoch": 1.0233258088788564, + "grad_norm": 0.09916182607412338, + "learning_rate": 2.7181208053691276e-05, + "loss": 0.0166, "step": 170 }, { - "epoch": 0.12866817155756208, - "grad_norm": 0.009000587277114391, - "learning_rate": 1.9194082791439146e-05, - "loss": 0.0005, + "epoch": 1.0293453724604966, + "grad_norm": 0.22763441503047943, + "learning_rate": 2.701342281879195e-05, + "loss": 0.0296, "step": 171 }, { - "epoch": 0.1294206170052671, - "grad_norm": 0.15425604581832886, - "learning_rate": 1.9184759877666403e-05, - "loss": 0.006, + "epoch": 1.0353649360421369, + "grad_norm": 0.3235589265823364, + "learning_rate": 2.6845637583892618e-05, + "loss": 0.0214, "step": 172 }, { - "epoch": 0.13017306245297217, - "grad_norm": 0.04585198312997818, - "learning_rate": 1.917538564026098e-05, - "loss": 0.0011, + "epoch": 1.0413844996237773, + "grad_norm": 0.09323552995920181, + "learning_rate": 2.6677852348993288e-05, + "loss": 0.0029, "step": 173 }, { - "epoch": 0.1309255079006772, - "grad_norm": 0.012005002237856388, - "learning_rate": 1.9165960131605304e-05, - "loss": 0.0007, + "epoch": 1.0474040632054176, + "grad_norm": 0.18400466442108154, + "learning_rate": 2.651006711409396e-05, + "loss": 0.0227, "step": 174 }, { - "epoch": 0.13167795334838225, - "grad_norm": 0.04149067774415016, - "learning_rate": 1.915648340436828e-05, - "loss": 0.0017, + "epoch": 1.053423626787058, + "grad_norm": 0.4601859450340271, + "learning_rate": 2.634228187919463e-05, + "loss": 0.0247, "step": 175 }, { - "epoch": 0.13243039879608728, - "grad_norm": 0.13023491203784943, - "learning_rate": 1.9146955511505035e-05, - "loss": 0.0047, + "epoch": 1.0594431903686983, + "grad_norm": 0.06925185769796371, + "learning_rate": 2.6174496644295304e-05, + "loss": 0.0028, "step": 176 }, { - "epoch": 0.13318284424379231, - "grad_norm": 1.0637022256851196, - "learning_rate": 1.91373765062566e-05, - "loss": 0.2339, + "epoch": 1.0654627539503385, + "grad_norm": 0.7103378772735596, + "learning_rate": 2.6006711409395973e-05, + "loss": 0.0679, "step": 177 }, { - "epoch": 0.13393528969149737, - "grad_norm": 0.6275419592857361, - "learning_rate": 1.9127746442149612e-05, - "loss": 0.1668, + "epoch": 1.071482317531979, + "grad_norm": 0.2948612868785858, + "learning_rate": 2.5838926174496646e-05, + "loss": 0.0158, "step": 178 }, { - "epoch": 0.1346877351392024, - "grad_norm": 0.009538069367408752, - "learning_rate": 1.9118065372996027e-05, - "loss": 0.0006, + "epoch": 1.0775018811136192, + "grad_norm": 0.5460957288742065, + "learning_rate": 2.5671140939597316e-05, + "loss": 0.0252, "step": 179 }, { - "epoch": 0.13544018058690746, - "grad_norm": 0.01008265744894743, - "learning_rate": 1.910833335289281e-05, - "loss": 0.0006, + "epoch": 1.0835214446952597, + "grad_norm": 0.13775992393493652, + "learning_rate": 2.550335570469799e-05, + "loss": 0.0125, "step": 180 }, { - "epoch": 0.1361926260346125, - "grad_norm": 0.030792739242315292, - "learning_rate": 1.9098550436221636e-05, - "loss": 0.0014, + "epoch": 1.0895410082769, + "grad_norm": 0.2737879753112793, + "learning_rate": 2.533557046979866e-05, + "loss": 0.0087, "step": 181 }, { - "epoch": 0.13694507148231752, - "grad_norm": 0.010570832528173923, - "learning_rate": 1.9088716677648583e-05, - "loss": 0.0005, + "epoch": 1.0955605718585402, + "grad_norm": 0.37196484208106995, + "learning_rate": 2.516778523489933e-05, + "loss": 0.0579, "step": 182 }, { - "epoch": 0.13769751693002258, - "grad_norm": 0.7934188842773438, - "learning_rate": 1.9078832132123833e-05, - "loss": 0.0201, + "epoch": 1.1015801354401806, + "grad_norm": 0.3493405282497406, + "learning_rate": 2.5e-05, + "loss": 0.0126, "step": 183 }, { - "epoch": 0.1384499623777276, - "grad_norm": 0.004436484072357416, - "learning_rate": 1.9068896854881364e-05, - "loss": 0.0002, + "epoch": 1.1075996990218209, + "grad_norm": 1.0219722986221313, + "learning_rate": 2.4832214765100674e-05, + "loss": 0.0701, "step": 184 }, { - "epoch": 0.13920240782543267, - "grad_norm": 0.025362318381667137, - "learning_rate": 1.9058910901438628e-05, - "loss": 0.0017, + "epoch": 1.1136192626034613, + "grad_norm": 0.32175976037979126, + "learning_rate": 2.4664429530201344e-05, + "loss": 0.012, "step": 185 }, { - "epoch": 0.1399548532731377, - "grad_norm": 0.007729747798293829, - "learning_rate": 1.904887432759626e-05, - "loss": 0.0004, + "epoch": 1.1196388261851016, + "grad_norm": 0.33765479922294617, + "learning_rate": 2.4496644295302017e-05, + "loss": 0.0106, "step": 186 }, { - "epoch": 0.14070729872084273, - "grad_norm": 2.8755412101745605, - "learning_rate": 1.9038787189437752e-05, - "loss": 0.1614, + "epoch": 1.1256583897667418, + "grad_norm": 0.17531374096870422, + "learning_rate": 2.4328859060402687e-05, + "loss": 0.0161, "step": 187 }, { - "epoch": 0.14145974416854779, - "grad_norm": 2.1707630157470703, - "learning_rate": 1.902864954332915e-05, - "loss": 0.3061, + "epoch": 1.1316779533483823, + "grad_norm": 0.1013503223657608, + "learning_rate": 2.416107382550336e-05, + "loss": 0.0057, "step": 188 }, { - "epoch": 0.14221218961625282, - "grad_norm": 0.5352933406829834, - "learning_rate": 1.9018461445918727e-05, - "loss": 0.1411, + "epoch": 1.1376975169300225, + "grad_norm": 0.5186209082603455, + "learning_rate": 2.3993288590604026e-05, + "loss": 0.0189, "step": 189 }, { - "epoch": 0.14296463506395787, - "grad_norm": 0.44579315185546875, - "learning_rate": 1.900822295413668e-05, - "loss": 0.1438, + "epoch": 1.143717080511663, + "grad_norm": 0.577898383140564, + "learning_rate": 2.38255033557047e-05, + "loss": 0.0315, "step": 190 }, { - "epoch": 0.1437170805116629, - "grad_norm": 2.404839038848877, - "learning_rate": 1.8997934125194806e-05, - "loss": 0.2562, + "epoch": 1.1497366440933032, + "grad_norm": 0.2543765604496002, + "learning_rate": 2.365771812080537e-05, + "loss": 0.0259, "step": 191 }, { - "epoch": 0.14446952595936793, - "grad_norm": 2.7213966846466064, - "learning_rate": 1.898759501658618e-05, - "loss": 0.1053, + "epoch": 1.1557562076749435, + "grad_norm": 1.04751718044281, + "learning_rate": 2.348993288590604e-05, + "loss": 0.0267, "step": 192 }, { - "epoch": 0.145221971407073, - "grad_norm": 1.2511435747146606, - "learning_rate": 1.8977205686084828e-05, - "loss": 0.1214, + "epoch": 1.161775771256584, + "grad_norm": 0.30151480436325073, + "learning_rate": 2.332214765100671e-05, + "loss": 0.016, "step": 193 }, { - "epoch": 0.14597441685477802, - "grad_norm": 0.9504565000534058, - "learning_rate": 1.8966766191745423e-05, - "loss": 0.136, + "epoch": 1.1677953348382242, + "grad_norm": 1.1602953672409058, + "learning_rate": 2.3154362416107384e-05, + "loss": 0.0342, "step": 194 }, { - "epoch": 0.14672686230248308, - "grad_norm": 0.053916994482278824, - "learning_rate": 1.895627659190294e-05, - "loss": 0.0035, + "epoch": 1.1738148984198646, + "grad_norm": 0.6510918140411377, + "learning_rate": 2.2986577181208054e-05, + "loss": 0.0367, "step": 195 }, { - "epoch": 0.1474793077501881, - "grad_norm": 0.21418577432632446, - "learning_rate": 1.8945736945172345e-05, - "loss": 0.0128, + "epoch": 1.1798344620015049, + "grad_norm": 0.2937709093093872, + "learning_rate": 2.2818791946308727e-05, + "loss": 0.0124, "step": 196 }, { - "epoch": 0.14823175319789314, - "grad_norm": 0.22742077708244324, - "learning_rate": 1.8935147310448258e-05, - "loss": 0.011, + "epoch": 1.1858540255831453, + "grad_norm": 0.3778565526008606, + "learning_rate": 2.2651006711409396e-05, + "loss": 0.0353, "step": 197 }, { - "epoch": 0.1489841986455982, - "grad_norm": 0.06237643212080002, - "learning_rate": 1.8924507746904628e-05, - "loss": 0.0035, + "epoch": 1.1918735891647856, + "grad_norm": 0.34342288970947266, + "learning_rate": 2.248322147651007e-05, + "loss": 0.0228, "step": 198 }, { - "epoch": 0.14973664409330323, - "grad_norm": 0.027896396815776825, - "learning_rate": 1.8913818313994406e-05, - "loss": 0.0017, + "epoch": 1.1978931527464258, + "grad_norm": 0.25225672125816345, + "learning_rate": 2.231543624161074e-05, + "loss": 0.0037, "step": 199 }, { - "epoch": 0.1504890895410083, - "grad_norm": 0.021885687485337257, - "learning_rate": 1.8903079071449206e-05, - "loss": 0.0013, + "epoch": 1.2039127163280663, + "grad_norm": 0.3875395953655243, + "learning_rate": 2.2147651006711412e-05, + "loss": 0.024, "step": 200 }, { - "epoch": 0.15124153498871332, - "grad_norm": 0.004547674674540758, - "learning_rate": 1.889229007927897e-05, - "loss": 0.0003, + "epoch": 1.2099322799097065, + "grad_norm": 0.48843473196029663, + "learning_rate": 2.197986577181208e-05, + "loss": 0.0411, "step": 201 }, { - "epoch": 0.15199398043641835, - "grad_norm": 0.08865097910165787, - "learning_rate": 1.8881451397771647e-05, - "loss": 0.0046, + "epoch": 1.215951843491347, + "grad_norm": 0.008358384482562542, + "learning_rate": 2.181208053691275e-05, + "loss": 0.0002, "step": 202 }, { - "epoch": 0.1527464258841234, - "grad_norm": 0.13727599382400513, - "learning_rate": 1.887056308749283e-05, - "loss": 0.0075, + "epoch": 1.2219714070729872, + "grad_norm": 0.0617498978972435, + "learning_rate": 2.1644295302013424e-05, + "loss": 0.0016, "step": 203 }, { - "epoch": 0.15349887133182843, - "grad_norm": 0.6231827735900879, - "learning_rate": 1.885962520928545e-05, - "loss": 0.028, + "epoch": 1.2279909706546275, + "grad_norm": 0.5839952826499939, + "learning_rate": 2.1476510067114094e-05, + "loss": 0.0255, "step": 204 }, { - "epoch": 0.1542513167795335, - "grad_norm": 0.1673537939786911, - "learning_rate": 1.88486378242694e-05, - "loss": 0.0099, + "epoch": 1.234010534236268, + "grad_norm": 0.6008470058441162, + "learning_rate": 2.1308724832214767e-05, + "loss": 0.0279, "step": 205 }, { - "epoch": 0.15500376222723852, - "grad_norm": 0.008311329409480095, - "learning_rate": 1.8837600993841237e-05, - "loss": 0.0005, + "epoch": 1.2400300978179082, + "grad_norm": 0.08057394623756409, + "learning_rate": 2.1140939597315437e-05, + "loss": 0.014, "step": 206 }, { - "epoch": 0.15575620767494355, - "grad_norm": 0.1961093544960022, - "learning_rate": 1.8826514779673792e-05, - "loss": 0.01, + "epoch": 1.2460496613995486, + "grad_norm": 0.8297271728515625, + "learning_rate": 2.097315436241611e-05, + "loss": 0.0433, "step": 207 }, { - "epoch": 0.1565086531226486, - "grad_norm": 0.3649809658527374, - "learning_rate": 1.881537924371586e-05, - "loss": 0.1412, + "epoch": 1.2520692249811889, + "grad_norm": 1.0753511190414429, + "learning_rate": 2.080536912751678e-05, + "loss": 0.0342, "step": 208 }, { - "epoch": 0.15726109857035364, - "grad_norm": 1.2528727054595947, - "learning_rate": 1.8804194448191843e-05, - "loss": 0.1621, + "epoch": 1.2580887885628291, + "grad_norm": 0.11652516573667526, + "learning_rate": 2.0637583892617452e-05, + "loss": 0.0122, "step": 209 }, { - "epoch": 0.1580135440180587, - "grad_norm": 0.0017916648648679256, - "learning_rate": 1.8792960455601396e-05, - "loss": 0.0001, + "epoch": 1.2641083521444696, + "grad_norm": 0.23289084434509277, + "learning_rate": 2.0469798657718122e-05, + "loss": 0.0229, "step": 210 }, { - "epoch": 0.15876598946576373, - "grad_norm": 0.2502264976501465, - "learning_rate": 1.8781677328719078e-05, - "loss": 0.01, + "epoch": 1.2701279157261098, + "grad_norm": 0.5731219053268433, + "learning_rate": 2.0302013422818795e-05, + "loss": 0.0455, "step": 211 }, { - "epoch": 0.1595184349134688, - "grad_norm": 0.3349985182285309, - "learning_rate": 1.8770345130594017e-05, - "loss": 0.0182, + "epoch": 1.2761474793077503, + "grad_norm": 0.8601072430610657, + "learning_rate": 2.013422818791946e-05, + "loss": 0.0294, "step": 212 }, { - "epoch": 0.16027088036117382, - "grad_norm": 0.007236495614051819, - "learning_rate": 1.875896392454955e-05, - "loss": 0.0005, + "epoch": 1.2821670428893905, + "grad_norm": 0.9172778129577637, + "learning_rate": 1.9966442953020134e-05, + "loss": 0.0803, "step": 213 }, { - "epoch": 0.16102332580887885, - "grad_norm": 0.7724468111991882, - "learning_rate": 1.8747533774182845e-05, - "loss": 0.0307, + "epoch": 1.2881866064710308, + "grad_norm": 0.18378061056137085, + "learning_rate": 1.9798657718120804e-05, + "loss": 0.0151, "step": 214 }, { - "epoch": 0.1617757712565839, - "grad_norm": 0.0025126487016677856, - "learning_rate": 1.8736054743364587e-05, - "loss": 0.0001, + "epoch": 1.2942061700526712, + "grad_norm": 0.2338120937347412, + "learning_rate": 1.9630872483221477e-05, + "loss": 0.0214, "step": 215 }, { - "epoch": 0.16252821670428894, - "grad_norm": 0.1561364233493805, - "learning_rate": 1.872452689623859e-05, - "loss": 0.0034, + "epoch": 1.3002257336343115, + "grad_norm": 0.09691441804170609, + "learning_rate": 1.946308724832215e-05, + "loss": 0.0087, "step": 216 }, { - "epoch": 0.163280662151994, - "grad_norm": 0.004739896394312382, - "learning_rate": 1.8712950297221457e-05, - "loss": 0.0003, + "epoch": 1.306245297215952, + "grad_norm": 0.1699642539024353, + "learning_rate": 1.929530201342282e-05, + "loss": 0.0088, "step": 217 }, { - "epoch": 0.16403310759969902, - "grad_norm": 0.5904443264007568, - "learning_rate": 1.8701325011002204e-05, - "loss": 0.1231, + "epoch": 1.3122648607975922, + "grad_norm": 0.014856619760394096, + "learning_rate": 1.9127516778523493e-05, + "loss": 0.0003, "step": 218 }, { - "epoch": 0.16478555304740405, - "grad_norm": 0.002162502147257328, - "learning_rate": 1.8689651102541915e-05, - "loss": 0.0001, + "epoch": 1.3182844243792324, + "grad_norm": 0.17981240153312683, + "learning_rate": 1.8959731543624162e-05, + "loss": 0.0148, "step": 219 }, { - "epoch": 0.1655379984951091, - "grad_norm": 0.022224850952625275, - "learning_rate": 1.8677928637073367e-05, - "loss": 0.0007, + "epoch": 1.324303987960873, + "grad_norm": 0.1564723402261734, + "learning_rate": 1.8791946308724835e-05, + "loss": 0.025, "step": 220 }, { - "epoch": 0.16629044394281414, - "grad_norm": 0.0014565088786184788, - "learning_rate": 1.8666157680100663e-05, - "loss": 0.0001, + "epoch": 1.3303235515425131, + "grad_norm": 0.05128008872270584, + "learning_rate": 1.8624161073825505e-05, + "loss": 0.0013, "step": 221 }, { - "epoch": 0.1670428893905192, - "grad_norm": 0.0017851298907771707, - "learning_rate": 1.865433829739888e-05, - "loss": 0.0001, + "epoch": 1.3363431151241536, + "grad_norm": 0.018907951191067696, + "learning_rate": 1.8456375838926178e-05, + "loss": 0.0006, "step": 222 }, { - "epoch": 0.16779533483822423, - "grad_norm": 0.16844072937965393, - "learning_rate": 1.8642470555013695e-05, - "loss": 0.0023, + "epoch": 1.3423626787057938, + "grad_norm": 0.047860465943813324, + "learning_rate": 1.8288590604026847e-05, + "loss": 0.0018, "step": 223 }, { - "epoch": 0.16854778028592926, - "grad_norm": 3.2930855751037598, - "learning_rate": 1.8630554519261007e-05, - "loss": 0.0823, + "epoch": 1.348382242287434, + "grad_norm": 1.6964343786239624, + "learning_rate": 1.8120805369127517e-05, + "loss": 0.0834, "step": 224 }, { - "epoch": 0.16930022573363432, - "grad_norm": 0.00976943876594305, - "learning_rate": 1.8618590256726587e-05, - "loss": 0.0006, + "epoch": 1.3544018058690745, + "grad_norm": 0.09841305017471313, + "learning_rate": 1.7953020134228187e-05, + "loss": 0.0018, "step": 225 }, { - "epoch": 0.17005267118133935, - "grad_norm": 0.3796117901802063, - "learning_rate": 1.860657783426568e-05, - "loss": 0.1048, + "epoch": 1.3604213694507148, + "grad_norm": 0.29318398237228394, + "learning_rate": 1.778523489932886e-05, + "loss": 0.0292, "step": 226 }, { - "epoch": 0.1708051166290444, - "grad_norm": 2.5069665908813477, - "learning_rate": 1.8594517319002646e-05, - "loss": 0.3186, + "epoch": 1.3664409330323553, + "grad_norm": 0.6777030229568481, + "learning_rate": 1.761744966442953e-05, + "loss": 0.0453, "step": 227 }, { - "epoch": 0.17155756207674944, - "grad_norm": 0.05536516755819321, - "learning_rate": 1.8582408778330588e-05, - "loss": 0.0019, + "epoch": 1.3724604966139955, + "grad_norm": 0.09742780774831772, + "learning_rate": 1.7449664429530202e-05, + "loss": 0.0022, "step": 228 }, { - "epoch": 0.17231000752445447, - "grad_norm": 0.013733879663050175, - "learning_rate": 1.8570252279910975e-05, - "loss": 0.0005, + "epoch": 1.3784800601956357, + "grad_norm": 0.21270929276943207, + "learning_rate": 1.7281879194630872e-05, + "loss": 0.0216, "step": 229 }, { - "epoch": 0.17306245297215953, - "grad_norm": 0.26580435037612915, - "learning_rate": 1.8558047891673247e-05, - "loss": 0.0078, + "epoch": 1.3844996237772762, + "grad_norm": 0.10257343202829361, + "learning_rate": 1.7114093959731545e-05, + "loss": 0.0032, "step": 230 }, { - "epoch": 0.17381489841986456, - "grad_norm": 0.005103581584990025, - "learning_rate": 1.854579568181446e-05, - "loss": 0.0003, + "epoch": 1.3905191873589164, + "grad_norm": 0.2899154722690582, + "learning_rate": 1.6946308724832215e-05, + "loss": 0.0336, "step": 231 }, { - "epoch": 0.1745673438675696, - "grad_norm": 0.8314829468727112, - "learning_rate": 1.8533495718798882e-05, - "loss": 0.0161, + "epoch": 1.396538750940557, + "grad_norm": 0.560697615146637, + "learning_rate": 1.6778523489932888e-05, + "loss": 0.0167, "step": 232 }, { - "epoch": 0.17531978931527464, - "grad_norm": 0.04428403079509735, - "learning_rate": 1.8521148071357633e-05, - "loss": 0.0012, + "epoch": 1.4025583145221971, + "grad_norm": 0.15792670845985413, + "learning_rate": 1.6610738255033557e-05, + "loss": 0.0161, "step": 233 }, { - "epoch": 0.17607223476297967, - "grad_norm": 0.033717166632413864, - "learning_rate": 1.8508752808488283e-05, - "loss": 0.0014, + "epoch": 1.4085778781038374, + "grad_norm": 0.112309031188488, + "learning_rate": 1.644295302013423e-05, + "loss": 0.0081, "step": 234 }, { - "epoch": 0.17682468021068473, - "grad_norm": 0.03402457386255264, - "learning_rate": 1.8496309999454475e-05, - "loss": 0.0013, + "epoch": 1.4145974416854779, + "grad_norm": 0.6623883247375488, + "learning_rate": 1.62751677852349e-05, + "loss": 0.0679, "step": 235 }, { - "epoch": 0.17757712565838976, - "grad_norm": 0.062332432717084885, - "learning_rate": 1.848381971378553e-05, - "loss": 0.0019, + "epoch": 1.420617005267118, + "grad_norm": 0.27897122502326965, + "learning_rate": 1.610738255033557e-05, + "loss": 0.0162, "step": 236 }, { - "epoch": 0.17832957110609482, - "grad_norm": 5.1166229248046875, - "learning_rate": 1.8471282021276073e-05, - "loss": 0.1784, + "epoch": 1.4266365688487586, + "grad_norm": 0.08262226730585098, + "learning_rate": 1.5939597315436243e-05, + "loss": 0.0029, "step": 237 }, { - "epoch": 0.17908201655379985, - "grad_norm": 0.0013702790020033717, - "learning_rate": 1.845869699198563e-05, - "loss": 0.0001, + "epoch": 1.4326561324303988, + "grad_norm": 0.13499091565608978, + "learning_rate": 1.5771812080536912e-05, + "loss": 0.0147, "step": 238 }, { - "epoch": 0.17983446200150488, - "grad_norm": 0.01880730129778385, - "learning_rate": 1.844606469623824e-05, - "loss": 0.0005, + "epoch": 1.438675696012039, + "grad_norm": 0.2563413977622986, + "learning_rate": 1.5604026845637585e-05, + "loss": 0.0186, "step": 239 }, { - "epoch": 0.18058690744920994, - "grad_norm": 1.5539264678955078, - "learning_rate": 1.8433385204622067e-05, - "loss": 0.1897, + "epoch": 1.4446952595936795, + "grad_norm": 0.38309767842292786, + "learning_rate": 1.5436241610738255e-05, + "loss": 0.0042, "step": 240 }, { - "epoch": 0.18133935289691497, - "grad_norm": 4.555483341217041, - "learning_rate": 1.842065858798899e-05, - "loss": 0.327, + "epoch": 1.4507148231753197, + "grad_norm": 0.5308915972709656, + "learning_rate": 1.5268456375838928e-05, + "loss": 0.023, "step": 241 }, { - "epoch": 0.18209179834462003, - "grad_norm": 0.014411378651857376, - "learning_rate": 1.8407884917454233e-05, - "loss": 0.0008, + "epoch": 1.4567343867569602, + "grad_norm": 0.5418457984924316, + "learning_rate": 1.51006711409396e-05, + "loss": 0.0271, "step": 242 }, { - "epoch": 0.18284424379232506, - "grad_norm": 0.3658387362957001, - "learning_rate": 1.8395064264395945e-05, - "loss": 0.0159, + "epoch": 1.4627539503386005, + "grad_norm": 0.16427500545978546, + "learning_rate": 1.493288590604027e-05, + "loss": 0.0167, "step": 243 }, { - "epoch": 0.1835966892400301, - "grad_norm": 0.052855461835861206, - "learning_rate": 1.838219670045481e-05, - "loss": 0.0018, + "epoch": 1.4687735139202407, + "grad_norm": 0.1764906644821167, + "learning_rate": 1.4765100671140942e-05, + "loss": 0.0041, "step": 244 }, { - "epoch": 0.18434913468773514, - "grad_norm": 0.03438251465559006, - "learning_rate": 1.836928229753365e-05, - "loss": 0.0012, + "epoch": 1.4747930775018812, + "grad_norm": 0.028177335858345032, + "learning_rate": 1.4597315436241613e-05, + "loss": 0.0011, "step": 245 }, { - "epoch": 0.18510158013544017, - "grad_norm": 0.24612069129943848, - "learning_rate": 1.835632112779701e-05, - "loss": 0.1226, + "epoch": 1.4808126410835214, + "grad_norm": 0.28984132409095764, + "learning_rate": 1.4429530201342285e-05, + "loss": 0.0037, "step": 246 }, { - "epoch": 0.18585402558314523, - "grad_norm": 0.04806346446275711, - "learning_rate": 1.8343313263670782e-05, - "loss": 0.0021, + "epoch": 1.4868322046651619, + "grad_norm": 0.016668178141117096, + "learning_rate": 1.4261744966442953e-05, + "loss": 0.0007, "step": 247 }, { - "epoch": 0.18660647103085026, - "grad_norm": 0.6806573867797852, - "learning_rate": 1.8330258777841755e-05, - "loss": 0.0208, + "epoch": 1.492851768246802, + "grad_norm": 0.11294250190258026, + "learning_rate": 1.4093959731543624e-05, + "loss": 0.0117, "step": 248 }, { - "epoch": 0.1873589164785553, - "grad_norm": 0.11799991130828857, - "learning_rate": 1.831715774325726e-05, - "loss": 0.0036, + "epoch": 1.4988713318284423, + "grad_norm": 0.18805146217346191, + "learning_rate": 1.3926174496644295e-05, + "loss": 0.0152, "step": 249 }, { - "epoch": 0.18811136192626035, - "grad_norm": 0.002671067602932453, - "learning_rate": 1.830401023312472e-05, - "loss": 0.0001, + "epoch": 1.5048908954100828, + "grad_norm": 0.15651652216911316, + "learning_rate": 1.3758389261744966e-05, + "loss": 0.0195, "step": 250 }, { - "epoch": 0.18886380737396538, - "grad_norm": 0.2947131097316742, - "learning_rate": 1.8290816320911285e-05, - "loss": 0.1261, + "epoch": 1.510910458991723, + "grad_norm": 0.07596039772033691, + "learning_rate": 1.3590604026845638e-05, + "loss": 0.0035, "step": 251 }, { - "epoch": 0.18961625282167044, - "grad_norm": 1.2162272930145264, - "learning_rate": 1.8277576080343362e-05, - "loss": 0.1268, + "epoch": 1.5169300225733635, + "grad_norm": 0.5767983198165894, + "learning_rate": 1.3422818791946309e-05, + "loss": 0.0209, "step": 252 }, { - "epoch": 0.19036869826937547, - "grad_norm": 1.2172619104385376, - "learning_rate": 1.8264289585406266e-05, - "loss": 0.0297, + "epoch": 1.5229495861550038, + "grad_norm": 0.14054809510707855, + "learning_rate": 1.325503355704698e-05, + "loss": 0.0156, "step": 253 }, { - "epoch": 0.1911211437170805, - "grad_norm": 0.20552274584770203, - "learning_rate": 1.825095691034376e-05, - "loss": 0.0085, + "epoch": 1.528969149736644, + "grad_norm": 0.5480087995529175, + "learning_rate": 1.3087248322147652e-05, + "loss": 0.0372, "step": 254 }, { - "epoch": 0.19187358916478556, - "grad_norm": 0.008679857477545738, - "learning_rate": 1.8237578129657664e-05, - "loss": 0.0002, + "epoch": 1.5349887133182845, + "grad_norm": 0.21327197551727295, + "learning_rate": 1.2919463087248323e-05, + "loss": 0.037, "step": 255 }, { - "epoch": 0.1926260346124906, - "grad_norm": 0.0659710094332695, - "learning_rate": 1.822415331810743e-05, - "loss": 0.002, + "epoch": 1.5410082768999247, + "grad_norm": 0.8947880268096924, + "learning_rate": 1.2751677852348994e-05, + "loss": 0.0352, "step": 256 }, { - "epoch": 0.19337848006019565, - "grad_norm": 0.4817584753036499, - "learning_rate": 1.821068255070973e-05, - "loss": 0.1022, + "epoch": 1.5470278404815652, + "grad_norm": 0.16651660203933716, + "learning_rate": 1.2583892617449666e-05, + "loss": 0.0062, "step": 257 }, { - "epoch": 0.19413092550790068, - "grad_norm": 1.5975401401519775, - "learning_rate": 1.819716590273803e-05, - "loss": 0.1331, + "epoch": 1.5530474040632054, + "grad_norm": 0.18476413190364838, + "learning_rate": 1.2416107382550337e-05, + "loss": 0.0177, "step": 258 }, { - "epoch": 0.1948833709556057, - "grad_norm": 0.05313832312822342, - "learning_rate": 1.818360344972217e-05, - "loss": 0.0016, + "epoch": 1.5590669676448456, + "grad_norm": 0.013061203993856907, + "learning_rate": 1.2248322147651008e-05, + "loss": 0.0005, "step": 259 }, { - "epoch": 0.19563581640331076, - "grad_norm": 0.02122344821691513, - "learning_rate": 1.8169995267447953e-05, - "loss": 0.0008, + "epoch": 1.565086531226486, + "grad_norm": 0.31754836440086365, + "learning_rate": 1.208053691275168e-05, + "loss": 0.0181, "step": 260 }, { - "epoch": 0.1963882618510158, - "grad_norm": 0.0018245746614411473, - "learning_rate": 1.8156341431956706e-05, - "loss": 0.0001, + "epoch": 1.5711060948081266, + "grad_norm": 0.26034584641456604, + "learning_rate": 1.191275167785235e-05, + "loss": 0.0342, "step": 261 }, { - "epoch": 0.19714070729872085, - "grad_norm": 0.07786925882101059, - "learning_rate": 1.814264201954486e-05, - "loss": 0.0018, + "epoch": 1.5771256583897668, + "grad_norm": 0.15222539007663727, + "learning_rate": 1.174496644295302e-05, + "loss": 0.0081, "step": 262 }, { - "epoch": 0.19789315274642588, - "grad_norm": 0.46468350291252136, - "learning_rate": 1.812889710676354e-05, - "loss": 0.013, + "epoch": 1.583145221971407, + "grad_norm": 0.1855451464653015, + "learning_rate": 1.1577181208053692e-05, + "loss": 0.0222, "step": 263 }, { - "epoch": 0.1986455981941309, - "grad_norm": 0.4126303493976593, - "learning_rate": 1.811510677041811e-05, - "loss": 0.0055, + "epoch": 1.5891647855530473, + "grad_norm": 0.47386428713798523, + "learning_rate": 1.1409395973154363e-05, + "loss": 0.0484, "step": 264 }, { - "epoch": 0.19939804364183597, - "grad_norm": 0.20423245429992676, - "learning_rate": 1.8101271087567753e-05, - "loss": 0.1157, + "epoch": 1.5951843491346878, + "grad_norm": 0.05222529545426369, + "learning_rate": 1.1241610738255035e-05, + "loss": 0.0018, "step": 265 }, { - "epoch": 0.200150489089541, - "grad_norm": 0.9413779377937317, - "learning_rate": 1.8087390135525056e-05, - "loss": 0.0718, + "epoch": 1.6012039127163282, + "grad_norm": 0.36145541071891785, + "learning_rate": 1.1073825503355706e-05, + "loss": 0.0256, "step": 266 }, { - "epoch": 0.20090293453724606, - "grad_norm": 0.5326927304267883, - "learning_rate": 1.8073463991855562e-05, - "loss": 0.0053, + "epoch": 1.6072234762979685, + "grad_norm": 0.2200651317834854, + "learning_rate": 1.0906040268456376e-05, + "loss": 0.0048, "step": 267 }, { - "epoch": 0.2016553799849511, - "grad_norm": 0.2141987830400467, - "learning_rate": 1.8059492734377342e-05, - "loss": 0.1024, + "epoch": 1.6132430398796087, + "grad_norm": 0.2838999330997467, + "learning_rate": 1.0738255033557047e-05, + "loss": 0.0088, "step": 268 }, { - "epoch": 0.20240782543265612, - "grad_norm": 4.482271194458008, - "learning_rate": 1.8045476441160552e-05, - "loss": 0.3966, + "epoch": 1.619262603461249, + "grad_norm": 0.5340823531150818, + "learning_rate": 1.0570469798657718e-05, + "loss": 0.0054, "step": 269 }, { - "epoch": 0.20316027088036118, - "grad_norm": 0.001211398164741695, - "learning_rate": 1.8031415190527016e-05, - "loss": 0.0001, + "epoch": 1.6252821670428894, + "grad_norm": 0.27307260036468506, + "learning_rate": 1.040268456375839e-05, + "loss": 0.014, "step": 270 }, { - "epoch": 0.2039127163280662, - "grad_norm": 0.06283948570489883, - "learning_rate": 1.8017309061049767e-05, - "loss": 0.0014, + "epoch": 1.6313017306245299, + "grad_norm": 0.694962739944458, + "learning_rate": 1.0234899328859061e-05, + "loss": 0.0126, "step": 271 }, { - "epoch": 0.20466516177577126, - "grad_norm": 0.655404806137085, - "learning_rate": 1.8003158131552615e-05, - "loss": 0.0746, + "epoch": 1.6373212942061701, + "grad_norm": 0.19789136946201324, + "learning_rate": 1.006711409395973e-05, + "loss": 0.0172, "step": 272 }, { - "epoch": 0.2054176072234763, - "grad_norm": 0.010463729500770569, - "learning_rate": 1.7988962481109716e-05, - "loss": 0.0003, + "epoch": 1.6433408577878104, + "grad_norm": 0.5267607569694519, + "learning_rate": 9.899328859060402e-06, + "loss": 0.0408, "step": 273 }, { - "epoch": 0.20617005267118135, - "grad_norm": 0.0010110132861882448, - "learning_rate": 1.7974722189045126e-05, - "loss": 0.0001, + "epoch": 1.6493604213694506, + "grad_norm": 0.015556755475699902, + "learning_rate": 9.731543624161075e-06, + "loss": 0.0006, "step": 274 }, { - "epoch": 0.20692249811888638, - "grad_norm": 0.1091262623667717, - "learning_rate": 1.7960437334932334e-05, - "loss": 0.0042, + "epoch": 1.655379984951091, + "grad_norm": 0.5071566104888916, + "learning_rate": 9.563758389261746e-06, + "loss": 0.0281, "step": 275 }, { - "epoch": 0.2076749435665914, - "grad_norm": 5.209902286529541, - "learning_rate": 1.7946107998593867e-05, - "loss": 0.0551, + "epoch": 1.6613995485327315, + "grad_norm": 0.1441573202610016, + "learning_rate": 9.395973154362418e-06, + "loss": 0.0151, "step": 276 }, { - "epoch": 0.20842738901429647, - "grad_norm": 0.00470168748870492, - "learning_rate": 1.7931734260100792e-05, - "loss": 0.0002, + "epoch": 1.6674191121143718, + "grad_norm": 0.22274713218212128, + "learning_rate": 9.228187919463089e-06, + "loss": 0.0174, "step": 277 }, { - "epoch": 0.2091798344620015, - "grad_norm": 0.7987887263298035, - "learning_rate": 1.7917316199772296e-05, - "loss": 0.0129, + "epoch": 1.673438675696012, + "grad_norm": 1.108049988746643, + "learning_rate": 9.060402684563759e-06, + "loss": 0.0189, "step": 278 }, { - "epoch": 0.20993227990970656, - "grad_norm": 0.021809808909893036, - "learning_rate": 1.7902853898175244e-05, - "loss": 0.0011, + "epoch": 1.6794582392776523, + "grad_norm": 0.47223615646362305, + "learning_rate": 8.89261744966443e-06, + "loss": 0.0261, "step": 279 }, { - "epoch": 0.2106847253574116, - "grad_norm": 0.3490143120288849, - "learning_rate": 1.7888347436123707e-05, - "loss": 0.1188, + "epoch": 1.6854778028592927, + "grad_norm": 0.11383321136236191, + "learning_rate": 8.724832214765101e-06, + "loss": 0.0139, "step": 280 }, { - "epoch": 0.21143717080511662, - "grad_norm": 0.6932041645050049, - "learning_rate": 1.7873796894678514e-05, - "loss": 0.0158, + "epoch": 1.6914973664409332, + "grad_norm": 0.01508291345089674, + "learning_rate": 8.557046979865773e-06, + "loss": 0.0006, "step": 281 }, { - "epoch": 0.21218961625282168, - "grad_norm": 0.006826372817158699, - "learning_rate": 1.7859202355146826e-05, - "loss": 0.0003, + "epoch": 1.6975169300225734, + "grad_norm": 0.572291910648346, + "learning_rate": 8.389261744966444e-06, + "loss": 0.0587, "step": 282 }, { - "epoch": 0.2129420617005267, - "grad_norm": 0.00620373385027051, - "learning_rate": 1.7844563899081642e-05, - "loss": 0.0002, + "epoch": 1.7035364936042137, + "grad_norm": 0.8027609586715698, + "learning_rate": 8.221476510067115e-06, + "loss": 0.0252, "step": 283 }, { - "epoch": 0.21369450714823177, - "grad_norm": 0.0037128764670342207, - "learning_rate": 1.782988160828137e-05, + "epoch": 1.709556057185854, + "grad_norm": 0.0062585920095443726, + "learning_rate": 8.053691275167785e-06, "loss": 0.0002, "step": 284 }, { - "epoch": 0.2144469525959368, - "grad_norm": 0.008649413473904133, - "learning_rate": 1.7815155564789374e-05, - "loss": 0.0002, + "epoch": 1.7155756207674944, + "grad_norm": 0.18026615679264069, + "learning_rate": 7.885906040268456e-06, + "loss": 0.0069, "step": 285 }, { - "epoch": 0.21519939804364183, - "grad_norm": 0.0007761880406178534, - "learning_rate": 1.780038585089348e-05, - "loss": 0.0, + "epoch": 1.7215951843491348, + "grad_norm": 0.02148846536874771, + "learning_rate": 7.718120805369127e-06, + "loss": 0.0006, "step": 286 }, { - "epoch": 0.21595184349134688, - "grad_norm": 8.137001037597656, - "learning_rate": 1.7785572549125566e-05, - "loss": 0.2602, + "epoch": 1.727614747930775, + "grad_norm": 0.40165480971336365, + "learning_rate": 7.5503355704698e-06, + "loss": 0.0483, "step": 287 }, { - "epoch": 0.21670428893905191, - "grad_norm": 0.011192454025149345, - "learning_rate": 1.7770715742261057e-05, - "loss": 0.0003, + "epoch": 1.7336343115124153, + "grad_norm": 0.34756484627723694, + "learning_rate": 7.382550335570471e-06, + "loss": 0.0089, "step": 288 }, { - "epoch": 0.21745673438675697, - "grad_norm": 0.9992365837097168, - "learning_rate": 1.775581551331848e-05, - "loss": 0.013, + "epoch": 1.7396538750940556, + "grad_norm": 0.8310803771018982, + "learning_rate": 7.214765100671142e-06, + "loss": 0.0349, "step": 289 }, { - "epoch": 0.218209179834462, - "grad_norm": 3.0127367973327637, - "learning_rate": 1.7740871945559022e-05, - "loss": 0.2808, + "epoch": 1.745673438675696, + "grad_norm": 0.21227827668190002, + "learning_rate": 7.046979865771812e-06, + "loss": 0.0098, "step": 290 }, { - "epoch": 0.21896162528216703, - "grad_norm": 6.692657188978046e-05, - "learning_rate": 1.772588512248602e-05, - "loss": 0.0, + "epoch": 1.7516930022573365, + "grad_norm": 0.18423959612846375, + "learning_rate": 6.879194630872483e-06, + "loss": 0.0196, "step": 291 }, { - "epoch": 0.2197140707298721, - "grad_norm": 9.9885743111372e-05, - "learning_rate": 1.771085512784453e-05, - "loss": 0.0, + "epoch": 1.7577125658389767, + "grad_norm": 0.13899557292461395, + "learning_rate": 6.7114093959731546e-06, + "loss": 0.0086, "step": 292 }, { - "epoch": 0.22046651617757712, - "grad_norm": 0.002337940037250519, - "learning_rate": 1.769578204562084e-05, - "loss": 0.0001, + "epoch": 1.763732129420617, + "grad_norm": 0.9509983658790588, + "learning_rate": 6.543624161073826e-06, + "loss": 0.0227, "step": 293 }, { - "epoch": 0.22121896162528218, - "grad_norm": 4.053681373596191, - "learning_rate": 1.7680665960042016e-05, - "loss": 0.2921, + "epoch": 1.7697516930022572, + "grad_norm": 0.2806952893733978, + "learning_rate": 6.375838926174497e-06, + "loss": 0.0308, "step": 294 }, { - "epoch": 0.2219714070729872, - "grad_norm": 0.004839350003749132, - "learning_rate": 1.7665506955575417e-05, - "loss": 0.0003, + "epoch": 1.7757712565838977, + "grad_norm": 0.2584255337715149, + "learning_rate": 6.2080536912751686e-06, + "loss": 0.0078, "step": 295 }, { - "epoch": 0.22272385252069224, - "grad_norm": 3.5069193840026855, - "learning_rate": 1.765030511692823e-05, - "loss": 0.3916, + "epoch": 1.7817908201655381, + "grad_norm": 0.6496636867523193, + "learning_rate": 6.04026845637584e-06, + "loss": 0.0764, "step": 296 }, { - "epoch": 0.2234762979683973, - "grad_norm": 0.10265739262104034, - "learning_rate": 1.7635060529046994e-05, - "loss": 0.0018, + "epoch": 1.7878103837471784, + "grad_norm": 2.131640672683716, + "learning_rate": 5.87248322147651e-06, + "loss": 0.0485, "step": 297 }, { - "epoch": 0.22422874341610233, - "grad_norm": 8.340572357177734, - "learning_rate": 1.7619773277117135e-05, - "loss": 0.2325, + "epoch": 1.7938299473288186, + "grad_norm": 0.25984302163124084, + "learning_rate": 5.704697986577182e-06, + "loss": 0.0256, "step": 298 }, { - "epoch": 0.22498118886380739, - "grad_norm": 0.004961786326020956, - "learning_rate": 1.760444344656247e-05, - "loss": 0.0003, + "epoch": 1.7998495109104589, + "grad_norm": 0.8501216769218445, + "learning_rate": 5.536912751677853e-06, + "loss": 0.0559, "step": 299 }, { - "epoch": 0.22573363431151242, - "grad_norm": 0.2516036033630371, - "learning_rate": 1.758907112304475e-05, - "loss": 0.0051, + "epoch": 1.8058690744920993, + "grad_norm": 0.3276824355125427, + "learning_rate": 5.3691275167785235e-06, + "loss": 0.0228, "step": 300 }, { - "epoch": 0.22648607975921745, - "grad_norm": 0.01247998047620058, - "learning_rate": 1.757365639246317e-05, - "loss": 0.0006, + "epoch": 1.8118886380737398, + "grad_norm": 0.19804896414279938, + "learning_rate": 5.201342281879195e-06, + "loss": 0.0121, "step": 301 }, { - "epoch": 0.2272385252069225, - "grad_norm": 0.0020404146052896976, - "learning_rate": 1.7558199340953893e-05, - "loss": 0.0001, + "epoch": 1.81790820165538, + "grad_norm": 0.6229518055915833, + "learning_rate": 5.033557046979865e-06, + "loss": 0.0046, "step": 302 }, { - "epoch": 0.22799097065462753, - "grad_norm": 0.3927852213382721, - "learning_rate": 1.7542700054889572e-05, - "loss": 0.0126, + "epoch": 1.8239277652370203, + "grad_norm": 0.16715337336063385, + "learning_rate": 4.8657718120805375e-06, + "loss": 0.0107, "step": 303 }, { - "epoch": 0.2287434161023326, - "grad_norm": 0.012273562140762806, - "learning_rate": 1.752715862087885e-05, - "loss": 0.0003, + "epoch": 1.8299473288186605, + "grad_norm": 0.14998656511306763, + "learning_rate": 4.697986577181209e-06, + "loss": 0.0041, "step": 304 }, { - "epoch": 0.22949586155003762, - "grad_norm": 0.0031097414903342724, - "learning_rate": 1.7511575125765902e-05, - "loss": 0.0002, + "epoch": 1.835966892400301, + "grad_norm": 0.3184771239757538, + "learning_rate": 4.530201342281879e-06, + "loss": 0.0107, "step": 305 }, { - "epoch": 0.23024830699774265, - "grad_norm": 0.36811593174934387, - "learning_rate": 1.7495949656629933e-05, - "loss": 0.0076, + "epoch": 1.8419864559819414, + "grad_norm": 0.665398120880127, + "learning_rate": 4.362416107382551e-06, + "loss": 0.0227, "step": 306 }, { - "epoch": 0.2310007524454477, - "grad_norm": 0.008364018052816391, - "learning_rate": 1.748028230078469e-05, - "loss": 0.0002, + "epoch": 1.8480060195635817, + "grad_norm": 0.19727759063243866, + "learning_rate": 4.194630872483222e-06, + "loss": 0.0122, "step": 307 }, { - "epoch": 0.23175319789315274, - "grad_norm": 0.47602003812789917, - "learning_rate": 1.7464573145777987e-05, - "loss": 0.1192, + "epoch": 1.854025583145222, + "grad_norm": 0.13394923508167267, + "learning_rate": 4.026845637583892e-06, + "loss": 0.0053, "step": 308 }, { - "epoch": 0.2325056433408578, - "grad_norm": 0.858063280582428, - "learning_rate": 1.7448822279391204e-05, - "loss": 0.0553, + "epoch": 1.8600451467268622, + "grad_norm": 0.17236709594726562, + "learning_rate": 3.859060402684564e-06, + "loss": 0.0184, "step": 309 }, { - "epoch": 0.23325808878856283, - "grad_norm": 4.250735759735107, - "learning_rate": 1.7433029789638794e-05, - "loss": 0.0937, + "epoch": 1.8660647103085026, + "grad_norm": 1.0638315677642822, + "learning_rate": 3.6912751677852355e-06, + "loss": 0.0311, "step": 310 }, { - "epoch": 0.23401053423626786, - "grad_norm": 0.029880870133638382, - "learning_rate": 1.7417195764767816e-05, - "loss": 0.0013, + "epoch": 1.872084273890143, + "grad_norm": 0.06651457399129868, + "learning_rate": 3.523489932885906e-06, + "loss": 0.0023, "step": 311 }, { - "epoch": 0.23476297968397292, - "grad_norm": 0.04074908420443535, - "learning_rate": 1.7401320293257403e-05, - "loss": 0.0021, + "epoch": 1.8781038374717833, + "grad_norm": 0.19191402196884155, + "learning_rate": 3.3557046979865773e-06, + "loss": 0.0195, "step": 312 }, { - "epoch": 0.23551542513167795, - "grad_norm": 0.03922910988330841, - "learning_rate": 1.7385403463818308e-05, - "loss": 0.0012, + "epoch": 1.8841234010534236, + "grad_norm": 0.004882109817117453, + "learning_rate": 3.1879194630872486e-06, + "loss": 0.0002, "step": 313 }, { - "epoch": 0.236267870579383, - "grad_norm": 0.06185416132211685, - "learning_rate": 1.7369445365392365e-05, - "loss": 0.0031, + "epoch": 1.8901429646350638, + "grad_norm": 0.2256098985671997, + "learning_rate": 3.02013422818792e-06, + "loss": 0.0145, "step": 314 }, { - "epoch": 0.23702031602708803, - "grad_norm": 6.229604721069336, - "learning_rate": 1.7353446087152038e-05, - "loss": 0.0754, + "epoch": 1.8961625282167043, + "grad_norm": 0.49490997195243835, + "learning_rate": 2.852348993288591e-06, + "loss": 0.0378, "step": 315 }, { - "epoch": 0.23777276147479307, - "grad_norm": 0.4178987443447113, - "learning_rate": 1.733740571849989e-05, - "loss": 0.1242, + "epoch": 1.9021820917983447, + "grad_norm": 0.2518860101699829, + "learning_rate": 2.6845637583892617e-06, + "loss": 0.0424, "step": 316 }, { - "epoch": 0.23852520692249812, - "grad_norm": 0.2695218324661255, - "learning_rate": 1.732132434906809e-05, - "loss": 0.0052, + "epoch": 1.908201655379985, + "grad_norm": 0.01092343870550394, + "learning_rate": 2.5167785234899326e-06, + "loss": 0.0003, "step": 317 }, { - "epoch": 0.23927765237020315, - "grad_norm": 0.011477818712592125, - "learning_rate": 1.730520206871792e-05, - "loss": 0.0004, + "epoch": 1.9142212189616252, + "grad_norm": 0.45049425959587097, + "learning_rate": 2.3489932885906044e-06, + "loss": 0.053, "step": 318 }, { - "epoch": 0.2400300978179082, - "grad_norm": 0.01223902590572834, - "learning_rate": 1.728903896753927e-05, - "loss": 0.0006, + "epoch": 1.9202407825432655, + "grad_norm": 0.01676754839718342, + "learning_rate": 2.1812080536912753e-06, + "loss": 0.0007, "step": 319 }, { - "epoch": 0.24078254326561324, - "grad_norm": 0.02941061556339264, - "learning_rate": 1.7272835135850133e-05, - "loss": 0.0012, + "epoch": 1.926260346124906, + "grad_norm": 0.08567023277282715, + "learning_rate": 2.013422818791946e-06, + "loss": 0.0022, "step": 320 }, { - "epoch": 0.24153498871331827, - "grad_norm": 0.45937439799308777, - "learning_rate": 1.72565906641961e-05, - "loss": 0.0131, + "epoch": 1.9322799097065464, + "grad_norm": 0.243726447224617, + "learning_rate": 1.8456375838926177e-06, + "loss": 0.022, "step": 321 }, { - "epoch": 0.24228743416102333, - "grad_norm": 0.06633768230676651, - "learning_rate": 1.7240305643349854e-05, - "loss": 0.0012, + "epoch": 1.9382994732881866, + "grad_norm": 0.20735050737857819, + "learning_rate": 1.6778523489932886e-06, + "loss": 0.0113, "step": 322 }, { - "epoch": 0.24303987960872836, - "grad_norm": 1.1109821796417236, - "learning_rate": 1.7223980164310658e-05, - "loss": 0.0369, + "epoch": 1.9443190368698269, + "grad_norm": 0.013643703423440456, + "learning_rate": 1.51006711409396e-06, + "loss": 0.0005, "step": 323 }, { - "epoch": 0.24379232505643342, - "grad_norm": 5.301788330078125, - "learning_rate": 1.720761431830386e-05, - "loss": 0.2436, + "epoch": 1.9503386004514671, + "grad_norm": 0.14169737696647644, + "learning_rate": 1.3422818791946309e-06, + "loss": 0.004, "step": 324 }, { - "epoch": 0.24454477050413845, - "grad_norm": 3.4349093437194824, - "learning_rate": 1.719120819678038e-05, - "loss": 0.3296, + "epoch": 1.9563581640331076, + "grad_norm": 0.21097888052463531, + "learning_rate": 1.1744966442953022e-06, + "loss": 0.0163, "step": 325 }, { - "epoch": 0.24529721595184348, - "grad_norm": 0.13575129210948944, - "learning_rate": 1.7174761891416176e-05, - "loss": 0.0041, + "epoch": 1.962377727614748, + "grad_norm": 0.3481338620185852, + "learning_rate": 1.006711409395973e-06, + "loss": 0.0328, "step": 326 }, { - "epoch": 0.24604966139954854, - "grad_norm": 0.7291696667671204, - "learning_rate": 1.7158275494111763e-05, - "loss": 0.1419, + "epoch": 1.9683972911963883, + "grad_norm": 0.639370858669281, + "learning_rate": 8.389261744966443e-07, + "loss": 0.0385, "step": 327 }, { - "epoch": 0.24680210684725357, - "grad_norm": 0.011178013868629932, - "learning_rate": 1.7141749096991686e-05, - "loss": 0.0004, + "epoch": 1.9744168547780285, + "grad_norm": 0.01708345301449299, + "learning_rate": 6.711409395973154e-07, + "loss": 0.0006, "step": 328 }, { - "epoch": 0.24755455229495862, - "grad_norm": 0.17662179470062256, - "learning_rate": 1.7125182792403995e-05, - "loss": 0.0028, + "epoch": 1.9804364183596688, + "grad_norm": 1.5634732246398926, + "learning_rate": 5.033557046979866e-07, + "loss": 0.0966, "step": 329 }, { - "epoch": 0.24830699774266365, - "grad_norm": 0.8367990851402283, - "learning_rate": 1.7108576672919757e-05, - "loss": 0.0191, + "epoch": 1.9864559819413092, + "grad_norm": 0.03818768635392189, + "learning_rate": 3.355704697986577e-07, + "loss": 0.001, "step": 330 }, { - "epoch": 0.24905944319036868, - "grad_norm": 3.8350296020507812, - "learning_rate": 1.7091930831332507e-05, - "loss": 0.1464, + "epoch": 1.9924755455229497, + "grad_norm": 1.7485132217407227, + "learning_rate": 1.6778523489932886e-07, + "loss": 0.0881, "step": 331 }, { - "epoch": 0.24981188863807374, - "grad_norm": 0.8443143963813782, - "learning_rate": 1.7075245360657744e-05, - "loss": 0.0053, + "epoch": 1.99849510910459, + "grad_norm": 2.490537643432617, + "learning_rate": 0.0, + "loss": 0.0464, "step": 332 - }, - { - "epoch": 0.2505643340857788, - "grad_norm": 0.04546971619129181, - "learning_rate": 1.705852035413242e-05, - "loss": 0.0014, - "step": 333 - }, - { - "epoch": 0.2513167795334838, - "grad_norm": 0.025590356439352036, - "learning_rate": 1.7041755905214404e-05, - "loss": 0.0013, - "step": 334 - }, - { - "epoch": 0.2520692249811889, - "grad_norm": 0.013746229000389576, - "learning_rate": 1.7024952107581965e-05, - "loss": 0.0004, - "step": 335 - }, - { - "epoch": 0.2528216704288939, - "grad_norm": 0.12485899776220322, - "learning_rate": 1.700810905513325e-05, - "loss": 0.0049, - "step": 336 - }, - { - "epoch": 0.25357411587659895, - "grad_norm": 0.20258508622646332, - "learning_rate": 1.699122684198576e-05, - "loss": 0.1096, - "step": 337 - }, - { - "epoch": 0.254326561324304, - "grad_norm": 0.026910100132226944, - "learning_rate": 1.6974305562475825e-05, - "loss": 0.0011, - "step": 338 - }, - { - "epoch": 0.255079006772009, - "grad_norm": 0.8987321853637695, - "learning_rate": 1.6957345311158066e-05, - "loss": 0.0763, - "step": 339 - }, - { - "epoch": 0.2558314522197141, - "grad_norm": 0.0026313799899071455, - "learning_rate": 1.6940346182804876e-05, - "loss": 0.0001, - "step": 340 - }, - { - "epoch": 0.2565838976674191, - "grad_norm": 6.234673976898193, - "learning_rate": 1.69233082724059e-05, - "loss": 0.1531, - "step": 341 - }, - { - "epoch": 0.25733634311512416, - "grad_norm": 0.0024953181855380535, - "learning_rate": 1.6906231675167488e-05, - "loss": 0.0002, - "step": 342 - }, - { - "epoch": 0.2580887885628292, - "grad_norm": 0.3546563684940338, - "learning_rate": 1.6889116486512165e-05, - "loss": 0.0036, - "step": 343 - }, - { - "epoch": 0.2588412340105342, - "grad_norm": 0.0036818115040659904, - "learning_rate": 1.6871962802078103e-05, - "loss": 0.0002, - "step": 344 - }, - { - "epoch": 0.2595936794582393, - "grad_norm": 0.7731168270111084, - "learning_rate": 1.6854770717718587e-05, - "loss": 0.0134, - "step": 345 - }, - { - "epoch": 0.26034612490594433, - "grad_norm": 0.006896435748785734, - "learning_rate": 1.683754032950148e-05, - "loss": 0.0004, - "step": 346 - }, - { - "epoch": 0.26109857035364936, - "grad_norm": 0.08711463212966919, - "learning_rate": 1.6820271733708676e-05, - "loss": 0.0019, - "step": 347 - }, - { - "epoch": 0.2618510158013544, - "grad_norm": 0.31446611881256104, - "learning_rate": 1.6802965026835575e-05, - "loss": 0.0066, - "step": 348 - }, - { - "epoch": 0.2626034612490594, - "grad_norm": 0.038149867206811905, - "learning_rate": 1.6785620305590536e-05, - "loss": 0.0012, - "step": 349 - }, - { - "epoch": 0.2633559066967645, - "grad_norm": 0.003243704093620181, - "learning_rate": 1.676823766689434e-05, - "loss": 0.0002, - "step": 350 - }, - { - "epoch": 0.26410835214446954, - "grad_norm": 0.0029318176675587893, - "learning_rate": 1.6750817207879655e-05, - "loss": 0.0001, - "step": 351 - }, - { - "epoch": 0.26486079759217457, - "grad_norm": 1.8430068492889404, - "learning_rate": 1.673335902589047e-05, - "loss": 0.0449, - "step": 352 - }, - { - "epoch": 0.2656132430398796, - "grad_norm": 0.00958191603422165, - "learning_rate": 1.6715863218481573e-05, - "loss": 0.0004, - "step": 353 - }, - { - "epoch": 0.26636568848758463, - "grad_norm": 2.806882619857788, - "learning_rate": 1.6698329883418008e-05, - "loss": 0.1586, - "step": 354 - }, - { - "epoch": 0.2671181339352897, - "grad_norm": 0.0034162893425673246, - "learning_rate": 1.6680759118674512e-05, - "loss": 0.0002, - "step": 355 - }, - { - "epoch": 0.26787057938299474, - "grad_norm": 0.008835663087666035, - "learning_rate": 1.6663151022434978e-05, - "loss": 0.0003, - "step": 356 - }, - { - "epoch": 0.2686230248306998, - "grad_norm": 0.0020498326048254967, - "learning_rate": 1.6645505693091897e-05, - "loss": 0.0001, - "step": 357 - }, - { - "epoch": 0.2693754702784048, - "grad_norm": 0.0011977710528299212, - "learning_rate": 1.662782322924583e-05, - "loss": 0.0001, - "step": 358 - }, - { - "epoch": 0.27012791572610984, - "grad_norm": 0.013555724173784256, - "learning_rate": 1.661010372970483e-05, - "loss": 0.0004, - "step": 359 - }, - { - "epoch": 0.2708803611738149, - "grad_norm": 0.0006222277879714966, - "learning_rate": 1.6592347293483908e-05, - "loss": 0.0, - "step": 360 - }, - { - "epoch": 0.27163280662151995, - "grad_norm": 0.008171453140676022, - "learning_rate": 1.6574554019804474e-05, - "loss": 0.0003, - "step": 361 - }, - { - "epoch": 0.272385252069225, - "grad_norm": 0.004504789598286152, - "learning_rate": 1.655672400809378e-05, - "loss": 0.0001, - "step": 362 - }, - { - "epoch": 0.27313769751693, - "grad_norm": 0.00137099449057132, - "learning_rate": 1.6538857357984358e-05, - "loss": 0.0001, - "step": 363 - }, - { - "epoch": 0.27389014296463504, - "grad_norm": 0.06337831914424896, - "learning_rate": 1.6520954169313498e-05, - "loss": 0.0016, - "step": 364 - }, - { - "epoch": 0.2746425884123401, - "grad_norm": 0.0002586379705462605, - "learning_rate": 1.6503014542122637e-05, - "loss": 0.0, - "step": 365 - }, - { - "epoch": 0.27539503386004516, - "grad_norm": 0.004504516255110502, - "learning_rate": 1.6485038576656842e-05, - "loss": 0.0001, - "step": 366 - }, - { - "epoch": 0.2761474793077502, - "grad_norm": 0.31715670228004456, - "learning_rate": 1.646702637336423e-05, - "loss": 0.0032, - "step": 367 - }, - { - "epoch": 0.2768999247554552, - "grad_norm": 0.0008062532288022339, - "learning_rate": 1.6448978032895417e-05, - "loss": 0.0, - "step": 368 - }, - { - "epoch": 0.27765237020316025, - "grad_norm": 0.0017397012561559677, - "learning_rate": 1.6430893656102942e-05, - "loss": 0.0001, - "step": 369 - }, - { - "epoch": 0.27840481565086533, - "grad_norm": 0.3345002830028534, - "learning_rate": 1.6412773344040717e-05, - "loss": 0.0042, - "step": 370 - }, - { - "epoch": 0.27915726109857036, - "grad_norm": 9.196252358378842e-05, - "learning_rate": 1.6394617197963462e-05, - "loss": 0.0, - "step": 371 - }, - { - "epoch": 0.2799097065462754, - "grad_norm": 0.2911829352378845, - "learning_rate": 1.6376425319326125e-05, - "loss": 0.1043, - "step": 372 - }, - { - "epoch": 0.2806621519939804, - "grad_norm": 0.0008261505281552672, - "learning_rate": 1.635819780978333e-05, - "loss": 0.0, - "step": 373 - }, - { - "epoch": 0.28141459744168545, - "grad_norm": 0.013018965721130371, - "learning_rate": 1.6339934771188796e-05, - "loss": 0.0003, - "step": 374 - }, - { - "epoch": 0.28216704288939054, - "grad_norm": 0.5947379469871521, - "learning_rate": 1.6321636305594784e-05, - "loss": 0.0068, - "step": 375 - }, - { - "epoch": 0.28291948833709557, - "grad_norm": 0.005715230479836464, - "learning_rate": 1.630330251525152e-05, - "loss": 0.0002, - "step": 376 - }, - { - "epoch": 0.2836719337848006, - "grad_norm": 0.07143538445234299, - "learning_rate": 1.6284933502606614e-05, - "loss": 0.0009, - "step": 377 - }, - { - "epoch": 0.28442437923250563, - "grad_norm": 4.414242267608643, - "learning_rate": 1.6266529370304492e-05, - "loss": 0.0671, - "step": 378 - }, - { - "epoch": 0.28517682468021066, - "grad_norm": 0.0007347184582613409, - "learning_rate": 1.624809022118584e-05, - "loss": 0.0, - "step": 379 - }, - { - "epoch": 0.28592927012791575, - "grad_norm": 0.00025847938377410173, - "learning_rate": 1.6229616158286997e-05, - "loss": 0.0, - "step": 380 - }, - { - "epoch": 0.2866817155756208, - "grad_norm": 12.279823303222656, - "learning_rate": 1.6211107284839417e-05, - "loss": 0.1936, - "step": 381 - }, - { - "epoch": 0.2874341610233258, - "grad_norm": 0.0016370975645259023, - "learning_rate": 1.6192563704269048e-05, - "loss": 0.0001, - "step": 382 - }, - { - "epoch": 0.28818660647103084, - "grad_norm": 5.603551835520193e-05, - "learning_rate": 1.6173985520195805e-05, - "loss": 0.0, - "step": 383 - }, - { - "epoch": 0.28893905191873587, - "grad_norm": 0.017844753339886665, - "learning_rate": 1.6155372836432944e-05, - "loss": 0.0007, - "step": 384 - }, - { - "epoch": 0.28969149736644095, - "grad_norm": 0.0008964858716353774, - "learning_rate": 1.6136725756986514e-05, - "loss": 0.0, - "step": 385 - }, - { - "epoch": 0.290443942814146, - "grad_norm": 0.12251462787389755, - "learning_rate": 1.6118044386054755e-05, - "loss": 0.0021, - "step": 386 - }, - { - "epoch": 0.291196388261851, - "grad_norm": 8.942232131958008, - "learning_rate": 1.609932882802753e-05, - "loss": 0.1369, - "step": 387 - }, - { - "epoch": 0.29194883370955604, - "grad_norm": 0.005296614021062851, - "learning_rate": 1.6080579187485738e-05, - "loss": 0.0001, - "step": 388 - }, - { - "epoch": 0.2927012791572611, - "grad_norm": 0.00021417946845758706, - "learning_rate": 1.6061795569200725e-05, - "loss": 0.0, - "step": 389 - }, - { - "epoch": 0.29345372460496616, - "grad_norm": 0.0008177552372217178, - "learning_rate": 1.60429780781337e-05, - "loss": 0.0, - "step": 390 - }, - { - "epoch": 0.2942061700526712, - "grad_norm": 2.832350015640259, - "learning_rate": 1.6024126819435156e-05, - "loss": 0.0632, - "step": 391 - }, - { - "epoch": 0.2949586155003762, - "grad_norm": 0.007019749376922846, - "learning_rate": 1.6005241898444275e-05, - "loss": 0.0002, - "step": 392 - }, - { - "epoch": 0.29571106094808125, - "grad_norm": 0.49707838892936707, - "learning_rate": 1.5986323420688335e-05, - "loss": 0.093, - "step": 393 - }, - { - "epoch": 0.2964635063957863, - "grad_norm": 5.089029788970947, - "learning_rate": 1.5967371491882136e-05, - "loss": 0.0233, - "step": 394 - }, - { - "epoch": 0.29721595184349137, - "grad_norm": 0.0013898630859330297, - "learning_rate": 1.5948386217927384e-05, - "loss": 0.0001, - "step": 395 - }, - { - "epoch": 0.2979683972911964, - "grad_norm": 0.0012057372368872166, - "learning_rate": 1.592936770491214e-05, - "loss": 0.0001, - "step": 396 - }, - { - "epoch": 0.2987208427389014, - "grad_norm": 0.0038578074891120195, - "learning_rate": 1.591031605911017e-05, - "loss": 0.0001, - "step": 397 - }, - { - "epoch": 0.29947328818660646, - "grad_norm": 8.180018630810082e-05, - "learning_rate": 1.5891231386980415e-05, - "loss": 0.0, - "step": 398 - }, - { - "epoch": 0.3002257336343115, - "grad_norm": 0.0012178582837805152, - "learning_rate": 1.5872113795166337e-05, - "loss": 0.0, - "step": 399 - }, - { - "epoch": 0.3009781790820166, - "grad_norm": 7.556064520031214e-05, - "learning_rate": 1.585296339049537e-05, - "loss": 0.0, - "step": 400 - }, - { - "epoch": 0.3017306245297216, - "grad_norm": 0.0032481662929058075, - "learning_rate": 1.5833780279978293e-05, - "loss": 0.0001, - "step": 401 - }, - { - "epoch": 0.30248306997742663, - "grad_norm": 4.7872754294076e-06, - "learning_rate": 1.5814564570808643e-05, - "loss": 0.0, - "step": 402 - }, - { - "epoch": 0.30323551542513166, - "grad_norm": 0.014581550844013691, - "learning_rate": 1.5795316370362122e-05, - "loss": 0.0003, - "step": 403 - }, - { - "epoch": 0.3039879608728367, - "grad_norm": 0.3884536623954773, - "learning_rate": 1.5776035786195983e-05, - "loss": 0.1174, - "step": 404 - }, - { - "epoch": 0.3047404063205418, - "grad_norm": 0.061029549688100815, - "learning_rate": 1.575672292604844e-05, - "loss": 0.0002, - "step": 405 - }, - { - "epoch": 0.3054928517682468, - "grad_norm": 0.0007864750223234296, - "learning_rate": 1.5737377897838065e-05, - "loss": 0.0, - "step": 406 - }, - { - "epoch": 0.30624529721595184, - "grad_norm": 0.0008468715823255479, - "learning_rate": 1.5718000809663173e-05, - "loss": 0.0, - "step": 407 - }, - { - "epoch": 0.30699774266365687, - "grad_norm": 6.305576243903488e-05, - "learning_rate": 1.569859176980124e-05, - "loss": 0.0, - "step": 408 - }, - { - "epoch": 0.3077501881113619, - "grad_norm": 0.6824759840965271, - "learning_rate": 1.5679150886708273e-05, - "loss": 0.1164, - "step": 409 - }, - { - "epoch": 0.308502633559067, - "grad_norm": 0.01927897147834301, - "learning_rate": 1.565967826901822e-05, - "loss": 0.0002, - "step": 410 - }, - { - "epoch": 0.309255079006772, - "grad_norm": 0.001680307206697762, - "learning_rate": 1.564017402554237e-05, - "loss": 0.0001, - "step": 411 - }, - { - "epoch": 0.31000752445447705, - "grad_norm": 0.036916520446538925, - "learning_rate": 1.5620638265268718e-05, - "loss": 0.0005, - "step": 412 - }, - { - "epoch": 0.3107599699021821, - "grad_norm": 4.316399097442627, - "learning_rate": 1.560107109736138e-05, - "loss": 0.4106, - "step": 413 - }, - { - "epoch": 0.3115124153498871, - "grad_norm": 0.0003387642209418118, - "learning_rate": 1.5581472631159977e-05, - "loss": 0.0, - "step": 414 - }, - { - "epoch": 0.3122648607975922, - "grad_norm": 0.5591127872467041, - "learning_rate": 1.5561842976179013e-05, - "loss": 0.1023, - "step": 415 - }, - { - "epoch": 0.3130173062452972, - "grad_norm": 6.071132659912109, - "learning_rate": 1.5542182242107284e-05, - "loss": 0.0244, - "step": 416 - }, - { - "epoch": 0.31376975169300225, - "grad_norm": 0.0023193645756691694, - "learning_rate": 1.5522490538807248e-05, - "loss": 0.0001, - "step": 417 - }, - { - "epoch": 0.3145221971407073, - "grad_norm": 0.0017903498373925686, - "learning_rate": 1.5502767976314413e-05, - "loss": 0.0001, - "step": 418 - }, - { - "epoch": 0.3152746425884123, - "grad_norm": 10.95837116241455, - "learning_rate": 1.5483014664836732e-05, - "loss": 0.6184, - "step": 419 - }, - { - "epoch": 0.3160270880361174, - "grad_norm": 0.002409271663054824, - "learning_rate": 1.546323071475397e-05, - "loss": 0.0001, - "step": 420 - }, - { - "epoch": 0.31677953348382243, - "grad_norm": 7.29747480363585e-05, - "learning_rate": 1.544341623661711e-05, - "loss": 0.0, - "step": 421 - }, - { - "epoch": 0.31753197893152746, - "grad_norm": 2.6204562187194824, - "learning_rate": 1.5423571341147724e-05, - "loss": 0.1352, - "step": 422 - }, - { - "epoch": 0.3182844243792325, - "grad_norm": 0.0007865215302444994, - "learning_rate": 1.5403696139237338e-05, - "loss": 0.0, - "step": 423 - }, - { - "epoch": 0.3190368698269376, - "grad_norm": 3.1011784076690674, - "learning_rate": 1.538379074194684e-05, - "loss": 0.4246, - "step": 424 - }, - { - "epoch": 0.3197893152746426, - "grad_norm": 0.000731561507564038, - "learning_rate": 1.5363855260505848e-05, - "loss": 0.0, - "step": 425 - }, - { - "epoch": 0.32054176072234764, - "grad_norm": 6.430411338806152, - "learning_rate": 1.534388980631208e-05, - "loss": 0.2966, - "step": 426 - }, - { - "epoch": 0.32129420617005267, - "grad_norm": 0.010142396204173565, - "learning_rate": 1.5323894490930743e-05, - "loss": 0.0004, - "step": 427 - }, - { - "epoch": 0.3220466516177577, - "grad_norm": 0.5542206764221191, - "learning_rate": 1.5303869426093906e-05, - "loss": 0.0149, - "step": 428 - }, - { - "epoch": 0.3227990970654628, - "grad_norm": 0.35877907276153564, - "learning_rate": 1.5283814723699877e-05, - "loss": 0.0831, - "step": 429 - }, - { - "epoch": 0.3235515425131678, - "grad_norm": 0.082036592066288, - "learning_rate": 1.5263730495812568e-05, - "loss": 0.0032, - "step": 430 - }, - { - "epoch": 0.32430398796087284, - "grad_norm": 0.017158055678009987, - "learning_rate": 1.5243616854660894e-05, - "loss": 0.001, - "step": 431 - }, - { - "epoch": 0.32505643340857787, - "grad_norm": 0.00297352927736938, - "learning_rate": 1.522347391263811e-05, - "loss": 0.0001, - "step": 432 - }, - { - "epoch": 0.3258088788562829, - "grad_norm": 0.003190380521118641, - "learning_rate": 1.5203301782301212e-05, - "loss": 0.0001, - "step": 433 - }, - { - "epoch": 0.326561324303988, - "grad_norm": 0.0062565989792346954, - "learning_rate": 1.5183100576370291e-05, - "loss": 0.0003, - "step": 434 - }, - { - "epoch": 0.327313769751693, - "grad_norm": 2.2504420280456543, - "learning_rate": 1.5162870407727922e-05, - "loss": 0.1951, - "step": 435 - }, - { - "epoch": 0.32806621519939805, - "grad_norm": 1.0565892457962036, - "learning_rate": 1.5142611389418505e-05, - "loss": 0.0199, - "step": 436 - }, - { - "epoch": 0.3288186606471031, - "grad_norm": 1.0833739042282104, - "learning_rate": 1.512232363464766e-05, - "loss": 0.0284, - "step": 437 - }, - { - "epoch": 0.3295711060948081, - "grad_norm": 0.32872238755226135, - "learning_rate": 1.5102007256781583e-05, - "loss": 0.0861, - "step": 438 - }, - { - "epoch": 0.3303235515425132, - "grad_norm": 0.08302167803049088, - "learning_rate": 1.5081662369346412e-05, - "loss": 0.0027, - "step": 439 - }, - { - "epoch": 0.3310759969902182, - "grad_norm": 0.01665945164859295, - "learning_rate": 1.5061289086027593e-05, - "loss": 0.0009, - "step": 440 - }, - { - "epoch": 0.33182844243792325, - "grad_norm": 0.00258431863039732, - "learning_rate": 1.5040887520669245e-05, - "loss": 0.0001, - "step": 441 - }, - { - "epoch": 0.3325808878856283, - "grad_norm": 0.11993154883384705, - "learning_rate": 1.502045778727353e-05, - "loss": 0.0023, - "step": 442 - }, - { - "epoch": 0.3333333333333333, - "grad_norm": 10.732891082763672, - "learning_rate": 1.5000000000000002e-05, - "loss": 0.0681, - "step": 443 - }, - { - "epoch": 0.3340857787810384, - "grad_norm": 0.0034522817004472017, - "learning_rate": 1.497951427316498e-05, - "loss": 0.0002, - "step": 444 - }, - { - "epoch": 0.33483822422874343, - "grad_norm": 0.0015809908509254456, - "learning_rate": 1.495900072124092e-05, - "loss": 0.0001, - "step": 445 - }, - { - "epoch": 0.33559066967644846, - "grad_norm": 0.006714050658047199, - "learning_rate": 1.4938459458855739e-05, - "loss": 0.0004, - "step": 446 - }, - { - "epoch": 0.3363431151241535, - "grad_norm": 0.010480109602212906, - "learning_rate": 1.4917890600792215e-05, - "loss": 0.0006, - "step": 447 - }, - { - "epoch": 0.3370955605718585, - "grad_norm": 0.06001771241426468, - "learning_rate": 1.4897294261987325e-05, - "loss": 0.0021, - "step": 448 - }, - { - "epoch": 0.3378480060195636, - "grad_norm": 0.0026993490755558014, - "learning_rate": 1.4876670557531598e-05, - "loss": 0.0001, - "step": 449 - }, - { - "epoch": 0.33860045146726864, - "grad_norm": 1.938813328742981, - "learning_rate": 1.485601960266849e-05, - "loss": 0.0378, - "step": 450 - }, - { - "epoch": 0.33935289691497367, - "grad_norm": 0.004080132115632296, - "learning_rate": 1.4835341512793727e-05, - "loss": 0.0002, - "step": 451 - }, - { - "epoch": 0.3401053423626787, - "grad_norm": 3.724914073944092, - "learning_rate": 1.4814636403454656e-05, - "loss": 0.1663, - "step": 452 - }, - { - "epoch": 0.34085778781038373, - "grad_norm": 0.6385138034820557, - "learning_rate": 1.4793904390349618e-05, - "loss": 0.1188, - "step": 453 - }, - { - "epoch": 0.3416102332580888, - "grad_norm": 4.093441963195801, - "learning_rate": 1.477314558932728e-05, - "loss": 0.0387, - "step": 454 - }, - { - "epoch": 0.34236267870579384, - "grad_norm": 0.028323644772171974, - "learning_rate": 1.4752360116386002e-05, - "loss": 0.0012, - "step": 455 - }, - { - "epoch": 0.3431151241534989, - "grad_norm": 0.14701040089130402, - "learning_rate": 1.4731548087673186e-05, - "loss": 0.0046, - "step": 456 - }, - { - "epoch": 0.3438675696012039, - "grad_norm": 0.22901006042957306, - "learning_rate": 1.4710709619484623e-05, - "loss": 0.0061, - "step": 457 - }, - { - "epoch": 0.34462001504890893, - "grad_norm": 0.007301859557628632, - "learning_rate": 1.4689844828263846e-05, - "loss": 0.0003, - "step": 458 - }, - { - "epoch": 0.345372460496614, - "grad_norm": 0.005133870989084244, - "learning_rate": 1.4668953830601473e-05, - "loss": 0.0002, - "step": 459 - }, - { - "epoch": 0.34612490594431905, - "grad_norm": 0.003975029569119215, - "learning_rate": 1.4648036743234573e-05, - "loss": 0.0002, - "step": 460 - }, - { - "epoch": 0.3468773513920241, - "grad_norm": 0.012133643962442875, - "learning_rate": 1.4627093683045997e-05, - "loss": 0.0006, - "step": 461 - }, - { - "epoch": 0.3476297968397291, - "grad_norm": 0.00555196451023221, - "learning_rate": 1.4606124767063721e-05, - "loss": 0.0003, - "step": 462 - }, - { - "epoch": 0.34838224228743414, - "grad_norm": 0.001998197054490447, - "learning_rate": 1.4585130112460214e-05, - "loss": 0.0001, - "step": 463 - }, - { - "epoch": 0.3491346877351392, - "grad_norm": 0.4198460578918457, - "learning_rate": 1.4564109836551763e-05, - "loss": 0.0066, - "step": 464 - }, - { - "epoch": 0.34988713318284426, - "grad_norm": 0.5744123458862305, - "learning_rate": 1.4543064056797826e-05, - "loss": 0.0147, - "step": 465 - }, - { - "epoch": 0.3506395786305493, - "grad_norm": 0.0055709234438836575, - "learning_rate": 1.4521992890800379e-05, - "loss": 0.0002, - "step": 466 - }, - { - "epoch": 0.3513920240782543, - "grad_norm": 0.0019197987858206034, - "learning_rate": 1.4500896456303241e-05, - "loss": 0.0001, - "step": 467 - }, - { - "epoch": 0.35214446952595935, - "grad_norm": 0.8345107436180115, - "learning_rate": 1.4479774871191447e-05, - "loss": 0.1345, - "step": 468 - }, - { - "epoch": 0.35289691497366443, - "grad_norm": 0.6766433715820312, - "learning_rate": 1.4458628253490555e-05, - "loss": 0.179, - "step": 469 - }, - { - "epoch": 0.35364936042136946, - "grad_norm": 0.014497430063784122, - "learning_rate": 1.4437456721366013e-05, - "loss": 0.0007, - "step": 470 - }, - { - "epoch": 0.3544018058690745, - "grad_norm": 0.11778023838996887, - "learning_rate": 1.4416260393122487e-05, - "loss": 0.0046, - "step": 471 - }, - { - "epoch": 0.3551542513167795, - "grad_norm": 0.47652357816696167, - "learning_rate": 1.4395039387203197e-05, - "loss": 0.0069, - "step": 472 - }, - { - "epoch": 0.35590669676448455, - "grad_norm": 0.3849281668663025, - "learning_rate": 1.4373793822189266e-05, - "loss": 0.11, - "step": 473 - }, - { - "epoch": 0.35665914221218964, - "grad_norm": 2.7059948444366455, - "learning_rate": 1.4352523816799046e-05, - "loss": 0.1874, - "step": 474 - }, - { - "epoch": 0.35741158765989467, - "grad_norm": 0.7493427395820618, - "learning_rate": 1.4331229489887463e-05, - "loss": 0.0055, - "step": 475 - }, - { - "epoch": 0.3581640331075997, - "grad_norm": 0.002171468921005726, - "learning_rate": 1.430991096044535e-05, - "loss": 0.0001, - "step": 476 - }, - { - "epoch": 0.35891647855530473, - "grad_norm": 12.364990234375, - "learning_rate": 1.4288568347598777e-05, - "loss": 0.0784, - "step": 477 - }, - { - "epoch": 0.35966892400300976, - "grad_norm": 0.022090526297688484, - "learning_rate": 1.4267201770608392e-05, - "loss": 0.0008, - "step": 478 - }, - { - "epoch": 0.36042136945071485, - "grad_norm": 1.5892497301101685, - "learning_rate": 1.4245811348868753e-05, - "loss": 0.0918, - "step": 479 - }, - { - "epoch": 0.3611738148984199, - "grad_norm": 0.005540026817470789, - "learning_rate": 1.4224397201907664e-05, - "loss": 0.0003, - "step": 480 - }, - { - "epoch": 0.3619262603461249, - "grad_norm": 0.06051875278353691, - "learning_rate": 1.42029594493855e-05, - "loss": 0.0014, - "step": 481 - }, - { - "epoch": 0.36267870579382994, - "grad_norm": 0.011512890458106995, - "learning_rate": 1.418149821109454e-05, - "loss": 0.0005, - "step": 482 - }, - { - "epoch": 0.36343115124153497, - "grad_norm": 0.002135386224836111, - "learning_rate": 1.4160013606958303e-05, - "loss": 0.0001, - "step": 483 - }, - { - "epoch": 0.36418359668924005, - "grad_norm": 0.018821068108081818, - "learning_rate": 1.4138505757030869e-05, - "loss": 0.0004, - "step": 484 - }, - { - "epoch": 0.3649360421369451, - "grad_norm": 0.0008346544927917421, - "learning_rate": 1.411697478149622e-05, - "loss": 0.0, - "step": 485 - }, - { - "epoch": 0.3656884875846501, - "grad_norm": 0.18350642919540405, - "learning_rate": 1.409542080066756e-05, - "loss": 0.004, - "step": 486 - }, - { - "epoch": 0.36644093303235514, - "grad_norm": 2.6405723094940186, - "learning_rate": 1.4073843934986644e-05, - "loss": 0.3307, - "step": 487 - }, - { - "epoch": 0.3671933784800602, - "grad_norm": 0.0034803429152816534, - "learning_rate": 1.4052244305023101e-05, - "loss": 0.0001, - "step": 488 - }, - { - "epoch": 0.36794582392776526, - "grad_norm": 0.006721539422869682, - "learning_rate": 1.403062203147377e-05, - "loss": 0.0003, - "step": 489 - }, - { - "epoch": 0.3686982693754703, - "grad_norm": 0.6599304676055908, - "learning_rate": 1.4008977235162024e-05, - "loss": 0.0107, - "step": 490 - }, - { - "epoch": 0.3694507148231753, - "grad_norm": 0.05510720983147621, - "learning_rate": 1.3987310037037081e-05, - "loss": 0.0022, - "step": 491 - }, - { - "epoch": 0.37020316027088035, - "grad_norm": 0.0020602026488631964, - "learning_rate": 1.3965620558173345e-05, - "loss": 0.0001, - "step": 492 - }, - { - "epoch": 0.3709556057185854, - "grad_norm": 3.3849992752075195, - "learning_rate": 1.3943908919769724e-05, - "loss": 0.0947, - "step": 493 - }, - { - "epoch": 0.37170805116629047, - "grad_norm": 0.0011134468950331211, - "learning_rate": 1.3922175243148948e-05, - "loss": 0.0001, - "step": 494 - }, - { - "epoch": 0.3724604966139955, - "grad_norm": 0.4826012849807739, - "learning_rate": 1.3900419649756895e-05, - "loss": 0.0075, - "step": 495 - }, - { - "epoch": 0.3732129420617005, - "grad_norm": 1.0771576166152954, - "learning_rate": 1.3878642261161916e-05, - "loss": 0.015, - "step": 496 - }, - { - "epoch": 0.37396538750940556, - "grad_norm": 0.14827294647693634, - "learning_rate": 1.3856843199054144e-05, - "loss": 0.0036, - "step": 497 - }, - { - "epoch": 0.3747178329571106, - "grad_norm": 0.0043605901300907135, - "learning_rate": 1.3835022585244829e-05, - "loss": 0.0002, - "step": 498 - }, - { - "epoch": 0.37547027840481567, - "grad_norm": 0.05336622893810272, - "learning_rate": 1.3813180541665646e-05, - "loss": 0.0025, - "step": 499 - }, - { - "epoch": 0.3762227238525207, - "grad_norm": 0.00369238737039268, - "learning_rate": 1.3791317190368018e-05, - "loss": 0.0002, - "step": 500 - }, - { - "epoch": 0.37697516930022573, - "grad_norm": 1.3321623802185059, - "learning_rate": 1.3769432653522436e-05, - "loss": 0.0112, - "step": 501 - }, - { - "epoch": 0.37772761474793076, - "grad_norm": 0.0012655869359150529, - "learning_rate": 1.3747527053417776e-05, - "loss": 0.0001, - "step": 502 - }, - { - "epoch": 0.3784800601956358, - "grad_norm": 0.0007887445390224457, - "learning_rate": 1.3725600512460606e-05, - "loss": 0.0, - "step": 503 - }, - { - "epoch": 0.3792325056433409, - "grad_norm": 0.1122029721736908, - "learning_rate": 1.3703653153174513e-05, - "loss": 0.0007, - "step": 504 - }, - { - "epoch": 0.3799849510910459, - "grad_norm": 2.8246965408325195, - "learning_rate": 1.3681685098199418e-05, - "loss": 0.2841, - "step": 505 - }, - { - "epoch": 0.38073739653875094, - "grad_norm": 0.006240316201001406, - "learning_rate": 1.3659696470290888e-05, - "loss": 0.0002, - "step": 506 - }, - { - "epoch": 0.38148984198645597, - "grad_norm": 0.15288379788398743, - "learning_rate": 1.3637687392319443e-05, - "loss": 0.0032, - "step": 507 - }, - { - "epoch": 0.382242287434161, - "grad_norm": 0.005582885816693306, - "learning_rate": 1.3615657987269882e-05, - "loss": 0.0002, - "step": 508 - }, - { - "epoch": 0.3829947328818661, - "grad_norm": 0.00010864822252187878, - "learning_rate": 1.3593608378240587e-05, - "loss": 0.0, - "step": 509 - }, - { - "epoch": 0.3837471783295711, - "grad_norm": 5.737289905548096, - "learning_rate": 1.3571538688442843e-05, - "loss": 0.1844, - "step": 510 - }, - { - "epoch": 0.38449962377727614, - "grad_norm": 0.002173966495320201, - "learning_rate": 1.3549449041200138e-05, - "loss": 0.0001, - "step": 511 - }, - { - "epoch": 0.3852520692249812, - "grad_norm": 0.8293859958648682, - "learning_rate": 1.3527339559947483e-05, - "loss": 0.1965, - "step": 512 - }, - { - "epoch": 0.3860045146726862, - "grad_norm": 0.0026998259127140045, - "learning_rate": 1.3505210368230723e-05, - "loss": 0.0001, - "step": 513 - }, - { - "epoch": 0.3867569601203913, - "grad_norm": 0.00044145798892714083, - "learning_rate": 1.3483061589705839e-05, - "loss": 0.0, - "step": 514 - }, - { - "epoch": 0.3875094055680963, - "grad_norm": 0.004763697739690542, - "learning_rate": 1.3460893348138262e-05, - "loss": 0.0001, - "step": 515 - }, - { - "epoch": 0.38826185101580135, - "grad_norm": 9.023599704960361e-05, - "learning_rate": 1.3438705767402185e-05, - "loss": 0.0, - "step": 516 - }, - { - "epoch": 0.3890142964635064, - "grad_norm": 0.0630866289138794, - "learning_rate": 1.341649897147986e-05, - "loss": 0.0006, - "step": 517 - }, - { - "epoch": 0.3897667419112114, - "grad_norm": 0.0039035833906382322, - "learning_rate": 1.3394273084460916e-05, - "loss": 0.0002, - "step": 518 - }, - { - "epoch": 0.3905191873589165, - "grad_norm": 3.1299986839294434, - "learning_rate": 1.3372028230541658e-05, - "loss": 0.4068, - "step": 519 - }, - { - "epoch": 0.3912716328066215, - "grad_norm": 0.0035276322159916162, - "learning_rate": 1.3349764534024385e-05, - "loss": 0.0002, - "step": 520 - }, - { - "epoch": 0.39202407825432656, - "grad_norm": 0.5055978894233704, - "learning_rate": 1.3327482119316674e-05, - "loss": 0.0223, - "step": 521 - }, - { - "epoch": 0.3927765237020316, - "grad_norm": 0.0033338728826493025, - "learning_rate": 1.330518111093071e-05, - "loss": 0.0002, - "step": 522 - }, - { - "epoch": 0.3935289691497366, - "grad_norm": 0.06779402494430542, - "learning_rate": 1.3282861633482566e-05, - "loss": 0.003, - "step": 523 - }, - { - "epoch": 0.3942814145974417, - "grad_norm": 0.6969091892242432, - "learning_rate": 1.3260523811691527e-05, - "loss": 0.0111, - "step": 524 - }, - { - "epoch": 0.39503386004514673, - "grad_norm": 0.3388632535934448, - "learning_rate": 1.3238167770379384e-05, - "loss": 0.1329, - "step": 525 - }, - { - "epoch": 0.39578630549285176, - "grad_norm": 0.15506567060947418, - "learning_rate": 1.3215793634469733e-05, - "loss": 0.0035, - "step": 526 - }, - { - "epoch": 0.3965387509405568, - "grad_norm": 3.37923002243042, - "learning_rate": 1.3193401528987286e-05, - "loss": 0.3774, - "step": 527 - }, - { - "epoch": 0.3972911963882618, - "grad_norm": 0.6373834609985352, - "learning_rate": 1.3170991579057163e-05, - "loss": 0.0922, - "step": 528 - }, - { - "epoch": 0.3980436418359669, - "grad_norm": 0.23215818405151367, - "learning_rate": 1.3148563909904195e-05, - "loss": 0.007, - "step": 529 - }, - { - "epoch": 0.39879608728367194, - "grad_norm": 0.000551075441762805, - "learning_rate": 1.3126118646852235e-05, - "loss": 0.0, - "step": 530 - }, - { - "epoch": 0.39954853273137697, - "grad_norm": 2.5601372718811035, - "learning_rate": 1.3103655915323444e-05, - "loss": 0.0924, - "step": 531 - }, - { - "epoch": 0.400300978179082, - "grad_norm": 0.029645578935742378, - "learning_rate": 1.3081175840837595e-05, - "loss": 0.0007, - "step": 532 - }, - { - "epoch": 0.40105342362678703, - "grad_norm": 0.0015663238009437919, - "learning_rate": 1.3058678549011371e-05, - "loss": 0.0001, - "step": 533 - }, - { - "epoch": 0.4018058690744921, - "grad_norm": 7.516592025756836, - "learning_rate": 1.3036164165557667e-05, - "loss": 0.1748, - "step": 534 - }, - { - "epoch": 0.40255831452219715, - "grad_norm": 0.02340114861726761, - "learning_rate": 1.3013632816284885e-05, - "loss": 0.001, - "step": 535 - }, - { - "epoch": 0.4033107599699022, - "grad_norm": 0.04894443601369858, - "learning_rate": 1.2991084627096226e-05, - "loss": 0.0016, - "step": 536 - }, - { - "epoch": 0.4040632054176072, - "grad_norm": 0.053800590336322784, - "learning_rate": 1.2968519723988994e-05, - "loss": 0.0021, - "step": 537 - }, - { - "epoch": 0.40481565086531224, - "grad_norm": 0.01409213524311781, - "learning_rate": 1.2945938233053892e-05, - "loss": 0.0005, - "step": 538 - }, - { - "epoch": 0.4055680963130173, - "grad_norm": 0.4299611747264862, - "learning_rate": 1.2923340280474306e-05, - "loss": 0.0091, - "step": 539 - }, - { - "epoch": 0.40632054176072235, - "grad_norm": 3.0297110080718994, - "learning_rate": 1.2900725992525618e-05, - "loss": 0.1769, - "step": 540 - }, - { - "epoch": 0.4070729872084274, - "grad_norm": 0.012766638770699501, - "learning_rate": 1.2878095495574484e-05, - "loss": 0.0006, - "step": 541 - }, - { - "epoch": 0.4078254326561324, - "grad_norm": 0.019243692979216576, - "learning_rate": 1.285544891607813e-05, - "loss": 0.0007, - "step": 542 - }, - { - "epoch": 0.40857787810383744, - "grad_norm": 0.4208061099052429, - "learning_rate": 1.2832786380583664e-05, - "loss": 0.0103, - "step": 543 - }, - { - "epoch": 0.40933032355154253, - "grad_norm": 0.07195150852203369, - "learning_rate": 1.2810108015727345e-05, - "loss": 0.0022, - "step": 544 - }, - { - "epoch": 0.41008276899924756, - "grad_norm": 1.117520809173584, - "learning_rate": 1.2787413948233885e-05, - "loss": 0.0159, - "step": 545 - }, - { - "epoch": 0.4108352144469526, - "grad_norm": 0.15800736844539642, - "learning_rate": 1.2764704304915743e-05, - "loss": 0.0048, - "step": 546 - }, - { - "epoch": 0.4115876598946576, - "grad_norm": 0.026513785123825073, - "learning_rate": 1.2741979212672418e-05, - "loss": 0.0011, - "step": 547 - }, - { - "epoch": 0.4123401053423627, - "grad_norm": 0.27647465467453003, - "learning_rate": 1.2719238798489725e-05, - "loss": 0.0094, - "step": 548 - }, - { - "epoch": 0.41309255079006774, - "grad_norm": 0.003962705843150616, - "learning_rate": 1.2696483189439113e-05, - "loss": 0.0002, - "step": 549 - }, - { - "epoch": 0.41384499623777277, - "grad_norm": 0.006578361615538597, - "learning_rate": 1.2673712512676923e-05, - "loss": 0.0002, - "step": 550 - }, - { - "epoch": 0.4145974416854778, - "grad_norm": 0.4227232038974762, - "learning_rate": 1.2650926895443705e-05, - "loss": 0.0132, - "step": 551 - }, - { - "epoch": 0.4153498871331828, - "grad_norm": 0.1907581388950348, - "learning_rate": 1.2628126465063483e-05, - "loss": 0.0062, - "step": 552 - }, - { - "epoch": 0.4161023325808879, - "grad_norm": 0.02955441363155842, - "learning_rate": 1.2605311348943066e-05, - "loss": 0.0006, - "step": 553 - }, - { - "epoch": 0.41685477802859294, - "grad_norm": 0.582580029964447, - "learning_rate": 1.2582481674571325e-05, - "loss": 0.0115, - "step": 554 - }, - { - "epoch": 0.417607223476298, - "grad_norm": 0.0009048896026797593, - "learning_rate": 1.2559637569518472e-05, - "loss": 0.0, - "step": 555 - }, - { - "epoch": 0.418359668924003, - "grad_norm": 0.03770684078335762, - "learning_rate": 1.2536779161435368e-05, - "loss": 0.0006, - "step": 556 - }, - { - "epoch": 0.41911211437170803, - "grad_norm": 0.25430363416671753, - "learning_rate": 1.251390657805279e-05, - "loss": 0.1052, - "step": 557 - }, - { - "epoch": 0.4198645598194131, - "grad_norm": 0.8613071441650391, - "learning_rate": 1.2491019947180727e-05, - "loss": 0.0129, - "step": 558 - }, - { - "epoch": 0.42061700526711815, - "grad_norm": 0.00920679047703743, - "learning_rate": 1.2468119396707668e-05, - "loss": 0.0003, - "step": 559 - }, - { - "epoch": 0.4213694507148232, - "grad_norm": 0.49765467643737793, - "learning_rate": 1.2445205054599879e-05, - "loss": 0.0065, - "step": 560 - }, - { - "epoch": 0.4221218961625282, - "grad_norm": 0.006361035164445639, - "learning_rate": 1.2422277048900694e-05, - "loss": 0.0003, - "step": 561 - }, - { - "epoch": 0.42287434161023324, - "grad_norm": 0.3172746002674103, - "learning_rate": 1.23993355077298e-05, - "loss": 0.2152, - "step": 562 - }, - { - "epoch": 0.4236267870579383, - "grad_norm": 0.010659880936145782, - "learning_rate": 1.237638055928251e-05, - "loss": 0.0004, - "step": 563 - }, - { - "epoch": 0.42437923250564336, - "grad_norm": 0.18647299706935883, - "learning_rate": 1.2353412331829073e-05, - "loss": 0.0062, - "step": 564 - }, - { - "epoch": 0.4251316779533484, - "grad_norm": 7.033252716064453, - "learning_rate": 1.2330430953713921e-05, - "loss": 0.3394, - "step": 565 - }, - { - "epoch": 0.4258841234010534, - "grad_norm": 0.035930879414081573, - "learning_rate": 1.2307436553354985e-05, - "loss": 0.0009, - "step": 566 - }, - { - "epoch": 0.42663656884875845, - "grad_norm": 0.0031391908414661884, - "learning_rate": 1.2284429259242958e-05, - "loss": 0.0001, - "step": 567 - }, - { - "epoch": 0.42738901429646353, - "grad_norm": 9.076556205749512, - "learning_rate": 1.2261409199940574e-05, - "loss": 0.3117, - "step": 568 - }, - { - "epoch": 0.42814145974416856, - "grad_norm": 0.3694950044155121, - "learning_rate": 1.2238376504081913e-05, - "loss": 0.0936, - "step": 569 - }, - { - "epoch": 0.4288939051918736, - "grad_norm": 0.0012056067353114486, - "learning_rate": 1.221533130037166e-05, - "loss": 0.0, - "step": 570 - }, - { - "epoch": 0.4296463506395786, - "grad_norm": 3.5604007244110107, - "learning_rate": 1.2192273717584386e-05, - "loss": 0.0581, - "step": 571 - }, - { - "epoch": 0.43039879608728365, - "grad_norm": 0.005317636765539646, - "learning_rate": 1.2169203884563846e-05, - "loss": 0.0002, - "step": 572 - }, - { - "epoch": 0.43115124153498874, - "grad_norm": 0.413285493850708, - "learning_rate": 1.2146121930222241e-05, - "loss": 0.0035, - "step": 573 - }, - { - "epoch": 0.43190368698269377, - "grad_norm": 4.266297340393066, - "learning_rate": 1.2123027983539511e-05, - "loss": 0.1756, - "step": 574 - }, - { - "epoch": 0.4326561324303988, - "grad_norm": 10.099092483520508, - "learning_rate": 1.2099922173562602e-05, - "loss": 0.0599, - "step": 575 - }, - { - "epoch": 0.43340857787810383, - "grad_norm": 0.0031585677061229944, - "learning_rate": 1.2076804629404752e-05, - "loss": 0.0001, - "step": 576 - }, - { - "epoch": 0.43416102332580886, - "grad_norm": 0.13065661489963531, - "learning_rate": 1.2053675480244777e-05, - "loss": 0.0047, - "step": 577 - }, - { - "epoch": 0.43491346877351394, - "grad_norm": 8.336974143981934, - "learning_rate": 1.2030534855326326e-05, - "loss": 0.1777, - "step": 578 - }, - { - "epoch": 0.435665914221219, - "grad_norm": 0.007145630661398172, - "learning_rate": 1.2007382883957186e-05, - "loss": 0.0003, - "step": 579 - }, - { - "epoch": 0.436418359668924, - "grad_norm": 1.3916585445404053, - "learning_rate": 1.1984219695508546e-05, - "loss": 0.0053, - "step": 580 - }, - { - "epoch": 0.43717080511662904, - "grad_norm": 19.2365665435791, - "learning_rate": 1.1961045419414264e-05, - "loss": 0.0929, - "step": 581 - }, - { - "epoch": 0.43792325056433407, - "grad_norm": 1.4760831618332304e-05, - "learning_rate": 1.1937860185170164e-05, - "loss": 0.0, - "step": 582 - }, - { - "epoch": 0.43867569601203915, - "grad_norm": 0.003603702411055565, - "learning_rate": 1.1914664122333305e-05, - "loss": 0.0001, - "step": 583 - }, - { - "epoch": 0.4394281414597442, - "grad_norm": 2.7130890885018744e-05, - "learning_rate": 1.1891457360521253e-05, - "loss": 0.0, - "step": 584 - }, - { - "epoch": 0.4401805869074492, - "grad_norm": 0.004726966377347708, - "learning_rate": 1.1868240029411351e-05, - "loss": 0.0002, - "step": 585 - }, - { - "epoch": 0.44093303235515424, - "grad_norm": 0.007764340378344059, - "learning_rate": 1.1845012258740016e-05, - "loss": 0.0002, - "step": 586 - }, - { - "epoch": 0.44168547780285927, - "grad_norm": 0.003988485783338547, - "learning_rate": 1.182177417830199e-05, - "loss": 0.0001, - "step": 587 - }, - { - "epoch": 0.44243792325056436, - "grad_norm": 0.013868695124983788, - "learning_rate": 1.1798525917949626e-05, - "loss": 0.0004, - "step": 588 - }, - { - "epoch": 0.4431903686982694, - "grad_norm": 0.4925473630428314, - "learning_rate": 1.177526760759217e-05, - "loss": 0.0091, - "step": 589 - }, - { - "epoch": 0.4439428141459744, - "grad_norm": 0.26699769496917725, - "learning_rate": 1.1751999377195014e-05, - "loss": 0.0881, - "step": 590 - }, - { - "epoch": 0.44469525959367945, - "grad_norm": 2.3798282146453857, - "learning_rate": 1.1728721356778994e-05, - "loss": 0.0538, - "step": 591 - }, - { - "epoch": 0.4454477050413845, - "grad_norm": 0.001672322629019618, - "learning_rate": 1.1705433676419644e-05, - "loss": 0.0, - "step": 592 - }, - { - "epoch": 0.44620015048908956, - "grad_norm": 0.005258211866021156, - "learning_rate": 1.168213646624648e-05, - "loss": 0.0002, - "step": 593 - }, - { - "epoch": 0.4469525959367946, - "grad_norm": 0.10926241427659988, - "learning_rate": 1.1658829856442269e-05, - "loss": 0.0013, - "step": 594 - }, - { - "epoch": 0.4477050413844996, - "grad_norm": 0.0057695843279361725, - "learning_rate": 1.1635513977242304e-05, - "loss": 0.0002, - "step": 595 - }, - { - "epoch": 0.44845748683220465, - "grad_norm": 0.059808410704135895, - "learning_rate": 1.1612188958933673e-05, - "loss": 0.0007, - "step": 596 - }, - { - "epoch": 0.4492099322799097, - "grad_norm": 0.0013155878987163305, - "learning_rate": 1.158885493185453e-05, - "loss": 0.0, - "step": 597 - }, - { - "epoch": 0.44996237772761477, - "grad_norm": 10.205440521240234, - "learning_rate": 1.1565512026393371e-05, - "loss": 0.2459, - "step": 598 - }, - { - "epoch": 0.4507148231753198, - "grad_norm": 0.0017919522942975163, - "learning_rate": 1.1542160372988312e-05, - "loss": 0.0001, - "step": 599 - }, - { - "epoch": 0.45146726862302483, - "grad_norm": 0.0012058173306286335, - "learning_rate": 1.1518800102126334e-05, - "loss": 0.0, - "step": 600 - }, - { - "epoch": 0.45221971407072986, - "grad_norm": 0.0011316149029880762, - "learning_rate": 1.149543134434259e-05, - "loss": 0.0001, - "step": 601 - }, - { - "epoch": 0.4529721595184349, - "grad_norm": 0.03514290601015091, - "learning_rate": 1.1472054230219644e-05, - "loss": 0.001, - "step": 602 - }, - { - "epoch": 0.45372460496614, - "grad_norm": 0.742960512638092, - "learning_rate": 1.1448668890386765e-05, - "loss": 0.0066, - "step": 603 - }, - { - "epoch": 0.454477050413845, - "grad_norm": 3.21566104888916, - "learning_rate": 1.1425275455519176e-05, - "loss": 0.3373, - "step": 604 - }, - { - "epoch": 0.45522949586155004, - "grad_norm": 0.0010515082394704223, - "learning_rate": 1.140187405633734e-05, - "loss": 0.0, - "step": 605 - }, - { - "epoch": 0.45598194130925507, - "grad_norm": 0.431357204914093, - "learning_rate": 1.1378464823606228e-05, - "loss": 0.0023, - "step": 606 - }, - { - "epoch": 0.4567343867569601, - "grad_norm": 0.43157681822776794, - "learning_rate": 1.1355047888134571e-05, - "loss": 0.0708, - "step": 607 - }, - { - "epoch": 0.4574868322046652, - "grad_norm": 0.0020544701255857944, - "learning_rate": 1.1331623380774156e-05, - "loss": 0.0001, - "step": 608 - }, - { - "epoch": 0.4582392776523702, - "grad_norm": 2.6385327146272175e-05, - "learning_rate": 1.1308191432419078e-05, - "loss": 0.0, - "step": 609 - }, - { - "epoch": 0.45899172310007524, - "grad_norm": 0.006066860631108284, - "learning_rate": 1.1284752174005005e-05, - "loss": 0.0002, - "step": 610 - }, - { - "epoch": 0.4597441685477803, - "grad_norm": 0.9779205322265625, - "learning_rate": 1.1261305736508458e-05, - "loss": 0.1798, - "step": 611 - }, - { - "epoch": 0.4604966139954853, - "grad_norm": 0.0006280777743086219, - "learning_rate": 1.1237852250946077e-05, - "loss": 0.0, - "step": 612 - }, - { - "epoch": 0.4612490594431904, - "grad_norm": 1.0666285753250122, - "learning_rate": 1.1214391848373876e-05, - "loss": 0.0166, - "step": 613 - }, - { - "epoch": 0.4620015048908954, - "grad_norm": 0.018536796793341637, - "learning_rate": 1.1190924659886532e-05, - "loss": 0.0006, - "step": 614 - }, - { - "epoch": 0.46275395033860045, - "grad_norm": 0.17641091346740723, - "learning_rate": 1.1167450816616639e-05, - "loss": 0.0047, - "step": 615 - }, - { - "epoch": 0.4635063957863055, - "grad_norm": 0.02300580032169819, - "learning_rate": 1.1143970449733968e-05, - "loss": 0.0007, - "step": 616 - }, - { - "epoch": 0.4642588412340105, - "grad_norm": 0.22667698562145233, - "learning_rate": 1.112048369044475e-05, - "loss": 0.0041, - "step": 617 - }, - { - "epoch": 0.4650112866817156, - "grad_norm": 0.0003491557145025581, - "learning_rate": 1.1096990669990942e-05, - "loss": 0.0, - "step": 618 - }, - { - "epoch": 0.4657637321294206, - "grad_norm": 8.98726939340122e-05, - "learning_rate": 1.1073491519649475e-05, - "loss": 0.0, - "step": 619 - }, - { - "epoch": 0.46651617757712566, - "grad_norm": 0.23045086860656738, - "learning_rate": 1.1049986370731545e-05, - "loss": 0.0034, - "step": 620 - }, - { - "epoch": 0.4672686230248307, - "grad_norm": 7.242973327636719, - "learning_rate": 1.102647535458186e-05, - "loss": 0.3392, - "step": 621 - }, - { - "epoch": 0.4680210684725357, - "grad_norm": 0.00343369715847075, - "learning_rate": 1.1002958602577922e-05, - "loss": 0.0001, - "step": 622 - }, - { - "epoch": 0.4687735139202408, - "grad_norm": 0.001794908195734024, - "learning_rate": 1.0979436246129267e-05, - "loss": 0.0001, - "step": 623 - }, - { - "epoch": 0.46952595936794583, - "grad_norm": 0.040094684809446335, - "learning_rate": 1.0955908416676772e-05, - "loss": 0.0014, - "step": 624 - }, - { - "epoch": 0.47027840481565086, - "grad_norm": 1.4908933735569008e-05, - "learning_rate": 1.093237524569188e-05, - "loss": 0.0, - "step": 625 - }, - { - "epoch": 0.4710308502633559, - "grad_norm": 0.007998337037861347, - "learning_rate": 1.0908836864675884e-05, - "loss": 0.0004, - "step": 626 - }, - { - "epoch": 0.4717832957110609, - "grad_norm": 0.039706043899059296, - "learning_rate": 1.0885293405159196e-05, - "loss": 0.0013, - "step": 627 - }, - { - "epoch": 0.472535741158766, - "grad_norm": 0.002229271689429879, - "learning_rate": 1.0861744998700603e-05, - "loss": 0.0001, - "step": 628 - }, - { - "epoch": 0.47328818660647104, - "grad_norm": 10.943811416625977, - "learning_rate": 1.0838191776886531e-05, - "loss": 0.0575, - "step": 629 - }, - { - "epoch": 0.47404063205417607, - "grad_norm": 1.3392058610916138, - "learning_rate": 1.0814633871330323e-05, - "loss": 0.1315, - "step": 630 - }, - { - "epoch": 0.4747930775018811, - "grad_norm": 0.21532350778579712, - "learning_rate": 1.0791071413671487e-05, - "loss": 0.0045, - "step": 631 - }, - { - "epoch": 0.47554552294958613, - "grad_norm": 0.27968382835388184, - "learning_rate": 1.0767504535574971e-05, - "loss": 0.0037, - "step": 632 - }, - { - "epoch": 0.4762979683972912, - "grad_norm": 0.000970702909398824, - "learning_rate": 1.0743933368730417e-05, - "loss": 0.0, - "step": 633 - }, - { - "epoch": 0.47705041384499625, - "grad_norm": 0.013053186237812042, - "learning_rate": 1.0720358044851448e-05, - "loss": 0.0005, - "step": 634 - }, - { - "epoch": 0.4778028592927013, - "grad_norm": 0.01000749971717596, - "learning_rate": 1.0696778695674899e-05, - "loss": 0.0004, - "step": 635 - }, - { - "epoch": 0.4785553047404063, - "grad_norm": 0.007056588772684336, - "learning_rate": 1.0673195452960107e-05, - "loss": 0.0002, - "step": 636 - }, - { - "epoch": 0.47930775018811134, - "grad_norm": 0.014011607505381107, - "learning_rate": 1.0649608448488166e-05, - "loss": 0.0004, - "step": 637 - }, - { - "epoch": 0.4800601956358164, - "grad_norm": 0.014452227391302586, - "learning_rate": 1.0626017814061186e-05, - "loss": 0.0004, - "step": 638 - }, - { - "epoch": 0.48081264108352145, - "grad_norm": 0.10185810178518295, - "learning_rate": 1.0602423681501564e-05, - "loss": 0.0018, - "step": 639 - }, - { - "epoch": 0.4815650865312265, - "grad_norm": 0.0015508810756728053, - "learning_rate": 1.0578826182651243e-05, - "loss": 0.0001, - "step": 640 - }, - { - "epoch": 0.4823175319789315, - "grad_norm": 4.458479404449463, - "learning_rate": 1.055522544937098e-05, - "loss": 0.2037, - "step": 641 - }, - { - "epoch": 0.48306997742663654, - "grad_norm": 0.028539393097162247, - "learning_rate": 1.0531621613539598e-05, - "loss": 0.0004, - "step": 642 - }, - { - "epoch": 0.48382242287434163, - "grad_norm": 0.02204856462776661, - "learning_rate": 1.0508014807053261e-05, - "loss": 0.0006, - "step": 643 - }, - { - "epoch": 0.48457486832204666, - "grad_norm": 0.0006058391300030053, - "learning_rate": 1.0484405161824743e-05, - "loss": 0.0, - "step": 644 - }, - { - "epoch": 0.4853273137697517, - "grad_norm": 0.00671573868021369, - "learning_rate": 1.0460792809782659e-05, - "loss": 0.0002, - "step": 645 - }, - { - "epoch": 0.4860797592174567, - "grad_norm": 0.004197990987449884, - "learning_rate": 1.0437177882870768e-05, - "loss": 0.0001, - "step": 646 - }, - { - "epoch": 0.48683220466516175, - "grad_norm": 9.33966064453125, - "learning_rate": 1.0413560513047208e-05, - "loss": 0.0376, - "step": 647 - }, - { - "epoch": 0.48758465011286684, - "grad_norm": 0.0011327359825372696, - "learning_rate": 1.038994083228377e-05, - "loss": 0.0, - "step": 648 - }, - { - "epoch": 0.48833709556057187, - "grad_norm": 3.5501651763916016, - "learning_rate": 1.0366318972565162e-05, - "loss": 0.0192, - "step": 649 - }, - { - "epoch": 0.4890895410082769, - "grad_norm": 8.738161087036133, - "learning_rate": 1.0342695065888262e-05, - "loss": 0.2822, - "step": 650 - }, - { - "epoch": 0.4898419864559819, - "grad_norm": 0.0026988936588168144, - "learning_rate": 1.031906924426139e-05, - "loss": 0.0001, - "step": 651 - }, - { - "epoch": 0.49059443190368696, - "grad_norm": 0.011756551451981068, - "learning_rate": 1.0295441639703563e-05, - "loss": 0.0002, - "step": 652 - }, - { - "epoch": 0.49134687735139204, - "grad_norm": 0.04992348700761795, - "learning_rate": 1.027181238424376e-05, - "loss": 0.0009, - "step": 653 - }, - { - "epoch": 0.49209932279909707, - "grad_norm": 0.17335185408592224, - "learning_rate": 1.0248181609920198e-05, - "loss": 0.0035, - "step": 654 - }, - { - "epoch": 0.4928517682468021, - "grad_norm": 0.0004414380819071084, - "learning_rate": 1.0224549448779564e-05, - "loss": 0.0, - "step": 655 - }, - { - "epoch": 0.49360421369450713, - "grad_norm": 1.6693496704101562, - "learning_rate": 1.0200916032876303e-05, - "loss": 0.0686, - "step": 656 - }, - { - "epoch": 0.49435665914221216, - "grad_norm": 0.0005949286860413849, - "learning_rate": 1.0177281494271873e-05, - "loss": 0.0, - "step": 657 - }, - { - "epoch": 0.49510910458991725, - "grad_norm": 0.09166250377893448, - "learning_rate": 1.0153645965033998e-05, - "loss": 0.0029, - "step": 658 - }, - { - "epoch": 0.4958615500376223, - "grad_norm": 0.0018533958354964852, - "learning_rate": 1.0130009577235946e-05, - "loss": 0.0001, - "step": 659 - }, - { - "epoch": 0.4966139954853273, - "grad_norm": 0.004575713537633419, - "learning_rate": 1.010637246295578e-05, - "loss": 0.0002, - "step": 660 - }, - { - "epoch": 0.49736644093303234, - "grad_norm": 0.49716654419898987, - "learning_rate": 1.008273475427562e-05, - "loss": 0.0048, - "step": 661 - }, - { - "epoch": 0.49811888638073737, - "grad_norm": 1.1305954456329346, - "learning_rate": 1.0059096583280907e-05, - "loss": 0.0953, - "step": 662 - }, - { - "epoch": 0.49887133182844245, - "grad_norm": 0.001503349863924086, - "learning_rate": 1.0035458082059672e-05, - "loss": 0.0001, - "step": 663 - }, - { - "epoch": 0.4996237772761475, - "grad_norm": 0.010958625003695488, - "learning_rate": 1.0011819382701784e-05, - "loss": 0.0003, - "step": 664 - }, - { - "epoch": 0.5003762227238525, - "grad_norm": 0.0015685193939134479, - "learning_rate": 9.98818061729822e-06, - "loss": 0.0, - "step": 665 - }, - { - "epoch": 0.5011286681715575, - "grad_norm": 0.010029254481196404, - "learning_rate": 9.964541917940331e-06, - "loss": 0.0003, - "step": 666 - }, - { - "epoch": 0.5018811136192626, - "grad_norm": 0.004363952670246363, - "learning_rate": 9.940903416719097e-06, - "loss": 0.0001, - "step": 667 - }, - { - "epoch": 0.5026335590669676, - "grad_norm": 0.01587914675474167, - "learning_rate": 9.917265245724385e-06, - "loss": 0.0003, - "step": 668 - }, - { - "epoch": 0.5033860045146726, - "grad_norm": 0.005167737137526274, - "learning_rate": 9.893627537044223e-06, - "loss": 0.0002, - "step": 669 - }, - { - "epoch": 0.5041384499623778, - "grad_norm": 8.244839668273926, - "learning_rate": 9.869990422764056e-06, - "loss": 0.1592, - "step": 670 - }, - { - "epoch": 0.5048908954100828, - "grad_norm": 0.0, - "learning_rate": 9.846354034966003e-06, - "loss": 0.0, - "step": 671 - }, - { - "epoch": 0.5056433408577878, - "grad_norm": 18.1772518157959, - "learning_rate": 9.822718505728129e-06, - "loss": 0.3513, - "step": 672 - }, - { - "epoch": 0.5063957863054929, - "grad_norm": 0.007274544797837734, - "learning_rate": 9.7990839671237e-06, - "loss": 0.0002, - "step": 673 - }, - { - "epoch": 0.5071482317531979, - "grad_norm": 0.31394076347351074, - "learning_rate": 9.77545055122044e-06, - "loss": 0.1283, - "step": 674 - }, - { - "epoch": 0.5079006772009029, - "grad_norm": 0.03855385258793831, - "learning_rate": 9.751818390079805e-06, - "loss": 0.0003, - "step": 675 - }, - { - "epoch": 0.508653122648608, - "grad_norm": 0.000805127143394202, - "learning_rate": 9.728187615756243e-06, - "loss": 0.0, - "step": 676 - }, - { - "epoch": 0.509405568096313, - "grad_norm": 0.0, - "learning_rate": 9.704558360296444e-06, - "loss": 0.0, - "step": 677 - }, - { - "epoch": 0.510158013544018, - "grad_norm": 1.416871190071106, - "learning_rate": 9.680930755738616e-06, - "loss": 0.016, - "step": 678 - }, - { - "epoch": 0.510910458991723, - "grad_norm": 0.006285065319389105, - "learning_rate": 9.657304934111742e-06, - "loss": 0.0002, - "step": 679 - }, - { - "epoch": 0.5116629044394282, - "grad_norm": 1.3052698373794556, - "learning_rate": 9.633681027434838e-06, - "loss": 0.1261, - "step": 680 - }, - { - "epoch": 0.5124153498871332, - "grad_norm": 0.0009769117459654808, - "learning_rate": 9.61005916771623e-06, - "loss": 0.0, - "step": 681 - }, - { - "epoch": 0.5131677953348383, - "grad_norm": 0.008825716562569141, - "learning_rate": 9.586439486952796e-06, - "loss": 0.0003, - "step": 682 - }, - { - "epoch": 0.5139202407825433, - "grad_norm": 0.004172160290181637, - "learning_rate": 9.562822117129235e-06, - "loss": 0.0001, - "step": 683 - }, - { - "epoch": 0.5146726862302483, - "grad_norm": 0.03515918552875519, - "learning_rate": 9.539207190217343e-06, - "loss": 0.0009, - "step": 684 - }, - { - "epoch": 0.5154251316779533, - "grad_norm": 0.0004445746308192611, - "learning_rate": 9.51559483817526e-06, - "loss": 0.0, - "step": 685 - }, - { - "epoch": 0.5161775771256584, - "grad_norm": 0.002501540118828416, - "learning_rate": 9.491985192946742e-06, - "loss": 0.0001, - "step": 686 - }, - { - "epoch": 0.5169300225733634, - "grad_norm": 0.21380773186683655, - "learning_rate": 9.468378386460406e-06, - "loss": 0.0934, - "step": 687 - }, - { - "epoch": 0.5176824680210684, - "grad_norm": 0.005090567748993635, - "learning_rate": 9.444774550629024e-06, - "loss": 0.0002, - "step": 688 - }, - { - "epoch": 0.5184349134687735, - "grad_norm": 0.0015086415223777294, - "learning_rate": 9.42117381734876e-06, - "loss": 0.0001, - "step": 689 - }, - { - "epoch": 0.5191873589164786, - "grad_norm": 0.018775468692183495, - "learning_rate": 9.397576318498438e-06, - "loss": 0.0007, - "step": 690 - }, - { - "epoch": 0.5199398043641836, - "grad_norm": 0.8499560356140137, - "learning_rate": 9.373982185938815e-06, - "loss": 0.0083, - "step": 691 - }, - { - "epoch": 0.5206922498118887, - "grad_norm": 0.2695012092590332, - "learning_rate": 9.350391551511837e-06, - "loss": 0.0015, - "step": 692 - }, - { - "epoch": 0.5214446952595937, - "grad_norm": 0.06739958375692368, - "learning_rate": 9.326804547039894e-06, - "loss": 0.0009, - "step": 693 - }, - { - "epoch": 0.5221971407072987, - "grad_norm": 1.989675521850586, - "learning_rate": 9.303221304325103e-06, - "loss": 0.2139, - "step": 694 - }, - { - "epoch": 0.5229495861550038, - "grad_norm": 0.04741863161325455, - "learning_rate": 9.279641955148553e-06, - "loss": 0.0013, - "step": 695 - }, - { - "epoch": 0.5237020316027088, - "grad_norm": 0.060340896248817444, - "learning_rate": 9.256066631269586e-06, - "loss": 0.0008, - "step": 696 - }, - { - "epoch": 0.5244544770504138, - "grad_norm": 2.983630895614624, - "learning_rate": 9.232495464425034e-06, - "loss": 0.4607, - "step": 697 - }, - { - "epoch": 0.5252069224981188, - "grad_norm": 0.0010294626699760556, - "learning_rate": 9.208928586328518e-06, - "loss": 0.0, - "step": 698 - }, - { - "epoch": 0.5259593679458239, - "grad_norm": 1.120234537665965e-05, - "learning_rate": 9.185366128669682e-06, - "loss": 0.0, - "step": 699 - }, - { - "epoch": 0.526711813393529, - "grad_norm": 2.128415107727051, - "learning_rate": 9.161808223113469e-06, - "loss": 0.1173, - "step": 700 - }, - { - "epoch": 0.527464258841234, - "grad_norm": 0.0016029436374083161, - "learning_rate": 9.138255001299402e-06, - "loss": 0.0001, - "step": 701 - }, - { - "epoch": 0.5282167042889391, - "grad_norm": 6.54170560836792, - "learning_rate": 9.114706594840806e-06, - "loss": 0.1784, - "step": 702 - }, - { - "epoch": 0.5289691497366441, - "grad_norm": 0.005230502225458622, - "learning_rate": 9.091163135324119e-06, - "loss": 0.0002, - "step": 703 - }, - { - "epoch": 0.5297215951843491, - "grad_norm": 0.028300784528255463, - "learning_rate": 9.067624754308124e-06, - "loss": 0.0004, - "step": 704 - }, - { - "epoch": 0.5304740406320542, - "grad_norm": 1.312003587372601e-05, - "learning_rate": 9.044091583323231e-06, - "loss": 0.0, - "step": 705 - }, - { - "epoch": 0.5312264860797592, - "grad_norm": 0.03969345614314079, - "learning_rate": 9.020563753870734e-06, - "loss": 0.0009, - "step": 706 - }, - { - "epoch": 0.5319789315274642, - "grad_norm": 29.23634910583496, - "learning_rate": 8.997041397422083e-06, - "loss": 0.2304, - "step": 707 - }, - { - "epoch": 0.5327313769751693, - "grad_norm": 0.08085115998983383, - "learning_rate": 8.973524645418142e-06, - "loss": 0.0021, - "step": 708 - }, - { - "epoch": 0.5334838224228743, - "grad_norm": 0.001823053928092122, - "learning_rate": 8.95001362926846e-06, - "loss": 0.0001, - "step": 709 - }, - { - "epoch": 0.5342362678705794, - "grad_norm": 0.0950389876961708, - "learning_rate": 8.926508480350525e-06, - "loss": 0.0008, - "step": 710 - }, - { - "epoch": 0.5349887133182845, - "grad_norm": 0.0005177335697226226, - "learning_rate": 8.903009330009063e-06, - "loss": 0.0, - "step": 711 - }, - { - "epoch": 0.5357411587659895, - "grad_norm": 0.007335309404879808, - "learning_rate": 8.879516309555252e-06, - "loss": 0.0003, - "step": 712 - }, - { - "epoch": 0.5364936042136945, - "grad_norm": 0.0022024691570550203, - "learning_rate": 8.856029550266036e-06, - "loss": 0.0001, - "step": 713 - }, - { - "epoch": 0.5372460496613995, - "grad_norm": 0.0016011092811822891, - "learning_rate": 8.832549183383363e-06, - "loss": 0.0001, - "step": 714 - }, - { - "epoch": 0.5379984951091046, - "grad_norm": 0.009527268819510937, - "learning_rate": 8.80907534011347e-06, - "loss": 0.0003, - "step": 715 - }, - { - "epoch": 0.5387509405568096, - "grad_norm": 0.04128224030137062, - "learning_rate": 8.785608151626126e-06, - "loss": 0.0012, - "step": 716 - }, - { - "epoch": 0.5395033860045146, - "grad_norm": 0.013059580698609352, - "learning_rate": 8.762147749053928e-06, - "loss": 0.0004, - "step": 717 - }, - { - "epoch": 0.5402558314522197, - "grad_norm": 0.0006950248498469591, - "learning_rate": 8.738694263491545e-06, - "loss": 0.0, - "step": 718 - }, - { - "epoch": 0.5410082768999247, - "grad_norm": 0.050438202917575836, - "learning_rate": 8.715247825995e-06, - "loss": 0.0014, - "step": 719 - }, - { - "epoch": 0.5417607223476298, - "grad_norm": 0.007887017913162708, - "learning_rate": 8.691808567580922e-06, - "loss": 0.0002, - "step": 720 - }, - { - "epoch": 0.5425131677953349, - "grad_norm": 1.9573593139648438, - "learning_rate": 8.668376619225846e-06, - "loss": 0.1971, - "step": 721 - }, - { - "epoch": 0.5432656132430399, - "grad_norm": 0.005204391200095415, - "learning_rate": 8.64495211186543e-06, - "loss": 0.0002, - "step": 722 - }, - { - "epoch": 0.5440180586907449, - "grad_norm": 0.10841142386198044, - "learning_rate": 8.621535176393776e-06, - "loss": 0.0026, - "step": 723 - }, - { - "epoch": 0.54477050413845, - "grad_norm": 0.006886586546897888, - "learning_rate": 8.598125943662662e-06, - "loss": 0.0003, - "step": 724 - }, - { - "epoch": 0.545522949586155, - "grad_norm": 1.904536485671997, - "learning_rate": 8.574724544480829e-06, - "loss": 0.0237, - "step": 725 - }, - { - "epoch": 0.54627539503386, - "grad_norm": 0.0, - "learning_rate": 8.551331109613238e-06, - "loss": 0.0, - "step": 726 - }, - { - "epoch": 0.547027840481565, - "grad_norm": 0.048206303268671036, - "learning_rate": 8.527945769780358e-06, - "loss": 0.0006, - "step": 727 - }, - { - "epoch": 0.5477802859292701, - "grad_norm": 0.05634288117289543, - "learning_rate": 8.504568655657415e-06, - "loss": 0.001, - "step": 728 - }, - { - "epoch": 0.5485327313769752, - "grad_norm": 0.43885576725006104, - "learning_rate": 8.481199897873667e-06, - "loss": 0.1386, - "step": 729 - }, - { - "epoch": 0.5492851768246803, - "grad_norm": 2.3144471645355225, - "learning_rate": 8.457839627011693e-06, - "loss": 0.0281, - "step": 730 - }, - { - "epoch": 0.5500376222723853, - "grad_norm": 0.016497652977705002, - "learning_rate": 8.43448797360663e-06, - "loss": 0.0004, - "step": 731 - }, - { - "epoch": 0.5507900677200903, - "grad_norm": 3.199171543121338, - "learning_rate": 8.411145068145474e-06, - "loss": 0.5294, - "step": 732 - }, - { - "epoch": 0.5515425131677953, - "grad_norm": 0.08361708372831345, - "learning_rate": 8.38781104106633e-06, - "loss": 0.0013, - "step": 733 - }, - { - "epoch": 0.5522949586155004, - "grad_norm": 0.011733477003872395, - "learning_rate": 8.3644860227577e-06, - "loss": 0.0004, - "step": 734 - }, - { - "epoch": 0.5530474040632054, - "grad_norm": 0.005462857894599438, - "learning_rate": 8.341170143557733e-06, - "loss": 0.0002, - "step": 735 - }, - { - "epoch": 0.5537998495109104, - "grad_norm": 0.0030956827104091644, - "learning_rate": 8.317863533753523e-06, - "loss": 0.0001, - "step": 736 - }, - { - "epoch": 0.5545522949586155, - "grad_norm": 1.1541496515274048, - "learning_rate": 8.294566323580359e-06, - "loss": 0.1397, - "step": 737 - }, - { - "epoch": 0.5553047404063205, - "grad_norm": 4.27673864364624, - "learning_rate": 8.27127864322101e-06, - "loss": 0.0181, - "step": 738 - }, - { - "epoch": 0.5560571858540256, - "grad_norm": 0.05562518537044525, - "learning_rate": 8.248000622804986e-06, - "loss": 0.0015, - "step": 739 - }, - { - "epoch": 0.5568096313017307, - "grad_norm": 1.1070342063903809, - "learning_rate": 8.224732392407834e-06, - "loss": 0.0464, - "step": 740 - }, - { - "epoch": 0.5575620767494357, - "grad_norm": 0.007482460699975491, - "learning_rate": 8.201474082050376e-06, - "loss": 0.0003, - "step": 741 - }, - { - "epoch": 0.5583145221971407, - "grad_norm": 0.007131911348551512, - "learning_rate": 8.178225821698013e-06, - "loss": 0.0003, - "step": 742 - }, - { - "epoch": 0.5590669676448458, - "grad_norm": 0.010571115650236607, - "learning_rate": 8.154987741259986e-06, - "loss": 0.0002, - "step": 743 - }, - { - "epoch": 0.5598194130925508, - "grad_norm": 0.007416147738695145, - "learning_rate": 8.13175997058865e-06, - "loss": 0.0003, - "step": 744 - }, - { - "epoch": 0.5605718585402558, - "grad_norm": 0.07424626499414444, - "learning_rate": 8.10854263947875e-06, - "loss": 0.002, - "step": 745 - }, - { - "epoch": 0.5613243039879608, - "grad_norm": 0.009504971094429493, - "learning_rate": 8.085335877666696e-06, - "loss": 0.0004, - "step": 746 - }, - { - "epoch": 0.5620767494356659, - "grad_norm": 0.019785739481449127, - "learning_rate": 8.062139814829839e-06, - "loss": 0.0006, - "step": 747 - }, - { - "epoch": 0.5628291948833709, - "grad_norm": 0.002929375506937504, - "learning_rate": 8.038954580585742e-06, - "loss": 0.0001, - "step": 748 - }, - { - "epoch": 0.563581640331076, - "grad_norm": 3.0780787467956543, - "learning_rate": 8.015780304491457e-06, - "loss": 0.069, - "step": 749 - }, - { - "epoch": 0.5643340857787811, - "grad_norm": 10.312975883483887, - "learning_rate": 7.992617116042813e-06, - "loss": 0.0922, - "step": 750 - }, - { - "epoch": 0.5650865312264861, - "grad_norm": 0.4201790690422058, - "learning_rate": 7.969465144673674e-06, - "loss": 0.0038, - "step": 751 - }, - { - "epoch": 0.5658389766741911, - "grad_norm": 0.24568204581737518, - "learning_rate": 7.946324519755225e-06, - "loss": 0.0055, - "step": 752 - }, - { - "epoch": 0.5665914221218962, - "grad_norm": 0.007804738823324442, - "learning_rate": 7.92319537059525e-06, - "loss": 0.0004, - "step": 753 - }, - { - "epoch": 0.5673438675696012, - "grad_norm": 0.002531126607209444, - "learning_rate": 7.900077826437402e-06, - "loss": 0.0001, - "step": 754 - }, - { - "epoch": 0.5680963130173062, - "grad_norm": 0.002385746920481324, - "learning_rate": 7.876972016460492e-06, - "loss": 0.0001, - "step": 755 - }, - { - "epoch": 0.5688487584650113, - "grad_norm": 0.009147096425294876, - "learning_rate": 7.853878069777762e-06, - "loss": 0.0004, - "step": 756 - }, - { - "epoch": 0.5696012039127163, - "grad_norm": 0.4006285071372986, - "learning_rate": 7.83079611543616e-06, - "loss": 0.0046, - "step": 757 - }, - { - "epoch": 0.5703536493604213, - "grad_norm": 3.700183868408203, - "learning_rate": 7.80772628241562e-06, - "loss": 0.1669, - "step": 758 - }, - { - "epoch": 0.5711060948081265, - "grad_norm": 0.007939686998724937, - "learning_rate": 7.784668699628345e-06, - "loss": 0.0003, - "step": 759 - }, - { - "epoch": 0.5718585402558315, - "grad_norm": 0.001986120129004121, - "learning_rate": 7.761623495918089e-06, - "loss": 0.0001, - "step": 760 - }, - { - "epoch": 0.5726109857035365, - "grad_norm": 0.0024444598238915205, - "learning_rate": 7.738590800059427e-06, - "loss": 0.0001, - "step": 761 - }, - { - "epoch": 0.5733634311512416, - "grad_norm": 0.0029928828589618206, - "learning_rate": 7.715570740757045e-06, - "loss": 0.0001, - "step": 762 - }, - { - "epoch": 0.5741158765989466, - "grad_norm": 0.02208460308611393, - "learning_rate": 7.692563446645017e-06, - "loss": 0.0009, - "step": 763 - }, - { - "epoch": 0.5748683220466516, - "grad_norm": 0.02059108205139637, - "learning_rate": 7.66956904628608e-06, - "loss": 0.0005, - "step": 764 - }, - { - "epoch": 0.5756207674943566, - "grad_norm": 4.884920120239258, - "learning_rate": 7.64658766817093e-06, - "loss": 0.0456, - "step": 765 - }, - { - "epoch": 0.5763732129420617, - "grad_norm": 0.04961561784148216, - "learning_rate": 7.623619440717493e-06, - "loss": 0.0016, - "step": 766 - }, - { - "epoch": 0.5771256583897667, - "grad_norm": 0.024083776399493217, - "learning_rate": 7.600664492270206e-06, - "loss": 0.0006, - "step": 767 - }, - { - "epoch": 0.5778781038374717, - "grad_norm": 0.017353009432554245, - "learning_rate": 7.57772295109931e-06, - "loss": 0.0006, - "step": 768 - }, - { - "epoch": 0.5786305492851769, - "grad_norm": 0.00047786516370251775, - "learning_rate": 7.554794945400122e-06, - "loss": 0.0, - "step": 769 - }, - { - "epoch": 0.5793829947328819, - "grad_norm": 0.2889784276485443, - "learning_rate": 7.531880603292333e-06, - "loss": 0.0052, - "step": 770 - }, - { - "epoch": 0.5801354401805869, - "grad_norm": 5.133121490478516, - "learning_rate": 7.508980052819274e-06, - "loss": 0.1127, - "step": 771 - }, - { - "epoch": 0.580887885628292, - "grad_norm": 0.06616966426372528, - "learning_rate": 7.486093421947214e-06, - "loss": 0.0023, - "step": 772 - }, - { - "epoch": 0.581640331075997, - "grad_norm": 0.0013333017705008388, - "learning_rate": 7.463220838564635e-06, - "loss": 0.0001, - "step": 773 - }, - { - "epoch": 0.582392776523702, - "grad_norm": 1.0273889303207397, - "learning_rate": 7.440362430481529e-06, - "loss": 0.08, - "step": 774 - }, - { - "epoch": 0.5831452219714071, - "grad_norm": 0.7515692710876465, - "learning_rate": 7.417518325428678e-06, - "loss": 0.11, - "step": 775 - }, - { - "epoch": 0.5838976674191121, - "grad_norm": 3.273358743172139e-05, - "learning_rate": 7.3946886510569385e-06, - "loss": 0.0, - "step": 776 - }, - { - "epoch": 0.5846501128668171, - "grad_norm": 3.785158634185791, - "learning_rate": 7.371873534936522e-06, - "loss": 0.448, - "step": 777 - }, - { - "epoch": 0.5854025583145221, - "grad_norm": 0.004100090358406305, - "learning_rate": 7.349073104556301e-06, - "loss": 0.0002, - "step": 778 - }, - { - "epoch": 0.5861550037622273, - "grad_norm": 0.28187495470046997, - "learning_rate": 7.326287487323078e-06, - "loss": 0.0012, - "step": 779 - }, - { - "epoch": 0.5869074492099323, - "grad_norm": 5.85366678237915, - "learning_rate": 7.3035168105608885e-06, - "loss": 0.0818, - "step": 780 - }, - { - "epoch": 0.5876598946576373, - "grad_norm": 0.005729475524276495, - "learning_rate": 7.280761201510275e-06, - "loss": 0.0002, - "step": 781 - }, - { - "epoch": 0.5884123401053424, - "grad_norm": 0.1717999428510666, - "learning_rate": 7.2580207873275865e-06, - "loss": 0.0992, - "step": 782 - }, - { - "epoch": 0.5891647855530474, - "grad_norm": 0.0329020619392395, - "learning_rate": 7.235295695084259e-06, - "loss": 0.0006, - "step": 783 - }, - { - "epoch": 0.5899172310007524, - "grad_norm": 0.012131314724683762, - "learning_rate": 7.212586051766118e-06, - "loss": 0.0003, - "step": 784 - }, - { - "epoch": 0.5906696764484575, - "grad_norm": 1.1857929229736328, - "learning_rate": 7.189891984272659e-06, - "loss": 0.0395, - "step": 785 - }, - { - "epoch": 0.5914221218961625, - "grad_norm": 0.0067533692345023155, - "learning_rate": 7.16721361941634e-06, - "loss": 0.0003, - "step": 786 - }, - { - "epoch": 0.5921745673438675, - "grad_norm": 0.0263225045055151, - "learning_rate": 7.144551083921875e-06, - "loss": 0.001, - "step": 787 - }, - { - "epoch": 0.5929270127915726, - "grad_norm": 2.086378812789917, - "learning_rate": 7.121904504425523e-06, - "loss": 0.0377, - "step": 788 - }, - { - "epoch": 0.5936794582392777, - "grad_norm": 2.842104196548462, - "learning_rate": 7.0992740074743835e-06, - "loss": 0.0224, - "step": 789 - }, - { - "epoch": 0.5944319036869827, - "grad_norm": 0.00352369318716228, - "learning_rate": 7.076659719525694e-06, - "loss": 0.0002, - "step": 790 - }, - { - "epoch": 0.5951843491346878, - "grad_norm": 0.02450047992169857, - "learning_rate": 7.05406176694611e-06, - "loss": 0.0006, - "step": 791 - }, - { - "epoch": 0.5959367945823928, - "grad_norm": 0.01613771915435791, - "learning_rate": 7.031480276011007e-06, - "loss": 0.0007, - "step": 792 - }, - { - "epoch": 0.5966892400300978, - "grad_norm": 16.005887985229492, - "learning_rate": 7.008915372903775e-06, - "loss": 0.1847, - "step": 793 - }, - { - "epoch": 0.5974416854778029, - "grad_norm": 0.023932024836540222, - "learning_rate": 6.986367183715117e-06, - "loss": 0.0009, - "step": 794 - }, - { - "epoch": 0.5981941309255079, - "grad_norm": 0.07864760607481003, - "learning_rate": 6.963835834442336e-06, - "loss": 0.0028, - "step": 795 - }, - { - "epoch": 0.5989465763732129, - "grad_norm": 0.09168053418397903, - "learning_rate": 6.941321450988633e-06, - "loss": 0.0023, - "step": 796 - }, - { - "epoch": 0.5996990218209179, - "grad_norm": 2.519742965698242, - "learning_rate": 6.918824159162409e-06, - "loss": 0.3657, - "step": 797 - }, - { - "epoch": 0.600451467268623, - "grad_norm": 0.7624035477638245, - "learning_rate": 6.89634408467656e-06, - "loss": 0.057, - "step": 798 - }, - { - "epoch": 0.6012039127163281, - "grad_norm": 3.3864188194274902, - "learning_rate": 6.873881353147766e-06, - "loss": 0.3926, - "step": 799 - }, - { - "epoch": 0.6019563581640331, - "grad_norm": 1.0742292404174805, - "learning_rate": 6.851436090095807e-06, - "loss": 0.0186, - "step": 800 - }, - { - "epoch": 0.6027088036117382, - "grad_norm": 3.5620031356811523, - "learning_rate": 6.829008420942842e-06, - "loss": 0.0536, - "step": 801 - }, - { - "epoch": 0.6034612490594432, - "grad_norm": 2.453146457672119, - "learning_rate": 6.806598471012717e-06, - "loss": 0.0645, - "step": 802 - }, - { - "epoch": 0.6042136945071482, - "grad_norm": 0.018604706972837448, - "learning_rate": 6.784206365530268e-06, - "loss": 0.0006, - "step": 803 - }, - { - "epoch": 0.6049661399548533, - "grad_norm": 0.6108080744743347, - "learning_rate": 6.761832229620618e-06, - "loss": 0.0065, - "step": 804 - }, - { - "epoch": 0.6057185854025583, - "grad_norm": 0.9389123916625977, - "learning_rate": 6.739476188308476e-06, - "loss": 0.0085, - "step": 805 - }, - { - "epoch": 0.6064710308502633, - "grad_norm": 0.0035854075103998184, - "learning_rate": 6.717138366517438e-06, - "loss": 0.0002, - "step": 806 - }, - { - "epoch": 0.6072234762979684, - "grad_norm": 0.003520939266309142, - "learning_rate": 6.694818889069294e-06, - "loss": 0.0002, - "step": 807 - }, - { - "epoch": 0.6079759217456734, - "grad_norm": 0.9055103659629822, - "learning_rate": 6.672517880683332e-06, - "loss": 0.0895, - "step": 808 - }, - { - "epoch": 0.6087283671933785, - "grad_norm": 0.1636822521686554, - "learning_rate": 6.6502354659756165e-06, - "loss": 0.0064, - "step": 809 - }, - { - "epoch": 0.6094808126410836, - "grad_norm": 4.655453681945801, - "learning_rate": 6.627971769458341e-06, - "loss": 0.1029, - "step": 810 - }, - { - "epoch": 0.6102332580887886, - "grad_norm": 0.3717281222343445, - "learning_rate": 6.605726915539088e-06, - "loss": 0.2568, - "step": 811 - }, - { - "epoch": 0.6109857035364936, - "grad_norm": 0.27854296565055847, - "learning_rate": 6.583501028520143e-06, - "loss": 0.107, - "step": 812 - }, - { - "epoch": 0.6117381489841986, - "grad_norm": 1.342350959777832, - "learning_rate": 6.561294232597817e-06, - "loss": 0.2153, - "step": 813 - }, - { - "epoch": 0.6124905944319037, - "grad_norm": 0.017684536054730415, - "learning_rate": 6.539106651861741e-06, - "loss": 0.0005, - "step": 814 - }, - { - "epoch": 0.6132430398796087, - "grad_norm": 0.005973528604954481, - "learning_rate": 6.516938410294165e-06, - "loss": 0.0002, - "step": 815 - }, - { - "epoch": 0.6139954853273137, - "grad_norm": 0.12207470089197159, - "learning_rate": 6.494789631769281e-06, - "loss": 0.0047, - "step": 816 - }, - { - "epoch": 0.6147479307750188, - "grad_norm": 0.2868960499763489, - "learning_rate": 6.472660440052521e-06, - "loss": 0.1167, - "step": 817 - }, - { - "epoch": 0.6155003762227238, - "grad_norm": 0.023519231006503105, - "learning_rate": 6.450550958799868e-06, - "loss": 0.0009, - "step": 818 - }, - { - "epoch": 0.6162528216704289, - "grad_norm": 0.001310076448135078, - "learning_rate": 6.428461311557159e-06, - "loss": 0.0001, - "step": 819 - }, - { - "epoch": 0.617005267118134, - "grad_norm": 0.009395822882652283, - "learning_rate": 6.406391621759416e-06, - "loss": 0.0005, - "step": 820 - }, - { - "epoch": 0.617757712565839, - "grad_norm": 0.0022374300751835108, - "learning_rate": 6.384342012730122e-06, - "loss": 0.0001, - "step": 821 - }, - { - "epoch": 0.618510158013544, - "grad_norm": 0.02684149146080017, - "learning_rate": 6.362312607680559e-06, - "loss": 0.001, - "step": 822 - }, - { - "epoch": 0.6192626034612491, - "grad_norm": 0.02339085564017296, - "learning_rate": 6.3403035297091145e-06, - "loss": 0.0011, - "step": 823 - }, - { - "epoch": 0.6200150489089541, - "grad_norm": 0.3724048435688019, - "learning_rate": 6.318314901800584e-06, - "loss": 0.1327, - "step": 824 - }, - { - "epoch": 0.6207674943566591, - "grad_norm": 0.10902447998523712, - "learning_rate": 6.29634684682549e-06, - "loss": 0.0025, - "step": 825 - }, - { - "epoch": 0.6215199398043642, - "grad_norm": 1.5880934000015259, - "learning_rate": 6.274399487539397e-06, - "loss": 0.09, - "step": 826 - }, - { - "epoch": 0.6222723852520692, - "grad_norm": 0.024421492591500282, - "learning_rate": 6.2524729465822265e-06, - "loss": 0.0008, - "step": 827 - }, - { - "epoch": 0.6230248306997742, - "grad_norm": 0.3446272909641266, - "learning_rate": 6.230567346477567e-06, - "loss": 0.0032, - "step": 828 - }, - { - "epoch": 0.6237772761474794, - "grad_norm": 0.007315884344279766, - "learning_rate": 6.208682809631983e-06, - "loss": 0.0002, - "step": 829 - }, - { - "epoch": 0.6245297215951844, - "grad_norm": 0.3761526346206665, - "learning_rate": 6.1868194583343585e-06, - "loss": 0.1232, - "step": 830 - }, - { - "epoch": 0.6252821670428894, - "grad_norm": 0.005384144373238087, - "learning_rate": 6.1649774147551755e-06, - "loss": 0.0002, - "step": 831 - }, - { - "epoch": 0.6260346124905944, - "grad_norm": 2.6692087650299072, - "learning_rate": 6.1431568009458596e-06, - "loss": 0.4896, - "step": 832 - }, - { - "epoch": 0.6267870579382995, - "grad_norm": 0.008824239484965801, - "learning_rate": 6.121357738838088e-06, - "loss": 0.0004, - "step": 833 - }, - { - "epoch": 0.6275395033860045, - "grad_norm": 3.028167963027954, - "learning_rate": 6.099580350243109e-06, - "loss": 0.2251, - "step": 834 - }, - { - "epoch": 0.6282919488337095, - "grad_norm": 0.011604844592511654, - "learning_rate": 6.077824756851055e-06, - "loss": 0.0005, - "step": 835 - }, - { - "epoch": 0.6290443942814146, - "grad_norm": 3.686901092529297, - "learning_rate": 6.056091080230279e-06, - "loss": 0.1939, - "step": 836 - }, - { - "epoch": 0.6297968397291196, - "grad_norm": 0.012510290369391441, - "learning_rate": 6.034379441826659e-06, - "loss": 0.0006, - "step": 837 - }, - { - "epoch": 0.6305492851768246, - "grad_norm": 0.014967096969485283, - "learning_rate": 6.012689962962923e-06, - "loss": 0.0006, - "step": 838 - }, - { - "epoch": 0.6313017306245298, - "grad_norm": 0.00533378915861249, - "learning_rate": 5.991022764837979e-06, - "loss": 0.0003, - "step": 839 - }, - { - "epoch": 0.6320541760722348, - "grad_norm": 0.021024620160460472, - "learning_rate": 5.969377968526231e-06, - "loss": 0.0009, - "step": 840 - }, - { - "epoch": 0.6328066215199398, - "grad_norm": 0.15575391054153442, - "learning_rate": 5.947755694976902e-06, - "loss": 0.0038, - "step": 841 - }, - { - "epoch": 0.6335590669676449, - "grad_norm": 0.5761941075325012, - "learning_rate": 5.926156065013359e-06, - "loss": 0.0294, - "step": 842 - }, - { - "epoch": 0.6343115124153499, - "grad_norm": 0.5115591287612915, - "learning_rate": 5.904579199332443e-06, - "loss": 0.081, - "step": 843 - }, - { - "epoch": 0.6350639578630549, - "grad_norm": 0.017021209001541138, - "learning_rate": 5.883025218503781e-06, - "loss": 0.0008, - "step": 844 - }, - { - "epoch": 0.63581640331076, - "grad_norm": 0.0051472182385623455, - "learning_rate": 5.861494242969134e-06, - "loss": 0.0002, - "step": 845 - }, - { - "epoch": 0.636568848758465, - "grad_norm": 0.007996303029358387, - "learning_rate": 5.839986393041701e-06, - "loss": 0.0003, - "step": 846 - }, - { - "epoch": 0.63732129420617, - "grad_norm": 0.0032203912269324064, - "learning_rate": 5.818501788905464e-06, - "loss": 0.0001, - "step": 847 - }, - { - "epoch": 0.6380737396538751, - "grad_norm": 0.01805216819047928, - "learning_rate": 5.7970405506145e-06, - "loss": 0.0008, - "step": 848 - }, - { - "epoch": 0.6388261851015802, - "grad_norm": 0.015916842967271805, - "learning_rate": 5.775602798092335e-06, - "loss": 0.0007, - "step": 849 - }, - { - "epoch": 0.6395786305492852, - "grad_norm": 0.18560563027858734, - "learning_rate": 5.754188651131246e-06, - "loss": 0.0048, - "step": 850 - }, - { - "epoch": 0.6403310759969902, - "grad_norm": 0.02914322167634964, - "learning_rate": 5.732798229391613e-06, - "loss": 0.0012, - "step": 851 - }, - { - "epoch": 0.6410835214446953, - "grad_norm": 0.0026829675771296024, - "learning_rate": 5.711431652401227e-06, - "loss": 0.0001, - "step": 852 - }, - { - "epoch": 0.6418359668924003, - "grad_norm": 0.1754864752292633, - "learning_rate": 5.690089039554654e-06, - "loss": 0.1013, - "step": 853 - }, - { - "epoch": 0.6425884123401053, - "grad_norm": 0.07116664201021194, - "learning_rate": 5.668770510112538e-06, - "loss": 0.0032, - "step": 854 - }, - { - "epoch": 0.6433408577878104, - "grad_norm": 0.8461427092552185, - "learning_rate": 5.6474761832009554e-06, - "loss": 0.0517, - "step": 855 - }, - { - "epoch": 0.6440933032355154, - "grad_norm": 0.009436607360839844, - "learning_rate": 5.626206177810735e-06, - "loss": 0.0004, - "step": 856 - }, - { - "epoch": 0.6448457486832204, - "grad_norm": 2.8411285877227783, - "learning_rate": 5.604960612796805e-06, - "loss": 0.3936, - "step": 857 - }, - { - "epoch": 0.6455981941309256, - "grad_norm": 0.05091340094804764, - "learning_rate": 5.583739606877516e-06, - "loss": 0.0014, - "step": 858 - }, - { - "epoch": 0.6463506395786306, - "grad_norm": 0.00600972305983305, - "learning_rate": 5.562543278633988e-06, - "loss": 0.0002, - "step": 859 - }, - { - "epoch": 0.6471030850263356, - "grad_norm": 0.0064705973491072655, - "learning_rate": 5.541371746509448e-06, - "loss": 0.0003, - "step": 860 - }, - { - "epoch": 0.6478555304740407, - "grad_norm": 5.701414585113525, - "learning_rate": 5.520225128808555e-06, - "loss": 0.0259, - "step": 861 - }, - { - "epoch": 0.6486079759217457, - "grad_norm": 0.021785501390695572, - "learning_rate": 5.4991035436967585e-06, - "loss": 0.0009, - "step": 862 - }, - { - "epoch": 0.6493604213694507, - "grad_norm": 0.017925115302205086, - "learning_rate": 5.478007109199624e-06, - "loss": 0.0008, - "step": 863 - }, - { - "epoch": 0.6501128668171557, - "grad_norm": 0.012886104173958302, - "learning_rate": 5.456935943202177e-06, - "loss": 0.0005, - "step": 864 - }, - { - "epoch": 0.6508653122648608, - "grad_norm": 0.015779219567775726, - "learning_rate": 5.4358901634482404e-06, - "loss": 0.0006, - "step": 865 - }, - { - "epoch": 0.6516177577125658, - "grad_norm": 0.05147948116064072, - "learning_rate": 5.4148698875397905e-06, - "loss": 0.0025, - "step": 866 - }, - { - "epoch": 0.6523702031602708, - "grad_norm": 0.019827265292406082, - "learning_rate": 5.393875232936283e-06, - "loss": 0.0007, - "step": 867 - }, - { - "epoch": 0.653122648607976, - "grad_norm": 0.01970357820391655, - "learning_rate": 5.372906316954005e-06, - "loss": 0.0006, - "step": 868 - }, - { - "epoch": 0.653875094055681, - "grad_norm": 8.924382209777832, - "learning_rate": 5.351963256765426e-06, - "loss": 0.1568, - "step": 869 - }, - { - "epoch": 0.654627539503386, - "grad_norm": 1.7220969200134277, - "learning_rate": 5.33104616939853e-06, - "loss": 0.0397, - "step": 870 - }, - { - "epoch": 0.6553799849510911, - "grad_norm": 0.3469778895378113, - "learning_rate": 5.3101551717361586e-06, - "loss": 0.0112, - "step": 871 - }, - { - "epoch": 0.6561324303987961, - "grad_norm": 0.3850734233856201, - "learning_rate": 5.2892903805153795e-06, - "loss": 0.1255, - "step": 872 - }, - { - "epoch": 0.6568848758465011, - "grad_norm": 2.2157397270202637, - "learning_rate": 5.2684519123268155e-06, - "loss": 0.3644, - "step": 873 - }, - { - "epoch": 0.6576373212942062, - "grad_norm": 0.011840404942631721, - "learning_rate": 5.247639883613999e-06, - "loss": 0.0005, - "step": 874 - }, - { - "epoch": 0.6583897667419112, - "grad_norm": 0.02047601528465748, - "learning_rate": 5.226854410672724e-06, - "loss": 0.0008, - "step": 875 - }, - { - "epoch": 0.6591422121896162, - "grad_norm": 4.430492401123047, - "learning_rate": 5.2060956096503854e-06, - "loss": 0.1308, - "step": 876 - }, - { - "epoch": 0.6598946576373212, - "grad_norm": 0.004179192706942558, - "learning_rate": 5.1853635965453495e-06, - "loss": 0.0002, - "step": 877 - }, - { - "epoch": 0.6606471030850264, - "grad_norm": 0.3641025424003601, - "learning_rate": 5.164658487206275e-06, - "loss": 0.1212, - "step": 878 - }, - { - "epoch": 0.6613995485327314, - "grad_norm": 0.061624638736248016, - "learning_rate": 5.143980397331512e-06, - "loss": 0.0025, - "step": 879 - }, - { - "epoch": 0.6621519939804364, - "grad_norm": 5.032725811004639, - "learning_rate": 5.123329442468403e-06, - "loss": 0.035, - "step": 880 - }, - { - "epoch": 0.6629044394281415, - "grad_norm": 1.7214125394821167, - "learning_rate": 5.102705738012676e-06, - "loss": 0.0527, - "step": 881 - }, - { - "epoch": 0.6636568848758465, - "grad_norm": 0.7142964005470276, - "learning_rate": 5.082109399207784e-06, - "loss": 0.1174, - "step": 882 - }, - { - "epoch": 0.6644093303235515, - "grad_norm": 0.062142811715602875, - "learning_rate": 5.061540541144265e-06, - "loss": 0.0029, - "step": 883 - }, - { - "epoch": 0.6651617757712566, - "grad_norm": 0.2744309902191162, - "learning_rate": 5.0409992787590845e-06, - "loss": 0.0093, - "step": 884 - }, - { - "epoch": 0.6659142212189616, - "grad_norm": 0.059226468205451965, - "learning_rate": 5.02048572683502e-06, - "loss": 0.0028, - "step": 885 - }, - { - "epoch": 0.6666666666666666, - "grad_norm": 0.007528245449066162, - "learning_rate": 5.000000000000003e-06, - "loss": 0.0003, - "step": 886 - }, - { - "epoch": 0.6674191121143717, - "grad_norm": 2.5258829593658447, - "learning_rate": 4.979542212726474e-06, - "loss": 0.2801, - "step": 887 - }, - { - "epoch": 0.6681715575620768, - "grad_norm": 3.617130756378174, - "learning_rate": 4.959112479330753e-06, - "loss": 0.1343, - "step": 888 - }, - { - "epoch": 0.6689240030097818, - "grad_norm": 0.3309485912322998, - "learning_rate": 4.93871091397241e-06, - "loss": 0.0082, - "step": 889 - }, - { - "epoch": 0.6696764484574869, - "grad_norm": 0.031886566430330276, - "learning_rate": 4.9183376306535904e-06, - "loss": 0.0009, - "step": 890 - }, - { - "epoch": 0.6704288939051919, - "grad_norm": 0.04454849287867546, - "learning_rate": 4.897992743218419e-06, - "loss": 0.0019, - "step": 891 - }, - { - "epoch": 0.6711813393528969, - "grad_norm": 0.19051343202590942, - "learning_rate": 4.877676365352343e-06, - "loss": 0.1, - "step": 892 - }, - { - "epoch": 0.671933784800602, - "grad_norm": 2.1760129928588867, - "learning_rate": 4.857388610581499e-06, - "loss": 0.1941, - "step": 893 - }, - { - "epoch": 0.672686230248307, - "grad_norm": 0.020949775353074074, - "learning_rate": 4.837129592272083e-06, - "loss": 0.0008, - "step": 894 - }, - { - "epoch": 0.673438675696012, - "grad_norm": 0.08075495064258575, - "learning_rate": 4.81689942362971e-06, - "loss": 0.0042, - "step": 895 - }, - { - "epoch": 0.674191121143717, - "grad_norm": 0.004490719176828861, - "learning_rate": 4.796698217698791e-06, - "loss": 0.0002, - "step": 896 - }, - { - "epoch": 0.6749435665914221, - "grad_norm": 0.04653012752532959, - "learning_rate": 4.776526087361896e-06, - "loss": 0.0022, - "step": 897 - }, - { - "epoch": 0.6756960120391272, - "grad_norm": 0.009343601763248444, - "learning_rate": 4.756383145339107e-06, - "loss": 0.0003, - "step": 898 - }, - { - "epoch": 0.6764484574868322, - "grad_norm": 0.12858974933624268, - "learning_rate": 4.736269504187431e-06, - "loss": 0.0055, - "step": 899 - }, - { - "epoch": 0.6772009029345373, - "grad_norm": 0.020633621141314507, - "learning_rate": 4.716185276300126e-06, - "loss": 0.0009, - "step": 900 - }, - { - "epoch": 0.6779533483822423, - "grad_norm": 0.023468296974897385, - "learning_rate": 4.696130573906096e-06, - "loss": 0.0009, - "step": 901 - }, - { - "epoch": 0.6787057938299473, - "grad_norm": 1.5509250164031982, - "learning_rate": 4.676105509069263e-06, - "loss": 0.0409, - "step": 902 - }, - { - "epoch": 0.6794582392776524, - "grad_norm": 0.607180118560791, - "learning_rate": 4.656110193687925e-06, - "loss": 0.0688, - "step": 903 - }, - { - "epoch": 0.6802106847253574, - "grad_norm": 0.04802005738019943, - "learning_rate": 4.636144739494156e-06, - "loss": 0.0021, - "step": 904 - }, - { - "epoch": 0.6809631301730624, - "grad_norm": 0.009008445776998997, - "learning_rate": 4.616209258053163e-06, - "loss": 0.0003, - "step": 905 - }, - { - "epoch": 0.6817155756207675, - "grad_norm": 0.02914116531610489, - "learning_rate": 4.5963038607626655e-06, - "loss": 0.0009, - "step": 906 - }, - { - "epoch": 0.6824680210684725, - "grad_norm": 1.7884653806686401, - "learning_rate": 4.57642865885228e-06, - "loss": 0.3394, - "step": 907 - }, - { - "epoch": 0.6832204665161776, - "grad_norm": 0.3303041160106659, - "learning_rate": 4.5565837633828904e-06, - "loss": 0.1329, - "step": 908 - }, - { - "epoch": 0.6839729119638827, - "grad_norm": 0.06692855805158615, - "learning_rate": 4.536769285246033e-06, - "loss": 0.003, - "step": 909 - }, - { - "epoch": 0.6847253574115877, - "grad_norm": 0.04765620082616806, - "learning_rate": 4.516985335163274e-06, - "loss": 0.0022, - "step": 910 - }, - { - "epoch": 0.6854778028592927, - "grad_norm": 0.18733732402324677, - "learning_rate": 4.4972320236855916e-06, - "loss": 0.0051, - "step": 911 - }, - { - "epoch": 0.6862302483069977, - "grad_norm": 0.07214639335870743, - "learning_rate": 4.477509461192756e-06, - "loss": 0.0036, - "step": 912 - }, - { - "epoch": 0.6869826937547028, - "grad_norm": 0.23084872961044312, - "learning_rate": 4.457817757892718e-06, - "loss": 0.1103, - "step": 913 - }, - { - "epoch": 0.6877351392024078, - "grad_norm": 0.3813030421733856, - "learning_rate": 4.438157023820991e-06, - "loss": 0.0089, - "step": 914 - }, - { - "epoch": 0.6884875846501128, - "grad_norm": 0.15035146474838257, - "learning_rate": 4.4185273688400274e-06, - "loss": 0.0044, - "step": 915 - }, - { - "epoch": 0.6892400300978179, - "grad_norm": 0.00839236006140709, - "learning_rate": 4.398928902638626e-06, - "loss": 0.0003, - "step": 916 - }, - { - "epoch": 0.6899924755455229, - "grad_norm": 0.14042149484157562, - "learning_rate": 4.379361734731289e-06, - "loss": 0.0057, - "step": 917 - }, - { - "epoch": 0.690744920993228, - "grad_norm": 0.3208416700363159, - "learning_rate": 4.359825974457632e-06, - "loss": 0.1172, - "step": 918 - }, - { - "epoch": 0.6914973664409331, - "grad_norm": 0.1413826197385788, - "learning_rate": 4.340321730981779e-06, - "loss": 0.0033, - "step": 919 - }, - { - "epoch": 0.6922498118886381, - "grad_norm": 0.09464599937200546, - "learning_rate": 4.32084911329173e-06, - "loss": 0.0045, - "step": 920 - }, - { - "epoch": 0.6930022573363431, - "grad_norm": 0.04123972728848457, - "learning_rate": 4.301408230198763e-06, - "loss": 0.0015, - "step": 921 - }, - { - "epoch": 0.6937547027840482, - "grad_norm": 0.5074460506439209, - "learning_rate": 4.28199919033683e-06, - "loss": 0.0119, - "step": 922 - }, - { - "epoch": 0.6945071482317532, - "grad_norm": 0.052684567868709564, - "learning_rate": 4.2626221021619396e-06, - "loss": 0.0027, - "step": 923 - }, - { - "epoch": 0.6952595936794582, - "grad_norm": 0.016798803582787514, - "learning_rate": 4.243277073951562e-06, - "loss": 0.0008, - "step": 924 - }, - { - "epoch": 0.6960120391271633, - "grad_norm": 0.006241548340767622, - "learning_rate": 4.223964213804019e-06, - "loss": 0.0002, - "step": 925 - }, - { - "epoch": 0.6967644845748683, - "grad_norm": 3.114851474761963, - "learning_rate": 4.204683629637881e-06, - "loss": 0.32, - "step": 926 - }, - { - "epoch": 0.6975169300225733, - "grad_norm": 0.23592227697372437, - "learning_rate": 4.1854354291913594e-06, - "loss": 0.0053, - "step": 927 - }, - { - "epoch": 0.6982693754702785, - "grad_norm": 0.7205729484558105, - "learning_rate": 4.1662197200217116e-06, - "loss": 0.0056, - "step": 928 - }, - { - "epoch": 0.6990218209179835, - "grad_norm": 0.0018878192640841007, - "learning_rate": 4.147036609504633e-06, - "loss": 0.0001, - "step": 929 - }, - { - "epoch": 0.6997742663656885, - "grad_norm": 0.24593710899353027, - "learning_rate": 4.1278862048336645e-06, - "loss": 0.0058, - "step": 930 - }, - { - "epoch": 0.7005267118133935, - "grad_norm": 0.035347383469343185, - "learning_rate": 4.108768613019588e-06, - "loss": 0.0015, - "step": 931 - }, - { - "epoch": 0.7012791572610986, - "grad_norm": 0.15606430172920227, - "learning_rate": 4.089683940889829e-06, - "loss": 0.0069, - "step": 932 - }, - { - "epoch": 0.7020316027088036, - "grad_norm": 1.3364535570144653, - "learning_rate": 4.070632295087863e-06, - "loss": 0.0344, - "step": 933 - }, - { - "epoch": 0.7027840481565086, - "grad_norm": 0.1990508735179901, - "learning_rate": 4.051613782072614e-06, - "loss": 0.0034, - "step": 934 - }, - { - "epoch": 0.7035364936042137, - "grad_norm": 0.004956225864589214, - "learning_rate": 4.0326285081178695e-06, - "loss": 0.0002, - "step": 935 - }, - { - "epoch": 0.7042889390519187, - "grad_norm": 0.09243706613779068, - "learning_rate": 4.013676579311668e-06, - "loss": 0.0032, - "step": 936 - }, - { - "epoch": 0.7050413844996237, - "grad_norm": 0.2956998944282532, - "learning_rate": 3.994758101555729e-06, - "loss": 0.0037, - "step": 937 - }, - { - "epoch": 0.7057938299473289, - "grad_norm": 6.872308254241943, - "learning_rate": 3.975873180564843e-06, - "loss": 0.185, - "step": 938 - }, - { - "epoch": 0.7065462753950339, - "grad_norm": 5.041158676147461, - "learning_rate": 3.957021921866301e-06, - "loss": 0.1589, - "step": 939 - }, - { - "epoch": 0.7072987208427389, - "grad_norm": 0.004548253491520882, - "learning_rate": 3.938204430799278e-06, - "loss": 0.0002, - "step": 940 - }, - { - "epoch": 0.708051166290444, - "grad_norm": 4.161418437957764, - "learning_rate": 3.919420812514267e-06, - "loss": 0.0773, - "step": 941 - }, - { - "epoch": 0.708803611738149, - "grad_norm": 8.64138412475586, - "learning_rate": 3.9006711719724755e-06, - "loss": 0.3975, - "step": 942 - }, - { - "epoch": 0.709556057185854, - "grad_norm": 0.10942798107862473, - "learning_rate": 3.881955613945251e-06, - "loss": 0.0037, - "step": 943 - }, - { - "epoch": 0.710308502633559, - "grad_norm": 0.07507241517305374, - "learning_rate": 3.8632742430134905e-06, - "loss": 0.002, - "step": 944 - }, - { - "epoch": 0.7110609480812641, - "grad_norm": 0.020177626982331276, - "learning_rate": 3.844627163567059e-06, - "loss": 0.0008, - "step": 945 - }, - { - "epoch": 0.7118133935289691, - "grad_norm": 6.14890718460083, - "learning_rate": 3.826014479804198e-06, - "loss": 0.0767, - "step": 946 - }, - { - "epoch": 0.7125658389766741, - "grad_norm": 0.330617755651474, - "learning_rate": 3.807436295730953e-06, - "loss": 0.0074, - "step": 947 - }, - { - "epoch": 0.7133182844243793, - "grad_norm": 0.07356506586074829, - "learning_rate": 3.788892715160588e-06, - "loss": 0.0027, - "step": 948 - }, - { - "epoch": 0.7140707298720843, - "grad_norm": 5.51183557510376, - "learning_rate": 3.7703838417130045e-06, - "loss": 0.1942, - "step": 949 - }, - { - "epoch": 0.7148231753197893, - "grad_norm": 0.07540974020957947, - "learning_rate": 3.7519097788141635e-06, - "loss": 0.0032, - "step": 950 - }, - { - "epoch": 0.7155756207674944, - "grad_norm": 0.1256755292415619, - "learning_rate": 3.7334706296955093e-06, - "loss": 0.0027, - "step": 951 - }, - { - "epoch": 0.7163280662151994, - "grad_norm": 0.0014390931464731693, - "learning_rate": 3.7150664973933893e-06, - "loss": 0.0001, - "step": 952 - }, - { - "epoch": 0.7170805116629044, - "grad_norm": 0.19743457436561584, - "learning_rate": 3.6966974847484805e-06, - "loss": 0.0047, - "step": 953 - }, - { - "epoch": 0.7178329571106095, - "grad_norm": 1.814272165298462, - "learning_rate": 3.6783636944052193e-06, - "loss": 0.2478, - "step": 954 - }, - { - "epoch": 0.7185854025583145, - "grad_norm": 0.35399460792541504, - "learning_rate": 3.66006522881121e-06, - "loss": 0.0039, - "step": 955 - }, - { - "epoch": 0.7193378480060195, - "grad_norm": 0.6553662419319153, - "learning_rate": 3.641802190216678e-06, - "loss": 0.0906, - "step": 956 - }, - { - "epoch": 0.7200902934537246, - "grad_norm": 0.005217390134930611, - "learning_rate": 3.623574680673879e-06, - "loss": 0.0002, - "step": 957 - }, - { - "epoch": 0.7208427389014297, - "grad_norm": 0.006294188555330038, - "learning_rate": 3.605382802036538e-06, - "loss": 0.0003, - "step": 958 - }, - { - "epoch": 0.7215951843491347, - "grad_norm": 4.843064785003662, - "learning_rate": 3.5872266559592817e-06, - "loss": 0.3738, - "step": 959 - }, - { - "epoch": 0.7223476297968398, - "grad_norm": 0.020986851304769516, - "learning_rate": 3.5691063438970618e-06, - "loss": 0.0008, - "step": 960 - }, - { - "epoch": 0.7231000752445448, - "grad_norm": 4.1812334060668945, - "learning_rate": 3.5510219671045875e-06, - "loss": 0.4018, - "step": 961 - }, - { - "epoch": 0.7238525206922498, - "grad_norm": 0.0036870623007416725, - "learning_rate": 3.532973626635773e-06, - "loss": 0.0001, - "step": 962 - }, - { - "epoch": 0.7246049661399548, - "grad_norm": 0.014761660248041153, - "learning_rate": 3.5149614233431616e-06, - "loss": 0.0007, - "step": 963 - }, - { - "epoch": 0.7253574115876599, - "grad_norm": 0.03565015643835068, - "learning_rate": 3.4969854578773667e-06, - "loss": 0.0016, - "step": 964 - }, - { - "epoch": 0.7261098570353649, - "grad_norm": 0.24551992118358612, - "learning_rate": 3.479045830686506e-06, - "loss": 0.006, - "step": 965 - }, - { - "epoch": 0.7268623024830699, - "grad_norm": 0.03166704997420311, - "learning_rate": 3.4611426420156422e-06, - "loss": 0.0013, - "step": 966 - }, - { - "epoch": 0.7276147479307751, - "grad_norm": 0.4878341257572174, - "learning_rate": 3.4432759919062253e-06, - "loss": 0.1401, - "step": 967 - }, - { - "epoch": 0.7283671933784801, - "grad_norm": 8.347463607788086, - "learning_rate": 3.4254459801955276e-06, - "loss": 0.0622, - "step": 968 - }, - { - "epoch": 0.7291196388261851, - "grad_norm": 0.05901602655649185, - "learning_rate": 3.4076527065160914e-06, - "loss": 0.0025, - "step": 969 - }, - { - "epoch": 0.7298720842738902, - "grad_norm": 0.1788146197795868, - "learning_rate": 3.3898962702951687e-06, - "loss": 0.0094, - "step": 970 - }, - { - "epoch": 0.7306245297215952, - "grad_norm": 0.008673246949911118, - "learning_rate": 3.3721767707541696e-06, - "loss": 0.0003, - "step": 971 - }, - { - "epoch": 0.7313769751693002, - "grad_norm": 0.05570625886321068, - "learning_rate": 3.3544943069081025e-06, - "loss": 0.0022, - "step": 972 - }, - { - "epoch": 0.7321294206170053, - "grad_norm": 0.04295314475893974, - "learning_rate": 3.3368489775650282e-06, - "loss": 0.0014, - "step": 973 - }, - { - "epoch": 0.7328818660647103, - "grad_norm": 0.025052128359675407, - "learning_rate": 3.3192408813254918e-06, - "loss": 0.0009, - "step": 974 - }, - { - "epoch": 0.7336343115124153, - "grad_norm": 5.636648178100586, - "learning_rate": 3.3016701165819943e-06, - "loss": 0.1106, - "step": 975 - }, - { - "epoch": 0.7343867569601203, - "grad_norm": 0.43065962195396423, - "learning_rate": 3.28413678151843e-06, - "loss": 0.0105, - "step": 976 - }, - { - "epoch": 0.7351392024078255, - "grad_norm": 0.006689665839076042, - "learning_rate": 3.2666409741095328e-06, - "loss": 0.0002, - "step": 977 - }, - { - "epoch": 0.7358916478555305, - "grad_norm": 0.009542127139866352, - "learning_rate": 3.2491827921203456e-06, - "loss": 0.0004, - "step": 978 - }, - { - "epoch": 0.7366440933032355, - "grad_norm": 2.010205030441284, - "learning_rate": 3.231762333105661e-06, - "loss": 0.0458, - "step": 979 - }, - { - "epoch": 0.7373965387509406, - "grad_norm": 0.015615508891642094, - "learning_rate": 3.2143796944094675e-06, - "loss": 0.0006, - "step": 980 - }, - { - "epoch": 0.7381489841986456, - "grad_norm": 0.6981082558631897, - "learning_rate": 3.197034973164429e-06, - "loss": 0.0136, - "step": 981 - }, - { - "epoch": 0.7389014296463506, - "grad_norm": 0.6512033939361572, - "learning_rate": 3.1797282662913277e-06, - "loss": 0.0279, - "step": 982 - }, - { - "epoch": 0.7396538750940557, - "grad_norm": 1.36346435546875, - "learning_rate": 3.162459670498523e-06, - "loss": 0.1529, - "step": 983 - }, - { - "epoch": 0.7404063205417607, - "grad_norm": 0.0437924787402153, - "learning_rate": 3.1452292822814145e-06, - "loss": 0.0021, - "step": 984 - }, - { - "epoch": 0.7411587659894657, - "grad_norm": 0.21114251017570496, - "learning_rate": 3.1280371979218993e-06, - "loss": 0.0053, - "step": 985 - }, - { - "epoch": 0.7419112114371708, - "grad_norm": 0.0944240614771843, - "learning_rate": 3.1108835134878367e-06, - "loss": 0.0029, - "step": 986 - }, - { - "epoch": 0.7426636568848759, - "grad_norm": 0.044163547456264496, - "learning_rate": 3.0937683248325133e-06, - "loss": 0.0013, - "step": 987 - }, - { - "epoch": 0.7434161023325809, - "grad_norm": 0.056986529380083084, - "learning_rate": 3.0766917275941e-06, - "loss": 0.0022, - "step": 988 - }, - { - "epoch": 0.744168547780286, - "grad_norm": 0.039454780519008636, - "learning_rate": 3.0596538171951252e-06, - "loss": 0.0016, - "step": 989 - }, - { - "epoch": 0.744920993227991, - "grad_norm": 0.017716459929943085, - "learning_rate": 3.0426546888419385e-06, - "loss": 0.0006, - "step": 990 - }, - { - "epoch": 0.745673438675696, - "grad_norm": 2.3982226848602295, - "learning_rate": 3.025694437524177e-06, - "loss": 0.2701, - "step": 991 - }, - { - "epoch": 0.746425884123401, - "grad_norm": 0.17275047302246094, - "learning_rate": 3.008773158014242e-06, - "loss": 0.0043, - "step": 992 - }, - { - "epoch": 0.7471783295711061, - "grad_norm": 0.028289055451750755, - "learning_rate": 2.991890944866752e-06, - "loss": 0.0013, - "step": 993 - }, - { - "epoch": 0.7479307750188111, - "grad_norm": 2.991833448410034, - "learning_rate": 2.9750478924180383e-06, - "loss": 0.0494, - "step": 994 - }, - { - "epoch": 0.7486832204665161, - "grad_norm": 0.15955565869808197, - "learning_rate": 2.9582440947855993e-06, - "loss": 0.0054, - "step": 995 - }, - { - "epoch": 0.7494356659142212, - "grad_norm": 0.23433822393417358, - "learning_rate": 2.941479645867583e-06, - "loss": 0.0084, - "step": 996 - }, - { - "epoch": 0.7501881113619263, - "grad_norm": 0.08780021965503693, - "learning_rate": 2.9247546393422566e-06, - "loss": 0.004, - "step": 997 - }, - { - "epoch": 0.7509405568096313, - "grad_norm": 0.13379798829555511, - "learning_rate": 2.9080691686674977e-06, - "loss": 0.0026, - "step": 998 - }, - { - "epoch": 0.7516930022573364, - "grad_norm": 0.01313999854028225, - "learning_rate": 2.891423327080246e-06, - "loss": 0.0006, - "step": 999 - }, - { - "epoch": 0.7524454477050414, - "grad_norm": 0.03100876323878765, - "learning_rate": 2.874817207596007e-06, - "loss": 0.0012, - "step": 1000 - }, - { - "epoch": 0.7531978931527464, - "grad_norm": 0.018929272890090942, - "learning_rate": 2.8582509030083184e-06, - "loss": 0.0007, - "step": 1001 - }, - { - "epoch": 0.7539503386004515, - "grad_norm": 4.446130752563477, - "learning_rate": 2.841724505888239e-06, - "loss": 0.2995, - "step": 1002 - }, - { - "epoch": 0.7547027840481565, - "grad_norm": 0.27879974246025085, - "learning_rate": 2.8252381085838266e-06, - "loss": 0.0064, - "step": 1003 - }, - { - "epoch": 0.7554552294958615, - "grad_norm": 0.06390407681465149, - "learning_rate": 2.8087918032196214e-06, - "loss": 0.003, - "step": 1004 - }, - { - "epoch": 0.7562076749435666, - "grad_norm": 0.05781351029872894, - "learning_rate": 2.792385681696138e-06, - "loss": 0.0029, - "step": 1005 - }, - { - "epoch": 0.7569601203912716, - "grad_norm": 0.035376448184251785, - "learning_rate": 2.7760198356893466e-06, - "loss": 0.0017, - "step": 1006 - }, - { - "epoch": 0.7577125658389767, - "grad_norm": 0.024740157648921013, - "learning_rate": 2.759694356650149e-06, - "loss": 0.0008, - "step": 1007 - }, - { - "epoch": 0.7584650112866818, - "grad_norm": 0.5133036375045776, - "learning_rate": 2.7434093358039003e-06, - "loss": 0.0121, - "step": 1008 - }, - { - "epoch": 0.7592174567343868, - "grad_norm": 0.017775315791368484, - "learning_rate": 2.727164864149867e-06, - "loss": 0.0007, - "step": 1009 - }, - { - "epoch": 0.7599699021820918, - "grad_norm": 0.004680894315242767, - "learning_rate": 2.7109610324607305e-06, - "loss": 0.0002, - "step": 1010 - }, - { - "epoch": 0.7607223476297968, - "grad_norm": 0.01725710742175579, - "learning_rate": 2.6947979312820825e-06, - "loss": 0.0008, - "step": 1011 - }, - { - "epoch": 0.7614747930775019, - "grad_norm": 0.279935747385025, - "learning_rate": 2.678675650931917e-06, - "loss": 0.0039, - "step": 1012 - }, - { - "epoch": 0.7622272385252069, - "grad_norm": 0.14282621443271637, - "learning_rate": 2.662594281500115e-06, - "loss": 0.0036, - "step": 1013 - }, - { - "epoch": 0.7629796839729119, - "grad_norm": 0.011395514011383057, - "learning_rate": 2.6465539128479646e-06, - "loss": 0.0005, - "step": 1014 - }, - { - "epoch": 0.763732129420617, - "grad_norm": 0.04269295185804367, - "learning_rate": 2.630554634607637e-06, - "loss": 0.0014, - "step": 1015 - }, - { - "epoch": 0.764484574868322, - "grad_norm": 0.012843809090554714, - "learning_rate": 2.614596536181697e-06, - "loss": 0.0006, - "step": 1016 - }, - { - "epoch": 0.7652370203160271, - "grad_norm": 0.018597368150949478, - "learning_rate": 2.5986797067425972e-06, - "loss": 0.0008, - "step": 1017 - }, - { - "epoch": 0.7659894657637322, - "grad_norm": 0.019066110253334045, - "learning_rate": 2.582804235232187e-06, - "loss": 0.0008, - "step": 1018 - }, - { - "epoch": 0.7667419112114372, - "grad_norm": 0.011421947740018368, - "learning_rate": 2.566970210361208e-06, - "loss": 0.0005, - "step": 1019 - }, - { - "epoch": 0.7674943566591422, - "grad_norm": 0.016336999833583832, - "learning_rate": 2.551177720608802e-06, - "loss": 0.0007, - "step": 1020 - }, - { - "epoch": 0.7682468021068473, - "grad_norm": 1.4327224493026733, - "learning_rate": 2.5354268542220163e-06, - "loss": 0.1008, - "step": 1021 - }, - { - "epoch": 0.7689992475545523, - "grad_norm": 0.014406280592083931, - "learning_rate": 2.5197176992153125e-06, - "loss": 0.0004, - "step": 1022 - }, - { - "epoch": 0.7697516930022573, - "grad_norm": 2.0984880924224854, - "learning_rate": 2.5040503433700702e-06, - "loss": 0.4668, - "step": 1023 - }, - { - "epoch": 0.7705041384499624, - "grad_norm": 0.002694485941901803, - "learning_rate": 2.4884248742340987e-06, - "loss": 0.0001, - "step": 1024 - }, - { - "epoch": 0.7712565838976674, - "grad_norm": 0.16212724149227142, - "learning_rate": 2.472841379121154e-06, - "loss": 0.0048, - "step": 1025 - }, - { - "epoch": 0.7720090293453724, - "grad_norm": 2.619164228439331, - "learning_rate": 2.457299945110433e-06, - "loss": 0.2616, - "step": 1026 - }, - { - "epoch": 0.7727614747930776, - "grad_norm": 0.003759812330827117, - "learning_rate": 2.441800659046106e-06, - "loss": 0.0001, - "step": 1027 - }, - { - "epoch": 0.7735139202407826, - "grad_norm": 3.9366838932037354, - "learning_rate": 2.4263436075368307e-06, - "loss": 0.2472, - "step": 1028 - }, - { - "epoch": 0.7742663656884876, - "grad_norm": 2.153656005859375, - "learning_rate": 2.4109288769552518e-06, - "loss": 0.1468, - "step": 1029 - }, - { - "epoch": 0.7750188111361926, - "grad_norm": 0.3993730843067169, - "learning_rate": 2.3955565534375326e-06, - "loss": 0.1389, - "step": 1030 - }, - { - "epoch": 0.7757712565838977, - "grad_norm": 0.0949854701757431, - "learning_rate": 2.3802267228828703e-06, - "loss": 0.0027, - "step": 1031 - }, - { - "epoch": 0.7765237020316027, - "grad_norm": 1.1868627071380615, - "learning_rate": 2.3649394709530093e-06, - "loss": 0.0392, - "step": 1032 - }, - { - "epoch": 0.7772761474793077, - "grad_norm": 0.0136722382158041, - "learning_rate": 2.349694883071775e-06, - "loss": 0.0006, - "step": 1033 - }, - { - "epoch": 0.7780285929270128, - "grad_norm": 2.6067428588867188, - "learning_rate": 2.3344930444245863e-06, - "loss": 0.4598, - "step": 1034 - }, - { - "epoch": 0.7787810383747178, - "grad_norm": 0.005388299468904734, - "learning_rate": 2.3193340399579865e-06, - "loss": 0.0002, - "step": 1035 - }, - { - "epoch": 0.7795334838224228, - "grad_norm": 0.032653093338012695, - "learning_rate": 2.304217954379162e-06, - "loss": 0.0012, - "step": 1036 - }, - { - "epoch": 0.780285929270128, - "grad_norm": 0.11140411347150803, - "learning_rate": 2.2891448721554733e-06, - "loss": 0.0052, - "step": 1037 - }, - { - "epoch": 0.781038374717833, - "grad_norm": 0.6110525727272034, - "learning_rate": 2.274114877513981e-06, - "loss": 0.0083, - "step": 1038 - }, - { - "epoch": 0.781790820165538, - "grad_norm": 0.3631162643432617, - "learning_rate": 2.259128054440979e-06, - "loss": 0.0101, - "step": 1039 - }, - { - "epoch": 0.782543265613243, - "grad_norm": 0.5492317080497742, - "learning_rate": 2.2441844866815188e-06, - "loss": 0.0043, - "step": 1040 - }, - { - "epoch": 0.7832957110609481, - "grad_norm": 0.29190927743911743, - "learning_rate": 2.229284257738946e-06, - "loss": 0.0063, - "step": 1041 - }, - { - "epoch": 0.7840481565086531, - "grad_norm": 0.012482292018830776, - "learning_rate": 2.2144274508744355e-06, - "loss": 0.0005, - "step": 1042 - }, - { - "epoch": 0.7848006019563581, - "grad_norm": 0.011028854176402092, - "learning_rate": 2.199614149106519e-06, - "loss": 0.0004, - "step": 1043 - }, - { - "epoch": 0.7855530474040632, - "grad_norm": 0.11335982382297516, - "learning_rate": 2.1848444352106314e-06, - "loss": 0.0025, - "step": 1044 - }, - { - "epoch": 0.7863054928517682, - "grad_norm": 0.17785978317260742, - "learning_rate": 2.1701183917186317e-06, - "loss": 0.0053, - "step": 1045 - }, - { - "epoch": 0.7870579382994732, - "grad_norm": 0.0240810327231884, - "learning_rate": 2.155436100918363e-06, - "loss": 0.001, - "step": 1046 - }, - { - "epoch": 0.7878103837471784, - "grad_norm": 0.048680830746889114, - "learning_rate": 2.1407976448531776e-06, - "loss": 0.0021, - "step": 1047 - }, - { - "epoch": 0.7885628291948834, - "grad_norm": 0.03647977486252785, - "learning_rate": 2.126203105321487e-06, - "loss": 0.0012, - "step": 1048 - }, - { - "epoch": 0.7893152746425884, - "grad_norm": 0.012947115115821362, - "learning_rate": 2.1116525638762963e-06, - "loss": 0.0004, - "step": 1049 - }, - { - "epoch": 0.7900677200902935, - "grad_norm": 0.015302013605833054, - "learning_rate": 2.0971461018247586e-06, - "loss": 0.0007, - "step": 1050 - }, - { - "epoch": 0.7908201655379985, - "grad_norm": 0.9521199464797974, - "learning_rate": 2.082683800227705e-06, - "loss": 0.2254, - "step": 1051 - }, - { - "epoch": 0.7915726109857035, - "grad_norm": 0.011116956360638142, - "learning_rate": 2.0682657398992124e-06, - "loss": 0.0004, - "step": 1052 - }, - { - "epoch": 0.7923250564334086, - "grad_norm": 0.20475442707538605, - "learning_rate": 2.053892001406136e-06, - "loss": 0.1026, - "step": 1053 - }, - { - "epoch": 0.7930775018811136, - "grad_norm": 0.026699619367718697, - "learning_rate": 2.039562665067667e-06, - "loss": 0.0013, - "step": 1054 - }, - { - "epoch": 0.7938299473288186, - "grad_norm": 0.0033015466760843992, - "learning_rate": 2.0252778109548785e-06, - "loss": 0.0001, - "step": 1055 - }, - { - "epoch": 0.7945823927765236, - "grad_norm": 1.792168140411377, - "learning_rate": 2.0110375188902852e-06, - "loss": 0.1835, - "step": 1056 - }, - { - "epoch": 0.7953348382242288, - "grad_norm": 0.4291799068450928, - "learning_rate": 1.996841868447388e-06, - "loss": 0.1194, - "step": 1057 - }, - { - "epoch": 0.7960872836719338, - "grad_norm": 18.169370651245117, - "learning_rate": 1.9826909389502368e-06, - "loss": 0.0432, - "step": 1058 - }, - { - "epoch": 0.7968397291196389, - "grad_norm": 2.5589189529418945, - "learning_rate": 1.9685848094729853e-06, - "loss": 0.1892, - "step": 1059 - }, - { - "epoch": 0.7975921745673439, - "grad_norm": 2.7977375984191895, - "learning_rate": 1.9545235588394484e-06, - "loss": 0.1353, - "step": 1060 - }, - { - "epoch": 0.7983446200150489, - "grad_norm": 0.035498447716236115, - "learning_rate": 1.940507265622661e-06, - "loss": 0.0014, - "step": 1061 - }, - { - "epoch": 0.7990970654627539, - "grad_norm": 0.04865005239844322, - "learning_rate": 1.9265360081444385e-06, - "loss": 0.0014, - "step": 1062 - }, - { - "epoch": 0.799849510910459, - "grad_norm": 3.3623549938201904, - "learning_rate": 1.9126098644749482e-06, - "loss": 0.2956, - "step": 1063 - }, - { - "epoch": 0.800601956358164, - "grad_norm": 0.05257292836904526, - "learning_rate": 1.8987289124322517e-06, - "loss": 0.0025, - "step": 1064 - }, - { - "epoch": 0.801354401805869, - "grad_norm": 0.003795795841142535, - "learning_rate": 1.8848932295818945e-06, - "loss": 0.0001, - "step": 1065 - }, - { - "epoch": 0.8021068472535741, - "grad_norm": 0.005308138206601143, - "learning_rate": 1.8711028932364604e-06, - "loss": 0.0002, - "step": 1066 - }, - { - "epoch": 0.8028592927012792, - "grad_norm": 0.7294948697090149, - "learning_rate": 1.8573579804551367e-06, - "loss": 0.0244, - "step": 1067 - }, - { - "epoch": 0.8036117381489842, - "grad_norm": 0.005934171844273806, - "learning_rate": 1.8436585680432951e-06, - "loss": 0.0002, - "step": 1068 - }, - { - "epoch": 0.8043641835966893, - "grad_norm": 5.70277738571167, - "learning_rate": 1.8300047325520508e-06, - "loss": 0.1113, - "step": 1069 - }, - { - "epoch": 0.8051166290443943, - "grad_norm": 0.813459038734436, - "learning_rate": 1.8163965502778337e-06, - "loss": 0.0269, - "step": 1070 - }, - { - "epoch": 0.8058690744920993, - "grad_norm": 1.580662488937378, - "learning_rate": 1.802834097261975e-06, - "loss": 0.0958, - "step": 1071 - }, - { - "epoch": 0.8066215199398044, - "grad_norm": 0.013537387363612652, - "learning_rate": 1.7893174492902742e-06, - "loss": 0.0005, - "step": 1072 - }, - { - "epoch": 0.8073739653875094, - "grad_norm": 3.1393344402313232, - "learning_rate": 1.7758466818925735e-06, - "loss": 0.2465, - "step": 1073 - }, - { - "epoch": 0.8081264108352144, - "grad_norm": 0.17277193069458008, - "learning_rate": 1.7624218703423402e-06, - "loss": 0.0045, - "step": 1074 - }, - { - "epoch": 0.8088788562829194, - "grad_norm": 1.4239376783370972, - "learning_rate": 1.7490430896562439e-06, - "loss": 0.0138, - "step": 1075 - }, - { - "epoch": 0.8096313017306245, - "grad_norm": 0.3208409249782562, - "learning_rate": 1.7357104145937365e-06, - "loss": 0.1175, - "step": 1076 - }, - { - "epoch": 0.8103837471783296, - "grad_norm": 0.30378955602645874, - "learning_rate": 1.7224239196566395e-06, - "loss": 0.0059, - "step": 1077 - }, - { - "epoch": 0.8111361926260346, - "grad_norm": 0.004885723814368248, - "learning_rate": 1.7091836790887196e-06, - "loss": 0.0002, - "step": 1078 - }, - { - "epoch": 0.8118886380737397, - "grad_norm": 0.04553482308983803, - "learning_rate": 1.695989766875279e-06, - "loss": 0.0019, - "step": 1079 - }, - { - "epoch": 0.8126410835214447, - "grad_norm": 0.12680576741695404, - "learning_rate": 1.682842256742744e-06, - "loss": 0.0056, - "step": 1080 - }, - { - "epoch": 0.8133935289691497, - "grad_norm": 0.05921807140111923, - "learning_rate": 1.6697412221582477e-06, - "loss": 0.0026, - "step": 1081 - }, - { - "epoch": 0.8141459744168548, - "grad_norm": 0.03163035586476326, - "learning_rate": 1.6566867363292238e-06, - "loss": 0.0014, - "step": 1082 - }, - { - "epoch": 0.8148984198645598, - "grad_norm": 0.027834394946694374, - "learning_rate": 1.6436788722029906e-06, - "loss": 0.001, - "step": 1083 - }, - { - "epoch": 0.8156508653122648, - "grad_norm": 0.3784581422805786, - "learning_rate": 1.6307177024663534e-06, - "loss": 0.1234, - "step": 1084 - }, - { - "epoch": 0.8164033107599699, - "grad_norm": 0.09414243698120117, - "learning_rate": 1.617803299545192e-06, - "loss": 0.0038, - "step": 1085 - }, - { - "epoch": 0.8171557562076749, - "grad_norm": 1.488474726676941, - "learning_rate": 1.6049357356040584e-06, - "loss": 0.1, - "step": 1086 - }, - { - "epoch": 0.81790820165538, - "grad_norm": 0.0046142516657710075, - "learning_rate": 1.5921150825457677e-06, - "loss": 0.0002, - "step": 1087 - }, - { - "epoch": 0.8186606471030851, - "grad_norm": 0.06056587025523186, - "learning_rate": 1.579341412011014e-06, - "loss": 0.0023, - "step": 1088 - }, - { - "epoch": 0.8194130925507901, - "grad_norm": 4.677060127258301, - "learning_rate": 1.5666147953779376e-06, - "loss": 0.1317, - "step": 1089 - }, - { - "epoch": 0.8201655379984951, - "grad_norm": 2.52014422416687, - "learning_rate": 1.553935303761761e-06, - "loss": 0.503, - "step": 1090 - }, - { - "epoch": 0.8209179834462002, - "grad_norm": 0.023583421483635902, - "learning_rate": 1.5413030080143708e-06, - "loss": 0.0009, - "step": 1091 - }, - { - "epoch": 0.8216704288939052, - "grad_norm": 0.2893773317337036, - "learning_rate": 1.5287179787239282e-06, - "loss": 0.0056, - "step": 1092 - }, - { - "epoch": 0.8224228743416102, - "grad_norm": 1.8263130187988281, - "learning_rate": 1.5161802862144715e-06, - "loss": 0.0311, - "step": 1093 - }, - { - "epoch": 0.8231753197893152, - "grad_norm": 0.2679802477359772, - "learning_rate": 1.503690000545528e-06, - "loss": 0.0109, - "step": 1094 - }, - { - "epoch": 0.8239277652370203, - "grad_norm": 0.004781723953783512, - "learning_rate": 1.4912471915117189e-06, - "loss": 0.0002, - "step": 1095 - }, - { - "epoch": 0.8246802106847254, - "grad_norm": 1.211321473121643, - "learning_rate": 1.4788519286423687e-06, - "loss": 0.0369, - "step": 1096 - }, - { - "epoch": 0.8254326561324304, - "grad_norm": 0.03698015958070755, - "learning_rate": 1.46650428120112e-06, - "loss": 0.0013, - "step": 1097 - }, - { - "epoch": 0.8261851015801355, - "grad_norm": 0.23283882439136505, - "learning_rate": 1.4542043181855447e-06, - "loss": 0.1022, - "step": 1098 - }, - { - "epoch": 0.8269375470278405, - "grad_norm": 0.045804113149642944, - "learning_rate": 1.441952108326755e-06, - "loss": 0.0007, - "step": 1099 - }, - { - "epoch": 0.8276899924755455, - "grad_norm": 0.08991260826587677, - "learning_rate": 1.4297477200890275e-06, - "loss": 0.0031, - "step": 1100 - }, - { - "epoch": 0.8284424379232506, - "grad_norm": 0.059289537370204926, - "learning_rate": 1.417591221669412e-06, - "loss": 0.0013, - "step": 1101 - }, - { - "epoch": 0.8291948833709556, - "grad_norm": 0.25663039088249207, - "learning_rate": 1.4054826809973576e-06, - "loss": 0.0031, - "step": 1102 - }, - { - "epoch": 0.8299473288186606, - "grad_norm": 0.047977883368730545, - "learning_rate": 1.393422165734325e-06, - "loss": 0.0019, - "step": 1103 - }, - { - "epoch": 0.8306997742663657, - "grad_norm": 0.17930328845977783, - "learning_rate": 1.3814097432734154e-06, - "loss": 0.0026, - "step": 1104 - }, - { - "epoch": 0.8314522197140707, - "grad_norm": 0.05746970325708389, - "learning_rate": 1.3694454807389935e-06, - "loss": 0.0019, - "step": 1105 - }, - { - "epoch": 0.8322046651617758, - "grad_norm": 0.011797560378909111, - "learning_rate": 1.3575294449863063e-06, - "loss": 0.0005, - "step": 1106 - }, - { - "epoch": 0.8329571106094809, - "grad_norm": 0.00394852552562952, - "learning_rate": 1.3456617026011233e-06, - "loss": 0.0001, - "step": 1107 - }, - { - "epoch": 0.8337095560571859, - "grad_norm": 0.07486367225646973, - "learning_rate": 1.3338423198993422e-06, - "loss": 0.003, - "step": 1108 - }, - { - "epoch": 0.8344620015048909, - "grad_norm": 0.05970097705721855, - "learning_rate": 1.322071362926638e-06, - "loss": 0.0027, - "step": 1109 - }, - { - "epoch": 0.835214446952596, - "grad_norm": 0.002417666371911764, - "learning_rate": 1.3103488974580858e-06, - "loss": 0.0001, - "step": 1110 - }, - { - "epoch": 0.835966892400301, - "grad_norm": 0.029511481523513794, - "learning_rate": 1.2986749889977968e-06, - "loss": 0.0011, - "step": 1111 - }, - { - "epoch": 0.836719337848006, - "grad_norm": 2.6889474391937256, - "learning_rate": 1.2870497027785444e-06, - "loss": 0.0219, - "step": 1112 - }, - { - "epoch": 0.837471783295711, - "grad_norm": 0.015240820124745369, - "learning_rate": 1.2754731037614122e-06, - "loss": 0.0007, - "step": 1113 - }, - { - "epoch": 0.8382242287434161, - "grad_norm": 0.01888788491487503, - "learning_rate": 1.263945256635416e-06, - "loss": 0.0008, - "step": 1114 - }, - { - "epoch": 0.8389766741911211, - "grad_norm": 0.047674112021923065, - "learning_rate": 1.2524662258171605e-06, - "loss": 0.0022, - "step": 1115 - }, - { - "epoch": 0.8397291196388262, - "grad_norm": 0.07721062004566193, - "learning_rate": 1.2410360754504536e-06, - "loss": 0.0032, - "step": 1116 - }, - { - "epoch": 0.8404815650865313, - "grad_norm": 0.002638805890455842, - "learning_rate": 1.2296548694059818e-06, - "loss": 0.0001, - "step": 1117 - }, - { - "epoch": 0.8412340105342363, - "grad_norm": 2.895000457763672, - "learning_rate": 1.2183226712809238e-06, - "loss": 0.2227, - "step": 1118 - }, - { - "epoch": 0.8419864559819413, - "grad_norm": 4.1699957847595215, - "learning_rate": 1.207039544398607e-06, - "loss": 0.1277, - "step": 1119 - }, - { - "epoch": 0.8427389014296464, - "grad_norm": 0.03461211919784546, - "learning_rate": 1.195805551808158e-06, - "loss": 0.0017, - "step": 1120 - }, - { - "epoch": 0.8434913468773514, - "grad_norm": 0.011402531526982784, - "learning_rate": 1.1846207562841416e-06, - "loss": 0.0004, - "step": 1121 - }, - { - "epoch": 0.8442437923250564, - "grad_norm": 4.339677810668945, - "learning_rate": 1.1734852203262115e-06, - "loss": 0.1059, - "step": 1122 - }, - { - "epoch": 0.8449962377727614, - "grad_norm": 2.331862688064575, - "learning_rate": 1.1623990061587665e-06, - "loss": 0.3458, - "step": 1123 - }, - { - "epoch": 0.8457486832204665, - "grad_norm": 0.40041258931159973, - "learning_rate": 1.1513621757306015e-06, - "loss": 0.133, - "step": 1124 - }, - { - "epoch": 0.8465011286681715, - "grad_norm": 0.1884562075138092, - "learning_rate": 1.1403747907145546e-06, - "loss": 0.0999, - "step": 1125 - }, - { - "epoch": 0.8472535741158767, - "grad_norm": 0.0410892590880394, - "learning_rate": 1.1294369125071692e-06, - "loss": 0.0019, - "step": 1126 - }, - { - "epoch": 0.8480060195635817, - "grad_norm": 0.2528564929962158, - "learning_rate": 1.1185486022283553e-06, - "loss": 0.1256, - "step": 1127 - }, - { - "epoch": 0.8487584650112867, - "grad_norm": 0.0033206380903720856, - "learning_rate": 1.1077099207210296e-06, - "loss": 0.0001, - "step": 1128 - }, - { - "epoch": 0.8495109104589917, - "grad_norm": 0.05121954157948494, - "learning_rate": 1.0969209285507954e-06, - "loss": 0.0018, - "step": 1129 - }, - { - "epoch": 0.8502633559066968, - "grad_norm": 0.0063232192769646645, - "learning_rate": 1.0861816860055952e-06, - "loss": 0.0002, - "step": 1130 - }, - { - "epoch": 0.8510158013544018, - "grad_norm": 0.03940580040216446, - "learning_rate": 1.0754922530953737e-06, - "loss": 0.001, - "step": 1131 - }, - { - "epoch": 0.8517682468021068, - "grad_norm": 0.2049761265516281, - "learning_rate": 1.0648526895517464e-06, - "loss": 0.0064, - "step": 1132 - }, - { - "epoch": 0.8525206922498119, - "grad_norm": 0.2495325803756714, - "learning_rate": 1.0542630548276588e-06, - "loss": 0.1243, - "step": 1133 - }, - { - "epoch": 0.8532731376975169, - "grad_norm": 0.04376570135354996, - "learning_rate": 1.043723408097065e-06, - "loss": 0.0021, - "step": 1134 - }, - { - "epoch": 0.8540255831452219, - "grad_norm": 0.8723930716514587, - "learning_rate": 1.0332338082545812e-06, - "loss": 0.0126, - "step": 1135 - }, - { - "epoch": 0.8547780285929271, - "grad_norm": 0.016171878203749657, - "learning_rate": 1.0227943139151719e-06, - "loss": 0.0006, - "step": 1136 - }, - { - "epoch": 0.8555304740406321, - "grad_norm": 0.004397090524435043, - "learning_rate": 1.0124049834138205e-06, - "loss": 0.0001, - "step": 1137 - }, - { - "epoch": 0.8562829194883371, - "grad_norm": 0.10261158645153046, - "learning_rate": 1.0020658748051925e-06, - "loss": 0.0046, - "step": 1138 - }, - { - "epoch": 0.8570353649360422, - "grad_norm": 0.07801058888435364, - "learning_rate": 9.91777045863319e-07, - "loss": 0.0037, - "step": 1139 - }, - { - "epoch": 0.8577878103837472, - "grad_norm": 0.37156495451927185, - "learning_rate": 9.815385540812761e-07, - "loss": 0.1171, - "step": 1140 - }, - { - "epoch": 0.8585402558314522, - "grad_norm": 0.0733116865158081, - "learning_rate": 9.713504566708554e-07, - "loss": 0.0033, - "step": 1141 - }, - { - "epoch": 0.8592927012791572, - "grad_norm": 0.0857694074511528, - "learning_rate": 9.61212810562252e-07, - "loss": 0.0036, - "step": 1142 - }, - { - "epoch": 0.8600451467268623, - "grad_norm": 0.41983649134635925, - "learning_rate": 9.511256724037443e-07, - "loss": 0.0053, - "step": 1143 - }, - { - "epoch": 0.8607975921745673, - "grad_norm": 7.980868816375732, - "learning_rate": 9.410890985613741e-07, - "loss": 0.2031, - "step": 1144 - }, - { - "epoch": 0.8615500376222723, - "grad_norm": 3.7673096656799316, - "learning_rate": 9.311031451186381e-07, - "loss": 0.3022, - "step": 1145 - }, - { - "epoch": 0.8623024830699775, - "grad_norm": 0.028988122940063477, - "learning_rate": 9.21167867876167e-07, - "loss": 0.0011, - "step": 1146 - }, - { - "epoch": 0.8630549285176825, - "grad_norm": 2.064547300338745, - "learning_rate": 9.112833223514183e-07, - "loss": 0.0675, - "step": 1147 - }, - { - "epoch": 0.8638073739653875, - "grad_norm": 1.2910840511322021, - "learning_rate": 9.014495637783671e-07, - "loss": 0.093, - "step": 1148 - }, - { - "epoch": 0.8645598194130926, - "grad_norm": 5.712249279022217, - "learning_rate": 8.916666471071922e-07, - "loss": 0.439, - "step": 1149 - }, - { - "epoch": 0.8653122648607976, - "grad_norm": 0.04252437502145767, - "learning_rate": 8.819346270039752e-07, - "loss": 0.0019, - "step": 1150 - }, - { - "epoch": 0.8660647103085026, - "grad_norm": 0.02926601469516754, - "learning_rate": 8.722535578503899e-07, - "loss": 0.0014, - "step": 1151 - }, - { - "epoch": 0.8668171557562077, - "grad_norm": 0.3518989086151123, - "learning_rate": 8.62623493743402e-07, - "loss": 0.0112, - "step": 1152 - }, - { - "epoch": 0.8675696012039127, - "grad_norm": 3.836555242538452, - "learning_rate": 8.530444884949674e-07, - "loss": 0.0891, - "step": 1153 - }, - { - "epoch": 0.8683220466516177, - "grad_norm": 0.04374834522604942, - "learning_rate": 8.435165956317226e-07, - "loss": 0.0021, - "step": 1154 - }, - { - "epoch": 0.8690744920993227, - "grad_norm": 0.10189584642648697, - "learning_rate": 8.340398683947004e-07, - "loss": 0.0039, - "step": 1155 - }, - { - "epoch": 0.8698269375470279, - "grad_norm": 0.058472152799367905, - "learning_rate": 8.2461435973902e-07, - "loss": 0.0029, - "step": 1156 - }, - { - "epoch": 0.8705793829947329, - "grad_norm": 0.01816386543214321, - "learning_rate": 8.152401223335993e-07, - "loss": 0.0007, - "step": 1157 - }, - { - "epoch": 0.871331828442438, - "grad_norm": 13.75079345703125, - "learning_rate": 8.059172085608535e-07, - "loss": 0.1557, - "step": 1158 - }, - { - "epoch": 0.872084273890143, - "grad_norm": 0.01980912685394287, - "learning_rate": 7.966456705164094e-07, - "loss": 0.0008, - "step": 1159 - }, - { - "epoch": 0.872836719337848, - "grad_norm": 0.016307447105646133, - "learning_rate": 7.874255600088043e-07, - "loss": 0.0006, - "step": 1160 - }, - { - "epoch": 0.873589164785553, - "grad_norm": 0.11552978307008743, - "learning_rate": 7.78256928559209e-07, - "loss": 0.0054, - "step": 1161 - }, - { - "epoch": 0.8743416102332581, - "grad_norm": 0.3584482967853546, - "learning_rate": 7.69139827401132e-07, - "loss": 0.0154, - "step": 1162 - }, - { - "epoch": 0.8750940556809631, - "grad_norm": 0.07163400202989578, - "learning_rate": 7.600743074801353e-07, - "loss": 0.0036, - "step": 1163 - }, - { - "epoch": 0.8758465011286681, - "grad_norm": 0.01637900248169899, - "learning_rate": 7.510604194535487e-07, - "loss": 0.0007, - "step": 1164 - }, - { - "epoch": 0.8765989465763732, - "grad_norm": 0.017222406342625618, - "learning_rate": 7.420982136901888e-07, - "loss": 0.0007, - "step": 1165 - }, - { - "epoch": 0.8773513920240783, - "grad_norm": 0.032630909234285355, - "learning_rate": 7.331877402700737e-07, - "loss": 0.0008, - "step": 1166 - }, - { - "epoch": 0.8781038374717833, - "grad_norm": 0.1266675889492035, - "learning_rate": 7.243290489841493e-07, - "loss": 0.0061, - "step": 1167 - }, - { - "epoch": 0.8788562829194884, - "grad_norm": 0.0033598667941987514, - "learning_rate": 7.155221893340036e-07, - "loss": 0.0001, - "step": 1168 - }, - { - "epoch": 0.8796087283671934, - "grad_norm": 0.004325724206864834, - "learning_rate": 7.067672105315981e-07, - "loss": 0.0001, - "step": 1169 - }, - { - "epoch": 0.8803611738148984, - "grad_norm": 0.009192912839353085, - "learning_rate": 6.980641614989847e-07, - "loss": 0.0004, - "step": 1170 - }, - { - "epoch": 0.8811136192626035, - "grad_norm": 0.037033502012491226, - "learning_rate": 6.894130908680396e-07, - "loss": 0.0016, - "step": 1171 - }, - { - "epoch": 0.8818660647103085, - "grad_norm": 9.960691452026367, - "learning_rate": 6.808140469801872e-07, - "loss": 0.137, - "step": 1172 - }, - { - "epoch": 0.8826185101580135, - "grad_norm": 1.7104170322418213, - "learning_rate": 6.722670778861284e-07, - "loss": 0.0401, - "step": 1173 - }, - { - "epoch": 0.8833709556057185, - "grad_norm": 0.0026178702246397734, - "learning_rate": 6.637722313455774e-07, - "loss": 0.0001, - "step": 1174 - }, - { - "epoch": 0.8841234010534236, - "grad_norm": 0.03204850107431412, - "learning_rate": 6.553295548269922e-07, - "loss": 0.0011, - "step": 1175 - }, - { - "epoch": 0.8848758465011287, - "grad_norm": 0.03557371720671654, - "learning_rate": 6.469390955073073e-07, - "loss": 0.0012, - "step": 1176 - }, - { - "epoch": 0.8856282919488337, - "grad_norm": 0.3740195035934448, - "learning_rate": 6.386009002716776e-07, - "loss": 0.0097, - "step": 1177 - }, - { - "epoch": 0.8863807373965388, - "grad_norm": 0.06629537791013718, - "learning_rate": 6.303150157132044e-07, - "loss": 0.0027, - "step": 1178 - }, - { - "epoch": 0.8871331828442438, - "grad_norm": 0.0544668547809124, - "learning_rate": 6.22081488132682e-07, - "loss": 0.0024, - "step": 1179 - }, - { - "epoch": 0.8878856282919488, - "grad_norm": 3.0258607864379883, - "learning_rate": 6.139003635383433e-07, - "loss": 0.0818, - "step": 1180 - }, - { - "epoch": 0.8886380737396539, - "grad_norm": 3.636326313018799, - "learning_rate": 6.057716876455932e-07, - "loss": 0.0272, - "step": 1181 - }, - { - "epoch": 0.8893905191873589, - "grad_norm": 2.166203260421753, - "learning_rate": 5.976955058767609e-07, - "loss": 0.2443, - "step": 1182 - }, - { - "epoch": 0.8901429646350639, - "grad_norm": 0.9411613345146179, - "learning_rate": 5.896718633608412e-07, - "loss": 0.0109, - "step": 1183 - }, - { - "epoch": 0.890895410082769, - "grad_norm": 0.05662931129336357, - "learning_rate": 5.81700804933244e-07, - "loss": 0.0028, - "step": 1184 - }, - { - "epoch": 0.891647855530474, - "grad_norm": 0.005325346253812313, - "learning_rate": 5.737823751355465e-07, - "loss": 0.0002, - "step": 1185 - }, - { - "epoch": 0.8924003009781791, - "grad_norm": 0.009086296893656254, - "learning_rate": 5.659166182152387e-07, - "loss": 0.0003, - "step": 1186 - }, - { - "epoch": 0.8931527464258842, - "grad_norm": 0.05443952605128288, - "learning_rate": 5.581035781254807e-07, - "loss": 0.0023, - "step": 1187 - }, - { - "epoch": 0.8939051918735892, - "grad_norm": 0.09654070436954498, - "learning_rate": 5.503432985248558e-07, - "loss": 0.0041, - "step": 1188 - }, - { - "epoch": 0.8946576373212942, - "grad_norm": 0.022555604577064514, - "learning_rate": 5.426358227771245e-07, - "loss": 0.0008, - "step": 1189 - }, - { - "epoch": 0.8954100827689992, - "grad_norm": 0.022699879482388496, - "learning_rate": 5.349811939509874e-07, - "loss": 0.0009, - "step": 1190 - }, - { - "epoch": 0.8961625282167043, - "grad_norm": 0.013877753168344498, - "learning_rate": 5.273794548198374e-07, - "loss": 0.0005, - "step": 1191 - }, - { - "epoch": 0.8969149736644093, - "grad_norm": 0.23715998232364655, - "learning_rate": 5.198306478615278e-07, - "loss": 0.0989, - "step": 1192 - }, - { - "epoch": 0.8976674191121143, - "grad_norm": 0.010840164497494698, - "learning_rate": 5.123348152581264e-07, - "loss": 0.0005, - "step": 1193 - }, - { - "epoch": 0.8984198645598194, - "grad_norm": 0.026456279680132866, - "learning_rate": 5.048919988956913e-07, - "loss": 0.0008, - "step": 1194 - }, - { - "epoch": 0.8991723100075244, - "grad_norm": 0.0075182537548244, - "learning_rate": 4.975022403640273e-07, - "loss": 0.0002, - "step": 1195 - }, - { - "epoch": 0.8999247554552295, - "grad_norm": 0.028369707986712456, - "learning_rate": 4.901655809564543e-07, - "loss": 0.0011, - "step": 1196 - }, - { - "epoch": 0.9006772009029346, - "grad_norm": 0.029247762635350227, - "learning_rate": 4.828820616695873e-07, - "loss": 0.0012, - "step": 1197 - }, - { - "epoch": 0.9014296463506396, - "grad_norm": 0.011888640001416206, - "learning_rate": 4.7565172320308886e-07, - "loss": 0.0004, - "step": 1198 - }, - { - "epoch": 0.9021820917983446, - "grad_norm": 0.20360657572746277, - "learning_rate": 4.684746059594558e-07, - "loss": 0.0057, - "step": 1199 - }, - { - "epoch": 0.9029345372460497, - "grad_norm": 0.020525943487882614, - "learning_rate": 4.6135075004379193e-07, - "loss": 0.0005, - "step": 1200 - }, - { - "epoch": 0.9036869826937547, - "grad_norm": 5.512493133544922, - "learning_rate": 4.542801952635789e-07, - "loss": 0.0409, - "step": 1201 - }, - { - "epoch": 0.9044394281414597, - "grad_norm": 0.041713930666446686, - "learning_rate": 4.472629811284568e-07, - "loss": 0.0014, - "step": 1202 - }, - { - "epoch": 0.9051918735891648, - "grad_norm": 0.023644356057047844, - "learning_rate": 4.4029914685000176e-07, - "loss": 0.0011, - "step": 1203 - }, - { - "epoch": 0.9059443190368698, - "grad_norm": 0.009974486194550991, - "learning_rate": 4.333887313415097e-07, - "loss": 0.0004, - "step": 1204 - }, - { - "epoch": 0.9066967644845748, - "grad_norm": 10.814530372619629, - "learning_rate": 4.265317732177787e-07, - "loss": 0.0471, - "step": 1205 - }, - { - "epoch": 0.90744920993228, - "grad_norm": 0.04487287253141403, - "learning_rate": 4.1972831079488354e-07, - "loss": 0.0012, - "step": 1206 - }, - { - "epoch": 0.908201655379985, - "grad_norm": 0.304022878408432, - "learning_rate": 4.129783820899802e-07, - "loss": 0.1235, - "step": 1207 - }, - { - "epoch": 0.90895410082769, - "grad_norm": 0.08795291930437088, - "learning_rate": 4.0628202482107747e-07, - "loss": 0.004, - "step": 1208 - }, - { - "epoch": 0.909706546275395, - "grad_norm": 0.07199563086032867, - "learning_rate": 3.9963927640683243e-07, - "loss": 0.0035, - "step": 1209 - }, - { - "epoch": 0.9104589917231001, - "grad_norm": 0.021918121725320816, - "learning_rate": 3.930501739663406e-07, - "loss": 0.0009, - "step": 1210 - }, - { - "epoch": 0.9112114371708051, - "grad_norm": 0.10286161303520203, - "learning_rate": 3.865147543189296e-07, - "loss": 0.0036, - "step": 1211 - }, - { - "epoch": 0.9119638826185101, - "grad_norm": 0.25747624039649963, - "learning_rate": 3.8003305398394916e-07, - "loss": 0.1216, - "step": 1212 - }, - { - "epoch": 0.9127163280662152, - "grad_norm": 1.8191041946411133, - "learning_rate": 3.7360510918057256e-07, - "loss": 0.021, - "step": 1213 - }, - { - "epoch": 0.9134687735139202, - "grad_norm": 0.15397605299949646, - "learning_rate": 3.672309558275922e-07, - "loss": 0.0057, - "step": 1214 - }, - { - "epoch": 0.9142212189616253, - "grad_norm": 0.03293481469154358, - "learning_rate": 3.6091062954321634e-07, - "loss": 0.0016, - "step": 1215 - }, - { - "epoch": 0.9149736644093304, - "grad_norm": 0.002885388908907771, - "learning_rate": 3.5464416564487734e-07, - "loss": 0.0001, - "step": 1216 - }, - { - "epoch": 0.9157261098570354, - "grad_norm": 0.02011510170996189, - "learning_rate": 3.484315991490261e-07, - "loss": 0.0006, - "step": 1217 - }, - { - "epoch": 0.9164785553047404, - "grad_norm": 0.0348239503800869, - "learning_rate": 3.422729647709355e-07, - "loss": 0.0015, - "step": 1218 - }, - { - "epoch": 0.9172310007524455, - "grad_norm": 0.001900368370115757, - "learning_rate": 3.361682969245161e-07, - "loss": 0.0001, - "step": 1219 - }, - { - "epoch": 0.9179834462001505, - "grad_norm": 0.013635323382914066, - "learning_rate": 3.3011762972211647e-07, - "loss": 0.0006, - "step": 1220 - }, - { - "epoch": 0.9187358916478555, - "grad_norm": 0.5103453397750854, - "learning_rate": 3.241209969743353e-07, - "loss": 0.0088, - "step": 1221 - }, - { - "epoch": 0.9194883370955605, - "grad_norm": 0.03513436019420624, - "learning_rate": 3.181784321898285e-07, - "loss": 0.001, - "step": 1222 - }, - { - "epoch": 0.9202407825432656, - "grad_norm": 0.019935129210352898, - "learning_rate": 3.1228996857512795e-07, - "loss": 0.0007, - "step": 1223 - }, - { - "epoch": 0.9209932279909706, - "grad_norm": 0.0077493940480053425, - "learning_rate": 3.064556390344542e-07, - "loss": 0.0003, - "step": 1224 - }, - { - "epoch": 0.9217456734386757, - "grad_norm": 1.4610238075256348, - "learning_rate": 3.0067547616952297e-07, - "loss": 0.0216, - "step": 1225 - }, - { - "epoch": 0.9224981188863808, - "grad_norm": 0.013395492918789387, - "learning_rate": 2.949495122793833e-07, - "loss": 0.0005, - "step": 1226 - }, - { - "epoch": 0.9232505643340858, - "grad_norm": 0.022489206865429878, - "learning_rate": 2.892777793602175e-07, - "loss": 0.0009, - "step": 1227 - }, - { - "epoch": 0.9240030097817908, - "grad_norm": 0.04702683165669441, - "learning_rate": 2.836603091051704e-07, - "loss": 0.0019, - "step": 1228 - }, - { - "epoch": 0.9247554552294959, - "grad_norm": 0.34132277965545654, - "learning_rate": 2.7809713290417486e-07, - "loss": 0.012, - "step": 1229 - }, - { - "epoch": 0.9255079006772009, - "grad_norm": 1.5338892936706543, - "learning_rate": 2.7258828184377086e-07, - "loss": 0.1041, - "step": 1230 - }, - { - "epoch": 0.9262603461249059, - "grad_norm": 0.00669435178861022, - "learning_rate": 2.6713378670693455e-07, - "loss": 0.0002, - "step": 1231 - }, - { - "epoch": 0.927012791572611, - "grad_norm": 9.580220222473145, - "learning_rate": 2.617336779729063e-07, - "loss": 0.1428, - "step": 1232 - }, - { - "epoch": 0.927765237020316, - "grad_norm": 3.141953945159912, - "learning_rate": 2.563879858170215e-07, - "loss": 0.0465, - "step": 1233 - }, - { - "epoch": 0.928517682468021, - "grad_norm": 0.04021593928337097, - "learning_rate": 2.5109674011053684e-07, - "loss": 0.0016, - "step": 1234 - }, - { - "epoch": 0.9292701279157262, - "grad_norm": 0.12653590738773346, - "learning_rate": 2.458599704204712e-07, - "loss": 0.0028, - "step": 1235 - }, - { - "epoch": 0.9300225733634312, - "grad_norm": 1.136615514755249, - "learning_rate": 2.406777060094345e-07, - "loss": 0.016, - "step": 1236 - }, - { - "epoch": 0.9307750188111362, - "grad_norm": 0.0015482566086575389, - "learning_rate": 2.3554997583546402e-07, - "loss": 0.0001, - "step": 1237 - }, - { - "epoch": 0.9315274642588413, - "grad_norm": 0.025700274854898453, - "learning_rate": 2.3047680855186716e-07, - "loss": 0.0012, - "step": 1238 - }, - { - "epoch": 0.9322799097065463, - "grad_norm": 0.03950847312808037, - "learning_rate": 2.2545823250705867e-07, - "loss": 0.0008, - "step": 1239 - }, - { - "epoch": 0.9330323551542513, - "grad_norm": 0.01254583802074194, - "learning_rate": 2.2049427574439953e-07, - "loss": 0.0005, - "step": 1240 - }, - { - "epoch": 0.9337848006019563, - "grad_norm": 0.1133909747004509, - "learning_rate": 2.1558496600204703e-07, - "loss": 0.0028, - "step": 1241 - }, - { - "epoch": 0.9345372460496614, - "grad_norm": 0.04309477657079697, - "learning_rate": 2.1073033071279057e-07, - "loss": 0.0021, - "step": 1242 - }, - { - "epoch": 0.9352896914973664, - "grad_norm": 2.5412564277648926, - "learning_rate": 2.059303970039106e-07, - "loss": 0.3464, - "step": 1243 - }, - { - "epoch": 0.9360421369450714, - "grad_norm": 0.013797776773571968, - "learning_rate": 2.011851916970109e-07, - "loss": 0.0006, - "step": 1244 - }, - { - "epoch": 0.9367945823927766, - "grad_norm": 0.0062391371466219425, - "learning_rate": 1.9649474130788438e-07, - "loss": 0.0002, - "step": 1245 - }, - { - "epoch": 0.9375470278404816, - "grad_norm": 0.4302297830581665, - "learning_rate": 1.9185907204635755e-07, - "loss": 0.1256, - "step": 1246 - }, - { - "epoch": 0.9382994732881866, - "grad_norm": 0.002676435513421893, - "learning_rate": 1.8727820981614407e-07, - "loss": 0.0001, - "step": 1247 - }, - { - "epoch": 0.9390519187358917, - "grad_norm": 5.231035232543945, - "learning_rate": 1.827521802146981e-07, - "loss": 0.4477, - "step": 1248 - }, - { - "epoch": 0.9398043641835967, - "grad_norm": 1.7769052982330322, - "learning_rate": 1.7828100853307884e-07, - "loss": 0.2043, - "step": 1249 - }, - { - "epoch": 0.9405568096313017, - "grad_norm": 0.005753179080784321, - "learning_rate": 1.7386471975579854e-07, - "loss": 0.0002, - "step": 1250 - }, - { - "epoch": 0.9413092550790068, - "grad_norm": 0.04713589698076248, - "learning_rate": 1.6950333856069369e-07, - "loss": 0.0013, - "step": 1251 - }, - { - "epoch": 0.9420617005267118, - "grad_norm": 0.042459528893232346, - "learning_rate": 1.651968893187783e-07, - "loss": 0.002, - "step": 1252 - }, - { - "epoch": 0.9428141459744168, - "grad_norm": 0.1957351565361023, - "learning_rate": 1.609453960941143e-07, - "loss": 0.0071, - "step": 1253 - }, - { - "epoch": 0.9435665914221218, - "grad_norm": 0.028360065072774887, - "learning_rate": 1.567488826436725e-07, - "loss": 0.0013, - "step": 1254 - }, - { - "epoch": 0.944319036869827, - "grad_norm": 1.9565247297286987, - "learning_rate": 1.526073724172028e-07, - "loss": 0.0938, - "step": 1255 - }, - { - "epoch": 0.945071482317532, - "grad_norm": 0.023253565654158592, - "learning_rate": 1.485208885570999e-07, - "loss": 0.0009, - "step": 1256 - }, - { - "epoch": 0.945823927765237, - "grad_norm": 0.005460272543132305, - "learning_rate": 1.4448945389827772e-07, - "loss": 0.0002, - "step": 1257 - }, - { - "epoch": 0.9465763732129421, - "grad_norm": 0.07867200672626495, - "learning_rate": 1.4051309096803967e-07, - "loss": 0.0031, - "step": 1258 - }, - { - "epoch": 0.9473288186606471, - "grad_norm": 0.011308335699141026, - "learning_rate": 1.36591821985953e-07, - "loss": 0.0004, - "step": 1259 - }, - { - "epoch": 0.9480812641083521, - "grad_norm": 0.7972699999809265, - "learning_rate": 1.3272566886372572e-07, - "loss": 0.0923, - "step": 1260 - }, - { - "epoch": 0.9488337095560572, - "grad_norm": 0.01437709853053093, - "learning_rate": 1.2891465320508113e-07, - "loss": 0.0006, - "step": 1261 - }, - { - "epoch": 0.9495861550037622, - "grad_norm": 0.021911505609750748, - "learning_rate": 1.2515879630564108e-07, - "loss": 0.0009, - "step": 1262 - }, - { - "epoch": 0.9503386004514672, - "grad_norm": 0.018048502504825592, - "learning_rate": 1.2145811915280414e-07, - "loss": 0.0009, - "step": 1263 - }, - { - "epoch": 0.9510910458991723, - "grad_norm": 0.01497666072100401, - "learning_rate": 1.1781264242562984e-07, - "loss": 0.0006, - "step": 1264 - }, - { - "epoch": 0.9518434913468774, - "grad_norm": 0.01695135422050953, - "learning_rate": 1.1422238649472228e-07, - "loss": 0.0005, - "step": 1265 - }, - { - "epoch": 0.9525959367945824, - "grad_norm": 0.05780097469687462, - "learning_rate": 1.1068737142211683e-07, - "loss": 0.0023, - "step": 1266 - }, - { - "epoch": 0.9533483822422875, - "grad_norm": 0.4244280457496643, - "learning_rate": 1.072076169611691e-07, - "loss": 0.0127, - "step": 1267 - }, - { - "epoch": 0.9541008276899925, - "grad_norm": 0.28837308287620544, - "learning_rate": 1.0378314255643951e-07, - "loss": 0.0074, - "step": 1268 - }, - { - "epoch": 0.9548532731376975, - "grad_norm": 0.05638807266950607, - "learning_rate": 1.004139673435922e-07, - "loss": 0.0019, - "step": 1269 - }, - { - "epoch": 0.9556057185854026, - "grad_norm": 0.025965796783566475, - "learning_rate": 9.7100110149283e-08, - "loss": 0.0011, - "step": 1270 - }, - { - "epoch": 0.9563581640331076, - "grad_norm": 0.00950747448951006, - "learning_rate": 9.384158949105382e-08, - "loss": 0.0004, - "step": 1271 - }, - { - "epoch": 0.9571106094808126, - "grad_norm": 0.4918772280216217, - "learning_rate": 9.063842357723284e-08, - "loss": 0.0149, - "step": 1272 - }, - { - "epoch": 0.9578630549285176, - "grad_norm": 4.298508167266846, - "learning_rate": 8.749063030683125e-08, - "loss": 0.0998, - "step": 1273 - }, - { - "epoch": 0.9586155003762227, - "grad_norm": 4.235317230224609, - "learning_rate": 8.439822726943991e-08, - "loss": 0.2552, - "step": 1274 - }, - { - "epoch": 0.9593679458239278, - "grad_norm": 0.04486255720257759, - "learning_rate": 8.136123174513843e-08, - "loss": 0.0019, - "step": 1275 - }, - { - "epoch": 0.9601203912716328, - "grad_norm": 0.004140100441873074, - "learning_rate": 7.837966070438851e-08, - "loss": 0.0001, - "step": 1276 - }, - { - "epoch": 0.9608728367193379, - "grad_norm": 0.01324907224625349, - "learning_rate": 7.54535308079507e-08, - "loss": 0.0006, - "step": 1277 - }, - { - "epoch": 0.9616252821670429, - "grad_norm": 0.04146108031272888, - "learning_rate": 7.258285840677893e-08, - "loss": 0.0018, - "step": 1278 - }, - { - "epoch": 0.9623777276147479, - "grad_norm": 0.008022352121770382, - "learning_rate": 6.976765954194165e-08, - "loss": 0.0003, - "step": 1279 - }, - { - "epoch": 0.963130173062453, - "grad_norm": 0.03538546711206436, - "learning_rate": 6.700794994452198e-08, - "loss": 0.0018, - "step": 1280 - }, - { - "epoch": 0.963882618510158, - "grad_norm": 2.600939989089966, - "learning_rate": 6.430374503553439e-08, - "loss": 0.5258, - "step": 1281 - }, - { - "epoch": 0.964635063957863, - "grad_norm": 0.13569378852844238, - "learning_rate": 6.165505992584142e-08, - "loss": 0.0053, - "step": 1282 - }, - { - "epoch": 0.9653875094055681, - "grad_norm": 0.004670869559049606, - "learning_rate": 5.9061909416059385e-08, - "loss": 0.0002, - "step": 1283 - }, - { - "epoch": 0.9661399548532731, - "grad_norm": 0.014692548662424088, - "learning_rate": 5.652430799648945e-08, - "loss": 0.0005, - "step": 1284 - }, - { - "epoch": 0.9668924003009782, - "grad_norm": 0.8171061873435974, - "learning_rate": 5.404226984702221e-08, - "loss": 0.1252, - "step": 1285 - }, - { - "epoch": 0.9676448457486833, - "grad_norm": 0.018849292770028114, - "learning_rate": 5.161580883707218e-08, - "loss": 0.0007, - "step": 1286 - }, - { - "epoch": 0.9683972911963883, - "grad_norm": 3.7725510597229004, - "learning_rate": 4.924493852549006e-08, - "loss": 0.2135, - "step": 1287 - }, - { - "epoch": 0.9691497366440933, - "grad_norm": 0.005706661846488714, - "learning_rate": 4.69296721604906e-08, - "loss": 0.0002, - "step": 1288 - }, - { - "epoch": 0.9699021820917983, - "grad_norm": 1.751198649406433, - "learning_rate": 4.4670022679579314e-08, - "loss": 0.0254, - "step": 1289 - }, - { - "epoch": 0.9706546275395034, - "grad_norm": 0.015075215138494968, - "learning_rate": 4.24660027094792e-08, - "loss": 0.0006, - "step": 1290 - }, - { - "epoch": 0.9714070729872084, - "grad_norm": 0.12630032002925873, - "learning_rate": 4.0317624566060806e-08, - "loss": 0.0049, - "step": 1291 - }, - { - "epoch": 0.9721595184349134, - "grad_norm": 0.01940903812646866, - "learning_rate": 3.822490025427339e-08, - "loss": 0.0005, - "step": 1292 - }, - { - "epoch": 0.9729119638826185, - "grad_norm": 0.025583157315850258, - "learning_rate": 3.618784146807497e-08, - "loss": 0.0011, - "step": 1293 - }, - { - "epoch": 0.9736644093303235, - "grad_norm": 3.0885636806488037, - "learning_rate": 3.42064595903735e-08, - "loss": 0.3399, - "step": 1294 - }, - { - "epoch": 0.9744168547780286, - "grad_norm": 0.04877516254782677, - "learning_rate": 3.2280765692956904e-08, - "loss": 0.0022, - "step": 1295 - }, - { - "epoch": 0.9751693002257337, - "grad_norm": 2.90549635887146, - "learning_rate": 3.0410770536432047e-08, - "loss": 0.2458, - "step": 1296 - }, - { - "epoch": 0.9759217456734387, - "grad_norm": 0.039560992270708084, - "learning_rate": 2.859648457016917e-08, - "loss": 0.0017, - "step": 1297 - }, - { - "epoch": 0.9766741911211437, - "grad_norm": 0.10429071635007858, - "learning_rate": 2.6837917932238667e-08, - "loss": 0.0048, - "step": 1298 - }, - { - "epoch": 0.9774266365688488, - "grad_norm": 0.24316257238388062, - "learning_rate": 2.513508044935775e-08, - "loss": 0.0068, - "step": 1299 - }, - { - "epoch": 0.9781790820165538, - "grad_norm": 0.020967544987797737, - "learning_rate": 2.3487981636831635e-08, - "loss": 0.0006, - "step": 1300 - }, - { - "epoch": 0.9789315274642588, - "grad_norm": 0.05963461101055145, - "learning_rate": 2.189663069850578e-08, - "loss": 0.0026, - "step": 1301 - }, - { - "epoch": 0.9796839729119639, - "grad_norm": 0.022699451074004173, - "learning_rate": 2.0361036526707067e-08, - "loss": 0.001, - "step": 1302 - }, - { - "epoch": 0.9804364183596689, - "grad_norm": 2.274538278579712, - "learning_rate": 1.8881207702202696e-08, - "loss": 0.1143, - "step": 1303 - }, - { - "epoch": 0.9811888638073739, - "grad_norm": 0.12566229701042175, - "learning_rate": 1.7457152494145814e-08, - "loss": 0.0061, - "step": 1304 - }, - { - "epoch": 0.981941309255079, - "grad_norm": 0.03288928419351578, - "learning_rate": 1.6088878860032187e-08, - "loss": 0.0016, - "step": 1305 - }, - { - "epoch": 0.9826937547027841, - "grad_norm": 0.0259998869150877, - "learning_rate": 1.4776394445655818e-08, - "loss": 0.0009, - "step": 1306 - }, - { - "epoch": 0.9834462001504891, - "grad_norm": 1.0154011249542236, - "learning_rate": 1.3519706585063408e-08, - "loss": 0.0419, - "step": 1307 - }, - { - "epoch": 0.9841986455981941, - "grad_norm": 4.762252330780029, - "learning_rate": 1.231882230051662e-08, - "loss": 0.0761, - "step": 1308 - }, - { - "epoch": 0.9849510910458992, - "grad_norm": 5.905474662780762, - "learning_rate": 1.1173748302450993e-08, - "loss": 0.2441, - "step": 1309 - }, - { - "epoch": 0.9857035364936042, - "grad_norm": 4.897678852081299, - "learning_rate": 1.0084490989441531e-08, - "loss": 0.228, - "step": 1310 - }, - { - "epoch": 0.9864559819413092, - "grad_norm": 0.00369740417227149, - "learning_rate": 9.051056448160511e-09, - "loss": 0.0001, - "step": 1311 - }, - { - "epoch": 0.9872084273890143, - "grad_norm": 0.0034076508600264788, - "learning_rate": 8.07345045334973e-09, - "loss": 0.0001, - "step": 1312 - }, - { - "epoch": 0.9879608728367193, - "grad_norm": 0.007578455843031406, - "learning_rate": 7.151678467787193e-09, - "loss": 0.0003, - "step": 1313 - }, - { - "epoch": 0.9887133182844243, - "grad_norm": 0.1452741026878357, - "learning_rate": 6.285745642253816e-09, - "loss": 0.0057, - "step": 1314 - }, - { - "epoch": 0.9894657637321295, - "grad_norm": 0.007722517475485802, - "learning_rate": 5.475656815504549e-09, - "loss": 0.0003, - "step": 1315 - }, - { - "epoch": 0.9902182091798345, - "grad_norm": 0.04480676352977753, - "learning_rate": 4.721416514245069e-09, - "loss": 0.002, - "step": 1316 - }, - { - "epoch": 0.9909706546275395, - "grad_norm": 0.6687166094779968, - "learning_rate": 4.023028953106245e-09, - "loss": 0.1434, - "step": 1317 - }, - { - "epoch": 0.9917231000752446, - "grad_norm": 0.014944463968276978, - "learning_rate": 3.3804980346141547e-09, - "loss": 0.0006, - "step": 1318 - }, - { - "epoch": 0.9924755455229496, - "grad_norm": 0.01458480954170227, - "learning_rate": 2.7938273491756596e-09, - "loss": 0.0006, - "step": 1319 - }, - { - "epoch": 0.9932279909706546, - "grad_norm": 0.01680012419819832, - "learning_rate": 2.2630201750561965e-09, - "loss": 0.0008, - "step": 1320 - }, - { - "epoch": 0.9939804364183596, - "grad_norm": 0.054570991545915604, - "learning_rate": 1.7880794783575738e-09, - "loss": 0.0019, - "step": 1321 - }, - { - "epoch": 0.9947328818660647, - "grad_norm": 0.07745349407196045, - "learning_rate": 1.3690079130090905e-09, - "loss": 0.0017, - "step": 1322 - }, - { - "epoch": 0.9954853273137697, - "grad_norm": 9.53114128112793, - "learning_rate": 1.0058078207453303e-09, - "loss": 0.0771, - "step": 1323 - }, - { - "epoch": 0.9962377727614747, - "grad_norm": 0.049212612211704254, - "learning_rate": 6.984812310950606e-10, - "loss": 0.0018, - "step": 1324 - }, - { - "epoch": 0.9969902182091799, - "grad_norm": 0.004117061849683523, - "learning_rate": 4.470298613745705e-10, - "loss": 0.0002, - "step": 1325 - }, - { - "epoch": 0.9977426636568849, - "grad_norm": 0.06400679796934128, - "learning_rate": 2.514551166699075e-10, - "loss": 0.0028, - "step": 1326 - }, - { - "epoch": 0.9984951091045899, - "grad_norm": 1.3703467845916748, - "learning_rate": 1.1175808983687752e-10, - "loss": 0.0185, - "step": 1327 - }, - { - "epoch": 0.999247554552295, - "grad_norm": 0.01683868281543255, - "learning_rate": 2.793956148994248e-11, - "loss": 0.0007, - "step": 1328 - }, - { - "epoch": 1.0, - "grad_norm": 0.0036646851804107428, - "learning_rate": 0.0, - "loss": 0.0001, - "step": 1329 } ], "logging_steps": 1, - "max_steps": 1329, + "max_steps": 332, "num_input_tokens_seen": 0, - "num_train_epochs": 1, + "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { @@ -9329,7 +2350,7 @@ "attributes": {} } }, - "total_flos": 4.439066769492296e+17, + "total_flos": 8.806882647064781e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null