{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9754098360655736, "eval_steps": 500, "global_step": 183, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016260162601626018, "grad_norm": 38.715505104327306, "learning_rate": 6.666666666666667e-07, "loss": 2.7935, "step": 1 }, { "epoch": 0.032520325203252036, "grad_norm": 43.92734901516536, "learning_rate": 1.3333333333333334e-06, "loss": 2.8618, "step": 2 }, { "epoch": 0.04878048780487805, "grad_norm": 40.50339510154553, "learning_rate": 2.0000000000000003e-06, "loss": 2.8164, "step": 3 }, { "epoch": 0.06504065040650407, "grad_norm": 28.51481486426967, "learning_rate": 2.666666666666667e-06, "loss": 2.672, "step": 4 }, { "epoch": 0.08130081300813008, "grad_norm": 17.557085161970114, "learning_rate": 3.3333333333333333e-06, "loss": 2.526, "step": 5 }, { "epoch": 0.0975609756097561, "grad_norm": 9.833831469561684, "learning_rate": 4.000000000000001e-06, "loss": 2.4556, "step": 6 }, { "epoch": 0.11382113821138211, "grad_norm": 5.8460959136595845, "learning_rate": 4.666666666666667e-06, "loss": 2.1935, "step": 7 }, { "epoch": 0.13008130081300814, "grad_norm": 5.207230159450673, "learning_rate": 5.333333333333334e-06, "loss": 2.3815, "step": 8 }, { "epoch": 0.14634146341463414, "grad_norm": 7.281692741856846, "learning_rate": 6e-06, "loss": 2.311, "step": 9 }, { "epoch": 0.16260162601626016, "grad_norm": 4.085861410961766, "learning_rate": 6.666666666666667e-06, "loss": 2.3801, "step": 10 }, { "epoch": 0.17886178861788618, "grad_norm": 3.1648108898931473, "learning_rate": 7.333333333333333e-06, "loss": 2.2303, "step": 11 }, { "epoch": 0.1951219512195122, "grad_norm": 2.733124694129773, "learning_rate": 8.000000000000001e-06, "loss": 2.2489, "step": 12 }, { "epoch": 0.21138211382113822, "grad_norm": 2.601048953766184, "learning_rate": 8.666666666666668e-06, "loss": 2.3216, "step": 13 }, { "epoch": 0.22764227642276422, "grad_norm": 2.799670635370881, "learning_rate": 9.333333333333334e-06, "loss": 2.3405, "step": 14 }, { "epoch": 0.24390243902439024, "grad_norm": 2.2889145851702035, "learning_rate": 1e-05, "loss": 2.2761, "step": 15 }, { "epoch": 0.2601626016260163, "grad_norm": 2.074354640710373, "learning_rate": 9.999529497453782e-06, "loss": 2.2498, "step": 16 }, { "epoch": 0.2764227642276423, "grad_norm": 1.8496605844704992, "learning_rate": 9.998118078364186e-06, "loss": 2.1694, "step": 17 }, { "epoch": 0.2926829268292683, "grad_norm": 1.657414876254685, "learning_rate": 9.99576600836172e-06, "loss": 2.1922, "step": 18 }, { "epoch": 0.3089430894308943, "grad_norm": 1.6874201712764785, "learning_rate": 9.992473730108354e-06, "loss": 2.2212, "step": 19 }, { "epoch": 0.3252032520325203, "grad_norm": 1.6218362871051897, "learning_rate": 9.988241863214212e-06, "loss": 2.2361, "step": 20 }, { "epoch": 0.34146341463414637, "grad_norm": 1.7125149417249366, "learning_rate": 9.98307120412095e-06, "loss": 2.0696, "step": 21 }, { "epoch": 0.35772357723577236, "grad_norm": 1.748991995811329, "learning_rate": 9.976962725951878e-06, "loss": 2.3047, "step": 22 }, { "epoch": 0.37398373983739835, "grad_norm": 1.635597001502544, "learning_rate": 9.969917578328808e-06, "loss": 2.1672, "step": 23 }, { "epoch": 0.3902439024390244, "grad_norm": 1.5126187316968263, "learning_rate": 9.961937087155697e-06, "loss": 2.3152, "step": 24 }, { "epoch": 0.4065040650406504, "grad_norm": 1.5822580849972034, "learning_rate": 9.953022754369115e-06, "loss": 2.2176, "step": 25 }, { "epoch": 0.42276422764227645, "grad_norm": 1.5777453977666092, "learning_rate": 9.943176257655567e-06, "loss": 2.3213, "step": 26 }, { "epoch": 0.43902439024390244, "grad_norm": 1.4094336427074805, "learning_rate": 9.932399450135765e-06, "loss": 2.1472, "step": 27 }, { "epoch": 0.45528455284552843, "grad_norm": 1.4386799640865944, "learning_rate": 9.920694360015864e-06, "loss": 2.2464, "step": 28 }, { "epoch": 0.4715447154471545, "grad_norm": 1.5398404589418597, "learning_rate": 9.908063190205739e-06, "loss": 2.2179, "step": 29 }, { "epoch": 0.4878048780487805, "grad_norm": 1.4686769996035434, "learning_rate": 9.894508317904418e-06, "loss": 2.1055, "step": 30 }, { "epoch": 0.5040650406504065, "grad_norm": 1.5185344171327764, "learning_rate": 9.880032294152673e-06, "loss": 2.4138, "step": 31 }, { "epoch": 0.5203252032520326, "grad_norm": 1.4788353106580088, "learning_rate": 9.864637843352916e-06, "loss": 2.1975, "step": 32 }, { "epoch": 0.5365853658536586, "grad_norm": 1.5482980039161824, "learning_rate": 9.848327862756466e-06, "loss": 2.2368, "step": 33 }, { "epoch": 0.5528455284552846, "grad_norm": 1.5529583533392486, "learning_rate": 9.831105421918287e-06, "loss": 2.244, "step": 34 }, { "epoch": 0.5691056910569106, "grad_norm": 1.2347439521124879, "learning_rate": 9.812973762119282e-06, "loss": 2.1969, "step": 35 }, { "epoch": 0.5853658536585366, "grad_norm": 1.7342145031832543, "learning_rate": 9.793936295756292e-06, "loss": 2.1326, "step": 36 }, { "epoch": 0.6016260162601627, "grad_norm": 1.6027037759927296, "learning_rate": 9.773996605699876e-06, "loss": 2.2234, "step": 37 }, { "epoch": 0.6178861788617886, "grad_norm": 1.5818719102225083, "learning_rate": 9.753158444620013e-06, "loss": 2.2304, "step": 38 }, { "epoch": 0.6341463414634146, "grad_norm": 1.5993004636102213, "learning_rate": 9.73142573427984e-06, "loss": 2.0879, "step": 39 }, { "epoch": 0.6504065040650406, "grad_norm": 1.4999201972757639, "learning_rate": 9.70880256479758e-06, "loss": 2.3389, "step": 40 }, { "epoch": 0.6666666666666666, "grad_norm": 1.5972052352215045, "learning_rate": 9.685293193876766e-06, "loss": 2.1184, "step": 41 }, { "epoch": 0.6829268292682927, "grad_norm": 1.5799871790968651, "learning_rate": 9.660902046004954e-06, "loss": 2.2381, "step": 42 }, { "epoch": 0.6991869918699187, "grad_norm": 1.7655935713426523, "learning_rate": 9.635633711621014e-06, "loss": 1.9785, "step": 43 }, { "epoch": 0.7154471544715447, "grad_norm": 1.4455569662489212, "learning_rate": 9.60949294625121e-06, "loss": 2.2598, "step": 44 }, { "epoch": 0.7317073170731707, "grad_norm": 1.6438734541523634, "learning_rate": 9.582484669614212e-06, "loss": 2.2254, "step": 45 }, { "epoch": 0.7479674796747967, "grad_norm": 1.5826415829816778, "learning_rate": 9.554613964695189e-06, "loss": 2.3195, "step": 46 }, { "epoch": 0.7642276422764228, "grad_norm": 1.5129794859111765, "learning_rate": 9.525886076789195e-06, "loss": 2.309, "step": 47 }, { "epoch": 0.7804878048780488, "grad_norm": 1.4425514088997948, "learning_rate": 9.496306412513989e-06, "loss": 2.1234, "step": 48 }, { "epoch": 0.7967479674796748, "grad_norm": 1.4940088273936272, "learning_rate": 9.465880538792519e-06, "loss": 2.2481, "step": 49 }, { "epoch": 0.8130081300813008, "grad_norm": 1.4327286196072573, "learning_rate": 9.434614181805203e-06, "loss": 2.18, "step": 50 }, { "epoch": 0.8292682926829268, "grad_norm": 1.431778831080833, "learning_rate": 9.402513225912273e-06, "loss": 2.1464, "step": 51 }, { "epoch": 0.8455284552845529, "grad_norm": 1.4448422926978557, "learning_rate": 9.369583712546322e-06, "loss": 2.1412, "step": 52 }, { "epoch": 0.8617886178861789, "grad_norm": 1.4679959607329234, "learning_rate": 9.335831839075303e-06, "loss": 2.0886, "step": 53 }, { "epoch": 0.8780487804878049, "grad_norm": 1.4297931147920537, "learning_rate": 9.30126395763618e-06, "loss": 2.2567, "step": 54 }, { "epoch": 0.8943089430894309, "grad_norm": 1.3468034919950198, "learning_rate": 9.265886573939448e-06, "loss": 2.2071, "step": 55 }, { "epoch": 0.9105691056910569, "grad_norm": 1.4936153334385893, "learning_rate": 9.229706346044749e-06, "loss": 2.2185, "step": 56 }, { "epoch": 0.926829268292683, "grad_norm": 1.6789184395018877, "learning_rate": 9.19273008310782e-06, "loss": 2.2014, "step": 57 }, { "epoch": 0.943089430894309, "grad_norm": 2.328661298153007, "learning_rate": 9.154964744099006e-06, "loss": 2.2311, "step": 58 }, { "epoch": 0.959349593495935, "grad_norm": 1.6031494617421091, "learning_rate": 9.116417436493574e-06, "loss": 2.2817, "step": 59 }, { "epoch": 0.975609756097561, "grad_norm": 1.378022632096063, "learning_rate": 9.077095414934076e-06, "loss": 2.3173, "step": 60 }, { "epoch": 0.991869918699187, "grad_norm": 1.378014833579439, "learning_rate": 9.037006079865017e-06, "loss": 2.1865, "step": 61 }, { "epoch": 1.008130081300813, "grad_norm": 1.777721048156252, "learning_rate": 8.996156976140088e-06, "loss": 1.9477, "step": 62 }, { "epoch": 1.024390243902439, "grad_norm": 1.9572083586736355, "learning_rate": 8.95455579160221e-06, "loss": 1.9606, "step": 63 }, { "epoch": 1.040650406504065, "grad_norm": 1.83840626983557, "learning_rate": 8.91221035563669e-06, "loss": 1.9641, "step": 64 }, { "epoch": 1.056910569105691, "grad_norm": 1.7482663924396353, "learning_rate": 8.869128637697702e-06, "loss": 1.912, "step": 65 }, { "epoch": 1.0731707317073171, "grad_norm": 2.436293071917533, "learning_rate": 8.82531874580844e-06, "loss": 2.0053, "step": 66 }, { "epoch": 1.089430894308943, "grad_norm": 1.7774996740932676, "learning_rate": 8.780788925035178e-06, "loss": 1.8775, "step": 67 }, { "epoch": 1.1056910569105691, "grad_norm": 1.5469357572162499, "learning_rate": 8.735547555935538e-06, "loss": 1.8116, "step": 68 }, { "epoch": 1.1219512195121952, "grad_norm": 1.9514287859645285, "learning_rate": 8.689603152981262e-06, "loss": 1.9458, "step": 69 }, { "epoch": 1.1382113821138211, "grad_norm": 2.16053582553305, "learning_rate": 8.642964362955781e-06, "loss": 1.8903, "step": 70 }, { "epoch": 1.1544715447154472, "grad_norm": 1.8295345259876294, "learning_rate": 8.59563996332688e-06, "loss": 1.9097, "step": 71 }, { "epoch": 1.170731707317073, "grad_norm": 1.6192771141787508, "learning_rate": 8.547638860594765e-06, "loss": 1.9404, "step": 72 }, { "epoch": 1.1869918699186992, "grad_norm": 1.732769202153044, "learning_rate": 8.498970088615861e-06, "loss": 1.9072, "step": 73 }, { "epoch": 1.203252032520325, "grad_norm": 1.7124937892640733, "learning_rate": 8.449642806902623e-06, "loss": 1.9361, "step": 74 }, { "epoch": 1.2195121951219512, "grad_norm": 1.4632968865855904, "learning_rate": 8.399666298899706e-06, "loss": 1.9064, "step": 75 }, { "epoch": 1.2357723577235773, "grad_norm": 1.57738250567877, "learning_rate": 8.349049970236822e-06, "loss": 1.8884, "step": 76 }, { "epoch": 1.2520325203252032, "grad_norm": 1.6860131175954445, "learning_rate": 8.29780334695857e-06, "loss": 1.8003, "step": 77 }, { "epoch": 1.2682926829268293, "grad_norm": 1.6737455231202842, "learning_rate": 8.245936073731654e-06, "loss": 1.9188, "step": 78 }, { "epoch": 1.2845528455284554, "grad_norm": 1.5244480150455564, "learning_rate": 8.193457912029713e-06, "loss": 1.9428, "step": 79 }, { "epoch": 1.3008130081300813, "grad_norm": 2.3077062591013546, "learning_rate": 8.140378738296233e-06, "loss": 1.9652, "step": 80 }, { "epoch": 1.3170731707317074, "grad_norm": 2.0246897260170433, "learning_rate": 8.086708542085769e-06, "loss": 1.8873, "step": 81 }, { "epoch": 1.3333333333333333, "grad_norm": 1.6727268288560573, "learning_rate": 8.032457424183909e-06, "loss": 1.8492, "step": 82 }, { "epoch": 1.3495934959349594, "grad_norm": 1.5569674481989706, "learning_rate": 7.977635594706298e-06, "loss": 1.9254, "step": 83 }, { "epoch": 1.3658536585365852, "grad_norm": 1.6023097703448475, "learning_rate": 7.922253371177081e-06, "loss": 1.9188, "step": 84 }, { "epoch": 1.3821138211382114, "grad_norm": 1.579712800121583, "learning_rate": 7.866321176587129e-06, "loss": 1.7658, "step": 85 }, { "epoch": 1.3983739837398375, "grad_norm": 1.5575689401980928, "learning_rate": 7.809849537432432e-06, "loss": 1.8844, "step": 86 }, { "epoch": 1.4146341463414633, "grad_norm": 2.0996103231960186, "learning_rate": 7.752849081732993e-06, "loss": 1.8144, "step": 87 }, { "epoch": 1.4308943089430894, "grad_norm": 2.256462070639042, "learning_rate": 7.695330537032629e-06, "loss": 1.9585, "step": 88 }, { "epoch": 1.4471544715447155, "grad_norm": 1.5391039524045877, "learning_rate": 7.637304728380036e-06, "loss": 1.7978, "step": 89 }, { "epoch": 1.4634146341463414, "grad_norm": 1.6288407781749783, "learning_rate": 7.578782576291501e-06, "loss": 1.8906, "step": 90 }, { "epoch": 1.4796747967479675, "grad_norm": 1.4226382530812676, "learning_rate": 7.51977509469565e-06, "loss": 1.8687, "step": 91 }, { "epoch": 1.4959349593495934, "grad_norm": 1.5704917583247462, "learning_rate": 7.460293388860616e-06, "loss": 2.0512, "step": 92 }, { "epoch": 1.5121951219512195, "grad_norm": 1.4787223094009658, "learning_rate": 7.400348653304022e-06, "loss": 1.9736, "step": 93 }, { "epoch": 1.5284552845528454, "grad_norm": 1.4275720991608627, "learning_rate": 7.3399521696861505e-06, "loss": 1.9977, "step": 94 }, { "epoch": 1.5447154471544715, "grad_norm": 1.4098433082701363, "learning_rate": 7.2791153046867344e-06, "loss": 1.8225, "step": 95 }, { "epoch": 1.5609756097560976, "grad_norm": 4.67217439880156, "learning_rate": 7.217849507865724e-06, "loss": 1.9615, "step": 96 }, { "epoch": 1.5772357723577235, "grad_norm": 1.519886113544732, "learning_rate": 7.156166309508482e-06, "loss": 1.8566, "step": 97 }, { "epoch": 1.5934959349593496, "grad_norm": 1.4312222151537652, "learning_rate": 7.094077318455762e-06, "loss": 1.8472, "step": 98 }, { "epoch": 1.6097560975609757, "grad_norm": 1.595194011963869, "learning_rate": 7.031594219918916e-06, "loss": 1.8246, "step": 99 }, { "epoch": 1.6260162601626016, "grad_norm": 1.5531618087372638, "learning_rate": 6.96872877328073e-06, "loss": 1.9809, "step": 100 }, { "epoch": 1.6422764227642277, "grad_norm": 1.4743001623349186, "learning_rate": 6.905492809882286e-06, "loss": 2.0107, "step": 101 }, { "epoch": 1.6585365853658538, "grad_norm": 1.9754475724007412, "learning_rate": 6.841898230796302e-06, "loss": 1.8143, "step": 102 }, { "epoch": 1.6747967479674797, "grad_norm": 2.2741340273630635, "learning_rate": 6.777957004587332e-06, "loss": 1.7824, "step": 103 }, { "epoch": 1.6910569105691056, "grad_norm": 1.6564200369629691, "learning_rate": 6.713681165059271e-06, "loss": 1.8844, "step": 104 }, { "epoch": 1.7073170731707317, "grad_norm": 1.5809400530395674, "learning_rate": 6.6490828089905854e-06, "loss": 1.8789, "step": 105 }, { "epoch": 1.7235772357723578, "grad_norm": 1.4746817698203465, "learning_rate": 6.584174093857676e-06, "loss": 1.9045, "step": 106 }, { "epoch": 1.7398373983739837, "grad_norm": 1.353250857647505, "learning_rate": 6.5189672355468415e-06, "loss": 1.8118, "step": 107 }, { "epoch": 1.7560975609756098, "grad_norm": 1.4850655925014062, "learning_rate": 6.453474506055228e-06, "loss": 1.8122, "step": 108 }, { "epoch": 1.7723577235772359, "grad_norm": 1.8401558539962772, "learning_rate": 6.387708231181229e-06, "loss": 1.7482, "step": 109 }, { "epoch": 1.7886178861788617, "grad_norm": 1.5200607844016307, "learning_rate": 6.3216807882047585e-06, "loss": 1.9692, "step": 110 }, { "epoch": 1.8048780487804879, "grad_norm": 1.5040703158854816, "learning_rate": 6.255404603557833e-06, "loss": 1.8885, "step": 111 }, { "epoch": 1.821138211382114, "grad_norm": 1.5595167294836927, "learning_rate": 6.188892150485904e-06, "loss": 1.8763, "step": 112 }, { "epoch": 1.8373983739837398, "grad_norm": 1.5145771953515301, "learning_rate": 6.122155946700381e-06, "loss": 2.0202, "step": 113 }, { "epoch": 1.8536585365853657, "grad_norm": 1.3682270101469567, "learning_rate": 6.0552085520227875e-06, "loss": 1.9047, "step": 114 }, { "epoch": 1.8699186991869918, "grad_norm": 1.6370392819430508, "learning_rate": 5.988062566020987e-06, "loss": 1.9071, "step": 115 }, { "epoch": 1.886178861788618, "grad_norm": 1.5064391288691565, "learning_rate": 5.920730625637934e-06, "loss": 1.9622, "step": 116 }, { "epoch": 1.9024390243902438, "grad_norm": 1.5117346102674705, "learning_rate": 5.853225402813381e-06, "loss": 1.8889, "step": 117 }, { "epoch": 1.91869918699187, "grad_norm": 1.5330246513699148, "learning_rate": 5.785559602099019e-06, "loss": 1.8971, "step": 118 }, { "epoch": 1.934959349593496, "grad_norm": 1.446396984889757, "learning_rate": 5.7177459582674595e-06, "loss": 1.8328, "step": 119 }, { "epoch": 1.951219512195122, "grad_norm": 1.495856639357874, "learning_rate": 5.649797233915539e-06, "loss": 1.8684, "step": 120 }, { "epoch": 1.967479674796748, "grad_norm": 1.4499233339195112, "learning_rate": 5.5817262170623865e-06, "loss": 1.8167, "step": 121 }, { "epoch": 1.9837398373983741, "grad_norm": 2.7226474462829744, "learning_rate": 5.513545718742702e-06, "loss": 1.9347, "step": 122 }, { "epoch": 2.0, "grad_norm": 1.6284827727818851, "learning_rate": 5.4452685705957084e-06, "loss": 1.9544, "step": 123 }, { "epoch": 2.0081967213114753, "grad_norm": 2.196083883493474, "learning_rate": 5.376907622450229e-06, "loss": 1.726, "step": 124 }, { "epoch": 2.0245901639344264, "grad_norm": 2.5371696562778325, "learning_rate": 5.308475739906329e-06, "loss": 1.6243, "step": 125 }, { "epoch": 2.040983606557377, "grad_norm": 2.268797600455164, "learning_rate": 5.2399858019140005e-06, "loss": 1.6506, "step": 126 }, { "epoch": 2.057377049180328, "grad_norm": 2.189268946289623, "learning_rate": 5.171450698349329e-06, "loss": 1.5677, "step": 127 }, { "epoch": 2.0737704918032787, "grad_norm": 3.330757242919671, "learning_rate": 5.102883327588608e-06, "loss": 1.5414, "step": 128 }, { "epoch": 2.0901639344262297, "grad_norm": 4.975856342017294, "learning_rate": 5.034296594080849e-06, "loss": 1.5934, "step": 129 }, { "epoch": 2.1065573770491803, "grad_norm": 2.6202824821832644, "learning_rate": 4.965703405919154e-06, "loss": 1.5114, "step": 130 }, { "epoch": 2.122950819672131, "grad_norm": 2.145467593529425, "learning_rate": 4.897116672411395e-06, "loss": 1.6081, "step": 131 }, { "epoch": 2.139344262295082, "grad_norm": 2.2918703289964597, "learning_rate": 4.828549301650673e-06, "loss": 1.6128, "step": 132 }, { "epoch": 2.1557377049180326, "grad_norm": 2.2020919561070307, "learning_rate": 4.760014198086001e-06, "loss": 1.5385, "step": 133 }, { "epoch": 2.1721311475409837, "grad_norm": 1.9897624139230765, "learning_rate": 4.691524260093672e-06, "loss": 1.5731, "step": 134 }, { "epoch": 2.1885245901639343, "grad_norm": 2.1542550052135665, "learning_rate": 4.623092377549772e-06, "loss": 1.671, "step": 135 }, { "epoch": 2.2049180327868854, "grad_norm": 2.2842158477910512, "learning_rate": 4.554731429404293e-06, "loss": 1.614, "step": 136 }, { "epoch": 2.221311475409836, "grad_norm": 2.1239054707901452, "learning_rate": 4.4864542812573e-06, "loss": 1.6136, "step": 137 }, { "epoch": 2.237704918032787, "grad_norm": 2.0393208350755008, "learning_rate": 4.4182737829376135e-06, "loss": 1.6467, "step": 138 }, { "epoch": 2.2540983606557377, "grad_norm": 2.0314018092300103, "learning_rate": 4.3502027660844606e-06, "loss": 1.6771, "step": 139 }, { "epoch": 2.2704918032786887, "grad_norm": 2.258947100849125, "learning_rate": 4.28225404173254e-06, "loss": 1.5915, "step": 140 }, { "epoch": 2.2868852459016393, "grad_norm": 2.1050457746344313, "learning_rate": 4.214440397900983e-06, "loss": 1.6608, "step": 141 }, { "epoch": 2.30327868852459, "grad_norm": 1.948114104146267, "learning_rate": 4.146774597186622e-06, "loss": 1.5369, "step": 142 }, { "epoch": 2.319672131147541, "grad_norm": 1.864917840283612, "learning_rate": 4.0792693743620695e-06, "loss": 1.5717, "step": 143 }, { "epoch": 2.3360655737704916, "grad_norm": 1.7461920012860392, "learning_rate": 4.011937433979014e-06, "loss": 1.5871, "step": 144 }, { "epoch": 2.3524590163934427, "grad_norm": 1.798186392967839, "learning_rate": 3.944791447977213e-06, "loss": 1.5592, "step": 145 }, { "epoch": 2.3688524590163933, "grad_norm": 1.840314539800089, "learning_rate": 3.87784405329962e-06, "loss": 1.5807, "step": 146 }, { "epoch": 2.3852459016393444, "grad_norm": 1.9329074085202937, "learning_rate": 3.811107849514098e-06, "loss": 1.5505, "step": 147 }, { "epoch": 2.401639344262295, "grad_norm": 1.652283737337734, "learning_rate": 3.744595396442169e-06, "loss": 1.5601, "step": 148 }, { "epoch": 2.418032786885246, "grad_norm": 1.6942011471416905, "learning_rate": 3.6783192117952427e-06, "loss": 1.5762, "step": 149 }, { "epoch": 2.4344262295081966, "grad_norm": 1.6999827546312305, "learning_rate": 3.612291768818772e-06, "loss": 1.5286, "step": 150 }, { "epoch": 2.4508196721311477, "grad_norm": 1.717480901806302, "learning_rate": 3.5465254939447737e-06, "loss": 1.6165, "step": 151 }, { "epoch": 2.4672131147540983, "grad_norm": 1.7354378842697238, "learning_rate": 3.4810327644531606e-06, "loss": 1.5636, "step": 152 }, { "epoch": 2.4836065573770494, "grad_norm": 2.076605836354899, "learning_rate": 3.415825906142326e-06, "loss": 1.5426, "step": 153 }, { "epoch": 2.5, "grad_norm": 2.807787441701596, "learning_rate": 3.3509171910094162e-06, "loss": 1.4229, "step": 154 }, { "epoch": 2.5163934426229506, "grad_norm": 2.1162841013271, "learning_rate": 3.2863188349407293e-06, "loss": 1.5828, "step": 155 }, { "epoch": 2.5327868852459017, "grad_norm": 2.06404980427501, "learning_rate": 3.222042995412669e-06, "loss": 1.6511, "step": 156 }, { "epoch": 2.5491803278688527, "grad_norm": 1.954408720302879, "learning_rate": 3.1581017692036986e-06, "loss": 1.5611, "step": 157 }, { "epoch": 2.5655737704918034, "grad_norm": 1.8718671260321513, "learning_rate": 3.094507190117715e-06, "loss": 1.5528, "step": 158 }, { "epoch": 2.581967213114754, "grad_norm": 1.8323058517821103, "learning_rate": 3.0312712267192713e-06, "loss": 1.5525, "step": 159 }, { "epoch": 2.598360655737705, "grad_norm": 4.8860562051247385, "learning_rate": 2.9684057800810844e-06, "loss": 1.5571, "step": 160 }, { "epoch": 2.6147540983606556, "grad_norm": 2.1738936906241237, "learning_rate": 2.9059226815442386e-06, "loss": 1.4133, "step": 161 }, { "epoch": 2.6311475409836067, "grad_norm": 2.1707521374782246, "learning_rate": 2.8438336904915186e-06, "loss": 1.7308, "step": 162 }, { "epoch": 2.6475409836065573, "grad_norm": 1.848892940417498, "learning_rate": 2.782150492134278e-06, "loss": 1.6001, "step": 163 }, { "epoch": 2.663934426229508, "grad_norm": 1.7230291366018273, "learning_rate": 2.7208846953132685e-06, "loss": 1.6081, "step": 164 }, { "epoch": 2.680327868852459, "grad_norm": 1.6901177596723493, "learning_rate": 2.6600478303138503e-06, "loss": 1.4536, "step": 165 }, { "epoch": 2.69672131147541, "grad_norm": 1.770734235373982, "learning_rate": 2.599651346695979e-06, "loss": 1.5516, "step": 166 }, { "epoch": 2.7131147540983607, "grad_norm": 1.9713299716871673, "learning_rate": 2.539706611139385e-06, "loss": 1.6036, "step": 167 }, { "epoch": 2.7295081967213113, "grad_norm": 1.944020272636568, "learning_rate": 2.4802249053043525e-06, "loss": 1.6063, "step": 168 }, { "epoch": 2.7459016393442623, "grad_norm": 1.917025051456852, "learning_rate": 2.4212174237085007e-06, "loss": 1.484, "step": 169 }, { "epoch": 2.762295081967213, "grad_norm": 1.7078308287195758, "learning_rate": 2.3626952716199647e-06, "loss": 1.6206, "step": 170 }, { "epoch": 2.778688524590164, "grad_norm": 2.105784906420741, "learning_rate": 2.3046694629673715e-06, "loss": 1.5728, "step": 171 }, { "epoch": 2.7950819672131146, "grad_norm": 1.8395129946614983, "learning_rate": 2.247150918267008e-06, "loss": 1.5713, "step": 172 }, { "epoch": 2.8114754098360657, "grad_norm": 1.6207009651182827, "learning_rate": 2.190150462567569e-06, "loss": 1.5185, "step": 173 }, { "epoch": 2.8278688524590163, "grad_norm": 1.5443883689967923, "learning_rate": 2.133678823412873e-06, "loss": 1.5119, "step": 174 }, { "epoch": 2.8442622950819674, "grad_norm": 1.6761100058671115, "learning_rate": 2.077746628822921e-06, "loss": 1.5463, "step": 175 }, { "epoch": 2.860655737704918, "grad_norm": 1.982740476512917, "learning_rate": 2.022364405293703e-06, "loss": 1.5463, "step": 176 }, { "epoch": 2.8770491803278686, "grad_norm": 1.792200717708596, "learning_rate": 1.9675425758160927e-06, "loss": 1.6641, "step": 177 }, { "epoch": 2.8934426229508197, "grad_norm": 2.129611048939019, "learning_rate": 1.913291457914234e-06, "loss": 1.6427, "step": 178 }, { "epoch": 2.9098360655737707, "grad_norm": 1.703026911107873, "learning_rate": 1.8596212617037695e-06, "loss": 1.5071, "step": 179 }, { "epoch": 2.9262295081967213, "grad_norm": 1.684347363697613, "learning_rate": 1.8065420879702888e-06, "loss": 1.613, "step": 180 }, { "epoch": 2.942622950819672, "grad_norm": 4.094268965277064, "learning_rate": 1.754063926268349e-06, "loss": 1.562, "step": 181 }, { "epoch": 2.959016393442623, "grad_norm": 1.8320884272338482, "learning_rate": 1.7021966530414303e-06, "loss": 1.5323, "step": 182 }, { "epoch": 2.9754098360655736, "grad_norm": 2.1968448353373833, "learning_rate": 1.6509500297631786e-06, "loss": 1.6683, "step": 183 } ], "logging_steps": 1, "max_steps": 244, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 61, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.784065850068173e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }