{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9754098360655736,
  "eval_steps": 500,
  "global_step": 183,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.016260162601626018,
      "grad_norm": 38.715505104327306,
      "learning_rate": 6.666666666666667e-07,
      "loss": 2.7935,
      "step": 1
    },
    {
      "epoch": 0.032520325203252036,
      "grad_norm": 43.92734901516536,
      "learning_rate": 1.3333333333333334e-06,
      "loss": 2.8618,
      "step": 2
    },
    {
      "epoch": 0.04878048780487805,
      "grad_norm": 40.50339510154553,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 2.8164,
      "step": 3
    },
    {
      "epoch": 0.06504065040650407,
      "grad_norm": 28.51481486426967,
      "learning_rate": 2.666666666666667e-06,
      "loss": 2.672,
      "step": 4
    },
    {
      "epoch": 0.08130081300813008,
      "grad_norm": 17.557085161970114,
      "learning_rate": 3.3333333333333333e-06,
      "loss": 2.526,
      "step": 5
    },
    {
      "epoch": 0.0975609756097561,
      "grad_norm": 9.833831469561684,
      "learning_rate": 4.000000000000001e-06,
      "loss": 2.4556,
      "step": 6
    },
    {
      "epoch": 0.11382113821138211,
      "grad_norm": 5.8460959136595845,
      "learning_rate": 4.666666666666667e-06,
      "loss": 2.1935,
      "step": 7
    },
    {
      "epoch": 0.13008130081300814,
      "grad_norm": 5.207230159450673,
      "learning_rate": 5.333333333333334e-06,
      "loss": 2.3815,
      "step": 8
    },
    {
      "epoch": 0.14634146341463414,
      "grad_norm": 7.281692741856846,
      "learning_rate": 6e-06,
      "loss": 2.311,
      "step": 9
    },
    {
      "epoch": 0.16260162601626016,
      "grad_norm": 4.085861410961766,
      "learning_rate": 6.666666666666667e-06,
      "loss": 2.3801,
      "step": 10
    },
    {
      "epoch": 0.17886178861788618,
      "grad_norm": 3.1648108898931473,
      "learning_rate": 7.333333333333333e-06,
      "loss": 2.2303,
      "step": 11
    },
    {
      "epoch": 0.1951219512195122,
      "grad_norm": 2.733124694129773,
      "learning_rate": 8.000000000000001e-06,
      "loss": 2.2489,
      "step": 12
    },
    {
      "epoch": 0.21138211382113822,
      "grad_norm": 2.601048953766184,
      "learning_rate": 8.666666666666668e-06,
      "loss": 2.3216,
      "step": 13
    },
    {
      "epoch": 0.22764227642276422,
      "grad_norm": 2.799670635370881,
      "learning_rate": 9.333333333333334e-06,
      "loss": 2.3405,
      "step": 14
    },
    {
      "epoch": 0.24390243902439024,
      "grad_norm": 2.2889145851702035,
      "learning_rate": 1e-05,
      "loss": 2.2761,
      "step": 15
    },
    {
      "epoch": 0.2601626016260163,
      "grad_norm": 2.074354640710373,
      "learning_rate": 9.999529497453782e-06,
      "loss": 2.2498,
      "step": 16
    },
    {
      "epoch": 0.2764227642276423,
      "grad_norm": 1.8496605844704992,
      "learning_rate": 9.998118078364186e-06,
      "loss": 2.1694,
      "step": 17
    },
    {
      "epoch": 0.2926829268292683,
      "grad_norm": 1.657414876254685,
      "learning_rate": 9.99576600836172e-06,
      "loss": 2.1922,
      "step": 18
    },
    {
      "epoch": 0.3089430894308943,
      "grad_norm": 1.6874201712764785,
      "learning_rate": 9.992473730108354e-06,
      "loss": 2.2212,
      "step": 19
    },
    {
      "epoch": 0.3252032520325203,
      "grad_norm": 1.6218362871051897,
      "learning_rate": 9.988241863214212e-06,
      "loss": 2.2361,
      "step": 20
    },
    {
      "epoch": 0.34146341463414637,
      "grad_norm": 1.7125149417249366,
      "learning_rate": 9.98307120412095e-06,
      "loss": 2.0696,
      "step": 21
    },
    {
      "epoch": 0.35772357723577236,
      "grad_norm": 1.748991995811329,
      "learning_rate": 9.976962725951878e-06,
      "loss": 2.3047,
      "step": 22
    },
    {
      "epoch": 0.37398373983739835,
      "grad_norm": 1.635597001502544,
      "learning_rate": 9.969917578328808e-06,
      "loss": 2.1672,
      "step": 23
    },
    {
      "epoch": 0.3902439024390244,
      "grad_norm": 1.5126187316968263,
      "learning_rate": 9.961937087155697e-06,
      "loss": 2.3152,
      "step": 24
    },
    {
      "epoch": 0.4065040650406504,
      "grad_norm": 1.5822580849972034,
      "learning_rate": 9.953022754369115e-06,
      "loss": 2.2176,
      "step": 25
    },
    {
      "epoch": 0.42276422764227645,
      "grad_norm": 1.5777453977666092,
      "learning_rate": 9.943176257655567e-06,
      "loss": 2.3213,
      "step": 26
    },
    {
      "epoch": 0.43902439024390244,
      "grad_norm": 1.4094336427074805,
      "learning_rate": 9.932399450135765e-06,
      "loss": 2.1472,
      "step": 27
    },
    {
      "epoch": 0.45528455284552843,
      "grad_norm": 1.4386799640865944,
      "learning_rate": 9.920694360015864e-06,
      "loss": 2.2464,
      "step": 28
    },
    {
      "epoch": 0.4715447154471545,
      "grad_norm": 1.5398404589418597,
      "learning_rate": 9.908063190205739e-06,
      "loss": 2.2179,
      "step": 29
    },
    {
      "epoch": 0.4878048780487805,
      "grad_norm": 1.4686769996035434,
      "learning_rate": 9.894508317904418e-06,
      "loss": 2.1055,
      "step": 30
    },
    {
      "epoch": 0.5040650406504065,
      "grad_norm": 1.5185344171327764,
      "learning_rate": 9.880032294152673e-06,
      "loss": 2.4138,
      "step": 31
    },
    {
      "epoch": 0.5203252032520326,
      "grad_norm": 1.4788353106580088,
      "learning_rate": 9.864637843352916e-06,
      "loss": 2.1975,
      "step": 32
    },
    {
      "epoch": 0.5365853658536586,
      "grad_norm": 1.5482980039161824,
      "learning_rate": 9.848327862756466e-06,
      "loss": 2.2368,
      "step": 33
    },
    {
      "epoch": 0.5528455284552846,
      "grad_norm": 1.5529583533392486,
      "learning_rate": 9.831105421918287e-06,
      "loss": 2.244,
      "step": 34
    },
    {
      "epoch": 0.5691056910569106,
      "grad_norm": 1.2347439521124879,
      "learning_rate": 9.812973762119282e-06,
      "loss": 2.1969,
      "step": 35
    },
    {
      "epoch": 0.5853658536585366,
      "grad_norm": 1.7342145031832543,
      "learning_rate": 9.793936295756292e-06,
      "loss": 2.1326,
      "step": 36
    },
    {
      "epoch": 0.6016260162601627,
      "grad_norm": 1.6027037759927296,
      "learning_rate": 9.773996605699876e-06,
      "loss": 2.2234,
      "step": 37
    },
    {
      "epoch": 0.6178861788617886,
      "grad_norm": 1.5818719102225083,
      "learning_rate": 9.753158444620013e-06,
      "loss": 2.2304,
      "step": 38
    },
    {
      "epoch": 0.6341463414634146,
      "grad_norm": 1.5993004636102213,
      "learning_rate": 9.73142573427984e-06,
      "loss": 2.0879,
      "step": 39
    },
    {
      "epoch": 0.6504065040650406,
      "grad_norm": 1.4999201972757639,
      "learning_rate": 9.70880256479758e-06,
      "loss": 2.3389,
      "step": 40
    },
    {
      "epoch": 0.6666666666666666,
      "grad_norm": 1.5972052352215045,
      "learning_rate": 9.685293193876766e-06,
      "loss": 2.1184,
      "step": 41
    },
    {
      "epoch": 0.6829268292682927,
      "grad_norm": 1.5799871790968651,
      "learning_rate": 9.660902046004954e-06,
      "loss": 2.2381,
      "step": 42
    },
    {
      "epoch": 0.6991869918699187,
      "grad_norm": 1.7655935713426523,
      "learning_rate": 9.635633711621014e-06,
      "loss": 1.9785,
      "step": 43
    },
    {
      "epoch": 0.7154471544715447,
      "grad_norm": 1.4455569662489212,
      "learning_rate": 9.60949294625121e-06,
      "loss": 2.2598,
      "step": 44
    },
    {
      "epoch": 0.7317073170731707,
      "grad_norm": 1.6438734541523634,
      "learning_rate": 9.582484669614212e-06,
      "loss": 2.2254,
      "step": 45
    },
    {
      "epoch": 0.7479674796747967,
      "grad_norm": 1.5826415829816778,
      "learning_rate": 9.554613964695189e-06,
      "loss": 2.3195,
      "step": 46
    },
    {
      "epoch": 0.7642276422764228,
      "grad_norm": 1.5129794859111765,
      "learning_rate": 9.525886076789195e-06,
      "loss": 2.309,
      "step": 47
    },
    {
      "epoch": 0.7804878048780488,
      "grad_norm": 1.4425514088997948,
      "learning_rate": 9.496306412513989e-06,
      "loss": 2.1234,
      "step": 48
    },
    {
      "epoch": 0.7967479674796748,
      "grad_norm": 1.4940088273936272,
      "learning_rate": 9.465880538792519e-06,
      "loss": 2.2481,
      "step": 49
    },
    {
      "epoch": 0.8130081300813008,
      "grad_norm": 1.4327286196072573,
      "learning_rate": 9.434614181805203e-06,
      "loss": 2.18,
      "step": 50
    },
    {
      "epoch": 0.8292682926829268,
      "grad_norm": 1.431778831080833,
      "learning_rate": 9.402513225912273e-06,
      "loss": 2.1464,
      "step": 51
    },
    {
      "epoch": 0.8455284552845529,
      "grad_norm": 1.4448422926978557,
      "learning_rate": 9.369583712546322e-06,
      "loss": 2.1412,
      "step": 52
    },
    {
      "epoch": 0.8617886178861789,
      "grad_norm": 1.4679959607329234,
      "learning_rate": 9.335831839075303e-06,
      "loss": 2.0886,
      "step": 53
    },
    {
      "epoch": 0.8780487804878049,
      "grad_norm": 1.4297931147920537,
      "learning_rate": 9.30126395763618e-06,
      "loss": 2.2567,
      "step": 54
    },
    {
      "epoch": 0.8943089430894309,
      "grad_norm": 1.3468034919950198,
      "learning_rate": 9.265886573939448e-06,
      "loss": 2.2071,
      "step": 55
    },
    {
      "epoch": 0.9105691056910569,
      "grad_norm": 1.4936153334385893,
      "learning_rate": 9.229706346044749e-06,
      "loss": 2.2185,
      "step": 56
    },
    {
      "epoch": 0.926829268292683,
      "grad_norm": 1.6789184395018877,
      "learning_rate": 9.19273008310782e-06,
      "loss": 2.2014,
      "step": 57
    },
    {
      "epoch": 0.943089430894309,
      "grad_norm": 2.328661298153007,
      "learning_rate": 9.154964744099006e-06,
      "loss": 2.2311,
      "step": 58
    },
    {
      "epoch": 0.959349593495935,
      "grad_norm": 1.6031494617421091,
      "learning_rate": 9.116417436493574e-06,
      "loss": 2.2817,
      "step": 59
    },
    {
      "epoch": 0.975609756097561,
      "grad_norm": 1.378022632096063,
      "learning_rate": 9.077095414934076e-06,
      "loss": 2.3173,
      "step": 60
    },
    {
      "epoch": 0.991869918699187,
      "grad_norm": 1.378014833579439,
      "learning_rate": 9.037006079865017e-06,
      "loss": 2.1865,
      "step": 61
    },
    {
      "epoch": 1.008130081300813,
      "grad_norm": 1.777721048156252,
      "learning_rate": 8.996156976140088e-06,
      "loss": 1.9477,
      "step": 62
    },
    {
      "epoch": 1.024390243902439,
      "grad_norm": 1.9572083586736355,
      "learning_rate": 8.95455579160221e-06,
      "loss": 1.9606,
      "step": 63
    },
    {
      "epoch": 1.040650406504065,
      "grad_norm": 1.83840626983557,
      "learning_rate": 8.91221035563669e-06,
      "loss": 1.9641,
      "step": 64
    },
    {
      "epoch": 1.056910569105691,
      "grad_norm": 1.7482663924396353,
      "learning_rate": 8.869128637697702e-06,
      "loss": 1.912,
      "step": 65
    },
    {
      "epoch": 1.0731707317073171,
      "grad_norm": 2.436293071917533,
      "learning_rate": 8.82531874580844e-06,
      "loss": 2.0053,
      "step": 66
    },
    {
      "epoch": 1.089430894308943,
      "grad_norm": 1.7774996740932676,
      "learning_rate": 8.780788925035178e-06,
      "loss": 1.8775,
      "step": 67
    },
    {
      "epoch": 1.1056910569105691,
      "grad_norm": 1.5469357572162499,
      "learning_rate": 8.735547555935538e-06,
      "loss": 1.8116,
      "step": 68
    },
    {
      "epoch": 1.1219512195121952,
      "grad_norm": 1.9514287859645285,
      "learning_rate": 8.689603152981262e-06,
      "loss": 1.9458,
      "step": 69
    },
    {
      "epoch": 1.1382113821138211,
      "grad_norm": 2.16053582553305,
      "learning_rate": 8.642964362955781e-06,
      "loss": 1.8903,
      "step": 70
    },
    {
      "epoch": 1.1544715447154472,
      "grad_norm": 1.8295345259876294,
      "learning_rate": 8.59563996332688e-06,
      "loss": 1.9097,
      "step": 71
    },
    {
      "epoch": 1.170731707317073,
      "grad_norm": 1.6192771141787508,
      "learning_rate": 8.547638860594765e-06,
      "loss": 1.9404,
      "step": 72
    },
    {
      "epoch": 1.1869918699186992,
      "grad_norm": 1.732769202153044,
      "learning_rate": 8.498970088615861e-06,
      "loss": 1.9072,
      "step": 73
    },
    {
      "epoch": 1.203252032520325,
      "grad_norm": 1.7124937892640733,
      "learning_rate": 8.449642806902623e-06,
      "loss": 1.9361,
      "step": 74
    },
    {
      "epoch": 1.2195121951219512,
      "grad_norm": 1.4632968865855904,
      "learning_rate": 8.399666298899706e-06,
      "loss": 1.9064,
      "step": 75
    },
    {
      "epoch": 1.2357723577235773,
      "grad_norm": 1.57738250567877,
      "learning_rate": 8.349049970236822e-06,
      "loss": 1.8884,
      "step": 76
    },
    {
      "epoch": 1.2520325203252032,
      "grad_norm": 1.6860131175954445,
      "learning_rate": 8.29780334695857e-06,
      "loss": 1.8003,
      "step": 77
    },
    {
      "epoch": 1.2682926829268293,
      "grad_norm": 1.6737455231202842,
      "learning_rate": 8.245936073731654e-06,
      "loss": 1.9188,
      "step": 78
    },
    {
      "epoch": 1.2845528455284554,
      "grad_norm": 1.5244480150455564,
      "learning_rate": 8.193457912029713e-06,
      "loss": 1.9428,
      "step": 79
    },
    {
      "epoch": 1.3008130081300813,
      "grad_norm": 2.3077062591013546,
      "learning_rate": 8.140378738296233e-06,
      "loss": 1.9652,
      "step": 80
    },
    {
      "epoch": 1.3170731707317074,
      "grad_norm": 2.0246897260170433,
      "learning_rate": 8.086708542085769e-06,
      "loss": 1.8873,
      "step": 81
    },
    {
      "epoch": 1.3333333333333333,
      "grad_norm": 1.6727268288560573,
      "learning_rate": 8.032457424183909e-06,
      "loss": 1.8492,
      "step": 82
    },
    {
      "epoch": 1.3495934959349594,
      "grad_norm": 1.5569674481989706,
      "learning_rate": 7.977635594706298e-06,
      "loss": 1.9254,
      "step": 83
    },
    {
      "epoch": 1.3658536585365852,
      "grad_norm": 1.6023097703448475,
      "learning_rate": 7.922253371177081e-06,
      "loss": 1.9188,
      "step": 84
    },
    {
      "epoch": 1.3821138211382114,
      "grad_norm": 1.579712800121583,
      "learning_rate": 7.866321176587129e-06,
      "loss": 1.7658,
      "step": 85
    },
    {
      "epoch": 1.3983739837398375,
      "grad_norm": 1.5575689401980928,
      "learning_rate": 7.809849537432432e-06,
      "loss": 1.8844,
      "step": 86
    },
    {
      "epoch": 1.4146341463414633,
      "grad_norm": 2.0996103231960186,
      "learning_rate": 7.752849081732993e-06,
      "loss": 1.8144,
      "step": 87
    },
    {
      "epoch": 1.4308943089430894,
      "grad_norm": 2.256462070639042,
      "learning_rate": 7.695330537032629e-06,
      "loss": 1.9585,
      "step": 88
    },
    {
      "epoch": 1.4471544715447155,
      "grad_norm": 1.5391039524045877,
      "learning_rate": 7.637304728380036e-06,
      "loss": 1.7978,
      "step": 89
    },
    {
      "epoch": 1.4634146341463414,
      "grad_norm": 1.6288407781749783,
      "learning_rate": 7.578782576291501e-06,
      "loss": 1.8906,
      "step": 90
    },
    {
      "epoch": 1.4796747967479675,
      "grad_norm": 1.4226382530812676,
      "learning_rate": 7.51977509469565e-06,
      "loss": 1.8687,
      "step": 91
    },
    {
      "epoch": 1.4959349593495934,
      "grad_norm": 1.5704917583247462,
      "learning_rate": 7.460293388860616e-06,
      "loss": 2.0512,
      "step": 92
    },
    {
      "epoch": 1.5121951219512195,
      "grad_norm": 1.4787223094009658,
      "learning_rate": 7.400348653304022e-06,
      "loss": 1.9736,
      "step": 93
    },
    {
      "epoch": 1.5284552845528454,
      "grad_norm": 1.4275720991608627,
      "learning_rate": 7.3399521696861505e-06,
      "loss": 1.9977,
      "step": 94
    },
    {
      "epoch": 1.5447154471544715,
      "grad_norm": 1.4098433082701363,
      "learning_rate": 7.2791153046867344e-06,
      "loss": 1.8225,
      "step": 95
    },
    {
      "epoch": 1.5609756097560976,
      "grad_norm": 4.67217439880156,
      "learning_rate": 7.217849507865724e-06,
      "loss": 1.9615,
      "step": 96
    },
    {
      "epoch": 1.5772357723577235,
      "grad_norm": 1.519886113544732,
      "learning_rate": 7.156166309508482e-06,
      "loss": 1.8566,
      "step": 97
    },
    {
      "epoch": 1.5934959349593496,
      "grad_norm": 1.4312222151537652,
      "learning_rate": 7.094077318455762e-06,
      "loss": 1.8472,
      "step": 98
    },
    {
      "epoch": 1.6097560975609757,
      "grad_norm": 1.595194011963869,
      "learning_rate": 7.031594219918916e-06,
      "loss": 1.8246,
      "step": 99
    },
    {
      "epoch": 1.6260162601626016,
      "grad_norm": 1.5531618087372638,
      "learning_rate": 6.96872877328073e-06,
      "loss": 1.9809,
      "step": 100
    },
    {
      "epoch": 1.6422764227642277,
      "grad_norm": 1.4743001623349186,
      "learning_rate": 6.905492809882286e-06,
      "loss": 2.0107,
      "step": 101
    },
    {
      "epoch": 1.6585365853658538,
      "grad_norm": 1.9754475724007412,
      "learning_rate": 6.841898230796302e-06,
      "loss": 1.8143,
      "step": 102
    },
    {
      "epoch": 1.6747967479674797,
      "grad_norm": 2.2741340273630635,
      "learning_rate": 6.777957004587332e-06,
      "loss": 1.7824,
      "step": 103
    },
    {
      "epoch": 1.6910569105691056,
      "grad_norm": 1.6564200369629691,
      "learning_rate": 6.713681165059271e-06,
      "loss": 1.8844,
      "step": 104
    },
    {
      "epoch": 1.7073170731707317,
      "grad_norm": 1.5809400530395674,
      "learning_rate": 6.6490828089905854e-06,
      "loss": 1.8789,
      "step": 105
    },
    {
      "epoch": 1.7235772357723578,
      "grad_norm": 1.4746817698203465,
      "learning_rate": 6.584174093857676e-06,
      "loss": 1.9045,
      "step": 106
    },
    {
      "epoch": 1.7398373983739837,
      "grad_norm": 1.353250857647505,
      "learning_rate": 6.5189672355468415e-06,
      "loss": 1.8118,
      "step": 107
    },
    {
      "epoch": 1.7560975609756098,
      "grad_norm": 1.4850655925014062,
      "learning_rate": 6.453474506055228e-06,
      "loss": 1.8122,
      "step": 108
    },
    {
      "epoch": 1.7723577235772359,
      "grad_norm": 1.8401558539962772,
      "learning_rate": 6.387708231181229e-06,
      "loss": 1.7482,
      "step": 109
    },
    {
      "epoch": 1.7886178861788617,
      "grad_norm": 1.5200607844016307,
      "learning_rate": 6.3216807882047585e-06,
      "loss": 1.9692,
      "step": 110
    },
    {
      "epoch": 1.8048780487804879,
      "grad_norm": 1.5040703158854816,
      "learning_rate": 6.255404603557833e-06,
      "loss": 1.8885,
      "step": 111
    },
    {
      "epoch": 1.821138211382114,
      "grad_norm": 1.5595167294836927,
      "learning_rate": 6.188892150485904e-06,
      "loss": 1.8763,
      "step": 112
    },
    {
      "epoch": 1.8373983739837398,
      "grad_norm": 1.5145771953515301,
      "learning_rate": 6.122155946700381e-06,
      "loss": 2.0202,
      "step": 113
    },
    {
      "epoch": 1.8536585365853657,
      "grad_norm": 1.3682270101469567,
      "learning_rate": 6.0552085520227875e-06,
      "loss": 1.9047,
      "step": 114
    },
    {
      "epoch": 1.8699186991869918,
      "grad_norm": 1.6370392819430508,
      "learning_rate": 5.988062566020987e-06,
      "loss": 1.9071,
      "step": 115
    },
    {
      "epoch": 1.886178861788618,
      "grad_norm": 1.5064391288691565,
      "learning_rate": 5.920730625637934e-06,
      "loss": 1.9622,
      "step": 116
    },
    {
      "epoch": 1.9024390243902438,
      "grad_norm": 1.5117346102674705,
      "learning_rate": 5.853225402813381e-06,
      "loss": 1.8889,
      "step": 117
    },
    {
      "epoch": 1.91869918699187,
      "grad_norm": 1.5330246513699148,
      "learning_rate": 5.785559602099019e-06,
      "loss": 1.8971,
      "step": 118
    },
    {
      "epoch": 1.934959349593496,
      "grad_norm": 1.446396984889757,
      "learning_rate": 5.7177459582674595e-06,
      "loss": 1.8328,
      "step": 119
    },
    {
      "epoch": 1.951219512195122,
      "grad_norm": 1.495856639357874,
      "learning_rate": 5.649797233915539e-06,
      "loss": 1.8684,
      "step": 120
    },
    {
      "epoch": 1.967479674796748,
      "grad_norm": 1.4499233339195112,
      "learning_rate": 5.5817262170623865e-06,
      "loss": 1.8167,
      "step": 121
    },
    {
      "epoch": 1.9837398373983741,
      "grad_norm": 2.7226474462829744,
      "learning_rate": 5.513545718742702e-06,
      "loss": 1.9347,
      "step": 122
    },
    {
      "epoch": 2.0,
      "grad_norm": 1.6284827727818851,
      "learning_rate": 5.4452685705957084e-06,
      "loss": 1.9544,
      "step": 123
    },
    {
      "epoch": 2.0081967213114753,
      "grad_norm": 2.196083883493474,
      "learning_rate": 5.376907622450229e-06,
      "loss": 1.726,
      "step": 124
    },
    {
      "epoch": 2.0245901639344264,
      "grad_norm": 2.5371696562778325,
      "learning_rate": 5.308475739906329e-06,
      "loss": 1.6243,
      "step": 125
    },
    {
      "epoch": 2.040983606557377,
      "grad_norm": 2.268797600455164,
      "learning_rate": 5.2399858019140005e-06,
      "loss": 1.6506,
      "step": 126
    },
    {
      "epoch": 2.057377049180328,
      "grad_norm": 2.189268946289623,
      "learning_rate": 5.171450698349329e-06,
      "loss": 1.5677,
      "step": 127
    },
    {
      "epoch": 2.0737704918032787,
      "grad_norm": 3.330757242919671,
      "learning_rate": 5.102883327588608e-06,
      "loss": 1.5414,
      "step": 128
    },
    {
      "epoch": 2.0901639344262297,
      "grad_norm": 4.975856342017294,
      "learning_rate": 5.034296594080849e-06,
      "loss": 1.5934,
      "step": 129
    },
    {
      "epoch": 2.1065573770491803,
      "grad_norm": 2.6202824821832644,
      "learning_rate": 4.965703405919154e-06,
      "loss": 1.5114,
      "step": 130
    },
    {
      "epoch": 2.122950819672131,
      "grad_norm": 2.145467593529425,
      "learning_rate": 4.897116672411395e-06,
      "loss": 1.6081,
      "step": 131
    },
    {
      "epoch": 2.139344262295082,
      "grad_norm": 2.2918703289964597,
      "learning_rate": 4.828549301650673e-06,
      "loss": 1.6128,
      "step": 132
    },
    {
      "epoch": 2.1557377049180326,
      "grad_norm": 2.2020919561070307,
      "learning_rate": 4.760014198086001e-06,
      "loss": 1.5385,
      "step": 133
    },
    {
      "epoch": 2.1721311475409837,
      "grad_norm": 1.9897624139230765,
      "learning_rate": 4.691524260093672e-06,
      "loss": 1.5731,
      "step": 134
    },
    {
      "epoch": 2.1885245901639343,
      "grad_norm": 2.1542550052135665,
      "learning_rate": 4.623092377549772e-06,
      "loss": 1.671,
      "step": 135
    },
    {
      "epoch": 2.2049180327868854,
      "grad_norm": 2.2842158477910512,
      "learning_rate": 4.554731429404293e-06,
      "loss": 1.614,
      "step": 136
    },
    {
      "epoch": 2.221311475409836,
      "grad_norm": 2.1239054707901452,
      "learning_rate": 4.4864542812573e-06,
      "loss": 1.6136,
      "step": 137
    },
    {
      "epoch": 2.237704918032787,
      "grad_norm": 2.0393208350755008,
      "learning_rate": 4.4182737829376135e-06,
      "loss": 1.6467,
      "step": 138
    },
    {
      "epoch": 2.2540983606557377,
      "grad_norm": 2.0314018092300103,
      "learning_rate": 4.3502027660844606e-06,
      "loss": 1.6771,
      "step": 139
    },
    {
      "epoch": 2.2704918032786887,
      "grad_norm": 2.258947100849125,
      "learning_rate": 4.28225404173254e-06,
      "loss": 1.5915,
      "step": 140
    },
    {
      "epoch": 2.2868852459016393,
      "grad_norm": 2.1050457746344313,
      "learning_rate": 4.214440397900983e-06,
      "loss": 1.6608,
      "step": 141
    },
    {
      "epoch": 2.30327868852459,
      "grad_norm": 1.948114104146267,
      "learning_rate": 4.146774597186622e-06,
      "loss": 1.5369,
      "step": 142
    },
    {
      "epoch": 2.319672131147541,
      "grad_norm": 1.864917840283612,
      "learning_rate": 4.0792693743620695e-06,
      "loss": 1.5717,
      "step": 143
    },
    {
      "epoch": 2.3360655737704916,
      "grad_norm": 1.7461920012860392,
      "learning_rate": 4.011937433979014e-06,
      "loss": 1.5871,
      "step": 144
    },
    {
      "epoch": 2.3524590163934427,
      "grad_norm": 1.798186392967839,
      "learning_rate": 3.944791447977213e-06,
      "loss": 1.5592,
      "step": 145
    },
    {
      "epoch": 2.3688524590163933,
      "grad_norm": 1.840314539800089,
      "learning_rate": 3.87784405329962e-06,
      "loss": 1.5807,
      "step": 146
    },
    {
      "epoch": 2.3852459016393444,
      "grad_norm": 1.9329074085202937,
      "learning_rate": 3.811107849514098e-06,
      "loss": 1.5505,
      "step": 147
    },
    {
      "epoch": 2.401639344262295,
      "grad_norm": 1.652283737337734,
      "learning_rate": 3.744595396442169e-06,
      "loss": 1.5601,
      "step": 148
    },
    {
      "epoch": 2.418032786885246,
      "grad_norm": 1.6942011471416905,
      "learning_rate": 3.6783192117952427e-06,
      "loss": 1.5762,
      "step": 149
    },
    {
      "epoch": 2.4344262295081966,
      "grad_norm": 1.6999827546312305,
      "learning_rate": 3.612291768818772e-06,
      "loss": 1.5286,
      "step": 150
    },
    {
      "epoch": 2.4508196721311477,
      "grad_norm": 1.717480901806302,
      "learning_rate": 3.5465254939447737e-06,
      "loss": 1.6165,
      "step": 151
    },
    {
      "epoch": 2.4672131147540983,
      "grad_norm": 1.7354378842697238,
      "learning_rate": 3.4810327644531606e-06,
      "loss": 1.5636,
      "step": 152
    },
    {
      "epoch": 2.4836065573770494,
      "grad_norm": 2.076605836354899,
      "learning_rate": 3.415825906142326e-06,
      "loss": 1.5426,
      "step": 153
    },
    {
      "epoch": 2.5,
      "grad_norm": 2.807787441701596,
      "learning_rate": 3.3509171910094162e-06,
      "loss": 1.4229,
      "step": 154
    },
    {
      "epoch": 2.5163934426229506,
      "grad_norm": 2.1162841013271,
      "learning_rate": 3.2863188349407293e-06,
      "loss": 1.5828,
      "step": 155
    },
    {
      "epoch": 2.5327868852459017,
      "grad_norm": 2.06404980427501,
      "learning_rate": 3.222042995412669e-06,
      "loss": 1.6511,
      "step": 156
    },
    {
      "epoch": 2.5491803278688527,
      "grad_norm": 1.954408720302879,
      "learning_rate": 3.1581017692036986e-06,
      "loss": 1.5611,
      "step": 157
    },
    {
      "epoch": 2.5655737704918034,
      "grad_norm": 1.8718671260321513,
      "learning_rate": 3.094507190117715e-06,
      "loss": 1.5528,
      "step": 158
    },
    {
      "epoch": 2.581967213114754,
      "grad_norm": 1.8323058517821103,
      "learning_rate": 3.0312712267192713e-06,
      "loss": 1.5525,
      "step": 159
    },
    {
      "epoch": 2.598360655737705,
      "grad_norm": 4.8860562051247385,
      "learning_rate": 2.9684057800810844e-06,
      "loss": 1.5571,
      "step": 160
    },
    {
      "epoch": 2.6147540983606556,
      "grad_norm": 2.1738936906241237,
      "learning_rate": 2.9059226815442386e-06,
      "loss": 1.4133,
      "step": 161
    },
    {
      "epoch": 2.6311475409836067,
      "grad_norm": 2.1707521374782246,
      "learning_rate": 2.8438336904915186e-06,
      "loss": 1.7308,
      "step": 162
    },
    {
      "epoch": 2.6475409836065573,
      "grad_norm": 1.848892940417498,
      "learning_rate": 2.782150492134278e-06,
      "loss": 1.6001,
      "step": 163
    },
    {
      "epoch": 2.663934426229508,
      "grad_norm": 1.7230291366018273,
      "learning_rate": 2.7208846953132685e-06,
      "loss": 1.6081,
      "step": 164
    },
    {
      "epoch": 2.680327868852459,
      "grad_norm": 1.6901177596723493,
      "learning_rate": 2.6600478303138503e-06,
      "loss": 1.4536,
      "step": 165
    },
    {
      "epoch": 2.69672131147541,
      "grad_norm": 1.770734235373982,
      "learning_rate": 2.599651346695979e-06,
      "loss": 1.5516,
      "step": 166
    },
    {
      "epoch": 2.7131147540983607,
      "grad_norm": 1.9713299716871673,
      "learning_rate": 2.539706611139385e-06,
      "loss": 1.6036,
      "step": 167
    },
    {
      "epoch": 2.7295081967213113,
      "grad_norm": 1.944020272636568,
      "learning_rate": 2.4802249053043525e-06,
      "loss": 1.6063,
      "step": 168
    },
    {
      "epoch": 2.7459016393442623,
      "grad_norm": 1.917025051456852,
      "learning_rate": 2.4212174237085007e-06,
      "loss": 1.484,
      "step": 169
    },
    {
      "epoch": 2.762295081967213,
      "grad_norm": 1.7078308287195758,
      "learning_rate": 2.3626952716199647e-06,
      "loss": 1.6206,
      "step": 170
    },
    {
      "epoch": 2.778688524590164,
      "grad_norm": 2.105784906420741,
      "learning_rate": 2.3046694629673715e-06,
      "loss": 1.5728,
      "step": 171
    },
    {
      "epoch": 2.7950819672131146,
      "grad_norm": 1.8395129946614983,
      "learning_rate": 2.247150918267008e-06,
      "loss": 1.5713,
      "step": 172
    },
    {
      "epoch": 2.8114754098360657,
      "grad_norm": 1.6207009651182827,
      "learning_rate": 2.190150462567569e-06,
      "loss": 1.5185,
      "step": 173
    },
    {
      "epoch": 2.8278688524590163,
      "grad_norm": 1.5443883689967923,
      "learning_rate": 2.133678823412873e-06,
      "loss": 1.5119,
      "step": 174
    },
    {
      "epoch": 2.8442622950819674,
      "grad_norm": 1.6761100058671115,
      "learning_rate": 2.077746628822921e-06,
      "loss": 1.5463,
      "step": 175
    },
    {
      "epoch": 2.860655737704918,
      "grad_norm": 1.982740476512917,
      "learning_rate": 2.022364405293703e-06,
      "loss": 1.5463,
      "step": 176
    },
    {
      "epoch": 2.8770491803278686,
      "grad_norm": 1.792200717708596,
      "learning_rate": 1.9675425758160927e-06,
      "loss": 1.6641,
      "step": 177
    },
    {
      "epoch": 2.8934426229508197,
      "grad_norm": 2.129611048939019,
      "learning_rate": 1.913291457914234e-06,
      "loss": 1.6427,
      "step": 178
    },
    {
      "epoch": 2.9098360655737707,
      "grad_norm": 1.703026911107873,
      "learning_rate": 1.8596212617037695e-06,
      "loss": 1.5071,
      "step": 179
    },
    {
      "epoch": 2.9262295081967213,
      "grad_norm": 1.684347363697613,
      "learning_rate": 1.8065420879702888e-06,
      "loss": 1.613,
      "step": 180
    },
    {
      "epoch": 2.942622950819672,
      "grad_norm": 4.094268965277064,
      "learning_rate": 1.754063926268349e-06,
      "loss": 1.562,
      "step": 181
    },
    {
      "epoch": 2.959016393442623,
      "grad_norm": 1.8320884272338482,
      "learning_rate": 1.7021966530414303e-06,
      "loss": 1.5323,
      "step": 182
    },
    {
      "epoch": 2.9754098360655736,
      "grad_norm": 2.1968448353373833,
      "learning_rate": 1.6509500297631786e-06,
      "loss": 1.6683,
      "step": 183
    }
  ],
  "logging_steps": 1,
  "max_steps": 244,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 61,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 3.784065850068173e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}
|