{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9754098360655736,
"eval_steps": 500,
"global_step": 183,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.016260162601626018,
"grad_norm": 38.715505104327306,
"learning_rate": 6.666666666666667e-07,
"loss": 2.7935,
"step": 1
},
{
"epoch": 0.032520325203252036,
"grad_norm": 43.92734901516536,
"learning_rate": 1.3333333333333334e-06,
"loss": 2.8618,
"step": 2
},
{
"epoch": 0.04878048780487805,
"grad_norm": 40.50339510154553,
"learning_rate": 2.0000000000000003e-06,
"loss": 2.8164,
"step": 3
},
{
"epoch": 0.06504065040650407,
"grad_norm": 28.51481486426967,
"learning_rate": 2.666666666666667e-06,
"loss": 2.672,
"step": 4
},
{
"epoch": 0.08130081300813008,
"grad_norm": 17.557085161970114,
"learning_rate": 3.3333333333333333e-06,
"loss": 2.526,
"step": 5
},
{
"epoch": 0.0975609756097561,
"grad_norm": 9.833831469561684,
"learning_rate": 4.000000000000001e-06,
"loss": 2.4556,
"step": 6
},
{
"epoch": 0.11382113821138211,
"grad_norm": 5.8460959136595845,
"learning_rate": 4.666666666666667e-06,
"loss": 2.1935,
"step": 7
},
{
"epoch": 0.13008130081300814,
"grad_norm": 5.207230159450673,
"learning_rate": 5.333333333333334e-06,
"loss": 2.3815,
"step": 8
},
{
"epoch": 0.14634146341463414,
"grad_norm": 7.281692741856846,
"learning_rate": 6e-06,
"loss": 2.311,
"step": 9
},
{
"epoch": 0.16260162601626016,
"grad_norm": 4.085861410961766,
"learning_rate": 6.666666666666667e-06,
"loss": 2.3801,
"step": 10
},
{
"epoch": 0.17886178861788618,
"grad_norm": 3.1648108898931473,
"learning_rate": 7.333333333333333e-06,
"loss": 2.2303,
"step": 11
},
{
"epoch": 0.1951219512195122,
"grad_norm": 2.733124694129773,
"learning_rate": 8.000000000000001e-06,
"loss": 2.2489,
"step": 12
},
{
"epoch": 0.21138211382113822,
"grad_norm": 2.601048953766184,
"learning_rate": 8.666666666666668e-06,
"loss": 2.3216,
"step": 13
},
{
"epoch": 0.22764227642276422,
"grad_norm": 2.799670635370881,
"learning_rate": 9.333333333333334e-06,
"loss": 2.3405,
"step": 14
},
{
"epoch": 0.24390243902439024,
"grad_norm": 2.2889145851702035,
"learning_rate": 1e-05,
"loss": 2.2761,
"step": 15
},
{
"epoch": 0.2601626016260163,
"grad_norm": 2.074354640710373,
"learning_rate": 9.999529497453782e-06,
"loss": 2.2498,
"step": 16
},
{
"epoch": 0.2764227642276423,
"grad_norm": 1.8496605844704992,
"learning_rate": 9.998118078364186e-06,
"loss": 2.1694,
"step": 17
},
{
"epoch": 0.2926829268292683,
"grad_norm": 1.657414876254685,
"learning_rate": 9.99576600836172e-06,
"loss": 2.1922,
"step": 18
},
{
"epoch": 0.3089430894308943,
"grad_norm": 1.6874201712764785,
"learning_rate": 9.992473730108354e-06,
"loss": 2.2212,
"step": 19
},
{
"epoch": 0.3252032520325203,
"grad_norm": 1.6218362871051897,
"learning_rate": 9.988241863214212e-06,
"loss": 2.2361,
"step": 20
},
{
"epoch": 0.34146341463414637,
"grad_norm": 1.7125149417249366,
"learning_rate": 9.98307120412095e-06,
"loss": 2.0696,
"step": 21
},
{
"epoch": 0.35772357723577236,
"grad_norm": 1.748991995811329,
"learning_rate": 9.976962725951878e-06,
"loss": 2.3047,
"step": 22
},
{
"epoch": 0.37398373983739835,
"grad_norm": 1.635597001502544,
"learning_rate": 9.969917578328808e-06,
"loss": 2.1672,
"step": 23
},
{
"epoch": 0.3902439024390244,
"grad_norm": 1.5126187316968263,
"learning_rate": 9.961937087155697e-06,
"loss": 2.3152,
"step": 24
},
{
"epoch": 0.4065040650406504,
"grad_norm": 1.5822580849972034,
"learning_rate": 9.953022754369115e-06,
"loss": 2.2176,
"step": 25
},
{
"epoch": 0.42276422764227645,
"grad_norm": 1.5777453977666092,
"learning_rate": 9.943176257655567e-06,
"loss": 2.3213,
"step": 26
},
{
"epoch": 0.43902439024390244,
"grad_norm": 1.4094336427074805,
"learning_rate": 9.932399450135765e-06,
"loss": 2.1472,
"step": 27
},
{
"epoch": 0.45528455284552843,
"grad_norm": 1.4386799640865944,
"learning_rate": 9.920694360015864e-06,
"loss": 2.2464,
"step": 28
},
{
"epoch": 0.4715447154471545,
"grad_norm": 1.5398404589418597,
"learning_rate": 9.908063190205739e-06,
"loss": 2.2179,
"step": 29
},
{
"epoch": 0.4878048780487805,
"grad_norm": 1.4686769996035434,
"learning_rate": 9.894508317904418e-06,
"loss": 2.1055,
"step": 30
},
{
"epoch": 0.5040650406504065,
"grad_norm": 1.5185344171327764,
"learning_rate": 9.880032294152673e-06,
"loss": 2.4138,
"step": 31
},
{
"epoch": 0.5203252032520326,
"grad_norm": 1.4788353106580088,
"learning_rate": 9.864637843352916e-06,
"loss": 2.1975,
"step": 32
},
{
"epoch": 0.5365853658536586,
"grad_norm": 1.5482980039161824,
"learning_rate": 9.848327862756466e-06,
"loss": 2.2368,
"step": 33
},
{
"epoch": 0.5528455284552846,
"grad_norm": 1.5529583533392486,
"learning_rate": 9.831105421918287e-06,
"loss": 2.244,
"step": 34
},
{
"epoch": 0.5691056910569106,
"grad_norm": 1.2347439521124879,
"learning_rate": 9.812973762119282e-06,
"loss": 2.1969,
"step": 35
},
{
"epoch": 0.5853658536585366,
"grad_norm": 1.7342145031832543,
"learning_rate": 9.793936295756292e-06,
"loss": 2.1326,
"step": 36
},
{
"epoch": 0.6016260162601627,
"grad_norm": 1.6027037759927296,
"learning_rate": 9.773996605699876e-06,
"loss": 2.2234,
"step": 37
},
{
"epoch": 0.6178861788617886,
"grad_norm": 1.5818719102225083,
"learning_rate": 9.753158444620013e-06,
"loss": 2.2304,
"step": 38
},
{
"epoch": 0.6341463414634146,
"grad_norm": 1.5993004636102213,
"learning_rate": 9.73142573427984e-06,
"loss": 2.0879,
"step": 39
},
{
"epoch": 0.6504065040650406,
"grad_norm": 1.4999201972757639,
"learning_rate": 9.70880256479758e-06,
"loss": 2.3389,
"step": 40
},
{
"epoch": 0.6666666666666666,
"grad_norm": 1.5972052352215045,
"learning_rate": 9.685293193876766e-06,
"loss": 2.1184,
"step": 41
},
{
"epoch": 0.6829268292682927,
"grad_norm": 1.5799871790968651,
"learning_rate": 9.660902046004954e-06,
"loss": 2.2381,
"step": 42
},
{
"epoch": 0.6991869918699187,
"grad_norm": 1.7655935713426523,
"learning_rate": 9.635633711621014e-06,
"loss": 1.9785,
"step": 43
},
{
"epoch": 0.7154471544715447,
"grad_norm": 1.4455569662489212,
"learning_rate": 9.60949294625121e-06,
"loss": 2.2598,
"step": 44
},
{
"epoch": 0.7317073170731707,
"grad_norm": 1.6438734541523634,
"learning_rate": 9.582484669614212e-06,
"loss": 2.2254,
"step": 45
},
{
"epoch": 0.7479674796747967,
"grad_norm": 1.5826415829816778,
"learning_rate": 9.554613964695189e-06,
"loss": 2.3195,
"step": 46
},
{
"epoch": 0.7642276422764228,
"grad_norm": 1.5129794859111765,
"learning_rate": 9.525886076789195e-06,
"loss": 2.309,
"step": 47
},
{
"epoch": 0.7804878048780488,
"grad_norm": 1.4425514088997948,
"learning_rate": 9.496306412513989e-06,
"loss": 2.1234,
"step": 48
},
{
"epoch": 0.7967479674796748,
"grad_norm": 1.4940088273936272,
"learning_rate": 9.465880538792519e-06,
"loss": 2.2481,
"step": 49
},
{
"epoch": 0.8130081300813008,
"grad_norm": 1.4327286196072573,
"learning_rate": 9.434614181805203e-06,
"loss": 2.18,
"step": 50
},
{
"epoch": 0.8292682926829268,
"grad_norm": 1.431778831080833,
"learning_rate": 9.402513225912273e-06,
"loss": 2.1464,
"step": 51
},
{
"epoch": 0.8455284552845529,
"grad_norm": 1.4448422926978557,
"learning_rate": 9.369583712546322e-06,
"loss": 2.1412,
"step": 52
},
{
"epoch": 0.8617886178861789,
"grad_norm": 1.4679959607329234,
"learning_rate": 9.335831839075303e-06,
"loss": 2.0886,
"step": 53
},
{
"epoch": 0.8780487804878049,
"grad_norm": 1.4297931147920537,
"learning_rate": 9.30126395763618e-06,
"loss": 2.2567,
"step": 54
},
{
"epoch": 0.8943089430894309,
"grad_norm": 1.3468034919950198,
"learning_rate": 9.265886573939448e-06,
"loss": 2.2071,
"step": 55
},
{
"epoch": 0.9105691056910569,
"grad_norm": 1.4936153334385893,
"learning_rate": 9.229706346044749e-06,
"loss": 2.2185,
"step": 56
},
{
"epoch": 0.926829268292683,
"grad_norm": 1.6789184395018877,
"learning_rate": 9.19273008310782e-06,
"loss": 2.2014,
"step": 57
},
{
"epoch": 0.943089430894309,
"grad_norm": 2.328661298153007,
"learning_rate": 9.154964744099006e-06,
"loss": 2.2311,
"step": 58
},
{
"epoch": 0.959349593495935,
"grad_norm": 1.6031494617421091,
"learning_rate": 9.116417436493574e-06,
"loss": 2.2817,
"step": 59
},
{
"epoch": 0.975609756097561,
"grad_norm": 1.378022632096063,
"learning_rate": 9.077095414934076e-06,
"loss": 2.3173,
"step": 60
},
{
"epoch": 0.991869918699187,
"grad_norm": 1.378014833579439,
"learning_rate": 9.037006079865017e-06,
"loss": 2.1865,
"step": 61
},
{
"epoch": 1.008130081300813,
"grad_norm": 1.777721048156252,
"learning_rate": 8.996156976140088e-06,
"loss": 1.9477,
"step": 62
},
{
"epoch": 1.024390243902439,
"grad_norm": 1.9572083586736355,
"learning_rate": 8.95455579160221e-06,
"loss": 1.9606,
"step": 63
},
{
"epoch": 1.040650406504065,
"grad_norm": 1.83840626983557,
"learning_rate": 8.91221035563669e-06,
"loss": 1.9641,
"step": 64
},
{
"epoch": 1.056910569105691,
"grad_norm": 1.7482663924396353,
"learning_rate": 8.869128637697702e-06,
"loss": 1.912,
"step": 65
},
{
"epoch": 1.0731707317073171,
"grad_norm": 2.436293071917533,
"learning_rate": 8.82531874580844e-06,
"loss": 2.0053,
"step": 66
},
{
"epoch": 1.089430894308943,
"grad_norm": 1.7774996740932676,
"learning_rate": 8.780788925035178e-06,
"loss": 1.8775,
"step": 67
},
{
"epoch": 1.1056910569105691,
"grad_norm": 1.5469357572162499,
"learning_rate": 8.735547555935538e-06,
"loss": 1.8116,
"step": 68
},
{
"epoch": 1.1219512195121952,
"grad_norm": 1.9514287859645285,
"learning_rate": 8.689603152981262e-06,
"loss": 1.9458,
"step": 69
},
{
"epoch": 1.1382113821138211,
"grad_norm": 2.16053582553305,
"learning_rate": 8.642964362955781e-06,
"loss": 1.8903,
"step": 70
},
{
"epoch": 1.1544715447154472,
"grad_norm": 1.8295345259876294,
"learning_rate": 8.59563996332688e-06,
"loss": 1.9097,
"step": 71
},
{
"epoch": 1.170731707317073,
"grad_norm": 1.6192771141787508,
"learning_rate": 8.547638860594765e-06,
"loss": 1.9404,
"step": 72
},
{
"epoch": 1.1869918699186992,
"grad_norm": 1.732769202153044,
"learning_rate": 8.498970088615861e-06,
"loss": 1.9072,
"step": 73
},
{
"epoch": 1.203252032520325,
"grad_norm": 1.7124937892640733,
"learning_rate": 8.449642806902623e-06,
"loss": 1.9361,
"step": 74
},
{
"epoch": 1.2195121951219512,
"grad_norm": 1.4632968865855904,
"learning_rate": 8.399666298899706e-06,
"loss": 1.9064,
"step": 75
},
{
"epoch": 1.2357723577235773,
"grad_norm": 1.57738250567877,
"learning_rate": 8.349049970236822e-06,
"loss": 1.8884,
"step": 76
},
{
"epoch": 1.2520325203252032,
"grad_norm": 1.6860131175954445,
"learning_rate": 8.29780334695857e-06,
"loss": 1.8003,
"step": 77
},
{
"epoch": 1.2682926829268293,
"grad_norm": 1.6737455231202842,
"learning_rate": 8.245936073731654e-06,
"loss": 1.9188,
"step": 78
},
{
"epoch": 1.2845528455284554,
"grad_norm": 1.5244480150455564,
"learning_rate": 8.193457912029713e-06,
"loss": 1.9428,
"step": 79
},
{
"epoch": 1.3008130081300813,
"grad_norm": 2.3077062591013546,
"learning_rate": 8.140378738296233e-06,
"loss": 1.9652,
"step": 80
},
{
"epoch": 1.3170731707317074,
"grad_norm": 2.0246897260170433,
"learning_rate": 8.086708542085769e-06,
"loss": 1.8873,
"step": 81
},
{
"epoch": 1.3333333333333333,
"grad_norm": 1.6727268288560573,
"learning_rate": 8.032457424183909e-06,
"loss": 1.8492,
"step": 82
},
{
"epoch": 1.3495934959349594,
"grad_norm": 1.5569674481989706,
"learning_rate": 7.977635594706298e-06,
"loss": 1.9254,
"step": 83
},
{
"epoch": 1.3658536585365852,
"grad_norm": 1.6023097703448475,
"learning_rate": 7.922253371177081e-06,
"loss": 1.9188,
"step": 84
},
{
"epoch": 1.3821138211382114,
"grad_norm": 1.579712800121583,
"learning_rate": 7.866321176587129e-06,
"loss": 1.7658,
"step": 85
},
{
"epoch": 1.3983739837398375,
"grad_norm": 1.5575689401980928,
"learning_rate": 7.809849537432432e-06,
"loss": 1.8844,
"step": 86
},
{
"epoch": 1.4146341463414633,
"grad_norm": 2.0996103231960186,
"learning_rate": 7.752849081732993e-06,
"loss": 1.8144,
"step": 87
},
{
"epoch": 1.4308943089430894,
"grad_norm": 2.256462070639042,
"learning_rate": 7.695330537032629e-06,
"loss": 1.9585,
"step": 88
},
{
"epoch": 1.4471544715447155,
"grad_norm": 1.5391039524045877,
"learning_rate": 7.637304728380036e-06,
"loss": 1.7978,
"step": 89
},
{
"epoch": 1.4634146341463414,
"grad_norm": 1.6288407781749783,
"learning_rate": 7.578782576291501e-06,
"loss": 1.8906,
"step": 90
},
{
"epoch": 1.4796747967479675,
"grad_norm": 1.4226382530812676,
"learning_rate": 7.51977509469565e-06,
"loss": 1.8687,
"step": 91
},
{
"epoch": 1.4959349593495934,
"grad_norm": 1.5704917583247462,
"learning_rate": 7.460293388860616e-06,
"loss": 2.0512,
"step": 92
},
{
"epoch": 1.5121951219512195,
"grad_norm": 1.4787223094009658,
"learning_rate": 7.400348653304022e-06,
"loss": 1.9736,
"step": 93
},
{
"epoch": 1.5284552845528454,
"grad_norm": 1.4275720991608627,
"learning_rate": 7.3399521696861505e-06,
"loss": 1.9977,
"step": 94
},
{
"epoch": 1.5447154471544715,
"grad_norm": 1.4098433082701363,
"learning_rate": 7.2791153046867344e-06,
"loss": 1.8225,
"step": 95
},
{
"epoch": 1.5609756097560976,
"grad_norm": 4.67217439880156,
"learning_rate": 7.217849507865724e-06,
"loss": 1.9615,
"step": 96
},
{
"epoch": 1.5772357723577235,
"grad_norm": 1.519886113544732,
"learning_rate": 7.156166309508482e-06,
"loss": 1.8566,
"step": 97
},
{
"epoch": 1.5934959349593496,
"grad_norm": 1.4312222151537652,
"learning_rate": 7.094077318455762e-06,
"loss": 1.8472,
"step": 98
},
{
"epoch": 1.6097560975609757,
"grad_norm": 1.595194011963869,
"learning_rate": 7.031594219918916e-06,
"loss": 1.8246,
"step": 99
},
{
"epoch": 1.6260162601626016,
"grad_norm": 1.5531618087372638,
"learning_rate": 6.96872877328073e-06,
"loss": 1.9809,
"step": 100
},
{
"epoch": 1.6422764227642277,
"grad_norm": 1.4743001623349186,
"learning_rate": 6.905492809882286e-06,
"loss": 2.0107,
"step": 101
},
{
"epoch": 1.6585365853658538,
"grad_norm": 1.9754475724007412,
"learning_rate": 6.841898230796302e-06,
"loss": 1.8143,
"step": 102
},
{
"epoch": 1.6747967479674797,
"grad_norm": 2.2741340273630635,
"learning_rate": 6.777957004587332e-06,
"loss": 1.7824,
"step": 103
},
{
"epoch": 1.6910569105691056,
"grad_norm": 1.6564200369629691,
"learning_rate": 6.713681165059271e-06,
"loss": 1.8844,
"step": 104
},
{
"epoch": 1.7073170731707317,
"grad_norm": 1.5809400530395674,
"learning_rate": 6.6490828089905854e-06,
"loss": 1.8789,
"step": 105
},
{
"epoch": 1.7235772357723578,
"grad_norm": 1.4746817698203465,
"learning_rate": 6.584174093857676e-06,
"loss": 1.9045,
"step": 106
},
{
"epoch": 1.7398373983739837,
"grad_norm": 1.353250857647505,
"learning_rate": 6.5189672355468415e-06,
"loss": 1.8118,
"step": 107
},
{
"epoch": 1.7560975609756098,
"grad_norm": 1.4850655925014062,
"learning_rate": 6.453474506055228e-06,
"loss": 1.8122,
"step": 108
},
{
"epoch": 1.7723577235772359,
"grad_norm": 1.8401558539962772,
"learning_rate": 6.387708231181229e-06,
"loss": 1.7482,
"step": 109
},
{
"epoch": 1.7886178861788617,
"grad_norm": 1.5200607844016307,
"learning_rate": 6.3216807882047585e-06,
"loss": 1.9692,
"step": 110
},
{
"epoch": 1.8048780487804879,
"grad_norm": 1.5040703158854816,
"learning_rate": 6.255404603557833e-06,
"loss": 1.8885,
"step": 111
},
{
"epoch": 1.821138211382114,
"grad_norm": 1.5595167294836927,
"learning_rate": 6.188892150485904e-06,
"loss": 1.8763,
"step": 112
},
{
"epoch": 1.8373983739837398,
"grad_norm": 1.5145771953515301,
"learning_rate": 6.122155946700381e-06,
"loss": 2.0202,
"step": 113
},
{
"epoch": 1.8536585365853657,
"grad_norm": 1.3682270101469567,
"learning_rate": 6.0552085520227875e-06,
"loss": 1.9047,
"step": 114
},
{
"epoch": 1.8699186991869918,
"grad_norm": 1.6370392819430508,
"learning_rate": 5.988062566020987e-06,
"loss": 1.9071,
"step": 115
},
{
"epoch": 1.886178861788618,
"grad_norm": 1.5064391288691565,
"learning_rate": 5.920730625637934e-06,
"loss": 1.9622,
"step": 116
},
{
"epoch": 1.9024390243902438,
"grad_norm": 1.5117346102674705,
"learning_rate": 5.853225402813381e-06,
"loss": 1.8889,
"step": 117
},
{
"epoch": 1.91869918699187,
"grad_norm": 1.5330246513699148,
"learning_rate": 5.785559602099019e-06,
"loss": 1.8971,
"step": 118
},
{
"epoch": 1.934959349593496,
"grad_norm": 1.446396984889757,
"learning_rate": 5.7177459582674595e-06,
"loss": 1.8328,
"step": 119
},
{
"epoch": 1.951219512195122,
"grad_norm": 1.495856639357874,
"learning_rate": 5.649797233915539e-06,
"loss": 1.8684,
"step": 120
},
{
"epoch": 1.967479674796748,
"grad_norm": 1.4499233339195112,
"learning_rate": 5.5817262170623865e-06,
"loss": 1.8167,
"step": 121
},
{
"epoch": 1.9837398373983741,
"grad_norm": 2.7226474462829744,
"learning_rate": 5.513545718742702e-06,
"loss": 1.9347,
"step": 122
},
{
"epoch": 2.0,
"grad_norm": 1.6284827727818851,
"learning_rate": 5.4452685705957084e-06,
"loss": 1.9544,
"step": 123
},
{
"epoch": 2.0081967213114753,
"grad_norm": 2.196083883493474,
"learning_rate": 5.376907622450229e-06,
"loss": 1.726,
"step": 124
},
{
"epoch": 2.0245901639344264,
"grad_norm": 2.5371696562778325,
"learning_rate": 5.308475739906329e-06,
"loss": 1.6243,
"step": 125
},
{
"epoch": 2.040983606557377,
"grad_norm": 2.268797600455164,
"learning_rate": 5.2399858019140005e-06,
"loss": 1.6506,
"step": 126
},
{
"epoch": 2.057377049180328,
"grad_norm": 2.189268946289623,
"learning_rate": 5.171450698349329e-06,
"loss": 1.5677,
"step": 127
},
{
"epoch": 2.0737704918032787,
"grad_norm": 3.330757242919671,
"learning_rate": 5.102883327588608e-06,
"loss": 1.5414,
"step": 128
},
{
"epoch": 2.0901639344262297,
"grad_norm": 4.975856342017294,
"learning_rate": 5.034296594080849e-06,
"loss": 1.5934,
"step": 129
},
{
"epoch": 2.1065573770491803,
"grad_norm": 2.6202824821832644,
"learning_rate": 4.965703405919154e-06,
"loss": 1.5114,
"step": 130
},
{
"epoch": 2.122950819672131,
"grad_norm": 2.145467593529425,
"learning_rate": 4.897116672411395e-06,
"loss": 1.6081,
"step": 131
},
{
"epoch": 2.139344262295082,
"grad_norm": 2.2918703289964597,
"learning_rate": 4.828549301650673e-06,
"loss": 1.6128,
"step": 132
},
{
"epoch": 2.1557377049180326,
"grad_norm": 2.2020919561070307,
"learning_rate": 4.760014198086001e-06,
"loss": 1.5385,
"step": 133
},
{
"epoch": 2.1721311475409837,
"grad_norm": 1.9897624139230765,
"learning_rate": 4.691524260093672e-06,
"loss": 1.5731,
"step": 134
},
{
"epoch": 2.1885245901639343,
"grad_norm": 2.1542550052135665,
"learning_rate": 4.623092377549772e-06,
"loss": 1.671,
"step": 135
},
{
"epoch": 2.2049180327868854,
"grad_norm": 2.2842158477910512,
"learning_rate": 4.554731429404293e-06,
"loss": 1.614,
"step": 136
},
{
"epoch": 2.221311475409836,
"grad_norm": 2.1239054707901452,
"learning_rate": 4.4864542812573e-06,
"loss": 1.6136,
"step": 137
},
{
"epoch": 2.237704918032787,
"grad_norm": 2.0393208350755008,
"learning_rate": 4.4182737829376135e-06,
"loss": 1.6467,
"step": 138
},
{
"epoch": 2.2540983606557377,
"grad_norm": 2.0314018092300103,
"learning_rate": 4.3502027660844606e-06,
"loss": 1.6771,
"step": 139
},
{
"epoch": 2.2704918032786887,
"grad_norm": 2.258947100849125,
"learning_rate": 4.28225404173254e-06,
"loss": 1.5915,
"step": 140
},
{
"epoch": 2.2868852459016393,
"grad_norm": 2.1050457746344313,
"learning_rate": 4.214440397900983e-06,
"loss": 1.6608,
"step": 141
},
{
"epoch": 2.30327868852459,
"grad_norm": 1.948114104146267,
"learning_rate": 4.146774597186622e-06,
"loss": 1.5369,
"step": 142
},
{
"epoch": 2.319672131147541,
"grad_norm": 1.864917840283612,
"learning_rate": 4.0792693743620695e-06,
"loss": 1.5717,
"step": 143
},
{
"epoch": 2.3360655737704916,
"grad_norm": 1.7461920012860392,
"learning_rate": 4.011937433979014e-06,
"loss": 1.5871,
"step": 144
},
{
"epoch": 2.3524590163934427,
"grad_norm": 1.798186392967839,
"learning_rate": 3.944791447977213e-06,
"loss": 1.5592,
"step": 145
},
{
"epoch": 2.3688524590163933,
"grad_norm": 1.840314539800089,
"learning_rate": 3.87784405329962e-06,
"loss": 1.5807,
"step": 146
},
{
"epoch": 2.3852459016393444,
"grad_norm": 1.9329074085202937,
"learning_rate": 3.811107849514098e-06,
"loss": 1.5505,
"step": 147
},
{
"epoch": 2.401639344262295,
"grad_norm": 1.652283737337734,
"learning_rate": 3.744595396442169e-06,
"loss": 1.5601,
"step": 148
},
{
"epoch": 2.418032786885246,
"grad_norm": 1.6942011471416905,
"learning_rate": 3.6783192117952427e-06,
"loss": 1.5762,
"step": 149
},
{
"epoch": 2.4344262295081966,
"grad_norm": 1.6999827546312305,
"learning_rate": 3.612291768818772e-06,
"loss": 1.5286,
"step": 150
},
{
"epoch": 2.4508196721311477,
"grad_norm": 1.717480901806302,
"learning_rate": 3.5465254939447737e-06,
"loss": 1.6165,
"step": 151
},
{
"epoch": 2.4672131147540983,
"grad_norm": 1.7354378842697238,
"learning_rate": 3.4810327644531606e-06,
"loss": 1.5636,
"step": 152
},
{
"epoch": 2.4836065573770494,
"grad_norm": 2.076605836354899,
"learning_rate": 3.415825906142326e-06,
"loss": 1.5426,
"step": 153
},
{
"epoch": 2.5,
"grad_norm": 2.807787441701596,
"learning_rate": 3.3509171910094162e-06,
"loss": 1.4229,
"step": 154
},
{
"epoch": 2.5163934426229506,
"grad_norm": 2.1162841013271,
"learning_rate": 3.2863188349407293e-06,
"loss": 1.5828,
"step": 155
},
{
"epoch": 2.5327868852459017,
"grad_norm": 2.06404980427501,
"learning_rate": 3.222042995412669e-06,
"loss": 1.6511,
"step": 156
},
{
"epoch": 2.5491803278688527,
"grad_norm": 1.954408720302879,
"learning_rate": 3.1581017692036986e-06,
"loss": 1.5611,
"step": 157
},
{
"epoch": 2.5655737704918034,
"grad_norm": 1.8718671260321513,
"learning_rate": 3.094507190117715e-06,
"loss": 1.5528,
"step": 158
},
{
"epoch": 2.581967213114754,
"grad_norm": 1.8323058517821103,
"learning_rate": 3.0312712267192713e-06,
"loss": 1.5525,
"step": 159
},
{
"epoch": 2.598360655737705,
"grad_norm": 4.8860562051247385,
"learning_rate": 2.9684057800810844e-06,
"loss": 1.5571,
"step": 160
},
{
"epoch": 2.6147540983606556,
"grad_norm": 2.1738936906241237,
"learning_rate": 2.9059226815442386e-06,
"loss": 1.4133,
"step": 161
},
{
"epoch": 2.6311475409836067,
"grad_norm": 2.1707521374782246,
"learning_rate": 2.8438336904915186e-06,
"loss": 1.7308,
"step": 162
},
{
"epoch": 2.6475409836065573,
"grad_norm": 1.848892940417498,
"learning_rate": 2.782150492134278e-06,
"loss": 1.6001,
"step": 163
},
{
"epoch": 2.663934426229508,
"grad_norm": 1.7230291366018273,
"learning_rate": 2.7208846953132685e-06,
"loss": 1.6081,
"step": 164
},
{
"epoch": 2.680327868852459,
"grad_norm": 1.6901177596723493,
"learning_rate": 2.6600478303138503e-06,
"loss": 1.4536,
"step": 165
},
{
"epoch": 2.69672131147541,
"grad_norm": 1.770734235373982,
"learning_rate": 2.599651346695979e-06,
"loss": 1.5516,
"step": 166
},
{
"epoch": 2.7131147540983607,
"grad_norm": 1.9713299716871673,
"learning_rate": 2.539706611139385e-06,
"loss": 1.6036,
"step": 167
},
{
"epoch": 2.7295081967213113,
"grad_norm": 1.944020272636568,
"learning_rate": 2.4802249053043525e-06,
"loss": 1.6063,
"step": 168
},
{
"epoch": 2.7459016393442623,
"grad_norm": 1.917025051456852,
"learning_rate": 2.4212174237085007e-06,
"loss": 1.484,
"step": 169
},
{
"epoch": 2.762295081967213,
"grad_norm": 1.7078308287195758,
"learning_rate": 2.3626952716199647e-06,
"loss": 1.6206,
"step": 170
},
{
"epoch": 2.778688524590164,
"grad_norm": 2.105784906420741,
"learning_rate": 2.3046694629673715e-06,
"loss": 1.5728,
"step": 171
},
{
"epoch": 2.7950819672131146,
"grad_norm": 1.8395129946614983,
"learning_rate": 2.247150918267008e-06,
"loss": 1.5713,
"step": 172
},
{
"epoch": 2.8114754098360657,
"grad_norm": 1.6207009651182827,
"learning_rate": 2.190150462567569e-06,
"loss": 1.5185,
"step": 173
},
{
"epoch": 2.8278688524590163,
"grad_norm": 1.5443883689967923,
"learning_rate": 2.133678823412873e-06,
"loss": 1.5119,
"step": 174
},
{
"epoch": 2.8442622950819674,
"grad_norm": 1.6761100058671115,
"learning_rate": 2.077746628822921e-06,
"loss": 1.5463,
"step": 175
},
{
"epoch": 2.860655737704918,
"grad_norm": 1.982740476512917,
"learning_rate": 2.022364405293703e-06,
"loss": 1.5463,
"step": 176
},
{
"epoch": 2.8770491803278686,
"grad_norm": 1.792200717708596,
"learning_rate": 1.9675425758160927e-06,
"loss": 1.6641,
"step": 177
},
{
"epoch": 2.8934426229508197,
"grad_norm": 2.129611048939019,
"learning_rate": 1.913291457914234e-06,
"loss": 1.6427,
"step": 178
},
{
"epoch": 2.9098360655737707,
"grad_norm": 1.703026911107873,
"learning_rate": 1.8596212617037695e-06,
"loss": 1.5071,
"step": 179
},
{
"epoch": 2.9262295081967213,
"grad_norm": 1.684347363697613,
"learning_rate": 1.8065420879702888e-06,
"loss": 1.613,
"step": 180
},
{
"epoch": 2.942622950819672,
"grad_norm": 4.094268965277064,
"learning_rate": 1.754063926268349e-06,
"loss": 1.562,
"step": 181
},
{
"epoch": 2.959016393442623,
"grad_norm": 1.8320884272338482,
"learning_rate": 1.7021966530414303e-06,
"loss": 1.5323,
"step": 182
},
{
"epoch": 2.9754098360655736,
"grad_norm": 2.1968448353373833,
"learning_rate": 1.6509500297631786e-06,
"loss": 1.6683,
"step": 183
}
],
"logging_steps": 1,
"max_steps": 244,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 61,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.784065850068173e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}