{
"best_metric": 0.9860224723815918,
"best_model_checkpoint": "miner_id_24/checkpoint-50",
"epoch": 3.0127659574468084,
"eval_steps": 50,
"global_step": 177,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01702127659574468,
"grad_norm": 2.3956034183502197,
"learning_rate": 1e-05,
"loss": 1.5021,
"step": 1
},
{
"epoch": 0.01702127659574468,
"eval_loss": 1.6579022407531738,
"eval_runtime": 4.4292,
"eval_samples_per_second": 22.352,
"eval_steps_per_second": 5.644,
"step": 1
},
{
"epoch": 0.03404255319148936,
"grad_norm": 2.8143370151519775,
"learning_rate": 2e-05,
"loss": 1.4469,
"step": 2
},
{
"epoch": 0.05106382978723404,
"grad_norm": 2.780684232711792,
"learning_rate": 3e-05,
"loss": 1.5057,
"step": 3
},
{
"epoch": 0.06808510638297872,
"grad_norm": 2.466012716293335,
"learning_rate": 4e-05,
"loss": 1.5455,
"step": 4
},
{
"epoch": 0.0851063829787234,
"grad_norm": 2.0236899852752686,
"learning_rate": 5e-05,
"loss": 1.3597,
"step": 5
},
{
"epoch": 0.10212765957446808,
"grad_norm": 1.7500122785568237,
"learning_rate": 6e-05,
"loss": 1.3609,
"step": 6
},
{
"epoch": 0.11914893617021277,
"grad_norm": 2.0319266319274902,
"learning_rate": 7e-05,
"loss": 1.325,
"step": 7
},
{
"epoch": 0.13617021276595745,
"grad_norm": 1.9214025735855103,
"learning_rate": 8e-05,
"loss": 1.2035,
"step": 8
},
{
"epoch": 0.15319148936170213,
"grad_norm": 1.8180416822433472,
"learning_rate": 9e-05,
"loss": 1.1698,
"step": 9
},
{
"epoch": 0.1702127659574468,
"grad_norm": 1.722793459892273,
"learning_rate": 0.0001,
"loss": 1.0561,
"step": 10
},
{
"epoch": 0.18723404255319148,
"grad_norm": 1.8683631420135498,
"learning_rate": 9.999115304121457e-05,
"loss": 1.1662,
"step": 11
},
{
"epoch": 0.20425531914893616,
"grad_norm": 2.0223588943481445,
"learning_rate": 9.996461529560553e-05,
"loss": 1.1261,
"step": 12
},
{
"epoch": 0.22127659574468084,
"grad_norm": 1.9186768531799316,
"learning_rate": 9.992039615430648e-05,
"loss": 0.9619,
"step": 13
},
{
"epoch": 0.23829787234042554,
"grad_norm": 2.4184582233428955,
"learning_rate": 9.985851126551428e-05,
"loss": 1.1466,
"step": 14
},
{
"epoch": 0.2553191489361702,
"grad_norm": 1.746878743171692,
"learning_rate": 9.977898252895134e-05,
"loss": 1.0855,
"step": 15
},
{
"epoch": 0.2723404255319149,
"grad_norm": 1.4888535737991333,
"learning_rate": 9.968183808811586e-05,
"loss": 1.0404,
"step": 16
},
{
"epoch": 0.28936170212765955,
"grad_norm": 1.3668509721755981,
"learning_rate": 9.95671123203224e-05,
"loss": 1.1569,
"step": 17
},
{
"epoch": 0.30638297872340425,
"grad_norm": 1.2972848415374756,
"learning_rate": 9.943484582453653e-05,
"loss": 1.0638,
"step": 18
},
{
"epoch": 0.32340425531914896,
"grad_norm": 1.6263564825057983,
"learning_rate": 9.928508540700774e-05,
"loss": 1.1049,
"step": 19
},
{
"epoch": 0.3404255319148936,
"grad_norm": 1.4598687887191772,
"learning_rate": 9.911788406470569e-05,
"loss": 1.0198,
"step": 20
},
{
"epoch": 0.3574468085106383,
"grad_norm": 1.431780457496643,
"learning_rate": 9.893330096656574e-05,
"loss": 1.0425,
"step": 21
},
{
"epoch": 0.37446808510638296,
"grad_norm": 1.275739073753357,
"learning_rate": 9.873140143255036e-05,
"loss": 1.0508,
"step": 22
},
{
"epoch": 0.39148936170212767,
"grad_norm": 1.3290923833847046,
"learning_rate": 9.85122569105338e-05,
"loss": 1.0344,
"step": 23
},
{
"epoch": 0.4085106382978723,
"grad_norm": 1.385582685470581,
"learning_rate": 9.827594495101823e-05,
"loss": 0.9878,
"step": 24
},
{
"epoch": 0.425531914893617,
"grad_norm": 1.3875707387924194,
"learning_rate": 9.802254917969032e-05,
"loss": 1.0025,
"step": 25
},
{
"epoch": 0.4425531914893617,
"grad_norm": 1.3922779560089111,
"learning_rate": 9.775215926782788e-05,
"loss": 0.9845,
"step": 26
},
{
"epoch": 0.4595744680851064,
"grad_norm": 1.487823247909546,
"learning_rate": 9.746487090056713e-05,
"loss": 0.8108,
"step": 27
},
{
"epoch": 0.4765957446808511,
"grad_norm": 2.1065125465393066,
"learning_rate": 9.716078574304189e-05,
"loss": 1.0918,
"step": 28
},
{
"epoch": 0.49361702127659574,
"grad_norm": 1.2851176261901855,
"learning_rate": 9.684001140440639e-05,
"loss": 1.1534,
"step": 29
},
{
"epoch": 0.5106382978723404,
"grad_norm": 1.350062370300293,
"learning_rate": 9.650266139975474e-05,
"loss": 1.0002,
"step": 30
},
{
"epoch": 0.5276595744680851,
"grad_norm": 1.263066291809082,
"learning_rate": 9.614885510995047e-05,
"loss": 1.0785,
"step": 31
},
{
"epoch": 0.5446808510638298,
"grad_norm": 1.1791244745254517,
"learning_rate": 9.577871773938011e-05,
"loss": 1.0402,
"step": 32
},
{
"epoch": 0.5617021276595745,
"grad_norm": 1.1864782571792603,
"learning_rate": 9.539238027164619e-05,
"loss": 1.091,
"step": 33
},
{
"epoch": 0.5787234042553191,
"grad_norm": 1.2298381328582764,
"learning_rate": 9.498997942321483e-05,
"loss": 0.998,
"step": 34
},
{
"epoch": 0.5957446808510638,
"grad_norm": 1.2237919569015503,
"learning_rate": 9.457165759503493e-05,
"loss": 0.9893,
"step": 35
},
{
"epoch": 0.6127659574468085,
"grad_norm": 1.3423452377319336,
"learning_rate": 9.413756282214537e-05,
"loss": 0.955,
"step": 36
},
{
"epoch": 0.6297872340425532,
"grad_norm": 1.3232065439224243,
"learning_rate": 9.368784872128878e-05,
"loss": 1.0207,
"step": 37
},
{
"epoch": 0.6468085106382979,
"grad_norm": 1.2950204610824585,
"learning_rate": 9.322267443654972e-05,
"loss": 0.8997,
"step": 38
},
{
"epoch": 0.6638297872340425,
"grad_norm": 1.3665379285812378,
"learning_rate": 9.274220458303727e-05,
"loss": 0.9627,
"step": 39
},
{
"epoch": 0.6808510638297872,
"grad_norm": 1.5041331052780151,
"learning_rate": 9.224660918863104e-05,
"loss": 0.9353,
"step": 40
},
{
"epoch": 0.6978723404255319,
"grad_norm": 1.484763264656067,
"learning_rate": 9.173606363381219e-05,
"loss": 0.9178,
"step": 41
},
{
"epoch": 0.7148936170212766,
"grad_norm": 2.1842877864837646,
"learning_rate": 9.121074858959997e-05,
"loss": 0.912,
"step": 42
},
{
"epoch": 0.7319148936170212,
"grad_norm": 1.146817922592163,
"learning_rate": 9.067084995361623e-05,
"loss": 1.0537,
"step": 43
},
{
"epoch": 0.7489361702127659,
"grad_norm": 1.1362913846969604,
"learning_rate": 9.011655878430019e-05,
"loss": 0.9867,
"step": 44
},
{
"epoch": 0.7659574468085106,
"grad_norm": 1.1847161054611206,
"learning_rate": 8.954807123329704e-05,
"loss": 1.0172,
"step": 45
},
{
"epoch": 0.7829787234042553,
"grad_norm": 1.1899924278259277,
"learning_rate": 8.896558847604414e-05,
"loss": 1.0202,
"step": 46
},
{
"epoch": 0.8,
"grad_norm": 1.22841477394104,
"learning_rate": 8.836931664057935e-05,
"loss": 1.0795,
"step": 47
},
{
"epoch": 0.8170212765957446,
"grad_norm": 1.1131497621536255,
"learning_rate": 8.775946673459681e-05,
"loss": 0.951,
"step": 48
},
{
"epoch": 0.8340425531914893,
"grad_norm": 1.2665643692016602,
"learning_rate": 8.713625457077585e-05,
"loss": 0.936,
"step": 49
},
{
"epoch": 0.851063829787234,
"grad_norm": 1.2944315671920776,
"learning_rate": 8.649990069040961e-05,
"loss": 1.098,
"step": 50
},
{
"epoch": 0.851063829787234,
"eval_loss": 0.9860224723815918,
"eval_runtime": 4.4613,
"eval_samples_per_second": 22.191,
"eval_steps_per_second": 5.604,
"step": 50
},
{
"epoch": 0.8680851063829788,
"grad_norm": 1.3318464756011963,
"learning_rate": 8.585063028536016e-05,
"loss": 0.9964,
"step": 51
},
{
"epoch": 0.8851063829787233,
"grad_norm": 1.2892343997955322,
"learning_rate": 8.518867311836808e-05,
"loss": 0.8719,
"step": 52
},
{
"epoch": 0.902127659574468,
"grad_norm": 1.4797875881195068,
"learning_rate": 8.451426344174433e-05,
"loss": 1.0485,
"step": 53
},
{
"epoch": 0.9191489361702128,
"grad_norm": 1.351209044456482,
"learning_rate": 8.382763991447344e-05,
"loss": 0.8408,
"step": 54
},
{
"epoch": 0.9361702127659575,
"grad_norm": 1.4917362928390503,
"learning_rate": 8.312904551775731e-05,
"loss": 0.9081,
"step": 55
},
{
"epoch": 0.9531914893617022,
"grad_norm": 1.636767864227295,
"learning_rate": 8.241872746902935e-05,
"loss": 0.7751,
"step": 56
},
{
"epoch": 0.9702127659574468,
"grad_norm": 1.0549850463867188,
"learning_rate": 8.169693713446959e-05,
"loss": 0.9314,
"step": 57
},
{
"epoch": 0.9872340425531915,
"grad_norm": 1.174147367477417,
"learning_rate": 8.096392994005177e-05,
"loss": 0.839,
"step": 58
},
{
"epoch": 1.004255319148936,
"grad_norm": 1.7790963649749756,
"learning_rate": 8.021996528115335e-05,
"loss": 1.0854,
"step": 59
},
{
"epoch": 1.0212765957446808,
"grad_norm": 0.8404403924942017,
"learning_rate": 7.946530643076138e-05,
"loss": 0.791,
"step": 60
},
{
"epoch": 1.0382978723404255,
"grad_norm": 0.9521711468696594,
"learning_rate": 7.870022044630569e-05,
"loss": 0.6483,
"step": 61
},
{
"epoch": 1.0553191489361702,
"grad_norm": 0.9962537288665771,
"learning_rate": 7.792497807515317e-05,
"loss": 0.7106,
"step": 62
},
{
"epoch": 1.0723404255319149,
"grad_norm": 0.9780325293540955,
"learning_rate": 7.713985365879606e-05,
"loss": 0.6272,
"step": 63
},
{
"epoch": 1.0893617021276596,
"grad_norm": 0.9337742328643799,
"learning_rate": 7.63451250357685e-05,
"loss": 0.6399,
"step": 64
},
{
"epoch": 1.1063829787234043,
"grad_norm": 0.9755467176437378,
"learning_rate": 7.55410734433254e-05,
"loss": 0.5694,
"step": 65
},
{
"epoch": 1.123404255319149,
"grad_norm": 1.0322693586349487,
"learning_rate": 7.472798341791877e-05,
"loss": 0.5253,
"step": 66
},
{
"epoch": 1.1404255319148937,
"grad_norm": 1.1534591913223267,
"learning_rate": 7.390614269450634e-05,
"loss": 0.6757,
"step": 67
},
{
"epoch": 1.1574468085106382,
"grad_norm": 1.1756784915924072,
"learning_rate": 7.307584210472844e-05,
"loss": 0.5744,
"step": 68
},
{
"epoch": 1.174468085106383,
"grad_norm": 1.227107048034668,
"learning_rate": 7.223737547398898e-05,
"loss": 0.471,
"step": 69
},
{
"epoch": 1.1914893617021276,
"grad_norm": 1.4421412944793701,
"learning_rate": 7.139103951747695e-05,
"loss": 0.5671,
"step": 70
},
{
"epoch": 1.2085106382978723,
"grad_norm": 1.5038673877716064,
"learning_rate": 7.053713373516538e-05,
"loss": 0.5429,
"step": 71
},
{
"epoch": 1.225531914893617,
"grad_norm": 1.6070609092712402,
"learning_rate": 6.967596030582478e-05,
"loss": 0.3801,
"step": 72
},
{
"epoch": 1.2425531914893617,
"grad_norm": 1.7779772281646729,
"learning_rate": 6.880782398008862e-05,
"loss": 0.6141,
"step": 73
},
{
"epoch": 1.2595744680851064,
"grad_norm": 1.2830954790115356,
"learning_rate": 6.793303197260864e-05,
"loss": 0.667,
"step": 74
},
{
"epoch": 1.2765957446808511,
"grad_norm": 1.3784350156784058,
"learning_rate": 6.70518938533383e-05,
"loss": 0.6144,
"step": 75
},
{
"epoch": 1.2936170212765958,
"grad_norm": 1.772112488746643,
"learning_rate": 6.616472143798261e-05,
"loss": 0.8042,
"step": 76
},
{
"epoch": 1.3106382978723405,
"grad_norm": 1.6115304231643677,
"learning_rate": 6.527182867765332e-05,
"loss": 0.6911,
"step": 77
},
{
"epoch": 1.327659574468085,
"grad_norm": 1.4860713481903076,
"learning_rate": 6.437353154776849e-05,
"loss": 0.668,
"step": 78
},
{
"epoch": 1.3446808510638297,
"grad_norm": 1.3420848846435547,
"learning_rate": 6.347014793623547e-05,
"loss": 0.5545,
"step": 79
},
{
"epoch": 1.3617021276595744,
"grad_norm": 1.3645442724227905,
"learning_rate": 6.256199753095745e-05,
"loss": 0.6041,
"step": 80
},
{
"epoch": 1.3787234042553191,
"grad_norm": 1.2304599285125732,
"learning_rate": 6.164940170670266e-05,
"loss": 0.4798,
"step": 81
},
{
"epoch": 1.3957446808510638,
"grad_norm": 1.1637616157531738,
"learning_rate": 6.0732683411376935e-05,
"loss": 0.4231,
"step": 82
},
{
"epoch": 1.4127659574468086,
"grad_norm": 1.2594172954559326,
"learning_rate": 5.98121670517393e-05,
"loss": 0.4704,
"step": 83
},
{
"epoch": 1.4297872340425533,
"grad_norm": 1.3521147966384888,
"learning_rate": 5.8888178378601565e-05,
"loss": 0.4922,
"step": 84
},
{
"epoch": 1.4468085106382977,
"grad_norm": 1.2671799659729004,
"learning_rate": 5.796104437155213e-05,
"loss": 0.3777,
"step": 85
},
{
"epoch": 1.4638297872340424,
"grad_norm": 1.4365724325180054,
"learning_rate": 5.7031093123244925e-05,
"loss": 0.4116,
"step": 86
},
{
"epoch": 1.4808510638297872,
"grad_norm": 1.3327573537826538,
"learning_rate": 5.6098653723294604e-05,
"loss": 0.4738,
"step": 87
},
{
"epoch": 1.4978723404255319,
"grad_norm": 1.1536500453948975,
"learning_rate": 5.516405614181883e-05,
"loss": 0.6458,
"step": 88
},
{
"epoch": 1.5148936170212766,
"grad_norm": 1.2099850177764893,
"learning_rate": 5.4227631112668955e-05,
"loss": 0.628,
"step": 89
},
{
"epoch": 1.5319148936170213,
"grad_norm": 1.3336706161499023,
"learning_rate": 5.3289710016390535e-05,
"loss": 0.6473,
"step": 90
},
{
"epoch": 1.548936170212766,
"grad_norm": 1.279130220413208,
"learning_rate": 5.2350624762954884e-05,
"loss": 0.6072,
"step": 91
},
{
"epoch": 1.5659574468085107,
"grad_norm": 1.2410911321640015,
"learning_rate": 5.14107076743033e-05,
"loss": 0.5237,
"step": 92
},
{
"epoch": 1.5829787234042554,
"grad_norm": 1.3553860187530518,
"learning_rate": 5.047029136674563e-05,
"loss": 0.6179,
"step": 93
},
{
"epoch": 1.6,
"grad_norm": 1.236146092414856,
"learning_rate": 4.95297086332544e-05,
"loss": 0.4814,
"step": 94
},
{
"epoch": 1.6170212765957448,
"grad_norm": 1.4804574251174927,
"learning_rate": 4.858929232569671e-05,
"loss": 0.5876,
"step": 95
},
{
"epoch": 1.6340425531914895,
"grad_norm": 1.3778893947601318,
"learning_rate": 4.7649375237045135e-05,
"loss": 0.4875,
"step": 96
},
{
"epoch": 1.6510638297872342,
"grad_norm": 1.3673796653747559,
"learning_rate": 4.671028998360947e-05,
"loss": 0.4232,
"step": 97
},
{
"epoch": 1.6680851063829787,
"grad_norm": 1.4746342897415161,
"learning_rate": 4.577236888733105e-05,
"loss": 0.4221,
"step": 98
},
{
"epoch": 1.6851063829787234,
"grad_norm": 1.4111493825912476,
"learning_rate": 4.483594385818118e-05,
"loss": 0.4132,
"step": 99
},
{
"epoch": 1.702127659574468,
"grad_norm": 1.6582450866699219,
"learning_rate": 4.39013462767054e-05,
"loss": 0.3586,
"step": 100
},
{
"epoch": 1.702127659574468,
"eval_loss": 1.0457570552825928,
"eval_runtime": 4.456,
"eval_samples_per_second": 22.217,
"eval_steps_per_second": 5.61,
"step": 100
},
{
"epoch": 1.7191489361702128,
"grad_norm": 1.557518720626831,
"learning_rate": 4.29689068767551e-05,
"loss": 0.4448,
"step": 101
},
{
"epoch": 1.7361702127659573,
"grad_norm": 1.330199122428894,
"learning_rate": 4.203895562844789e-05,
"loss": 0.6906,
"step": 102
},
{
"epoch": 1.753191489361702,
"grad_norm": 1.3650050163269043,
"learning_rate": 4.1111821621398446e-05,
"loss": 0.572,
"step": 103
},
{
"epoch": 1.7702127659574467,
"grad_norm": 1.286249041557312,
"learning_rate": 4.0187832948260705e-05,
"loss": 0.5293,
"step": 104
},
{
"epoch": 1.7872340425531914,
"grad_norm": 1.4000760316848755,
"learning_rate": 3.926731658862307e-05,
"loss": 0.5973,
"step": 105
},
{
"epoch": 1.804255319148936,
"grad_norm": 1.450547456741333,
"learning_rate": 3.835059829329735e-05,
"loss": 0.5751,
"step": 106
},
{
"epoch": 1.8212765957446808,
"grad_norm": 1.4450291395187378,
"learning_rate": 3.7438002469042565e-05,
"loss": 0.49,
"step": 107
},
{
"epoch": 1.8382978723404255,
"grad_norm": 1.4160994291305542,
"learning_rate": 3.6529852063764545e-05,
"loss": 0.5344,
"step": 108
},
{
"epoch": 1.8553191489361702,
"grad_norm": 1.4834502935409546,
"learning_rate": 3.562646845223153e-05,
"loss": 0.5433,
"step": 109
},
{
"epoch": 1.872340425531915,
"grad_norm": 1.3811678886413574,
"learning_rate": 3.4728171322346694e-05,
"loss": 0.4107,
"step": 110
},
{
"epoch": 1.8893617021276596,
"grad_norm": 1.3840794563293457,
"learning_rate": 3.38352785620174e-05,
"loss": 0.4713,
"step": 111
},
{
"epoch": 1.9063829787234043,
"grad_norm": 1.5129518508911133,
"learning_rate": 3.29481061466617e-05,
"loss": 0.4653,
"step": 112
},
{
"epoch": 1.923404255319149,
"grad_norm": 1.569916009902954,
"learning_rate": 3.2066968027391374e-05,
"loss": 0.3947,
"step": 113
},
{
"epoch": 1.9404255319148938,
"grad_norm": 1.7366918325424194,
"learning_rate": 3.119217601991139e-05,
"loss": 0.3459,
"step": 114
},
{
"epoch": 1.9574468085106385,
"grad_norm": 1.6125799417495728,
"learning_rate": 3.0324039694175233e-05,
"loss": 0.3711,
"step": 115
},
{
"epoch": 1.974468085106383,
"grad_norm": 1.3394767045974731,
"learning_rate": 2.946286626483463e-05,
"loss": 0.6046,
"step": 116
},
{
"epoch": 1.9914893617021276,
"grad_norm": 1.6290675401687622,
"learning_rate": 2.8608960482523056e-05,
"loss": 0.5723,
"step": 117
},
{
"epoch": 2.008510638297872,
"grad_norm": 1.4972840547561646,
"learning_rate": 2.7762624526011038e-05,
"loss": 0.5839,
"step": 118
},
{
"epoch": 2.025531914893617,
"grad_norm": 0.92827969789505,
"learning_rate": 2.6924157895271563e-05,
"loss": 0.414,
"step": 119
},
{
"epoch": 2.0425531914893615,
"grad_norm": 0.9881418943405151,
"learning_rate": 2.6093857305493664e-05,
"loss": 0.338,
"step": 120
},
{
"epoch": 2.0595744680851062,
"grad_norm": 1.0140328407287598,
"learning_rate": 2.5272016582081236e-05,
"loss": 0.3692,
"step": 121
},
{
"epoch": 2.076595744680851,
"grad_norm": 1.0715774297714233,
"learning_rate": 2.4458926556674615e-05,
"loss": 0.3516,
"step": 122
},
{
"epoch": 2.0936170212765957,
"grad_norm": 0.9538404941558838,
"learning_rate": 2.3654874964231518e-05,
"loss": 0.2303,
"step": 123
},
{
"epoch": 2.1106382978723404,
"grad_norm": 1.1230480670928955,
"learning_rate": 2.2860146341203937e-05,
"loss": 0.293,
"step": 124
},
{
"epoch": 2.127659574468085,
"grad_norm": 1.1339483261108398,
"learning_rate": 2.207502192484685e-05,
"loss": 0.3407,
"step": 125
},
{
"epoch": 2.1446808510638298,
"grad_norm": 1.0703402757644653,
"learning_rate": 2.1299779553694323e-05,
"loss": 0.2584,
"step": 126
},
{
"epoch": 2.1617021276595745,
"grad_norm": 1.0961896181106567,
"learning_rate": 2.053469356923865e-05,
"loss": 0.2334,
"step": 127
},
{
"epoch": 2.178723404255319,
"grad_norm": 1.2202945947647095,
"learning_rate": 1.978003471884665e-05,
"loss": 0.2705,
"step": 128
},
{
"epoch": 2.195744680851064,
"grad_norm": 1.281968593597412,
"learning_rate": 1.9036070059948252e-05,
"loss": 0.2222,
"step": 129
},
{
"epoch": 2.2127659574468086,
"grad_norm": 1.530350685119629,
"learning_rate": 1.8303062865530406e-05,
"loss": 0.1816,
"step": 130
},
{
"epoch": 2.2297872340425533,
"grad_norm": 1.5826750993728638,
"learning_rate": 1.7581272530970667e-05,
"loss": 0.2032,
"step": 131
},
{
"epoch": 2.246808510638298,
"grad_norm": 1.333558440208435,
"learning_rate": 1.6870954482242707e-05,
"loss": 0.4571,
"step": 132
},
{
"epoch": 2.2638297872340427,
"grad_norm": 1.2566792964935303,
"learning_rate": 1.6172360085526565e-05,
"loss": 0.394,
"step": 133
},
{
"epoch": 2.2808510638297874,
"grad_norm": 1.2111644744873047,
"learning_rate": 1.5485736558255697e-05,
"loss": 0.3292,
"step": 134
},
{
"epoch": 2.297872340425532,
"grad_norm": 1.3459227085113525,
"learning_rate": 1.4811326881631937e-05,
"loss": 0.3263,
"step": 135
},
{
"epoch": 2.3148936170212764,
"grad_norm": 1.3436534404754639,
"learning_rate": 1.4149369714639853e-05,
"loss": 0.3349,
"step": 136
},
{
"epoch": 2.331914893617021,
"grad_norm": 1.4782233238220215,
"learning_rate": 1.3500099309590397e-05,
"loss": 0.3526,
"step": 137
},
{
"epoch": 2.348936170212766,
"grad_norm": 1.3280771970748901,
"learning_rate": 1.2863745429224144e-05,
"loss": 0.2997,
"step": 138
},
{
"epoch": 2.3659574468085105,
"grad_norm": 1.3815573453903198,
"learning_rate": 1.2240533265403198e-05,
"loss": 0.2855,
"step": 139
},
{
"epoch": 2.382978723404255,
"grad_norm": 1.4903278350830078,
"learning_rate": 1.1630683359420652e-05,
"loss": 0.2904,
"step": 140
},
{
"epoch": 2.4,
"grad_norm": 1.3255101442337036,
"learning_rate": 1.103441152395588e-05,
"loss": 0.1762,
"step": 141
},
{
"epoch": 2.4170212765957446,
"grad_norm": 1.3879741430282593,
"learning_rate": 1.0451928766702979e-05,
"loss": 0.2153,
"step": 142
},
{
"epoch": 2.4340425531914893,
"grad_norm": 1.2487590312957764,
"learning_rate": 9.883441215699823e-06,
"loss": 0.1699,
"step": 143
},
{
"epoch": 2.451063829787234,
"grad_norm": 1.142252802848816,
"learning_rate": 9.329150046383772e-06,
"loss": 0.1227,
"step": 144
},
{
"epoch": 2.4680851063829787,
"grad_norm": 1.8175076246261597,
"learning_rate": 8.789251410400023e-06,
"loss": 0.1298,
"step": 145
},
{
"epoch": 2.4851063829787234,
"grad_norm": 1.2175217866897583,
"learning_rate": 8.263936366187824e-06,
"loss": 0.3301,
"step": 146
},
{
"epoch": 2.502127659574468,
"grad_norm": 1.314565896987915,
"learning_rate": 7.753390811368971e-06,
"loss": 0.3832,
"step": 147
},
{
"epoch": 2.519148936170213,
"grad_norm": 1.393023133277893,
"learning_rate": 7.257795416962753e-06,
"loss": 0.331,
"step": 148
},
{
"epoch": 2.5361702127659576,
"grad_norm": 1.4550484418869019,
"learning_rate": 6.777325563450282e-06,
"loss": 0.3105,
"step": 149
},
{
"epoch": 2.5531914893617023,
"grad_norm": 1.555659294128418,
"learning_rate": 6.312151278711237e-06,
"loss": 0.3881,
"step": 150
},
{
"epoch": 2.5531914893617023,
"eval_loss": 1.250982403755188,
"eval_runtime": 4.4645,
"eval_samples_per_second": 22.175,
"eval_steps_per_second": 5.6,
"step": 150
},
{
"epoch": 2.570212765957447,
"grad_norm": 1.4013265371322632,
"learning_rate": 5.86243717785463e-06,
"loss": 0.3459,
"step": 151
},
{
"epoch": 2.5872340425531917,
"grad_norm": 1.444057822227478,
"learning_rate": 5.428342404965076e-06,
"loss": 0.277,
"step": 152
},
{
"epoch": 2.604255319148936,
"grad_norm": 1.328897476196289,
"learning_rate": 5.010020576785174e-06,
"loss": 0.2324,
"step": 153
},
{
"epoch": 2.621276595744681,
"grad_norm": 1.4151270389556885,
"learning_rate": 4.607619728353818e-06,
"loss": 0.2435,
"step": 154
},
{
"epoch": 2.6382978723404253,
"grad_norm": 1.3859100341796875,
"learning_rate": 4.221282260619891e-06,
"loss": 0.2068,
"step": 155
},
{
"epoch": 2.65531914893617,
"grad_norm": 1.1932073831558228,
"learning_rate": 3.851144890049535e-06,
"loss": 0.156,
"step": 156
},
{
"epoch": 2.6723404255319148,
"grad_norm": 1.4522919654846191,
"learning_rate": 3.4973386002452535e-06,
"loss": 0.2544,
"step": 157
},
{
"epoch": 2.6893617021276595,
"grad_norm": 1.2598811388015747,
"learning_rate": 3.159988595593616e-06,
"loss": 0.1501,
"step": 158
},
{
"epoch": 2.706382978723404,
"grad_norm": 1.3632055521011353,
"learning_rate": 2.839214256958106e-06,
"loss": 0.1397,
"step": 159
},
{
"epoch": 2.723404255319149,
"grad_norm": 1.353302240371704,
"learning_rate": 2.53512909943287e-06,
"loss": 0.305,
"step": 160
},
{
"epoch": 2.7404255319148936,
"grad_norm": 1.339839220046997,
"learning_rate": 2.2478407321721296e-06,
"loss": 0.366,
"step": 161
},
{
"epoch": 2.7574468085106383,
"grad_norm": 1.3590214252471924,
"learning_rate": 1.977450820309684e-06,
"loss": 0.3311,
"step": 162
},
{
"epoch": 2.774468085106383,
"grad_norm": 1.4828448295593262,
"learning_rate": 1.7240550489817653e-06,
"loss": 0.3547,
"step": 163
},
{
"epoch": 2.7914893617021277,
"grad_norm": 1.3542424440383911,
"learning_rate": 1.4877430894662036e-06,
"loss": 0.2935,
"step": 164
},
{
"epoch": 2.8085106382978724,
"grad_norm": 1.304952621459961,
"learning_rate": 1.268598567449647e-06,
"loss": 0.2388,
"step": 165
},
{
"epoch": 2.825531914893617,
"grad_norm": 1.453139066696167,
"learning_rate": 1.0666990334342707e-06,
"loss": 0.2971,
"step": 166
},
{
"epoch": 2.842553191489362,
"grad_norm": 1.335195541381836,
"learning_rate": 8.821159352943143e-07,
"loss": 0.2004,
"step": 167
},
{
"epoch": 2.8595744680851065,
"grad_norm": 1.482730746269226,
"learning_rate": 7.149145929922607e-07,
"loss": 0.2955,
"step": 168
},
{
"epoch": 2.876595744680851,
"grad_norm": 1.3346716165542603,
"learning_rate": 5.651541754634726e-07,
"loss": 0.2225,
"step": 169
},
{
"epoch": 2.8936170212765955,
"grad_norm": 1.362627625465393,
"learning_rate": 4.3288767967760715e-07,
"loss": 0.1993,
"step": 170
},
{
"epoch": 2.9106382978723406,
"grad_norm": 1.4417704343795776,
"learning_rate": 3.1816191188415166e-07,
"loss": 0.2222,
"step": 171
},
{
"epoch": 2.927659574468085,
"grad_norm": 1.2695422172546387,
"learning_rate": 2.2101747104866788e-07,
"loss": 0.1648,
"step": 172
},
{
"epoch": 2.94468085106383,
"grad_norm": 1.325673222541809,
"learning_rate": 1.4148873448573408e-07,
"loss": 0.1262,
"step": 173
},
{
"epoch": 2.9617021276595743,
"grad_norm": 1.330980896949768,
"learning_rate": 7.960384569353219e-08,
"loss": 0.2392,
"step": 174
},
{
"epoch": 2.978723404255319,
"grad_norm": 1.4023689031600952,
"learning_rate": 3.538470439448105e-08,
"loss": 0.2823,
"step": 175
},
{
"epoch": 2.9957446808510637,
"grad_norm": 1.465209722518921,
"learning_rate": 8.846958785418968e-09,
"loss": 0.192,
"step": 176
},
{
"epoch": 3.0127659574468084,
"grad_norm": 1.1684350967407227,
"learning_rate": 0.0,
"loss": 0.345,
"step": 177
}
],
"logging_steps": 1,
"max_steps": 177,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 2
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7.334314061384909e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}