{
"best_metric": 0.0018136479193344712,
"best_model_checkpoint": "miner_id_24/checkpoint-200",
"epoch": 0.03170577045022194,
"eval_steps": 50,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0001585288522511097,
"grad_norm": 12.169403076171875,
"learning_rate": 1.013e-05,
"loss": 4.2706,
"step": 1
},
{
"epoch": 0.0001585288522511097,
"eval_loss": 4.04376220703125,
"eval_runtime": 284.7456,
"eval_samples_per_second": 9.328,
"eval_steps_per_second": 2.332,
"step": 1
},
{
"epoch": 0.0003170577045022194,
"grad_norm": 16.00709342956543,
"learning_rate": 2.026e-05,
"loss": 3.9804,
"step": 2
},
{
"epoch": 0.0004755865567533291,
"grad_norm": 15.737310409545898,
"learning_rate": 3.039e-05,
"loss": 3.9673,
"step": 3
},
{
"epoch": 0.0006341154090044388,
"grad_norm": 14.818962097167969,
"learning_rate": 4.052e-05,
"loss": 3.2126,
"step": 4
},
{
"epoch": 0.0007926442612555486,
"grad_norm": 10.428650856018066,
"learning_rate": 5.065e-05,
"loss": 2.0077,
"step": 5
},
{
"epoch": 0.0009511731135066582,
"grad_norm": 12.499900817871094,
"learning_rate": 6.078e-05,
"loss": 1.2136,
"step": 6
},
{
"epoch": 0.0011097019657577679,
"grad_norm": 10.663622856140137,
"learning_rate": 7.091e-05,
"loss": 0.3359,
"step": 7
},
{
"epoch": 0.0012682308180088776,
"grad_norm": 46.17512512207031,
"learning_rate": 8.104e-05,
"loss": 0.412,
"step": 8
},
{
"epoch": 0.0014267596702599874,
"grad_norm": 1.8651701211929321,
"learning_rate": 9.117e-05,
"loss": 0.0131,
"step": 9
},
{
"epoch": 0.0015852885225110971,
"grad_norm": 0.16528020799160004,
"learning_rate": 0.0001013,
"loss": 0.0008,
"step": 10
},
{
"epoch": 0.0017438173747622067,
"grad_norm": 0.012765788473188877,
"learning_rate": 0.00010076684210526316,
"loss": 0.0001,
"step": 11
},
{
"epoch": 0.0019023462270133164,
"grad_norm": 0.013106022030115128,
"learning_rate": 0.0001002336842105263,
"loss": 0.0001,
"step": 12
},
{
"epoch": 0.002060875079264426,
"grad_norm": 0.024374201893806458,
"learning_rate": 9.970052631578946e-05,
"loss": 0.0001,
"step": 13
},
{
"epoch": 0.0022194039315155357,
"grad_norm": 0.021921832114458084,
"learning_rate": 9.916736842105263e-05,
"loss": 0.0001,
"step": 14
},
{
"epoch": 0.0023779327837666455,
"grad_norm": 0.04089897498488426,
"learning_rate": 9.863421052631579e-05,
"loss": 0.0001,
"step": 15
},
{
"epoch": 0.0025364616360177552,
"grad_norm": 0.021697349846363068,
"learning_rate": 9.810105263157895e-05,
"loss": 0.0001,
"step": 16
},
{
"epoch": 0.002694990488268865,
"grad_norm": 0.0019667658489197493,
"learning_rate": 9.756789473684211e-05,
"loss": 0.0,
"step": 17
},
{
"epoch": 0.0028535193405199747,
"grad_norm": 0.0005555606330744922,
"learning_rate": 9.703473684210525e-05,
"loss": 0.0,
"step": 18
},
{
"epoch": 0.0030120481927710845,
"grad_norm": 0.0005540283164009452,
"learning_rate": 9.650157894736842e-05,
"loss": 0.0,
"step": 19
},
{
"epoch": 0.0031705770450221942,
"grad_norm": 0.0006892455858178437,
"learning_rate": 9.596842105263158e-05,
"loss": 0.0,
"step": 20
},
{
"epoch": 0.0033291058972733036,
"grad_norm": 0.0022405856288969517,
"learning_rate": 9.543526315789474e-05,
"loss": 0.0,
"step": 21
},
{
"epoch": 0.0034876347495244133,
"grad_norm": 0.0009618853800930083,
"learning_rate": 9.49021052631579e-05,
"loss": 0.0,
"step": 22
},
{
"epoch": 0.003646163601775523,
"grad_norm": 0.0009560861508361995,
"learning_rate": 9.436894736842105e-05,
"loss": 0.0,
"step": 23
},
{
"epoch": 0.003804692454026633,
"grad_norm": 0.0029522525146603584,
"learning_rate": 9.38357894736842e-05,
"loss": 0.0,
"step": 24
},
{
"epoch": 0.003963221306277742,
"grad_norm": 0.0008082777494564652,
"learning_rate": 9.330263157894737e-05,
"loss": 0.0,
"step": 25
},
{
"epoch": 0.004121750158528852,
"grad_norm": 0.0015600634505972266,
"learning_rate": 9.276947368421051e-05,
"loss": 0.0,
"step": 26
},
{
"epoch": 0.004280279010779962,
"grad_norm": 0.0023957917001098394,
"learning_rate": 9.223631578947369e-05,
"loss": 0.0,
"step": 27
},
{
"epoch": 0.004438807863031071,
"grad_norm": 0.0004076052864547819,
"learning_rate": 9.170315789473684e-05,
"loss": 0.0,
"step": 28
},
{
"epoch": 0.004597336715282181,
"grad_norm": 0.00036426776205189526,
"learning_rate": 9.117e-05,
"loss": 0.0,
"step": 29
},
{
"epoch": 0.004755865567533291,
"grad_norm": 0.0003226393018849194,
"learning_rate": 9.063684210526316e-05,
"loss": 0.0,
"step": 30
},
{
"epoch": 0.004914394419784401,
"grad_norm": 0.0004357333527877927,
"learning_rate": 9.010368421052632e-05,
"loss": 0.0,
"step": 31
},
{
"epoch": 0.0050729232720355105,
"grad_norm": 0.00029580152477137744,
"learning_rate": 8.957052631578946e-05,
"loss": 0.0,
"step": 32
},
{
"epoch": 0.00523145212428662,
"grad_norm": 0.00045763631351292133,
"learning_rate": 8.903736842105263e-05,
"loss": 0.0,
"step": 33
},
{
"epoch": 0.00538998097653773,
"grad_norm": 0.0003399694978725165,
"learning_rate": 8.850421052631579e-05,
"loss": 0.0,
"step": 34
},
{
"epoch": 0.00554850982878884,
"grad_norm": 0.0003934537817258388,
"learning_rate": 8.797105263157895e-05,
"loss": 0.0,
"step": 35
},
{
"epoch": 0.0057070386810399495,
"grad_norm": 0.0005533421644940972,
"learning_rate": 8.743789473684211e-05,
"loss": 0.0,
"step": 36
},
{
"epoch": 0.005865567533291059,
"grad_norm": 0.002100046258419752,
"learning_rate": 8.690473684210526e-05,
"loss": 0.0,
"step": 37
},
{
"epoch": 0.006024096385542169,
"grad_norm": 0.0006316175567917526,
"learning_rate": 8.637157894736842e-05,
"loss": 0.0,
"step": 38
},
{
"epoch": 0.006182625237793279,
"grad_norm": 0.0004144099948462099,
"learning_rate": 8.583842105263158e-05,
"loss": 0.0,
"step": 39
},
{
"epoch": 0.0063411540900443885,
"grad_norm": 0.002568572061136365,
"learning_rate": 8.530526315789472e-05,
"loss": 0.0,
"step": 40
},
{
"epoch": 0.006499682942295497,
"grad_norm": 0.0009425367461517453,
"learning_rate": 8.47721052631579e-05,
"loss": 0.0,
"step": 41
},
{
"epoch": 0.006658211794546607,
"grad_norm": 0.00028256585937924683,
"learning_rate": 8.423894736842105e-05,
"loss": 0.0,
"step": 42
},
{
"epoch": 0.006816740646797717,
"grad_norm": 0.000341152714099735,
"learning_rate": 8.37057894736842e-05,
"loss": 0.0,
"step": 43
},
{
"epoch": 0.006975269499048827,
"grad_norm": 0.00031369487987831235,
"learning_rate": 8.317263157894737e-05,
"loss": 0.0,
"step": 44
},
{
"epoch": 0.007133798351299936,
"grad_norm": 0.00039442809065803885,
"learning_rate": 8.263947368421053e-05,
"loss": 0.0,
"step": 45
},
{
"epoch": 0.007292327203551046,
"grad_norm": 0.00029302932671271265,
"learning_rate": 8.210631578947368e-05,
"loss": 0.0,
"step": 46
},
{
"epoch": 0.007450856055802156,
"grad_norm": 0.0003696536587085575,
"learning_rate": 8.157315789473684e-05,
"loss": 0.0,
"step": 47
},
{
"epoch": 0.007609384908053266,
"grad_norm": 0.00028774861129932106,
"learning_rate": 8.104e-05,
"loss": 0.0,
"step": 48
},
{
"epoch": 0.0077679137603043754,
"grad_norm": 0.00027515244437381625,
"learning_rate": 8.050684210526316e-05,
"loss": 0.0,
"step": 49
},
{
"epoch": 0.007926442612555484,
"grad_norm": 0.000243888032855466,
"learning_rate": 7.997368421052632e-05,
"loss": 0.0,
"step": 50
},
{
"epoch": 0.007926442612555484,
"eval_loss": 0.002942207735031843,
"eval_runtime": 284.8103,
"eval_samples_per_second": 9.326,
"eval_steps_per_second": 2.331,
"step": 50
},
{
"epoch": 0.008084971464806594,
"grad_norm": 9.338234901428223,
"learning_rate": 7.944052631578947e-05,
"loss": 0.3261,
"step": 51
},
{
"epoch": 0.008243500317057704,
"grad_norm": 6.803124415455386e-05,
"learning_rate": 7.890736842105263e-05,
"loss": 0.0,
"step": 52
},
{
"epoch": 0.008402029169308814,
"grad_norm": 0.00013048792607150972,
"learning_rate": 7.837421052631579e-05,
"loss": 0.0,
"step": 53
},
{
"epoch": 0.008560558021559923,
"grad_norm": 0.0003305167774669826,
"learning_rate": 7.784105263157893e-05,
"loss": 0.0,
"step": 54
},
{
"epoch": 0.008719086873811033,
"grad_norm": 0.0002505861921235919,
"learning_rate": 7.730789473684211e-05,
"loss": 0.0,
"step": 55
},
{
"epoch": 0.008877615726062143,
"grad_norm": 0.0005144188180565834,
"learning_rate": 7.677473684210526e-05,
"loss": 0.0,
"step": 56
},
{
"epoch": 0.009036144578313253,
"grad_norm": 0.0003912892425432801,
"learning_rate": 7.624157894736842e-05,
"loss": 0.0,
"step": 57
},
{
"epoch": 0.009194673430564362,
"grad_norm": 0.0003942723269574344,
"learning_rate": 7.570842105263158e-05,
"loss": 0.0,
"step": 58
},
{
"epoch": 0.009353202282815472,
"grad_norm": 0.0006460827426053584,
"learning_rate": 7.517526315789474e-05,
"loss": 0.0,
"step": 59
},
{
"epoch": 0.009511731135066582,
"grad_norm": 0.0014212304959073663,
"learning_rate": 7.464210526315789e-05,
"loss": 0.0,
"step": 60
},
{
"epoch": 0.009670259987317692,
"grad_norm": 0.0020509830210357904,
"learning_rate": 7.410894736842106e-05,
"loss": 0.0,
"step": 61
},
{
"epoch": 0.009828788839568801,
"grad_norm": 0.003535072784870863,
"learning_rate": 7.35757894736842e-05,
"loss": 0.0,
"step": 62
},
{
"epoch": 0.009987317691819911,
"grad_norm": 0.0031283413991332054,
"learning_rate": 7.304263157894737e-05,
"loss": 0.0,
"step": 63
},
{
"epoch": 0.010145846544071021,
"grad_norm": 0.004257969092577696,
"learning_rate": 7.250947368421053e-05,
"loss": 0.0,
"step": 64
},
{
"epoch": 0.01030437539632213,
"grad_norm": 0.0008224630146287382,
"learning_rate": 7.197631578947368e-05,
"loss": 0.0,
"step": 65
},
{
"epoch": 0.01046290424857324,
"grad_norm": 0.00048312422586604953,
"learning_rate": 7.144315789473684e-05,
"loss": 0.0,
"step": 66
},
{
"epoch": 0.01062143310082435,
"grad_norm": 0.0006397454999387264,
"learning_rate": 7.091e-05,
"loss": 0.0,
"step": 67
},
{
"epoch": 0.01077996195307546,
"grad_norm": 0.0005098761757835746,
"learning_rate": 7.037684210526316e-05,
"loss": 0.0,
"step": 68
},
{
"epoch": 0.01093849080532657,
"grad_norm": 0.0004371613613329828,
"learning_rate": 6.984368421052632e-05,
"loss": 0.0,
"step": 69
},
{
"epoch": 0.01109701965757768,
"grad_norm": 0.0003893864050041884,
"learning_rate": 6.931052631578947e-05,
"loss": 0.0,
"step": 70
},
{
"epoch": 0.01125554850982879,
"grad_norm": 0.000385530962375924,
"learning_rate": 6.877736842105263e-05,
"loss": 0.0,
"step": 71
},
{
"epoch": 0.011414077362079899,
"grad_norm": 0.0004044200468342751,
"learning_rate": 6.824421052631579e-05,
"loss": 0.0,
"step": 72
},
{
"epoch": 0.011572606214331009,
"grad_norm": 0.0003012254892382771,
"learning_rate": 6.771105263157895e-05,
"loss": 0.0,
"step": 73
},
{
"epoch": 0.011731135066582118,
"grad_norm": 0.0002749539853539318,
"learning_rate": 6.71778947368421e-05,
"loss": 0.0,
"step": 74
},
{
"epoch": 0.011889663918833228,
"grad_norm": 0.00024228697293438017,
"learning_rate": 6.664473684210527e-05,
"loss": 0.0,
"step": 75
},
{
"epoch": 0.012048192771084338,
"grad_norm": 0.00025467583327554166,
"learning_rate": 6.611157894736842e-05,
"loss": 0.0,
"step": 76
},
{
"epoch": 0.012206721623335448,
"grad_norm": 0.00024287324049510062,
"learning_rate": 6.557842105263158e-05,
"loss": 0.0,
"step": 77
},
{
"epoch": 0.012365250475586557,
"grad_norm": 0.0002723891520872712,
"learning_rate": 6.504526315789474e-05,
"loss": 0.0,
"step": 78
},
{
"epoch": 0.012523779327837667,
"grad_norm": 0.0002199763839598745,
"learning_rate": 6.451210526315789e-05,
"loss": 0.0,
"step": 79
},
{
"epoch": 0.012682308180088777,
"grad_norm": 0.0002384045801591128,
"learning_rate": 6.397894736842105e-05,
"loss": 0.0,
"step": 80
},
{
"epoch": 0.012840837032339885,
"grad_norm": 0.00017316907178610563,
"learning_rate": 6.344578947368421e-05,
"loss": 0.0,
"step": 81
},
{
"epoch": 0.012999365884590995,
"grad_norm": 0.00017045978165697306,
"learning_rate": 6.291263157894737e-05,
"loss": 0.0,
"step": 82
},
{
"epoch": 0.013157894736842105,
"grad_norm": 0.00020900469098705798,
"learning_rate": 6.237947368421053e-05,
"loss": 0.0,
"step": 83
},
{
"epoch": 0.013316423589093214,
"grad_norm": 0.00019530224381014705,
"learning_rate": 6.184631578947368e-05,
"loss": 0.0,
"step": 84
},
{
"epoch": 0.013474952441344324,
"grad_norm": 0.00019037234596908092,
"learning_rate": 6.131315789473684e-05,
"loss": 0.0,
"step": 85
},
{
"epoch": 0.013633481293595434,
"grad_norm": 0.00016165118722710758,
"learning_rate": 6.078e-05,
"loss": 0.0,
"step": 86
},
{
"epoch": 0.013792010145846544,
"grad_norm": 0.00014685910718981177,
"learning_rate": 6.024684210526315e-05,
"loss": 0.0,
"step": 87
},
{
"epoch": 0.013950538998097653,
"grad_norm": 0.00014423737593460828,
"learning_rate": 5.9713684210526305e-05,
"loss": 0.0,
"step": 88
},
{
"epoch": 0.014109067850348763,
"grad_norm": 0.00018514647672418505,
"learning_rate": 5.918052631578947e-05,
"loss": 0.0,
"step": 89
},
{
"epoch": 0.014267596702599873,
"grad_norm": 0.00020832290465477854,
"learning_rate": 5.8647368421052634e-05,
"loss": 0.0,
"step": 90
},
{
"epoch": 0.014426125554850983,
"grad_norm": 0.0001367575750919059,
"learning_rate": 5.811421052631579e-05,
"loss": 0.0,
"step": 91
},
{
"epoch": 0.014584654407102092,
"grad_norm": 0.00012530827370937914,
"learning_rate": 5.758105263157894e-05,
"loss": 0.0,
"step": 92
},
{
"epoch": 0.014743183259353202,
"grad_norm": 0.0001300648000324145,
"learning_rate": 5.70478947368421e-05,
"loss": 0.0,
"step": 93
},
{
"epoch": 0.014901712111604312,
"grad_norm": 0.00019942222570534796,
"learning_rate": 5.6514736842105256e-05,
"loss": 0.0,
"step": 94
},
{
"epoch": 0.015060240963855422,
"grad_norm": 0.00024527753703296185,
"learning_rate": 5.5981578947368424e-05,
"loss": 0.0,
"step": 95
},
{
"epoch": 0.015218769816106531,
"grad_norm": 0.00016720422718208283,
"learning_rate": 5.544842105263158e-05,
"loss": 0.0,
"step": 96
},
{
"epoch": 0.015377298668357641,
"grad_norm": 0.00019603196415118873,
"learning_rate": 5.491526315789474e-05,
"loss": 0.0,
"step": 97
},
{
"epoch": 0.015535827520608751,
"grad_norm": 0.0002464319404680282,
"learning_rate": 5.438210526315789e-05,
"loss": 0.0,
"step": 98
},
{
"epoch": 0.01569435637285986,
"grad_norm": 0.00019614743359852582,
"learning_rate": 5.384894736842105e-05,
"loss": 0.0,
"step": 99
},
{
"epoch": 0.01585288522511097,
"grad_norm": 0.00017342373030260205,
"learning_rate": 5.331578947368421e-05,
"loss": 0.0,
"step": 100
},
{
"epoch": 0.01585288522511097,
"eval_loss": 0.0025951999705284834,
"eval_runtime": 284.9449,
"eval_samples_per_second": 9.321,
"eval_steps_per_second": 2.33,
"step": 100
},
{
"epoch": 0.01601141407736208,
"grad_norm": 0.0001433816651115194,
"learning_rate": 5.278263157894736e-05,
"loss": 0.0,
"step": 101
},
{
"epoch": 0.016169942929613188,
"grad_norm": 0.00011838250065920874,
"learning_rate": 5.224947368421053e-05,
"loss": 0.0,
"step": 102
},
{
"epoch": 0.0163284717818643,
"grad_norm": 0.00010393361299065873,
"learning_rate": 5.171631578947368e-05,
"loss": 0.0,
"step": 103
},
{
"epoch": 0.016487000634115408,
"grad_norm": 9.349365427624434e-05,
"learning_rate": 5.1183157894736844e-05,
"loss": 0.0,
"step": 104
},
{
"epoch": 0.01664552948636652,
"grad_norm": 9.53435810515657e-05,
"learning_rate": 5.065e-05,
"loss": 0.0,
"step": 105
},
{
"epoch": 0.016804058338617627,
"grad_norm": 8.479709504172206e-05,
"learning_rate": 5.011684210526315e-05,
"loss": 0.0,
"step": 106
},
{
"epoch": 0.01696258719086874,
"grad_norm": 7.610375905642286e-05,
"learning_rate": 4.958368421052631e-05,
"loss": 0.0,
"step": 107
},
{
"epoch": 0.017121116043119847,
"grad_norm": 7.144361006794497e-05,
"learning_rate": 4.9050526315789473e-05,
"loss": 0.0,
"step": 108
},
{
"epoch": 0.017279644895370958,
"grad_norm": 6.864387250971049e-05,
"learning_rate": 4.851736842105263e-05,
"loss": 0.0,
"step": 109
},
{
"epoch": 0.017438173747622066,
"grad_norm": 7.527913840021938e-05,
"learning_rate": 4.798421052631579e-05,
"loss": 0.0,
"step": 110
},
{
"epoch": 0.017596702599873178,
"grad_norm": 7.126829586923122e-05,
"learning_rate": 4.745105263157895e-05,
"loss": 0.0,
"step": 111
},
{
"epoch": 0.017755231452124286,
"grad_norm": 8.467756561003625e-05,
"learning_rate": 4.69178947368421e-05,
"loss": 0.0,
"step": 112
},
{
"epoch": 0.017913760304375397,
"grad_norm": 7.283923332579434e-05,
"learning_rate": 4.638473684210526e-05,
"loss": 0.0,
"step": 113
},
{
"epoch": 0.018072289156626505,
"grad_norm": 7.065037789288908e-05,
"learning_rate": 4.585157894736842e-05,
"loss": 0.0,
"step": 114
},
{
"epoch": 0.018230818008877617,
"grad_norm": 6.294441118370742e-05,
"learning_rate": 4.531842105263158e-05,
"loss": 0.0,
"step": 115
},
{
"epoch": 0.018389346861128725,
"grad_norm": 6.32048977422528e-05,
"learning_rate": 4.478526315789473e-05,
"loss": 0.0,
"step": 116
},
{
"epoch": 0.018547875713379836,
"grad_norm": 6.300484528765082e-05,
"learning_rate": 4.425210526315789e-05,
"loss": 0.0,
"step": 117
},
{
"epoch": 0.018706404565630944,
"grad_norm": 6.188850966282189e-05,
"learning_rate": 4.3718947368421054e-05,
"loss": 0.0,
"step": 118
},
{
"epoch": 0.018864933417882056,
"grad_norm": 6.136750744190067e-05,
"learning_rate": 4.318578947368421e-05,
"loss": 0.0,
"step": 119
},
{
"epoch": 0.019023462270133164,
"grad_norm": 5.3558338549919426e-05,
"learning_rate": 4.265263157894736e-05,
"loss": 0.0,
"step": 120
},
{
"epoch": 0.019181991122384275,
"grad_norm": 5.407687422120944e-05,
"learning_rate": 4.211947368421052e-05,
"loss": 0.0,
"step": 121
},
{
"epoch": 0.019340519974635383,
"grad_norm": 6.248530553421006e-05,
"learning_rate": 4.1586315789473684e-05,
"loss": 0.0,
"step": 122
},
{
"epoch": 0.019499048826886495,
"grad_norm": 5.69553958484903e-05,
"learning_rate": 4.105315789473684e-05,
"loss": 0.0,
"step": 123
},
{
"epoch": 0.019657577679137603,
"grad_norm": 5.165593756828457e-05,
"learning_rate": 4.052e-05,
"loss": 0.0,
"step": 124
},
{
"epoch": 0.019816106531388714,
"grad_norm": 6.129803659860045e-05,
"learning_rate": 3.998684210526316e-05,
"loss": 0.0,
"step": 125
},
{
"epoch": 0.019974635383639822,
"grad_norm": 5.8346176956547424e-05,
"learning_rate": 3.945368421052631e-05,
"loss": 0.0,
"step": 126
},
{
"epoch": 0.020133164235890934,
"grad_norm": 5.828439680044539e-05,
"learning_rate": 3.892052631578947e-05,
"loss": 0.0,
"step": 127
},
{
"epoch": 0.020291693088142042,
"grad_norm": 6.158412725199014e-05,
"learning_rate": 3.838736842105263e-05,
"loss": 0.0,
"step": 128
},
{
"epoch": 0.02045022194039315,
"grad_norm": 5.588992280536331e-05,
"learning_rate": 3.785421052631579e-05,
"loss": 0.0,
"step": 129
},
{
"epoch": 0.02060875079264426,
"grad_norm": 4.7499852371402085e-05,
"learning_rate": 3.732105263157894e-05,
"loss": 0.0,
"step": 130
},
{
"epoch": 0.02076727964489537,
"grad_norm": 5.2001607400598004e-05,
"learning_rate": 3.67878947368421e-05,
"loss": 0.0,
"step": 131
},
{
"epoch": 0.02092580849714648,
"grad_norm": 5.1708582759601995e-05,
"learning_rate": 3.6254736842105264e-05,
"loss": 0.0,
"step": 132
},
{
"epoch": 0.02108433734939759,
"grad_norm": 4.733254172606394e-05,
"learning_rate": 3.572157894736842e-05,
"loss": 0.0,
"step": 133
},
{
"epoch": 0.0212428662016487,
"grad_norm": 5.3371717513073236e-05,
"learning_rate": 3.518842105263158e-05,
"loss": 0.0,
"step": 134
},
{
"epoch": 0.02140139505389981,
"grad_norm": 4.716423063655384e-05,
"learning_rate": 3.465526315789473e-05,
"loss": 0.0,
"step": 135
},
{
"epoch": 0.02155992390615092,
"grad_norm": 4.844993964070454e-05,
"learning_rate": 3.4122105263157894e-05,
"loss": 0.0,
"step": 136
},
{
"epoch": 0.021718452758402028,
"grad_norm": 5.031727778259665e-05,
"learning_rate": 3.358894736842105e-05,
"loss": 0.0,
"step": 137
},
{
"epoch": 0.02187698161065314,
"grad_norm": 4.5576842239825055e-05,
"learning_rate": 3.305578947368421e-05,
"loss": 0.0,
"step": 138
},
{
"epoch": 0.022035510462904247,
"grad_norm": 5.910011168452911e-05,
"learning_rate": 3.252263157894737e-05,
"loss": 0.0,
"step": 139
},
{
"epoch": 0.02219403931515536,
"grad_norm": 4.8003654228523374e-05,
"learning_rate": 3.198947368421052e-05,
"loss": 0.0,
"step": 140
},
{
"epoch": 0.022352568167406467,
"grad_norm": 4.5099386625224724e-05,
"learning_rate": 3.1456315789473684e-05,
"loss": 0.0,
"step": 141
},
{
"epoch": 0.02251109701965758,
"grad_norm": 5.6082306400639936e-05,
"learning_rate": 3.092315789473684e-05,
"loss": 0.0,
"step": 142
},
{
"epoch": 0.022669625871908686,
"grad_norm": 4.4970944145461544e-05,
"learning_rate": 3.039e-05,
"loss": 0.0,
"step": 143
},
{
"epoch": 0.022828154724159798,
"grad_norm": 5.0454917072784156e-05,
"learning_rate": 2.9856842105263153e-05,
"loss": 0.0,
"step": 144
},
{
"epoch": 0.022986683576410906,
"grad_norm": 8.930585318012163e-05,
"learning_rate": 2.9323684210526317e-05,
"loss": 0.0,
"step": 145
},
{
"epoch": 0.023145212428662017,
"grad_norm": 6.606173701584339e-05,
"learning_rate": 2.879052631578947e-05,
"loss": 0.0,
"step": 146
},
{
"epoch": 0.023303741280913125,
"grad_norm": 7.71412014728412e-05,
"learning_rate": 2.8257368421052628e-05,
"loss": 0.0,
"step": 147
},
{
"epoch": 0.023462270133164237,
"grad_norm": 7.689122867304832e-05,
"learning_rate": 2.772421052631579e-05,
"loss": 0.0,
"step": 148
},
{
"epoch": 0.023620798985415345,
"grad_norm": 7.011953857727349e-05,
"learning_rate": 2.7191052631578946e-05,
"loss": 0.0,
"step": 149
},
{
"epoch": 0.023779327837666456,
"grad_norm": 8.041402179514989e-05,
"learning_rate": 2.6657894736842104e-05,
"loss": 0.0,
"step": 150
},
{
"epoch": 0.023779327837666456,
"eval_loss": 0.002579666208475828,
"eval_runtime": 284.7051,
"eval_samples_per_second": 9.329,
"eval_steps_per_second": 2.332,
"step": 150
},
{
"epoch": 0.023937856689917564,
"grad_norm": 4.936805248260498,
"learning_rate": 2.6124736842105265e-05,
"loss": 0.118,
"step": 151
},
{
"epoch": 0.024096385542168676,
"grad_norm": 8.385646651731804e-05,
"learning_rate": 2.5591578947368422e-05,
"loss": 0.0,
"step": 152
},
{
"epoch": 0.024254914394419784,
"grad_norm": 0.00011956329399254173,
"learning_rate": 2.5058421052631576e-05,
"loss": 0.0,
"step": 153
},
{
"epoch": 0.024413443246670895,
"grad_norm": 0.00013739150017499924,
"learning_rate": 2.4525263157894737e-05,
"loss": 0.0,
"step": 154
},
{
"epoch": 0.024571972098922003,
"grad_norm": 0.00016575689369346946,
"learning_rate": 2.3992105263157894e-05,
"loss": 0.0,
"step": 155
},
{
"epoch": 0.024730500951173115,
"grad_norm": 0.0002567583287600428,
"learning_rate": 2.345894736842105e-05,
"loss": 0.0,
"step": 156
},
{
"epoch": 0.024889029803424223,
"grad_norm": 0.0002865093993023038,
"learning_rate": 2.292578947368421e-05,
"loss": 0.0,
"step": 157
},
{
"epoch": 0.025047558655675334,
"grad_norm": 0.0004684887535404414,
"learning_rate": 2.2392631578947366e-05,
"loss": 0.0,
"step": 158
},
{
"epoch": 0.025206087507926443,
"grad_norm": 0.0003041870950255543,
"learning_rate": 2.1859473684210527e-05,
"loss": 0.0,
"step": 159
},
{
"epoch": 0.025364616360177554,
"grad_norm": 0.0004387758672237396,
"learning_rate": 2.132631578947368e-05,
"loss": 0.0,
"step": 160
},
{
"epoch": 0.025523145212428662,
"grad_norm": 0.00046817571274004877,
"learning_rate": 2.0793157894736842e-05,
"loss": 0.0,
"step": 161
},
{
"epoch": 0.02568167406467977,
"grad_norm": 0.0006510078674182296,
"learning_rate": 2.026e-05,
"loss": 0.0,
"step": 162
},
{
"epoch": 0.02584020291693088,
"grad_norm": 0.00042408722219988704,
"learning_rate": 1.9726842105263157e-05,
"loss": 0.0,
"step": 163
},
{
"epoch": 0.02599873176918199,
"grad_norm": 0.0006571735139004886,
"learning_rate": 1.9193684210526314e-05,
"loss": 0.0,
"step": 164
},
{
"epoch": 0.0261572606214331,
"grad_norm": 0.0005432140314951539,
"learning_rate": 1.866052631578947e-05,
"loss": 0.0,
"step": 165
},
{
"epoch": 0.02631578947368421,
"grad_norm": 0.0005207346403039992,
"learning_rate": 1.8127368421052632e-05,
"loss": 0.0,
"step": 166
},
{
"epoch": 0.02647431832593532,
"grad_norm": 0.0007389848469756544,
"learning_rate": 1.759421052631579e-05,
"loss": 0.0,
"step": 167
},
{
"epoch": 0.02663284717818643,
"grad_norm": 0.0006487572682090104,
"learning_rate": 1.7061052631578947e-05,
"loss": 0.0,
"step": 168
},
{
"epoch": 0.02679137603043754,
"grad_norm": 0.000673374452162534,
"learning_rate": 1.6527894736842104e-05,
"loss": 0.0,
"step": 169
},
{
"epoch": 0.026949904882688648,
"grad_norm": 0.0006955991266295314,
"learning_rate": 1.599473684210526e-05,
"loss": 0.0,
"step": 170
},
{
"epoch": 0.02710843373493976,
"grad_norm": 0.0007812317926436663,
"learning_rate": 1.546157894736842e-05,
"loss": 0.0,
"step": 171
},
{
"epoch": 0.027266962587190868,
"grad_norm": 0.0007272360380738974,
"learning_rate": 1.4928421052631576e-05,
"loss": 0.0,
"step": 172
},
{
"epoch": 0.02742549143944198,
"grad_norm": 0.0007635234505869448,
"learning_rate": 1.4395263157894735e-05,
"loss": 0.0,
"step": 173
},
{
"epoch": 0.027584020291693087,
"grad_norm": 0.0007117385393939912,
"learning_rate": 1.3862105263157895e-05,
"loss": 0.0,
"step": 174
},
{
"epoch": 0.0277425491439442,
"grad_norm": 0.0005687863449566066,
"learning_rate": 1.3328947368421052e-05,
"loss": 0.0,
"step": 175
},
{
"epoch": 0.027901077996195307,
"grad_norm": 0.0005387436249293387,
"learning_rate": 1.2795789473684211e-05,
"loss": 0.0,
"step": 176
},
{
"epoch": 0.028059606848446418,
"grad_norm": 0.0005557397962547839,
"learning_rate": 1.2262631578947368e-05,
"loss": 0.0,
"step": 177
},
{
"epoch": 0.028218135700697526,
"grad_norm": 0.0005531954229809344,
"learning_rate": 1.1729473684210526e-05,
"loss": 0.0,
"step": 178
},
{
"epoch": 0.028376664552948638,
"grad_norm": 0.0010639647953212261,
"learning_rate": 1.1196315789473683e-05,
"loss": 0.0,
"step": 179
},
{
"epoch": 0.028535193405199746,
"grad_norm": 0.0005514100193977356,
"learning_rate": 1.066315789473684e-05,
"loss": 0.0,
"step": 180
},
{
"epoch": 0.028693722257450857,
"grad_norm": 0.000542394642252475,
"learning_rate": 1.013e-05,
"loss": 0.0,
"step": 181
},
{
"epoch": 0.028852251109701965,
"grad_norm": 0.00043924085912294686,
"learning_rate": 9.596842105263157e-06,
"loss": 0.0,
"step": 182
},
{
"epoch": 0.029010779961953077,
"grad_norm": 0.000603148655500263,
"learning_rate": 9.063684210526316e-06,
"loss": 0.0,
"step": 183
},
{
"epoch": 0.029169308814204185,
"grad_norm": 0.00047173965140245855,
"learning_rate": 8.530526315789473e-06,
"loss": 0.0,
"step": 184
},
{
"epoch": 0.029327837666455296,
"grad_norm": 0.0006395349046215415,
"learning_rate": 7.99736842105263e-06,
"loss": 0.0,
"step": 185
},
{
"epoch": 0.029486366518706404,
"grad_norm": 0.0005980796995572746,
"learning_rate": 7.464210526315788e-06,
"loss": 0.0,
"step": 186
},
{
"epoch": 0.029644895370957516,
"grad_norm": 0.0006005927571095526,
"learning_rate": 6.931052631578947e-06,
"loss": 0.0,
"step": 187
},
{
"epoch": 0.029803424223208624,
"grad_norm": 0.00042718948679976165,
"learning_rate": 6.3978947368421055e-06,
"loss": 0.0,
"step": 188
},
{
"epoch": 0.029961953075459735,
"grad_norm": 0.00037346952012740076,
"learning_rate": 5.864736842105263e-06,
"loss": 0.0,
"step": 189
},
{
"epoch": 0.030120481927710843,
"grad_norm": 0.00046352826757356524,
"learning_rate": 5.33157894736842e-06,
"loss": 0.0,
"step": 190
},
{
"epoch": 0.030279010779961955,
"grad_norm": 0.0003845041792374104,
"learning_rate": 4.7984210526315785e-06,
"loss": 0.0,
"step": 191
},
{
"epoch": 0.030437539632213063,
"grad_norm": 0.00042385683627799153,
"learning_rate": 4.265263157894737e-06,
"loss": 0.0,
"step": 192
},
{
"epoch": 0.030596068484464174,
"grad_norm": 0.0003921858442481607,
"learning_rate": 3.732105263157894e-06,
"loss": 0.0,
"step": 193
},
{
"epoch": 0.030754597336715282,
"grad_norm": 0.0005148733034729958,
"learning_rate": 3.1989473684210527e-06,
"loss": 0.0,
"step": 194
},
{
"epoch": 0.03091312618896639,
"grad_norm": 0.0004899385967291892,
"learning_rate": 2.66578947368421e-06,
"loss": 0.0,
"step": 195
},
{
"epoch": 0.031071655041217502,
"grad_norm": 0.0006933041149750352,
"learning_rate": 2.1326315789473684e-06,
"loss": 0.0,
"step": 196
},
{
"epoch": 0.03123018389346861,
"grad_norm": 0.0009382757125422359,
"learning_rate": 1.5994736842105264e-06,
"loss": 0.0,
"step": 197
},
{
"epoch": 0.03138871274571972,
"grad_norm": 0.0007173538906499743,
"learning_rate": 1.0663157894736842e-06,
"loss": 0.0,
"step": 198
},
{
"epoch": 0.03154724159797083,
"grad_norm": 0.0005706042284145951,
"learning_rate": 5.331578947368421e-07,
"loss": 0.0,
"step": 199
},
{
"epoch": 0.03170577045022194,
"grad_norm": 0.0005148272030055523,
"learning_rate": 0.0,
"loss": 0.0,
"step": 200
},
{
"epoch": 0.03170577045022194,
"eval_loss": 0.0018136479193344712,
"eval_runtime": 284.2884,
"eval_samples_per_second": 9.343,
"eval_steps_per_second": 2.336,
"step": 200
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7.162764252413952e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}