{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.25157232704402516,
"eval_steps": 120,
"global_step": 120,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0020964360587002098,
"grad_norm": 0.6940059065818787,
"learning_rate": 2e-05,
"loss": 1.5374,
"step": 1
},
{
"epoch": 0.0041928721174004195,
"grad_norm": 0.7119749188423157,
"learning_rate": 4e-05,
"loss": 1.4412,
"step": 2
},
{
"epoch": 0.006289308176100629,
"grad_norm": 0.6409623026847839,
"learning_rate": 6e-05,
"loss": 1.3945,
"step": 3
},
{
"epoch": 0.008385744234800839,
"grad_norm": 0.7138105630874634,
"learning_rate": 8e-05,
"loss": 1.4504,
"step": 4
},
{
"epoch": 0.010482180293501049,
"grad_norm": 0.8199965357780457,
"learning_rate": 0.0001,
"loss": 1.6434,
"step": 5
},
{
"epoch": 0.012578616352201259,
"grad_norm": 0.7341210842132568,
"learning_rate": 0.00012,
"loss": 1.3584,
"step": 6
},
{
"epoch": 0.014675052410901468,
"grad_norm": 0.8366342782974243,
"learning_rate": 0.00014,
"loss": 1.6133,
"step": 7
},
{
"epoch": 0.016771488469601678,
"grad_norm": 0.45160987973213196,
"learning_rate": 0.00016,
"loss": 1.5444,
"step": 8
},
{
"epoch": 0.018867924528301886,
"grad_norm": 0.564163863658905,
"learning_rate": 0.00018,
"loss": 1.4406,
"step": 9
},
{
"epoch": 0.020964360587002098,
"grad_norm": 1.0345866680145264,
"learning_rate": 0.0002,
"loss": 1.4192,
"step": 10
},
{
"epoch": 0.023060796645702306,
"grad_norm": 0.7354846596717834,
"learning_rate": 0.0001999977372615812,
"loss": 1.386,
"step": 11
},
{
"epoch": 0.025157232704402517,
"grad_norm": 0.5225378274917603,
"learning_rate": 0.00019999094914872442,
"loss": 1.272,
"step": 12
},
{
"epoch": 0.027253668763102725,
"grad_norm": 0.3142692446708679,
"learning_rate": 0.0001999796359686242,
"loss": 1.3781,
"step": 13
},
{
"epoch": 0.029350104821802937,
"grad_norm": 0.33122241497039795,
"learning_rate": 0.00019996379823325583,
"loss": 1.5188,
"step": 14
},
{
"epoch": 0.031446540880503145,
"grad_norm": 0.2958654761314392,
"learning_rate": 0.0001999434366593524,
"loss": 0.999,
"step": 15
},
{
"epoch": 0.033542976939203356,
"grad_norm": 0.4279610812664032,
"learning_rate": 0.00019991855216837224,
"loss": 1.5178,
"step": 16
},
{
"epoch": 0.03563941299790356,
"grad_norm": 0.36770907044410706,
"learning_rate": 0.00019988914588645715,
"loss": 1.2745,
"step": 17
},
{
"epoch": 0.03773584905660377,
"grad_norm": 0.3166482150554657,
"learning_rate": 0.00019985521914438165,
"loss": 1.5023,
"step": 18
},
{
"epoch": 0.039832285115303984,
"grad_norm": 0.42765095829963684,
"learning_rate": 0.0001998167734774926,
"loss": 1.2504,
"step": 19
},
{
"epoch": 0.041928721174004195,
"grad_norm": 0.392689973115921,
"learning_rate": 0.00019977381062563976,
"loss": 1.2228,
"step": 20
},
{
"epoch": 0.0440251572327044,
"grad_norm": 0.358163982629776,
"learning_rate": 0.000199726332533097,
"loss": 1.2634,
"step": 21
},
{
"epoch": 0.04612159329140461,
"grad_norm": 0.3274112939834595,
"learning_rate": 0.00019967434134847442,
"loss": 1.4746,
"step": 22
},
{
"epoch": 0.04821802935010482,
"grad_norm": 0.3587968945503235,
"learning_rate": 0.00019961783942462104,
"loss": 1.3947,
"step": 23
},
{
"epoch": 0.050314465408805034,
"grad_norm": 0.30727654695510864,
"learning_rate": 0.00019955682931851833,
"loss": 1.4815,
"step": 24
},
{
"epoch": 0.05241090146750524,
"grad_norm": 0.4096279442310333,
"learning_rate": 0.00019949131379116454,
"loss": 1.3225,
"step": 25
},
{
"epoch": 0.05450733752620545,
"grad_norm": 0.36623865365982056,
"learning_rate": 0.00019942129580744966,
"loss": 1.3904,
"step": 26
},
{
"epoch": 0.05660377358490566,
"grad_norm": 0.3568407893180847,
"learning_rate": 0.00019934677853602133,
"loss": 1.463,
"step": 27
},
{
"epoch": 0.05870020964360587,
"grad_norm": 0.4338196814060211,
"learning_rate": 0.0001992677653491414,
"loss": 1.4359,
"step": 28
},
{
"epoch": 0.06079664570230608,
"grad_norm": 0.4408683180809021,
"learning_rate": 0.00019918425982253334,
"loss": 1.8015,
"step": 29
},
{
"epoch": 0.06289308176100629,
"grad_norm": 0.3609876036643982,
"learning_rate": 0.00019909626573522043,
"loss": 1.3589,
"step": 30
},
{
"epoch": 0.0649895178197065,
"grad_norm": 0.43560177087783813,
"learning_rate": 0.0001990037870693547,
"loss": 1.734,
"step": 31
},
{
"epoch": 0.06708595387840671,
"grad_norm": 0.37430861592292786,
"learning_rate": 0.00019890682801003675,
"loss": 1.3517,
"step": 32
},
{
"epoch": 0.06918238993710692,
"grad_norm": 0.4608246386051178,
"learning_rate": 0.00019880539294512637,
"loss": 1.4881,
"step": 33
},
{
"epoch": 0.07127882599580712,
"grad_norm": 0.41597816348075867,
"learning_rate": 0.0001986994864650439,
"loss": 1.2676,
"step": 34
},
{
"epoch": 0.07337526205450734,
"grad_norm": 0.561418354511261,
"learning_rate": 0.00019858911336256257,
"loss": 1.4233,
"step": 35
},
{
"epoch": 0.07547169811320754,
"grad_norm": 0.9351180195808411,
"learning_rate": 0.00019847427863259163,
"loss": 1.2086,
"step": 36
},
{
"epoch": 0.07756813417190776,
"grad_norm": 0.6147457957267761,
"learning_rate": 0.00019835498747195008,
"loss": 1.4909,
"step": 37
},
{
"epoch": 0.07966457023060797,
"grad_norm": 0.4514181315898895,
"learning_rate": 0.00019823124527913185,
"loss": 1.2649,
"step": 38
},
{
"epoch": 0.08176100628930817,
"grad_norm": 0.49401888251304626,
"learning_rate": 0.0001981030576540612,
"loss": 1.5149,
"step": 39
},
{
"epoch": 0.08385744234800839,
"grad_norm": 0.6095734238624573,
"learning_rate": 0.00019797043039783936,
"loss": 1.4917,
"step": 40
},
{
"epoch": 0.0859538784067086,
"grad_norm": 0.42444926500320435,
"learning_rate": 0.0001978333695124821,
"loss": 1.3691,
"step": 41
},
{
"epoch": 0.0880503144654088,
"grad_norm": 0.47243213653564453,
"learning_rate": 0.00019769188120064812,
"loss": 1.7828,
"step": 42
},
{
"epoch": 0.09014675052410902,
"grad_norm": 0.4187338650226593,
"learning_rate": 0.00019754597186535814,
"loss": 1.2147,
"step": 43
},
{
"epoch": 0.09224318658280922,
"grad_norm": 0.4433446228504181,
"learning_rate": 0.0001973956481097053,
"loss": 1.1449,
"step": 44
},
{
"epoch": 0.09433962264150944,
"grad_norm": 0.5269142389297485,
"learning_rate": 0.0001972409167365564,
"loss": 1.4682,
"step": 45
},
{
"epoch": 0.09643605870020965,
"grad_norm": 0.4906723201274872,
"learning_rate": 0.0001970817847482439,
"loss": 1.3701,
"step": 46
},
{
"epoch": 0.09853249475890985,
"grad_norm": 0.5275290608406067,
"learning_rate": 0.000196918259346249,
"loss": 1.3704,
"step": 47
},
{
"epoch": 0.10062893081761007,
"grad_norm": 0.5568628907203674,
"learning_rate": 0.00019675034793087596,
"loss": 1.068,
"step": 48
},
{
"epoch": 0.10272536687631027,
"grad_norm": 0.6039868593215942,
"learning_rate": 0.000196578058100917,
"loss": 1.2204,
"step": 49
},
{
"epoch": 0.10482180293501048,
"grad_norm": 0.9857679605484009,
"learning_rate": 0.0001964013976533084,
"loss": 1.0091,
"step": 50
},
{
"epoch": 0.1069182389937107,
"grad_norm": 0.3437671959400177,
"learning_rate": 0.00019622037458277784,
"loss": 1.2225,
"step": 51
},
{
"epoch": 0.1090146750524109,
"grad_norm": 0.3308734893798828,
"learning_rate": 0.00019603499708148244,
"loss": 1.2099,
"step": 52
},
{
"epoch": 0.1111111111111111,
"grad_norm": 0.353939026594162,
"learning_rate": 0.0001958452735386381,
"loss": 1.2554,
"step": 53
},
{
"epoch": 0.11320754716981132,
"grad_norm": 0.3151988089084625,
"learning_rate": 0.00019565121254013979,
"loss": 1.252,
"step": 54
},
{
"epoch": 0.11530398322851153,
"grad_norm": 0.32421159744262695,
"learning_rate": 0.00019545282286817303,
"loss": 0.9776,
"step": 55
},
{
"epoch": 0.11740041928721175,
"grad_norm": 0.3662404417991638,
"learning_rate": 0.0001952501135008165,
"loss": 1.3977,
"step": 56
},
{
"epoch": 0.11949685534591195,
"grad_norm": 0.3840480148792267,
"learning_rate": 0.00019504309361163566,
"loss": 1.2663,
"step": 57
},
{
"epoch": 0.12159329140461216,
"grad_norm": 0.37903356552124023,
"learning_rate": 0.00019483177256926767,
"loss": 1.5308,
"step": 58
},
{
"epoch": 0.12368972746331237,
"grad_norm": 0.346229612827301,
"learning_rate": 0.0001946161599369973,
"loss": 1.4319,
"step": 59
},
{
"epoch": 0.12578616352201258,
"grad_norm": 0.34781116247177124,
"learning_rate": 0.00019439626547232433,
"loss": 1.1933,
"step": 60
},
{
"epoch": 0.1278825995807128,
"grad_norm": 0.3286825716495514,
"learning_rate": 0.0001941720991265218,
"loss": 1.1038,
"step": 61
},
{
"epoch": 0.129979035639413,
"grad_norm": 0.39212745428085327,
"learning_rate": 0.00019394367104418576,
"loss": 1.2789,
"step": 62
},
{
"epoch": 0.1320754716981132,
"grad_norm": 0.3172178566455841,
"learning_rate": 0.0001937109915627762,
"loss": 1.1614,
"step": 63
},
{
"epoch": 0.13417190775681342,
"grad_norm": 0.371159166097641,
"learning_rate": 0.00019347407121214914,
"loss": 1.3819,
"step": 64
},
{
"epoch": 0.13626834381551362,
"grad_norm": 0.36089271306991577,
"learning_rate": 0.00019323292071408017,
"loss": 1.4392,
"step": 65
},
{
"epoch": 0.13836477987421383,
"grad_norm": 0.42245927453041077,
"learning_rate": 0.00019298755098177926,
"loss": 1.2518,
"step": 66
},
{
"epoch": 0.14046121593291405,
"grad_norm": 0.3602246642112732,
"learning_rate": 0.00019273797311939673,
"loss": 1.3146,
"step": 67
},
{
"epoch": 0.14255765199161424,
"grad_norm": 0.3581138253211975,
"learning_rate": 0.00019248419842152098,
"loss": 1.2622,
"step": 68
},
{
"epoch": 0.14465408805031446,
"grad_norm": 0.391454815864563,
"learning_rate": 0.0001922262383726672,
"loss": 1.4421,
"step": 69
},
{
"epoch": 0.14675052410901468,
"grad_norm": 0.4634746313095093,
"learning_rate": 0.00019196410464675766,
"loss": 1.3862,
"step": 70
},
{
"epoch": 0.1488469601677149,
"grad_norm": 0.35802096128463745,
"learning_rate": 0.00019169780910659333,
"loss": 1.4004,
"step": 71
},
{
"epoch": 0.1509433962264151,
"grad_norm": 0.34099411964416504,
"learning_rate": 0.00019142736380331726,
"loss": 1.2887,
"step": 72
},
{
"epoch": 0.1530398322851153,
"grad_norm": 0.37205106019973755,
"learning_rate": 0.00019115278097586903,
"loss": 1.518,
"step": 73
},
{
"epoch": 0.15513626834381553,
"grad_norm": 0.3985058665275574,
"learning_rate": 0.00019087407305043086,
"loss": 1.3483,
"step": 74
},
{
"epoch": 0.15723270440251572,
"grad_norm": 0.3541426956653595,
"learning_rate": 0.0001905912526398654,
"loss": 1.3036,
"step": 75
},
{
"epoch": 0.15932914046121593,
"grad_norm": 0.44033437967300415,
"learning_rate": 0.00019030433254314474,
"loss": 1.3732,
"step": 76
},
{
"epoch": 0.16142557651991615,
"grad_norm": 0.40152212977409363,
"learning_rate": 0.00019001332574477146,
"loss": 1.479,
"step": 77
},
{
"epoch": 0.16352201257861634,
"grad_norm": 0.46172958612442017,
"learning_rate": 0.00018971824541419083,
"loss": 1.381,
"step": 78
},
{
"epoch": 0.16561844863731656,
"grad_norm": 0.40097662806510925,
"learning_rate": 0.0001894191049051948,
"loss": 1.1499,
"step": 79
},
{
"epoch": 0.16771488469601678,
"grad_norm": 0.49080637097358704,
"learning_rate": 0.0001891159177553179,
"loss": 1.664,
"step": 80
},
{
"epoch": 0.16981132075471697,
"grad_norm": 0.45318862795829773,
"learning_rate": 0.00018880869768522432,
"loss": 1.3287,
"step": 81
},
{
"epoch": 0.1719077568134172,
"grad_norm": 0.4062664210796356,
"learning_rate": 0.00018849745859808717,
"loss": 1.2012,
"step": 82
},
{
"epoch": 0.1740041928721174,
"grad_norm": 0.4371073246002197,
"learning_rate": 0.00018818221457895926,
"loss": 1.4706,
"step": 83
},
{
"epoch": 0.1761006289308176,
"grad_norm": 0.41299256682395935,
"learning_rate": 0.00018786297989413568,
"loss": 1.2486,
"step": 84
},
{
"epoch": 0.17819706498951782,
"grad_norm": 0.44734108448028564,
"learning_rate": 0.00018753976899050812,
"loss": 1.1505,
"step": 85
},
{
"epoch": 0.18029350104821804,
"grad_norm": 0.552854597568512,
"learning_rate": 0.00018721259649491113,
"loss": 1.5622,
"step": 86
},
{
"epoch": 0.18238993710691823,
"grad_norm": 0.541213870048523,
"learning_rate": 0.0001868814772134603,
"loss": 1.5055,
"step": 87
},
{
"epoch": 0.18448637316561844,
"grad_norm": 0.48175540566444397,
"learning_rate": 0.00018654642613088194,
"loss": 1.2456,
"step": 88
},
{
"epoch": 0.18658280922431866,
"grad_norm": 0.5197116732597351,
"learning_rate": 0.0001862074584098352,
"loss": 1.4801,
"step": 89
},
{
"epoch": 0.18867924528301888,
"grad_norm": 0.46993735432624817,
"learning_rate": 0.00018586458939022586,
"loss": 1.5128,
"step": 90
},
{
"epoch": 0.19077568134171907,
"grad_norm": 0.5093168616294861,
"learning_rate": 0.00018551783458851189,
"loss": 1.521,
"step": 91
},
{
"epoch": 0.1928721174004193,
"grad_norm": 0.4279519021511078,
"learning_rate": 0.0001851672096970016,
"loss": 1.0692,
"step": 92
},
{
"epoch": 0.1949685534591195,
"grad_norm": 0.48902031779289246,
"learning_rate": 0.00018481273058314316,
"loss": 1.3202,
"step": 93
},
{
"epoch": 0.1970649895178197,
"grad_norm": 0.5409737229347229,
"learning_rate": 0.00018445441328880682,
"loss": 1.6125,
"step": 94
},
{
"epoch": 0.19916142557651992,
"grad_norm": 0.5205714702606201,
"learning_rate": 0.00018409227402955871,
"loss": 1.1616,
"step": 95
},
{
"epoch": 0.20125786163522014,
"grad_norm": 0.5157482624053955,
"learning_rate": 0.00018372632919392716,
"loss": 1.3375,
"step": 96
},
{
"epoch": 0.20335429769392033,
"grad_norm": 0.5590908527374268,
"learning_rate": 0.00018335659534266094,
"loss": 1.6429,
"step": 97
},
{
"epoch": 0.20545073375262055,
"grad_norm": 0.5677520036697388,
"learning_rate": 0.00018298308920797985,
"loss": 1.1629,
"step": 98
},
{
"epoch": 0.20754716981132076,
"grad_norm": 0.6165626645088196,
"learning_rate": 0.00018260582769281743,
"loss": 1.0469,
"step": 99
},
{
"epoch": 0.20964360587002095,
"grad_norm": 0.7722473740577698,
"learning_rate": 0.0001822248278700563,
"loss": 1.7717,
"step": 100
},
{
"epoch": 0.21174004192872117,
"grad_norm": 0.34500235319137573,
"learning_rate": 0.00018184010698175506,
"loss": 1.0338,
"step": 101
},
{
"epoch": 0.2138364779874214,
"grad_norm": 0.4223347008228302,
"learning_rate": 0.0001814516824383685,
"loss": 1.384,
"step": 102
},
{
"epoch": 0.21593291404612158,
"grad_norm": 0.3532989025115967,
"learning_rate": 0.0001810595718179593,
"loss": 1.1763,
"step": 103
},
{
"epoch": 0.2180293501048218,
"grad_norm": 0.31655967235565186,
"learning_rate": 0.00018066379286540277,
"loss": 1.4366,
"step": 104
},
{
"epoch": 0.22012578616352202,
"grad_norm": 0.580037534236908,
"learning_rate": 0.00018026436349158378,
"loss": 1.4038,
"step": 105
},
{
"epoch": 0.2222222222222222,
"grad_norm": 0.3317371606826782,
"learning_rate": 0.00017986130177258608,
"loss": 1.2701,
"step": 106
},
{
"epoch": 0.22431865828092243,
"grad_norm": 0.34435999393463135,
"learning_rate": 0.00017945462594887445,
"loss": 1.2306,
"step": 107
},
{
"epoch": 0.22641509433962265,
"grad_norm": 0.30907875299453735,
"learning_rate": 0.000179044354424469,
"loss": 1.0864,
"step": 108
},
{
"epoch": 0.22851153039832284,
"grad_norm": 0.3259734511375427,
"learning_rate": 0.00017863050576611265,
"loss": 1.1871,
"step": 109
},
{
"epoch": 0.23060796645702306,
"grad_norm": 0.3698357939720154,
"learning_rate": 0.00017821309870243054,
"loss": 1.2336,
"step": 110
},
{
"epoch": 0.23270440251572327,
"grad_norm": 0.3339691162109375,
"learning_rate": 0.00017779215212308265,
"loss": 1.1696,
"step": 111
},
{
"epoch": 0.2348008385744235,
"grad_norm": 0.333344429731369,
"learning_rate": 0.0001773676850779089,
"loss": 1.3809,
"step": 112
},
{
"epoch": 0.23689727463312368,
"grad_norm": 0.35278016328811646,
"learning_rate": 0.00017693971677606714,
"loss": 1.3156,
"step": 113
},
{
"epoch": 0.2389937106918239,
"grad_norm": 0.3800717890262604,
"learning_rate": 0.00017650826658516375,
"loss": 1.1809,
"step": 114
},
{
"epoch": 0.24109014675052412,
"grad_norm": 0.353089302778244,
"learning_rate": 0.00017607335403037712,
"loss": 1.5121,
"step": 115
},
{
"epoch": 0.2431865828092243,
"grad_norm": 0.3874945044517517,
"learning_rate": 0.00017563499879357425,
"loss": 1.5124,
"step": 116
},
{
"epoch": 0.24528301886792453,
"grad_norm": 0.3635624945163727,
"learning_rate": 0.00017519322071241983,
"loss": 1.1454,
"step": 117
},
{
"epoch": 0.24737945492662475,
"grad_norm": 0.39976125955581665,
"learning_rate": 0.0001747480397794786,
"loss": 1.4797,
"step": 118
},
{
"epoch": 0.24947589098532494,
"grad_norm": 0.3654632866382599,
"learning_rate": 0.0001742994761413105,
"loss": 1.2913,
"step": 119
},
{
"epoch": 0.25157232704402516,
"grad_norm": 0.4034808874130249,
"learning_rate": 0.0001738475500975592,
"loss": 1.4904,
"step": 120
},
{
"epoch": 0.25157232704402516,
"eval_loss": 1.3252530097961426,
"eval_runtime": 13.8389,
"eval_samples_per_second": 14.524,
"eval_steps_per_second": 7.298,
"step": 120
}
],
"logging_steps": 1,
"max_steps": 477,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 120,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.141788609845658e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}
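
A minimal sketch (not part of the checkpoint itself) of how one might load this trainer_state.json and summarize the logged loss curve; the "checkpoint-120/" path is an assumption and should point at wherever the Transformers Trainer saved this file.

import json
from statistics import mean

# Load the trainer state saved alongside the checkpoint
# (path is an assumption; adjust to the actual checkpoint directory).
with open("checkpoint-120/trainer_state.json") as f:
    state = json.load(f)

# Training log entries carry "loss"; the periodic evaluation entry carries "eval_loss".
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

print(f"global_step: {state['global_step']}, epoch: {state['epoch']:.4f}")
print(f"mean train loss over {len(train_logs)} logged steps: "
      f"{mean(e['loss'] for e in train_logs):.4f}")
if eval_logs:
    last_eval = eval_logs[-1]
    print(f"eval_loss at step {last_eval['step']}: {last_eval['eval_loss']}")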