{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9927007299270073,
"eval_steps": 500,
"global_step": 85,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01167883211678832,
"grad_norm": 2.102217674255371,
"learning_rate": 4.998292650357558e-05,
"loss": 1.2528,
"num_input_tokens_seen": 1572864,
"step": 1
},
{
"epoch": 0.02335766423357664,
"grad_norm": 1.8151874542236328,
"learning_rate": 4.993172933464471e-05,
"loss": 1.1456,
"num_input_tokens_seen": 3145728,
"step": 2
},
{
"epoch": 0.035036496350364967,
"grad_norm": 1.6563794612884521,
"learning_rate": 4.984647842238185e-05,
"loss": 1.0528,
"num_input_tokens_seen": 4718592,
"step": 3
},
{
"epoch": 0.04671532846715328,
"grad_norm": 1.4128868579864502,
"learning_rate": 4.972729020927865e-05,
"loss": 0.9542,
"num_input_tokens_seen": 6291456,
"step": 4
},
{
"epoch": 0.058394160583941604,
"grad_norm": 1.3149222135543823,
"learning_rate": 4.957432749209755e-05,
"loss": 0.8888,
"num_input_tokens_seen": 7864320,
"step": 5
},
{
"epoch": 0.07007299270072993,
"grad_norm": 1.1523312330245972,
"learning_rate": 4.938779919951092e-05,
"loss": 0.8125,
"num_input_tokens_seen": 9437184,
"step": 6
},
{
"epoch": 0.08175182481751825,
"grad_norm": 1.0558948516845703,
"learning_rate": 4.916796010672969e-05,
"loss": 0.7639,
"num_input_tokens_seen": 11010048,
"step": 7
},
{
"epoch": 0.09343065693430656,
"grad_norm": 0.9993464350700378,
"learning_rate": 4.891511048751102e-05,
"loss": 0.7068,
"num_input_tokens_seen": 12582912,
"step": 8
},
{
"epoch": 0.10510948905109489,
"grad_norm": 0.7277781367301941,
"learning_rate": 4.862959570402049e-05,
"loss": 0.6852,
"num_input_tokens_seen": 14155776,
"step": 9
},
{
"epoch": 0.11678832116788321,
"grad_norm": 0.33135032653808594,
"learning_rate": 4.8311805735108894e-05,
"loss": 0.6571,
"num_input_tokens_seen": 15728640,
"step": 10
},
{
"epoch": 0.12846715328467154,
"grad_norm": 0.24875016510486603,
"learning_rate": 4.796217464364808e-05,
"loss": 0.6347,
"num_input_tokens_seen": 17301504,
"step": 11
},
{
"epoch": 0.14014598540145987,
"grad_norm": 0.219735249876976,
"learning_rate": 4.758117998365322e-05,
"loss": 0.6191,
"num_input_tokens_seen": 18874368,
"step": 12
},
{
"epoch": 0.15182481751824817,
"grad_norm": 0.20399349927902222,
"learning_rate": 4.716934214800155e-05,
"loss": 0.5761,
"num_input_tokens_seen": 20447232,
"step": 13
},
{
"epoch": 0.1635036496350365,
"grad_norm": 0.18108128011226654,
"learning_rate": 4.672722365763821e-05,
"loss": 0.5772,
"num_input_tokens_seen": 22020096,
"step": 14
},
{
"epoch": 0.17518248175182483,
"grad_norm": 0.15887582302093506,
"learning_rate": 4.625542839324036e-05,
"loss": 0.5579,
"num_input_tokens_seen": 23592960,
"step": 15
},
{
"epoch": 0.18686131386861313,
"grad_norm": 0.1574232280254364,
"learning_rate": 4.575460077038877e-05,
"loss": 0.5674,
"num_input_tokens_seen": 25165824,
"step": 16
},
{
"epoch": 0.19854014598540146,
"grad_norm": 0.1397695690393448,
"learning_rate": 4.522542485937369e-05,
"loss": 0.5767,
"num_input_tokens_seen": 26738688,
"step": 17
},
{
"epoch": 0.21021897810218979,
"grad_norm": 0.12465737760066986,
"learning_rate": 4.4668623450837085e-05,
"loss": 0.5559,
"num_input_tokens_seen": 28311552,
"step": 18
},
{
"epoch": 0.22189781021897811,
"grad_norm": 0.10970822721719742,
"learning_rate": 4.408495706852758e-05,
"loss": 0.5603,
"num_input_tokens_seen": 29884416,
"step": 19
},
{
"epoch": 0.23357664233576642,
"grad_norm": 0.10278748720884323,
"learning_rate": 4.347522293051648e-05,
"loss": 0.5541,
"num_input_tokens_seen": 31457280,
"step": 20
},
{
"epoch": 0.24525547445255474,
"grad_norm": 0.09158932417631149,
"learning_rate": 4.284025386029381e-05,
"loss": 0.5358,
"num_input_tokens_seen": 33030144,
"step": 21
},
{
"epoch": 0.2569343065693431,
"grad_norm": 0.09265461564064026,
"learning_rate": 4.218091714923157e-05,
"loss": 0.529,
"num_input_tokens_seen": 34603008,
"step": 22
},
{
"epoch": 0.2686131386861314,
"grad_norm": 0.08593211323022842,
"learning_rate": 4.149811337196807e-05,
"loss": 0.5391,
"num_input_tokens_seen": 36175872,
"step": 23
},
{
"epoch": 0.28029197080291973,
"grad_norm": 0.07743365317583084,
"learning_rate": 4.079277515633127e-05,
"loss": 0.5368,
"num_input_tokens_seen": 37748736,
"step": 24
},
{
"epoch": 0.291970802919708,
"grad_norm": 0.08235425502061844,
"learning_rate": 4.0065865909481417e-05,
"loss": 0.5226,
"num_input_tokens_seen": 39321600,
"step": 25
},
{
"epoch": 0.30364963503649633,
"grad_norm": 0.07797503471374512,
"learning_rate": 3.931837850201263e-05,
"loss": 0.5181,
"num_input_tokens_seen": 40894464,
"step": 26
},
{
"epoch": 0.31532846715328466,
"grad_norm": 0.07740040868520737,
"learning_rate": 3.855133391181124e-05,
"loss": 0.5368,
"num_input_tokens_seen": 42467328,
"step": 27
},
{
"epoch": 0.327007299270073,
"grad_norm": 0.07796761393547058,
"learning_rate": 3.7765779829522675e-05,
"loss": 0.5143,
"num_input_tokens_seen": 44040192,
"step": 28
},
{
"epoch": 0.3386861313868613,
"grad_norm": 0.06905554234981537,
"learning_rate": 3.696278922753216e-05,
"loss": 0.5084,
"num_input_tokens_seen": 45613056,
"step": 29
},
{
"epoch": 0.35036496350364965,
"grad_norm": 0.07220657169818878,
"learning_rate": 3.6143458894413465e-05,
"loss": 0.5026,
"num_input_tokens_seen": 47185920,
"step": 30
},
{
"epoch": 0.362043795620438,
"grad_norm": 0.0667012482881546,
"learning_rate": 3.5308907936847594e-05,
"loss": 0.53,
"num_input_tokens_seen": 48758784,
"step": 31
},
{
"epoch": 0.37372262773722625,
"grad_norm": 0.06964272260665894,
"learning_rate": 3.446027625105776e-05,
"loss": 0.532,
"num_input_tokens_seen": 50331648,
"step": 32
},
{
"epoch": 0.3854014598540146,
"grad_norm": 0.06804265826940536,
"learning_rate": 3.3598722965848204e-05,
"loss": 0.5257,
"num_input_tokens_seen": 51904512,
"step": 33
},
{
"epoch": 0.3970802919708029,
"grad_norm": 0.06633932888507843,
"learning_rate": 3.272542485937369e-05,
"loss": 0.5078,
"num_input_tokens_seen": 53477376,
"step": 34
},
{
"epoch": 0.40875912408759124,
"grad_norm": 0.06411036849021912,
"learning_rate": 3.1841574751802076e-05,
"loss": 0.5012,
"num_input_tokens_seen": 55050240,
"step": 35
},
{
"epoch": 0.42043795620437957,
"grad_norm": 0.06728877872228622,
"learning_rate": 3.094837987606547e-05,
"loss": 0.5117,
"num_input_tokens_seen": 56623104,
"step": 36
},
{
"epoch": 0.4321167883211679,
"grad_norm": 0.06455469876527786,
"learning_rate": 3.0047060228925256e-05,
"loss": 0.516,
"num_input_tokens_seen": 58195968,
"step": 37
},
{
"epoch": 0.44379562043795623,
"grad_norm": 0.06497868150472641,
"learning_rate": 2.913884690460325e-05,
"loss": 0.5189,
"num_input_tokens_seen": 59768832,
"step": 38
},
{
"epoch": 0.4554744525547445,
"grad_norm": 0.062144387513399124,
"learning_rate": 2.8224980413255086e-05,
"loss": 0.4973,
"num_input_tokens_seen": 61341696,
"step": 39
},
{
"epoch": 0.46715328467153283,
"grad_norm": 0.064272440969944,
"learning_rate": 2.7306708986582553e-05,
"loss": 0.4942,
"num_input_tokens_seen": 62914560,
"step": 40
},
{
"epoch": 0.47883211678832116,
"grad_norm": 0.06099522113800049,
"learning_rate": 2.638528687289925e-05,
"loss": 0.5069,
"num_input_tokens_seen": 64487424,
"step": 41
},
{
"epoch": 0.4905109489051095,
"grad_norm": 0.06261651962995529,
"learning_rate": 2.5461972623978247e-05,
"loss": 0.5218,
"num_input_tokens_seen": 66060288,
"step": 42
},
{
"epoch": 0.5021897810218978,
"grad_norm": 0.059084702283144,
"learning_rate": 2.453802737602176e-05,
"loss": 0.5003,
"num_input_tokens_seen": 67633152,
"step": 43
},
{
"epoch": 0.5138686131386861,
"grad_norm": 0.05868009477853775,
"learning_rate": 2.361471312710075e-05,
"loss": 0.4997,
"num_input_tokens_seen": 69206016,
"step": 44
},
{
"epoch": 0.5255474452554745,
"grad_norm": 0.05527840927243233,
"learning_rate": 2.2693291013417453e-05,
"loss": 0.5045,
"num_input_tokens_seen": 70778880,
"step": 45
},
{
"epoch": 0.5372262773722628,
"grad_norm": 0.05962590128183365,
"learning_rate": 2.1775019586744923e-05,
"loss": 0.4972,
"num_input_tokens_seen": 72351744,
"step": 46
},
{
"epoch": 0.5489051094890511,
"grad_norm": 0.05784057453274727,
"learning_rate": 2.0861153095396748e-05,
"loss": 0.5174,
"num_input_tokens_seen": 73924608,
"step": 47
},
{
"epoch": 0.5605839416058395,
"grad_norm": 0.0580497644841671,
"learning_rate": 1.995293977107475e-05,
"loss": 0.5086,
"num_input_tokens_seen": 75497472,
"step": 48
},
{
"epoch": 0.5722627737226277,
"grad_norm": 0.05789309740066528,
"learning_rate": 1.9051620123934537e-05,
"loss": 0.4988,
"num_input_tokens_seen": 77070336,
"step": 49
},
{
"epoch": 0.583941605839416,
"grad_norm": 0.05743095278739929,
"learning_rate": 1.815842524819793e-05,
"loss": 0.5047,
"num_input_tokens_seen": 78643200,
"step": 50
},
{
"epoch": 0.5956204379562043,
"grad_norm": 0.05637204647064209,
"learning_rate": 1.7274575140626318e-05,
"loss": 0.4977,
"num_input_tokens_seen": 80216064,
"step": 51
},
{
"epoch": 0.6072992700729927,
"grad_norm": 0.054249729961156845,
"learning_rate": 1.6401277034151798e-05,
"loss": 0.4775,
"num_input_tokens_seen": 81788928,
"step": 52
},
{
"epoch": 0.618978102189781,
"grad_norm": 0.05614548176527023,
"learning_rate": 1.5539723748942245e-05,
"loss": 0.519,
"num_input_tokens_seen": 83361792,
"step": 53
},
{
"epoch": 0.6306569343065693,
"grad_norm": 0.05785420536994934,
"learning_rate": 1.4691092063152417e-05,
"loss": 0.5102,
"num_input_tokens_seen": 84934656,
"step": 54
},
{
"epoch": 0.6423357664233577,
"grad_norm": 0.05397043377161026,
"learning_rate": 1.3856541105586545e-05,
"loss": 0.4705,
"num_input_tokens_seen": 86507520,
"step": 55
},
{
"epoch": 0.654014598540146,
"grad_norm": 0.05297677591443062,
"learning_rate": 1.303721077246784e-05,
"loss": 0.4965,
"num_input_tokens_seen": 88080384,
"step": 56
},
{
"epoch": 0.6656934306569343,
"grad_norm": 0.05379556864500046,
"learning_rate": 1.223422017047733e-05,
"loss": 0.503,
"num_input_tokens_seen": 89653248,
"step": 57
},
{
"epoch": 0.6773722627737226,
"grad_norm": 0.058262865990400314,
"learning_rate": 1.1448666088188764e-05,
"loss": 0.4921,
"num_input_tokens_seen": 91226112,
"step": 58
},
{
"epoch": 0.689051094890511,
"grad_norm": 0.05684163048863411,
"learning_rate": 1.068162149798737e-05,
"loss": 0.5042,
"num_input_tokens_seen": 92798976,
"step": 59
},
{
"epoch": 0.7007299270072993,
"grad_norm": 0.053847264498472214,
"learning_rate": 9.934134090518593e-06,
"loss": 0.5145,
"num_input_tokens_seen": 94371840,
"step": 60
},
{
"epoch": 0.7124087591240876,
"grad_norm": 0.05480194464325905,
"learning_rate": 9.207224843668732e-06,
"loss": 0.4897,
"num_input_tokens_seen": 95944704,
"step": 61
},
{
"epoch": 0.724087591240876,
"grad_norm": 0.05290277674794197,
"learning_rate": 8.50188662803194e-06,
"loss": 0.5033,
"num_input_tokens_seen": 97517568,
"step": 62
},
{
"epoch": 0.7357664233576642,
"grad_norm": 0.05715763568878174,
"learning_rate": 7.819082850768434e-06,
"loss": 0.473,
"num_input_tokens_seen": 99090432,
"step": 63
},
{
"epoch": 0.7474452554744525,
"grad_norm": 0.060164712369441986,
"learning_rate": 7.159746139706194e-06,
"loss": 0.5298,
"num_input_tokens_seen": 100663296,
"step": 64
},
{
"epoch": 0.7591240875912408,
"grad_norm": 0.058020610362291336,
"learning_rate": 6.524777069483526e-06,
"loss": 0.4685,
"num_input_tokens_seen": 102236160,
"step": 65
},
{
"epoch": 0.7708029197080292,
"grad_norm": 0.05399800464510918,
"learning_rate": 5.915042931472425e-06,
"loss": 0.4931,
"num_input_tokens_seen": 103809024,
"step": 66
},
{
"epoch": 0.7824817518248175,
"grad_norm": 0.05629091337323189,
"learning_rate": 5.33137654916292e-06,
"loss": 0.4957,
"num_input_tokens_seen": 105381888,
"step": 67
},
{
"epoch": 0.7941605839416058,
"grad_norm": 0.05569099634885788,
"learning_rate": 4.7745751406263165e-06,
"loss": 0.4924,
"num_input_tokens_seen": 106954752,
"step": 68
},
{
"epoch": 0.8058394160583942,
"grad_norm": 0.05562946945428848,
"learning_rate": 4.245399229611238e-06,
"loss": 0.5041,
"num_input_tokens_seen": 108527616,
"step": 69
},
{
"epoch": 0.8175182481751825,
"grad_norm": 0.05801980197429657,
"learning_rate": 3.7445716067596503e-06,
"loss": 0.4948,
"num_input_tokens_seen": 110100480,
"step": 70
},
{
"epoch": 0.8291970802919708,
"grad_norm": 0.058106038719415665,
"learning_rate": 3.2727763423617913e-06,
"loss": 0.5085,
"num_input_tokens_seen": 111673344,
"step": 71
},
{
"epoch": 0.8408759124087591,
"grad_norm": 0.052898507565259933,
"learning_rate": 2.8306578519984527e-06,
"loss": 0.5152,
"num_input_tokens_seen": 113246208,
"step": 72
},
{
"epoch": 0.8525547445255475,
"grad_norm": 0.05265094339847565,
"learning_rate": 2.418820016346779e-06,
"loss": 0.4905,
"num_input_tokens_seen": 114819072,
"step": 73
},
{
"epoch": 0.8642335766423358,
"grad_norm": 0.054221879690885544,
"learning_rate": 2.0378253563519247e-06,
"loss": 0.5023,
"num_input_tokens_seen": 116391936,
"step": 74
},
{
"epoch": 0.8759124087591241,
"grad_norm": 0.0557856447994709,
"learning_rate": 1.6881942648911076e-06,
"loss": 0.4993,
"num_input_tokens_seen": 117964800,
"step": 75
},
{
"epoch": 0.8875912408759125,
"grad_norm": 0.05209067091345787,
"learning_rate": 1.3704042959795132e-06,
"loss": 0.481,
"num_input_tokens_seen": 119537664,
"step": 76
},
{
"epoch": 0.8992700729927007,
"grad_norm": 0.05373719707131386,
"learning_rate": 1.0848895124889818e-06,
"loss": 0.4806,
"num_input_tokens_seen": 121110528,
"step": 77
},
{
"epoch": 0.910948905109489,
"grad_norm": 0.055330030620098114,
"learning_rate": 8.320398932703144e-07,
"loss": 0.4915,
"num_input_tokens_seen": 122683392,
"step": 78
},
{
"epoch": 0.9226277372262773,
"grad_norm": 0.05640307813882828,
"learning_rate": 6.122008004890851e-07,
"loss": 0.4937,
"num_input_tokens_seen": 124256256,
"step": 79
},
{
"epoch": 0.9343065693430657,
"grad_norm": 0.05659601837396622,
"learning_rate": 4.256725079024554e-07,
"loss": 0.5152,
"num_input_tokens_seen": 125829120,
"step": 80
},
{
"epoch": 0.945985401459854,
"grad_norm": 0.05199190601706505,
"learning_rate": 2.7270979072135104e-07,
"loss": 0.4832,
"num_input_tokens_seen": 127401984,
"step": 81
},
{
"epoch": 0.9576642335766423,
"grad_norm": 0.05153407156467438,
"learning_rate": 1.5352157761815977e-07,
"loss": 0.4975,
"num_input_tokens_seen": 128974848,
"step": 82
},
{
"epoch": 0.9693430656934306,
"grad_norm": 0.05456986650824547,
"learning_rate": 6.827066535529946e-08,
"loss": 0.4887,
"num_input_tokens_seen": 130547712,
"step": 83
},
{
"epoch": 0.981021897810219,
"grad_norm": 0.057446081191301346,
"learning_rate": 1.7073496424427348e-08,
"loss": 0.4905,
"num_input_tokens_seen": 132120576,
"step": 84
},
{
"epoch": 0.9927007299270073,
"grad_norm": 0.05915559083223343,
"learning_rate": 0.0,
"loss": 0.5088,
"num_input_tokens_seen": 133693440,
"step": 85
},
{
"epoch": 0.9927007299270073,
"num_input_tokens_seen": 133693440,
"step": 85,
"total_flos": 5.206772811237949e+18,
"train_loss": 0.5589800634804893,
"train_runtime": 13057.0702,
"train_samples_per_second": 2.516,
"train_steps_per_second": 0.007
}
],
"logging_steps": 1,
"max_steps": 85,
"num_input_tokens_seen": 133693440,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.206772811237949e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}