{
"best_metric": 1.6530330181121826,
"best_model_checkpoint": "miner_id_24/checkpoint-100",
"epoch": 0.01025010250102501,
"eval_steps": 50,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0001025010250102501,
"grad_norm": 0.705419659614563,
"learning_rate": 1.001e-05,
"loss": 1.8146,
"step": 1
},
{
"epoch": 0.0001025010250102501,
"eval_loss": 2.0756940841674805,
"eval_runtime": 214.3165,
"eval_samples_per_second": 19.168,
"eval_steps_per_second": 4.792,
"step": 1
},
{
"epoch": 0.0002050020500205002,
"grad_norm": 0.9089563488960266,
"learning_rate": 2.002e-05,
"loss": 1.5119,
"step": 2
},
{
"epoch": 0.0003075030750307503,
"grad_norm": 0.9483783841133118,
"learning_rate": 3.0029999999999995e-05,
"loss": 1.6709,
"step": 3
},
{
"epoch": 0.0004100041000410004,
"grad_norm": 1.0198091268539429,
"learning_rate": 4.004e-05,
"loss": 1.9187,
"step": 4
},
{
"epoch": 0.0005125051250512505,
"grad_norm": 1.091034173965454,
"learning_rate": 5.005e-05,
"loss": 1.8457,
"step": 5
},
{
"epoch": 0.0006150061500615006,
"grad_norm": 1.2312097549438477,
"learning_rate": 6.005999999999999e-05,
"loss": 2.1334,
"step": 6
},
{
"epoch": 0.0007175071750717508,
"grad_norm": 1.246527910232544,
"learning_rate": 7.006999999999998e-05,
"loss": 1.562,
"step": 7
},
{
"epoch": 0.0008200082000820008,
"grad_norm": 1.4241694211959839,
"learning_rate": 8.008e-05,
"loss": 1.7474,
"step": 8
},
{
"epoch": 0.0009225092250922509,
"grad_norm": 1.28750479221344,
"learning_rate": 9.009e-05,
"loss": 2.0525,
"step": 9
},
{
"epoch": 0.001025010250102501,
"grad_norm": 1.2573869228363037,
"learning_rate": 0.0001001,
"loss": 2.0545,
"step": 10
},
{
"epoch": 0.001127511275112751,
"grad_norm": 1.154963731765747,
"learning_rate": 9.957315789473684e-05,
"loss": 1.7098,
"step": 11
},
{
"epoch": 0.0012300123001230013,
"grad_norm": 1.1779276132583618,
"learning_rate": 9.904631578947367e-05,
"loss": 1.9339,
"step": 12
},
{
"epoch": 0.0013325133251332513,
"grad_norm": 1.2296435832977295,
"learning_rate": 9.851947368421052e-05,
"loss": 1.8391,
"step": 13
},
{
"epoch": 0.0014350143501435015,
"grad_norm": 1.2227979898452759,
"learning_rate": 9.799263157894736e-05,
"loss": 1.8549,
"step": 14
},
{
"epoch": 0.0015375153751537515,
"grad_norm": 1.230628252029419,
"learning_rate": 9.746578947368421e-05,
"loss": 1.6791,
"step": 15
},
{
"epoch": 0.0016400164001640015,
"grad_norm": 1.2459157705307007,
"learning_rate": 9.693894736842104e-05,
"loss": 1.8406,
"step": 16
},
{
"epoch": 0.0017425174251742518,
"grad_norm": 1.397838830947876,
"learning_rate": 9.641210526315789e-05,
"loss": 1.796,
"step": 17
},
{
"epoch": 0.0018450184501845018,
"grad_norm": 1.3623087406158447,
"learning_rate": 9.588526315789473e-05,
"loss": 1.6451,
"step": 18
},
{
"epoch": 0.001947519475194752,
"grad_norm": 1.386612057685852,
"learning_rate": 9.535842105263157e-05,
"loss": 1.8315,
"step": 19
},
{
"epoch": 0.002050020500205002,
"grad_norm": 1.4097331762313843,
"learning_rate": 9.483157894736841e-05,
"loss": 1.8867,
"step": 20
},
{
"epoch": 0.002152521525215252,
"grad_norm": 1.2772889137268066,
"learning_rate": 9.430473684210526e-05,
"loss": 1.8137,
"step": 21
},
{
"epoch": 0.002255022550225502,
"grad_norm": 1.3513169288635254,
"learning_rate": 9.37778947368421e-05,
"loss": 1.6449,
"step": 22
},
{
"epoch": 0.0023575235752357525,
"grad_norm": 1.5203684568405151,
"learning_rate": 9.325105263157894e-05,
"loss": 1.7515,
"step": 23
},
{
"epoch": 0.0024600246002460025,
"grad_norm": 1.2085375785827637,
"learning_rate": 9.272421052631578e-05,
"loss": 1.4646,
"step": 24
},
{
"epoch": 0.0025625256252562525,
"grad_norm": 1.4566919803619385,
"learning_rate": 9.219736842105263e-05,
"loss": 1.8844,
"step": 25
},
{
"epoch": 0.0026650266502665026,
"grad_norm": 1.5748794078826904,
"learning_rate": 9.167052631578946e-05,
"loss": 1.7089,
"step": 26
},
{
"epoch": 0.0027675276752767526,
"grad_norm": 1.3820806741714478,
"learning_rate": 9.114368421052632e-05,
"loss": 1.7397,
"step": 27
},
{
"epoch": 0.002870028700287003,
"grad_norm": 1.47676420211792,
"learning_rate": 9.061684210526315e-05,
"loss": 1.6333,
"step": 28
},
{
"epoch": 0.002972529725297253,
"grad_norm": 1.3956615924835205,
"learning_rate": 9.009e-05,
"loss": 1.5381,
"step": 29
},
{
"epoch": 0.003075030750307503,
"grad_norm": 1.4320456981658936,
"learning_rate": 8.956315789473683e-05,
"loss": 1.5204,
"step": 30
},
{
"epoch": 0.003177531775317753,
"grad_norm": 1.564431071281433,
"learning_rate": 8.903631578947368e-05,
"loss": 1.6053,
"step": 31
},
{
"epoch": 0.003280032800328003,
"grad_norm": 1.604783296585083,
"learning_rate": 8.850947368421052e-05,
"loss": 1.4296,
"step": 32
},
{
"epoch": 0.0033825338253382535,
"grad_norm": 2.1952967643737793,
"learning_rate": 8.798263157894736e-05,
"loss": 2.3028,
"step": 33
},
{
"epoch": 0.0034850348503485036,
"grad_norm": 1.7859286069869995,
"learning_rate": 8.745578947368422e-05,
"loss": 1.0892,
"step": 34
},
{
"epoch": 0.0035875358753587536,
"grad_norm": 2.099475145339966,
"learning_rate": 8.692894736842105e-05,
"loss": 1.9027,
"step": 35
},
{
"epoch": 0.0036900369003690036,
"grad_norm": 2.05716609954834,
"learning_rate": 8.64021052631579e-05,
"loss": 1.8596,
"step": 36
},
{
"epoch": 0.0037925379253792536,
"grad_norm": 3.045736074447632,
"learning_rate": 8.587526315789473e-05,
"loss": 1.6003,
"step": 37
},
{
"epoch": 0.003895038950389504,
"grad_norm": 3.077202320098877,
"learning_rate": 8.534842105263157e-05,
"loss": 1.7161,
"step": 38
},
{
"epoch": 0.003997539975399754,
"grad_norm": 2.8727805614471436,
"learning_rate": 8.482157894736842e-05,
"loss": 1.2155,
"step": 39
},
{
"epoch": 0.004100041000410004,
"grad_norm": 3.2088754177093506,
"learning_rate": 8.429473684210525e-05,
"loss": 2.3325,
"step": 40
},
{
"epoch": 0.0042025420254202545,
"grad_norm": 3.467123031616211,
"learning_rate": 8.376789473684211e-05,
"loss": 1.7372,
"step": 41
},
{
"epoch": 0.004305043050430504,
"grad_norm": 3.2959988117218018,
"learning_rate": 8.324105263157894e-05,
"loss": 2.0588,
"step": 42
},
{
"epoch": 0.0044075440754407546,
"grad_norm": 3.0905165672302246,
"learning_rate": 8.271421052631579e-05,
"loss": 1.793,
"step": 43
},
{
"epoch": 0.004510045100451004,
"grad_norm": 4.93464469909668,
"learning_rate": 8.218736842105262e-05,
"loss": 1.8903,
"step": 44
},
{
"epoch": 0.004612546125461255,
"grad_norm": 2.7704639434814453,
"learning_rate": 8.166052631578947e-05,
"loss": 1.1567,
"step": 45
},
{
"epoch": 0.004715047150471505,
"grad_norm": 3.1440534591674805,
"learning_rate": 8.113368421052631e-05,
"loss": 1.568,
"step": 46
},
{
"epoch": 0.004817548175481755,
"grad_norm": 4.074300289154053,
"learning_rate": 8.060684210526315e-05,
"loss": 1.9814,
"step": 47
},
{
"epoch": 0.004920049200492005,
"grad_norm": 5.697752475738525,
"learning_rate": 8.008e-05,
"loss": 1.862,
"step": 48
},
{
"epoch": 0.005022550225502255,
"grad_norm": 6.887198448181152,
"learning_rate": 7.955315789473684e-05,
"loss": 1.7794,
"step": 49
},
{
"epoch": 0.005125051250512505,
"grad_norm": 6.863124847412109,
"learning_rate": 7.902631578947368e-05,
"loss": 1.5001,
"step": 50
},
{
"epoch": 0.005125051250512505,
"eval_loss": 1.7083282470703125,
"eval_runtime": 213.8938,
"eval_samples_per_second": 19.206,
"eval_steps_per_second": 4.801,
"step": 50
},
{
"epoch": 0.0052275522755227555,
"grad_norm": 0.7324660420417786,
"learning_rate": 7.849947368421052e-05,
"loss": 1.7833,
"step": 51
},
{
"epoch": 0.005330053300533005,
"grad_norm": 1.0079008340835571,
"learning_rate": 7.797263157894736e-05,
"loss": 2.0611,
"step": 52
},
{
"epoch": 0.005432554325543256,
"grad_norm": 1.0478756427764893,
"learning_rate": 7.744578947368421e-05,
"loss": 1.5655,
"step": 53
},
{
"epoch": 0.005535055350553505,
"grad_norm": 1.1489282846450806,
"learning_rate": 7.691894736842104e-05,
"loss": 2.0112,
"step": 54
},
{
"epoch": 0.005637556375563756,
"grad_norm": 1.0685741901397705,
"learning_rate": 7.63921052631579e-05,
"loss": 1.837,
"step": 55
},
{
"epoch": 0.005740057400574006,
"grad_norm": 0.9752334952354431,
"learning_rate": 7.586526315789473e-05,
"loss": 1.8408,
"step": 56
},
{
"epoch": 0.005842558425584256,
"grad_norm": 1.0988349914550781,
"learning_rate": 7.533842105263158e-05,
"loss": 1.6708,
"step": 57
},
{
"epoch": 0.005945059450594506,
"grad_norm": 1.059479832649231,
"learning_rate": 7.481157894736841e-05,
"loss": 1.4579,
"step": 58
},
{
"epoch": 0.006047560475604756,
"grad_norm": 1.0964906215667725,
"learning_rate": 7.428473684210526e-05,
"loss": 1.944,
"step": 59
},
{
"epoch": 0.006150061500615006,
"grad_norm": 1.0522820949554443,
"learning_rate": 7.375789473684209e-05,
"loss": 1.762,
"step": 60
},
{
"epoch": 0.0062525625256252566,
"grad_norm": 1.0768394470214844,
"learning_rate": 7.323105263157895e-05,
"loss": 1.7185,
"step": 61
},
{
"epoch": 0.006355063550635506,
"grad_norm": 1.1058900356292725,
"learning_rate": 7.270421052631578e-05,
"loss": 1.6855,
"step": 62
},
{
"epoch": 0.006457564575645757,
"grad_norm": 1.0793980360031128,
"learning_rate": 7.217736842105263e-05,
"loss": 1.6553,
"step": 63
},
{
"epoch": 0.006560065600656006,
"grad_norm": 1.257716417312622,
"learning_rate": 7.165052631578947e-05,
"loss": 1.5379,
"step": 64
},
{
"epoch": 0.006662566625666257,
"grad_norm": 1.116843581199646,
"learning_rate": 7.11236842105263e-05,
"loss": 1.6653,
"step": 65
},
{
"epoch": 0.006765067650676507,
"grad_norm": 1.016861081123352,
"learning_rate": 7.059684210526315e-05,
"loss": 1.3905,
"step": 66
},
{
"epoch": 0.006867568675686757,
"grad_norm": 1.1021226644515991,
"learning_rate": 7.006999999999998e-05,
"loss": 1.6295,
"step": 67
},
{
"epoch": 0.006970069700697007,
"grad_norm": 1.1197590827941895,
"learning_rate": 6.954315789473684e-05,
"loss": 1.6643,
"step": 68
},
{
"epoch": 0.007072570725707257,
"grad_norm": 1.242423415184021,
"learning_rate": 6.901631578947368e-05,
"loss": 1.7367,
"step": 69
},
{
"epoch": 0.007175071750717507,
"grad_norm": 1.1498782634735107,
"learning_rate": 6.848947368421052e-05,
"loss": 1.6476,
"step": 70
},
{
"epoch": 0.007277572775727758,
"grad_norm": 1.174621820449829,
"learning_rate": 6.796263157894737e-05,
"loss": 2.0002,
"step": 71
},
{
"epoch": 0.007380073800738007,
"grad_norm": 1.1680294275283813,
"learning_rate": 6.74357894736842e-05,
"loss": 1.4689,
"step": 72
},
{
"epoch": 0.007482574825748258,
"grad_norm": 1.3079551458358765,
"learning_rate": 6.690894736842105e-05,
"loss": 1.5757,
"step": 73
},
{
"epoch": 0.007585075850758507,
"grad_norm": 1.2481780052185059,
"learning_rate": 6.638210526315788e-05,
"loss": 1.6238,
"step": 74
},
{
"epoch": 0.007687576875768758,
"grad_norm": 1.3879331350326538,
"learning_rate": 6.585526315789474e-05,
"loss": 1.9082,
"step": 75
},
{
"epoch": 0.007790077900779008,
"grad_norm": 1.6329386234283447,
"learning_rate": 6.532842105263157e-05,
"loss": 1.739,
"step": 76
},
{
"epoch": 0.007892578925789259,
"grad_norm": 1.4674111604690552,
"learning_rate": 6.480157894736842e-05,
"loss": 1.6045,
"step": 77
},
{
"epoch": 0.007995079950799507,
"grad_norm": 1.4382115602493286,
"learning_rate": 6.427473684210526e-05,
"loss": 1.4885,
"step": 78
},
{
"epoch": 0.008097580975809758,
"grad_norm": 1.4571962356567383,
"learning_rate": 6.37478947368421e-05,
"loss": 1.3451,
"step": 79
},
{
"epoch": 0.008200082000820008,
"grad_norm": 1.6193132400512695,
"learning_rate": 6.322105263157894e-05,
"loss": 1.8344,
"step": 80
},
{
"epoch": 0.008302583025830259,
"grad_norm": 1.673788070678711,
"learning_rate": 6.269421052631577e-05,
"loss": 1.4983,
"step": 81
},
{
"epoch": 0.008405084050840509,
"grad_norm": 1.7791383266448975,
"learning_rate": 6.216736842105263e-05,
"loss": 1.8181,
"step": 82
},
{
"epoch": 0.008507585075850758,
"grad_norm": 1.6379783153533936,
"learning_rate": 6.164052631578947e-05,
"loss": 1.5085,
"step": 83
},
{
"epoch": 0.008610086100861008,
"grad_norm": 2.0746712684631348,
"learning_rate": 6.111368421052631e-05,
"loss": 1.8331,
"step": 84
},
{
"epoch": 0.008712587125871259,
"grad_norm": 1.9016090631484985,
"learning_rate": 6.058684210526315e-05,
"loss": 1.6955,
"step": 85
},
{
"epoch": 0.008815088150881509,
"grad_norm": 2.0374410152435303,
"learning_rate": 6.005999999999999e-05,
"loss": 1.6262,
"step": 86
},
{
"epoch": 0.00891758917589176,
"grad_norm": 2.458918809890747,
"learning_rate": 5.953315789473684e-05,
"loss": 1.782,
"step": 87
},
{
"epoch": 0.009020090200902008,
"grad_norm": 2.420497179031372,
"learning_rate": 5.9006315789473676e-05,
"loss": 1.4256,
"step": 88
},
{
"epoch": 0.009122591225912259,
"grad_norm": 3.1010406017303467,
"learning_rate": 5.847947368421053e-05,
"loss": 1.5861,
"step": 89
},
{
"epoch": 0.00922509225092251,
"grad_norm": 2.9462192058563232,
"learning_rate": 5.795263157894737e-05,
"loss": 1.7643,
"step": 90
},
{
"epoch": 0.00932759327593276,
"grad_norm": 3.0921950340270996,
"learning_rate": 5.742578947368421e-05,
"loss": 1.7482,
"step": 91
},
{
"epoch": 0.00943009430094301,
"grad_norm": 3.2740588188171387,
"learning_rate": 5.6898947368421046e-05,
"loss": 1.7103,
"step": 92
},
{
"epoch": 0.009532595325953259,
"grad_norm": 3.282179594039917,
"learning_rate": 5.6372105263157886e-05,
"loss": 1.3958,
"step": 93
},
{
"epoch": 0.00963509635096351,
"grad_norm": 2.8050053119659424,
"learning_rate": 5.584526315789473e-05,
"loss": 1.0987,
"step": 94
},
{
"epoch": 0.00973759737597376,
"grad_norm": 3.167931079864502,
"learning_rate": 5.531842105263158e-05,
"loss": 1.0239,
"step": 95
},
{
"epoch": 0.00984009840098401,
"grad_norm": 4.01984167098999,
"learning_rate": 5.4791578947368424e-05,
"loss": 2.1924,
"step": 96
},
{
"epoch": 0.00994259942599426,
"grad_norm": 7.1500983238220215,
"learning_rate": 5.426473684210526e-05,
"loss": 1.6619,
"step": 97
},
{
"epoch": 0.01004510045100451,
"grad_norm": 3.5555317401885986,
"learning_rate": 5.37378947368421e-05,
"loss": 1.0794,
"step": 98
},
{
"epoch": 0.01014760147601476,
"grad_norm": 3.8048603534698486,
"learning_rate": 5.321105263157894e-05,
"loss": 1.4438,
"step": 99
},
{
"epoch": 0.01025010250102501,
"grad_norm": 6.511986255645752,
"learning_rate": 5.268421052631578e-05,
"loss": 2.3968,
"step": 100
},
{
"epoch": 0.01025010250102501,
"eval_loss": 1.6530330181121826,
"eval_runtime": 213.675,
"eval_samples_per_second": 19.225,
"eval_steps_per_second": 4.806,
"step": 100
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.572495321923584e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}