VERSIL91's picture
End of training
1ad08e5 verified
{
"best_metric": 11.9102783203125,
"best_model_checkpoint": "miner_id_24/checkpoint-150",
"epoch": 3.0,
"eval_steps": 50,
"global_step": 168,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.017857142857142856,
"eval_loss": 11.9290132522583,
"eval_runtime": 0.5421,
"eval_samples_per_second": 175.25,
"eval_steps_per_second": 44.274,
"step": 1
},
{
"epoch": 0.05357142857142857,
"grad_norm": 0.019959961995482445,
"learning_rate": 3e-05,
"loss": 11.9303,
"step": 3
},
{
"epoch": 0.10714285714285714,
"grad_norm": 0.022794105112552643,
"learning_rate": 6e-05,
"loss": 11.9305,
"step": 6
},
{
"epoch": 0.16071428571428573,
"grad_norm": 0.0353395938873291,
"learning_rate": 9e-05,
"loss": 11.9295,
"step": 9
},
{
"epoch": 0.21428571428571427,
"grad_norm": 0.03792329132556915,
"learning_rate": 9.996046986136509e-05,
"loss": 11.93,
"step": 12
},
{
"epoch": 0.26785714285714285,
"grad_norm": 0.03241180628538132,
"learning_rate": 9.975310752612137e-05,
"loss": 11.9276,
"step": 15
},
{
"epoch": 0.32142857142857145,
"grad_norm": 0.05070869252085686,
"learning_rate": 9.936876709681668e-05,
"loss": 11.9291,
"step": 18
},
{
"epoch": 0.375,
"grad_norm": 0.05852275714278221,
"learning_rate": 9.880881572095256e-05,
"loss": 11.9291,
"step": 21
},
{
"epoch": 0.42857142857142855,
"grad_norm": 0.08124187588691711,
"learning_rate": 9.807524521637102e-05,
"loss": 11.9278,
"step": 24
},
{
"epoch": 0.48214285714285715,
"grad_norm": 0.05361940711736679,
"learning_rate": 9.717066498610673e-05,
"loss": 11.9256,
"step": 27
},
{
"epoch": 0.5357142857142857,
"grad_norm": 0.0750463530421257,
"learning_rate": 9.609829273641034e-05,
"loss": 11.9261,
"step": 30
},
{
"epoch": 0.5892857142857143,
"grad_norm": 0.08543704450130463,
"learning_rate": 9.486194303096062e-05,
"loss": 11.9248,
"step": 33
},
{
"epoch": 0.6428571428571429,
"grad_norm": 0.09016852080821991,
"learning_rate": 9.346601372197914e-05,
"loss": 11.9243,
"step": 36
},
{
"epoch": 0.6964285714285714,
"grad_norm": 0.10736904293298721,
"learning_rate": 9.191547030651383e-05,
"loss": 11.9206,
"step": 39
},
{
"epoch": 0.75,
"grad_norm": 0.0793285071849823,
"learning_rate": 9.021582826353824e-05,
"loss": 11.9232,
"step": 42
},
{
"epoch": 0.8035714285714286,
"grad_norm": 0.05057210102677345,
"learning_rate": 8.83731334346954e-05,
"loss": 11.9215,
"step": 45
},
{
"epoch": 0.8571428571428571,
"grad_norm": 0.05617088824510574,
"learning_rate": 8.639394051847472e-05,
"loss": 11.9211,
"step": 48
},
{
"epoch": 0.8928571428571429,
"eval_loss": 11.918818473815918,
"eval_runtime": 0.5369,
"eval_samples_per_second": 176.942,
"eval_steps_per_second": 44.701,
"step": 50
},
{
"epoch": 0.9107142857142857,
"grad_norm": 0.06780627369880676,
"learning_rate": 8.428528975432066e-05,
"loss": 11.9193,
"step": 51
},
{
"epoch": 0.9642857142857143,
"grad_norm": 0.03634953871369362,
"learning_rate": 8.2054681879611e-05,
"loss": 11.9178,
"step": 54
},
{
"epoch": 1.0178571428571428,
"grad_norm": 0.04605906456708908,
"learning_rate": 7.971005144858553e-05,
"loss": 11.9182,
"step": 57
},
{
"epoch": 1.0714285714285714,
"grad_norm": 0.025876272469758987,
"learning_rate": 7.725973860813338e-05,
"loss": 11.9199,
"step": 60
},
{
"epoch": 1.125,
"grad_norm": 0.04855341464281082,
"learning_rate": 7.471245943083615e-05,
"loss": 11.9185,
"step": 63
},
{
"epoch": 1.1785714285714286,
"grad_norm": 0.03910359740257263,
"learning_rate": 7.20772749107956e-05,
"loss": 11.9184,
"step": 66
},
{
"epoch": 1.2321428571428572,
"grad_norm": 0.08992303162813187,
"learning_rate": 6.936355873253206e-05,
"loss": 11.9154,
"step": 69
},
{
"epoch": 1.2857142857142856,
"grad_norm": 0.04321262612938881,
"learning_rate": 6.65809639276034e-05,
"loss": 11.919,
"step": 72
},
{
"epoch": 1.3392857142857144,
"grad_norm": 0.049111876636743546,
"learning_rate": 6.373938853755126e-05,
"loss": 11.9185,
"step": 75
},
{
"epoch": 1.3928571428571428,
"grad_norm": 0.0660889744758606,
"learning_rate": 6.08489404053159e-05,
"loss": 11.9156,
"step": 78
},
{
"epoch": 1.4464285714285714,
"grad_norm": 0.0920424684882164,
"learning_rate": 5.791990122036075e-05,
"loss": 11.9151,
"step": 81
},
{
"epoch": 1.5,
"grad_norm": 0.04646582156419754,
"learning_rate": 5.496268994540309e-05,
"loss": 11.9162,
"step": 84
},
{
"epoch": 1.5535714285714286,
"grad_norm": 0.053827133029699326,
"learning_rate": 5.19878257548463e-05,
"loss": 11.9172,
"step": 87
},
{
"epoch": 1.6071428571428572,
"grad_norm": 0.04865885153412819,
"learning_rate": 4.900589061674649e-05,
"loss": 11.9165,
"step": 90
},
{
"epoch": 1.6607142857142856,
"grad_norm": 0.0875491127371788,
"learning_rate": 4.602749165141428e-05,
"loss": 11.9132,
"step": 93
},
{
"epoch": 1.7142857142857144,
"grad_norm": 0.04339161515235901,
"learning_rate": 4.3063223400546594e-05,
"loss": 11.9123,
"step": 96
},
{
"epoch": 1.7678571428571428,
"grad_norm": 0.04742836579680443,
"learning_rate": 4.012363014110237e-05,
"loss": 11.9154,
"step": 99
},
{
"epoch": 1.7857142857142856,
"eval_loss": 11.913043975830078,
"eval_runtime": 0.5398,
"eval_samples_per_second": 175.99,
"eval_steps_per_second": 44.461,
"step": 100
},
{
"epoch": 1.8214285714285714,
"grad_norm": 0.04398871585726738,
"learning_rate": 3.721916837797627e-05,
"loss": 11.9137,
"step": 102
},
{
"epoch": 1.875,
"grad_norm": 0.05781185254454613,
"learning_rate": 3.436016964888865e-05,
"loss": 11.9125,
"step": 105
},
{
"epoch": 1.9285714285714286,
"grad_norm": 0.08808522671461105,
"learning_rate": 3.1556803773799614e-05,
"loss": 11.9077,
"step": 108
},
{
"epoch": 1.9821428571428572,
"grad_norm": 0.06021308898925781,
"learning_rate": 2.8819042679573617e-05,
"loss": 11.9141,
"step": 111
},
{
"epoch": 2.0357142857142856,
"grad_norm": 0.048566147685050964,
"learning_rate": 2.6156624928574707e-05,
"loss": 11.9136,
"step": 114
},
{
"epoch": 2.0892857142857144,
"grad_norm": 0.04223039001226425,
"learning_rate": 2.3579021077369046e-05,
"loss": 11.9142,
"step": 117
},
{
"epoch": 2.142857142857143,
"grad_norm": 0.04601627215743065,
"learning_rate": 2.1095399988757574e-05,
"loss": 11.9133,
"step": 120
},
{
"epoch": 2.1964285714285716,
"grad_norm": 0.05840318650007248,
"learning_rate": 1.8714596216972007e-05,
"loss": 11.9112,
"step": 123
},
{
"epoch": 2.25,
"grad_norm": 0.035777896642684937,
"learning_rate": 1.6445078582048155e-05,
"loss": 11.9091,
"step": 126
},
{
"epoch": 2.3035714285714284,
"grad_norm": 0.037539299577474594,
"learning_rate": 1.4294920045162513e-05,
"loss": 11.9146,
"step": 129
},
{
"epoch": 2.357142857142857,
"grad_norm": 0.042784880846738815,
"learning_rate": 1.2271768992088489e-05,
"loss": 11.9117,
"step": 132
},
{
"epoch": 2.4107142857142856,
"grad_norm": 0.05495860055088997,
"learning_rate": 1.038282202692129e-05,
"loss": 11.9109,
"step": 135
},
{
"epoch": 2.4642857142857144,
"grad_norm": 0.07855169475078583,
"learning_rate": 8.634798372847148e-06,
"loss": 11.9069,
"step": 138
},
{
"epoch": 2.517857142857143,
"grad_norm": 0.03630689159035683,
"learning_rate": 7.033915971016952e-06,
"loss": 11.9139,
"step": 141
},
{
"epoch": 2.571428571428571,
"grad_norm": 0.048760075122117996,
"learning_rate": 5.585869362543416e-06,
"loss": 11.9124,
"step": 144
},
{
"epoch": 2.625,
"grad_norm": 0.04377627745270729,
"learning_rate": 4.29580943229827e-06,
"loss": 11.9118,
"step": 147
},
{
"epoch": 2.678571428571429,
"grad_norm": 0.05847623199224472,
"learning_rate": 3.1683250865636114e-06,
"loss": 11.9095,
"step": 150
},
{
"epoch": 2.678571428571429,
"eval_loss": 11.9102783203125,
"eval_runtime": 0.5392,
"eval_samples_per_second": 176.177,
"eval_steps_per_second": 44.508,
"step": 150
},
{
"epoch": 2.732142857142857,
"grad_norm": 0.034065768122673035,
"learning_rate": 2.2074269297119587e-06,
"loss": 11.9088,
"step": 153
},
{
"epoch": 2.7857142857142856,
"grad_norm": 0.04465312138199806,
"learning_rate": 1.4165329979794973e-06,
"loss": 11.9133,
"step": 156
},
{
"epoch": 2.8392857142857144,
"grad_norm": 0.043062131851911545,
"learning_rate": 7.984566010789674e-07,
"loss": 11.9122,
"step": 159
},
{
"epoch": 2.892857142857143,
"grad_norm": 0.05254960432648659,
"learning_rate": 3.553963149013295e-07,
"loss": 11.9094,
"step": 162
},
{
"epoch": 2.946428571428571,
"grad_norm": 0.032099399715662,
"learning_rate": 8.892816090335099e-08,
"loss": 11.9082,
"step": 165
},
{
"epoch": 3.0,
"grad_norm": 0.09737348556518555,
"learning_rate": 0.0,
"loss": 11.908,
"step": 168
}
],
"logging_steps": 3,
"max_steps": 168,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 808543272960.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}