lesso06's picture
Training in progress, step 500, checkpoint
de354ac verified
{
"best_metric": 2.1746580600738525,
"best_model_checkpoint": "miner_id_24/checkpoint-450",
"epoch": 0.1811922449719152,
"eval_steps": 50,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0003623844899438304,
"eval_loss": 3.253896474838257,
"eval_runtime": 74.5486,
"eval_samples_per_second": 15.587,
"eval_steps_per_second": 3.903,
"step": 1
},
{
"epoch": 0.003623844899438304,
"grad_norm": 2.0134332180023193,
"learning_rate": 4.12e-05,
"loss": 2.878,
"step": 10
},
{
"epoch": 0.007247689798876608,
"grad_norm": 1.886479377746582,
"learning_rate": 8.24e-05,
"loss": 2.5585,
"step": 20
},
{
"epoch": 0.010871534698314912,
"grad_norm": 1.836480975151062,
"learning_rate": 0.0001236,
"loss": 2.4347,
"step": 30
},
{
"epoch": 0.014495379597753216,
"grad_norm": 2.0984137058258057,
"learning_rate": 0.0001648,
"loss": 2.3679,
"step": 40
},
{
"epoch": 0.01811922449719152,
"grad_norm": 4.840346813201904,
"learning_rate": 0.000206,
"loss": 2.3008,
"step": 50
},
{
"epoch": 0.01811922449719152,
"eval_loss": 2.5439529418945312,
"eval_runtime": 74.1639,
"eval_samples_per_second": 15.668,
"eval_steps_per_second": 3.924,
"step": 50
},
{
"epoch": 0.021743069396629823,
"grad_norm": 1.3696887493133545,
"learning_rate": 0.0002057490971767619,
"loss": 2.384,
"step": 60
},
{
"epoch": 0.02536691429606813,
"grad_norm": 1.702624797821045,
"learning_rate": 0.00020499761108038175,
"loss": 2.4112,
"step": 70
},
{
"epoch": 0.028990759195506432,
"grad_norm": 1.841941237449646,
"learning_rate": 0.00020374920287558198,
"loss": 2.3759,
"step": 80
},
{
"epoch": 0.03261460409494474,
"grad_norm": 2.737161874771118,
"learning_rate": 0.00020200995468164684,
"loss": 2.2692,
"step": 90
},
{
"epoch": 0.03623844899438304,
"grad_norm": 4.560957431793213,
"learning_rate": 0.00019978833994094855,
"loss": 2.1869,
"step": 100
},
{
"epoch": 0.03623844899438304,
"eval_loss": 2.719433307647705,
"eval_runtime": 74.2575,
"eval_samples_per_second": 15.648,
"eval_steps_per_second": 3.919,
"step": 100
},
{
"epoch": 0.03986229389382134,
"grad_norm": 1.4899482727050781,
"learning_rate": 0.00019709518213718787,
"loss": 2.4953,
"step": 110
},
{
"epoch": 0.043486138793259646,
"grad_norm": 1.4345935583114624,
"learning_rate": 0.00019394360206446948,
"loss": 2.3727,
"step": 120
},
{
"epoch": 0.047109983692697956,
"grad_norm": 1.8238986730575562,
"learning_rate": 0.00019034895390411186,
"loss": 2.4042,
"step": 130
},
{
"epoch": 0.05073382859213626,
"grad_norm": 2.5793519020080566,
"learning_rate": 0.0001863287504206196,
"loss": 2.3586,
"step": 140
},
{
"epoch": 0.05435767349157456,
"grad_norm": 5.261249542236328,
"learning_rate": 0.00018190257764125471,
"loss": 2.2702,
"step": 150
},
{
"epoch": 0.05435767349157456,
"eval_loss": 2.756633758544922,
"eval_runtime": 74.3689,
"eval_samples_per_second": 15.625,
"eval_steps_per_second": 3.913,
"step": 150
},
{
"epoch": 0.057981518391012864,
"grad_norm": 1.6721464395523071,
"learning_rate": 0.00017709199943488106,
"loss": 2.5641,
"step": 160
},
{
"epoch": 0.061605363290451166,
"grad_norm": 2.1737399101257324,
"learning_rate": 0.00017192045245496238,
"loss": 2.2523,
"step": 170
},
{
"epoch": 0.06522920818988948,
"grad_norm": 1.9437196254730225,
"learning_rate": 0.00016641313195854277,
"loss": 2.3634,
"step": 180
},
{
"epoch": 0.06885305308932778,
"grad_norm": 2.254823923110962,
"learning_rate": 0.0001605968690574869,
"loss": 2.2732,
"step": 190
},
{
"epoch": 0.07247689798876608,
"grad_norm": 4.65241813659668,
"learning_rate": 0.0001545,
"loss": 2.2174,
"step": 200
},
{
"epoch": 0.07247689798876608,
"eval_loss": 2.4248692989349365,
"eval_runtime": 74.2004,
"eval_samples_per_second": 15.66,
"eval_steps_per_second": 3.922,
"step": 200
},
{
"epoch": 0.07610074288820438,
"grad_norm": 1.2383389472961426,
"learning_rate": 0.00014815222811927496,
"loss": 2.2746,
"step": 210
},
{
"epoch": 0.07972458778764269,
"grad_norm": 1.7436312437057495,
"learning_rate": 0.00014158447912183896,
"loss": 2.403,
"step": 220
},
{
"epoch": 0.08334843268708099,
"grad_norm": 1.832153081893921,
"learning_rate": 0.00013482875042061958,
"loss": 2.3621,
"step": 230
},
{
"epoch": 0.08697227758651929,
"grad_norm": 2.1388401985168457,
"learning_rate": 0.00012791795524676576,
"loss": 2.2489,
"step": 240
},
{
"epoch": 0.0905961224859576,
"grad_norm": 5.763660430908203,
"learning_rate": 0.00012088576229969385,
"loss": 2.17,
"step": 250
},
{
"epoch": 0.0905961224859576,
"eval_loss": 2.3869099617004395,
"eval_runtime": 73.9921,
"eval_samples_per_second": 15.704,
"eval_steps_per_second": 3.933,
"step": 250
},
{
"epoch": 0.09421996738539591,
"grad_norm": 1.1788461208343506,
"learning_rate": 0.0001137664317165683,
"loss": 2.3782,
"step": 260
},
{
"epoch": 0.09784381228483421,
"grad_norm": 1.594274640083313,
"learning_rate": 0.00010659464816035761,
"loss": 2.354,
"step": 270
},
{
"epoch": 0.10146765718427252,
"grad_norm": 2.020671844482422,
"learning_rate": 9.940535183964242e-05,
"loss": 2.3174,
"step": 280
},
{
"epoch": 0.10509150208371082,
"grad_norm": 1.9140796661376953,
"learning_rate": 9.22335682834317e-05,
"loss": 2.233,
"step": 290
},
{
"epoch": 0.10871534698314912,
"grad_norm": 5.550623416900635,
"learning_rate": 8.511423770030617e-05,
"loss": 2.2559,
"step": 300
},
{
"epoch": 0.10871534698314912,
"eval_loss": 2.2845678329467773,
"eval_runtime": 74.3833,
"eval_samples_per_second": 15.622,
"eval_steps_per_second": 3.912,
"step": 300
},
{
"epoch": 0.11233919188258742,
"grad_norm": 1.1478153467178345,
"learning_rate": 7.808204475323423e-05,
"loss": 2.2516,
"step": 310
},
{
"epoch": 0.11596303678202573,
"grad_norm": 1.3297686576843262,
"learning_rate": 7.117124957938042e-05,
"loss": 2.2543,
"step": 320
},
{
"epoch": 0.11958688168146403,
"grad_norm": 1.892397403717041,
"learning_rate": 6.441552087816105e-05,
"loss": 2.2304,
"step": 330
},
{
"epoch": 0.12321072658090233,
"grad_norm": 2.205890417098999,
"learning_rate": 5.784777188072502e-05,
"loss": 2.0618,
"step": 340
},
{
"epoch": 0.12683457148034064,
"grad_norm": 4.548376083374023,
"learning_rate": 5.150000000000002e-05,
"loss": 2.2087,
"step": 350
},
{
"epoch": 0.12683457148034064,
"eval_loss": 2.2185325622558594,
"eval_runtime": 74.4964,
"eval_samples_per_second": 15.598,
"eval_steps_per_second": 3.906,
"step": 350
},
{
"epoch": 0.13045841637977895,
"grad_norm": 1.0791500806808472,
"learning_rate": 4.540313094251309e-05,
"loss": 2.3101,
"step": 360
},
{
"epoch": 0.13408226127921724,
"grad_norm": 1.6470967531204224,
"learning_rate": 3.958686804145719e-05,
"loss": 2.1884,
"step": 370
},
{
"epoch": 0.13770610617865556,
"grad_norm": 1.761602759361267,
"learning_rate": 3.4079547545037634e-05,
"loss": 2.2748,
"step": 380
},
{
"epoch": 0.14132995107809385,
"grad_norm": 2.274669647216797,
"learning_rate": 2.8908000565118947e-05,
"loss": 2.1368,
"step": 390
},
{
"epoch": 0.14495379597753216,
"grad_norm": 4.047662734985352,
"learning_rate": 2.4097422358745275e-05,
"loss": 2.2695,
"step": 400
},
{
"epoch": 0.14495379597753216,
"eval_loss": 2.1836745738983154,
"eval_runtime": 74.0192,
"eval_samples_per_second": 15.699,
"eval_steps_per_second": 3.931,
"step": 400
},
{
"epoch": 0.14857764087697048,
"grad_norm": 1.494776725769043,
"learning_rate": 1.9671249579380422e-05,
"loss": 2.2505,
"step": 410
},
{
"epoch": 0.15220148577640877,
"grad_norm": 1.513637661933899,
"learning_rate": 1.5651046095888127e-05,
"loss": 2.2135,
"step": 420
},
{
"epoch": 0.15582533067584708,
"grad_norm": 1.749608039855957,
"learning_rate": 1.205639793553052e-05,
"loss": 2.254,
"step": 430
},
{
"epoch": 0.15944917557528537,
"grad_norm": 1.8985235691070557,
"learning_rate": 8.904817862812098e-06,
"loss": 2.1949,
"step": 440
},
{
"epoch": 0.1630730204747237,
"grad_norm": 5.7069902420043945,
"learning_rate": 6.211660059051443e-06,
"loss": 2.0524,
"step": 450
},
{
"epoch": 0.1630730204747237,
"eval_loss": 2.1746580600738525,
"eval_runtime": 74.2011,
"eval_samples_per_second": 15.66,
"eval_steps_per_second": 3.922,
"step": 450
},
{
"epoch": 0.16669686537416198,
"grad_norm": 1.3536880016326904,
"learning_rate": 3.990045318353154e-06,
"loss": 2.2023,
"step": 460
},
{
"epoch": 0.1703207102736003,
"grad_norm": 1.4728455543518066,
"learning_rate": 2.250797124418014e-06,
"loss": 2.2914,
"step": 470
},
{
"epoch": 0.17394455517303858,
"grad_norm": 1.5814944505691528,
"learning_rate": 1.0023889196182526e-06,
"loss": 2.2179,
"step": 480
},
{
"epoch": 0.1775684000724769,
"grad_norm": 1.8983994722366333,
"learning_rate": 2.5090282323810766e-07,
"loss": 2.1582,
"step": 490
},
{
"epoch": 0.1811922449719152,
"grad_norm": 7.900518417358398,
"learning_rate": 0.0,
"loss": 2.0353,
"step": 500
},
{
"epoch": 0.1811922449719152,
"eval_loss": 2.1751492023468018,
"eval_runtime": 74.1689,
"eval_samples_per_second": 15.667,
"eval_steps_per_second": 3.923,
"step": 500
}
],
"logging_steps": 10,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 1
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.739340792777933e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}