{ "best_metric": 1.303171157836914, "best_model_checkpoint": "miner_id_24/checkpoint-400", "epoch": 0.055206680008281, "eval_steps": 50, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0001380167000207025, "eval_loss": 2.196531057357788, "eval_runtime": 194.725, "eval_samples_per_second": 15.668, "eval_steps_per_second": 3.918, "step": 1 }, { "epoch": 0.001380167000207025, "grad_norm": 1.853713035583496, "learning_rate": 4.0400000000000006e-05, "loss": 1.9047, "step": 10 }, { "epoch": 0.00276033400041405, "grad_norm": 2.0089311599731445, "learning_rate": 8.080000000000001e-05, "loss": 1.6083, "step": 20 }, { "epoch": 0.004140501000621075, "grad_norm": 1.6798384189605713, "learning_rate": 0.00012119999999999999, "loss": 1.4963, "step": 30 }, { "epoch": 0.0055206680008281, "grad_norm": 2.0184950828552246, "learning_rate": 0.00016160000000000002, "loss": 1.4766, "step": 40 }, { "epoch": 0.006900835001035125, "grad_norm": 2.2714614868164062, "learning_rate": 0.000202, "loss": 1.6766, "step": 50 }, { "epoch": 0.006900835001035125, "eval_loss": 1.5489071607589722, "eval_runtime": 194.1989, "eval_samples_per_second": 15.711, "eval_steps_per_second": 3.929, "step": 50 }, { "epoch": 0.00828100200124215, "grad_norm": 1.3468365669250488, "learning_rate": 0.00020175396907624226, "loss": 1.3174, "step": 60 }, { "epoch": 0.009661169001449175, "grad_norm": 1.405402421951294, "learning_rate": 0.0002010170749428986, "loss": 1.4007, "step": 70 }, { "epoch": 0.0110413360016562, "grad_norm": 1.607871174812317, "learning_rate": 0.00019979290767411438, "loss": 1.481, "step": 80 }, { "epoch": 0.012421503001863225, "grad_norm": 1.7328439950942993, "learning_rate": 0.0001980874312897702, "loss": 1.4341, "step": 90 }, { "epoch": 0.01380167000207025, "grad_norm": 1.9513015747070312, "learning_rate": 0.00019590895469937675, "loss": 1.5601, "step": 100 }, { "epoch": 0.01380167000207025, "eval_loss": 1.470448613166809, "eval_runtime": 194.0541, "eval_samples_per_second": 15.722, "eval_steps_per_second": 3.932, "step": 100 }, { "epoch": 0.015181837002277276, "grad_norm": 1.1693236827850342, "learning_rate": 0.0001932680912219027, "loss": 1.1405, "step": 110 }, { "epoch": 0.0165620040024843, "grad_norm": 1.6762423515319824, "learning_rate": 0.00019017770687875164, "loss": 1.4333, "step": 120 }, { "epoch": 0.017942171002691326, "grad_norm": 1.6374577283859253, "learning_rate": 0.000186652857711799, "loss": 1.4104, "step": 130 }, { "epoch": 0.01932233800289835, "grad_norm": 1.8635785579681396, "learning_rate": 0.00018271071643186968, "loss": 1.473, "step": 140 }, { "epoch": 0.020702505003105377, "grad_norm": 2.215712547302246, "learning_rate": 0.00017837048875501678, "loss": 1.591, "step": 150 }, { "epoch": 0.020702505003105377, "eval_loss": 1.4511727094650269, "eval_runtime": 194.0225, "eval_samples_per_second": 15.725, "eval_steps_per_second": 3.933, "step": 150 }, { "epoch": 0.0220826720033124, "grad_norm": 1.410007357597351, "learning_rate": 0.00017365331983420376, "loss": 1.2349, "step": 160 }, { "epoch": 0.023462839003519427, "grad_norm": 1.6017369031906128, "learning_rate": 0.0001685821912422447, "loss": 1.3995, "step": 170 }, { "epoch": 0.02484300600372645, "grad_norm": 1.6010783910751343, "learning_rate": 0.00016318180900789148, "loss": 1.3827, "step": 180 }, { "epoch": 0.026223173003933477, "grad_norm": 1.8045480251312256, "learning_rate": 0.00015747848325054544, "loss": 1.4918, "step": 190 }, { "epoch": 0.0276033400041405, "grad_norm": 1.9998146295547485, "learning_rate": 0.0001515, "loss": 1.4707, "step": 200 }, { "epoch": 0.0276033400041405, "eval_loss": 1.4300576448440552, "eval_runtime": 193.9168, "eval_samples_per_second": 15.734, "eval_steps_per_second": 3.935, "step": 200 }, { "epoch": 0.028983507004347524, "grad_norm": 1.4455054998397827, "learning_rate": 0.00014527548582569683, "loss": 1.1305, "step": 210 }, { "epoch": 0.03036367400455455, "grad_norm": 1.4689016342163086, "learning_rate": 0.00013883526593500714, "loss": 1.3947, "step": 220 }, { "epoch": 0.031743841004761575, "grad_norm": 1.685587763786316, "learning_rate": 0.0001322107164318697, "loss": 1.3514, "step": 230 }, { "epoch": 0.0331240080049686, "grad_norm": 1.8048762083053589, "learning_rate": 0.00012543411145556643, "loss": 1.4504, "step": 240 }, { "epoch": 0.03450417500517563, "grad_norm": 2.7551028728485107, "learning_rate": 0.00011853846594435998, "loss": 1.5985, "step": 250 }, { "epoch": 0.03450417500517563, "eval_loss": 1.4250391721725464, "eval_runtime": 194.211, "eval_samples_per_second": 15.71, "eval_steps_per_second": 3.929, "step": 250 }, { "epoch": 0.03588434200538265, "grad_norm": 1.2140876054763794, "learning_rate": 0.00011155737479003301, "loss": 1.1542, "step": 260 }, { "epoch": 0.037264509005589676, "grad_norm": 1.2436723709106445, "learning_rate": 0.00010452484916695262, "loss": 1.3743, "step": 270 }, { "epoch": 0.0386446760057967, "grad_norm": 1.5601242780685425, "learning_rate": 9.747515083304742e-05, "loss": 1.3219, "step": 280 }, { "epoch": 0.04002484300600373, "grad_norm": 1.6851481199264526, "learning_rate": 9.044262520996702e-05, "loss": 1.3735, "step": 290 }, { "epoch": 0.04140501000621075, "grad_norm": 1.7231396436691284, "learning_rate": 8.346153405564004e-05, "loss": 1.4879, "step": 300 }, { "epoch": 0.04140501000621075, "eval_loss": 1.3670674562454224, "eval_runtime": 194.5478, "eval_samples_per_second": 15.683, "eval_steps_per_second": 3.922, "step": 300 }, { "epoch": 0.04278517700641778, "grad_norm": 1.1776810884475708, "learning_rate": 7.656588854443357e-05, "loss": 1.0798, "step": 310 }, { "epoch": 0.0441653440066248, "grad_norm": 1.2887829542160034, "learning_rate": 6.978928356813031e-05, "loss": 1.3379, "step": 320 }, { "epoch": 0.045545511006831824, "grad_norm": 1.455325961112976, "learning_rate": 6.316473406499288e-05, "loss": 1.3066, "step": 330 }, { "epoch": 0.046925678007038854, "grad_norm": 1.5977486371994019, "learning_rate": 5.672451417430317e-05, "loss": 1.3588, "step": 340 }, { "epoch": 0.04830584500724588, "grad_norm": 1.935961127281189, "learning_rate": 5.050000000000002e-05, "loss": 1.4722, "step": 350 }, { "epoch": 0.04830584500724588, "eval_loss": 1.3290566205978394, "eval_runtime": 193.6171, "eval_samples_per_second": 15.758, "eval_steps_per_second": 3.941, "step": 350 }, { "epoch": 0.0496860120074529, "grad_norm": 1.0178029537200928, "learning_rate": 4.452151674945458e-05, "loss": 1.0677, "step": 360 }, { "epoch": 0.051066179007659925, "grad_norm": 1.2846441268920898, "learning_rate": 3.8818190992108515e-05, "loss": 1.3573, "step": 370 }, { "epoch": 0.052446346007866955, "grad_norm": 1.4894729852676392, "learning_rate": 3.3417808757755355e-05, "loss": 1.2733, "step": 380 }, { "epoch": 0.05382651300807398, "grad_norm": 1.49921452999115, "learning_rate": 2.8346680165796253e-05, "loss": 1.304, "step": 390 }, { "epoch": 0.055206680008281, "grad_norm": 2.0546510219573975, "learning_rate": 2.362951124498323e-05, "loss": 1.4644, "step": 400 }, { "epoch": 0.055206680008281, "eval_loss": 1.303171157836914, "eval_runtime": 194.2964, "eval_samples_per_second": 15.703, "eval_steps_per_second": 3.927, "step": 400 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.881983448947098e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }