{ "best_metric": 2.1746580600738525, "best_model_checkpoint": "miner_id_24/checkpoint-450", "epoch": 0.1811922449719152, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003623844899438304, "eval_loss": 3.253896474838257, "eval_runtime": 74.5486, "eval_samples_per_second": 15.587, "eval_steps_per_second": 3.903, "step": 1 }, { "epoch": 0.003623844899438304, "grad_norm": 2.0134332180023193, "learning_rate": 4.12e-05, "loss": 2.878, "step": 10 }, { "epoch": 0.007247689798876608, "grad_norm": 1.886479377746582, "learning_rate": 8.24e-05, "loss": 2.5585, "step": 20 }, { "epoch": 0.010871534698314912, "grad_norm": 1.836480975151062, "learning_rate": 0.0001236, "loss": 2.4347, "step": 30 }, { "epoch": 0.014495379597753216, "grad_norm": 2.0984137058258057, "learning_rate": 0.0001648, "loss": 2.3679, "step": 40 }, { "epoch": 0.01811922449719152, "grad_norm": 4.840346813201904, "learning_rate": 0.000206, "loss": 2.3008, "step": 50 }, { "epoch": 0.01811922449719152, "eval_loss": 2.5439529418945312, "eval_runtime": 74.1639, "eval_samples_per_second": 15.668, "eval_steps_per_second": 3.924, "step": 50 }, { "epoch": 0.021743069396629823, "grad_norm": 1.3696887493133545, "learning_rate": 0.0002057490971767619, "loss": 2.384, "step": 60 }, { "epoch": 0.02536691429606813, "grad_norm": 1.702624797821045, "learning_rate": 0.00020499761108038175, "loss": 2.4112, "step": 70 }, { "epoch": 0.028990759195506432, "grad_norm": 1.841941237449646, "learning_rate": 0.00020374920287558198, "loss": 2.3759, "step": 80 }, { "epoch": 0.03261460409494474, "grad_norm": 2.737161874771118, "learning_rate": 0.00020200995468164684, "loss": 2.2692, "step": 90 }, { "epoch": 0.03623844899438304, "grad_norm": 4.560957431793213, "learning_rate": 0.00019978833994094855, "loss": 2.1869, "step": 100 }, { "epoch": 0.03623844899438304, "eval_loss": 2.719433307647705, "eval_runtime": 74.2575, "eval_samples_per_second": 15.648, "eval_steps_per_second": 3.919, "step": 100 }, { "epoch": 0.03986229389382134, "grad_norm": 1.4899482727050781, "learning_rate": 0.00019709518213718787, "loss": 2.4953, "step": 110 }, { "epoch": 0.043486138793259646, "grad_norm": 1.4345935583114624, "learning_rate": 0.00019394360206446948, "loss": 2.3727, "step": 120 }, { "epoch": 0.047109983692697956, "grad_norm": 1.8238986730575562, "learning_rate": 0.00019034895390411186, "loss": 2.4042, "step": 130 }, { "epoch": 0.05073382859213626, "grad_norm": 2.5793519020080566, "learning_rate": 0.0001863287504206196, "loss": 2.3586, "step": 140 }, { "epoch": 0.05435767349157456, "grad_norm": 5.261249542236328, "learning_rate": 0.00018190257764125471, "loss": 2.2702, "step": 150 }, { "epoch": 0.05435767349157456, "eval_loss": 2.756633758544922, "eval_runtime": 74.3689, "eval_samples_per_second": 15.625, "eval_steps_per_second": 3.913, "step": 150 }, { "epoch": 0.057981518391012864, "grad_norm": 1.6721464395523071, "learning_rate": 0.00017709199943488106, "loss": 2.5641, "step": 160 }, { "epoch": 0.061605363290451166, "grad_norm": 2.1737399101257324, "learning_rate": 0.00017192045245496238, "loss": 2.2523, "step": 170 }, { "epoch": 0.06522920818988948, "grad_norm": 1.9437196254730225, "learning_rate": 0.00016641313195854277, "loss": 2.3634, "step": 180 }, { "epoch": 0.06885305308932778, "grad_norm": 2.254823923110962, "learning_rate": 0.0001605968690574869, "loss": 2.2732, "step": 190 }, { "epoch": 0.07247689798876608, "grad_norm": 4.65241813659668, "learning_rate": 0.0001545, "loss": 2.2174, "step": 200 }, { "epoch": 0.07247689798876608, "eval_loss": 2.4248692989349365, "eval_runtime": 74.2004, "eval_samples_per_second": 15.66, "eval_steps_per_second": 3.922, "step": 200 }, { "epoch": 0.07610074288820438, "grad_norm": 1.2383389472961426, "learning_rate": 0.00014815222811927496, "loss": 2.2746, "step": 210 }, { "epoch": 0.07972458778764269, "grad_norm": 1.7436312437057495, "learning_rate": 0.00014158447912183896, "loss": 2.403, "step": 220 }, { "epoch": 0.08334843268708099, "grad_norm": 1.832153081893921, "learning_rate": 0.00013482875042061958, "loss": 2.3621, "step": 230 }, { "epoch": 0.08697227758651929, "grad_norm": 2.1388401985168457, "learning_rate": 0.00012791795524676576, "loss": 2.2489, "step": 240 }, { "epoch": 0.0905961224859576, "grad_norm": 5.763660430908203, "learning_rate": 0.00012088576229969385, "loss": 2.17, "step": 250 }, { "epoch": 0.0905961224859576, "eval_loss": 2.3869099617004395, "eval_runtime": 73.9921, "eval_samples_per_second": 15.704, "eval_steps_per_second": 3.933, "step": 250 }, { "epoch": 0.09421996738539591, "grad_norm": 1.1788461208343506, "learning_rate": 0.0001137664317165683, "loss": 2.3782, "step": 260 }, { "epoch": 0.09784381228483421, "grad_norm": 1.594274640083313, "learning_rate": 0.00010659464816035761, "loss": 2.354, "step": 270 }, { "epoch": 0.10146765718427252, "grad_norm": 2.020671844482422, "learning_rate": 9.940535183964242e-05, "loss": 2.3174, "step": 280 }, { "epoch": 0.10509150208371082, "grad_norm": 1.9140796661376953, "learning_rate": 9.22335682834317e-05, "loss": 2.233, "step": 290 }, { "epoch": 0.10871534698314912, "grad_norm": 5.550623416900635, "learning_rate": 8.511423770030617e-05, "loss": 2.2559, "step": 300 }, { "epoch": 0.10871534698314912, "eval_loss": 2.2845678329467773, "eval_runtime": 74.3833, "eval_samples_per_second": 15.622, "eval_steps_per_second": 3.912, "step": 300 }, { "epoch": 0.11233919188258742, "grad_norm": 1.1478153467178345, "learning_rate": 7.808204475323423e-05, "loss": 2.2516, "step": 310 }, { "epoch": 0.11596303678202573, "grad_norm": 1.3297686576843262, "learning_rate": 7.117124957938042e-05, "loss": 2.2543, "step": 320 }, { "epoch": 0.11958688168146403, "grad_norm": 1.892397403717041, "learning_rate": 6.441552087816105e-05, "loss": 2.2304, "step": 330 }, { "epoch": 0.12321072658090233, "grad_norm": 2.205890417098999, "learning_rate": 5.784777188072502e-05, "loss": 2.0618, "step": 340 }, { "epoch": 0.12683457148034064, "grad_norm": 4.548376083374023, "learning_rate": 5.150000000000002e-05, "loss": 2.2087, "step": 350 }, { "epoch": 0.12683457148034064, "eval_loss": 2.2185325622558594, "eval_runtime": 74.4964, "eval_samples_per_second": 15.598, "eval_steps_per_second": 3.906, "step": 350 }, { "epoch": 0.13045841637977895, "grad_norm": 1.0791500806808472, "learning_rate": 4.540313094251309e-05, "loss": 2.3101, "step": 360 }, { "epoch": 0.13408226127921724, "grad_norm": 1.6470967531204224, "learning_rate": 3.958686804145719e-05, "loss": 2.1884, "step": 370 }, { "epoch": 0.13770610617865556, "grad_norm": 1.761602759361267, "learning_rate": 3.4079547545037634e-05, "loss": 2.2748, "step": 380 }, { "epoch": 0.14132995107809385, "grad_norm": 2.274669647216797, "learning_rate": 2.8908000565118947e-05, "loss": 2.1368, "step": 390 }, { "epoch": 0.14495379597753216, "grad_norm": 4.047662734985352, "learning_rate": 2.4097422358745275e-05, "loss": 2.2695, "step": 400 }, { "epoch": 0.14495379597753216, "eval_loss": 2.1836745738983154, "eval_runtime": 74.0192, "eval_samples_per_second": 15.699, "eval_steps_per_second": 3.931, "step": 400 }, { "epoch": 0.14857764087697048, "grad_norm": 1.494776725769043, "learning_rate": 1.9671249579380422e-05, "loss": 2.2505, "step": 410 }, { "epoch": 0.15220148577640877, "grad_norm": 1.513637661933899, "learning_rate": 1.5651046095888127e-05, "loss": 2.2135, "step": 420 }, { "epoch": 0.15582533067584708, "grad_norm": 1.749608039855957, "learning_rate": 1.205639793553052e-05, "loss": 2.254, "step": 430 }, { "epoch": 0.15944917557528537, "grad_norm": 1.8985235691070557, "learning_rate": 8.904817862812098e-06, "loss": 2.1949, "step": 440 }, { "epoch": 0.1630730204747237, "grad_norm": 5.7069902420043945, "learning_rate": 6.211660059051443e-06, "loss": 2.0524, "step": 450 }, { "epoch": 0.1630730204747237, "eval_loss": 2.1746580600738525, "eval_runtime": 74.2011, "eval_samples_per_second": 15.66, "eval_steps_per_second": 3.922, "step": 450 }, { "epoch": 0.16669686537416198, "grad_norm": 1.3536880016326904, "learning_rate": 3.990045318353154e-06, "loss": 2.2023, "step": 460 }, { "epoch": 0.1703207102736003, "grad_norm": 1.4728455543518066, "learning_rate": 2.250797124418014e-06, "loss": 2.2914, "step": 470 }, { "epoch": 0.17394455517303858, "grad_norm": 1.5814944505691528, "learning_rate": 1.0023889196182526e-06, "loss": 2.2179, "step": 480 }, { "epoch": 0.1775684000724769, "grad_norm": 1.8983994722366333, "learning_rate": 2.5090282323810766e-07, "loss": 2.1582, "step": 490 }, { "epoch": 0.1811922449719152, "grad_norm": 7.900518417358398, "learning_rate": 0.0, "loss": 2.0353, "step": 500 }, { "epoch": 0.1811922449719152, "eval_loss": 2.1751492023468018, "eval_runtime": 74.1689, "eval_samples_per_second": 15.667, "eval_steps_per_second": 3.923, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 1 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.739340792777933e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }