{ "best_metric": 0.13189151883125305, "best_model_checkpoint": "miner_id_24/checkpoint-450", "epoch": 0.5437737901033171, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001087547580206634, "eval_loss": 2.6140365600585938, "eval_runtime": 22.7312, "eval_samples_per_second": 17.069, "eval_steps_per_second": 4.267, "step": 1 }, { "epoch": 0.010875475802066341, "grad_norm": 1.1718631982803345, "learning_rate": 4.0400000000000006e-05, "loss": 0.6852, "step": 10 }, { "epoch": 0.021750951604132682, "grad_norm": 1.6259856224060059, "learning_rate": 8.080000000000001e-05, "loss": 0.6464, "step": 20 }, { "epoch": 0.03262642740619902, "grad_norm": 2.4112484455108643, "learning_rate": 0.00012119999999999999, "loss": 0.4593, "step": 30 }, { "epoch": 0.043501903208265365, "grad_norm": 6.566432476043701, "learning_rate": 0.00016160000000000002, "loss": 0.546, "step": 40 }, { "epoch": 0.054377379010331704, "grad_norm": 4.432110786437988, "learning_rate": 0.000202, "loss": 0.465, "step": 50 }, { "epoch": 0.054377379010331704, "eval_loss": 0.5987022519111633, "eval_runtime": 22.7609, "eval_samples_per_second": 17.047, "eval_steps_per_second": 4.262, "step": 50 }, { "epoch": 0.06525285481239804, "grad_norm": 1.1767598390579224, "learning_rate": 0.00020175396907624226, "loss": 0.2883, "step": 60 }, { "epoch": 0.07612833061446438, "grad_norm": 0.9557681679725647, "learning_rate": 0.0002010170749428986, "loss": 0.2687, "step": 70 }, { "epoch": 0.08700380641653073, "grad_norm": 2.2727346420288086, "learning_rate": 0.00019979290767411438, "loss": 0.2226, "step": 80 }, { "epoch": 0.09787928221859707, "grad_norm": 3.207522392272949, "learning_rate": 0.0001980874312897702, "loss": 0.2887, "step": 90 }, { "epoch": 0.10875475802066341, "grad_norm": 5.689190864562988, "learning_rate": 0.00019590895469937675, "loss": 0.4137, "step": 100 }, { "epoch": 0.10875475802066341, "eval_loss": 0.5496706366539001, "eval_runtime": 22.733, "eval_samples_per_second": 17.068, "eval_steps_per_second": 4.267, "step": 100 }, { "epoch": 0.11963023382272975, "grad_norm": 0.8785877227783203, "learning_rate": 0.0001932680912219027, "loss": 0.2831, "step": 110 }, { "epoch": 0.13050570962479607, "grad_norm": 0.8429152965545654, "learning_rate": 0.00019017770687875164, "loss": 0.1949, "step": 120 }, { "epoch": 0.14138118542686243, "grad_norm": 1.1005014181137085, "learning_rate": 0.000186652857711799, "loss": 0.1827, "step": 130 }, { "epoch": 0.15225666122892875, "grad_norm": 4.498092174530029, "learning_rate": 0.00018271071643186968, "loss": 3.1838, "step": 140 }, { "epoch": 0.1631321370309951, "grad_norm": 12.54207706451416, "learning_rate": 0.00017837048875501678, "loss": 0.2994, "step": 150 }, { "epoch": 0.1631321370309951, "eval_loss": 0.4509264826774597, "eval_runtime": 22.7489, "eval_samples_per_second": 17.056, "eval_steps_per_second": 4.264, "step": 150 }, { "epoch": 0.17400761283306146, "grad_norm": 0.6588810682296753, "learning_rate": 0.00017365331983420376, "loss": 0.2452, "step": 160 }, { "epoch": 0.18488308863512778, "grad_norm": 0.8176196813583374, "learning_rate": 0.0001685821912422447, "loss": 0.1395, "step": 170 }, { "epoch": 0.19575856443719414, "grad_norm": 1.6196659803390503, "learning_rate": 0.00016318180900789148, "loss": 0.1752, "step": 180 }, { "epoch": 0.20663404023926046, "grad_norm": 1.6988027095794678, "learning_rate": 0.00015747848325054544, "loss": 0.272, "step": 190 }, { "epoch": 0.21750951604132682, "grad_norm": 2.5128302574157715, "learning_rate": 0.0001515, "loss": 0.3535, "step": 200 }, { "epoch": 0.21750951604132682, "eval_loss": 0.2948015630245209, "eval_runtime": 22.7652, "eval_samples_per_second": 17.044, "eval_steps_per_second": 4.261, "step": 200 }, { "epoch": 0.22838499184339314, "grad_norm": 0.5956371426582336, "learning_rate": 0.00014527548582569683, "loss": 0.211, "step": 210 }, { "epoch": 0.2392604676454595, "grad_norm": 0.7749199867248535, "learning_rate": 0.00013883526593500714, "loss": 0.1476, "step": 220 }, { "epoch": 0.2501359434475258, "grad_norm": 0.8212958574295044, "learning_rate": 0.0001322107164318697, "loss": 0.1348, "step": 230 }, { "epoch": 0.26101141924959215, "grad_norm": 2.9552807807922363, "learning_rate": 0.00012543411145556643, "loss": 0.2418, "step": 240 }, { "epoch": 0.27188689505165853, "grad_norm": 5.775247097015381, "learning_rate": 0.00011853846594435998, "loss": 0.203, "step": 250 }, { "epoch": 0.27188689505165853, "eval_loss": 0.27232831716537476, "eval_runtime": 22.8141, "eval_samples_per_second": 17.007, "eval_steps_per_second": 4.252, "step": 250 }, { "epoch": 0.28276237085372485, "grad_norm": 0.527335524559021, "learning_rate": 0.00011155737479003301, "loss": 0.2042, "step": 260 }, { "epoch": 0.2936378466557912, "grad_norm": 0.4495840072631836, "learning_rate": 0.00010452484916695262, "loss": 0.1778, "step": 270 }, { "epoch": 0.3045133224578575, "grad_norm": 0.55239337682724, "learning_rate": 9.747515083304742e-05, "loss": 0.0975, "step": 280 }, { "epoch": 0.3153887982599239, "grad_norm": 2.169278144836426, "learning_rate": 9.044262520996702e-05, "loss": 0.2378, "step": 290 }, { "epoch": 0.3262642740619902, "grad_norm": 6.8742995262146, "learning_rate": 8.346153405564004e-05, "loss": 0.4967, "step": 300 }, { "epoch": 0.3262642740619902, "eval_loss": 0.24104903638362885, "eval_runtime": 22.7429, "eval_samples_per_second": 17.06, "eval_steps_per_second": 4.265, "step": 300 }, { "epoch": 0.33713974986405654, "grad_norm": 0.5075086355209351, "learning_rate": 7.656588854443357e-05, "loss": 0.1705, "step": 310 }, { "epoch": 0.3480152256661229, "grad_norm": 0.6775454878807068, "learning_rate": 6.978928356813031e-05, "loss": 0.1091, "step": 320 }, { "epoch": 0.35889070146818924, "grad_norm": 4.688424587249756, "learning_rate": 6.316473406499288e-05, "loss": 0.1494, "step": 330 }, { "epoch": 0.36976617727025557, "grad_norm": 7.4858808517456055, "learning_rate": 5.672451417430317e-05, "loss": 0.1966, "step": 340 }, { "epoch": 0.3806416530723219, "grad_norm": 5.136172771453857, "learning_rate": 5.050000000000002e-05, "loss": 0.232, "step": 350 }, { "epoch": 0.3806416530723219, "eval_loss": 0.20996609330177307, "eval_runtime": 23.069, "eval_samples_per_second": 16.819, "eval_steps_per_second": 4.205, "step": 350 }, { "epoch": 0.3915171288743883, "grad_norm": 0.5805565714836121, "learning_rate": 4.452151674945458e-05, "loss": 0.1389, "step": 360 }, { "epoch": 0.4023926046764546, "grad_norm": 1.093970775604248, "learning_rate": 3.8818190992108515e-05, "loss": 0.1285, "step": 370 }, { "epoch": 0.4132680804785209, "grad_norm": 0.5711682438850403, "learning_rate": 3.3417808757755355e-05, "loss": 0.109, "step": 380 }, { "epoch": 0.42414355628058725, "grad_norm": 1.3308712244033813, "learning_rate": 2.8346680165796253e-05, "loss": 0.1234, "step": 390 }, { "epoch": 0.43501903208265363, "grad_norm": 3.603949546813965, "learning_rate": 2.362951124498323e-05, "loss": 0.1905, "step": 400 }, { "epoch": 0.43501903208265363, "eval_loss": 0.1441967934370041, "eval_runtime": 22.5486, "eval_samples_per_second": 17.207, "eval_steps_per_second": 4.302, "step": 400 }, { "epoch": 0.44589450788471996, "grad_norm": 0.4847097098827362, "learning_rate": 1.928928356813032e-05, "loss": 0.1315, "step": 410 }, { "epoch": 0.4567699836867863, "grad_norm": 0.6296175122261047, "learning_rate": 1.5347142288200977e-05, "loss": 0.1213, "step": 420 }, { "epoch": 0.4676454594888526, "grad_norm": 1.3326748609542847, "learning_rate": 1.1822293121248375e-05, "loss": 0.1011, "step": 430 }, { "epoch": 0.478520935290919, "grad_norm": 0.9956807494163513, "learning_rate": 8.731908778097302e-06, "loss": 0.1667, "step": 440 }, { "epoch": 0.4893964110929853, "grad_norm": 1.1891385316848755, "learning_rate": 6.09104530062326e-06, "loss": 0.2123, "step": 450 }, { "epoch": 0.4893964110929853, "eval_loss": 0.13189151883125305, "eval_runtime": 22.5505, "eval_samples_per_second": 17.206, "eval_steps_per_second": 4.301, "step": 450 }, { "epoch": 0.5002718868950516, "grad_norm": 0.4370393455028534, "learning_rate": 3.912568710229791e-06, "loss": 0.1239, "step": 460 }, { "epoch": 0.511147362697118, "grad_norm": 0.5538854002952576, "learning_rate": 2.2070923258856255e-06, "loss": 0.0938, "step": 470 }, { "epoch": 0.5220228384991843, "grad_norm": 1.1034691333770752, "learning_rate": 9.829250571013935e-07, "loss": 0.1054, "step": 480 }, { "epoch": 0.5328983143012507, "grad_norm": 1.444915533065796, "learning_rate": 2.4603092375775605e-07, "loss": 0.1426, "step": 490 }, { "epoch": 0.5437737901033171, "grad_norm": 3.916834831237793, "learning_rate": 0.0, "loss": 0.2352, "step": 500 }, { "epoch": 0.5437737901033171, "eval_loss": 0.13204382359981537, "eval_runtime": 22.7776, "eval_samples_per_second": 17.034, "eval_steps_per_second": 4.259, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 1 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.002332739076096e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }