{ "best_metric": 2.79083251953125, "best_model_checkpoint": "miner_id_24/checkpoint-450", "epoch": 0.026761819803746655, "eval_steps": 50, "global_step": 450, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 5.947071067499257e-05, "eval_loss": 3.244145631790161, "eval_runtime": 152.6741, "eval_samples_per_second": 46.373, "eval_steps_per_second": 11.593, "step": 1 }, { "epoch": 0.0005947071067499256, "grad_norm": 0.7733362317085266, "learning_rate": 4.3e-05, "loss": 3.1454, "step": 10 }, { "epoch": 0.0011894142134998512, "grad_norm": 0.8843149542808533, "learning_rate": 8.6e-05, "loss": 3.1051, "step": 20 }, { "epoch": 0.001784121320249777, "grad_norm": 1.0911622047424316, "learning_rate": 0.000129, "loss": 3.1398, "step": 30 }, { "epoch": 0.0023788284269997025, "grad_norm": 1.3399714231491089, "learning_rate": 0.000172, "loss": 3.0248, "step": 40 }, { "epoch": 0.0029735355337496285, "grad_norm": 2.3345842361450195, "learning_rate": 0.000215, "loss": 3.0162, "step": 50 }, { "epoch": 0.0029735355337496285, "eval_loss": 3.0040273666381836, "eval_runtime": 152.6861, "eval_samples_per_second": 46.37, "eval_steps_per_second": 11.592, "step": 50 }, { "epoch": 0.003568242640499554, "grad_norm": 0.7499136924743652, "learning_rate": 0.0002147381354029311, "loss": 2.9332, "step": 60 }, { "epoch": 0.004162949747249479, "grad_norm": 0.827118456363678, "learning_rate": 0.0002139538173897188, "loss": 2.9428, "step": 70 }, { "epoch": 0.004757656853999405, "grad_norm": 0.8964134454727173, "learning_rate": 0.0002126508670788841, "loss": 2.917, "step": 80 }, { "epoch": 0.0053523639607493305, "grad_norm": 1.1437286138534546, "learning_rate": 0.00021083563231336926, "loss": 2.9036, "step": 90 }, { "epoch": 0.005947071067499257, "grad_norm": 2.0146210193634033, "learning_rate": 0.00020851695673448515, "loss": 2.9321, "step": 100 }, { "epoch": 0.005947071067499257, "eval_loss": 2.9241762161254883, "eval_runtime": 152.9767, "eval_samples_per_second": 46.282, "eval_steps_per_second": 11.57, "step": 100 }, { "epoch": 0.006541778174249183, "grad_norm": 0.7368491888046265, "learning_rate": 0.00020570613669657956, "loss": 2.9719, "step": 110 }, { "epoch": 0.007136485280999108, "grad_norm": 0.8235340118408203, "learning_rate": 0.00020241686623233464, "loss": 2.9481, "step": 120 }, { "epoch": 0.007731192387749034, "grad_norm": 0.9127469062805176, "learning_rate": 0.00019866517033681577, "loss": 2.8795, "step": 130 }, { "epoch": 0.008325899494498959, "grad_norm": 1.1192858219146729, "learning_rate": 0.00019446932689530684, "loss": 2.8425, "step": 140 }, { "epoch": 0.008920606601248885, "grad_norm": 2.003065347671509, "learning_rate": 0.0001898497776352901, "loss": 2.8621, "step": 150 }, { "epoch": 0.008920606601248885, "eval_loss": 2.9108498096466064, "eval_runtime": 151.7844, "eval_samples_per_second": 46.645, "eval_steps_per_second": 11.661, "step": 150 }, { "epoch": 0.00951531370799881, "grad_norm": 0.6534820199012756, "learning_rate": 0.000184829028536405, "loss": 2.9198, "step": 160 }, { "epoch": 0.010110020814748736, "grad_norm": 0.7408804893493652, "learning_rate": 0.00017943154018357726, "loss": 2.9099, "step": 170 }, { "epoch": 0.010704727921498661, "grad_norm": 0.9117923378944397, "learning_rate": 0.00017368360859750824, "loss": 2.7631, "step": 180 }, { "epoch": 0.011299435028248588, "grad_norm": 0.9990629553794861, "learning_rate": 0.00016761323712310527, "loss": 2.7877, "step": 190 }, { "epoch": 0.011894142134998514, "grad_norm": 1.7992066144943237, "learning_rate": 0.00016125, "loss": 2.9707, "step": 200 }, { "epoch": 0.011894142134998514, "eval_loss": 2.8687682151794434, "eval_runtime": 151.6691, "eval_samples_per_second": 46.681, "eval_steps_per_second": 11.67, "step": 200 }, { "epoch": 0.012488849241748439, "grad_norm": 0.6635362505912781, "learning_rate": 0.0001546248982798258, "loss": 2.8439, "step": 210 }, { "epoch": 0.013083556348498365, "grad_norm": 0.7446403503417969, "learning_rate": 0.00014777020879221055, "loss": 2.7955, "step": 220 }, { "epoch": 0.01367826345524829, "grad_norm": 0.834824800491333, "learning_rate": 0.00014071932689530684, "loss": 2.8353, "step": 230 }, { "epoch": 0.014272970561998216, "grad_norm": 1.0048375129699707, "learning_rate": 0.00013350660377696428, "loss": 2.8452, "step": 240 }, { "epoch": 0.014867677668748141, "grad_norm": 1.9789893627166748, "learning_rate": 0.00012616717909919503, "loss": 2.8344, "step": 250 }, { "epoch": 0.014867677668748141, "eval_loss": 2.843684434890747, "eval_runtime": 156.5509, "eval_samples_per_second": 45.225, "eval_steps_per_second": 11.306, "step": 250 }, { "epoch": 0.015462384775498068, "grad_norm": 0.6162389516830444, "learning_rate": 0.00011873680980127275, "loss": 2.8784, "step": 260 }, { "epoch": 0.016057091882247992, "grad_norm": 0.7453039288520813, "learning_rate": 0.00011125169589551887, "loss": 2.8229, "step": 270 }, { "epoch": 0.016651798988997917, "grad_norm": 0.8477461338043213, "learning_rate": 0.00010374830410448118, "loss": 2.7984, "step": 280 }, { "epoch": 0.017246506095747845, "grad_norm": 0.9298999309539795, "learning_rate": 9.626319019872726e-05, "loss": 2.8242, "step": 290 }, { "epoch": 0.01784121320249777, "grad_norm": 1.780192494392395, "learning_rate": 8.883282090080499e-05, "loss": 2.806, "step": 300 }, { "epoch": 0.01784121320249777, "eval_loss": 2.81919264793396, "eval_runtime": 152.2132, "eval_samples_per_second": 46.514, "eval_steps_per_second": 11.628, "step": 300 }, { "epoch": 0.018435920309247695, "grad_norm": 0.6574707627296448, "learning_rate": 8.149339622303573e-05, "loss": 2.8579, "step": 310 }, { "epoch": 0.01903062741599762, "grad_norm": 0.7700655460357666, "learning_rate": 7.428067310469316e-05, "loss": 2.8334, "step": 320 }, { "epoch": 0.019625334522747548, "grad_norm": 0.9334675073623657, "learning_rate": 6.722979120778945e-05, "loss": 2.83, "step": 330 }, { "epoch": 0.020220041629497473, "grad_norm": 1.045379877090454, "learning_rate": 6.0375101720174165e-05, "loss": 2.7034, "step": 340 }, { "epoch": 0.020814748736247397, "grad_norm": 1.8645881414413452, "learning_rate": 5.3750000000000026e-05, "loss": 2.8473, "step": 350 }, { "epoch": 0.020814748736247397, "eval_loss": 2.807974100112915, "eval_runtime": 151.7178, "eval_samples_per_second": 46.666, "eval_steps_per_second": 11.666, "step": 350 }, { "epoch": 0.021409455842997322, "grad_norm": 0.6872610449790955, "learning_rate": 4.738676287689473e-05, "loss": 2.8492, "step": 360 }, { "epoch": 0.02200416294974725, "grad_norm": 0.7925031781196594, "learning_rate": 4.131639140249173e-05, "loss": 2.8009, "step": 370 }, { "epoch": 0.022598870056497175, "grad_norm": 0.9018983244895935, "learning_rate": 3.5568459816422774e-05, "loss": 2.8143, "step": 380 }, { "epoch": 0.0231935771632471, "grad_norm": 1.0778685808181763, "learning_rate": 3.017097146359502e-05, "loss": 2.7583, "step": 390 }, { "epoch": 0.023788284269997028, "grad_norm": 1.6764299869537354, "learning_rate": 2.5150222364709875e-05, "loss": 2.7238, "step": 400 }, { "epoch": 0.023788284269997028, "eval_loss": 2.7967422008514404, "eval_runtime": 152.1562, "eval_samples_per_second": 46.531, "eval_steps_per_second": 11.633, "step": 400 }, { "epoch": 0.024382991376746953, "grad_norm": 0.7588967084884644, "learning_rate": 2.053067310469316e-05, "loss": 2.855, "step": 410 }, { "epoch": 0.024977698483496878, "grad_norm": 0.7385247945785522, "learning_rate": 1.633482966318421e-05, "loss": 2.7904, "step": 420 }, { "epoch": 0.025572405590246802, "grad_norm": 0.8757428526878357, "learning_rate": 1.2583133767665349e-05, "loss": 2.7639, "step": 430 }, { "epoch": 0.02616711269699673, "grad_norm": 1.0848084688186646, "learning_rate": 9.293863303420395e-06, "loss": 2.7639, "step": 440 }, { "epoch": 0.026761819803746655, "grad_norm": 2.063558578491211, "learning_rate": 6.483043265514856e-06, "loss": 2.8008, "step": 450 }, { "epoch": 0.026761819803746655, "eval_loss": 2.79083251953125, "eval_runtime": 153.2642, "eval_samples_per_second": 46.195, "eval_steps_per_second": 11.549, "step": 450 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4744780492308480.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }