{ "best_metric": 0.8026256561279297, "best_model_checkpoint": "miner_id_24/checkpoint-400", "epoch": 0.09679370840895342, "eval_steps": 50, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00024198427102238354, "eval_loss": 1.3080251216888428, "eval_runtime": 53.1456, "eval_samples_per_second": 32.74, "eval_steps_per_second": 8.185, "step": 1 }, { "epoch": 0.0024198427102238356, "grad_norm": 1.103469967842102, "learning_rate": 4.2600000000000005e-05, "loss": 0.9286, "step": 10 }, { "epoch": 0.004839685420447671, "grad_norm": 1.7151211500167847, "learning_rate": 8.520000000000001e-05, "loss": 0.9736, "step": 20 }, { "epoch": 0.007259528130671506, "grad_norm": 1.8954715728759766, "learning_rate": 0.0001278, "loss": 1.108, "step": 30 }, { "epoch": 0.009679370840895343, "grad_norm": 2.3523519039154053, "learning_rate": 0.00017040000000000002, "loss": 1.1477, "step": 40 }, { "epoch": 0.012099213551119177, "grad_norm": 2.1743710041046143, "learning_rate": 0.000213, "loss": 1.0015, "step": 50 }, { "epoch": 0.012099213551119177, "eval_loss": 1.0154005289077759, "eval_runtime": 52.97, "eval_samples_per_second": 32.849, "eval_steps_per_second": 8.212, "step": 50 }, { "epoch": 0.014519056261343012, "grad_norm": 1.202251672744751, "learning_rate": 0.00021274057135267128, "loss": 0.8938, "step": 60 }, { "epoch": 0.01693889897156685, "grad_norm": 1.0863494873046875, "learning_rate": 0.00021196354932097723, "loss": 0.9626, "step": 70 }, { "epoch": 0.019358741681790685, "grad_norm": 1.537191390991211, "learning_rate": 0.0002106727194781503, "loss": 1.0011, "step": 80 }, { "epoch": 0.021778584392014518, "grad_norm": 2.1484382152557373, "learning_rate": 0.00020887437061743096, "loss": 1.1008, "step": 90 }, { "epoch": 0.024198427102238355, "grad_norm": 2.325395107269287, "learning_rate": 0.00020657726411369925, "loss": 0.9323, "step": 100 }, { "epoch": 0.024198427102238355, "eval_loss": 1.0085864067077637, "eval_runtime": 53.2168, "eval_samples_per_second": 32.696, "eval_steps_per_second": 8.174, "step": 100 }, { "epoch": 0.02661826981246219, "grad_norm": 0.9922317266464233, "learning_rate": 0.000203792591238937, "loss": 0.8588, "step": 110 }, { "epoch": 0.029038112522686024, "grad_norm": 1.1746575832366943, "learning_rate": 0.0002005339186394757, "loss": 0.8886, "step": 120 }, { "epoch": 0.03145795523290986, "grad_norm": 1.4047287702560425, "learning_rate": 0.00019681712224065936, "loss": 0.9517, "step": 130 }, { "epoch": 0.0338777979431337, "grad_norm": 2.049302816390991, "learning_rate": 0.0001926603099009319, "loss": 1.0665, "step": 140 }, { "epoch": 0.036297640653357534, "grad_norm": 1.6594562530517578, "learning_rate": 0.00018808373319217114, "loss": 0.8871, "step": 150 }, { "epoch": 0.036297640653357534, "eval_loss": 0.9642200469970703, "eval_runtime": 53.2251, "eval_samples_per_second": 32.691, "eval_steps_per_second": 8.173, "step": 150 }, { "epoch": 0.03871748336358137, "grad_norm": 0.8076631426811218, "learning_rate": 0.00018310968873606635, "loss": 0.7604, "step": 160 }, { "epoch": 0.0411373260738052, "grad_norm": 1.2448451519012451, "learning_rate": 0.0001777624095772184, "loss": 0.8971, "step": 170 }, { "epoch": 0.043557168784029036, "grad_norm": 1.4778797626495361, "learning_rate": 0.0001720679471221826, "loss": 0.9395, "step": 180 }, { "epoch": 0.04597701149425287, "grad_norm": 2.3277740478515625, "learning_rate": 0.00016605404421963453, "loss": 1.0046, "step": 190 }, { "epoch": 0.04839685420447671, "grad_norm": 2.7840893268585205, "learning_rate": 0.00015975, "loss": 0.9651, "step": 200 }, { "epoch": 0.04839685420447671, "eval_loss": 0.9314111471176147, "eval_runtime": 53.3426, "eval_samples_per_second": 32.619, "eval_steps_per_second": 8.155, "step": 200 }, { "epoch": 0.050816696914700546, "grad_norm": 0.8467873930931091, "learning_rate": 0.00015318652713303674, "loss": 0.7771, "step": 210 }, { "epoch": 0.05323653962492438, "grad_norm": 1.810876488685608, "learning_rate": 0.00014639560219879464, "loss": 0.8956, "step": 220 }, { "epoch": 0.05565638233514821, "grad_norm": 1.5156196355819702, "learning_rate": 0.0001394103099009319, "loss": 0.953, "step": 230 }, { "epoch": 0.05807622504537205, "grad_norm": 1.6678131818771362, "learning_rate": 0.0001322646818813646, "loss": 0.971, "step": 240 }, { "epoch": 0.060496067755595885, "grad_norm": 2.130657196044922, "learning_rate": 0.0001249935309215281, "loss": 0.8652, "step": 250 }, { "epoch": 0.060496067755595885, "eval_loss": 0.8826441168785095, "eval_runtime": 53.194, "eval_samples_per_second": 32.71, "eval_steps_per_second": 8.178, "step": 250 }, { "epoch": 0.06291591046581972, "grad_norm": 0.86822110414505, "learning_rate": 0.0001176322813380051, "loss": 0.7765, "step": 260 }, { "epoch": 0.06533575317604355, "grad_norm": 1.0380805730819702, "learning_rate": 0.00011021679639881638, "loss": 0.8417, "step": 270 }, { "epoch": 0.0677555958862674, "grad_norm": 1.1815681457519531, "learning_rate": 0.00010278320360118368, "loss": 0.8916, "step": 280 }, { "epoch": 0.07017543859649122, "grad_norm": 2.9040372371673584, "learning_rate": 9.536771866199493e-05, "loss": 0.8656, "step": 290 }, { "epoch": 0.07259528130671507, "grad_norm": 2.076068878173828, "learning_rate": 8.800646907847192e-05, "loss": 0.8418, "step": 300 }, { "epoch": 0.07259528130671507, "eval_loss": 0.8545441031455994, "eval_runtime": 53.1112, "eval_samples_per_second": 32.761, "eval_steps_per_second": 8.19, "step": 300 }, { "epoch": 0.0750151240169389, "grad_norm": 0.856940507888794, "learning_rate": 8.07353181186354e-05, "loss": 0.7956, "step": 310 }, { "epoch": 0.07743496672716274, "grad_norm": 0.9584810137748718, "learning_rate": 7.35896900990681e-05, "loss": 0.7721, "step": 320 }, { "epoch": 0.07985480943738657, "grad_norm": 1.5156158208847046, "learning_rate": 6.660439780120536e-05, "loss": 0.9232, "step": 330 }, { "epoch": 0.0822746521476104, "grad_norm": 2.322288990020752, "learning_rate": 5.981347286696324e-05, "loss": 0.8629, "step": 340 }, { "epoch": 0.08469449485783424, "grad_norm": 1.789493441581726, "learning_rate": 5.325000000000002e-05, "loss": 0.8331, "step": 350 }, { "epoch": 0.08469449485783424, "eval_loss": 0.8212170600891113, "eval_runtime": 53.0695, "eval_samples_per_second": 32.787, "eval_steps_per_second": 8.197, "step": 350 }, { "epoch": 0.08711433756805807, "grad_norm": 0.8615260720252991, "learning_rate": 4.6945955780365475e-05, "loss": 0.7184, "step": 360 }, { "epoch": 0.08953418027828192, "grad_norm": 1.1007622480392456, "learning_rate": 4.0932052877817393e-05, "loss": 0.8244, "step": 370 }, { "epoch": 0.09195402298850575, "grad_norm": 1.265352725982666, "learning_rate": 3.523759042278163e-05, "loss": 0.8621, "step": 380 }, { "epoch": 0.09437386569872959, "grad_norm": 1.663914680480957, "learning_rate": 2.989031126393367e-05, "loss": 0.8621, "step": 390 }, { "epoch": 0.09679370840895342, "grad_norm": 1.8117319345474243, "learning_rate": 2.4916266807828855e-05, "loss": 0.7855, "step": 400 }, { "epoch": 0.09679370840895342, "eval_loss": 0.8026256561279297, "eval_runtime": 53.2521, "eval_samples_per_second": 32.675, "eval_steps_per_second": 8.169, "step": 400 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.88932260102144e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }