{ "best_metric": 1.3987419605255127, "best_model_checkpoint": "miner_id_24/checkpoint-450", "epoch": 0.28425241614553726, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005685048322910744, "eval_loss": 1.9435316324234009, "eval_runtime": 68.5626, "eval_samples_per_second": 10.808, "eval_steps_per_second": 2.713, "step": 1 }, { "epoch": 0.005685048322910745, "grad_norm": 3.069612503051758, "learning_rate": 4.08e-05, "loss": 3.128, "step": 10 }, { "epoch": 0.01137009664582149, "grad_norm": 4.051701068878174, "learning_rate": 8.16e-05, "loss": 3.3624, "step": 20 }, { "epoch": 0.017055144968732235, "grad_norm": 3.853865146636963, "learning_rate": 0.0001224, "loss": 2.9052, "step": 30 }, { "epoch": 0.02274019329164298, "grad_norm": 4.710514545440674, "learning_rate": 0.0001632, "loss": 2.944, "step": 40 }, { "epoch": 0.028425241614553724, "grad_norm": 40.267189025878906, "learning_rate": 0.000204, "loss": 2.8684, "step": 50 }, { "epoch": 0.028425241614553724, "eval_loss": 1.691328763961792, "eval_runtime": 68.2464, "eval_samples_per_second": 10.858, "eval_steps_per_second": 2.725, "step": 50 }, { "epoch": 0.03411028993746447, "grad_norm": 3.3239216804504395, "learning_rate": 0.00020375153312650207, "loss": 2.5603, "step": 60 }, { "epoch": 0.039795338260375214, "grad_norm": 3.307145595550537, "learning_rate": 0.00020300734301164017, "loss": 3.1017, "step": 70 }, { "epoch": 0.04548038658328596, "grad_norm": 4.276277542114258, "learning_rate": 0.00020177105527484818, "loss": 3.3892, "step": 80 }, { "epoch": 0.051165434906196704, "grad_norm": 6.920476913452148, "learning_rate": 0.00020004869298570854, "loss": 2.7897, "step": 90 }, { "epoch": 0.05685048322910745, "grad_norm": 21.554874420166016, "learning_rate": 0.00019784864732016265, "loss": 2.6079, "step": 100 }, { "epoch": 0.05685048322910745, "eval_loss": 1.8583825826644897, "eval_runtime": 68.2278, "eval_samples_per_second": 10.861, "eval_steps_per_second": 2.726, "step": 100 }, { "epoch": 0.06253553155201819, "grad_norm": 3.7841274738311768, "learning_rate": 0.00019518163667954527, "loss": 2.6824, "step": 110 }, { "epoch": 0.06822057987492894, "grad_norm": 3.257988452911377, "learning_rate": 0.00019206065447161056, "loss": 2.777, "step": 120 }, { "epoch": 0.07390562819783968, "grad_norm": 4.051051616668701, "learning_rate": 0.00018850090580795544, "loss": 3.6913, "step": 130 }, { "epoch": 0.07959067652075043, "grad_norm": 6.902204513549805, "learning_rate": 0.00018451973342624464, "loss": 3.2365, "step": 140 }, { "epoch": 0.08527572484366117, "grad_norm": 47.04884719848633, "learning_rate": 0.00018013653319813575, "loss": 2.7994, "step": 150 }, { "epoch": 0.08527572484366117, "eval_loss": 1.5659745931625366, "eval_runtime": 68.2701, "eval_samples_per_second": 10.854, "eval_steps_per_second": 2.724, "step": 150 }, { "epoch": 0.09096077316657192, "grad_norm": 3.0065667629241943, "learning_rate": 0.0001753726596345424, "loss": 2.9537, "step": 160 }, { "epoch": 0.09664582148948266, "grad_norm": 3.099189519882202, "learning_rate": 0.00017025132184860355, "loss": 3.0517, "step": 170 }, { "epoch": 0.10233086981239341, "grad_norm": 3.6827144622802734, "learning_rate": 0.00016479747048321714, "loss": 3.5192, "step": 180 }, { "epoch": 0.10801591813530415, "grad_norm": 7.355824947357178, "learning_rate": 0.00015903767615401616, "loss": 3.0137, "step": 190 }, { "epoch": 0.1137009664582149, "grad_norm": 14.326348304748535, "learning_rate": 0.000153, "loss": 2.8734, "step": 200 }, { "epoch": 0.1137009664582149, "eval_loss": 1.535672903060913, "eval_runtime": 67.9931, "eval_samples_per_second": 10.898, "eval_steps_per_second": 2.736, "step": 200 }, { "epoch": 0.11938601478112564, "grad_norm": 2.603611469268799, "learning_rate": 0.0001467138569724859, "loss": 2.4002, "step": 210 }, { "epoch": 0.12507106310403637, "grad_norm": 3.106489658355713, "learning_rate": 0.00014020987252842305, "loss": 3.1293, "step": 220 }, { "epoch": 0.13075611142694712, "grad_norm": 3.54360294342041, "learning_rate": 0.00013351973342624464, "loss": 3.3809, "step": 230 }, { "epoch": 0.13644115974985788, "grad_norm": 5.519043922424316, "learning_rate": 0.00012667603335116609, "loss": 2.7644, "step": 240 }, { "epoch": 0.14212620807276863, "grad_norm": 22.177762985229492, "learning_rate": 0.00011971211412202691, "loss": 2.7882, "step": 250 }, { "epoch": 0.14212620807276863, "eval_loss": 1.4961472749710083, "eval_runtime": 68.4935, "eval_samples_per_second": 10.819, "eval_steps_per_second": 2.716, "step": 250 }, { "epoch": 0.14781125639567935, "grad_norm": 2.693026304244995, "learning_rate": 0.00011266190325330066, "loss": 3.0899, "step": 260 }, { "epoch": 0.1534963047185901, "grad_norm": 7.191518306732178, "learning_rate": 0.00010555974866365511, "loss": 2.9899, "step": 270 }, { "epoch": 0.15918135304150086, "grad_norm": 4.4006028175354, "learning_rate": 9.844025133634492e-05, "loss": 3.0914, "step": 280 }, { "epoch": 0.1648664013644116, "grad_norm": 6.061286926269531, "learning_rate": 9.133809674669937e-05, "loss": 2.8825, "step": 290 }, { "epoch": 0.17055144968732233, "grad_norm": 18.585689544677734, "learning_rate": 8.428788587797311e-05, "loss": 2.1726, "step": 300 }, { "epoch": 0.17055144968732233, "eval_loss": 1.4905575513839722, "eval_runtime": 68.056, "eval_samples_per_second": 10.888, "eval_steps_per_second": 2.733, "step": 300 }, { "epoch": 0.17623649801023308, "grad_norm": 3.150554895401001, "learning_rate": 7.73239666488339e-05, "loss": 2.9069, "step": 310 }, { "epoch": 0.18192154633314384, "grad_norm": 3.780085802078247, "learning_rate": 7.048026657375537e-05, "loss": 3.1427, "step": 320 }, { "epoch": 0.1876065946560546, "grad_norm": 3.767456531524658, "learning_rate": 6.379012747157697e-05, "loss": 3.3117, "step": 330 }, { "epoch": 0.1932916429789653, "grad_norm": 8.545947074890137, "learning_rate": 5.7286143027514095e-05, "loss": 2.9684, "step": 340 }, { "epoch": 0.19897669130187606, "grad_norm": 14.986133575439453, "learning_rate": 5.100000000000002e-05, "loss": 2.9029, "step": 350 }, { "epoch": 0.19897669130187606, "eval_loss": 1.449243187904358, "eval_runtime": 68.6437, "eval_samples_per_second": 10.795, "eval_steps_per_second": 2.71, "step": 350 }, { "epoch": 0.20466173962478681, "grad_norm": 2.6051387786865234, "learning_rate": 4.496232384598384e-05, "loss": 2.7807, "step": 360 }, { "epoch": 0.21034678794769757, "grad_norm": 2.8948583602905273, "learning_rate": 3.9202529516782854e-05, "loss": 3.2851, "step": 370 }, { "epoch": 0.2160318362706083, "grad_norm": 4.121320724487305, "learning_rate": 3.374867815139649e-05, "loss": 3.2988, "step": 380 }, { "epoch": 0.22171688459351904, "grad_norm": 6.426261901855469, "learning_rate": 2.8627340365457602e-05, "loss": 2.8684, "step": 390 }, { "epoch": 0.2274019329164298, "grad_norm": 20.456893920898438, "learning_rate": 2.3863466801864254e-05, "loss": 2.2538, "step": 400 }, { "epoch": 0.2274019329164298, "eval_loss": 1.4112082719802856, "eval_runtime": 68.5395, "eval_samples_per_second": 10.811, "eval_steps_per_second": 2.714, "step": 400 }, { "epoch": 0.23308698123934055, "grad_norm": 3.0374252796173096, "learning_rate": 1.9480266573755372e-05, "loss": 2.4258, "step": 410 }, { "epoch": 0.23877202956225127, "grad_norm": 3.4767048358917236, "learning_rate": 1.5499094192044554e-05, "loss": 3.0519, "step": 420 }, { "epoch": 0.24445707788516202, "grad_norm": 4.919350624084473, "learning_rate": 1.1939345528389446e-05, "loss": 3.2507, "step": 430 }, { "epoch": 0.25014212620807275, "grad_norm": 7.929502487182617, "learning_rate": 8.818363320454701e-06, "loss": 2.8546, "step": 440 }, { "epoch": 0.2558271745309835, "grad_norm": 26.788230895996094, "learning_rate": 6.1513526798373514e-06, "loss": 2.4219, "step": 450 }, { "epoch": 0.2558271745309835, "eval_loss": 1.3987419605255127, "eval_runtime": 68.6583, "eval_samples_per_second": 10.793, "eval_steps_per_second": 2.709, "step": 450 }, { "epoch": 0.26151222285389425, "grad_norm": 2.4997353553771973, "learning_rate": 3.9513070142914725e-06, "loss": 2.8869, "step": 460 }, { "epoch": 0.26719727117680503, "grad_norm": 4.465632915496826, "learning_rate": 2.2289447251518195e-06, "loss": 3.2354, "step": 470 }, { "epoch": 0.27288231949971575, "grad_norm": 4.705733776092529, "learning_rate": 9.92656988359823e-07, "loss": 3.124, "step": 480 }, { "epoch": 0.2785673678226265, "grad_norm": 4.500922679901123, "learning_rate": 2.4846687349793185e-07, "loss": 2.7457, "step": 490 }, { "epoch": 0.28425241614553726, "grad_norm": 22.040611267089844, "learning_rate": 0.0, "loss": 2.8161, "step": 500 }, { "epoch": 0.28425241614553726, "eval_loss": 1.4063708782196045, "eval_runtime": 69.6065, "eval_samples_per_second": 10.646, "eval_steps_per_second": 2.672, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 1 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.5434051934486528e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }