{ "best_metric": 0.7168570756912231, "best_model_checkpoint": "miner_id_24/checkpoint-300", "epoch": 0.6511123168746609, "eval_steps": 150, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002170374389582203, "eval_loss": 3.8393607139587402, "eval_runtime": 52.8023, "eval_samples_per_second": 14.715, "eval_steps_per_second": 1.856, "step": 1 }, { "epoch": 0.02170374389582203, "grad_norm": 26.712209701538086, "learning_rate": 6e-06, "loss": 10.3362, "step": 10 }, { "epoch": 0.04340748779164406, "grad_norm": 38.7980842590332, "learning_rate": 1.2e-05, "loss": 12.0151, "step": 20 }, { "epoch": 0.06511123168746609, "grad_norm": 29.304401397705078, "learning_rate": 1.8e-05, "loss": 11.2638, "step": 30 }, { "epoch": 0.08681497558328811, "grad_norm": 39.74540710449219, "learning_rate": 2.4e-05, "loss": 9.229, "step": 40 }, { "epoch": 0.10851871947911014, "grad_norm": 65.20191955566406, "learning_rate": 3e-05, "loss": 6.7067, "step": 50 }, { "epoch": 0.13022246337493218, "grad_norm": 15.829483985900879, "learning_rate": 2.9996479470277262e-05, "loss": 5.6606, "step": 60 }, { "epoch": 0.1519262072707542, "grad_norm": 16.72377586364746, "learning_rate": 2.9985919533659653e-05, "loss": 4.3456, "step": 70 }, { "epoch": 0.17362995116657623, "grad_norm": 17.048402786254883, "learning_rate": 2.9968325147023267e-05, "loss": 3.5431, "step": 80 }, { "epoch": 0.19533369506239825, "grad_norm": 23.174785614013672, "learning_rate": 2.994370456924292e-05, "loss": 3.5592, "step": 90 }, { "epoch": 0.21703743895822028, "grad_norm": 63.94609832763672, "learning_rate": 2.9912069357315394e-05, "loss": 3.5889, "step": 100 }, { "epoch": 0.23874118285404233, "grad_norm": 17.54082679748535, "learning_rate": 2.9873434360934543e-05, "loss": 4.0375, "step": 110 }, { "epoch": 0.26044492674986436, "grad_norm": 13.922243118286133, "learning_rate": 2.9827817715520775e-05, "loss": 3.5151, "step": 120 }, { "epoch": 0.2821486706456864, "grad_norm": 18.978328704833984, "learning_rate": 2.977524083370823e-05, "loss": 3.1774, "step": 130 }, { "epoch": 0.3038524145415084, "grad_norm": 19.49057388305664, "learning_rate": 2.9715728395293587e-05, "loss": 3.2158, "step": 140 }, { "epoch": 0.32555615843733043, "grad_norm": 36.420654296875, "learning_rate": 2.96493083356513e-05, "loss": 3.1129, "step": 150 }, { "epoch": 0.32555615843733043, "eval_loss": 0.803920328617096, "eval_runtime": 53.8173, "eval_samples_per_second": 14.438, "eval_steps_per_second": 1.821, "step": 150 }, { "epoch": 0.34725990233315246, "grad_norm": 15.815438270568848, "learning_rate": 2.9576011832620583e-05, "loss": 3.6763, "step": 160 }, { "epoch": 0.3689636462289745, "grad_norm": 17.322349548339844, "learning_rate": 2.9495873291870436e-05, "loss": 3.2852, "step": 170 }, { "epoch": 0.3906673901247965, "grad_norm": 16.479698181152344, "learning_rate": 2.940893033074948e-05, "loss": 3.0177, "step": 180 }, { "epoch": 0.41237113402061853, "grad_norm": 20.874675750732422, "learning_rate": 2.9315223760628224e-05, "loss": 2.676, "step": 190 }, { "epoch": 0.43407487791644056, "grad_norm": 29.774669647216797, "learning_rate": 2.9214797567742036e-05, "loss": 3.227, "step": 200 }, { "epoch": 0.45577862181226264, "grad_norm": 14.984698295593262, "learning_rate": 2.9107698892543862e-05, "loss": 3.4, "step": 210 }, { "epoch": 0.47748236570808467, "grad_norm": 18.93448829650879, "learning_rate": 2.8993978007576263e-05, "loss": 2.9846, "step": 220 }, { "epoch": 0.4991861096039067, "grad_norm": 17.96265411376953, "learning_rate": 2.8873688293873336e-05, "loss": 3.0037, "step": 230 }, { "epoch": 0.5208898534997287, "grad_norm": 23.578723907470703, "learning_rate": 2.874688621590339e-05, "loss": 2.7363, "step": 240 }, { "epoch": 0.5425935973955507, "grad_norm": 39.76424026489258, "learning_rate": 2.861363129506436e-05, "loss": 3.1817, "step": 250 }, { "epoch": 0.5642973412913728, "grad_norm": 20.42775535583496, "learning_rate": 2.847398608174417e-05, "loss": 3.2541, "step": 260 }, { "epoch": 0.5860010851871947, "grad_norm": 16.566482543945312, "learning_rate": 2.832801612595937e-05, "loss": 2.8651, "step": 270 }, { "epoch": 0.6077048290830168, "grad_norm": 16.73906707763672, "learning_rate": 2.8175789946585697e-05, "loss": 2.8237, "step": 280 }, { "epoch": 0.6294085729788389, "grad_norm": 23.292736053466797, "learning_rate": 2.801737899919502e-05, "loss": 3.0393, "step": 290 }, { "epoch": 0.6511123168746609, "grad_norm": 38.29425048828125, "learning_rate": 2.7852857642513838e-05, "loss": 2.7109, "step": 300 }, { "epoch": 0.6511123168746609, "eval_loss": 0.7168570756912231, "eval_runtime": 53.8613, "eval_samples_per_second": 14.426, "eval_steps_per_second": 1.819, "step": 300 } ], "logging_steps": 10, "max_steps": 1500, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 150, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.206407486275584e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }