{ "best_metric": 2.38608717918396, "best_model_checkpoint": "miner_id_24/checkpoint-400", "epoch": 0.14106859460412627, "eval_steps": 50, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00035267148651031563, "eval_loss": 3.0103461742401123, "eval_runtime": 25.7577, "eval_samples_per_second": 46.355, "eval_steps_per_second": 11.608, "step": 1 }, { "epoch": 0.0035267148651031564, "grad_norm": 1.8814945220947266, "learning_rate": 4.02e-05, "loss": 1.7486, "step": 10 }, { "epoch": 0.007053429730206313, "grad_norm": 3.284661054611206, "learning_rate": 8.04e-05, "loss": 2.104, "step": 20 }, { "epoch": 0.01058014459530947, "grad_norm": 4.527420520782471, "learning_rate": 0.0001206, "loss": 2.6147, "step": 30 }, { "epoch": 0.014106859460412626, "grad_norm": 8.724098205566406, "learning_rate": 0.0001608, "loss": 2.973, "step": 40 }, { "epoch": 0.017633574325515784, "grad_norm": 15.409860610961914, "learning_rate": 0.000201, "loss": 3.0979, "step": 50 }, { "epoch": 0.017633574325515784, "eval_loss": 2.7572579383850098, "eval_runtime": 25.7728, "eval_samples_per_second": 46.328, "eval_steps_per_second": 11.601, "step": 50 }, { "epoch": 0.02116028919061894, "grad_norm": 2.0140380859375, "learning_rate": 0.00020075518705111234, "loss": 1.6315, "step": 60 }, { "epoch": 0.024687004055722096, "grad_norm": 2.247471332550049, "learning_rate": 0.00020002194090852784, "loss": 2.1592, "step": 70 }, { "epoch": 0.02821371892082525, "grad_norm": 5.211777210235596, "learning_rate": 0.00019880383387374748, "loss": 2.7279, "step": 80 }, { "epoch": 0.03174043378592841, "grad_norm": 6.06164026260376, "learning_rate": 0.00019710680044180106, "loss": 2.995, "step": 90 }, { "epoch": 0.03526714865103157, "grad_norm": 12.699899673461914, "learning_rate": 0.0001949391083889838, "loss": 3.0191, "step": 100 }, { "epoch": 0.03526714865103157, "eval_loss": 3.1388041973114014, "eval_runtime": 25.4478, "eval_samples_per_second": 46.92, "eval_steps_per_second": 11.75, "step": 100 }, { "epoch": 0.03879386351613472, "grad_norm": 1.716439127922058, "learning_rate": 0.00019231131849308138, "loss": 1.8341, "step": 110 }, { "epoch": 0.04232057838123788, "grad_norm": 2.2980763912200928, "learning_rate": 0.00018923623308232218, "loss": 2.0825, "step": 120 }, { "epoch": 0.045847293246341035, "grad_norm": 3.878023862838745, "learning_rate": 0.00018572883366372081, "loss": 2.7559, "step": 130 }, { "epoch": 0.04937400811144419, "grad_norm": 5.2749786376953125, "learning_rate": 0.00018180620793468224, "loss": 2.8715, "step": 140 }, { "epoch": 0.05290072297654735, "grad_norm": 15.11570930480957, "learning_rate": 0.00017748746653345728, "loss": 3.1419, "step": 150 }, { "epoch": 0.05290072297654735, "eval_loss": 2.7679033279418945, "eval_runtime": 25.974, "eval_samples_per_second": 45.969, "eval_steps_per_second": 11.512, "step": 150 }, { "epoch": 0.0564274378416505, "grad_norm": 1.4065930843353271, "learning_rate": 0.00017279364993403443, "loss": 1.9145, "step": 160 }, { "epoch": 0.05995415270675366, "grad_norm": 2.066331624984741, "learning_rate": 0.00016774762593906525, "loss": 1.9541, "step": 170 }, { "epoch": 0.06348086757185682, "grad_norm": 3.5042123794555664, "learning_rate": 0.00016237397827022866, "loss": 2.4669, "step": 180 }, { "epoch": 0.06700758243695998, "grad_norm": 5.192580223083496, "learning_rate": 0.00015669888679881007, "loss": 2.8639, "step": 190 }, { "epoch": 0.07053429730206313, "grad_norm": 13.965015411376953, "learning_rate": 0.00015075, "loss": 2.7112, "step": 200 }, { "epoch": 0.07053429730206313, "eval_loss": 2.5790367126464844, "eval_runtime": 25.986, "eval_samples_per_second": 45.948, "eval_steps_per_second": 11.506, "step": 200 }, { "epoch": 0.07406101216716629, "grad_norm": 1.4565879106521606, "learning_rate": 0.00014455630025230227, "loss": 1.8242, "step": 210 }, { "epoch": 0.07758772703226945, "grad_norm": 2.0483245849609375, "learning_rate": 0.00013814796263829918, "loss": 1.8938, "step": 220 }, { "epoch": 0.0811144418973726, "grad_norm": 3.366831064224243, "learning_rate": 0.00013155620793468223, "loss": 2.5577, "step": 230 }, { "epoch": 0.08464115676247576, "grad_norm": 4.877556324005127, "learning_rate": 0.0001248131505077666, "loss": 2.8751, "step": 240 }, { "epoch": 0.08816787162757891, "grad_norm": 7.277644157409668, "learning_rate": 0.00011795164185552652, "loss": 2.9683, "step": 250 }, { "epoch": 0.08816787162757891, "eval_loss": 2.5053224563598633, "eval_runtime": 25.9786, "eval_samples_per_second": 45.961, "eval_steps_per_second": 11.509, "step": 250 }, { "epoch": 0.09169458649268207, "grad_norm": 1.3616389036178589, "learning_rate": 0.00011100511055839919, "loss": 1.7079, "step": 260 }, { "epoch": 0.09522130135778523, "grad_norm": 2.048992156982422, "learning_rate": 0.00010400739941860137, "loss": 1.996, "step": 270 }, { "epoch": 0.09874801622288838, "grad_norm": 3.304049491882324, "learning_rate": 9.699260058139868e-05, "loss": 2.6002, "step": 280 }, { "epoch": 0.10227473108799154, "grad_norm": 5.327462196350098, "learning_rate": 8.999488944160085e-05, "loss": 2.904, "step": 290 }, { "epoch": 0.1058014459530947, "grad_norm": 13.41323184967041, "learning_rate": 8.30483581444735e-05, "loss": 3.0004, "step": 300 }, { "epoch": 0.1058014459530947, "eval_loss": 2.461601972579956, "eval_runtime": 25.8902, "eval_samples_per_second": 46.118, "eval_steps_per_second": 11.549, "step": 300 }, { "epoch": 0.10932816081819785, "grad_norm": 1.484745979309082, "learning_rate": 7.618684949223341e-05, "loss": 1.7311, "step": 310 }, { "epoch": 0.112854875683301, "grad_norm": 2.000025749206543, "learning_rate": 6.94437920653178e-05, "loss": 2.2476, "step": 320 }, { "epoch": 0.11638159054840416, "grad_norm": 3.111759901046753, "learning_rate": 6.285203736170084e-05, "loss": 2.6156, "step": 330 }, { "epoch": 0.11990830541350732, "grad_norm": 4.285041809082031, "learning_rate": 5.6443699747697714e-05, "loss": 2.7437, "step": 340 }, { "epoch": 0.12343502027861047, "grad_norm": 11.187140464782715, "learning_rate": 5.025000000000002e-05, "loss": 3.0227, "step": 350 }, { "epoch": 0.12343502027861047, "eval_loss": 2.405674695968628, "eval_runtime": 25.8716, "eval_samples_per_second": 46.151, "eval_steps_per_second": 11.557, "step": 350 }, { "epoch": 0.12696173514371364, "grad_norm": 1.2822436094284058, "learning_rate": 4.430111320118996e-05, "loss": 1.6962, "step": 360 }, { "epoch": 0.1304884500088168, "grad_norm": 2.262040853500366, "learning_rate": 3.862602172977134e-05, "loss": 2.0943, "step": 370 }, { "epoch": 0.13401516487391996, "grad_norm": 3.736957550048828, "learning_rate": 3.325237406093478e-05, "loss": 2.6009, "step": 380 }, { "epoch": 0.1375418797390231, "grad_norm": 4.0212321281433105, "learning_rate": 2.820635006596558e-05, "loss": 2.7693, "step": 390 }, { "epoch": 0.14106859460412627, "grad_norm": 14.131185531616211, "learning_rate": 2.351253346654272e-05, "loss": 3.0325, "step": 400 }, { "epoch": 0.14106859460412627, "eval_loss": 2.38608717918396, "eval_runtime": 25.8049, "eval_samples_per_second": 46.27, "eval_steps_per_second": 11.587, "step": 400 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4018810864533504.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }