{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.02543558438255119, "eval_steps": 9, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002543558438255119, "eval_loss": 5.598480701446533, "eval_runtime": 320.3837, "eval_samples_per_second": 20.669, "eval_steps_per_second": 2.584, "step": 1 }, { "epoch": 0.0007630675314765357, "grad_norm": 23.70137596130371, "learning_rate": 3e-05, "loss": 5.7078, "step": 3 }, { "epoch": 0.0015261350629530714, "grad_norm": 11.249475479125977, "learning_rate": 6e-05, "loss": 3.5538, "step": 6 }, { "epoch": 0.002289202594429607, "grad_norm": 7.258395671844482, "learning_rate": 9e-05, "loss": 2.0523, "step": 9 }, { "epoch": 0.002289202594429607, "eval_loss": 1.837152123451233, "eval_runtime": 321.8836, "eval_samples_per_second": 20.573, "eval_steps_per_second": 2.572, "step": 9 }, { "epoch": 0.003052270125906143, "grad_norm": 8.19688606262207, "learning_rate": 9.987820251299122e-05, "loss": 1.8409, "step": 12 }, { "epoch": 0.0038153376573826785, "grad_norm": 3.2697651386260986, "learning_rate": 9.924038765061042e-05, "loss": 1.7495, "step": 15 }, { "epoch": 0.004578405188859214, "grad_norm": 3.1090781688690186, "learning_rate": 9.806308479691595e-05, "loss": 1.7031, "step": 18 }, { "epoch": 0.004578405188859214, "eval_loss": 1.6593525409698486, "eval_runtime": 321.5875, "eval_samples_per_second": 20.592, "eval_steps_per_second": 2.575, "step": 18 }, { "epoch": 0.00534147272033575, "grad_norm": 2.3846840858459473, "learning_rate": 9.635919272833938e-05, "loss": 1.6711, "step": 21 }, { "epoch": 0.006104540251812286, "grad_norm": 1.8909103870391846, "learning_rate": 9.414737964294636e-05, "loss": 1.6372, "step": 24 }, { "epoch": 0.006867607783288821, "grad_norm": 3.047074794769287, "learning_rate": 9.145187862775209e-05, "loss": 1.6429, "step": 27 }, { "epoch": 0.006867607783288821, "eval_loss": 1.627792239189148, "eval_runtime": 321.6005, "eval_samples_per_second": 20.591, "eval_steps_per_second": 2.575, "step": 27 }, { "epoch": 0.007630675314765357, "grad_norm": 2.153074264526367, "learning_rate": 8.83022221559489e-05, "loss": 1.6103, "step": 30 }, { "epoch": 0.008393742846241892, "grad_norm": 1.5520834922790527, "learning_rate": 8.473291852294987e-05, "loss": 1.5756, "step": 33 }, { "epoch": 0.009156810377718427, "grad_norm": 1.3303474187850952, "learning_rate": 8.07830737662829e-05, "loss": 1.5947, "step": 36 }, { "epoch": 0.009156810377718427, "eval_loss": 1.596153974533081, "eval_runtime": 321.4648, "eval_samples_per_second": 20.599, "eval_steps_per_second": 2.576, "step": 36 }, { "epoch": 0.009919877909194963, "grad_norm": 1.5800137519836426, "learning_rate": 7.649596321166024e-05, "loss": 1.6058, "step": 39 }, { "epoch": 0.0106829454406715, "grad_norm": 1.3730175495147705, "learning_rate": 7.191855733945387e-05, "loss": 1.5432, "step": 42 }, { "epoch": 0.011446012972148036, "grad_norm": 1.8071743249893188, "learning_rate": 6.710100716628344e-05, "loss": 1.5758, "step": 45 }, { "epoch": 0.011446012972148036, "eval_loss": 1.5650079250335693, "eval_runtime": 321.4815, "eval_samples_per_second": 20.598, "eval_steps_per_second": 2.576, "step": 45 }, { "epoch": 0.012209080503624571, "grad_norm": 1.7138389348983765, "learning_rate": 6.209609477998338e-05, "loss": 1.4889, "step": 48 }, { "epoch": 0.012972148035101107, "grad_norm": 1.4016435146331787, "learning_rate": 5.695865504800327e-05, "loss": 1.5086, "step": 51 }, { "epoch": 0.013735215566577643, "grad_norm": 1.3536241054534912, "learning_rate": 5.174497483512506e-05, "loss": 1.5789, "step": 54 }, { "epoch": 0.013735215566577643, "eval_loss": 1.5569493770599365, "eval_runtime": 321.523, "eval_samples_per_second": 20.596, "eval_steps_per_second": 2.575, "step": 54 }, { "epoch": 0.014498283098054178, "grad_norm": 1.2225385904312134, "learning_rate": 4.6512176312793736e-05, "loss": 1.5789, "step": 57 }, { "epoch": 0.015261350629530714, "grad_norm": 1.0706034898757935, "learning_rate": 4.131759111665349e-05, "loss": 1.5548, "step": 60 }, { "epoch": 0.016024418161007248, "grad_norm": 0.9136539101600647, "learning_rate": 3.6218132209150045e-05, "loss": 1.5457, "step": 63 }, { "epoch": 0.016024418161007248, "eval_loss": 1.5255224704742432, "eval_runtime": 321.5153, "eval_samples_per_second": 20.596, "eval_steps_per_second": 2.575, "step": 63 }, { "epoch": 0.016787485692483783, "grad_norm": 1.1234185695648193, "learning_rate": 3.12696703292044e-05, "loss": 1.532, "step": 66 }, { "epoch": 0.01755055322396032, "grad_norm": 1.1647182703018188, "learning_rate": 2.6526421860705473e-05, "loss": 1.4708, "step": 69 }, { "epoch": 0.018313620755436855, "grad_norm": 1.3128389120101929, "learning_rate": 2.2040354826462668e-05, "loss": 1.5735, "step": 72 }, { "epoch": 0.018313620755436855, "eval_loss": 1.5191624164581299, "eval_runtime": 321.4932, "eval_samples_per_second": 20.598, "eval_steps_per_second": 2.575, "step": 72 }, { "epoch": 0.01907668828691339, "grad_norm": 1.3198413848876953, "learning_rate": 1.7860619515673033e-05, "loss": 1.5235, "step": 75 }, { "epoch": 0.019839755818389926, "grad_norm": 1.1952508687973022, "learning_rate": 1.4033009983067452e-05, "loss": 1.5038, "step": 78 }, { "epoch": 0.02060282334986646, "grad_norm": 1.0137096643447876, "learning_rate": 1.0599462319663905e-05, "loss": 1.4952, "step": 81 }, { "epoch": 0.02060282334986646, "eval_loss": 1.5108598470687866, "eval_runtime": 321.4625, "eval_samples_per_second": 20.6, "eval_steps_per_second": 2.576, "step": 81 }, { "epoch": 0.021365890881343, "grad_norm": 2.021695613861084, "learning_rate": 7.597595192178702e-06, "loss": 1.5211, "step": 84 }, { "epoch": 0.022128958412819536, "grad_norm": 0.9883065223693848, "learning_rate": 5.060297685041659e-06, "loss": 1.5691, "step": 87 }, { "epoch": 0.022892025944296072, "grad_norm": 1.4156526327133179, "learning_rate": 3.0153689607045845e-06, "loss": 1.498, "step": 90 }, { "epoch": 0.022892025944296072, "eval_loss": 1.5057381391525269, "eval_runtime": 321.4316, "eval_samples_per_second": 20.602, "eval_steps_per_second": 2.576, "step": 90 }, { "epoch": 0.023655093475772607, "grad_norm": 1.0483613014221191, "learning_rate": 1.4852136862001764e-06, "loss": 1.5108, "step": 93 }, { "epoch": 0.024418161007249143, "grad_norm": 1.0317051410675049, "learning_rate": 4.865965629214819e-07, "loss": 1.4549, "step": 96 }, { "epoch": 0.02518122853872568, "grad_norm": 0.9832442998886108, "learning_rate": 3.04586490452119e-08, "loss": 1.4761, "step": 99 }, { "epoch": 0.02518122853872568, "eval_loss": 1.5044512748718262, "eval_runtime": 321.63, "eval_samples_per_second": 20.589, "eval_steps_per_second": 2.574, "step": 99 } ], "logging_steps": 3, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 9, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.14368026066944e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }