{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.024752475247524754, "eval_steps": 13, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0004950495049504951, "grad_norm": 0.5099626779556274, "learning_rate": 5e-06, "loss": 2.5697, "step": 1 }, { "epoch": 0.0004950495049504951, "eval_loss": 2.622131824493408, "eval_runtime": 15.9426, "eval_samples_per_second": 53.379, "eval_steps_per_second": 26.721, "step": 1 }, { "epoch": 0.0009900990099009901, "grad_norm": 0.539216935634613, "learning_rate": 1e-05, "loss": 2.6019, "step": 2 }, { "epoch": 0.0014851485148514852, "grad_norm": 1.0987958908081055, "learning_rate": 1.5e-05, "loss": 2.5236, "step": 3 }, { "epoch": 0.0019801980198019802, "grad_norm": 0.9217115640640259, "learning_rate": 2e-05, "loss": 2.682, "step": 4 }, { "epoch": 0.0024752475247524753, "grad_norm": 0.7861433625221252, "learning_rate": 2.5e-05, "loss": 2.3686, "step": 5 }, { "epoch": 0.0029702970297029703, "grad_norm": 0.9432778358459473, "learning_rate": 3e-05, "loss": 2.3954, "step": 6 }, { "epoch": 0.0034653465346534654, "grad_norm": 0.6162946820259094, "learning_rate": 3.5e-05, "loss": 2.8127, "step": 7 }, { "epoch": 0.0039603960396039604, "grad_norm": 0.671248197555542, "learning_rate": 4e-05, "loss": 2.5454, "step": 8 }, { "epoch": 0.004455445544554455, "grad_norm": 0.7792280912399292, "learning_rate": 4.5e-05, "loss": 2.5155, "step": 9 }, { "epoch": 0.0049504950495049506, "grad_norm": 0.5344467163085938, "learning_rate": 5e-05, "loss": 2.6138, "step": 10 }, { "epoch": 0.005445544554455445, "grad_norm": 0.8850129246711731, "learning_rate": 4.99229333433282e-05, "loss": 2.5492, "step": 11 }, { "epoch": 0.005940594059405941, "grad_norm": 0.8699153065681458, "learning_rate": 4.9692208514878444e-05, "loss": 2.7188, "step": 12 }, { "epoch": 0.006435643564356435, "grad_norm": 0.7159351110458374, "learning_rate": 4.9309248009941914e-05, "loss": 2.7603, "step": 13 }, { "epoch": 0.006435643564356435, "eval_loss": 2.6000754833221436, "eval_runtime": 14.7393, "eval_samples_per_second": 57.737, "eval_steps_per_second": 28.902, "step": 13 }, { "epoch": 0.006930693069306931, "grad_norm": 0.7229506969451904, "learning_rate": 4.877641290737884e-05, "loss": 2.5148, "step": 14 }, { "epoch": 0.007425742574257425, "grad_norm": 0.7777081727981567, "learning_rate": 4.8096988312782174e-05, "loss": 2.716, "step": 15 }, { "epoch": 0.007920792079207921, "grad_norm": 0.709530770778656, "learning_rate": 4.72751631047092e-05, "loss": 2.6661, "step": 16 }, { "epoch": 0.008415841584158416, "grad_norm": 0.6867318749427795, "learning_rate": 4.6316004108852305e-05, "loss": 2.6155, "step": 17 }, { "epoch": 0.00891089108910891, "grad_norm": 0.8236218690872192, "learning_rate": 4.522542485937369e-05, "loss": 2.5981, "step": 18 }, { "epoch": 0.009405940594059406, "grad_norm": 0.6285402774810791, "learning_rate": 4.401014914000078e-05, "loss": 2.6279, "step": 19 }, { "epoch": 0.009900990099009901, "grad_norm": 0.6804826259613037, "learning_rate": 4.267766952966369e-05, "loss": 2.5699, "step": 20 }, { "epoch": 0.010396039603960397, "grad_norm": 0.8533461093902588, "learning_rate": 4.123620120825459e-05, "loss": 2.5804, "step": 21 }, { "epoch": 0.01089108910891089, "grad_norm": 1.2304980754852295, "learning_rate": 3.969463130731183e-05, "loss": 2.7291, "step": 22 }, { "epoch": 0.011386138613861386, "grad_norm": 0.7633657455444336, "learning_rate": 3.8062464117898724e-05, "loss": 2.4248, "step": 23 }, { "epoch": 0.011881188118811881, "grad_norm": 0.8680675029754639, "learning_rate": 3.634976249348867e-05, "loss": 2.5192, "step": 24 }, { "epoch": 0.012376237623762377, "grad_norm": 0.703606903553009, "learning_rate": 3.456708580912725e-05, "loss": 2.2694, "step": 25 }, { "epoch": 0.01287128712871287, "grad_norm": 0.9794207215309143, "learning_rate": 3.272542485937369e-05, "loss": 2.5514, "step": 26 }, { "epoch": 0.01287128712871287, "eval_loss": 2.531092882156372, "eval_runtime": 14.5972, "eval_samples_per_second": 58.299, "eval_steps_per_second": 29.184, "step": 26 }, { "epoch": 0.013366336633663366, "grad_norm": 0.712409257888794, "learning_rate": 3.083613409639764e-05, "loss": 2.7024, "step": 27 }, { "epoch": 0.013861386138613862, "grad_norm": 0.7140835523605347, "learning_rate": 2.8910861626005776e-05, "loss": 2.3597, "step": 28 }, { "epoch": 0.014356435643564357, "grad_norm": 0.7529740929603577, "learning_rate": 2.6961477393196126e-05, "loss": 2.4987, "step": 29 }, { "epoch": 0.01485148514851485, "grad_norm": 0.7249071002006531, "learning_rate": 2.5e-05, "loss": 2.5261, "step": 30 }, { "epoch": 0.015346534653465346, "grad_norm": 0.6572368144989014, "learning_rate": 2.303852260680388e-05, "loss": 2.5675, "step": 31 }, { "epoch": 0.015841584158415842, "grad_norm": 0.8049694895744324, "learning_rate": 2.1089138373994223e-05, "loss": 2.4052, "step": 32 }, { "epoch": 0.016336633663366337, "grad_norm": 1.333217740058899, "learning_rate": 1.9163865903602374e-05, "loss": 3.0249, "step": 33 }, { "epoch": 0.016831683168316833, "grad_norm": 0.6144301891326904, "learning_rate": 1.7274575140626318e-05, "loss": 2.6427, "step": 34 }, { "epoch": 0.017326732673267328, "grad_norm": 0.5658847093582153, "learning_rate": 1.5432914190872757e-05, "loss": 2.3681, "step": 35 }, { "epoch": 0.01782178217821782, "grad_norm": 0.5838199257850647, "learning_rate": 1.3650237506511331e-05, "loss": 2.2482, "step": 36 }, { "epoch": 0.018316831683168316, "grad_norm": 0.6288196444511414, "learning_rate": 1.1937535882101281e-05, "loss": 2.495, "step": 37 }, { "epoch": 0.01881188118811881, "grad_norm": 0.7089413404464722, "learning_rate": 1.0305368692688174e-05, "loss": 2.4151, "step": 38 }, { "epoch": 0.019306930693069307, "grad_norm": 0.46954041719436646, "learning_rate": 8.763798791745411e-06, "loss": 2.3573, "step": 39 }, { "epoch": 0.019306930693069307, "eval_loss": 2.504016399383545, "eval_runtime": 14.7245, "eval_samples_per_second": 57.795, "eval_steps_per_second": 28.931, "step": 39 }, { "epoch": 0.019801980198019802, "grad_norm": 0.5877854824066162, "learning_rate": 7.3223304703363135e-06, "loss": 2.3505, "step": 40 }, { "epoch": 0.020297029702970298, "grad_norm": 0.7243385910987854, "learning_rate": 5.989850859999227e-06, "loss": 2.5964, "step": 41 }, { "epoch": 0.020792079207920793, "grad_norm": 0.5486332774162292, "learning_rate": 4.7745751406263165e-06, "loss": 2.5455, "step": 42 }, { "epoch": 0.02128712871287129, "grad_norm": 0.5547685623168945, "learning_rate": 3.6839958911476957e-06, "loss": 2.5585, "step": 43 }, { "epoch": 0.02178217821782178, "grad_norm": 0.5885905027389526, "learning_rate": 2.7248368952908053e-06, "loss": 2.3613, "step": 44 }, { "epoch": 0.022277227722772276, "grad_norm": 0.47415289282798767, "learning_rate": 1.9030116872178316e-06, "loss": 2.4039, "step": 45 }, { "epoch": 0.02277227722772277, "grad_norm": 0.6020286083221436, "learning_rate": 1.2235870926211619e-06, "loss": 2.5496, "step": 46 }, { "epoch": 0.023267326732673267, "grad_norm": 0.8112446069717407, "learning_rate": 6.907519900580861e-07, "loss": 2.6563, "step": 47 }, { "epoch": 0.023762376237623763, "grad_norm": 0.6824198365211487, "learning_rate": 3.077914851215585e-07, "loss": 2.5111, "step": 48 }, { "epoch": 0.024257425742574258, "grad_norm": 0.5109888315200806, "learning_rate": 7.706665667180091e-08, "loss": 2.4153, "step": 49 }, { "epoch": 0.024752475247524754, "grad_norm": 0.6785337328910828, "learning_rate": 0.0, "loss": 2.5139, "step": 50 } ], "logging_steps": 1, "max_steps": 50, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 13, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1624279061889024.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }