{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9900990099009901, "eval_steps": 13, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.019801980198019802, "grad_norm": 1.15625, "learning_rate": 2e-05, "loss": 2.0919, "step": 1 }, { "epoch": 0.019801980198019802, "eval_loss": 2.079954147338867, "eval_runtime": 13.8908, "eval_samples_per_second": 8.999, "eval_steps_per_second": 4.535, "step": 1 }, { "epoch": 0.039603960396039604, "grad_norm": 1.203125, "learning_rate": 4e-05, "loss": 2.0814, "step": 2 }, { "epoch": 0.0594059405940594, "grad_norm": 1.1953125, "learning_rate": 6e-05, "loss": 2.0499, "step": 3 }, { "epoch": 0.07920792079207921, "grad_norm": 1.0859375, "learning_rate": 8e-05, "loss": 2.0153, "step": 4 }, { "epoch": 0.09900990099009901, "grad_norm": 1.0390625, "learning_rate": 0.0001, "loss": 1.9548, "step": 5 }, { "epoch": 0.1188118811881188, "grad_norm": 0.89453125, "learning_rate": 0.00012, "loss": 1.8982, "step": 6 }, { "epoch": 0.13861386138613863, "grad_norm": 0.67578125, "learning_rate": 0.00014, "loss": 1.8226, "step": 7 }, { "epoch": 0.15841584158415842, "grad_norm": 0.66796875, "learning_rate": 0.00016, "loss": 1.7572, "step": 8 }, { "epoch": 0.1782178217821782, "grad_norm": 0.78515625, "learning_rate": 0.00018, "loss": 1.7074, "step": 9 }, { "epoch": 0.19801980198019803, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 1.6317, "step": 10 }, { "epoch": 0.21782178217821782, "grad_norm": 0.484375, "learning_rate": 0.0001999863304992469, "loss": 1.5801, "step": 11 }, { "epoch": 0.2376237623762376, "grad_norm": 0.53125, "learning_rate": 0.00019994532573409262, "loss": 1.5721, "step": 12 }, { "epoch": 0.25742574257425743, "grad_norm": 0.6953125, "learning_rate": 0.00019987699691483048, "loss": 1.5479, "step": 13 }, { "epoch": 0.25742574257425743, "eval_loss": 1.5341482162475586, "eval_runtime": 13.8795, "eval_samples_per_second": 9.006, "eval_steps_per_second": 4.539, "step": 13 }, { "epoch": 0.27722772277227725, "grad_norm": 0.65234375, "learning_rate": 0.00019978136272187747, "loss": 1.534, "step": 14 }, { "epoch": 0.297029702970297, "grad_norm": 0.515625, "learning_rate": 0.000199658449300667, "loss": 1.4804, "step": 15 }, { "epoch": 0.31683168316831684, "grad_norm": 0.439453125, "learning_rate": 0.00019950829025450114, "loss": 1.4805, "step": 16 }, { "epoch": 0.33663366336633666, "grad_norm": 0.361328125, "learning_rate": 0.00019933092663536382, "loss": 1.3809, "step": 17 }, { "epoch": 0.3564356435643564, "grad_norm": 0.3125, "learning_rate": 0.00019912640693269752, "loss": 1.3837, "step": 18 }, { "epoch": 0.37623762376237624, "grad_norm": 0.337890625, "learning_rate": 0.00019889478706014687, "loss": 1.3673, "step": 19 }, { "epoch": 0.39603960396039606, "grad_norm": 0.298828125, "learning_rate": 0.00019863613034027224, "loss": 1.366, "step": 20 }, { "epoch": 0.4158415841584158, "grad_norm": 0.34375, "learning_rate": 0.00019835050748723824, "loss": 1.3318, "step": 21 }, { "epoch": 0.43564356435643564, "grad_norm": 0.341796875, "learning_rate": 0.00019803799658748094, "loss": 1.2741, "step": 22 }, { "epoch": 0.45544554455445546, "grad_norm": 0.326171875, "learning_rate": 0.00019769868307835994, "loss": 1.2978, "step": 23 }, { "epoch": 0.4752475247524752, "grad_norm": 0.291015625, "learning_rate": 0.0001973326597248006, "loss": 1.2733, "step": 24 }, { "epoch": 0.49504950495049505, "grad_norm": 0.306640625, "learning_rate": 0.00019694002659393305, "loss": 1.2302, "step": 25 }, { "epoch": 0.5148514851485149, "grad_norm": 0.318359375, "learning_rate": 0.00019652089102773488, "loss": 1.2083, "step": 26 }, { "epoch": 0.5148514851485149, "eval_loss": 1.224540114402771, "eval_runtime": 13.8695, "eval_samples_per_second": 9.013, "eval_steps_per_second": 4.542, "step": 26 }, { "epoch": 0.5346534653465347, "grad_norm": 0.26953125, "learning_rate": 0.00019607536761368484, "loss": 1.1761, "step": 27 }, { "epoch": 0.5544554455445545, "grad_norm": 0.296875, "learning_rate": 0.00019560357815343577, "loss": 1.1751, "step": 28 }, { "epoch": 0.5742574257425742, "grad_norm": 0.310546875, "learning_rate": 0.00019510565162951537, "loss": 1.2002, "step": 29 }, { "epoch": 0.594059405940594, "grad_norm": 0.287109375, "learning_rate": 0.00019458172417006347, "loss": 1.1544, "step": 30 }, { "epoch": 0.6138613861386139, "grad_norm": 0.365234375, "learning_rate": 0.00019403193901161613, "loss": 1.1384, "step": 31 }, { "epoch": 0.6336633663366337, "grad_norm": 0.236328125, "learning_rate": 0.0001934564464599461, "loss": 1.0999, "step": 32 }, { "epoch": 0.6534653465346535, "grad_norm": 0.326171875, "learning_rate": 0.00019285540384897073, "loss": 1.1576, "step": 33 }, { "epoch": 0.6732673267326733, "grad_norm": 0.310546875, "learning_rate": 0.00019222897549773848, "loss": 1.091, "step": 34 }, { "epoch": 0.693069306930693, "grad_norm": 0.2578125, "learning_rate": 0.00019157733266550575, "loss": 1.056, "step": 35 }, { "epoch": 0.7128712871287128, "grad_norm": 0.267578125, "learning_rate": 0.00019090065350491626, "loss": 1.1068, "step": 36 }, { "epoch": 0.7326732673267327, "grad_norm": 0.2490234375, "learning_rate": 0.00019019912301329592, "loss": 1.0583, "step": 37 }, { "epoch": 0.7524752475247525, "grad_norm": 0.2734375, "learning_rate": 0.00018947293298207635, "loss": 1.0671, "step": 38 }, { "epoch": 0.7722772277227723, "grad_norm": 0.2490234375, "learning_rate": 0.0001887222819443612, "loss": 1.0851, "step": 39 }, { "epoch": 0.7722772277227723, "eval_loss": 1.060703158378601, "eval_runtime": 13.878, "eval_samples_per_second": 9.007, "eval_steps_per_second": 4.54, "step": 39 }, { "epoch": 0.7920792079207921, "grad_norm": 0.22265625, "learning_rate": 0.0001879473751206489, "loss": 1.0343, "step": 40 }, { "epoch": 0.8118811881188119, "grad_norm": 0.1796875, "learning_rate": 0.00018714842436272773, "loss": 0.9789, "step": 41 }, { "epoch": 0.8316831683168316, "grad_norm": 0.248046875, "learning_rate": 0.00018632564809575742, "loss": 1.0174, "step": 42 }, { "epoch": 0.8514851485148515, "grad_norm": 0.2294921875, "learning_rate": 0.0001854792712585539, "loss": 1.0004, "step": 43 }, { "epoch": 0.8712871287128713, "grad_norm": 0.228515625, "learning_rate": 0.00018460952524209355, "loss": 1.0281, "step": 44 }, { "epoch": 0.8910891089108911, "grad_norm": 0.220703125, "learning_rate": 0.00018371664782625287, "loss": 0.9992, "step": 45 }, { "epoch": 0.9108910891089109, "grad_norm": 0.2138671875, "learning_rate": 0.00018280088311480201, "loss": 0.9635, "step": 46 }, { "epoch": 0.9306930693069307, "grad_norm": 0.265625, "learning_rate": 0.00018186248146866927, "loss": 1.006, "step": 47 }, { "epoch": 0.9504950495049505, "grad_norm": 0.2451171875, "learning_rate": 0.00018090169943749476, "loss": 0.9891, "step": 48 }, { "epoch": 0.9702970297029703, "grad_norm": 0.28515625, "learning_rate": 0.0001799187996894925, "loss": 0.9809, "step": 49 }, { "epoch": 0.9900990099009901, "grad_norm": 0.212890625, "learning_rate": 0.00017891405093963938, "loss": 0.9646, "step": 50 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.04177049010176e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }