{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.20512820512820512, "eval_steps": 9, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0020512820512820513, "eval_loss": 3.666609048843384, "eval_runtime": 74.3332, "eval_samples_per_second": 11.058, "eval_steps_per_second": 1.386, "step": 1 }, { "epoch": 0.006153846153846154, "grad_norm": 55.59685516357422, "learning_rate": 1.5e-05, "loss": 14.9637, "step": 3 }, { "epoch": 0.012307692307692308, "grad_norm": 49.33617401123047, "learning_rate": 3e-05, "loss": 14.2659, "step": 6 }, { "epoch": 0.018461538461538463, "grad_norm": 40.35097885131836, "learning_rate": 4.5e-05, "loss": 12.0429, "step": 9 }, { "epoch": 0.018461538461538463, "eval_loss": 2.3609907627105713, "eval_runtime": 75.0844, "eval_samples_per_second": 10.948, "eval_steps_per_second": 1.372, "step": 9 }, { "epoch": 0.024615384615384615, "grad_norm": 30.74170684814453, "learning_rate": 4.993910125649561e-05, "loss": 8.4852, "step": 12 }, { "epoch": 0.03076923076923077, "grad_norm": 30.664113998413086, "learning_rate": 4.962019382530521e-05, "loss": 6.9901, "step": 15 }, { "epoch": 0.036923076923076927, "grad_norm": 37.1199951171875, "learning_rate": 4.9031542398457974e-05, "loss": 5.2257, "step": 18 }, { "epoch": 0.036923076923076927, "eval_loss": 1.283078670501709, "eval_runtime": 75.1182, "eval_samples_per_second": 10.943, "eval_steps_per_second": 1.371, "step": 18 }, { "epoch": 0.043076923076923075, "grad_norm": 25.036869049072266, "learning_rate": 4.817959636416969e-05, "loss": 5.1287, "step": 21 }, { "epoch": 0.04923076923076923, "grad_norm": 19.620019912719727, "learning_rate": 4.707368982147318e-05, "loss": 4.4763, "step": 24 }, { "epoch": 0.055384615384615386, "grad_norm": 24.405311584472656, "learning_rate": 4.572593931387604e-05, "loss": 3.8667, "step": 27 }, { "epoch": 0.055384615384615386, "eval_loss": 0.9862188100814819, "eval_runtime": 75.1057, "eval_samples_per_second": 10.945, "eval_steps_per_second": 1.371, "step": 27 }, { "epoch": 0.06153846153846154, "grad_norm": 22.108592987060547, "learning_rate": 4.415111107797445e-05, "loss": 4.0031, "step": 30 }, { "epoch": 0.06769230769230769, "grad_norm": 19.9141788482666, "learning_rate": 4.2366459261474933e-05, "loss": 3.8066, "step": 33 }, { "epoch": 0.07384615384615385, "grad_norm": 22.954469680786133, "learning_rate": 4.039153688314145e-05, "loss": 3.7173, "step": 36 }, { "epoch": 0.07384615384615385, "eval_loss": 0.9269228577613831, "eval_runtime": 75.1147, "eval_samples_per_second": 10.943, "eval_steps_per_second": 1.371, "step": 36 }, { "epoch": 0.08, "grad_norm": 24.711549758911133, "learning_rate": 3.824798160583012e-05, "loss": 3.9268, "step": 39 }, { "epoch": 0.08615384615384615, "grad_norm": 16.412107467651367, "learning_rate": 3.5959278669726935e-05, "loss": 3.9029, "step": 42 }, { "epoch": 0.09230769230769231, "grad_norm": 19.484609603881836, "learning_rate": 3.355050358314172e-05, "loss": 3.7293, "step": 45 }, { "epoch": 0.09230769230769231, "eval_loss": 0.888292133808136, "eval_runtime": 75.1111, "eval_samples_per_second": 10.944, "eval_steps_per_second": 1.371, "step": 45 }, { "epoch": 0.09846153846153846, "grad_norm": 17.762903213500977, "learning_rate": 3.104804738999169e-05, "loss": 3.8463, "step": 48 }, { "epoch": 0.10461538461538461, "grad_norm": 16.857402801513672, "learning_rate": 2.8479327524001636e-05, "loss": 3.2797, "step": 51 }, { "epoch": 0.11076923076923077, "grad_norm": 17.598472595214844, "learning_rate": 2.587248741756253e-05, "loss": 3.4031, "step": 54 }, { "epoch": 0.11076923076923077, "eval_loss": 0.8580312132835388, "eval_runtime": 75.1055, "eval_samples_per_second": 10.945, "eval_steps_per_second": 1.371, "step": 54 }, { "epoch": 0.11692307692307692, "grad_norm": 19.14990234375, "learning_rate": 2.3256088156396868e-05, "loss": 3.5648, "step": 57 }, { "epoch": 0.12307692307692308, "grad_norm": 15.191015243530273, "learning_rate": 2.0658795558326743e-05, "loss": 3.2272, "step": 60 }, { "epoch": 0.12923076923076923, "grad_norm": 16.499048233032227, "learning_rate": 1.8109066104575023e-05, "loss": 3.1709, "step": 63 }, { "epoch": 0.12923076923076923, "eval_loss": 0.8416498303413391, "eval_runtime": 75.1294, "eval_samples_per_second": 10.941, "eval_steps_per_second": 1.371, "step": 63 }, { "epoch": 0.13538461538461538, "grad_norm": 16.03173828125, "learning_rate": 1.56348351646022e-05, "loss": 3.1598, "step": 66 }, { "epoch": 0.14153846153846153, "grad_norm": 15.822704315185547, "learning_rate": 1.3263210930352737e-05, "loss": 3.1743, "step": 69 }, { "epoch": 0.1476923076923077, "grad_norm": 22.87495994567871, "learning_rate": 1.1020177413231334e-05, "loss": 3.7806, "step": 72 }, { "epoch": 0.1476923076923077, "eval_loss": 0.8304129838943481, "eval_runtime": 75.1064, "eval_samples_per_second": 10.944, "eval_steps_per_second": 1.371, "step": 72 }, { "epoch": 0.15384615384615385, "grad_norm": 17.428983688354492, "learning_rate": 8.930309757836517e-06, "loss": 3.6015, "step": 75 }, { "epoch": 0.16, "grad_norm": 19.788145065307617, "learning_rate": 7.016504991533726e-06, "loss": 3.2009, "step": 78 }, { "epoch": 0.16615384615384615, "grad_norm": 18.700437545776367, "learning_rate": 5.299731159831953e-06, "loss": 3.3891, "step": 81 }, { "epoch": 0.16615384615384615, "eval_loss": 0.8223448991775513, "eval_runtime": 75.1008, "eval_samples_per_second": 10.945, "eval_steps_per_second": 1.371, "step": 81 }, { "epoch": 0.1723076923076923, "grad_norm": 16.810592651367188, "learning_rate": 3.798797596089351e-06, "loss": 3.3512, "step": 84 }, { "epoch": 0.17846153846153845, "grad_norm": 17.362895965576172, "learning_rate": 2.5301488425208296e-06, "loss": 3.5883, "step": 87 }, { "epoch": 0.18461538461538463, "grad_norm": 22.981599807739258, "learning_rate": 1.5076844803522922e-06, "loss": 3.2509, "step": 90 }, { "epoch": 0.18461538461538463, "eval_loss": 0.8187195062637329, "eval_runtime": 75.1228, "eval_samples_per_second": 10.942, "eval_steps_per_second": 1.371, "step": 90 }, { "epoch": 0.19076923076923077, "grad_norm": 16.67460823059082, "learning_rate": 7.426068431000882e-07, "loss": 3.4548, "step": 93 }, { "epoch": 0.19692307692307692, "grad_norm": 21.03778839111328, "learning_rate": 2.4329828146074095e-07, "loss": 3.7234, "step": 96 }, { "epoch": 0.20307692307692307, "grad_norm": 17.65099334716797, "learning_rate": 1.522932452260595e-08, "loss": 3.2285, "step": 99 }, { "epoch": 0.20307692307692307, "eval_loss": 0.8180755376815796, "eval_runtime": 75.1032, "eval_samples_per_second": 10.945, "eval_steps_per_second": 1.371, "step": 99 } ], "logging_steps": 3, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 9, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.406258997362688e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }