{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.88659793814433, "eval_steps": 7, "global_step": 70, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.041237113402061855, "eval_loss": 2.0562095642089844, "eval_runtime": 2.9139, "eval_samples_per_second": 14.071, "eval_steps_per_second": 2.059, "step": 1 }, { "epoch": 0.12371134020618557, "grad_norm": 1.2591140270233154, "learning_rate": 3e-05, "loss": 1.9881, "step": 3 }, { "epoch": 0.24742268041237114, "grad_norm": 1.3045929670333862, "learning_rate": 6e-05, "loss": 1.9707, "step": 6 }, { "epoch": 0.28865979381443296, "eval_loss": 1.8532114028930664, "eval_runtime": 2.9739, "eval_samples_per_second": 13.787, "eval_steps_per_second": 2.018, "step": 7 }, { "epoch": 0.3711340206185567, "grad_norm": 1.4863039255142212, "learning_rate": 9e-05, "loss": 1.7318, "step": 9 }, { "epoch": 0.4948453608247423, "grad_norm": 1.4484189748764038, "learning_rate": 9.975153876827008e-05, "loss": 1.5902, "step": 12 }, { "epoch": 0.5773195876288659, "eval_loss": 1.459747314453125, "eval_runtime": 2.9899, "eval_samples_per_second": 13.713, "eval_steps_per_second": 2.007, "step": 14 }, { "epoch": 0.6185567010309279, "grad_norm": 1.3237439393997192, "learning_rate": 9.84538643114539e-05, "loss": 1.4891, "step": 15 }, { "epoch": 0.7422680412371134, "grad_norm": 1.3351250886917114, "learning_rate": 9.607381059352038e-05, "loss": 1.4098, "step": 18 }, { "epoch": 0.865979381443299, "grad_norm": 1.3576433658599854, "learning_rate": 9.266454408160779e-05, "loss": 1.228, "step": 21 }, { "epoch": 0.865979381443299, "eval_loss": 1.3227709531784058, "eval_runtime": 3.0131, "eval_samples_per_second": 13.607, "eval_steps_per_second": 1.991, "step": 21 }, { "epoch": 0.9896907216494846, "grad_norm": 1.2061851024627686, "learning_rate": 8.83022221559489e-05, "loss": 1.3031, "step": 24 }, { "epoch": 1.1134020618556701, "grad_norm": 1.1129889488220215, "learning_rate": 8.308429187984297e-05, "loss": 1.4281, "step": 27 }, { "epoch": 1.1546391752577319, "eval_loss": 1.2710272073745728, "eval_runtime": 3.0113, "eval_samples_per_second": 13.615, "eval_steps_per_second": 1.993, "step": 28 }, { "epoch": 1.2371134020618557, "grad_norm": 1.0241196155548096, "learning_rate": 7.712731319328798e-05, "loss": 1.0659, "step": 30 }, { "epoch": 1.3608247422680413, "grad_norm": 1.213539719581604, "learning_rate": 7.056435515653059e-05, "loss": 1.0993, "step": 33 }, { "epoch": 1.443298969072165, "eval_loss": 1.2519707679748535, "eval_runtime": 3.0101, "eval_samples_per_second": 13.621, "eval_steps_per_second": 1.993, "step": 35 }, { "epoch": 1.4845360824742269, "grad_norm": 1.1910161972045898, "learning_rate": 6.354202340715026e-05, "loss": 1.0659, "step": 36 }, { "epoch": 1.6082474226804124, "grad_norm": 1.2115846872329712, "learning_rate": 5.621718523237427e-05, "loss": 1.0223, "step": 39 }, { "epoch": 1.731958762886598, "grad_norm": 1.3184078931808472, "learning_rate": 4.875346541309637e-05, "loss": 1.0009, "step": 42 }, { "epoch": 1.731958762886598, "eval_loss": 1.2434487342834473, "eval_runtime": 3.0216, "eval_samples_per_second": 13.569, "eval_steps_per_second": 1.986, "step": 42 }, { "epoch": 1.8556701030927836, "grad_norm": 1.2150309085845947, "learning_rate": 4.131759111665349e-05, "loss": 0.9667, "step": 45 }, { "epoch": 1.9793814432989691, "grad_norm": 1.312495231628418, "learning_rate": 3.4075667487415785e-05, "loss": 1.0141, "step": 48 }, { "epoch": 2.020618556701031, "eval_loss": 1.2144564390182495, "eval_runtime": 3.0168, "eval_samples_per_second": 13.59, "eval_steps_per_second": 1.989, "step": 49 }, { "epoch": 2.1030927835051547, "grad_norm": 1.2755225896835327, "learning_rate": 2.718946713234185e-05, "loss": 1.2721, "step": 51 }, { "epoch": 2.2268041237113403, "grad_norm": 1.2755869626998901, "learning_rate": 2.0812816388260518e-05, "loss": 0.8322, "step": 54 }, { "epoch": 2.3092783505154637, "eval_loss": 1.2048313617706299, "eval_runtime": 3.016, "eval_samples_per_second": 13.594, "eval_steps_per_second": 1.989, "step": 56 }, { "epoch": 2.350515463917526, "grad_norm": 1.1833029985427856, "learning_rate": 1.5088159095696363e-05, "loss": 0.8151, "step": 57 }, { "epoch": 2.4742268041237114, "grad_norm": 1.187381386756897, "learning_rate": 1.0143374638853891e-05, "loss": 0.8196, "step": 60 }, { "epoch": 2.597938144329897, "grad_norm": 1.330628514289856, "learning_rate": 6.088921331488568e-06, "loss": 0.8458, "step": 63 }, { "epoch": 2.597938144329897, "eval_loss": 1.2047181129455566, "eval_runtime": 3.0172, "eval_samples_per_second": 13.589, "eval_steps_per_second": 1.989, "step": 63 }, { "epoch": 2.7216494845360826, "grad_norm": 1.3034956455230713, "learning_rate": 3.0153689607045845e-06, "loss": 0.8017, "step": 66 }, { "epoch": 2.845360824742268, "grad_norm": 1.2311569452285767, "learning_rate": 9.913756075728087e-07, "loss": 0.8266, "step": 69 }, { "epoch": 2.88659793814433, "eval_loss": 1.205170750617981, "eval_runtime": 3.0134, "eval_samples_per_second": 13.606, "eval_steps_per_second": 1.991, "step": 70 } ], "logging_steps": 3, "max_steps": 73, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 7, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.0340054014623744e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }