{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.034415509923138694, "eval_steps": 10, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001147183664104623, "eval_loss": 1.873344898223877, "eval_runtime": 12.7647, "eval_samples_per_second": 515.797, "eval_steps_per_second": 8.069, "step": 10 }, { "epoch": 0.002294367328209246, "eval_loss": 1.8726389408111572, "eval_runtime": 12.8667, "eval_samples_per_second": 511.709, "eval_steps_per_second": 8.005, "step": 20 }, { "epoch": 0.0034415509923138693, "eval_loss": 1.8714078664779663, "eval_runtime": 12.9103, "eval_samples_per_second": 509.979, "eval_steps_per_second": 7.978, "step": 30 }, { "epoch": 0.004588734656418492, "eval_loss": 1.8696790933609009, "eval_runtime": 12.947, "eval_samples_per_second": 508.534, "eval_steps_per_second": 7.955, "step": 40 }, { "epoch": 0.0057359183205231154, "eval_loss": 1.8675329685211182, "eval_runtime": 12.9458, "eval_samples_per_second": 508.582, "eval_steps_per_second": 7.956, "step": 50 }, { "epoch": 0.006883101984627739, "eval_loss": 1.8649154901504517, "eval_runtime": 13.0432, "eval_samples_per_second": 504.785, "eval_steps_per_second": 7.897, "step": 60 }, { "epoch": 0.008030285648732363, "eval_loss": 1.8619294166564941, "eval_runtime": 13.0638, "eval_samples_per_second": 503.988, "eval_steps_per_second": 7.884, "step": 70 }, { "epoch": 0.009177469312836984, "eval_loss": 1.8583979606628418, "eval_runtime": 13.0482, "eval_samples_per_second": 504.592, "eval_steps_per_second": 7.894, "step": 80 }, { "epoch": 0.010324652976941608, "eval_loss": 1.85438871383667, "eval_runtime": 13.0615, "eval_samples_per_second": 504.075, "eval_steps_per_second": 7.886, "step": 90 }, { "epoch": 0.011471836641046231, "grad_norm": 9.938580513000488, "learning_rate": 3.8226299694189603e-07, "loss": 3.1046, "step": 100 }, { "epoch": 0.011471836641046231, "eval_loss": 1.849947214126587, "eval_runtime": 13.0663, "eval_samples_per_second": 503.89, "eval_steps_per_second": 7.883, "step": 100 }, { "epoch": 0.012619020305150854, "eval_loss": 1.8451412916183472, "eval_runtime": 12.9771, "eval_samples_per_second": 507.357, "eval_steps_per_second": 7.937, "step": 110 }, { "epoch": 0.013766203969255477, "eval_loss": 1.8399487733840942, "eval_runtime": 13.0209, "eval_samples_per_second": 505.648, "eval_steps_per_second": 7.91, "step": 120 }, { "epoch": 0.0149133876333601, "eval_loss": 1.8342881202697754, "eval_runtime": 13.0369, "eval_samples_per_second": 505.028, "eval_steps_per_second": 7.901, "step": 130 }, { "epoch": 0.016060571297464726, "eval_loss": 1.8283486366271973, "eval_runtime": 13.0149, "eval_samples_per_second": 505.88, "eval_steps_per_second": 7.914, "step": 140 }, { "epoch": 0.017207754961569347, "eval_loss": 1.822334885597229, "eval_runtime": 13.0213, "eval_samples_per_second": 505.632, "eval_steps_per_second": 7.91, "step": 150 }, { "epoch": 0.01835493862567397, "eval_loss": 1.8158738613128662, "eval_runtime": 13.0599, "eval_samples_per_second": 504.14, "eval_steps_per_second": 7.887, "step": 160 }, { "epoch": 0.019502122289778594, "eval_loss": 1.8090614080429077, "eval_runtime": 13.034, "eval_samples_per_second": 505.14, "eval_steps_per_second": 7.902, "step": 170 }, { "epoch": 0.020649305953883215, "eval_loss": 1.8015782833099365, "eval_runtime": 13.0665, "eval_samples_per_second": 503.885, "eval_steps_per_second": 7.883, "step": 180 }, { "epoch": 0.02179648961798784, "eval_loss": 1.793796420097351, "eval_runtime": 13.0555, "eval_samples_per_second": 504.31, "eval_steps_per_second": 7.889, "step": 190 }, { "epoch": 0.022943673282092462, "grad_norm": 4.906337738037109, "learning_rate": 7.645259938837921e-07, "loss": 3.0303, "step": 200 }, { "epoch": 0.022943673282092462, "eval_loss": 1.785815715789795, "eval_runtime": 12.9925, "eval_samples_per_second": 506.754, "eval_steps_per_second": 7.928, "step": 200 }, { "epoch": 0.024090856946197087, "eval_loss": 1.7775053977966309, "eval_runtime": 13.0639, "eval_samples_per_second": 503.986, "eval_steps_per_second": 7.884, "step": 210 }, { "epoch": 0.025238040610301708, "eval_loss": 1.7692992687225342, "eval_runtime": 13.0129, "eval_samples_per_second": 505.96, "eval_steps_per_second": 7.915, "step": 220 }, { "epoch": 0.026385224274406333, "eval_loss": 1.760453224182129, "eval_runtime": 13.0078, "eval_samples_per_second": 506.158, "eval_steps_per_second": 7.918, "step": 230 }, { "epoch": 0.027532407938510955, "eval_loss": 1.751396656036377, "eval_runtime": 12.9957, "eval_samples_per_second": 506.628, "eval_steps_per_second": 7.926, "step": 240 }, { "epoch": 0.02867959160261558, "eval_loss": 1.7417218685150146, "eval_runtime": 12.9774, "eval_samples_per_second": 507.344, "eval_steps_per_second": 7.937, "step": 250 }, { "epoch": 0.0298267752667202, "eval_loss": 1.7319914102554321, "eval_runtime": 13.0219, "eval_samples_per_second": 505.611, "eval_steps_per_second": 7.91, "step": 260 }, { "epoch": 0.030973958930824826, "eval_loss": 1.7227253913879395, "eval_runtime": 13.0026, "eval_samples_per_second": 506.361, "eval_steps_per_second": 7.922, "step": 270 }, { "epoch": 0.03212114259492945, "eval_loss": 1.7133797407150269, "eval_runtime": 12.9757, "eval_samples_per_second": 507.409, "eval_steps_per_second": 7.938, "step": 280 }, { "epoch": 0.03326832625903407, "eval_loss": 1.704041600227356, "eval_runtime": 12.9845, "eval_samples_per_second": 507.065, "eval_steps_per_second": 7.933, "step": 290 }, { "epoch": 0.034415509923138694, "grad_norm": 4.665822505950928, "learning_rate": 1.1467889908256882e-06, "loss": 2.9459, "step": 300 }, { "epoch": 0.034415509923138694, "eval_loss": 1.6940686702728271, "eval_runtime": 13.0019, "eval_samples_per_second": 506.387, "eval_steps_per_second": 7.922, "step": 300 } ], "logging_steps": 100, "max_steps": 26151, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }