{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 5494, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.018201674554058973, "grad_norm": 0.12074069678783417, "learning_rate": 3.642987249544627e-06, "loss": 2.4211, "step": 100 }, { "epoch": 0.03640334910811795, "grad_norm": 0.16923406720161438, "learning_rate": 7.285974499089254e-06, "loss": 2.4302, "step": 200 }, { "epoch": 0.05460502366217692, "grad_norm": 0.2564025819301605, "learning_rate": 1.0928961748633882e-05, "loss": 2.3662, "step": 300 }, { "epoch": 0.0728066982162359, "grad_norm": 0.3928129971027374, "learning_rate": 1.4571948998178507e-05, "loss": 2.3484, "step": 400 }, { "epoch": 0.09100837277029487, "grad_norm": 0.4639979898929596, "learning_rate": 1.8214936247723133e-05, "loss": 2.2966, "step": 500 }, { "epoch": 0.10921004732435384, "grad_norm": 0.5541882514953613, "learning_rate": 1.9994751447857635e-05, "loss": 2.2143, "step": 600 }, { "epoch": 0.12741172187841282, "grad_norm": 0.5979414582252502, "learning_rate": 1.995402116472328e-05, "loss": 2.2202, "step": 700 }, { "epoch": 0.1456133964324718, "grad_norm": 0.7200450897216797, "learning_rate": 1.9873128483611436e-05, "loss": 2.186, "step": 800 }, { "epoch": 0.16381507098653075, "grad_norm": 0.7004340291023254, "learning_rate": 1.975239978960794e-05, "loss": 2.143, "step": 900 }, { "epoch": 0.18201674554058975, "grad_norm": 0.7130960822105408, "learning_rate": 1.9592322197796805e-05, "loss": 2.1259, "step": 1000 }, { "epoch": 0.2002184200946487, "grad_norm": 0.7769984602928162, "learning_rate": 1.93935415878525e-05, "loss": 2.1268, "step": 1100 }, { "epoch": 0.21842009464870768, "grad_norm": 0.7485630512237549, "learning_rate": 1.9156859998050283e-05, "loss": 2.1238, "step": 1200 }, { "epoch": 0.23662176920276665, "grad_norm": 0.7145903706550598, "learning_rate": 1.888323238920913e-05, "loss": 2.0817, "step": 1300 }, { "epoch": 0.25482344375682564, "grad_norm": 1.07276451587677, "learning_rate": 1.8573762791624132e-05, "loss": 2.0974, "step": 1400 }, { "epoch": 0.2730251183108846, "grad_norm": 0.9192919731140137, "learning_rate": 1.8229699850534693e-05, "loss": 2.0534, "step": 1500 }, { "epoch": 0.2912267928649436, "grad_norm": 0.8001774549484253, "learning_rate": 1.785243178810155e-05, "loss": 2.0735, "step": 1600 }, { "epoch": 0.30942846741900254, "grad_norm": 1.0227042436599731, "learning_rate": 1.7443480802220092e-05, "loss": 2.0371, "step": 1700 }, { "epoch": 0.3276301419730615, "grad_norm": 1.1049360036849976, "learning_rate": 1.700449692476954e-05, "loss": 2.0776, "step": 1800 }, { "epoch": 0.3458318165271205, "grad_norm": 1.410515308380127, "learning_rate": 1.653725136407869e-05, "loss": 2.0528, "step": 1900 }, { "epoch": 0.3640334910811795, "grad_norm": 0.8015642762184143, "learning_rate": 1.604362935846993e-05, "loss": 2.0418, "step": 2000 }, { "epoch": 0.38223516563523846, "grad_norm": 1.178688645362854, "learning_rate": 1.552562256971609e-05, "loss": 2.0279, "step": 2100 }, { "epoch": 0.4004368401892974, "grad_norm": 0.9646903276443481, "learning_rate": 1.4985321047100842e-05, "loss": 2.0186, "step": 2200 }, { "epoch": 0.4186385147433564, "grad_norm": 0.7918594479560852, "learning_rate": 1.4424904794506034e-05, "loss": 2.026, "step": 2300 }, { "epoch": 0.43684018929741536, "grad_norm": 1.1927690505981445, "learning_rate": 1.384663497455101e-05, "loss": 2.0192, "step": 2400 }, { "epoch": 0.4550418638514743, "grad_norm": 1.190807819366455, "learning_rate": 1.3252844785273324e-05, "loss": 2.0296, "step": 2500 }, { "epoch": 0.4732435384055333, "grad_norm": 0.8977068662643433, "learning_rate": 1.2645930046161638e-05, "loss": 1.9879, "step": 2600 }, { "epoch": 0.49144521295959226, "grad_norm": 0.9004688262939453, "learning_rate": 1.2028339531524058e-05, "loss": 1.9958, "step": 2700 }, { "epoch": 0.5096468875136513, "grad_norm": 1.2170230150222778, "learning_rate": 1.1402565090194902e-05, "loss": 2.0112, "step": 2800 }, { "epoch": 0.5278485620677102, "grad_norm": 0.7809821963310242, "learning_rate": 1.077113159144467e-05, "loss": 2.0264, "step": 2900 }, { "epoch": 0.5460502366217692, "grad_norm": 1.3472033739089966, "learning_rate": 1.013658673765951e-05, "loss": 2.0294, "step": 3000 }, { "epoch": 0.5642519111758282, "grad_norm": 1.2806614637374878, "learning_rate": 9.5014907848938e-06, "loss": 2.0347, "step": 3100 }, { "epoch": 0.5824535857298871, "grad_norm": 1.0503904819488525, "learning_rate": 8.868406212771264e-06, "loss": 2.0319, "step": 3200 }, { "epoch": 0.6006552602839461, "grad_norm": 1.008973240852356, "learning_rate": 8.239887385414536e-06, "loss": 1.9869, "step": 3300 }, { "epoch": 0.6188569348380051, "grad_norm": 0.9321379065513611, "learning_rate": 7.618470245118983e-06, "loss": 1.9791, "step": 3400 }, { "epoch": 0.637058609392064, "grad_norm": 0.953461766242981, "learning_rate": 7.006662080354818e-06, "loss": 1.9991, "step": 3500 }, { "epoch": 0.655260283946123, "grad_norm": 0.9465263485908508, "learning_rate": 6.406931409381398e-06, "loss": 1.9582, "step": 3600 }, { "epoch": 0.673461958500182, "grad_norm": 1.0994277000427246, "learning_rate": 5.821698020291234e-06, "loss": 1.9744, "step": 3700 }, { "epoch": 0.691663633054241, "grad_norm": 1.4719334840774536, "learning_rate": 5.2533232076700295e-06, "loss": 2.0104, "step": 3800 }, { "epoch": 0.7098653076082999, "grad_norm": 1.5267114639282227, "learning_rate": 4.704100245265875e-06, "loss": 1.9709, "step": 3900 }, { "epoch": 0.728066982162359, "grad_norm": 1.1738559007644653, "learning_rate": 4.17624513310824e-06, "loss": 1.9668, "step": 4000 }, { "epoch": 0.746268656716418, "grad_norm": 0.9715506434440613, "learning_rate": 3.6718876564103676e-06, "loss": 1.999, "step": 4100 }, { "epoch": 0.7644703312704769, "grad_norm": 1.1521637439727783, "learning_rate": 3.1930627923304226e-06, "loss": 2.0145, "step": 4200 }, { "epoch": 0.7826720058245359, "grad_norm": 0.9326179027557373, "learning_rate": 2.7417024992633512e-06, "loss": 2.0114, "step": 4300 }, { "epoch": 0.8008736803785949, "grad_norm": 1.0622531175613403, "learning_rate": 2.319627921791974e-06, "loss": 1.9749, "step": 4400 }, { "epoch": 0.8190753549326538, "grad_norm": 1.1154918670654297, "learning_rate": 1.928542042748661e-06, "loss": 1.9867, "step": 4500 }, { "epoch": 0.8372770294867128, "grad_norm": 1.2928146123886108, "learning_rate": 1.5700228120350248e-06, "loss": 1.8986, "step": 4600 }, { "epoch": 0.8554787040407718, "grad_norm": 0.9215976595878601, "learning_rate": 1.2455167799233836e-06, "loss": 1.9781, "step": 4700 }, { "epoch": 0.8736803785948307, "grad_norm": 0.9362882375717163, "learning_rate": 9.563332605283148e-07, "loss": 2.0269, "step": 4800 }, { "epoch": 0.8918820531488897, "grad_norm": 1.0011606216430664, "learning_rate": 7.036390489974565e-07, "loss": 1.9843, "step": 4900 }, { "epoch": 0.9100837277029487, "grad_norm": 1.2879751920700073, "learning_rate": 4.884537137365974e-07, "loss": 1.9739, "step": 5000 }, { "epoch": 0.9282854022570076, "grad_norm": 1.0357611179351807, "learning_rate": 3.116454826639448e-07, "loss": 2.009, "step": 5100 }, { "epoch": 0.9464870768110666, "grad_norm": 1.006805658340454, "learning_rate": 1.7392774009168169e-07, "loss": 1.9903, "step": 5200 }, { "epoch": 0.9646887513651256, "grad_norm": 1.6478885412216187, "learning_rate": 7.585614836914778e-08, "loss": 1.9801, "step": 5300 }, { "epoch": 0.9828904259191845, "grad_norm": 0.9603891372680664, "learning_rate": 1.782640590122342e-08, "loss": 2.0091, "step": 5400 }, { "epoch": 1.0, "eval_loss": 1.9291760921478271, "eval_runtime": 74.021, "eval_samples_per_second": 13.956, "eval_steps_per_second": 1.756, "step": 5494 }, { "epoch": 1.0, "step": 5494, "total_flos": 1.0006271901696e+17, "train_loss": 2.0624009956218217, "train_runtime": 1767.6809, "train_samples_per_second": 6.215, "train_steps_per_second": 3.108 } ], "logging_steps": 100, "max_steps": 5494, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0006271901696e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }