{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.2424289485945024, "eval_steps": 1000, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.024848578971890044, "grad_norm": 1.1403515692366237, "learning_rate": 0.0002475, "loss": 0.7727, "step": 100 }, { "epoch": 0.04969715794378009, "grad_norm": 2.0321728234236067, "learning_rate": 0.0004975, "loss": 0.6434, "step": 200 }, { "epoch": 0.07454573691567014, "grad_norm": 1.5298185029395692, "learning_rate": 0.0004896875, "loss": 0.6966, "step": 300 }, { "epoch": 0.09939431588756018, "grad_norm": 1.3277644876612467, "learning_rate": 0.00047927083333333334, "loss": 0.6549, "step": 400 }, { "epoch": 0.12424289485945023, "grad_norm": 1.5260941221380562, "learning_rate": 0.0004688541666666667, "loss": 0.6218, "step": 500 }, { "epoch": 0.14909147383134028, "grad_norm": 1.461823959618585, "learning_rate": 0.0004584375, "loss": 0.6, "step": 600 }, { "epoch": 0.17394005280323033, "grad_norm": 1.579155099177269, "learning_rate": 0.0004480208333333333, "loss": 0.5768, "step": 700 }, { "epoch": 0.19878863177512035, "grad_norm": 1.0030257236753515, "learning_rate": 0.0004376041666666667, "loss": 0.5625, "step": 800 }, { "epoch": 0.2236372107470104, "grad_norm": 1.1362089000123636, "learning_rate": 0.0004271875, "loss": 0.5397, "step": 900 }, { "epoch": 0.24848578971890045, "grad_norm": 1.2060122481392086, "learning_rate": 0.00041677083333333334, "loss": 0.5262, "step": 1000 }, { "epoch": 0.24848578971890045, "eval_accuracy": 0.8519274864233807, "eval_loss": 0.4272424280643463, "eval_runtime": 355.1137, "eval_samples_per_second": 76.471, "eval_steps_per_second": 9.56, "step": 1000 }, { "epoch": 0.2733343686907905, "grad_norm": 1.4939232786831804, "learning_rate": 0.0004063541666666667, "loss": 0.507, "step": 1100 }, { "epoch": 0.29818294766268055, "grad_norm": 1.0210090194207913, "learning_rate": 0.0003959375, "loss": 0.4923, "step": 1200 }, { "epoch": 0.3230315266345706, "grad_norm": 1.5265032617099488, "learning_rate": 0.0003855208333333333, "loss": 0.485, "step": 1300 }, { "epoch": 0.34788010560646065, "grad_norm": 1.5974690542066108, "learning_rate": 0.0003751041666666667, "loss": 0.4734, "step": 1400 }, { "epoch": 0.3727286845783507, "grad_norm": 1.3463235139038994, "learning_rate": 0.0003646875, "loss": 0.4629, "step": 1500 }, { "epoch": 0.3975772635502407, "grad_norm": 0.8545527093751057, "learning_rate": 0.00035427083333333334, "loss": 0.4504, "step": 1600 }, { "epoch": 0.4224258425221308, "grad_norm": 1.1247072555863402, "learning_rate": 0.0003438541666666667, "loss": 0.4366, "step": 1700 }, { "epoch": 0.4472744214940208, "grad_norm": 0.9808487633447448, "learning_rate": 0.0003334375, "loss": 0.4326, "step": 1800 }, { "epoch": 0.4721230004659109, "grad_norm": 0.9117492642967103, "learning_rate": 0.0003230208333333333, "loss": 0.4259, "step": 1900 }, { "epoch": 0.4969715794378009, "grad_norm": 0.7717947637961347, "learning_rate": 0.0003126041666666667, "loss": 0.413, "step": 2000 }, { "epoch": 0.4969715794378009, "eval_accuracy": 0.8710562213369932, "eval_loss": 0.3649989068508148, "eval_runtime": 354.5714, "eval_samples_per_second": 76.588, "eval_steps_per_second": 9.575, "step": 2000 }, { "epoch": 0.521820158409691, "grad_norm": 0.8113600127860467, "learning_rate": 0.0003021875, "loss": 0.4058, "step": 2100 }, { "epoch": 0.546668737381581, "grad_norm": 0.8903551125095357, "learning_rate": 0.00029177083333333334, "loss": 0.397, "step": 2200 }, { "epoch": 0.571517316353471, "grad_norm": 0.6671784623140349, "learning_rate": 0.0002813541666666667, "loss": 0.3913, "step": 2300 }, { "epoch": 0.5963658953253611, "grad_norm": 0.7312172828583983, "learning_rate": 0.0002709375, "loss": 0.3818, "step": 2400 }, { "epoch": 0.6212144742972511, "grad_norm": 0.7230909781837032, "learning_rate": 0.0002605208333333333, "loss": 0.3764, "step": 2500 }, { "epoch": 0.6460630532691412, "grad_norm": 0.6919092960285033, "learning_rate": 0.0002501041666666667, "loss": 0.3749, "step": 2600 }, { "epoch": 0.6709116322410312, "grad_norm": 0.7449666551062925, "learning_rate": 0.0002396875, "loss": 0.3679, "step": 2700 }, { "epoch": 0.6957602112129213, "grad_norm": 0.6657006999853705, "learning_rate": 0.00022927083333333333, "loss": 0.3603, "step": 2800 }, { "epoch": 0.7206087901848113, "grad_norm": 0.7751989636808677, "learning_rate": 0.00021885416666666665, "loss": 0.3576, "step": 2900 }, { "epoch": 0.7454573691567014, "grad_norm": 0.582330455026636, "learning_rate": 0.0002084375, "loss": 0.3505, "step": 3000 }, { "epoch": 0.7454573691567014, "eval_accuracy": 0.885227854690163, "eval_loss": 0.3137596547603607, "eval_runtime": 354.0567, "eval_samples_per_second": 76.7, "eval_steps_per_second": 9.589, "step": 3000 }, { "epoch": 0.7703059481285914, "grad_norm": 0.624670881883226, "learning_rate": 0.00019802083333333333, "loss": 0.3419, "step": 3100 }, { "epoch": 0.7951545271004814, "grad_norm": 0.6846077168970752, "learning_rate": 0.00018760416666666665, "loss": 0.3429, "step": 3200 }, { "epoch": 0.8200031060723715, "grad_norm": 0.549136141158353, "learning_rate": 0.0001771875, "loss": 0.3347, "step": 3300 }, { "epoch": 0.8448516850442616, "grad_norm": 0.5196724667426235, "learning_rate": 0.00016677083333333333, "loss": 0.3347, "step": 3400 }, { "epoch": 0.8697002640161515, "grad_norm": 0.559242933165121, "learning_rate": 0.00015635416666666665, "loss": 0.33, "step": 3500 }, { "epoch": 0.8945488429880416, "grad_norm": 0.5987209783232824, "learning_rate": 0.0001459375, "loss": 0.3236, "step": 3600 }, { "epoch": 0.9193974219599317, "grad_norm": 0.5980851789962784, "learning_rate": 0.00013552083333333333, "loss": 0.3242, "step": 3700 }, { "epoch": 0.9442460009318218, "grad_norm": 0.8472335802810602, "learning_rate": 0.00012510416666666665, "loss": 0.3135, "step": 3800 }, { "epoch": 0.9690945799037117, "grad_norm": 0.5414234864761139, "learning_rate": 0.0001146875, "loss": 0.3151, "step": 3900 }, { "epoch": 0.9939431588756018, "grad_norm": 0.5498938130901921, "learning_rate": 0.00010427083333333333, "loss": 0.3111, "step": 4000 }, { "epoch": 0.9939431588756018, "eval_accuracy": 0.894954167881237, "eval_loss": 0.2828981876373291, "eval_runtime": 353.5701, "eval_samples_per_second": 76.805, "eval_steps_per_second": 9.602, "step": 4000 }, { "epoch": 1.0187917378474918, "grad_norm": 0.49437309189757744, "learning_rate": 9.385416666666667e-05, "loss": 0.3059, "step": 4100 }, { "epoch": 1.043640316819382, "grad_norm": 0.5262199037896036, "learning_rate": 8.34375e-05, "loss": 0.3021, "step": 4200 }, { "epoch": 1.068488895791272, "grad_norm": 0.508417592763766, "learning_rate": 7.302083333333333e-05, "loss": 0.2961, "step": 4300 }, { "epoch": 1.093337474763162, "grad_norm": 0.4111721436877036, "learning_rate": 6.260416666666667e-05, "loss": 0.2966, "step": 4400 }, { "epoch": 1.118186053735052, "grad_norm": 0.44511878991978343, "learning_rate": 5.21875e-05, "loss": 0.293, "step": 4500 }, { "epoch": 1.143034632706942, "grad_norm": 0.44581104227849994, "learning_rate": 4.177083333333334e-05, "loss": 0.2903, "step": 4600 }, { "epoch": 1.167883211678832, "grad_norm": 0.40843300918294273, "learning_rate": 3.135416666666667e-05, "loss": 0.2872, "step": 4700 }, { "epoch": 1.1927317906507222, "grad_norm": 0.45173761297005016, "learning_rate": 2.09375e-05, "loss": 0.2829, "step": 4800 }, { "epoch": 1.2175803696226122, "grad_norm": 0.3729668197663827, "learning_rate": 1.0520833333333334e-05, "loss": 0.2833, "step": 4900 }, { "epoch": 1.2424289485945024, "grad_norm": 0.39171821421954367, "learning_rate": 1.0416666666666668e-07, "loss": 0.2817, "step": 5000 }, { "epoch": 1.2424289485945024, "eval_accuracy": 0.9025104707867149, "eval_loss": 0.2596372365951538, "eval_runtime": 353.5151, "eval_samples_per_second": 76.817, "eval_steps_per_second": 9.604, "step": 5000 }, { "epoch": 1.2424289485945024, "step": 5000, "total_flos": 643481070469120.0, "train_loss": 0.4145617248535156, "train_runtime": 25261.0456, "train_samples_per_second": 25.335, "train_steps_per_second": 0.198 } ], "logging_steps": 100, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 643481070469120.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }