{ "best_metric": 0.2986587882041931, "best_model_checkpoint": "./results/checkpoint-180", "epoch": 1.510574018126888, "eval_steps": 20, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06042296072507553, "grad_norm": 0.5974797606468201, "learning_rate": 1.97583081570997e-05, "loss": 0.5426, "step": 20 }, { "epoch": 0.06042296072507553, "eval_accuracy": 0.7450980392156863, "eval_loss": 0.7869178652763367, "eval_runtime": 15.0039, "eval_samples_per_second": 10.197, "eval_steps_per_second": 2.599, "step": 20 }, { "epoch": 0.12084592145015106, "grad_norm": 13.038370132446289, "learning_rate": 1.9516616314199397e-05, "loss": 0.5984, "step": 40 }, { "epoch": 0.12084592145015106, "eval_accuracy": 0.7908496732026143, "eval_loss": 0.39430469274520874, "eval_runtime": 15.9435, "eval_samples_per_second": 9.596, "eval_steps_per_second": 2.446, "step": 40 }, { "epoch": 0.18126888217522658, "grad_norm": 0.1351632922887802, "learning_rate": 1.9274924471299096e-05, "loss": 0.4864, "step": 60 }, { "epoch": 0.18126888217522658, "eval_accuracy": 0.7843137254901961, "eval_loss": 0.9364686608314514, "eval_runtime": 15.2263, "eval_samples_per_second": 10.048, "eval_steps_per_second": 2.561, "step": 60 }, { "epoch": 0.24169184290030213, "grad_norm": 1.4119517803192139, "learning_rate": 1.9033232628398792e-05, "loss": 0.6039, "step": 80 }, { "epoch": 0.24169184290030213, "eval_accuracy": 0.7712418300653595, "eval_loss": 0.6580381989479065, "eval_runtime": 15.5472, "eval_samples_per_second": 9.841, "eval_steps_per_second": 2.508, "step": 80 }, { "epoch": 0.3021148036253776, "grad_norm": 13.04010009765625, "learning_rate": 1.879154078549849e-05, "loss": 0.5741, "step": 100 }, { "epoch": 0.3021148036253776, "eval_accuracy": 0.8235294117647058, "eval_loss": 0.3454345464706421, "eval_runtime": 15.5082, "eval_samples_per_second": 9.866, "eval_steps_per_second": 2.515, "step": 100 }, { "epoch": 0.36253776435045315, "grad_norm": 61.882415771484375, "learning_rate": 1.854984894259819e-05, "loss": 0.4276, "step": 120 }, { "epoch": 0.36253776435045315, "eval_accuracy": 0.8169934640522876, "eval_loss": 0.5421260595321655, "eval_runtime": 15.4627, "eval_samples_per_second": 9.895, "eval_steps_per_second": 2.522, "step": 120 }, { "epoch": 0.4229607250755287, "grad_norm": 8.9187593460083, "learning_rate": 1.8308157099697886e-05, "loss": 0.4342, "step": 140 }, { "epoch": 0.4229607250755287, "eval_accuracy": 0.8562091503267973, "eval_loss": 0.4258342981338501, "eval_runtime": 15.4753, "eval_samples_per_second": 9.887, "eval_steps_per_second": 2.52, "step": 140 }, { "epoch": 0.48338368580060426, "grad_norm": 40.476078033447266, "learning_rate": 1.8066465256797586e-05, "loss": 0.4915, "step": 160 }, { "epoch": 0.48338368580060426, "eval_accuracy": 0.8300653594771242, "eval_loss": 0.5960604548454285, "eval_runtime": 15.4631, "eval_samples_per_second": 9.895, "eval_steps_per_second": 2.522, "step": 160 }, { "epoch": 0.5438066465256798, "grad_norm": 3.8627891540527344, "learning_rate": 1.782477341389728e-05, "loss": 0.4127, "step": 180 }, { "epoch": 0.5438066465256798, "eval_accuracy": 0.869281045751634, "eval_loss": 0.2986587882041931, "eval_runtime": 15.5147, "eval_samples_per_second": 9.862, "eval_steps_per_second": 2.514, "step": 180 }, { "epoch": 0.6042296072507553, "grad_norm": 16.525339126586914, "learning_rate": 1.758308157099698e-05, "loss": 0.3166, "step": 200 }, { "epoch": 0.6042296072507553, "eval_accuracy": 0.869281045751634, "eval_loss": 0.33075031638145447, "eval_runtime": 15.5417, "eval_samples_per_second": 9.845, "eval_steps_per_second": 2.509, "step": 200 }, { "epoch": 0.6646525679758308, "grad_norm": 1.5080480575561523, "learning_rate": 1.7341389728096677e-05, "loss": 0.4018, "step": 220 }, { "epoch": 0.6646525679758308, "eval_accuracy": 0.803921568627451, "eval_loss": 0.5285586714744568, "eval_runtime": 15.4908, "eval_samples_per_second": 9.877, "eval_steps_per_second": 2.518, "step": 220 }, { "epoch": 0.7250755287009063, "grad_norm": 21.941341400146484, "learning_rate": 1.7099697885196376e-05, "loss": 0.3007, "step": 240 }, { "epoch": 0.7250755287009063, "eval_accuracy": 0.8627450980392157, "eval_loss": 0.584548830986023, "eval_runtime": 15.5027, "eval_samples_per_second": 9.869, "eval_steps_per_second": 2.516, "step": 240 }, { "epoch": 0.7854984894259819, "grad_norm": 7.442047119140625, "learning_rate": 1.6858006042296072e-05, "loss": 0.4893, "step": 260 }, { "epoch": 0.7854984894259819, "eval_accuracy": 0.8627450980392157, "eval_loss": 0.36624589562416077, "eval_runtime": 15.471, "eval_samples_per_second": 9.889, "eval_steps_per_second": 2.521, "step": 260 }, { "epoch": 0.8459214501510574, "grad_norm": 27.062305450439453, "learning_rate": 1.661631419939577e-05, "loss": 0.274, "step": 280 }, { "epoch": 0.8459214501510574, "eval_accuracy": 0.869281045751634, "eval_loss": 0.34829556941986084, "eval_runtime": 15.4719, "eval_samples_per_second": 9.889, "eval_steps_per_second": 2.521, "step": 280 }, { "epoch": 0.9063444108761329, "grad_norm": 9.517237663269043, "learning_rate": 1.637462235649547e-05, "loss": 0.5741, "step": 300 }, { "epoch": 0.9063444108761329, "eval_accuracy": 0.8823529411764706, "eval_loss": 0.32800009846687317, "eval_runtime": 15.4828, "eval_samples_per_second": 9.882, "eval_steps_per_second": 2.519, "step": 300 }, { "epoch": 0.9667673716012085, "grad_norm": 40.94089889526367, "learning_rate": 1.6132930513595166e-05, "loss": 0.3752, "step": 320 }, { "epoch": 0.9667673716012085, "eval_accuracy": 0.8888888888888888, "eval_loss": 0.5250552296638489, "eval_runtime": 15.485, "eval_samples_per_second": 9.881, "eval_steps_per_second": 2.519, "step": 320 }, { "epoch": 1.027190332326284, "grad_norm": 0.2767094373703003, "learning_rate": 1.5891238670694865e-05, "loss": 0.2711, "step": 340 }, { "epoch": 1.027190332326284, "eval_accuracy": 0.8562091503267973, "eval_loss": 0.6096686720848083, "eval_runtime": 15.4954, "eval_samples_per_second": 9.874, "eval_steps_per_second": 2.517, "step": 340 }, { "epoch": 1.0876132930513596, "grad_norm": 1.7755597829818726, "learning_rate": 1.5649546827794565e-05, "loss": 0.2369, "step": 360 }, { "epoch": 1.0876132930513596, "eval_accuracy": 0.869281045751634, "eval_loss": 0.5457373857498169, "eval_runtime": 15.479, "eval_samples_per_second": 9.884, "eval_steps_per_second": 2.52, "step": 360 }, { "epoch": 1.148036253776435, "grad_norm": 197.30111694335938, "learning_rate": 1.540785498489426e-05, "loss": 0.3756, "step": 380 }, { "epoch": 1.148036253776435, "eval_accuracy": 0.8758169934640523, "eval_loss": 0.6890403628349304, "eval_runtime": 15.448, "eval_samples_per_second": 9.904, "eval_steps_per_second": 2.525, "step": 380 }, { "epoch": 1.2084592145015105, "grad_norm": 0.16098028421401978, "learning_rate": 1.516616314199396e-05, "loss": 0.6575, "step": 400 }, { "epoch": 1.2084592145015105, "eval_accuracy": 0.869281045751634, "eval_loss": 0.47085481882095337, "eval_runtime": 15.4824, "eval_samples_per_second": 9.882, "eval_steps_per_second": 2.519, "step": 400 }, { "epoch": 1.2688821752265862, "grad_norm": 0.3975774943828583, "learning_rate": 1.4924471299093657e-05, "loss": 0.3268, "step": 420 }, { "epoch": 1.2688821752265862, "eval_accuracy": 0.8496732026143791, "eval_loss": 0.5218892693519592, "eval_runtime": 15.4867, "eval_samples_per_second": 9.879, "eval_steps_per_second": 2.518, "step": 420 }, { "epoch": 1.3293051359516617, "grad_norm": 8.90912914276123, "learning_rate": 1.4682779456193355e-05, "loss": 0.3994, "step": 440 }, { "epoch": 1.3293051359516617, "eval_accuracy": 0.869281045751634, "eval_loss": 0.42816421389579773, "eval_runtime": 15.4691, "eval_samples_per_second": 9.891, "eval_steps_per_second": 2.521, "step": 440 }, { "epoch": 1.3897280966767371, "grad_norm": 93.57767486572266, "learning_rate": 1.4441087613293053e-05, "loss": 0.0879, "step": 460 }, { "epoch": 1.3897280966767371, "eval_accuracy": 0.8758169934640523, "eval_loss": 0.6293966174125671, "eval_runtime": 15.4702, "eval_samples_per_second": 9.89, "eval_steps_per_second": 2.521, "step": 460 }, { "epoch": 1.4501510574018126, "grad_norm": 0.061734456568956375, "learning_rate": 1.419939577039275e-05, "loss": 0.2566, "step": 480 }, { "epoch": 1.4501510574018126, "eval_accuracy": 0.8627450980392157, "eval_loss": 0.7143150568008423, "eval_runtime": 15.449, "eval_samples_per_second": 9.904, "eval_steps_per_second": 2.524, "step": 480 }, { "epoch": 1.510574018126888, "grad_norm": 0.11357846111059189, "learning_rate": 1.3957703927492448e-05, "loss": 0.2897, "step": 500 }, { "epoch": 1.510574018126888, "eval_accuracy": 0.869281045751634, "eval_loss": 0.6120204329490662, "eval_runtime": 15.463, "eval_samples_per_second": 9.895, "eval_steps_per_second": 2.522, "step": 500 } ], "logging_steps": 20, "max_steps": 1655, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1604904119249676.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }