{ "best_metric": 0.6673267326732674, "best_model_checkpoint": "tiny-bert-sst2-distilled/run-1/checkpoint-2970", "epoch": 10.0, "eval_steps": 500, "global_step": 2970, "is_hyper_param_search": true, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, "grad_norm": 0.8816747665405273, "learning_rate": 0.0001319007893143593, "loss": 0.4812, "step": 297 }, { "epoch": 1.0, "eval_accuracy": 0.5188118811881188, "eval_loss": 0.4719693958759308, "eval_runtime": 14.5929, "eval_samples_per_second": 34.606, "eval_steps_per_second": 1.096, "step": 297 }, { "epoch": 2.0, "grad_norm": 0.699450671672821, "learning_rate": 0.00011724514605720826, "loss": 0.4696, "step": 594 }, { "epoch": 2.0, "eval_accuracy": 0.5801980198019802, "eval_loss": 0.46854168176651, "eval_runtime": 15.1826, "eval_samples_per_second": 33.262, "eval_steps_per_second": 1.054, "step": 594 }, { "epoch": 3.0, "grad_norm": 1.2772634029388428, "learning_rate": 0.00010258950280005722, "loss": 0.4576, "step": 891 }, { "epoch": 3.0, "eval_accuracy": 0.6336633663366337, "eval_loss": 0.46670353412628174, "eval_runtime": 14.6186, "eval_samples_per_second": 34.545, "eval_steps_per_second": 1.094, "step": 891 }, { "epoch": 4.0, "grad_norm": 1.0283604860305786, "learning_rate": 8.793385954290618e-05, "loss": 0.4458, "step": 1188 }, { "epoch": 4.0, "eval_accuracy": 0.6356435643564357, "eval_loss": 0.4543105363845825, "eval_runtime": 15.6547, "eval_samples_per_second": 32.259, "eval_steps_per_second": 1.022, "step": 1188 }, { "epoch": 5.0, "grad_norm": 1.320448398590088, "learning_rate": 7.327821628575516e-05, "loss": 0.4413, "step": 1485 }, { "epoch": 5.0, "eval_accuracy": 0.6415841584158416, "eval_loss": 0.4486277401447296, "eval_runtime": 14.9223, "eval_samples_per_second": 33.842, "eval_steps_per_second": 1.072, "step": 1485 }, { "epoch": 6.0, "grad_norm": 1.926311731338501, "learning_rate": 5.862257302860413e-05, "loss": 0.4367, "step": 1782 }, { "epoch": 6.0, "eval_accuracy": 0.6336633663366337, "eval_loss": 0.44990047812461853, "eval_runtime": 15.0105, "eval_samples_per_second": 33.643, "eval_steps_per_second": 1.066, "step": 1782 }, { "epoch": 7.0, "grad_norm": 1.3797311782836914, "learning_rate": 4.396692977145309e-05, "loss": 0.4336, "step": 2079 }, { "epoch": 7.0, "eval_accuracy": 0.6574257425742575, "eval_loss": 0.4465705454349518, "eval_runtime": 14.9751, "eval_samples_per_second": 33.723, "eval_steps_per_second": 1.068, "step": 2079 }, { "epoch": 8.0, "grad_norm": 1.7872868776321411, "learning_rate": 2.9311286514302065e-05, "loss": 0.4305, "step": 2376 }, { "epoch": 8.0, "eval_accuracy": 0.6534653465346535, "eval_loss": 0.44232580065727234, "eval_runtime": 15.2623, "eval_samples_per_second": 33.088, "eval_steps_per_second": 1.048, "step": 2376 }, { "epoch": 9.0, "grad_norm": 1.2580420970916748, "learning_rate": 1.4655643257151032e-05, "loss": 0.4287, "step": 2673 }, { "epoch": 9.0, "eval_accuracy": 0.6574257425742575, "eval_loss": 0.4441685378551483, "eval_runtime": 17.0285, "eval_samples_per_second": 29.656, "eval_steps_per_second": 0.94, "step": 2673 }, { "epoch": 10.0, "grad_norm": 1.7653656005859375, "learning_rate": 0.0, "loss": 0.4265, "step": 2970 }, { "epoch": 10.0, "eval_accuracy": 0.6673267326732674, "eval_loss": 0.44359874725341797, "eval_runtime": 15.5409, "eval_samples_per_second": 32.495, "eval_steps_per_second": 1.03, "step": 2970 } ], "logging_steps": 500, "max_steps": 2970, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 7778432831400.0, "train_batch_size": 32, "trial_name": null, "trial_params": { "alpha": 0.6312656697409034, "learning_rate": 0.00014655643257151032, "num_train_epochs": 10, "temperature": 21 } }