{ "best_metric": 0.6702544031311155, "best_model_checkpoint": "tiny-bert-sst2-distilled/run-2/checkpoint-576", "epoch": 9.0, "eval_steps": 500, "global_step": 864, "is_hyper_param_search": true, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, "grad_norm": 0.663398027420044, "learning_rate": 0.00027867701883546326, "loss": 0.2787, "step": 96 }, { "epoch": 1.0, "eval_accuracy": 0.5499021526418787, "eval_f1": 0.24092409240924093, "eval_loss": 0.25340619683265686, "eval_precision": 0.7684210526315789, "eval_recall": 0.14285714285714285, "eval_runtime": 28.5207, "eval_samples_per_second": 35.834, "eval_steps_per_second": 1.122, "step": 96 }, { "epoch": 2.0, "grad_norm": 0.7603411078453064, "learning_rate": 0.00024384239148103037, "loss": 0.2582, "step": 192 }, { "epoch": 2.0, "eval_accuracy": 0.5675146771037182, "eval_f1": 0.300632911392405, "eval_loss": 0.2482573688030243, "eval_precision": 0.7851239669421488, "eval_recall": 0.18590998043052837, "eval_runtime": 28.5782, "eval_samples_per_second": 35.762, "eval_steps_per_second": 1.12, "step": 192 }, { "epoch": 3.0, "grad_norm": 0.6440157294273376, "learning_rate": 0.00020900776412659743, "loss": 0.2508, "step": 288 }, { "epoch": 3.0, "eval_accuracy": 0.6027397260273972, "eval_f1": 0.40117994100294985, "eval_loss": 0.25017455220222473, "eval_precision": 0.8143712574850299, "eval_recall": 0.26614481409001955, "eval_runtime": 28.8169, "eval_samples_per_second": 35.465, "eval_steps_per_second": 1.11, "step": 288 }, { "epoch": 4.0, "grad_norm": 0.8489454388618469, "learning_rate": 0.00017417313677216454, "loss": 0.2472, "step": 384 }, { "epoch": 4.0, "eval_accuracy": 0.576320939334638, "eval_f1": 0.304975922953451, "eval_loss": 0.2455543428659439, "eval_precision": 0.8482142857142857, "eval_recall": 0.18590998043052837, "eval_runtime": 28.3606, "eval_samples_per_second": 36.036, "eval_steps_per_second": 1.128, "step": 384 }, { "epoch": 5.0, "grad_norm": 0.6578030586242676, "learning_rate": 0.00013933850941773163, "loss": 0.2444, "step": 480 }, { "epoch": 5.0, "eval_accuracy": 0.5949119373776908, "eval_f1": 0.37082066869300917, "eval_loss": 0.24357673525810242, "eval_precision": 0.8299319727891157, "eval_recall": 0.23874755381604695, "eval_runtime": 28.5893, "eval_samples_per_second": 35.748, "eval_steps_per_second": 1.119, "step": 480 }, { "epoch": 6.0, "grad_norm": 0.576878547668457, "learning_rate": 0.00010450388206329872, "loss": 0.244, "step": 576 }, { "epoch": 6.0, "eval_accuracy": 0.6702544031311155, "eval_f1": 0.5536423841059602, "eval_loss": 0.24780145287513733, "eval_precision": 0.8565573770491803, "eval_recall": 0.4090019569471624, "eval_runtime": 28.8228, "eval_samples_per_second": 35.458, "eval_steps_per_second": 1.11, "step": 576 }, { "epoch": 7.0, "grad_norm": 0.42660632729530334, "learning_rate": 6.966925470886582e-05, "loss": 0.2418, "step": 672 }, { "epoch": 7.0, "eval_accuracy": 0.6350293542074364, "eval_f1": 0.4709219858156028, "eval_loss": 0.2434568703174591, "eval_precision": 0.8556701030927835, "eval_recall": 0.324853228962818, "eval_runtime": 28.6835, "eval_samples_per_second": 35.63, "eval_steps_per_second": 1.116, "step": 672 }, { "epoch": 8.0, "grad_norm": 0.7299667000770569, "learning_rate": 3.483462735443291e-05, "loss": 0.2396, "step": 768 }, { "epoch": 8.0, "eval_accuracy": 0.6203522504892368, "eval_f1": 0.43440233236151604, "eval_loss": 0.24256332218647003, "eval_precision": 0.8514285714285714, "eval_recall": 0.29158512720156554, "eval_runtime": 29.1745, 
"eval_samples_per_second": 35.031, "eval_steps_per_second": 1.097, "step": 768 }, { "epoch": 9.0, "grad_norm": 0.48326733708381653, "learning_rate": 0.0, "loss": 0.2396, "step": 864 }, { "epoch": 9.0, "eval_accuracy": 0.6291585127201565, "eval_f1": 0.4562410329985653, "eval_loss": 0.2423153966665268, "eval_precision": 0.8548387096774194, "eval_recall": 0.3111545988258317, "eval_runtime": 28.281, "eval_samples_per_second": 36.137, "eval_steps_per_second": 1.131, "step": 864 } ], "logging_steps": 500, "max_steps": 864, "num_input_tokens_seen": 0, "num_train_epochs": 9, "save_steps": 500, "total_flos": 2121256775520.0, "train_batch_size": 32, "trial_name": null, "trial_params": { "alpha": 0.3483609408663828, "learning_rate": 0.0003135116461898962, "num_train_epochs": 9, "temperature": 7 } }