{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.08815232722143865, "eval_steps": 500, "global_step": 2500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01763046544428773, "grad_norm": 1.5435048341751099, "learning_rate": 4.9706158909261876e-05, "loss": 0.8407, "step": 500 }, { "epoch": 0.01763046544428773, "eval_accuracy": 0.8252411503248731, "eval_loss": 0.7565935850143433, "eval_runtime": 2895.6955, "eval_samples_per_second": 32.989, "eval_steps_per_second": 0.516, "step": 500 }, { "epoch": 0.03526093088857546, "grad_norm": 1.5116485357284546, "learning_rate": 4.9412317818523744e-05, "loss": 0.5964, "step": 1000 }, { "epoch": 0.03526093088857546, "eval_accuracy": 0.8373293281429335, "eval_loss": 0.6963507533073425, "eval_runtime": 2899.1751, "eval_samples_per_second": 32.949, "eval_steps_per_second": 0.515, "step": 1000 }, { "epoch": 0.05289139633286319, "grad_norm": 1.4373358488082886, "learning_rate": 4.911847672778562e-05, "loss": 0.5661, "step": 1500 }, { "epoch": 0.05289139633286319, "eval_accuracy": 0.8443953465863471, "eval_loss": 0.6656736731529236, "eval_runtime": 2944.9636, "eval_samples_per_second": 32.437, "eval_steps_per_second": 0.507, "step": 1500 }, { "epoch": 0.07052186177715092, "grad_norm": 1.216012716293335, "learning_rate": 4.882463563704749e-05, "loss": 0.5402, "step": 2000 }, { "epoch": 0.07052186177715092, "eval_accuracy": 0.8482718545347777, "eval_loss": 0.6440214514732361, "eval_runtime": 2944.5243, "eval_samples_per_second": 32.442, "eval_steps_per_second": 0.507, "step": 2000 }, { "epoch": 0.08815232722143865, "grad_norm": 1.0847452878952026, "learning_rate": 4.853079454630936e-05, "loss": 0.5237, "step": 2500 }, { "epoch": 0.08815232722143865, "eval_accuracy": 0.8508165457808422, "eval_loss": 0.6308088898658752, "eval_runtime": 2933.5042, "eval_samples_per_second": 32.564, "eval_steps_per_second": 0.509, "step": 2500 } ], "logging_steps": 500, "max_steps": 85080, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 3.70943641780224e+18, "train_batch_size": 64, "trial_name": null, "trial_params": null }