{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.992, "eval_steps": 500, "global_step": 31, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 777.4896087646484, "epoch": 0.032, "grad_norm": 0.5903529116709957, "kl": 0.0, "learning_rate": 7.5e-07, "loss": 0.0, "reward": 0.5625000223517418, "reward_std": 0.07392781227827072, "rewards/accuracy_reward": 0.5625000223517418, "rewards/format_reward": 0.0, "step": 1 }, { "completion_length": 777.8633041381836, "epoch": 0.16, "grad_norm": 0.380767083039185, "kl": 0.0002472996711730957, "learning_rate": 2.989857536612915e-06, "loss": 0.0, "reward": 0.5247395876795053, "reward_std": 0.03600824438035488, "rewards/accuracy_reward": 0.5247395876795053, "rewards/format_reward": 0.0, "step": 5 }, { "completion_length": 808.4958587646485, "epoch": 0.32, "grad_norm": 0.17029855265794955, "kl": 0.0011264801025390625, "learning_rate": 2.649066664678467e-06, "loss": 0.0, "reward": 0.48437500894069674, "reward_std": 0.04581574220210314, "rewards/accuracy_reward": 0.48437500894069674, "rewards/format_reward": 0.0, "step": 10 }, { "completion_length": 786.0593933105469, "epoch": 0.48, "grad_norm": 3.440906400450222, "kl": 0.018999862670898437, "learning_rate": 1.9302048490666355e-06, "loss": 0.0008, "reward": 0.47604167386889457, "reward_std": 0.039101397059857844, "rewards/accuracy_reward": 0.47604167386889457, "rewards/format_reward": 0.0, "step": 15 }, { "completion_length": 777.956265258789, "epoch": 0.64, "grad_norm": 0.3120699062992745, "kl": 0.006772041320800781, "learning_rate": 1.069795150933365e-06, "loss": 0.0003, "reward": 0.47083334177732467, "reward_std": 0.029002375528216363, "rewards/accuracy_reward": 0.47083334177732467, "rewards/format_reward": 0.0, "step": 20 }, { "completion_length": 810.5562713623046, "epoch": 0.8, "grad_norm": 0.2740538627520351, "kl": 0.003619384765625, "learning_rate": 3.5093333532153313e-07, "loss": 0.0001, "reward": 0.49375000819563863, "reward_std": 0.03593750298023224, "rewards/accuracy_reward": 0.49375000819563863, "rewards/format_reward": 0.0, "step": 25 }, { "completion_length": 798.4125213623047, "epoch": 0.96, "grad_norm": 9.502030258134457, "kl": 0.008373069763183593, "learning_rate": 1.0142463387085465e-08, "loss": 0.0003, "reward": 0.5312500089406967, "reward_std": 0.05033150520175696, "rewards/accuracy_reward": 0.5312500089406967, "rewards/format_reward": 0.0, "step": 30 }, { "completion_length": 795.6406402587891, "epoch": 0.992, "kl": 0.006107330322265625, "reward": 0.5885416716337204, "reward_std": 0.012757758609950542, "rewards/accuracy_reward": 0.5885416716337204, "rewards/format_reward": 0.0, "step": 31, "total_flos": 0.0, "train_loss": 0.00025426253455163744, "train_runtime": 3666.8179, "train_samples_per_second": 0.273, "train_steps_per_second": 0.008 } ], "logging_steps": 5, "max_steps": 31, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }