{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9905956112852664, "eval_steps": 500, "global_step": 79, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012539184952978056, "grad_norm": 9.408438376530524, "learning_rate": 6.25e-08, "logits/chosen": -2.9077322483062744, "logits/rejected": -2.8318910598754883, "logps/chosen": -351.8885498046875, "logps/pi_response": -76.32845306396484, "logps/ref_response": -76.32845306396484, "logps/rejected": -169.29762268066406, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.12539184952978055, "grad_norm": 8.382734262861335, "learning_rate": 4.990217055187362e-07, "logits/chosen": -2.7861804962158203, "logits/rejected": -2.762985944747925, "logps/chosen": -234.13641357421875, "logps/pi_response": -70.14045715332031, "logps/ref_response": -70.02328491210938, "logps/rejected": -167.99607849121094, "loss": 0.6904, "rewards/accuracies": 0.5347222089767456, "rewards/chosen": 0.006987536326050758, "rewards/margins": 0.004230231046676636, "rewards/rejected": 0.0027573055122047663, "step": 10 }, { "epoch": 0.2507836990595611, "grad_norm": 6.754671268873102, "learning_rate": 4.655786431300069e-07, "logits/chosen": -2.7409508228302, "logits/rejected": -2.689985752105713, "logps/chosen": -245.8350067138672, "logps/pi_response": -75.90806579589844, "logps/ref_response": -67.40553283691406, "logps/rejected": -170.30355834960938, "loss": 0.6606, "rewards/accuracies": 0.721875011920929, "rewards/chosen": 0.04610520228743553, "rewards/margins": 0.0813165009021759, "rewards/rejected": -0.03521129488945007, "step": 20 }, { "epoch": 0.3761755485893417, "grad_norm": 6.575914102976567, "learning_rate": 3.9061232191019517e-07, "logits/chosen": -2.6522345542907715, "logits/rejected": -2.618391275405884, "logps/chosen": -233.65853881835938, "logps/pi_response": -102.7225112915039, "logps/ref_response": -65.888427734375, "logps/rejected": -187.14553833007812, "loss": 0.6248, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": -0.10190945863723755, "rewards/margins": 0.1844368726015091, "rewards/rejected": -0.28634634613990784, "step": 30 }, { "epoch": 0.5015673981191222, "grad_norm": 7.971938899453216, "learning_rate": 2.8856223324132555e-07, "logits/chosen": -2.6599223613739014, "logits/rejected": -2.6376564502716064, "logps/chosen": -255.1501007080078, "logps/pi_response": -138.35655212402344, "logps/ref_response": -70.97199249267578, "logps/rejected": -225.760498046875, "loss": 0.5773, "rewards/accuracies": 0.765625, "rewards/chosen": -0.22394080460071564, "rewards/margins": 0.34193405508995056, "rewards/rejected": -0.5658749341964722, "step": 40 }, { "epoch": 0.6269592476489029, "grad_norm": 10.038764882605939, "learning_rate": 1.7908455541642582e-07, "logits/chosen": -2.6536831855773926, "logits/rejected": -2.6201894283294678, "logps/chosen": -285.1458435058594, "logps/pi_response": -155.6894989013672, "logps/ref_response": -69.12784576416016, "logps/rejected": -250.23306274414062, "loss": 0.5459, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.3502059578895569, "rewards/margins": 0.4657812714576721, "rewards/rejected": -0.815987229347229, "step": 50 }, { "epoch": 0.7523510971786834, "grad_norm": 10.581283000102326, "learning_rate": 8.32661172908373e-08, "logits/chosen": -2.649432897567749, "logits/rejected": -2.614516019821167, "logps/chosen": -256.6278076171875, "logps/pi_response": -162.4719696044922, "logps/ref_response": -62.94016647338867, "logps/rejected": -266.3846435546875, "loss": 0.5258, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.44566231966018677, "rewards/margins": 0.5812581181526184, "rewards/rejected": -1.0269205570220947, "step": 60 }, { "epoch": 0.877742946708464, "grad_norm": 10.940165896983189, "learning_rate": 1.956279997278043e-08, "logits/chosen": -2.6477224826812744, "logits/rejected": -2.610698699951172, "logps/chosen": -302.5779724121094, "logps/pi_response": -183.13412475585938, "logps/ref_response": -70.71024322509766, "logps/rejected": -277.17822265625, "loss": 0.5125, "rewards/accuracies": 0.778124988079071, "rewards/chosen": -0.5014069676399231, "rewards/margins": 0.6355606913566589, "rewards/rejected": -1.1369677782058716, "step": 70 }, { "epoch": 0.9905956112852664, "step": 79, "total_flos": 0.0, "train_loss": 0.583715951895412, "train_runtime": 3516.8514, "train_samples_per_second": 5.794, "train_steps_per_second": 0.022 } ], "logging_steps": 10, "max_steps": 79, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }