{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.3333333333333333, "eval_steps": 100, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.13333333333333333, "grad_norm": 0.5337432026863098, "learning_rate": 0.0004999965616101344, "logits/chosen": 0.23934367299079895, "logits/rejected": 0.42762428522109985, "logps/chosen": -113.45499420166016, "logps/rejected": -75.61299896240234, "loss": 0.271, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.9617708325386047, "rewards/margins": 3.3194146156311035, "rewards/rejected": -2.3576436042785645, "step": 10 }, { "epoch": 0.26666666666666666, "grad_norm": 0.0004492771113291383, "learning_rate": 0.0004999690550586881, "logits/chosen": 0.39065027236938477, "logits/rejected": 0.9774702191352844, "logps/chosen": -103.68326568603516, "logps/rejected": -159.09165954589844, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 1.5827335119247437, "rewards/margins": 12.054422378540039, "rewards/rejected": -10.471688270568848, "step": 20 }, { "epoch": 0.4, "grad_norm": 0.009628095664083958, "learning_rate": 0.0004999140449822787, "logits/chosen": 0.39641499519348145, "logits/rejected": 1.1506410837173462, "logps/chosen": -112.4819564819336, "logps/rejected": -203.46591186523438, "loss": 0.0144, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.7971093654632568, "rewards/margins": 15.904172897338867, "rewards/rejected": -15.107061386108398, "step": 30 }, { "epoch": 0.5333333333333333, "grad_norm": 0.16114771366119385, "learning_rate": 0.0004998315374335394, "logits/chosen": 0.42631012201309204, "logits/rejected": 1.436499834060669, "logps/chosen": -120.41705322265625, "logps/rejected": -188.07252502441406, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 0.19870629906654358, "rewards/margins": 13.751760482788086, "rewards/rejected": -13.553054809570312, "step": 40 }, { "epoch": 0.6666666666666666, "grad_norm": 2.7873742510564625e-05, "learning_rate": 0.0004997215414905875, "logits/chosen": 0.45363932847976685, "logits/rejected": 1.7247793674468994, "logps/chosen": -107.3564453125, "logps/rejected": -212.8771514892578, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 1.316047191619873, "rewards/margins": 17.600120544433594, "rewards/rejected": -16.284074783325195, "step": 50 }, { "epoch": 0.8, "grad_norm": 2.3321376829699147e-06, "learning_rate": 0.0004995840692560257, "logits/chosen": 0.3636583983898163, "logits/rejected": 1.690006971359253, "logps/chosen": -117.81230163574219, "logps/rejected": -231.50650024414062, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 0.40089258551597595, "rewards/margins": 18.288105010986328, "rewards/rejected": -17.8872127532959, "step": 60 }, { "epoch": 0.9333333333333333, "grad_norm": 9.223950655723456e-06, "learning_rate": 0.0004994191358556106, "logits/chosen": 0.4559127390384674, "logits/rejected": 1.189163088798523, "logps/chosen": -108.7114486694336, "logps/rejected": -249.8244171142578, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.3001081943511963, "rewards/margins": 20.991580963134766, "rewards/rejected": -19.691471099853516, "step": 70 }, { "epoch": 1.0666666666666667, "grad_norm": 1.353669034642735e-07, "learning_rate": 0.0004992267594365879, "logits/chosen": 0.617511510848999, "logits/rejected": 1.2590217590332031, "logps/chosen": -98.4377670288086, "logps/rejected": -260.7948913574219, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 2.062624216079712, "rewards/margins": 22.795236587524414, "rewards/rejected": -20.73261070251465, "step": 80 }, { "epoch": 1.2, "grad_norm": 7.2894854383775964e-06, "learning_rate": 0.0004990069611656963, "logits/chosen": 0.8833354115486145, "logits/rejected": 1.289421796798706, "logps/chosen": -99.6741943359375, "logps/rejected": -247.78750610351562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.3024544715881348, "rewards/margins": 21.798070907592773, "rewards/rejected": -19.49561882019043, "step": 90 }, { "epoch": 1.3333333333333333, "grad_norm": 5.883700850972673e-06, "learning_rate": 0.0004987597652268379, "logits/chosen": 0.957192599773407, "logits/rejected": 1.1422635316848755, "logps/chosen": -107.17437744140625, "logps/rejected": -239.02804565429688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.3617346286773682, "rewards/margins": 20.076194763183594, "rewards/rejected": -18.714458465576172, "step": 100 }, { "epoch": 1.3333333333333333, "eval_logits/chosen": 0.9479974508285522, "eval_logits/rejected": 1.0917534828186035, "eval_logps/chosen": -110.24896240234375, "eval_logps/rejected": -238.44570922851562, "eval_loss": 8.311428246088326e-05, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": 1.203629493713379, "eval_rewards/margins": 19.736553192138672, "eval_rewards/rejected": -18.532922744750977, "eval_runtime": 161.9066, "eval_samples_per_second": 1.235, "eval_steps_per_second": 1.235, "step": 100 } ], "logging_steps": 10, "max_steps": 3000, "num_input_tokens_seen": 0, "num_train_epochs": 40, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }