{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998045729919874, "eval_steps": 500, "global_step": 1279, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.050029314051201874, "grad_norm": 6.638810634613037, "learning_rate": 5e-07, "logits/chosen": -0.5138859748840332, "logits/rejected": -0.4518983066082001, "logps/chosen": -77.28889465332031, "logps/rejected": -14.844705581665039, "loss": 0.6924, "rewards/accuracies": 0.50390625, "rewards/chosen": 0.0017984423320740461, "rewards/margins": 0.0019690156914293766, "rewards/rejected": -0.00017057315562851727, "step": 64 }, { "epoch": 0.10005862810240375, "grad_norm": 3.6703028678894043, "learning_rate": 1e-06, "logits/chosen": -0.5301803350448608, "logits/rejected": -0.46129050850868225, "logps/chosen": -77.88700866699219, "logps/rejected": -13.736372947692871, "loss": 0.6787, "rewards/accuracies": 0.693359375, "rewards/chosen": 0.026838650926947594, "rewards/margins": 0.02976146526634693, "rewards/rejected": -0.002922814106568694, "step": 128 }, { "epoch": 0.15008794215360563, "grad_norm": 4.980158805847168, "learning_rate": 9.44396177237185e-07, "logits/chosen": -0.5469677448272705, "logits/rejected": -0.48332178592681885, "logps/chosen": -77.87326049804688, "logps/rejected": -14.177068710327148, "loss": 0.6161, "rewards/accuracies": 0.82421875, "rewards/chosen": 0.15434227883815765, "rewards/margins": 0.17204590141773224, "rewards/rejected": -0.017703618854284286, "step": 192 }, { "epoch": 0.2001172562048075, "grad_norm": 2.8744795322418213, "learning_rate": 8.887923544743701e-07, "logits/chosen": -0.5636645555496216, "logits/rejected": -0.5051460266113281, "logps/chosen": -63.355735778808594, "logps/rejected": -12.730010032653809, "loss": 0.5124, "rewards/accuracies": 0.84375, "rewards/chosen": 0.3969045877456665, "rewards/margins": 0.5048438310623169, "rewards/rejected": -0.10793925076723099, "step": 256 }, { "epoch": 0.2501465702560094, "grad_norm": 2.2977707386016846, "learning_rate": 8.331885317115551e-07, "logits/chosen": -0.5301216840744019, "logits/rejected": -0.47033828496932983, "logps/chosen": -65.85761260986328, "logps/rejected": -15.843816757202148, "loss": 0.3878, "rewards/accuracies": 0.888671875, "rewards/chosen": 0.79034823179245, "rewards/margins": 1.061028003692627, "rewards/rejected": -0.2706798315048218, "step": 320 }, { "epoch": 0.30017588430721126, "grad_norm": 8.175545692443848, "learning_rate": 7.775847089487402e-07, "logits/chosen": -0.5346195697784424, "logits/rejected": -0.4867401123046875, "logps/chosen": -57.788429260253906, "logps/rejected": -16.67302894592285, "loss": 0.3424, "rewards/accuracies": 0.904296875, "rewards/chosen": 0.9831899404525757, "rewards/margins": 1.4113095998764038, "rewards/rejected": -0.42811962962150574, "step": 384 }, { "epoch": 0.35020519835841313, "grad_norm": 5.475202560424805, "learning_rate": 7.219808861859253e-07, "logits/chosen": -0.5421884059906006, "logits/rejected": -0.4896155595779419, "logps/chosen": -59.334922790527344, "logps/rejected": -18.624767303466797, "loss": 0.265, "rewards/accuracies": 0.970703125, "rewards/chosen": 1.2069406509399414, "rewards/margins": 1.8477783203125, "rewards/rejected": -0.640837550163269, "step": 448 }, { "epoch": 0.400234512409615, "grad_norm": 3.7930808067321777, "learning_rate": 6.663770634231103e-07, "logits/chosen": -0.5423855185508728, "logits/rejected": -0.5051375031471252, "logps/chosen": -58.52878952026367, "logps/rejected": -23.4506893157959, "loss": 0.182, "rewards/accuracies": 0.986328125, "rewards/chosen": 1.3336964845657349, "rewards/margins": 2.311509132385254, "rewards/rejected": -0.977812647819519, "step": 512 }, { "epoch": 0.45026382646081686, "grad_norm": 7.777218818664551, "learning_rate": 6.107732406602954e-07, "logits/chosen": -0.5627282857894897, "logits/rejected": -0.5052539110183716, "logps/chosen": -60.55673599243164, "logps/rejected": -26.99791717529297, "loss": 0.1096, "rewards/accuracies": 0.982421875, "rewards/chosen": 1.3532178401947021, "rewards/margins": 2.905151605606079, "rewards/rejected": -1.5519336462020874, "step": 576 }, { "epoch": 0.5002931405120188, "grad_norm": 0.5906669497489929, "learning_rate": 5.551694178974804e-07, "logits/chosen": -0.5611196160316467, "logits/rejected": -0.5089166164398193, "logps/chosen": -59.12473678588867, "logps/rejected": -32.835243225097656, "loss": 0.0648, "rewards/accuracies": 0.9921875, "rewards/chosen": 1.443546175956726, "rewards/margins": 3.489488124847412, "rewards/rejected": -2.0459418296813965, "step": 640 }, { "epoch": 0.5503224545632206, "grad_norm": 0.7606092095375061, "learning_rate": 4.995655951346655e-07, "logits/chosen": -0.5526726245880127, "logits/rejected": -0.49657124280929565, "logps/chosen": -57.72775650024414, "logps/rejected": -36.90594482421875, "loss": 0.0546, "rewards/accuracies": 0.990234375, "rewards/chosen": 1.4812407493591309, "rewards/margins": 3.8918285369873047, "rewards/rejected": -2.410587787628174, "step": 704 }, { "epoch": 0.6003517686144225, "grad_norm": 0.8932979702949524, "learning_rate": 4.4396177237185057e-07, "logits/chosen": -0.5280415415763855, "logits/rejected": -0.4753148555755615, "logps/chosen": -58.317237854003906, "logps/rejected": -40.71867370605469, "loss": 0.0436, "rewards/accuracies": 0.990234375, "rewards/chosen": 1.4664157629013062, "rewards/margins": 4.119485855102539, "rewards/rejected": -2.6530702114105225, "step": 768 }, { "epoch": 0.6503810826656244, "grad_norm": 1.0950371026992798, "learning_rate": 3.8835794960903563e-07, "logits/chosen": -0.523577094078064, "logits/rejected": -0.464932382106781, "logps/chosen": -58.45826721191406, "logps/rejected": -40.988529205322266, "loss": 0.0442, "rewards/accuracies": 0.990234375, "rewards/chosen": 1.5603256225585938, "rewards/margins": 4.361178398132324, "rewards/rejected": -2.8008527755737305, "step": 832 }, { "epoch": 0.7004103967168263, "grad_norm": 0.8026629686355591, "learning_rate": 3.327541268462207e-07, "logits/chosen": -0.5141459703445435, "logits/rejected": -0.44692087173461914, "logps/chosen": -59.13020324707031, "logps/rejected": -41.566627502441406, "loss": 0.038, "rewards/accuracies": 0.994140625, "rewards/chosen": 1.5455219745635986, "rewards/margins": 4.467495918273926, "rewards/rejected": -2.9219741821289062, "step": 896 }, { "epoch": 0.7504397107680282, "grad_norm": 0.05718870088458061, "learning_rate": 2.7715030408340575e-07, "logits/chosen": -0.5028055906295776, "logits/rejected": -0.43091291189193726, "logps/chosen": -56.72560501098633, "logps/rejected": -42.42565155029297, "loss": 0.0423, "rewards/accuracies": 0.986328125, "rewards/chosen": 1.564468264579773, "rewards/margins": 4.592702865600586, "rewards/rejected": -3.0282342433929443, "step": 960 }, { "epoch": 0.80046902481923, "grad_norm": 0.14965815842151642, "learning_rate": 2.215464813205908e-07, "logits/chosen": -0.510848343372345, "logits/rejected": -0.4440664052963257, "logps/chosen": -56.914100646972656, "logps/rejected": -42.933258056640625, "loss": 0.0307, "rewards/accuracies": 0.9921875, "rewards/chosen": 1.552268147468567, "rewards/margins": 4.621021270751953, "rewards/rejected": -3.068753242492676, "step": 1024 }, { "epoch": 0.8504983388704319, "grad_norm": 0.4853415787220001, "learning_rate": 1.6594265855777585e-07, "logits/chosen": -0.5129883289337158, "logits/rejected": -0.4405321180820465, "logps/chosen": -56.04991912841797, "logps/rejected": -43.990997314453125, "loss": 0.0213, "rewards/accuracies": 0.994140625, "rewards/chosen": 1.5749919414520264, "rewards/margins": 4.778657913208008, "rewards/rejected": -3.2036657333374023, "step": 1088 }, { "epoch": 0.9005276529216337, "grad_norm": 0.44261807203292847, "learning_rate": 1.103388357949609e-07, "logits/chosen": -0.5202418565750122, "logits/rejected": -0.4409845471382141, "logps/chosen": -58.534915924072266, "logps/rejected": -44.22764205932617, "loss": 0.0335, "rewards/accuracies": 0.98828125, "rewards/chosen": 1.6060659885406494, "rewards/margins": 4.796003341674805, "rewards/rejected": -3.189937114715576, "step": 1152 }, { "epoch": 0.9505569669728356, "grad_norm": 0.2634647786617279, "learning_rate": 5.4735013032145953e-08, "logits/chosen": -0.48342519998550415, "logits/rejected": -0.4142173230648041, "logps/chosen": -57.232521057128906, "logps/rejected": -44.50954818725586, "loss": 0.0315, "rewards/accuracies": 0.990234375, "rewards/chosen": 1.5877962112426758, "rewards/margins": 4.721351623535156, "rewards/rejected": -3.1335554122924805, "step": 1216 }, { "epoch": 0.9998045729919874, "step": 1279, "total_flos": 7.724459437129728e+17, "train_loss": 0.21135991807092067, "train_runtime": 19931.1539, "train_samples_per_second": 0.513, "train_steps_per_second": 0.064 } ], "logging_steps": 64, "max_steps": 1279, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.724459437129728e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }