{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9904153354632586, "eval_steps": 500, "global_step": 468, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006389776357827476, "grad_norm": 38.53680030738914, "learning_rate": 1.0638297872340425e-08, "logits/chosen": -3.453125, "logits/rejected": -3.4375, "logps/chosen": -139.0, "logps/rejected": -128.0, "loss": 0.6914, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.06389776357827476, "grad_norm": 37.086320471160064, "learning_rate": 1.0638297872340425e-07, "logits/chosen": -3.5625, "logits/rejected": -3.5, "logps/chosen": -174.0, "logps/rejected": -170.0, "loss": 0.6903, "rewards/accuracies": 0.1875, "rewards/chosen": 0.00677490234375, "rewards/margins": 0.00555419921875, "rewards/rejected": 0.00121307373046875, "step": 10 }, { "epoch": 0.12779552715654952, "grad_norm": 37.797498575960304, "learning_rate": 2.127659574468085e-07, "logits/chosen": -3.578125, "logits/rejected": -3.5625, "logps/chosen": -164.0, "logps/rejected": -149.0, "loss": 0.6924, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.004852294921875, "rewards/margins": 0.0111083984375, "rewards/rejected": -0.006256103515625, "step": 20 }, { "epoch": 0.19169329073482427, "grad_norm": 36.37288176457082, "learning_rate": 3.1914893617021275e-07, "logits/chosen": -3.609375, "logits/rejected": -3.59375, "logps/chosen": -162.0, "logps/rejected": -155.0, "loss": 0.6801, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": 0.004669189453125, "rewards/margins": 0.0390625, "rewards/rejected": -0.0341796875, "step": 30 }, { "epoch": 0.25559105431309903, "grad_norm": 34.81091514767431, "learning_rate": 4.25531914893617e-07, "logits/chosen": -3.578125, "logits/rejected": -3.53125, "logps/chosen": -161.0, "logps/rejected": -142.0, "loss": 0.6685, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.01409912109375, "rewards/margins": 0.06884765625, "rewards/rejected": -0.0830078125, "step": 40 }, { "epoch": 0.3194888178913738, "grad_norm": 32.850826486655734, "learning_rate": 4.96437054631829e-07, "logits/chosen": -3.65625, "logits/rejected": -3.65625, "logps/chosen": -161.0, "logps/rejected": -158.0, "loss": 0.6611, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.0140380859375, "rewards/margins": 0.08984375, "rewards/rejected": -0.07568359375, "step": 50 }, { "epoch": 0.38338658146964855, "grad_norm": 37.177195702506765, "learning_rate": 4.845605700712589e-07, "logits/chosen": -3.71875, "logits/rejected": -3.703125, "logps/chosen": -162.0, "logps/rejected": -153.0, "loss": 0.6238, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0203857421875, "rewards/margins": 0.18359375, "rewards/rejected": -0.203125, "step": 60 }, { "epoch": 0.4472843450479233, "grad_norm": 36.403665892064886, "learning_rate": 4.7268408551068883e-07, "logits/chosen": -3.625, "logits/rejected": -3.59375, "logps/chosen": -169.0, "logps/rejected": -152.0, "loss": 0.6254, "rewards/accuracies": 0.65625, "rewards/chosen": -0.033447265625, "rewards/margins": 0.212890625, "rewards/rejected": -0.24609375, "step": 70 }, { "epoch": 0.5111821086261981, "grad_norm": 38.60720512624817, "learning_rate": 4.6080760095011875e-07, "logits/chosen": -3.625, "logits/rejected": -3.59375, "logps/chosen": -172.0, "logps/rejected": -154.0, "loss": 0.6216, "rewards/accuracies": 0.6875, "rewards/chosen": 0.08056640625, "rewards/margins": 0.388671875, "rewards/rejected": -0.30859375, "step": 80 }, { "epoch": 0.5750798722044729, "grad_norm": 32.642277330645705, "learning_rate": 4.4893111638954866e-07, "logits/chosen": -3.609375, "logits/rejected": -3.578125, "logps/chosen": -166.0, "logps/rejected": -145.0, "loss": 0.6482, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.240234375, "rewards/margins": 0.3046875, "rewards/rejected": -0.546875, "step": 90 }, { "epoch": 0.6389776357827476, "grad_norm": 34.47332747576967, "learning_rate": 4.3705463182897863e-07, "logits/chosen": -3.671875, "logits/rejected": -3.65625, "logps/chosen": -172.0, "logps/rejected": -158.0, "loss": 0.6118, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.205078125, "rewards/margins": 0.380859375, "rewards/rejected": -0.5859375, "step": 100 }, { "epoch": 0.7028753993610224, "grad_norm": 39.45037295466221, "learning_rate": 4.251781472684085e-07, "logits/chosen": -3.640625, "logits/rejected": -3.625, "logps/chosen": -170.0, "logps/rejected": -164.0, "loss": 0.6584, "rewards/accuracies": 0.625, "rewards/chosen": -0.44921875, "rewards/margins": 0.37109375, "rewards/rejected": -0.8203125, "step": 110 }, { "epoch": 0.7667731629392971, "grad_norm": 39.79332640803062, "learning_rate": 4.1330166270783846e-07, "logits/chosen": -3.578125, "logits/rejected": -3.578125, "logps/chosen": -167.0, "logps/rejected": -139.0, "loss": 0.6227, "rewards/accuracies": 0.65625, "rewards/chosen": -0.48046875, "rewards/margins": 0.400390625, "rewards/rejected": -0.87890625, "step": 120 }, { "epoch": 0.8306709265175719, "grad_norm": 33.432327857908874, "learning_rate": 4.0142517814726837e-07, "logits/chosen": -3.609375, "logits/rejected": -3.609375, "logps/chosen": -178.0, "logps/rejected": -152.0, "loss": 0.6071, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.26953125, "rewards/margins": 0.55859375, "rewards/rejected": -0.828125, "step": 130 }, { "epoch": 0.8945686900958466, "grad_norm": 31.757203089299825, "learning_rate": 3.8954869358669834e-07, "logits/chosen": -3.65625, "logits/rejected": -3.578125, "logps/chosen": -167.0, "logps/rejected": -156.0, "loss": 0.6081, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.333984375, "rewards/margins": 0.40625, "rewards/rejected": -0.73828125, "step": 140 }, { "epoch": 0.9584664536741214, "grad_norm": 34.635549945156775, "learning_rate": 3.7767220902612825e-07, "logits/chosen": -3.609375, "logits/rejected": -3.59375, "logps/chosen": -173.0, "logps/rejected": -160.0, "loss": 0.644, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.55078125, "rewards/margins": 0.474609375, "rewards/rejected": -1.03125, "step": 150 }, { "epoch": 0.9968051118210862, "eval_logits/chosen": -3.65625, "eval_logits/rejected": -3.6875, "eval_logps/chosen": -163.0, "eval_logps/rejected": -149.0, "eval_loss": 0.5966406464576721, "eval_rewards/accuracies": 0.6785714030265808, "eval_rewards/chosen": -0.470703125, "eval_rewards/margins": 0.470703125, "eval_rewards/rejected": -0.94140625, "eval_runtime": 12.2769, "eval_samples_per_second": 16.291, "eval_steps_per_second": 0.57, "step": 156 }, { "epoch": 1.0223642172523961, "grad_norm": 17.578277558626937, "learning_rate": 3.6579572446555817e-07, "logits/chosen": -3.59375, "logits/rejected": -3.578125, "logps/chosen": -163.0, "logps/rejected": -165.0, "loss": 0.5154, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.337890625, "rewards/margins": 0.8125, "rewards/rejected": -1.1484375, "step": 160 }, { "epoch": 1.0862619808306708, "grad_norm": 18.045497494579415, "learning_rate": 3.5391923990498813e-07, "logits/chosen": -3.609375, "logits/rejected": -3.5625, "logps/chosen": -166.0, "logps/rejected": -161.0, "loss": 0.2778, "rewards/accuracies": 0.9375, "rewards/chosen": 0.28515625, "rewards/margins": 2.03125, "rewards/rejected": -1.75, "step": 170 }, { "epoch": 1.1501597444089458, "grad_norm": 31.421762384244513, "learning_rate": 3.42042755344418e-07, "logits/chosen": -3.6875, "logits/rejected": -3.65625, "logps/chosen": -169.0, "logps/rejected": -163.0, "loss": 0.2699, "rewards/accuracies": 0.9375, "rewards/chosen": 0.0830078125, "rewards/margins": 1.78125, "rewards/rejected": -1.6953125, "step": 180 }, { "epoch": 1.2140575079872205, "grad_norm": 23.922864036418108, "learning_rate": 3.3016627078384796e-07, "logits/chosen": -3.6875, "logits/rejected": -3.625, "logps/chosen": -170.0, "logps/rejected": -160.0, "loss": 0.2828, "rewards/accuracies": 0.90625, "rewards/chosen": -0.0189208984375, "rewards/margins": 1.96875, "rewards/rejected": -1.984375, "step": 190 }, { "epoch": 1.2779552715654952, "grad_norm": 35.48813795033075, "learning_rate": 3.182897862232779e-07, "logits/chosen": -3.6875, "logits/rejected": -3.59375, "logps/chosen": -166.0, "logps/rejected": -172.0, "loss": 0.2671, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.0281982421875, "rewards/margins": 2.0, "rewards/rejected": -1.96875, "step": 200 }, { "epoch": 1.34185303514377, "grad_norm": 26.443386433583438, "learning_rate": 3.0641330166270784e-07, "logits/chosen": -3.671875, "logits/rejected": -3.609375, "logps/chosen": -170.0, "logps/rejected": -168.0, "loss": 0.2989, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 0.16796875, "rewards/margins": 1.875, "rewards/rejected": -1.7109375, "step": 210 }, { "epoch": 1.4057507987220448, "grad_norm": 21.45349324293884, "learning_rate": 2.9453681710213776e-07, "logits/chosen": -3.671875, "logits/rejected": -3.65625, "logps/chosen": -172.0, "logps/rejected": -171.0, "loss": 0.2874, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -0.26171875, "rewards/margins": 1.8125, "rewards/rejected": -2.078125, "step": 220 }, { "epoch": 1.4696485623003195, "grad_norm": 15.23988385292947, "learning_rate": 2.8266033254156767e-07, "logits/chosen": -3.65625, "logits/rejected": -3.625, "logps/chosen": -152.0, "logps/rejected": -154.0, "loss": 0.3156, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -0.2158203125, "rewards/margins": 1.8046875, "rewards/rejected": -2.015625, "step": 230 }, { "epoch": 1.5335463258785942, "grad_norm": 20.9068024116458, "learning_rate": 2.7078384798099764e-07, "logits/chosen": -3.6875, "logits/rejected": -3.640625, "logps/chosen": -166.0, "logps/rejected": -179.0, "loss": 0.2663, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.17578125, "rewards/margins": 2.296875, "rewards/rejected": -2.46875, "step": 240 }, { "epoch": 1.5974440894568689, "grad_norm": 23.61858832933539, "learning_rate": 2.589073634204275e-07, "logits/chosen": -3.625, "logits/rejected": -3.59375, "logps/chosen": -169.0, "logps/rejected": -175.0, "loss": 0.255, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.302734375, "rewards/margins": 2.203125, "rewards/rejected": -2.515625, "step": 250 }, { "epoch": 1.6613418530351438, "grad_norm": 15.684487484295666, "learning_rate": 2.4703087885985747e-07, "logits/chosen": -3.6875, "logits/rejected": -3.65625, "logps/chosen": -176.0, "logps/rejected": -178.0, "loss": 0.279, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -0.3984375, "rewards/margins": 1.984375, "rewards/rejected": -2.375, "step": 260 }, { "epoch": 1.7252396166134185, "grad_norm": 17.902346692684933, "learning_rate": 2.351543942992874e-07, "logits/chosen": -3.6875, "logits/rejected": -3.625, "logps/chosen": -175.0, "logps/rejected": -186.0, "loss": 0.27, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.29296875, "rewards/margins": 2.140625, "rewards/rejected": -2.4375, "step": 270 }, { "epoch": 1.7891373801916934, "grad_norm": 16.024604674297596, "learning_rate": 2.2327790973871732e-07, "logits/chosen": -3.65625, "logits/rejected": -3.609375, "logps/chosen": -178.0, "logps/rejected": -186.0, "loss": 0.3036, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.76171875, "rewards/margins": 2.15625, "rewards/rejected": -2.921875, "step": 280 }, { "epoch": 1.8530351437699681, "grad_norm": 20.910031176692122, "learning_rate": 2.1140142517814726e-07, "logits/chosen": -3.625, "logits/rejected": -3.59375, "logps/chosen": -174.0, "logps/rejected": -182.0, "loss": 0.257, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.0810546875, "rewards/margins": 2.5625, "rewards/rejected": -2.640625, "step": 290 }, { "epoch": 1.9169329073482428, "grad_norm": 26.07564194077576, "learning_rate": 1.9952494061757718e-07, "logits/chosen": -3.65625, "logits/rejected": -3.578125, "logps/chosen": -167.0, "logps/rejected": -170.0, "loss": 0.3132, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -0.2099609375, "rewards/margins": 2.234375, "rewards/rejected": -2.4375, "step": 300 }, { "epoch": 1.9808306709265175, "grad_norm": 26.49958432728375, "learning_rate": 1.876484560570071e-07, "logits/chosen": -3.71875, "logits/rejected": -3.703125, "logps/chosen": -165.0, "logps/rejected": -174.0, "loss": 0.2996, "rewards/accuracies": 0.875, "rewards/chosen": -0.400390625, "rewards/margins": 2.03125, "rewards/rejected": -2.4375, "step": 310 }, { "epoch": 2.0, "eval_logits/chosen": -3.765625, "eval_logits/rejected": -3.796875, "eval_logps/chosen": -167.0, "eval_logps/rejected": -154.0, "eval_loss": 0.5964062213897705, "eval_rewards/accuracies": 0.7142857313156128, "eval_rewards/chosen": -0.86328125, "eval_rewards/margins": 0.5546875, "eval_rewards/rejected": -1.4140625, "eval_runtime": 12.2832, "eval_samples_per_second": 16.282, "eval_steps_per_second": 0.57, "step": 313 }, { "epoch": 2.0447284345047922, "grad_norm": 12.799642134592919, "learning_rate": 1.7577197149643706e-07, "logits/chosen": -3.671875, "logits/rejected": -3.625, "logps/chosen": -173.0, "logps/rejected": -195.0, "loss": 0.1648, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.2431640625, "rewards/margins": 3.125, "rewards/rejected": -3.359375, "step": 320 }, { "epoch": 2.108626198083067, "grad_norm": 24.727219361838603, "learning_rate": 1.6389548693586697e-07, "logits/chosen": -3.703125, "logits/rejected": -3.65625, "logps/chosen": -165.0, "logps/rejected": -176.0, "loss": 0.1591, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.287109375, "rewards/margins": 2.90625, "rewards/rejected": -3.1875, "step": 330 }, { "epoch": 2.1725239616613417, "grad_norm": 26.008991012968647, "learning_rate": 1.520190023752969e-07, "logits/chosen": -3.65625, "logits/rejected": -3.609375, "logps/chosen": -162.0, "logps/rejected": -178.0, "loss": 0.1532, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.6640625, "rewards/margins": 3.125, "rewards/rejected": -3.796875, "step": 340 }, { "epoch": 2.236421725239617, "grad_norm": 20.087149399156676, "learning_rate": 1.4014251781472683e-07, "logits/chosen": -3.734375, "logits/rejected": -3.65625, "logps/chosen": -165.0, "logps/rejected": -177.0, "loss": 0.1472, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.6171875, "rewards/margins": 3.015625, "rewards/rejected": -3.640625, "step": 350 }, { "epoch": 2.3003194888178915, "grad_norm": 18.05087937817016, "learning_rate": 1.2826603325415677e-07, "logits/chosen": -3.796875, "logits/rejected": -3.734375, "logps/chosen": -166.0, "logps/rejected": -193.0, "loss": 0.1431, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.54296875, "rewards/margins": 3.0625, "rewards/rejected": -3.609375, "step": 360 }, { "epoch": 2.364217252396166, "grad_norm": 18.05523178462953, "learning_rate": 1.163895486935867e-07, "logits/chosen": -3.78125, "logits/rejected": -3.75, "logps/chosen": -177.0, "logps/rejected": -185.0, "loss": 0.1465, "rewards/accuracies": 0.96875, "rewards/chosen": -0.78125, "rewards/margins": 3.15625, "rewards/rejected": -3.9375, "step": 370 }, { "epoch": 2.428115015974441, "grad_norm": 12.57343346412284, "learning_rate": 1.0451306413301662e-07, "logits/chosen": -3.78125, "logits/rejected": -3.75, "logps/chosen": -175.0, "logps/rejected": -187.0, "loss": 0.1316, "rewards/accuracies": 0.96875, "rewards/chosen": -0.828125, "rewards/margins": 3.296875, "rewards/rejected": -4.125, "step": 380 }, { "epoch": 2.4920127795527156, "grad_norm": 21.734917480559936, "learning_rate": 9.263657957244655e-08, "logits/chosen": -3.796875, "logits/rejected": -3.765625, "logps/chosen": -173.0, "logps/rejected": -200.0, "loss": 0.1625, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -1.1796875, "rewards/margins": 3.15625, "rewards/rejected": -4.34375, "step": 390 }, { "epoch": 2.5559105431309903, "grad_norm": 19.74378557143576, "learning_rate": 8.076009501187649e-08, "logits/chosen": -3.84375, "logits/rejected": -3.765625, "logps/chosen": -172.0, "logps/rejected": -202.0, "loss": 0.1368, "rewards/accuracies": 0.96875, "rewards/chosen": -1.09375, "rewards/margins": 3.390625, "rewards/rejected": -4.5, "step": 400 }, { "epoch": 2.619808306709265, "grad_norm": 18.45734841726139, "learning_rate": 6.88836104513064e-08, "logits/chosen": -3.703125, "logits/rejected": -3.65625, "logps/chosen": -166.0, "logps/rejected": -199.0, "loss": 0.1322, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -1.03125, "rewards/margins": 3.515625, "rewards/rejected": -4.53125, "step": 410 }, { "epoch": 2.68370607028754, "grad_norm": 22.52618935655582, "learning_rate": 5.700712589073634e-08, "logits/chosen": -3.703125, "logits/rejected": -3.65625, "logps/chosen": -178.0, "logps/rejected": -203.0, "loss": 0.1197, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.6796875, "rewards/margins": 3.765625, "rewards/rejected": -4.4375, "step": 420 }, { "epoch": 2.747603833865815, "grad_norm": 18.32031096894331, "learning_rate": 4.5130641330166267e-08, "logits/chosen": -3.765625, "logits/rejected": -3.71875, "logps/chosen": -177.0, "logps/rejected": -195.0, "loss": 0.1689, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.76953125, "rewards/margins": 3.359375, "rewards/rejected": -4.125, "step": 430 }, { "epoch": 2.8115015974440896, "grad_norm": 15.907202030933394, "learning_rate": 3.32541567695962e-08, "logits/chosen": -3.71875, "logits/rejected": -3.671875, "logps/chosen": -170.0, "logps/rejected": -203.0, "loss": 0.1358, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.85546875, "rewards/margins": 3.578125, "rewards/rejected": -4.4375, "step": 440 }, { "epoch": 2.8753993610223643, "grad_norm": 15.683451965611985, "learning_rate": 2.1377672209026125e-08, "logits/chosen": -3.6875, "logits/rejected": -3.59375, "logps/chosen": -172.0, "logps/rejected": -181.0, "loss": 0.1541, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.9375, "rewards/margins": 3.53125, "rewards/rejected": -4.46875, "step": 450 }, { "epoch": 2.939297124600639, "grad_norm": 31.718439650295267, "learning_rate": 9.501187648456057e-09, "logits/chosen": -3.75, "logits/rejected": -3.734375, "logps/chosen": -171.0, "logps/rejected": -190.0, "loss": 0.1648, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9140625, "rewards/margins": 3.453125, "rewards/rejected": -4.375, "step": 460 }, { "epoch": 2.9904153354632586, "eval_logits/chosen": -3.828125, "eval_logits/rejected": -3.859375, "eval_logps/chosen": -176.0, "eval_logps/rejected": -163.0, "eval_loss": 0.64453125, "eval_rewards/accuracies": 0.6607142686843872, "eval_rewards/chosen": -1.7109375, "eval_rewards/margins": 0.63671875, "eval_rewards/rejected": -2.34375, "eval_runtime": 15.0233, "eval_samples_per_second": 13.313, "eval_steps_per_second": 0.466, "step": 468 }, { "epoch": 2.9904153354632586, "step": 468, "total_flos": 0.0, "train_loss": 0.35841141399155313, "train_runtime": 4617.8414, "train_samples_per_second": 6.493, "train_steps_per_second": 0.101 } ], "logging_steps": 10, "max_steps": 468, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }