{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9994767137624281, "eval_steps": 500, "global_step": 955, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0052328623757195184, "grad_norm": 819697.7987526867, "learning_rate": 2.6041666666666667e-08, "logits/chosen": -2.897020101547241, "logits/rejected": -2.8810553550720215, "logps/chosen": -281.18853759765625, "logps/rejected": -241.4916534423828, "loss": 62511.5062, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -281.18853759765625, "rewards/margins": -39.69694519042969, "rewards/rejected": -241.4916534423828, "step": 5 }, { "epoch": 0.010465724751439037, "grad_norm": 856447.0339256247, "learning_rate": 5.208333333333333e-08, "logits/chosen": -2.8515119552612305, "logits/rejected": -2.852177381515503, "logps/chosen": -227.5166778564453, "logps/rejected": -218.9936065673828, "loss": 62508.0563, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -227.5166778564453, "rewards/margins": -8.523069381713867, "rewards/rejected": -218.9936065673828, "step": 10 }, { "epoch": 0.015698587127158554, "grad_norm": 608077.0241737472, "learning_rate": 7.812499999999999e-08, "logits/chosen": -2.8871281147003174, "logits/rejected": -2.8566455841064453, "logps/chosen": -296.6144104003906, "logps/rejected": -248.87496948242188, "loss": 62494.775, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -296.6144104003906, "rewards/margins": -47.739437103271484, "rewards/rejected": -248.87496948242188, "step": 15 }, { "epoch": 0.020931449502878074, "grad_norm": 547713.4134125254, "learning_rate": 1.0416666666666667e-07, "logits/chosen": -2.8649909496307373, "logits/rejected": -2.900007963180542, "logps/chosen": -300.6615905761719, "logps/rejected": -290.6969909667969, "loss": 62498.0375, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -300.6615905761719, "rewards/margins": -9.964593887329102, "rewards/rejected": -290.6969909667969, "step": 20 }, { "epoch": 0.026164311878597593, "grad_norm": 550202.226793022, "learning_rate": 1.3020833333333334e-07, "logits/chosen": -2.861807346343994, "logits/rejected": -2.8286397457122803, "logps/chosen": -297.4012756347656, "logps/rejected": -225.73532104492188, "loss": 62479.4313, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -297.4012756347656, "rewards/margins": -71.66590881347656, "rewards/rejected": -225.73532104492188, "step": 25 }, { "epoch": 0.03139717425431711, "grad_norm": 575687.3818314937, "learning_rate": 1.5624999999999999e-07, "logits/chosen": -2.8637468814849854, "logits/rejected": -2.855187177658081, "logps/chosen": -261.7722473144531, "logps/rejected": -266.75311279296875, "loss": 62467.7375, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -261.7722473144531, "rewards/margins": 4.980858325958252, "rewards/rejected": -266.75311279296875, "step": 30 }, { "epoch": 0.03663003663003663, "grad_norm": 601042.0970547737, "learning_rate": 1.8229166666666666e-07, "logits/chosen": -2.882888078689575, "logits/rejected": -2.8436450958251953, "logps/chosen": -322.3620300292969, "logps/rejected": -236.65188598632812, "loss": 62398.2562, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -322.3620300292969, "rewards/margins": -85.71016693115234, "rewards/rejected": -236.65188598632812, "step": 35 }, { "epoch": 0.04186289900575615, "grad_norm": 1270156.296221847, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -2.926880121231079, "logits/rejected": -2.873258590698242, "logps/chosen": -266.81585693359375, "logps/rejected": -222.47512817382812, "loss": 62382.9187, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -266.81585693359375, "rewards/margins": -44.34074401855469, "rewards/rejected": -222.47512817382812, "step": 40 }, { "epoch": 0.04709576138147567, "grad_norm": 562197.8356415116, "learning_rate": 2.3437499999999998e-07, "logits/chosen": -2.934823989868164, "logits/rejected": -2.8437087535858154, "logps/chosen": -337.57647705078125, "logps/rejected": -253.1848602294922, "loss": 62295.2562, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": -337.57647705078125, "rewards/margins": -84.39164733886719, "rewards/rejected": -253.1848602294922, "step": 45 }, { "epoch": 0.052328623757195186, "grad_norm": 579259.1669227169, "learning_rate": 2.604166666666667e-07, "logits/chosen": -2.8226637840270996, "logits/rejected": -2.8579444885253906, "logps/chosen": -235.44284057617188, "logps/rejected": -253.05126953125, "loss": 62140.85, "rewards/accuracies": 0.5, "rewards/chosen": -235.44284057617188, "rewards/margins": 17.60841941833496, "rewards/rejected": -253.05126953125, "step": 50 }, { "epoch": 0.0575614861329147, "grad_norm": 599221.2375408602, "learning_rate": 2.864583333333333e-07, "logits/chosen": -2.9071204662323, "logits/rejected": -2.86643385887146, "logps/chosen": -295.3536376953125, "logps/rejected": -295.96044921875, "loss": 62103.8438, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -295.3536376953125, "rewards/margins": 0.6068130731582642, "rewards/rejected": -295.96044921875, "step": 55 }, { "epoch": 0.06279434850863422, "grad_norm": 587217.7209315128, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -2.825546979904175, "logits/rejected": -2.853196620941162, "logps/chosen": -280.54376220703125, "logps/rejected": -290.41162109375, "loss": 61848.075, "rewards/accuracies": 0.625, "rewards/chosen": -280.54376220703125, "rewards/margins": 9.867898941040039, "rewards/rejected": -290.41162109375, "step": 60 }, { "epoch": 0.06802721088435375, "grad_norm": 692483.3715259883, "learning_rate": 3.3854166666666667e-07, "logits/chosen": -2.8896777629852295, "logits/rejected": -2.869809150695801, "logps/chosen": -279.5859375, "logps/rejected": -267.8680725097656, "loss": 61784.125, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -279.5859375, "rewards/margins": -11.717863082885742, "rewards/rejected": -267.8680725097656, "step": 65 }, { "epoch": 0.07326007326007326, "grad_norm": 643621.4786223344, "learning_rate": 3.645833333333333e-07, "logits/chosen": -2.86277174949646, "logits/rejected": -2.849290132522583, "logps/chosen": -270.3601989746094, "logps/rejected": -299.9423828125, "loss": 61415.6375, "rewards/accuracies": 0.5625, "rewards/chosen": -270.3601989746094, "rewards/margins": 29.582199096679688, "rewards/rejected": -299.9423828125, "step": 70 }, { "epoch": 0.07849293563579278, "grad_norm": 797625.2394802963, "learning_rate": 3.9062499999999997e-07, "logits/chosen": -2.875126600265503, "logits/rejected": -2.8260860443115234, "logps/chosen": -278.2060241699219, "logps/rejected": -263.739990234375, "loss": 61252.4812, "rewards/accuracies": 0.5, "rewards/chosen": -278.2060241699219, "rewards/margins": -14.466039657592773, "rewards/rejected": -263.739990234375, "step": 75 }, { "epoch": 0.0837257980115123, "grad_norm": 570934.2395758026, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -2.8297553062438965, "logits/rejected": -2.818152904510498, "logps/chosen": -244.71047973632812, "logps/rejected": -216.3663330078125, "loss": 61182.05, "rewards/accuracies": 0.4375, "rewards/chosen": -244.71047973632812, "rewards/margins": -28.344135284423828, "rewards/rejected": -216.3663330078125, "step": 80 }, { "epoch": 0.08895866038723181, "grad_norm": 647636.324219079, "learning_rate": 4.427083333333333e-07, "logits/chosen": -2.8677287101745605, "logits/rejected": -2.8416037559509277, "logps/chosen": -280.59759521484375, "logps/rejected": -278.1571044921875, "loss": 60841.875, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -280.59759521484375, "rewards/margins": -2.440479278564453, "rewards/rejected": -278.1571044921875, "step": 85 }, { "epoch": 0.09419152276295134, "grad_norm": 693686.6913297386, "learning_rate": 4.6874999999999996e-07, "logits/chosen": -2.8715648651123047, "logits/rejected": -2.886065721511841, "logps/chosen": -303.4865417480469, "logps/rejected": -300.1495361328125, "loss": 60200.25, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -303.4865417480469, "rewards/margins": -3.336996555328369, "rewards/rejected": -300.1495361328125, "step": 90 }, { "epoch": 0.09942438513867086, "grad_norm": 695048.3682737482, "learning_rate": 4.947916666666667e-07, "logits/chosen": -2.8399910926818848, "logits/rejected": -2.8273520469665527, "logps/chosen": -285.8985900878906, "logps/rejected": -278.19525146484375, "loss": 59913.85, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -285.8985900878906, "rewards/margins": -7.703277587890625, "rewards/rejected": -278.19525146484375, "step": 95 }, { "epoch": 0.10465724751439037, "grad_norm": 926619.6100320778, "learning_rate": 4.999732492681437e-07, "logits/chosen": -2.839812994003296, "logits/rejected": -2.814923048019409, "logps/chosen": -280.0777587890625, "logps/rejected": -326.1065979003906, "loss": 58985.2125, "rewards/accuracies": 0.625, "rewards/chosen": -280.0777587890625, "rewards/margins": 46.02882385253906, "rewards/rejected": -326.1065979003906, "step": 100 }, { "epoch": 0.10989010989010989, "grad_norm": 730881.0367768478, "learning_rate": 4.998645842314724e-07, "logits/chosen": -2.8014039993286133, "logits/rejected": -2.7791314125061035, "logps/chosen": -325.879638671875, "logps/rejected": -323.22125244140625, "loss": 59519.525, "rewards/accuracies": 0.5625, "rewards/chosen": -325.879638671875, "rewards/margins": -2.658414363861084, "rewards/rejected": -323.22125244140625, "step": 105 }, { "epoch": 0.1151229722658294, "grad_norm": 787482.5379143337, "learning_rate": 4.996723692767926e-07, "logits/chosen": -2.877906322479248, "logits/rejected": -2.860431671142578, "logps/chosen": -331.8926086425781, "logps/rejected": -336.84979248046875, "loss": 59833.9437, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -331.8926086425781, "rewards/margins": 4.957190036773682, "rewards/rejected": -336.84979248046875, "step": 110 }, { "epoch": 0.12035583464154893, "grad_norm": 758644.6025801541, "learning_rate": 4.993966686770933e-07, "logits/chosen": -2.8740134239196777, "logits/rejected": -2.849520683288574, "logps/chosen": -286.97998046875, "logps/rejected": -302.22589111328125, "loss": 59542.8562, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -286.97998046875, "rewards/margins": 15.245903015136719, "rewards/rejected": -302.22589111328125, "step": 115 }, { "epoch": 0.12558869701726844, "grad_norm": 839068.6603800668, "learning_rate": 4.990375746213598e-07, "logits/chosen": -2.8500800132751465, "logits/rejected": -2.813788414001465, "logps/chosen": -252.0970458984375, "logps/rejected": -269.5028381347656, "loss": 58766.425, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -252.0970458984375, "rewards/margins": 17.40580177307129, "rewards/rejected": -269.5028381347656, "step": 120 }, { "epoch": 0.13082155939298795, "grad_norm": 790620.3365762861, "learning_rate": 4.985952071837474e-07, "logits/chosen": -2.8092734813690186, "logits/rejected": -2.8068203926086426, "logps/chosen": -272.0372619628906, "logps/rejected": -282.043701171875, "loss": 57950.4375, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -272.0372619628906, "rewards/margins": 10.00644588470459, "rewards/rejected": -282.043701171875, "step": 125 }, { "epoch": 0.1360544217687075, "grad_norm": 800005.8930982946, "learning_rate": 4.980697142834314e-07, "logits/chosen": -2.9066848754882812, "logits/rejected": -2.889483690261841, "logps/chosen": -358.52880859375, "logps/rejected": -351.5975341796875, "loss": 57769.1687, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -358.52880859375, "rewards/margins": -6.931341648101807, "rewards/rejected": -351.5975341796875, "step": 130 }, { "epoch": 0.141287284144427, "grad_norm": 991860.3958541746, "learning_rate": 4.974612716351446e-07, "logits/chosen": -2.8132946491241455, "logits/rejected": -2.807452917098999, "logps/chosen": -269.17333984375, "logps/rejected": -304.22784423828125, "loss": 57210.9125, "rewards/accuracies": 0.5625, "rewards/chosen": -269.17333984375, "rewards/margins": 35.054466247558594, "rewards/rejected": -304.22784423828125, "step": 135 }, { "epoch": 0.14652014652014653, "grad_norm": 1217484.7693174647, "learning_rate": 4.967700826904229e-07, "logits/chosen": -2.881108045578003, "logits/rejected": -2.877159357070923, "logps/chosen": -324.2433166503906, "logps/rejected": -289.4080505371094, "loss": 58436.2625, "rewards/accuracies": 0.3125, "rewards/chosen": -324.2433166503906, "rewards/margins": -34.83523941040039, "rewards/rejected": -289.4080505371094, "step": 140 }, { "epoch": 0.15175300889586604, "grad_norm": 1144094.5245424435, "learning_rate": 4.95996378569574e-07, "logits/chosen": -2.861013889312744, "logits/rejected": -2.8163299560546875, "logps/chosen": -310.35223388671875, "logps/rejected": -315.37078857421875, "loss": 56525.3125, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -310.35223388671875, "rewards/margins": 5.018545627593994, "rewards/rejected": -315.37078857421875, "step": 145 }, { "epoch": 0.15698587127158556, "grad_norm": 906591.8638946635, "learning_rate": 4.951404179843962e-07, "logits/chosen": -2.8345422744750977, "logits/rejected": -2.8686890602111816, "logps/chosen": -276.36981201171875, "logps/rejected": -285.62548828125, "loss": 58509.9375, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -276.36981201171875, "rewards/margins": 9.255735397338867, "rewards/rejected": -285.62548828125, "step": 150 }, { "epoch": 0.16221873364730507, "grad_norm": 907999.1062550667, "learning_rate": 4.942024871516694e-07, "logits/chosen": -2.8697471618652344, "logits/rejected": -2.8267807960510254, "logps/chosen": -320.91058349609375, "logps/rejected": -321.4515075683594, "loss": 58345.9, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -320.91058349609375, "rewards/margins": 0.5409385561943054, "rewards/rejected": -321.4515075683594, "step": 155 }, { "epoch": 0.1674515960230246, "grad_norm": 885328.1305549938, "learning_rate": 4.931828996974498e-07, "logits/chosen": -2.7532379627227783, "logits/rejected": -2.7566537857055664, "logps/chosen": -237.9208526611328, "logps/rejected": -254.9251251220703, "loss": 58183.8625, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -237.9208526611328, "rewards/margins": 17.00423812866211, "rewards/rejected": -254.9251251220703, "step": 160 }, { "epoch": 0.1726844583987441, "grad_norm": 1593598.7457023177, "learning_rate": 4.920819965521997e-07, "logits/chosen": -2.6699514389038086, "logits/rejected": -2.670328378677368, "logps/chosen": -305.18328857421875, "logps/rejected": -284.074951171875, "loss": 57758.7562, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -305.18328857421875, "rewards/margins": -21.108369827270508, "rewards/rejected": -284.074951171875, "step": 165 }, { "epoch": 0.17791732077446362, "grad_norm": 993568.4034911739, "learning_rate": 4.909001458367866e-07, "logits/chosen": -2.7054855823516846, "logits/rejected": -2.7096757888793945, "logps/chosen": -286.2120666503906, "logps/rejected": -321.2934265136719, "loss": 57056.9187, "rewards/accuracies": 0.5625, "rewards/chosen": -286.2120666503906, "rewards/margins": 35.081356048583984, "rewards/rejected": -321.2934265136719, "step": 170 }, { "epoch": 0.18315018315018314, "grad_norm": 934004.2559217811, "learning_rate": 4.896377427393911e-07, "logits/chosen": -2.7484357357025146, "logits/rejected": -2.7158854007720947, "logps/chosen": -286.253662109375, "logps/rejected": -315.47406005859375, "loss": 57739.1625, "rewards/accuracies": 0.5625, "rewards/chosen": -286.253662109375, "rewards/margins": 29.220422744750977, "rewards/rejected": -315.47406005859375, "step": 175 }, { "epoch": 0.18838304552590268, "grad_norm": 854532.9754199074, "learning_rate": 4.882952093833627e-07, "logits/chosen": -2.6975908279418945, "logits/rejected": -2.697767972946167, "logps/chosen": -299.58221435546875, "logps/rejected": -306.10247802734375, "loss": 56578.5375, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -299.58221435546875, "rewards/margins": 6.520210266113281, "rewards/rejected": -306.10247802734375, "step": 180 }, { "epoch": 0.1936159079016222, "grad_norm": 1018591.7582230872, "learning_rate": 4.868729946860708e-07, "logits/chosen": -2.697580575942993, "logits/rejected": -2.6543309688568115, "logps/chosen": -300.19854736328125, "logps/rejected": -279.4755859375, "loss": 56696.2875, "rewards/accuracies": 0.4375, "rewards/chosen": -300.19854736328125, "rewards/margins": -20.722976684570312, "rewards/rejected": -279.4755859375, "step": 185 }, { "epoch": 0.1988487702773417, "grad_norm": 1865987.6965253549, "learning_rate": 4.853715742087946e-07, "logits/chosen": -2.715686321258545, "logits/rejected": -2.6946115493774414, "logps/chosen": -261.4237060546875, "logps/rejected": -260.59552001953125, "loss": 55295.3625, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -261.4237060546875, "rewards/margins": -0.8281745910644531, "rewards/rejected": -260.59552001953125, "step": 190 }, { "epoch": 0.20408163265306123, "grad_norm": 2075031.8789570439, "learning_rate": 4.837914499977052e-07, "logits/chosen": -2.7049078941345215, "logits/rejected": -2.649726152420044, "logps/chosen": -348.7242126464844, "logps/rejected": -302.77056884765625, "loss": 56870.6875, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -348.7242126464844, "rewards/margins": -45.95365524291992, "rewards/rejected": -302.77056884765625, "step": 195 }, { "epoch": 0.20931449502878074, "grad_norm": 1044606.2904041886, "learning_rate": 4.821331504159906e-07, "logits/chosen": -2.662055253982544, "logits/rejected": -2.6654608249664307, "logps/chosen": -240.9337921142578, "logps/rejected": -280.35516357421875, "loss": 57408.1, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -240.9337921142578, "rewards/margins": 39.421363830566406, "rewards/rejected": -280.35516357421875, "step": 200 }, { "epoch": 0.21454735740450026, "grad_norm": 1613413.1304386982, "learning_rate": 4.80397229967181e-07, "logits/chosen": -2.5958218574523926, "logits/rejected": -2.5995872020721436, "logps/chosen": -260.1720275878906, "logps/rejected": -268.8197326660156, "loss": 57515.7125, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -260.1720275878906, "rewards/margins": 8.647693634033203, "rewards/rejected": -268.8197326660156, "step": 205 }, { "epoch": 0.21978021978021978, "grad_norm": 965375.2956772823, "learning_rate": 4.785842691097342e-07, "logits/chosen": -2.722567081451416, "logits/rejected": -2.6706037521362305, "logps/chosen": -301.97955322265625, "logps/rejected": -308.42462158203125, "loss": 56186.2937, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -301.97955322265625, "rewards/margins": 6.445120334625244, "rewards/rejected": -308.42462158203125, "step": 210 }, { "epoch": 0.2250130821559393, "grad_norm": 1682586.8851408535, "learning_rate": 4.7669487406294076e-07, "logits/chosen": -2.691540479660034, "logits/rejected": -2.6860575675964355, "logps/chosen": -292.8274230957031, "logps/rejected": -352.30621337890625, "loss": 57221.1375, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -292.8274230957031, "rewards/margins": 59.47880172729492, "rewards/rejected": -352.30621337890625, "step": 215 }, { "epoch": 0.2302459445316588, "grad_norm": 1043252.7613651449, "learning_rate": 4.7472967660421603e-07, "logits/chosen": -2.7390644550323486, "logits/rejected": -2.6686208248138428, "logps/chosen": -251.1779327392578, "logps/rejected": -252.0894317626953, "loss": 56568.1813, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -251.1779327392578, "rewards/margins": 0.9114850163459778, "rewards/rejected": -252.0894317626953, "step": 220 }, { "epoch": 0.23547880690737832, "grad_norm": 1024702.6869019131, "learning_rate": 4.7268933385784627e-07, "logits/chosen": -2.682610273361206, "logits/rejected": -2.640778064727783, "logps/chosen": -247.3615264892578, "logps/rejected": -284.06402587890625, "loss": 56326.825, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -247.3615264892578, "rewards/margins": 36.7025146484375, "rewards/rejected": -284.06402587890625, "step": 225 }, { "epoch": 0.24071166928309787, "grad_norm": 977761.8133840163, "learning_rate": 4.705745280752585e-07, "logits/chosen": -2.6460351943969727, "logits/rejected": -2.5948281288146973, "logps/chosen": -320.40252685546875, "logps/rejected": -341.423583984375, "loss": 56747.225, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -320.40252685546875, "rewards/margins": 21.021081924438477, "rewards/rejected": -341.423583984375, "step": 230 }, { "epoch": 0.24594453165881738, "grad_norm": 987590.8827444692, "learning_rate": 4.68385966406889e-07, "logits/chosen": -2.592116117477417, "logits/rejected": -2.5488688945770264, "logps/chosen": -270.15057373046875, "logps/rejected": -273.60870361328125, "loss": 57541.425, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -270.15057373046875, "rewards/margins": 3.458080768585205, "rewards/rejected": -273.60870361328125, "step": 235 }, { "epoch": 0.25117739403453687, "grad_norm": 1115088.0063134031, "learning_rate": 4.6612438066572555e-07, "logits/chosen": -2.5365209579467773, "logits/rejected": -2.5030362606048584, "logps/chosen": -303.14288330078125, "logps/rejected": -285.1949462890625, "loss": 57592.9, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -303.14288330078125, "rewards/margins": -17.947914123535156, "rewards/rejected": -285.1949462890625, "step": 240 }, { "epoch": 0.2564102564102564, "grad_norm": 872007.2172912332, "learning_rate": 4.6379052708260356e-07, "logits/chosen": -2.571394443511963, "logits/rejected": -2.5047571659088135, "logps/chosen": -271.99029541015625, "logps/rejected": -268.821533203125, "loss": 57330.4187, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -271.99029541015625, "rewards/margins": -3.168781280517578, "rewards/rejected": -268.821533203125, "step": 245 }, { "epoch": 0.2616431187859759, "grad_norm": 1283172.0120638541, "learning_rate": 4.6138518605333664e-07, "logits/chosen": -2.591219663619995, "logits/rejected": -2.5536255836486816, "logps/chosen": -340.99761962890625, "logps/rejected": -338.05084228515625, "loss": 58544.5125, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -340.99761962890625, "rewards/margins": -2.9467933177948, "rewards/rejected": -338.05084228515625, "step": 250 }, { "epoch": 0.2668759811616955, "grad_norm": 887220.7931197283, "learning_rate": 4.589091618777674e-07, "logits/chosen": -2.452988862991333, "logits/rejected": -2.426440715789795, "logps/chosen": -310.2080993652344, "logps/rejected": -326.74005126953125, "loss": 59796.9938, "rewards/accuracies": 0.5, "rewards/chosen": -310.2080993652344, "rewards/margins": 16.531951904296875, "rewards/rejected": -326.74005126953125, "step": 255 }, { "epoch": 0.272108843537415, "grad_norm": 1079677.4885566523, "learning_rate": 4.5636328249082514e-07, "logits/chosen": -2.6359188556671143, "logits/rejected": -2.5355026721954346, "logps/chosen": -310.75189208984375, "logps/rejected": -308.8578796386719, "loss": 59678.8, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -310.75189208984375, "rewards/margins": -1.893977403640747, "rewards/rejected": -308.8578796386719, "step": 260 }, { "epoch": 0.2773417059131345, "grad_norm": 1084565.0284754713, "learning_rate": 4.5374839918567996e-07, "logits/chosen": -2.6321051120758057, "logits/rejected": -2.567678928375244, "logps/chosen": -327.0538635253906, "logps/rejected": -316.69342041015625, "loss": 58093.8688, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -327.0538635253906, "rewards/margins": -10.360448837280273, "rewards/rejected": -316.69342041015625, "step": 265 }, { "epoch": 0.282574568288854, "grad_norm": 1295931.023547164, "learning_rate": 4.510653863290871e-07, "logits/chosen": -2.627354383468628, "logits/rejected": -2.5420610904693604, "logps/chosen": -284.817138671875, "logps/rejected": -295.0710144042969, "loss": 56263.8875, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -284.817138671875, "rewards/margins": 10.253904342651367, "rewards/rejected": -295.0710144042969, "step": 270 }, { "epoch": 0.28780743066457354, "grad_norm": 1013086.5803710954, "learning_rate": 4.483151410690151e-07, "logits/chosen": -2.6444249153137207, "logits/rejected": -2.5427169799804688, "logps/chosen": -279.9425354003906, "logps/rejected": -270.55450439453125, "loss": 54940.2875, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -279.9425354003906, "rewards/margins": -9.388038635253906, "rewards/rejected": -270.55450439453125, "step": 275 }, { "epoch": 0.29304029304029305, "grad_norm": 1576188.4710046574, "learning_rate": 4.4549858303465737e-07, "logits/chosen": -2.6243691444396973, "logits/rejected": -2.5685534477233887, "logps/chosen": -296.85418701171875, "logps/rejected": -310.7580261230469, "loss": 56116.5938, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -296.85418701171875, "rewards/margins": 13.903894424438477, "rewards/rejected": -310.7580261230469, "step": 280 }, { "epoch": 0.29827315541601257, "grad_norm": 1319520.840838825, "learning_rate": 4.4261665402892476e-07, "logits/chosen": -2.5911037921905518, "logits/rejected": -2.5209097862243652, "logps/chosen": -265.95025634765625, "logps/rejected": -334.61431884765625, "loss": 57866.4625, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -265.95025634765625, "rewards/margins": 68.66404724121094, "rewards/rejected": -334.61431884765625, "step": 285 }, { "epoch": 0.3035060177917321, "grad_norm": 1164732.143957571, "learning_rate": 4.396703177135261e-07, "logits/chosen": -2.6242473125457764, "logits/rejected": -2.5436782836914062, "logps/chosen": -349.99383544921875, "logps/rejected": -329.6797180175781, "loss": 56799.7375, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -349.99383544921875, "rewards/margins": -20.31418800354004, "rewards/rejected": -329.6797180175781, "step": 290 }, { "epoch": 0.3087388801674516, "grad_norm": 1036095.2706155936, "learning_rate": 4.3666055928673697e-07, "logits/chosen": -2.6259796619415283, "logits/rejected": -2.596653938293457, "logps/chosen": -294.8160400390625, "logps/rejected": -268.30645751953125, "loss": 55223.3125, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -294.8160400390625, "rewards/margins": -26.509592056274414, "rewards/rejected": -268.30645751953125, "step": 295 }, { "epoch": 0.3139717425431711, "grad_norm": 1421793.3450062282, "learning_rate": 4.335883851539693e-07, "logits/chosen": -2.536402702331543, "logits/rejected": -2.470693588256836, "logps/chosen": -266.8374328613281, "logps/rejected": -269.9141845703125, "loss": 54316.75, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -266.8374328613281, "rewards/margins": 3.0767579078674316, "rewards/rejected": -269.9141845703125, "step": 300 }, { "epoch": 0.31920460491889063, "grad_norm": 1145803.694963496, "learning_rate": 4.304548225912481e-07, "logits/chosen": -2.4925479888916016, "logits/rejected": -2.4637606143951416, "logps/chosen": -268.6978454589844, "logps/rejected": -288.32489013671875, "loss": 56123.5, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -268.6978454589844, "rewards/margins": 19.627042770385742, "rewards/rejected": -288.32489013671875, "step": 305 }, { "epoch": 0.32443746729461015, "grad_norm": 1320708.0317829524, "learning_rate": 4.272609194017105e-07, "logits/chosen": -2.427326202392578, "logits/rejected": -2.375277519226074, "logps/chosen": -273.1225280761719, "logps/rejected": -294.8364562988281, "loss": 55285.2125, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -273.1225280761719, "rewards/margins": 21.713897705078125, "rewards/rejected": -294.8364562988281, "step": 310 }, { "epoch": 0.32967032967032966, "grad_norm": 1175173.27697323, "learning_rate": 4.2400774356524003e-07, "logits/chosen": -2.463435649871826, "logits/rejected": -2.390852689743042, "logps/chosen": -291.9449768066406, "logps/rejected": -351.9012756347656, "loss": 55227.475, "rewards/accuracies": 0.5625, "rewards/chosen": -291.9449768066406, "rewards/margins": 59.956260681152344, "rewards/rejected": -351.9012756347656, "step": 315 }, { "epoch": 0.3349031920460492, "grad_norm": 1736713.8372362903, "learning_rate": 4.2069638288135547e-07, "logits/chosen": -2.424726724624634, "logits/rejected": -2.4184367656707764, "logps/chosen": -293.0435485839844, "logps/rejected": -315.5990905761719, "loss": 56523.2438, "rewards/accuracies": 0.5, "rewards/chosen": -293.0435485839844, "rewards/margins": 22.555578231811523, "rewards/rejected": -315.5990905761719, "step": 320 }, { "epoch": 0.3401360544217687, "grad_norm": 1392044.8934369895, "learning_rate": 4.1732794460547037e-07, "logits/chosen": -2.4518871307373047, "logits/rejected": -2.444579601287842, "logps/chosen": -241.4635009765625, "logps/rejected": -265.34478759765625, "loss": 57858.325, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -241.4635009765625, "rewards/margins": 23.881275177001953, "rewards/rejected": -265.34478759765625, "step": 325 }, { "epoch": 0.3453689167974882, "grad_norm": 1151359.8150083232, "learning_rate": 4.139035550786494e-07, "logits/chosen": -2.4895317554473877, "logits/rejected": -2.476973056793213, "logps/chosen": -236.6543426513672, "logps/rejected": -301.2790832519531, "loss": 54808.6438, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -236.6543426513672, "rewards/margins": 64.62477111816406, "rewards/rejected": -301.2790832519531, "step": 330 }, { "epoch": 0.35060177917320773, "grad_norm": 1148068.4271174779, "learning_rate": 4.104243593509806e-07, "logits/chosen": -2.511590003967285, "logits/rejected": -2.449333906173706, "logps/chosen": -255.1795196533203, "logps/rejected": -306.39111328125, "loss": 56303.15, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -255.1795196533203, "rewards/margins": 51.211570739746094, "rewards/rejected": -306.39111328125, "step": 335 }, { "epoch": 0.35583464154892724, "grad_norm": 1398198.9347442659, "learning_rate": 4.0689152079869306e-07, "logits/chosen": -2.4384443759918213, "logits/rejected": -2.4097814559936523, "logps/chosen": -313.1650085449219, "logps/rejected": -348.493896484375, "loss": 54666.4, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -313.1650085449219, "rewards/margins": 35.3288688659668, "rewards/rejected": -348.493896484375, "step": 340 }, { "epoch": 0.36106750392464676, "grad_norm": 1048517.8304177759, "learning_rate": 4.0330622073514606e-07, "logits/chosen": -2.456749439239502, "logits/rejected": -2.353886127471924, "logps/chosen": -325.97222900390625, "logps/rejected": -289.00445556640625, "loss": 55775.8562, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -325.97222900390625, "rewards/margins": -36.96786117553711, "rewards/rejected": -289.00445556640625, "step": 345 }, { "epoch": 0.3663003663003663, "grad_norm": 1751549.502482873, "learning_rate": 3.99669658015821e-07, "logits/chosen": -2.325648784637451, "logits/rejected": -2.3088955879211426, "logps/chosen": -249.3928680419922, "logps/rejected": -300.55267333984375, "loss": 56549.7063, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -249.3928680419922, "rewards/margins": 51.1598014831543, "rewards/rejected": -300.55267333984375, "step": 350 }, { "epoch": 0.3715332286760858, "grad_norm": 1743710.2168055333, "learning_rate": 3.9598304863744615e-07, "logits/chosen": -2.3647897243499756, "logits/rejected": -2.302427053451538, "logps/chosen": -264.53399658203125, "logps/rejected": -291.4407958984375, "loss": 55886.3688, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -264.53399658203125, "rewards/margins": 26.906795501708984, "rewards/rejected": -291.4407958984375, "step": 355 }, { "epoch": 0.37676609105180536, "grad_norm": 1076304.712482483, "learning_rate": 3.92247625331392e-07, "logits/chosen": -2.3268961906433105, "logits/rejected": -2.2726428508758545, "logps/chosen": -233.88784790039062, "logps/rejected": -254.55062866210938, "loss": 55491.6813, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -233.88784790039062, "rewards/margins": 20.662763595581055, "rewards/rejected": -254.55062866210938, "step": 360 }, { "epoch": 0.3819989534275249, "grad_norm": 1089826.502625074, "learning_rate": 3.8846463715146867e-07, "logits/chosen": -2.4054033756256104, "logits/rejected": -2.35465669631958, "logps/chosen": -293.21893310546875, "logps/rejected": -311.3880920410156, "loss": 56063.525, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -293.21893310546875, "rewards/margins": 18.16920280456543, "rewards/rejected": -311.3880920410156, "step": 365 }, { "epoch": 0.3872318158032444, "grad_norm": 1476308.11101748, "learning_rate": 3.846353490562664e-07, "logits/chosen": -2.3780322074890137, "logits/rejected": -2.329284191131592, "logps/chosen": -254.2071990966797, "logps/rejected": -289.7637634277344, "loss": 55320.5563, "rewards/accuracies": 0.625, "rewards/chosen": -254.2071990966797, "rewards/margins": 35.55649948120117, "rewards/rejected": -289.7637634277344, "step": 370 }, { "epoch": 0.3924646781789639, "grad_norm": 1105169.4119545654, "learning_rate": 3.8076104148617817e-07, "logits/chosen": -2.3992652893066406, "logits/rejected": -2.3519163131713867, "logps/chosen": -297.7577209472656, "logps/rejected": -303.87060546875, "loss": 55865.7375, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -297.7577209472656, "rewards/margins": 6.11287260055542, "rewards/rejected": -303.87060546875, "step": 375 }, { "epoch": 0.3976975405546834, "grad_norm": 1165531.8808372426, "learning_rate": 3.768430099352445e-07, "logits/chosen": -2.4510560035705566, "logits/rejected": -2.369868278503418, "logps/chosen": -297.7958984375, "logps/rejected": -273.23046875, "loss": 57969.975, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -297.7958984375, "rewards/margins": -24.565448760986328, "rewards/rejected": -273.23046875, "step": 380 }, { "epoch": 0.40293040293040294, "grad_norm": 1756290.8872507422, "learning_rate": 3.728825645179653e-07, "logits/chosen": -2.4245288372039795, "logits/rejected": -2.3175346851348877, "logps/chosen": -359.3501892089844, "logps/rejected": -339.73492431640625, "loss": 57982.1, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -359.3501892089844, "rewards/margins": -19.61526107788086, "rewards/rejected": -339.73492431640625, "step": 385 }, { "epoch": 0.40816326530612246, "grad_norm": 1538328.85318582, "learning_rate": 3.6888102953122304e-07, "logits/chosen": -2.190237045288086, "logits/rejected": -2.2050204277038574, "logps/chosen": -305.574951171875, "logps/rejected": -310.0028991699219, "loss": 56215.6438, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -305.574951171875, "rewards/margins": 4.427947044372559, "rewards/rejected": -310.0028991699219, "step": 390 }, { "epoch": 0.413396127681842, "grad_norm": 1032609.5663318251, "learning_rate": 3.6483974301146263e-07, "logits/chosen": -2.409813165664673, "logits/rejected": -2.279897451400757, "logps/chosen": -289.708984375, "logps/rejected": -292.1055908203125, "loss": 55959.5, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -289.708984375, "rewards/margins": 2.3966078758239746, "rewards/rejected": -292.1055908203125, "step": 395 }, { "epoch": 0.4186289900575615, "grad_norm": 1349205.656483844, "learning_rate": 3.607600562872785e-07, "logits/chosen": -2.303772211074829, "logits/rejected": -2.219710111618042, "logps/chosen": -319.6715087890625, "logps/rejected": -316.0106506347656, "loss": 57163.6375, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -319.6715087890625, "rewards/margins": -3.66082501411438, "rewards/rejected": -316.0106506347656, "step": 400 }, { "epoch": 0.423861852433281, "grad_norm": 979251.717327062, "learning_rate": 3.566433335275558e-07, "logits/chosen": -2.2218708992004395, "logits/rejected": -2.146432876586914, "logps/chosen": -270.113037109375, "logps/rejected": -288.05926513671875, "loss": 54550.1687, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -270.113037109375, "rewards/margins": 17.94621467590332, "rewards/rejected": -288.05926513671875, "step": 405 }, { "epoch": 0.4290947148090005, "grad_norm": 1332038.5850987951, "learning_rate": 3.5249095128531856e-07, "logits/chosen": -2.1922194957733154, "logits/rejected": -2.0968267917633057, "logps/chosen": -301.511962890625, "logps/rejected": -319.2638244628906, "loss": 55946.6562, "rewards/accuracies": 0.5625, "rewards/chosen": -301.511962890625, "rewards/margins": 17.751834869384766, "rewards/rejected": -319.2638244628906, "step": 410 }, { "epoch": 0.43432757718472004, "grad_norm": 1088873.3097436083, "learning_rate": 3.4830429803743705e-07, "logits/chosen": -2.3102076053619385, "logits/rejected": -2.264838695526123, "logps/chosen": -313.7403869628906, "logps/rejected": -312.85125732421875, "loss": 55392.65, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -313.7403869628906, "rewards/margins": -0.8891464471817017, "rewards/rejected": -312.85125732421875, "step": 415 }, { "epoch": 0.43956043956043955, "grad_norm": 1374961.051492005, "learning_rate": 3.4408477372034736e-07, "logits/chosen": -2.271077871322632, "logits/rejected": -2.221766948699951, "logps/chosen": -293.64752197265625, "logps/rejected": -334.50390625, "loss": 56162.7438, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -293.64752197265625, "rewards/margins": 40.85638427734375, "rewards/rejected": -334.50390625, "step": 420 }, { "epoch": 0.44479330193615907, "grad_norm": 1029506.0927493338, "learning_rate": 3.3983378926194015e-07, "logits/chosen": -2.24725079536438, "logits/rejected": -2.1463942527770996, "logps/chosen": -292.072021484375, "logps/rejected": -306.45660400390625, "loss": 55289.5437, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -292.072021484375, "rewards/margins": 14.384634017944336, "rewards/rejected": -306.45660400390625, "step": 425 }, { "epoch": 0.4500261643118786, "grad_norm": 1002942.6665806974, "learning_rate": 3.3555276610977276e-07, "logits/chosen": -2.2519736289978027, "logits/rejected": -2.1914682388305664, "logps/chosen": -308.74169921875, "logps/rejected": -312.30438232421875, "loss": 55870.9875, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -308.74169921875, "rewards/margins": 3.5627059936523438, "rewards/rejected": -312.30438232421875, "step": 430 }, { "epoch": 0.4552590266875981, "grad_norm": 1018589.2167974291, "learning_rate": 3.3124313575576487e-07, "logits/chosen": -2.17337703704834, "logits/rejected": -2.1850523948669434, "logps/chosen": -284.9986267089844, "logps/rejected": -300.66607666015625, "loss": 54878.6375, "rewards/accuracies": 0.5, "rewards/chosen": -284.9986267089844, "rewards/margins": 15.66742992401123, "rewards/rejected": -300.66607666015625, "step": 435 }, { "epoch": 0.4604918890633176, "grad_norm": 1076240.3043484294, "learning_rate": 3.269063392575352e-07, "logits/chosen": -2.107131242752075, "logits/rejected": -2.0504283905029297, "logps/chosen": -245.75363159179688, "logps/rejected": -265.7157897949219, "loss": 55359.2375, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -245.75363159179688, "rewards/margins": 19.962154388427734, "rewards/rejected": -265.7157897949219, "step": 440 }, { "epoch": 0.46572475143903713, "grad_norm": 1259375.5689547102, "learning_rate": 3.2254382675653905e-07, "logits/chosen": -2.274196147918701, "logits/rejected": -2.182969331741333, "logps/chosen": -341.2582092285156, "logps/rejected": -347.05010986328125, "loss": 55359.3875, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -341.2582092285156, "rewards/margins": 5.791925430297852, "rewards/rejected": -347.05010986328125, "step": 445 }, { "epoch": 0.47095761381475665, "grad_norm": 1927449.2632160257, "learning_rate": 3.1815705699316964e-07, "logits/chosen": -2.264638662338257, "logits/rejected": -2.235848903656006, "logps/chosen": -247.626220703125, "logps/rejected": -299.621826171875, "loss": 55009.4375, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -247.626220703125, "rewards/margins": 51.99560546875, "rewards/rejected": -299.621826171875, "step": 450 }, { "epoch": 0.47619047619047616, "grad_norm": 2337791.6005255897, "learning_rate": 3.1374749681898216e-07, "logits/chosen": -2.189664125442505, "logits/rejected": -2.1661365032196045, "logps/chosen": -283.037841796875, "logps/rejected": -331.63189697265625, "loss": 56368.575, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -283.037841796875, "rewards/margins": 48.59403991699219, "rewards/rejected": -331.63189697265625, "step": 455 }, { "epoch": 0.48142333856619574, "grad_norm": 1581637.4423084452, "learning_rate": 3.0931662070620794e-07, "logits/chosen": -2.2392799854278564, "logits/rejected": -2.1875014305114746, "logps/chosen": -283.40338134765625, "logps/rejected": -318.47039794921875, "loss": 55090.0, "rewards/accuracies": 0.5, "rewards/chosen": -283.40338134765625, "rewards/margins": 35.06700897216797, "rewards/rejected": -318.47039794921875, "step": 460 }, { "epoch": 0.48665620094191525, "grad_norm": 1456316.7528858548, "learning_rate": 3.048659102547186e-07, "logits/chosen": -2.3513636589050293, "logits/rejected": -2.2428977489471436, "logps/chosen": -318.89703369140625, "logps/rejected": -347.49859619140625, "loss": 56281.025, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -318.89703369140625, "rewards/margins": 28.60154151916504, "rewards/rejected": -347.49859619140625, "step": 465 }, { "epoch": 0.49188906331763477, "grad_norm": 1026249.5450756603, "learning_rate": 3.003968536966078e-07, "logits/chosen": -2.180349826812744, "logits/rejected": -2.0016205310821533, "logps/chosen": -281.388916015625, "logps/rejected": -276.56427001953125, "loss": 53576.8, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -281.388916015625, "rewards/margins": -4.824639320373535, "rewards/rejected": -276.56427001953125, "step": 470 }, { "epoch": 0.4971219256933543, "grad_norm": 1429705.0746835866, "learning_rate": 2.959109453985547e-07, "logits/chosen": -2.2324633598327637, "logits/rejected": -2.0949769020080566, "logps/chosen": -299.1005554199219, "logps/rejected": -289.7796325683594, "loss": 55444.925, "rewards/accuracies": 0.5, "rewards/chosen": -299.1005554199219, "rewards/margins": -9.320911407470703, "rewards/rejected": -289.7796325683594, "step": 475 }, { "epoch": 0.5023547880690737, "grad_norm": 1213056.8914210084, "learning_rate": 2.9140968536213693e-07, "logits/chosen": -2.1725077629089355, "logits/rejected": -2.1495959758758545, "logps/chosen": -259.3185729980469, "logps/rejected": -283.7967529296875, "loss": 54958.5125, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -259.3185729980469, "rewards/margins": 24.478168487548828, "rewards/rejected": -283.7967529296875, "step": 480 }, { "epoch": 0.5075876504447933, "grad_norm": 1461214.8419503546, "learning_rate": 2.868945787222582e-07, "logits/chosen": -2.1361522674560547, "logits/rejected": -2.180379867553711, "logps/chosen": -234.53329467773438, "logps/rejected": -269.418701171875, "loss": 55915.4812, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -234.53329467773438, "rewards/margins": 34.885379791259766, "rewards/rejected": -269.418701171875, "step": 485 }, { "epoch": 0.5128205128205128, "grad_norm": 1374060.3676287297, "learning_rate": 2.823671352438608e-07, "logits/chosen": -2.101999044418335, "logits/rejected": -2.050888776779175, "logps/chosen": -254.61770629882812, "logps/rejected": -283.10235595703125, "loss": 55689.6875, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -254.61770629882812, "rewards/margins": 28.48464012145996, "rewards/rejected": -283.10235595703125, "step": 490 }, { "epoch": 0.5180533751962323, "grad_norm": 1234397.4379234589, "learning_rate": 2.7782886881708866e-07, "logits/chosen": -2.2712063789367676, "logits/rejected": -2.099457263946533, "logps/chosen": -310.07879638671875, "logps/rejected": -374.5906677246094, "loss": 54732.425, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -310.07879638671875, "rewards/margins": 64.51188659667969, "rewards/rejected": -374.5906677246094, "step": 495 }, { "epoch": 0.5232862375719518, "grad_norm": 2228603.377630035, "learning_rate": 2.73281296951072e-07, "logits/chosen": -2.017988920211792, "logits/rejected": -2.0341272354125977, "logps/chosen": -222.3467559814453, "logps/rejected": -274.4383239746094, "loss": 57469.6625, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -222.3467559814453, "rewards/margins": 52.0915641784668, "rewards/rejected": -274.4383239746094, "step": 500 }, { "epoch": 0.5285190999476713, "grad_norm": 1599169.5252724146, "learning_rate": 2.6872594026650096e-07, "logits/chosen": -2.240408420562744, "logits/rejected": -2.2190628051757812, "logps/chosen": -270.1956481933594, "logps/rejected": -335.818359375, "loss": 53495.1125, "rewards/accuracies": 0.625, "rewards/chosen": -270.1956481933594, "rewards/margins": 65.6227035522461, "rewards/rejected": -335.818359375, "step": 505 }, { "epoch": 0.533751962323391, "grad_norm": 1490279.794358093, "learning_rate": 2.641643219871597e-07, "logits/chosen": -2.219712734222412, "logits/rejected": -2.139911651611328, "logps/chosen": -288.52215576171875, "logps/rejected": -317.8120422363281, "loss": 54654.5563, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -288.52215576171875, "rewards/margins": 29.289892196655273, "rewards/rejected": -317.8120422363281, "step": 510 }, { "epoch": 0.5389848246991105, "grad_norm": 935331.0867922652, "learning_rate": 2.595979674305891e-07, "logits/chosen": -2.084282398223877, "logits/rejected": -2.0336263179779053, "logps/chosen": -237.1022186279297, "logps/rejected": -258.7450256347656, "loss": 54242.45, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -237.1022186279297, "rewards/margins": 21.642807006835938, "rewards/rejected": -258.7450256347656, "step": 515 }, { "epoch": 0.54421768707483, "grad_norm": 856340.2472942632, "learning_rate": 2.550284034980507e-07, "logits/chosen": -2.1015374660491943, "logits/rejected": -2.0551133155822754, "logps/chosen": -279.68505859375, "logps/rejected": -288.3494567871094, "loss": 55627.8063, "rewards/accuracies": 0.5625, "rewards/chosen": -279.68505859375, "rewards/margins": 8.664429664611816, "rewards/rejected": -288.3494567871094, "step": 520 }, { "epoch": 0.5494505494505495, "grad_norm": 990531.7402526786, "learning_rate": 2.5045715816395916e-07, "logits/chosen": -2.2954821586608887, "logits/rejected": -2.194169521331787, "logps/chosen": -299.71234130859375, "logps/rejected": -318.68243408203125, "loss": 55352.35, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -299.71234130859375, "rewards/margins": 18.970050811767578, "rewards/rejected": -318.68243408203125, "step": 525 }, { "epoch": 0.554683411826269, "grad_norm": 1450335.2512408984, "learning_rate": 2.4588575996495794e-07, "logits/chosen": -2.2317874431610107, "logits/rejected": -2.169450521469116, "logps/chosen": -276.4530334472656, "logps/rejected": -311.96551513671875, "loss": 54057.4375, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -276.4530334472656, "rewards/margins": 35.512451171875, "rewards/rejected": -311.96551513671875, "step": 530 }, { "epoch": 0.5599162742019885, "grad_norm": 1671087.2529512038, "learning_rate": 2.413157374888054e-07, "logits/chosen": -2.2822182178497314, "logits/rejected": -2.2092044353485107, "logps/chosen": -297.13531494140625, "logps/rejected": -293.6783142089844, "loss": 56565.0062, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -297.13531494140625, "rewards/margins": -3.457014560699463, "rewards/rejected": -293.6783142089844, "step": 535 }, { "epoch": 0.565149136577708, "grad_norm": 1228860.5418419128, "learning_rate": 2.367486188632446e-07, "logits/chosen": -2.173696279525757, "logits/rejected": -2.099151849746704, "logps/chosen": -266.07257080078125, "logps/rejected": -315.889892578125, "loss": 56023.175, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -266.07257080078125, "rewards/margins": 49.81734085083008, "rewards/rejected": -315.889892578125, "step": 540 }, { "epoch": 0.5703819989534276, "grad_norm": 1271355.6361364825, "learning_rate": 2.321859312450267e-07, "logits/chosen": -2.364675760269165, "logits/rejected": -2.297121286392212, "logps/chosen": -312.4518737792969, "logps/rejected": -373.5928039550781, "loss": 54175.6875, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -312.4518737792969, "rewards/margins": 61.14093017578125, "rewards/rejected": -373.5928039550781, "step": 545 }, { "epoch": 0.5756148613291471, "grad_norm": 1398597.9993464884, "learning_rate": 2.276292003092593e-07, "logits/chosen": -2.2173264026641846, "logits/rejected": -2.1705925464630127, "logps/chosen": -307.5565490722656, "logps/rejected": -310.6933898925781, "loss": 54367.9, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -307.5565490722656, "rewards/margins": 3.136824131011963, "rewards/rejected": -310.6933898925781, "step": 550 }, { "epoch": 0.5808477237048666, "grad_norm": 1551718.3427722957, "learning_rate": 2.230799497392495e-07, "logits/chosen": -2.2841944694519043, "logits/rejected": -2.225440502166748, "logps/chosen": -272.79681396484375, "logps/rejected": -291.16204833984375, "loss": 56317.2063, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -272.79681396484375, "rewards/margins": 18.365182876586914, "rewards/rejected": -291.16204833984375, "step": 555 }, { "epoch": 0.5860805860805861, "grad_norm": 1420409.765508531, "learning_rate": 2.185397007170141e-07, "logits/chosen": -2.300354480743408, "logits/rejected": -2.2717068195343018, "logps/chosen": -313.4623107910156, "logps/rejected": -360.75189208984375, "loss": 55098.0625, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -313.4623107910156, "rewards/margins": 47.28960418701172, "rewards/rejected": -360.75189208984375, "step": 560 }, { "epoch": 0.5913134484563056, "grad_norm": 1392120.5854896335, "learning_rate": 2.14009971414625e-07, "logits/chosen": -2.2033753395080566, "logits/rejected": -2.1571030616760254, "logps/chosen": -282.2511291503906, "logps/rejected": -287.3548889160156, "loss": 54579.0062, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -282.2511291503906, "rewards/margins": 5.103717803955078, "rewards/rejected": -287.3548889160156, "step": 565 }, { "epoch": 0.5965463108320251, "grad_norm": 1488981.6454046704, "learning_rate": 2.094922764865619e-07, "logits/chosen": -2.218703031539917, "logits/rejected": -2.244843006134033, "logps/chosen": -232.9685821533203, "logps/rejected": -295.5643615722656, "loss": 56100.95, "rewards/accuracies": 0.625, "rewards/chosen": -232.9685821533203, "rewards/margins": 62.59580612182617, "rewards/rejected": -295.5643615722656, "step": 570 }, { "epoch": 0.6017791732077447, "grad_norm": 1135626.4886801469, "learning_rate": 2.0498812656324064e-07, "logits/chosen": -2.142216205596924, "logits/rejected": -2.1622607707977295, "logps/chosen": -289.1842041015625, "logps/rejected": -325.3665466308594, "loss": 54899.825, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -289.1842041015625, "rewards/margins": 36.182373046875, "rewards/rejected": -325.3665466308594, "step": 575 }, { "epoch": 0.6070120355834642, "grad_norm": 1329372.6719742662, "learning_rate": 2.0049902774588797e-07, "logits/chosen": -2.207730770111084, "logits/rejected": -2.0855050086975098, "logps/chosen": -299.4252014160156, "logps/rejected": -332.8768615722656, "loss": 55401.0125, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -299.4252014160156, "rewards/margins": 33.4516487121582, "rewards/rejected": -332.8768615722656, "step": 580 }, { "epoch": 0.6122448979591837, "grad_norm": 1187559.6151840126, "learning_rate": 1.960264811029297e-07, "logits/chosen": -2.22457218170166, "logits/rejected": -2.148383617401123, "logps/chosen": -281.52923583984375, "logps/rejected": -282.35784912109375, "loss": 56603.25, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -281.52923583984375, "rewards/margins": 0.828582763671875, "rewards/rejected": -282.35784912109375, "step": 585 }, { "epoch": 0.6174777603349032, "grad_norm": 1671629.5147047387, "learning_rate": 1.9157198216806238e-07, "logits/chosen": -2.209186315536499, "logits/rejected": -2.1159491539001465, "logps/chosen": -251.740966796875, "logps/rejected": -269.68011474609375, "loss": 55453.7562, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -251.740966796875, "rewards/margins": 17.939146041870117, "rewards/rejected": -269.68011474609375, "step": 590 }, { "epoch": 0.6227106227106227, "grad_norm": 1173783.4292100056, "learning_rate": 1.8713702044017577e-07, "logits/chosen": -2.1656856536865234, "logits/rejected": -2.1623783111572266, "logps/chosen": -301.41497802734375, "logps/rejected": -317.84295654296875, "loss": 54113.325, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -301.41497802734375, "rewards/margins": 16.427982330322266, "rewards/rejected": -317.84295654296875, "step": 595 }, { "epoch": 0.6279434850863422, "grad_norm": 1387972.3011875993, "learning_rate": 1.8272307888529274e-07, "logits/chosen": -2.1883492469787598, "logits/rejected": -2.1378281116485596, "logps/chosen": -257.42822265625, "logps/rejected": -320.2197265625, "loss": 55090.8625, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -257.42822265625, "rewards/margins": 62.7915153503418, "rewards/rejected": -320.2197265625, "step": 600 }, { "epoch": 0.6331763474620618, "grad_norm": 1488821.1810637303, "learning_rate": 1.783316334406939e-07, "logits/chosen": -2.185284376144409, "logits/rejected": -2.0930609703063965, "logps/chosen": -322.49005126953125, "logps/rejected": -319.7123718261719, "loss": 54071.0125, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -322.49005126953125, "rewards/margins": -2.777683973312378, "rewards/rejected": -319.7123718261719, "step": 605 }, { "epoch": 0.6384092098377813, "grad_norm": 1496837.8340915893, "learning_rate": 1.7396415252139288e-07, "logits/chosen": -2.2097795009613037, "logits/rejected": -2.0639331340789795, "logps/chosen": -308.24530029296875, "logps/rejected": -331.1815490722656, "loss": 54010.9875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -308.24530029296875, "rewards/margins": 22.936208724975586, "rewards/rejected": -331.1815490722656, "step": 610 }, { "epoch": 0.6436420722135008, "grad_norm": 1535540.9500706908, "learning_rate": 1.6962209652912625e-07, "logits/chosen": -2.1692049503326416, "logits/rejected": -2.077504873275757, "logps/chosen": -255.7120361328125, "logps/rejected": -309.77008056640625, "loss": 54530.4875, "rewards/accuracies": 0.625, "rewards/chosen": -255.7120361328125, "rewards/margins": 54.058021545410156, "rewards/rejected": -309.77008056640625, "step": 615 }, { "epoch": 0.6488749345892203, "grad_norm": 1397345.2747377793, "learning_rate": 1.6530691736402316e-07, "logits/chosen": -2.1868765354156494, "logits/rejected": -2.1478359699249268, "logps/chosen": -292.8278503417969, "logps/rejected": -312.1719055175781, "loss": 54489.7375, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -292.8278503417969, "rewards/margins": 19.344045639038086, "rewards/rejected": -312.1719055175781, "step": 620 }, { "epoch": 0.6541077969649398, "grad_norm": 1977409.2998021427, "learning_rate": 1.610200579391182e-07, "logits/chosen": -2.1679329872131348, "logits/rejected": -2.1316826343536377, "logps/chosen": -283.0874938964844, "logps/rejected": -364.5801696777344, "loss": 55410.75, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -283.0874938964844, "rewards/margins": 81.49267578125, "rewards/rejected": -364.5801696777344, "step": 625 }, { "epoch": 0.6593406593406593, "grad_norm": 1362818.5877687463, "learning_rate": 1.5676295169786864e-07, "logits/chosen": -2.0093648433685303, "logits/rejected": -1.9298946857452393, "logps/chosen": -282.3995056152344, "logps/rejected": -278.3210754394531, "loss": 54493.85, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -282.3995056152344, "rewards/margins": -4.078440189361572, "rewards/rejected": -278.3210754394531, "step": 630 }, { "epoch": 0.6645735217163788, "grad_norm": 956804.5377818815, "learning_rate": 1.5253702213483842e-07, "logits/chosen": -2.1643216609954834, "logits/rejected": -2.119776964187622, "logps/chosen": -271.3257751464844, "logps/rejected": -303.90423583984375, "loss": 54765.8125, "rewards/accuracies": 0.5625, "rewards/chosen": -271.3257751464844, "rewards/margins": 32.57844924926758, "rewards/rejected": -303.90423583984375, "step": 635 }, { "epoch": 0.6698063840920984, "grad_norm": 1933509.9856251064, "learning_rate": 1.483436823197092e-07, "logits/chosen": -2.093644857406616, "logits/rejected": -2.10066556930542, "logps/chosen": -269.1563415527344, "logps/rejected": -319.5292663574219, "loss": 54325.475, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -269.1563415527344, "rewards/margins": 50.37293243408203, "rewards/rejected": -319.5292663574219, "step": 640 }, { "epoch": 0.6750392464678179, "grad_norm": 1218847.4753339728, "learning_rate": 1.4418433442477703e-07, "logits/chosen": -2.216813087463379, "logits/rejected": -2.1345386505126953, "logps/chosen": -338.1468505859375, "logps/rejected": -352.8824768066406, "loss": 53920.6188, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -338.1468505859375, "rewards/margins": 14.735623359680176, "rewards/rejected": -352.8824768066406, "step": 645 }, { "epoch": 0.6802721088435374, "grad_norm": 1418811.7836556053, "learning_rate": 1.4006036925609243e-07, "logits/chosen": -2.139899492263794, "logits/rejected": -2.0506820678710938, "logps/chosen": -257.8123779296875, "logps/rejected": -283.587890625, "loss": 55958.4187, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -257.8123779296875, "rewards/margins": 25.775487899780273, "rewards/rejected": -283.587890625, "step": 650 }, { "epoch": 0.6855049712192569, "grad_norm": 1109912.054173663, "learning_rate": 1.3597316578840216e-07, "logits/chosen": -2.0801479816436768, "logits/rejected": -2.0766029357910156, "logps/chosen": -256.91619873046875, "logps/rejected": -276.5906677246094, "loss": 54215.7375, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -256.91619873046875, "rewards/margins": 19.674455642700195, "rewards/rejected": -276.5906677246094, "step": 655 }, { "epoch": 0.6907378335949764, "grad_norm": 1210210.3382933068, "learning_rate": 1.319240907040458e-07, "logits/chosen": -2.245999574661255, "logits/rejected": -2.1108059883117676, "logps/chosen": -321.09796142578125, "logps/rejected": -322.8074645996094, "loss": 55360.3, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -321.09796142578125, "rewards/margins": 1.7095245122909546, "rewards/rejected": -322.8074645996094, "step": 660 }, { "epoch": 0.6959706959706959, "grad_norm": 1098340.112919491, "learning_rate": 1.279144979359641e-07, "logits/chosen": -2.1789064407348633, "logits/rejected": -2.157804489135742, "logps/chosen": -267.2674865722656, "logps/rejected": -305.3499450683594, "loss": 55846.8812, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -267.2674865722656, "rewards/margins": 38.08247756958008, "rewards/rejected": -305.3499450683594, "step": 665 }, { "epoch": 0.7012035583464155, "grad_norm": 1447767.6648965469, "learning_rate": 1.2394572821496948e-07, "logits/chosen": -2.2281277179718018, "logits/rejected": -2.21685791015625, "logps/chosen": -273.71417236328125, "logps/rejected": -327.2356262207031, "loss": 54601.5563, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -273.71417236328125, "rewards/margins": 53.521484375, "rewards/rejected": -327.2356262207031, "step": 670 }, { "epoch": 0.706436420722135, "grad_norm": 1277662.338967538, "learning_rate": 1.2001910862143174e-07, "logits/chosen": -2.2508022785186768, "logits/rejected": -2.217378854751587, "logps/chosen": -325.85906982421875, "logps/rejected": -380.45074462890625, "loss": 55330.475, "rewards/accuracies": 0.5625, "rewards/chosen": -325.85906982421875, "rewards/margins": 54.59168243408203, "rewards/rejected": -380.45074462890625, "step": 675 }, { "epoch": 0.7116692830978545, "grad_norm": 1785233.744803184, "learning_rate": 1.1613595214152711e-07, "logits/chosen": -2.2163052558898926, "logits/rejected": -2.1031951904296875, "logps/chosen": -284.403076171875, "logps/rejected": -271.61138916015625, "loss": 54460.6625, "rewards/accuracies": 0.375, "rewards/chosen": -284.403076171875, "rewards/margins": -12.791729927062988, "rewards/rejected": -271.61138916015625, "step": 680 }, { "epoch": 0.716902145473574, "grad_norm": 1084153.5773127347, "learning_rate": 1.122975572282018e-07, "logits/chosen": -2.19317364692688, "logits/rejected": -2.1025004386901855, "logps/chosen": -290.7996520996094, "logps/rejected": -270.1470947265625, "loss": 54599.6188, "rewards/accuracies": 0.4375, "rewards/chosen": -290.7996520996094, "rewards/margins": -20.65255355834961, "rewards/rejected": -270.1470947265625, "step": 685 }, { "epoch": 0.7221350078492935, "grad_norm": 1089542.9473462715, "learning_rate": 1.0850520736699362e-07, "logits/chosen": -2.144193172454834, "logits/rejected": -2.0945630073547363, "logps/chosen": -264.43109130859375, "logps/rejected": -340.2378845214844, "loss": 54947.6625, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -264.43109130859375, "rewards/margins": 75.80680084228516, "rewards/rejected": -340.2378845214844, "step": 690 }, { "epoch": 0.727367870225013, "grad_norm": 1178567.4912604708, "learning_rate": 1.0476017064685941e-07, "logits/chosen": -2.2328460216522217, "logits/rejected": -2.1399552822113037, "logps/chosen": -284.4504089355469, "logps/rejected": -293.85321044921875, "loss": 55292.35, "rewards/accuracies": 0.5, "rewards/chosen": -284.4504089355469, "rewards/margins": 9.402796745300293, "rewards/rejected": -293.85321044921875, "step": 695 }, { "epoch": 0.7326007326007326, "grad_norm": 1333559.7423557746, "learning_rate": 1.0106369933615042e-07, "logits/chosen": -2.011481761932373, "logits/rejected": -1.935136079788208, "logps/chosen": -258.0648193359375, "logps/rejected": -269.0512390136719, "loss": 56453.9, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -258.0648193359375, "rewards/margins": 10.98639965057373, "rewards/rejected": -269.0512390136719, "step": 700 }, { "epoch": 0.7378335949764521, "grad_norm": 1081251.2157163108, "learning_rate": 9.741702946387748e-08, "logits/chosen": -2.1545426845550537, "logits/rejected": -2.0765717029571533, "logps/chosen": -247.3363494873047, "logps/rejected": -301.45672607421875, "loss": 54404.8, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -247.3363494873047, "rewards/margins": 54.120391845703125, "rewards/rejected": -301.45672607421875, "step": 705 }, { "epoch": 0.7430664573521716, "grad_norm": 1104364.5468847684, "learning_rate": 9.382138040640714e-08, "logits/chosen": -1.989871621131897, "logits/rejected": -1.9418586492538452, "logps/chosen": -244.57852172851562, "logps/rejected": -289.2986755371094, "loss": 54110.525, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -244.57852172851562, "rewards/margins": 44.72013854980469, "rewards/rejected": -289.2986755371094, "step": 710 }, { "epoch": 0.7482993197278912, "grad_norm": 1104089.7558876271, "learning_rate": 9.027795447972545e-08, "logits/chosen": -2.2300283908843994, "logits/rejected": -2.1951324939727783, "logps/chosen": -286.88922119140625, "logps/rejected": -345.98822021484375, "loss": 52983.1375, "rewards/accuracies": 0.625, "rewards/chosen": -286.88922119140625, "rewards/margins": 59.0989990234375, "rewards/rejected": -345.98822021484375, "step": 715 }, { "epoch": 0.7535321821036107, "grad_norm": 1208966.7828290404, "learning_rate": 8.678793653740632e-08, "logits/chosen": -2.19745135307312, "logits/rejected": -2.0950427055358887, "logps/chosen": -259.8890686035156, "logps/rejected": -310.04876708984375, "loss": 55099.525, "rewards/accuracies": 0.625, "rewards/chosen": -259.8890686035156, "rewards/margins": 50.15970993041992, "rewards/rejected": -310.04876708984375, "step": 720 }, { "epoch": 0.7587650444793302, "grad_norm": 1272614.4979089308, "learning_rate": 8.335249357441945e-08, "logits/chosen": -2.041647434234619, "logits/rejected": -2.0392508506774902, "logps/chosen": -260.08172607421875, "logps/rejected": -329.94854736328125, "loss": 54837.7125, "rewards/accuracies": 0.625, "rewards/chosen": -260.08172607421875, "rewards/margins": 69.86690521240234, "rewards/rejected": -329.94854736328125, "step": 725 }, { "epoch": 0.7639979068550498, "grad_norm": 1162448.1540473108, "learning_rate": 7.997277433690983e-08, "logits/chosen": -2.1625466346740723, "logits/rejected": -2.0773284435272217, "logps/chosen": -268.3184814453125, "logps/rejected": -292.38433837890625, "loss": 55808.2125, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -268.3184814453125, "rewards/margins": 24.065847396850586, "rewards/rejected": -292.38433837890625, "step": 730 }, { "epoch": 0.7692307692307693, "grad_norm": 1243184.3713818155, "learning_rate": 7.664990893807885e-08, "logits/chosen": -2.1861138343811035, "logits/rejected": -2.1057441234588623, "logps/chosen": -248.58114624023438, "logps/rejected": -316.4317626953125, "loss": 54297.5375, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -248.58114624023438, "rewards/margins": 67.85064697265625, "rewards/rejected": -316.4317626953125, "step": 735 }, { "epoch": 0.7744636316064888, "grad_norm": 1338047.2392976265, "learning_rate": 7.338500848029602e-08, "logits/chosen": -2.1806750297546387, "logits/rejected": -2.1461918354034424, "logps/chosen": -285.19451904296875, "logps/rejected": -319.1790466308594, "loss": 55123.75, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -285.19451904296875, "rewards/margins": 33.984554290771484, "rewards/rejected": -319.1790466308594, "step": 740 }, { "epoch": 0.7796964939822083, "grad_norm": 1984510.6026826864, "learning_rate": 7.01791646835681e-08, "logits/chosen": -2.2138607501983643, "logits/rejected": -2.1573081016540527, "logps/chosen": -270.462890625, "logps/rejected": -285.9214172363281, "loss": 54790.0, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -270.462890625, "rewards/margins": 15.458574295043945, "rewards/rejected": -285.9214172363281, "step": 745 }, { "epoch": 0.7849293563579278, "grad_norm": 1378850.8751623577, "learning_rate": 6.70334495204884e-08, "logits/chosen": -2.117934465408325, "logits/rejected": -2.0909981727600098, "logps/chosen": -254.19442749023438, "logps/rejected": -307.30255126953125, "loss": 54093.9875, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -254.19442749023438, "rewards/margins": 53.108154296875, "rewards/rejected": -307.30255126953125, "step": 750 }, { "epoch": 0.7901622187336473, "grad_norm": 1370111.0134525597, "learning_rate": 6.394891485779022e-08, "logits/chosen": -2.266648292541504, "logits/rejected": -2.2330288887023926, "logps/chosen": -290.75335693359375, "logps/rejected": -312.68597412109375, "loss": 54021.125, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -290.75335693359375, "rewards/margins": 21.932575225830078, "rewards/rejected": -312.68597412109375, "step": 755 }, { "epoch": 0.7953950811093669, "grad_norm": 1572823.5723971077, "learning_rate": 6.092659210462231e-08, "logits/chosen": -2.1503944396972656, "logits/rejected": -2.113105297088623, "logps/chosen": -281.13037109375, "logps/rejected": -321.85693359375, "loss": 54900.25, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -281.13037109375, "rewards/margins": 40.7265510559082, "rewards/rejected": -321.85693359375, "step": 760 }, { "epoch": 0.8006279434850864, "grad_norm": 1225741.5170516171, "learning_rate": 5.7967491867665975e-08, "logits/chosen": -2.0941481590270996, "logits/rejected": -2.064021348953247, "logps/chosen": -251.21670532226562, "logps/rejected": -310.03631591796875, "loss": 54873.5938, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -251.21670532226562, "rewards/margins": 58.8195915222168, "rewards/rejected": -310.03631591796875, "step": 765 }, { "epoch": 0.8058608058608059, "grad_norm": 1642379.1878661881, "learning_rate": 5.507260361320737e-08, "logits/chosen": -2.1802749633789062, "logits/rejected": -2.097052812576294, "logps/chosen": -280.42254638671875, "logps/rejected": -292.81768798828125, "loss": 54552.0125, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -280.42254638671875, "rewards/margins": 12.39512825012207, "rewards/rejected": -292.81768798828125, "step": 770 }, { "epoch": 0.8110936682365254, "grad_norm": 1328549.6940408363, "learning_rate": 5.2242895336278734e-08, "logits/chosen": -2.2298295497894287, "logits/rejected": -2.1420650482177734, "logps/chosen": -275.43951416015625, "logps/rejected": -293.7701110839844, "loss": 54556.4625, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -275.43951416015625, "rewards/margins": 18.330612182617188, "rewards/rejected": -293.7701110839844, "step": 775 }, { "epoch": 0.8163265306122449, "grad_norm": 1403447.9375936964, "learning_rate": 4.947931323697982e-08, "logits/chosen": -2.1510796546936035, "logits/rejected": -2.070650339126587, "logps/chosen": -281.1759338378906, "logps/rejected": -301.5060119628906, "loss": 53222.4187, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -281.1759338378906, "rewards/margins": 20.330089569091797, "rewards/rejected": -301.5060119628906, "step": 780 }, { "epoch": 0.8215593929879644, "grad_norm": 1248429.4711292263, "learning_rate": 4.678278140408667e-08, "logits/chosen": -2.2055792808532715, "logits/rejected": -2.0527145862579346, "logps/chosen": -284.21142578125, "logps/rejected": -297.88018798828125, "loss": 51932.0875, "rewards/accuracies": 0.5625, "rewards/chosen": -284.21142578125, "rewards/margins": 13.668767929077148, "rewards/rejected": -297.88018798828125, "step": 785 }, { "epoch": 0.826792255363684, "grad_norm": 1250406.1121283756, "learning_rate": 4.415420150605398e-08, "logits/chosen": -2.110973596572876, "logits/rejected": -1.9595458507537842, "logps/chosen": -252.9059295654297, "logps/rejected": -279.2762451171875, "loss": 55843.9812, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -252.9059295654297, "rewards/margins": 26.370315551757812, "rewards/rejected": -279.2762451171875, "step": 790 }, { "epoch": 0.8320251177394035, "grad_norm": 1200316.971470453, "learning_rate": 4.159445248951457e-08, "logits/chosen": -2.0804190635681152, "logits/rejected": -2.0888171195983887, "logps/chosen": -227.65390014648438, "logps/rejected": -293.1388244628906, "loss": 54166.2125, "rewards/accuracies": 0.6875, "rewards/chosen": -227.65390014648438, "rewards/margins": 65.48490905761719, "rewards/rejected": -293.1388244628906, "step": 795 }, { "epoch": 0.837257980115123, "grad_norm": 1453921.71532858, "learning_rate": 3.9104390285376374e-08, "logits/chosen": -2.275310754776001, "logits/rejected": -2.17592191696167, "logps/chosen": -284.0006103515625, "logps/rejected": -263.94525146484375, "loss": 55792.875, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -284.0006103515625, "rewards/margins": -20.05536460876465, "rewards/rejected": -263.94525146484375, "step": 800 }, { "epoch": 0.8424908424908425, "grad_norm": 1115550.7841994467, "learning_rate": 3.6684847522615664e-08, "logits/chosen": -2.1132473945617676, "logits/rejected": -2.0296568870544434, "logps/chosen": -242.7162628173828, "logps/rejected": -279.27545166015625, "loss": 55248.8063, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -242.7162628173828, "rewards/margins": 36.55915069580078, "rewards/rejected": -279.27545166015625, "step": 805 }, { "epoch": 0.847723704866562, "grad_norm": 1588544.8496029316, "learning_rate": 3.433663324986208e-08, "logits/chosen": -2.1658711433410645, "logits/rejected": -2.0674452781677246, "logps/chosen": -296.5272216796875, "logps/rejected": -326.5904541015625, "loss": 55337.175, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -296.5272216796875, "rewards/margins": 30.063217163085938, "rewards/rejected": -326.5904541015625, "step": 810 }, { "epoch": 0.8529565672422815, "grad_norm": 1954751.458337351, "learning_rate": 3.206053266486808e-08, "logits/chosen": -2.254883289337158, "logits/rejected": -2.1984355449676514, "logps/chosen": -274.1257629394531, "logps/rejected": -293.55303955078125, "loss": 54866.6188, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -274.1257629394531, "rewards/margins": 19.427263259887695, "rewards/rejected": -293.55303955078125, "step": 815 }, { "epoch": 0.858189429618001, "grad_norm": 1259920.9995805293, "learning_rate": 2.9857306851953897e-08, "logits/chosen": -2.12813663482666, "logits/rejected": -2.065500259399414, "logps/chosen": -282.5124206542969, "logps/rejected": -329.7523498535156, "loss": 54957.6875, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -282.5124206542969, "rewards/margins": 47.23994064331055, "rewards/rejected": -329.7523498535156, "step": 820 }, { "epoch": 0.8634222919937206, "grad_norm": 2932710.1060309387, "learning_rate": 2.772769252751575e-08, "logits/chosen": -2.2625370025634766, "logits/rejected": -2.1728615760803223, "logps/chosen": -326.66375732421875, "logps/rejected": -282.999755859375, "loss": 55274.6625, "rewards/accuracies": 0.5, "rewards/chosen": -326.66375732421875, "rewards/margins": -43.66400909423828, "rewards/rejected": -282.999755859375, "step": 825 }, { "epoch": 0.8686551543694401, "grad_norm": 1506078.4494627095, "learning_rate": 2.567240179368185e-08, "logits/chosen": -2.1724421977996826, "logits/rejected": -2.121241569519043, "logps/chosen": -305.38079833984375, "logps/rejected": -287.86627197265625, "loss": 53377.5625, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -305.38079833984375, "rewards/margins": -17.514530181884766, "rewards/rejected": -287.86627197265625, "step": 830 }, { "epoch": 0.8738880167451596, "grad_norm": 1304314.0364927459, "learning_rate": 2.3692121900199174e-08, "logits/chosen": -2.153219699859619, "logits/rejected": -2.0992071628570557, "logps/chosen": -261.697998046875, "logps/rejected": -283.06072998046875, "loss": 54374.4625, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -261.697998046875, "rewards/margins": 21.36276626586914, "rewards/rejected": -283.06072998046875, "step": 835 }, { "epoch": 0.8791208791208791, "grad_norm": 1648439.4660647989, "learning_rate": 2.1787515014630357e-08, "logits/chosen": -2.146265983581543, "logits/rejected": -2.111722946166992, "logps/chosen": -265.7535705566406, "logps/rejected": -268.1636962890625, "loss": 55597.7875, "rewards/accuracies": 0.5, "rewards/chosen": -265.7535705566406, "rewards/margins": 2.4101357460021973, "rewards/rejected": -268.1636962890625, "step": 840 }, { "epoch": 0.8843537414965986, "grad_norm": 1734398.4767520986, "learning_rate": 1.995921800093761e-08, "logits/chosen": -2.073884963989258, "logits/rejected": -1.9895031452178955, "logps/chosen": -282.88983154296875, "logps/rejected": -306.662353515625, "loss": 53997.5125, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -282.88983154296875, "rewards/margins": 23.772525787353516, "rewards/rejected": -306.662353515625, "step": 845 }, { "epoch": 0.8895866038723181, "grad_norm": 1454626.9788120938, "learning_rate": 1.820784220652766e-08, "logits/chosen": -2.1386914253234863, "logits/rejected": -2.0203399658203125, "logps/chosen": -289.72161865234375, "logps/rejected": -275.7218017578125, "loss": 55009.9875, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -289.72161865234375, "rewards/margins": -13.99982738494873, "rewards/rejected": -275.7218017578125, "step": 850 }, { "epoch": 0.8948194662480377, "grad_norm": 1089368.648801681, "learning_rate": 1.6533973257828765e-08, "logits/chosen": -2.091768980026245, "logits/rejected": -2.0091001987457275, "logps/chosen": -287.00640869140625, "logps/rejected": -331.1282958984375, "loss": 54365.375, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -287.00640869140625, "rewards/margins": 44.12189483642578, "rewards/rejected": -331.1282958984375, "step": 855 }, { "epoch": 0.9000523286237572, "grad_norm": 1510934.5115232496, "learning_rate": 1.4938170864468636e-08, "logits/chosen": -2.1866893768310547, "logits/rejected": -2.085561513900757, "logps/chosen": -258.2256774902344, "logps/rejected": -292.9275817871094, "loss": 54320.6625, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -258.2256774902344, "rewards/margins": 34.7019157409668, "rewards/rejected": -292.9275817871094, "step": 860 }, { "epoch": 0.9052851909994767, "grad_norm": 1724641.859318044, "learning_rate": 1.342096863211828e-08, "logits/chosen": -2.1254117488861084, "logits/rejected": -2.0715444087982178, "logps/chosen": -281.90814208984375, "logps/rejected": -320.0205078125, "loss": 56361.75, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -281.90814208984375, "rewards/margins": 38.11237335205078, "rewards/rejected": -320.0205078125, "step": 865 }, { "epoch": 0.9105180533751962, "grad_norm": 1063487.5975205353, "learning_rate": 1.1982873884064465e-08, "logits/chosen": -1.9770715236663818, "logits/rejected": -2.01908540725708, "logps/chosen": -227.65396118164062, "logps/rejected": -317.01251220703125, "loss": 55278.3875, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -227.65396118164062, "rewards/margins": 89.35859680175781, "rewards/rejected": -317.01251220703125, "step": 870 }, { "epoch": 0.9157509157509157, "grad_norm": 1565693.148460451, "learning_rate": 1.062436749157053e-08, "logits/chosen": -2.1096649169921875, "logits/rejected": -2.111191749572754, "logps/chosen": -293.599609375, "logps/rejected": -321.7491760253906, "loss": 54704.9375, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -293.599609375, "rewards/margins": 28.14957046508789, "rewards/rejected": -321.7491760253906, "step": 875 }, { "epoch": 0.9209837781266352, "grad_norm": 1036263.9285741834, "learning_rate": 9.345903713082304e-09, "logits/chosen": -2.1749892234802246, "logits/rejected": -2.0691840648651123, "logps/chosen": -331.82086181640625, "logps/rejected": -299.9912414550781, "loss": 53077.875, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -331.82086181640625, "rewards/margins": -31.82961082458496, "rewards/rejected": -299.9912414550781, "step": 880 }, { "epoch": 0.9262166405023547, "grad_norm": 1469306.753540594, "learning_rate": 8.147910042332922e-09, "logits/chosen": -2.1455626487731934, "logits/rejected": -2.0270955562591553, "logps/chosen": -334.5442810058594, "logps/rejected": -350.59002685546875, "loss": 55319.25, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -334.5442810058594, "rewards/margins": 16.04566764831543, "rewards/rejected": -350.59002685546875, "step": 885 }, { "epoch": 0.9314495028780743, "grad_norm": 1665409.940510744, "learning_rate": 7.030787065396865e-09, "logits/chosen": -2.038339614868164, "logits/rejected": -1.9863135814666748, "logps/chosen": -280.74298095703125, "logps/rejected": -290.1654052734375, "loss": 54026.875, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -280.74298095703125, "rewards/margins": 9.422399520874023, "rewards/rejected": -290.1654052734375, "step": 890 }, { "epoch": 0.9366823652537938, "grad_norm": 1264428.2705349482, "learning_rate": 5.994908326741876e-09, "logits/chosen": -2.1871466636657715, "logits/rejected": -2.144632339477539, "logps/chosen": -302.3477478027344, "logps/rejected": -335.5939636230469, "loss": 54326.7562, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -302.3477478027344, "rewards/margins": 33.246219635009766, "rewards/rejected": -335.5939636230469, "step": 895 }, { "epoch": 0.9419152276295133, "grad_norm": 1732479.872330989, "learning_rate": 5.04062020432286e-09, "logits/chosen": -2.223008632659912, "logits/rejected": -2.123403787612915, "logps/chosen": -267.91107177734375, "logps/rejected": -292.2001953125, "loss": 53162.075, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -267.91107177734375, "rewards/margins": 24.28915023803711, "rewards/rejected": -292.2001953125, "step": 900 }, { "epoch": 0.9471480900052328, "grad_norm": 1668574.1463273366, "learning_rate": 4.168241793759658e-09, "logits/chosen": -2.1200461387634277, "logits/rejected": -2.0498270988464355, "logps/chosen": -266.21112060546875, "logps/rejected": -335.3847351074219, "loss": 52995.9688, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -266.21112060546875, "rewards/margins": 69.17359924316406, "rewards/rejected": -335.3847351074219, "step": 905 }, { "epoch": 0.9523809523809523, "grad_norm": 1455951.9893622866, "learning_rate": 3.3780648016376866e-09, "logits/chosen": -2.221703052520752, "logits/rejected": -2.0837242603302, "logps/chosen": -328.39630126953125, "logps/rejected": -332.1032409667969, "loss": 55753.5, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -328.39630126953125, "rewards/margins": 3.7069344520568848, "rewards/rejected": -332.1032409667969, "step": 910 }, { "epoch": 0.957613814756672, "grad_norm": 1397349.994078792, "learning_rate": 2.6703534479667887e-09, "logits/chosen": -2.1655023097991943, "logits/rejected": -2.0703787803649902, "logps/chosen": -253.6987762451172, "logps/rejected": -273.0363464355469, "loss": 53243.575, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -253.6987762451172, "rewards/margins": 19.337589263916016, "rewards/rejected": -273.0363464355469, "step": 915 }, { "epoch": 0.9628466771323915, "grad_norm": 1342408.6426420235, "learning_rate": 2.0453443778310766e-09, "logits/chosen": -2.0957493782043457, "logits/rejected": -2.029906988143921, "logps/chosen": -270.45806884765625, "logps/rejected": -297.3926086425781, "loss": 54182.1375, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -270.45806884765625, "rewards/margins": 26.934490203857422, "rewards/rejected": -297.3926086425781, "step": 920 }, { "epoch": 0.968079539508111, "grad_norm": 1458294.4502452172, "learning_rate": 1.5032465822596153e-09, "logits/chosen": -2.1939797401428223, "logits/rejected": -2.1166329383850098, "logps/chosen": -300.76947021484375, "logps/rejected": -320.9613952636719, "loss": 54235.6937, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -300.76947021484375, "rewards/margins": 20.191925048828125, "rewards/rejected": -320.9613952636719, "step": 925 }, { "epoch": 0.9733124018838305, "grad_norm": 2290841.929562142, "learning_rate": 1.0442413283435758e-09, "logits/chosen": -2.114621639251709, "logits/rejected": -2.098475217819214, "logps/chosen": -277.58563232421875, "logps/rejected": -333.00006103515625, "loss": 53597.825, "rewards/accuracies": 0.625, "rewards/chosen": -277.58563232421875, "rewards/margins": 55.41447830200195, "rewards/rejected": -333.00006103515625, "step": 930 }, { "epoch": 0.97854526425955, "grad_norm": 2365466.4829686345, "learning_rate": 6.684820986240513e-10, "logits/chosen": -2.1461949348449707, "logits/rejected": -2.1061387062072754, "logps/chosen": -285.5892333984375, "logps/rejected": -329.62567138671875, "loss": 55886.8125, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -285.5892333984375, "rewards/margins": 44.03642272949219, "rewards/rejected": -329.62567138671875, "step": 935 }, { "epoch": 0.9837781266352695, "grad_norm": 1714580.7073031003, "learning_rate": 3.760945397705828e-10, "logits/chosen": -2.290830135345459, "logits/rejected": -2.2668721675872803, "logps/chosen": -314.2235107421875, "logps/rejected": -362.34698486328125, "loss": 54598.4375, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -314.2235107421875, "rewards/margins": 48.12348556518555, "rewards/rejected": -362.34698486328125, "step": 940 }, { "epoch": 0.989010989010989, "grad_norm": 1433494.2103824487, "learning_rate": 1.6717642056721104e-10, "logits/chosen": -2.0160892009735107, "logits/rejected": -2.0129268169403076, "logps/chosen": -284.138916015625, "logps/rejected": -306.3015441894531, "loss": 54053.5687, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -284.138916015625, "rewards/margins": 22.162614822387695, "rewards/rejected": -306.3015441894531, "step": 945 }, { "epoch": 0.9942438513867086, "grad_norm": 1191159.0388659274, "learning_rate": 4.17975992204056e-11, "logits/chosen": -2.057304620742798, "logits/rejected": -2.056112289428711, "logps/chosen": -266.7309875488281, "logps/rejected": -323.4661865234375, "loss": 55682.3375, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -266.7309875488281, "rewards/margins": 56.735191345214844, "rewards/rejected": -323.4661865234375, "step": 950 }, { "epoch": 0.9994767137624281, "grad_norm": 1276570.622002559, "learning_rate": 0.0, "logits/chosen": -2.1545863151550293, "logits/rejected": -2.146925449371338, "logps/chosen": -280.2084045410156, "logps/rejected": -343.4630432128906, "loss": 54058.05, "rewards/accuracies": 0.625, "rewards/chosen": -280.2084045410156, "rewards/margins": 63.254638671875, "rewards/rejected": -343.4630432128906, "step": 955 }, { "epoch": 0.9994767137624281, "step": 955, "total_flos": 0.0, "train_loss": 56244.764594240834, "train_runtime": 21694.4484, "train_samples_per_second": 2.818, "train_steps_per_second": 0.044 } ], "logging_steps": 5, "max_steps": 955, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000000, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }