{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9973828840617638, "eval_steps": 500, "global_step": 954, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "diff_generated": 0.0, "epoch": 0.002093692750588851, "grad_norm": 4027.4986845337753, "learning_rate": 2.083333333333333e-08, "logits/chosen": -2.1441590785980225, "logits/rejected": -2.0543735027313232, "logps/chosen": -276.82366943359375, "logps/rejected": -131.32485961914062, "loss": 140.2437, "losses_ref": -131.32485961914062, "ref_logps/chosen": -276.82366943359375, "ref_logps/rejected": -131.32485961914062, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1, "u": 1.4901161193847656e-08, "weight": 1.0 }, { "diff_generated": 0.004567362368106842, "epoch": 0.010468463752944255, "grad_norm": 4012.8373505662616, "learning_rate": 1.0416666666666667e-07, "logits/chosen": -2.2097952365875244, "logits/rejected": -2.1078758239746094, "logps/chosen": -280.6259460449219, "logps/rejected": -162.3510284423828, "loss": 129.4337, "losses_ref": -163.54556274414062, "ref_logps/chosen": -280.68133544921875, "ref_logps/rejected": -162.3555908203125, "rewards/accuracies": 0.43359375, "rewards/chosen": 0.000553958467207849, "rewards/margins": 0.0005082848947495222, "rewards/rejected": 4.567361975205131e-05, "step": 5, "u": 0.01998738758265972, "weight": 1.0011132955551147 }, { "diff_generated": -0.883712887763977, "epoch": 0.02093692750588851, "grad_norm": 3617.405413942307, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -2.355677843093872, "logits/rejected": -2.1583828926086426, "logps/chosen": -302.09747314453125, "logps/rejected": -169.69467163085938, "loss": 157.3847, "losses_ref": -137.87350463867188, "ref_logps/chosen": -302.58917236328125, "ref_logps/rejected": -168.81094360351562, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.004917326383292675, "rewards/margins": 0.01375445444136858, "rewards/rejected": -0.00883712898939848, "step": 10, "u": -0.573723316192627, "weight": 0.8237913250923157 }, { "diff_generated": -3.757080078125, "epoch": 0.031405391258832765, "grad_norm": 3487.9086553330885, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -2.285557270050049, "logits/rejected": -2.1396851539611816, "logps/chosen": -299.9487609863281, "logps/rejected": -166.72817993164062, "loss": 215.9423, "losses_ref": -61.32612991333008, "ref_logps/chosen": -304.54766845703125, "ref_logps/rejected": -162.97108459472656, "rewards/accuracies": 0.984375, "rewards/chosen": 0.0459887757897377, "rewards/margins": 0.08355957269668579, "rewards/rejected": -0.03757079690694809, "step": 15, "u": -1.074953317642212, "weight": 0.4649723172187805 }, { "diff_generated": -13.927907943725586, "epoch": 0.04187385501177702, "grad_norm": 2892.3287332168984, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -2.246702194213867, "logits/rejected": -2.1279449462890625, "logps/chosen": -267.1871337890625, "logps/rejected": -170.03897094726562, "loss": 233.0012, "losses_ref": -32.27024459838867, "ref_logps/chosen": -283.3597106933594, "ref_logps/rejected": -156.11105346679688, "rewards/accuracies": 0.996874988079071, "rewards/chosen": 0.16172581911087036, "rewards/margins": 0.30100491642951965, "rewards/rejected": -0.1392790973186493, "step": 20, "u": -0.5478723049163818, "weight": 0.3134520649909973 }, { "diff_generated": -26.503625869750977, "epoch": 0.05234231876472128, "grad_norm": 2024.707131865886, "learning_rate": 5.208333333333334e-07, "logits/chosen": -2.209564447402954, "logits/rejected": -2.0659689903259277, "logps/chosen": -255.67092895507812, "logps/rejected": -183.784423828125, "loss": 225.1278, "losses_ref": -30.188289642333984, "ref_logps/chosen": -280.2396545410156, "ref_logps/rejected": -157.2808074951172, "rewards/accuracies": 1.0, "rewards/chosen": 0.245687335729599, "rewards/margins": 0.5107235908508301, "rewards/rejected": -0.2650362551212311, "step": 25, "u": 0.18020522594451904, "weight": 0.2832922041416168 }, { "diff_generated": -51.14558792114258, "epoch": 0.06281078251766553, "grad_norm": 1518.1810592262389, "learning_rate": 6.249999999999999e-07, "logits/chosen": -2.2818737030029297, "logits/rejected": -2.199028968811035, "logps/chosen": -243.2410888671875, "logps/rejected": -215.5212860107422, "loss": 229.1218, "losses_ref": -20.79702377319336, "ref_logps/chosen": -273.4181823730469, "ref_logps/rejected": -164.37570190429688, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": 0.30177104473114014, "rewards/margins": 0.8132268786430359, "rewards/rejected": -0.5114558935165405, "step": 30, "u": -0.08460383862257004, "weight": 0.19152367115020752 }, { "diff_generated": -66.31632995605469, "epoch": 0.07327924627060979, "grad_norm": 1482.6172349050332, "learning_rate": 7.291666666666666e-07, "logits/chosen": -2.2653889656066895, "logits/rejected": -2.1242835521698, "logps/chosen": -249.3292999267578, "logps/rejected": -223.139892578125, "loss": 228.9043, "losses_ref": -19.583892822265625, "ref_logps/chosen": -282.82373046875, "ref_logps/rejected": -156.8235626220703, "rewards/accuracies": 0.996874988079071, "rewards/chosen": 0.3349445164203644, "rewards/margins": 0.9981077909469604, "rewards/rejected": -0.6631633043289185, "step": 35, "u": 0.06723131239414215, "weight": 0.2029893398284912 }, { "diff_generated": -101.70452880859375, "epoch": 0.08374771002355404, "grad_norm": 1747.512023088969, "learning_rate": 8.333333333333333e-07, "logits/chosen": -2.109070062637329, "logits/rejected": -2.079871654510498, "logps/chosen": -237.7236328125, "logps/rejected": -262.9115905761719, "loss": 238.8995, "losses_ref": -15.8267822265625, "ref_logps/chosen": -272.7063903808594, "ref_logps/rejected": -161.20706176757812, "rewards/accuracies": 1.0, "rewards/chosen": 0.34982770681381226, "rewards/margins": 1.366873025894165, "rewards/rejected": -1.017045259475708, "step": 40, "u": -1.1587042808532715, "weight": 0.09851591289043427 }, { "diff_generated": -117.0851058959961, "epoch": 0.0942161737764983, "grad_norm": 1667.7557707134451, "learning_rate": 9.374999999999999e-07, "logits/chosen": -2.20316219329834, "logits/rejected": -2.008223295211792, "logps/chosen": -257.76983642578125, "logps/rejected": -278.8745422363281, "loss": 239.9967, "losses_ref": -20.097864151000977, "ref_logps/chosen": -293.736083984375, "ref_logps/rejected": -161.78945922851562, "rewards/accuracies": 0.996874988079071, "rewards/chosen": 0.3596626818180084, "rewards/margins": 1.530513882637024, "rewards/rejected": -1.1708511114120483, "step": 45, "u": -0.12792688608169556, "weight": 0.16130205988883972 }, { "diff_generated": -126.9466781616211, "epoch": 0.10468463752944256, "grad_norm": 1521.2094097818665, "learning_rate": 1.0416666666666667e-06, "logits/chosen": -2.1982452869415283, "logits/rejected": -2.1284544467926025, "logps/chosen": -232.5095977783203, "logps/rejected": -295.5307922363281, "loss": 224.3866, "losses_ref": -21.150318145751953, "ref_logps/chosen": -270.96405029296875, "ref_logps/rejected": -168.58413696289062, "rewards/accuracies": 0.996874988079071, "rewards/chosen": 0.38454434275627136, "rewards/margins": 1.6540111303329468, "rewards/rejected": -1.2694666385650635, "step": 50, "u": 0.001223707222379744, "weight": 0.18796880543231964 }, { "diff_generated": -141.50799560546875, "epoch": 0.11515310128238682, "grad_norm": 1612.8192197434123, "learning_rate": 1.1458333333333333e-06, "logits/chosen": -2.0737013816833496, "logits/rejected": -1.9873807430267334, "logps/chosen": -239.891357421875, "logps/rejected": -311.09619140625, "loss": 220.8677, "losses_ref": -7.660050392150879, "ref_logps/chosen": -280.08502197265625, "ref_logps/rejected": -169.58819580078125, "rewards/accuracies": 1.0, "rewards/chosen": 0.40193670988082886, "rewards/margins": 1.8170166015625, "rewards/rejected": -1.4150798320770264, "step": 55, "u": -0.9630683660507202, "weight": 0.08691856265068054 }, { "diff_generated": -137.93148803710938, "epoch": 0.12562156503533106, "grad_norm": 1372.8553226775107, "learning_rate": 1.2499999999999999e-06, "logits/chosen": -1.9770643711090088, "logits/rejected": -1.8704265356063843, "logps/chosen": -242.3487091064453, "logps/rejected": -295.7236633300781, "loss": 226.417, "losses_ref": -8.987265586853027, "ref_logps/chosen": -281.4112548828125, "ref_logps/rejected": -157.79214477539062, "rewards/accuracies": 1.0, "rewards/chosen": 0.39062565565109253, "rewards/margins": 1.7699406147003174, "rewards/rejected": -1.37931489944458, "step": 60, "u": -0.9851242303848267, "weight": 0.08782722800970078 }, { "diff_generated": -155.2223358154297, "epoch": 0.1360900287882753, "grad_norm": 1255.5204766616016, "learning_rate": 1.3541666666666667e-06, "logits/chosen": -1.9109680652618408, "logits/rejected": -1.800903081893921, "logps/chosen": -251.7116241455078, "logps/rejected": -313.6351318359375, "loss": 226.6359, "losses_ref": -6.898039817810059, "ref_logps/chosen": -291.105224609375, "ref_logps/rejected": -158.41278076171875, "rewards/accuracies": 1.0, "rewards/chosen": 0.3939359784126282, "rewards/margins": 1.9461593627929688, "rewards/rejected": -1.5522234439849854, "step": 65, "u": -1.2434440851211548, "weight": 0.07695779949426651 }, { "diff_generated": -131.47259521484375, "epoch": 0.14655849254121958, "grad_norm": 1343.9563405956512, "learning_rate": 1.4583333333333333e-06, "logits/chosen": -1.8604061603546143, "logits/rejected": -1.8694736957550049, "logps/chosen": -233.1003875732422, "logps/rejected": -294.2840881347656, "loss": 225.3943, "losses_ref": -10.19434642791748, "ref_logps/chosen": -274.62811279296875, "ref_logps/rejected": -162.8114776611328, "rewards/accuracies": 0.996874988079071, "rewards/chosen": 0.41527730226516724, "rewards/margins": 1.7300033569335938, "rewards/rejected": -1.3147261142730713, "step": 70, "u": -1.253035545349121, "weight": 0.09121803939342499 }, { "diff_generated": -137.2784423828125, "epoch": 0.15702695629416383, "grad_norm": 1322.5353000176865, "learning_rate": 1.5624999999999999e-06, "logits/chosen": -1.800450086593628, "logits/rejected": -1.649074912071228, "logps/chosen": -263.3216247558594, "logps/rejected": -309.0770568847656, "loss": 233.1299, "losses_ref": -10.170949935913086, "ref_logps/chosen": -306.6936950683594, "ref_logps/rejected": -171.79859924316406, "rewards/accuracies": 1.0, "rewards/chosen": 0.4337209165096283, "rewards/margins": 1.8065054416656494, "rewards/rejected": -1.3727843761444092, "step": 75, "u": -1.3111217021942139, "weight": 0.08406667411327362 }, { "diff_generated": -128.09861755371094, "epoch": 0.16749542004710807, "grad_norm": 1343.7990825981688, "learning_rate": 1.6666666666666667e-06, "logits/chosen": -1.6293904781341553, "logits/rejected": -1.653552770614624, "logps/chosen": -211.403564453125, "logps/rejected": -288.49102783203125, "loss": 223.2768, "losses_ref": -5.209358215332031, "ref_logps/chosen": -253.810302734375, "ref_logps/rejected": -160.39239501953125, "rewards/accuracies": 0.996874988079071, "rewards/chosen": 0.42406734824180603, "rewards/margins": 1.7050535678863525, "rewards/rejected": -1.2809861898422241, "step": 80, "u": -1.129665732383728, "weight": 0.05663755536079407 }, { "diff_generated": -135.20687866210938, "epoch": 0.17796388380005235, "grad_norm": 1192.5534502285198, "learning_rate": 1.7708333333333332e-06, "logits/chosen": -1.573900818824768, "logits/rejected": -1.4756534099578857, "logps/chosen": -239.03305053710938, "logps/rejected": -300.70184326171875, "loss": 223.0021, "losses_ref": -7.026658535003662, "ref_logps/chosen": -282.1534423828125, "ref_logps/rejected": -165.49496459960938, "rewards/accuracies": 1.0, "rewards/chosen": 0.43120384216308594, "rewards/margins": 1.7832725048065186, "rewards/rejected": -1.3520687818527222, "step": 85, "u": -0.8892000317573547, "weight": 0.074161596596241 }, { "diff_generated": -148.8050994873047, "epoch": 0.1884323475529966, "grad_norm": 1417.0310516206605, "learning_rate": 1.8749999999999998e-06, "logits/chosen": -1.3538436889648438, "logits/rejected": -1.2718507051467896, "logps/chosen": -234.74856567382812, "logps/rejected": -304.4095458984375, "loss": 232.0073, "losses_ref": -11.248689651489258, "ref_logps/chosen": -279.6741638183594, "ref_logps/rejected": -155.60443115234375, "rewards/accuracies": 0.996874988079071, "rewards/chosen": 0.4492563307285309, "rewards/margins": 1.937307596206665, "rewards/rejected": -1.488051176071167, "step": 90, "u": -1.1423507928848267, "weight": 0.08043224364519119 }, { "diff_generated": -148.7802276611328, "epoch": 0.19890081130594087, "grad_norm": 1432.3504082681623, "learning_rate": 1.9791666666666666e-06, "logits/chosen": -1.1082611083984375, "logits/rejected": -1.0555765628814697, "logps/chosen": -235.3373565673828, "logps/rejected": -309.65771484375, "loss": 219.1082, "losses_ref": -13.706560134887695, "ref_logps/chosen": -277.9019470214844, "ref_logps/rejected": -160.87747192382812, "rewards/accuracies": 1.0, "rewards/chosen": 0.4256461262702942, "rewards/margins": 1.9134483337402344, "rewards/rejected": -1.487802267074585, "step": 95, "u": -1.0568161010742188, "weight": 0.09002764523029327 }, { "diff_generated": -158.8511962890625, "epoch": 0.2093692750588851, "grad_norm": 1374.8147610024293, "learning_rate": 1.9998927475076105e-06, "logits/chosen": -0.9869598150253296, "logits/rejected": -0.8535524606704712, "logps/chosen": -238.96426391601562, "logps/rejected": -322.4727783203125, "loss": 236.8658, "losses_ref": -5.802731513977051, "ref_logps/chosen": -282.0462951660156, "ref_logps/rejected": -163.62156677246094, "rewards/accuracies": 1.0, "rewards/chosen": 0.43082040548324585, "rewards/margins": 2.0193324089050293, "rewards/rejected": -1.5885119438171387, "step": 100, "u": -1.2527328729629517, "weight": 0.06292165815830231 }, { "diff_generated": -147.18008422851562, "epoch": 0.21983773881182936, "grad_norm": 1625.9248559762682, "learning_rate": 1.9994570736865402e-06, "logits/chosen": -1.07206392288208, "logits/rejected": -0.9393303990364075, "logps/chosen": -232.5029296875, "logps/rejected": -308.7837829589844, "loss": 213.8591, "losses_ref": -10.191104888916016, "ref_logps/chosen": -275.3525390625, "ref_logps/rejected": -161.60366821289062, "rewards/accuracies": 1.0, "rewards/chosen": 0.4284963011741638, "rewards/margins": 1.9002971649169922, "rewards/rejected": -1.4718010425567627, "step": 105, "u": -1.0033910274505615, "weight": 0.10204311460256577 }, { "diff_generated": -128.29922485351562, "epoch": 0.23030620256477363, "grad_norm": 1231.2639002533556, "learning_rate": 1.9986864211644068e-06, "logits/chosen": -1.1658036708831787, "logits/rejected": -1.0709865093231201, "logps/chosen": -231.3977813720703, "logps/rejected": -283.1410217285156, "loss": 246.1861, "losses_ref": -6.052565574645996, "ref_logps/chosen": -272.9906921386719, "ref_logps/rejected": -154.841796875, "rewards/accuracies": 1.0, "rewards/chosen": 0.41592931747436523, "rewards/margins": 1.6989214420318604, "rewards/rejected": -1.2829921245574951, "step": 110, "u": -1.3139088153839111, "weight": 0.07522980868816376 }, { "diff_generated": -133.98553466796875, "epoch": 0.24077466631771788, "grad_norm": 1343.0801296451152, "learning_rate": 1.997581048233623e-06, "logits/chosen": -1.1396609544754028, "logits/rejected": -1.1306806802749634, "logps/chosen": -226.9049835205078, "logps/rejected": -293.1982421875, "loss": 230.2171, "losses_ref": -5.637959957122803, "ref_logps/chosen": -269.8221130371094, "ref_logps/rejected": -159.2126922607422, "rewards/accuracies": 1.0, "rewards/chosen": 0.4291713833808899, "rewards/margins": 1.769026756286621, "rewards/rejected": -1.3398553133010864, "step": 115, "u": -1.168405294418335, "weight": 0.05913761258125305 }, { "diff_generated": -123.33839416503906, "epoch": 0.2512431300706621, "grad_norm": 1434.3497350520097, "learning_rate": 1.9961413253717214e-06, "logits/chosen": -1.5746419429779053, "logits/rejected": -1.518913984298706, "logps/chosen": -228.5142822265625, "logps/rejected": -284.6359558105469, "loss": 234.8627, "losses_ref": -9.012969017028809, "ref_logps/chosen": -274.33917236328125, "ref_logps/rejected": -161.29759216308594, "rewards/accuracies": 0.996874988079071, "rewards/chosen": 0.4582485258579254, "rewards/margins": 1.6916322708129883, "rewards/rejected": -1.2333838939666748, "step": 120, "u": -0.9588969349861145, "weight": 0.08884967118501663 }, { "diff_generated": -151.09429931640625, "epoch": 0.26171159382360637, "grad_norm": 1274.213985322993, "learning_rate": 1.994367735117177e-06, "logits/chosen": -1.6689637899398804, "logits/rejected": -1.6743271350860596, "logps/chosen": -216.779541015625, "logps/rejected": -306.51861572265625, "loss": 226.4779, "losses_ref": -6.019095420837402, "ref_logps/chosen": -259.2029724121094, "ref_logps/rejected": -155.42433166503906, "rewards/accuracies": 0.996874988079071, "rewards/chosen": 0.42423415184020996, "rewards/margins": 1.9351768493652344, "rewards/rejected": -1.5109429359436035, "step": 125, "u": -1.222081184387207, "weight": 0.08297105878591537 }, { "diff_generated": -161.22811889648438, "epoch": 0.2721800575765506, "grad_norm": 1337.1173679216238, "learning_rate": 1.992260871907687e-06, "logits/chosen": -1.5299973487854004, "logits/rejected": -1.4785773754119873, "logps/chosen": -239.4655303955078, "logps/rejected": -327.7781982421875, "loss": 242.8888, "losses_ref": -7.182534694671631, "ref_logps/chosen": -280.188720703125, "ref_logps/rejected": -166.550048828125, "rewards/accuracies": 0.996874988079071, "rewards/chosen": 0.407231867313385, "rewards/margins": 2.019512891769409, "rewards/rejected": -1.612281084060669, "step": 130, "u": -1.2559138536453247, "weight": 0.05781525373458862 }, { "diff_generated": -169.7267303466797, "epoch": 0.2826485213294949, "grad_norm": 1374.1488593321894, "learning_rate": 1.9898214418809326e-06, "logits/chosen": -1.3805739879608154, "logits/rejected": -1.3600701093673706, "logps/chosen": -238.9783935546875, "logps/rejected": -343.4627380371094, "loss": 242.9051, "losses_ref": -2.127274990081787, "ref_logps/chosen": -281.3921203613281, "ref_logps/rejected": -173.73602294921875, "rewards/accuracies": 0.996874988079071, "rewards/chosen": 0.4241371750831604, "rewards/margins": 2.1214041709899902, "rewards/rejected": -1.6972671747207642, "step": 135, "u": -1.7065389156341553, "weight": 0.033993639051914215 }, { "diff_generated": -151.85092163085938, "epoch": 0.29311698508243916, "grad_norm": 1370.477984750469, "learning_rate": 1.9870502626379126e-06, "logits/chosen": -1.3134925365447998, "logits/rejected": -1.3758270740509033, "logps/chosen": -227.9882049560547, "logps/rejected": -322.3777770996094, "loss": 229.547, "losses_ref": -4.158343315124512, "ref_logps/chosen": -270.9952392578125, "ref_logps/rejected": -170.52687072753906, "rewards/accuracies": 1.0, "rewards/chosen": 0.43007057905197144, "rewards/margins": 1.9485795497894287, "rewards/rejected": -1.518509030342102, "step": 140, "u": -1.3956022262573242, "weight": 0.05143100023269653 }, { "diff_generated": -146.50155639648438, "epoch": 0.3035854488353834, "grad_norm": 1794.900701079277, "learning_rate": 1.983948262968915e-06, "logits/chosen": -1.5504910945892334, "logits/rejected": -1.4326040744781494, "logps/chosen": -259.777587890625, "logps/rejected": -307.3033752441406, "loss": 242.1811, "losses_ref": -2.1557910442352295, "ref_logps/chosen": -302.7044982910156, "ref_logps/rejected": -160.8018035888672, "rewards/accuracies": 1.0, "rewards/chosen": 0.42926883697509766, "rewards/margins": 1.8942844867706299, "rewards/rejected": -1.4650156497955322, "step": 145, "u": -1.3577892780303955, "weight": 0.044694624841213226 }, { "diff_generated": -155.41860961914062, "epoch": 0.31405391258832765, "grad_norm": 1420.5558411185323, "learning_rate": 1.9805164825422237e-06, "logits/chosen": -2.0522618293762207, "logits/rejected": -1.9478759765625, "logps/chosen": -238.4119873046875, "logps/rejected": -314.91790771484375, "loss": 224.1883, "losses_ref": -3.6840145587921143, "ref_logps/chosen": -281.19158935546875, "ref_logps/rejected": -159.49932861328125, "rewards/accuracies": 1.0, "rewards/chosen": 0.4277961254119873, "rewards/margins": 1.9819822311401367, "rewards/rejected": -1.5541859865188599, "step": 150, "u": -1.3957250118255615, "weight": 0.05671170353889465 }, { "diff_generated": -151.29141235351562, "epoch": 0.3245223763412719, "grad_norm": 1339.4660772749999, "learning_rate": 1.9767560715556594e-06, "logits/chosen": -2.201369524002075, "logits/rejected": -2.1122801303863525, "logps/chosen": -232.8695831298828, "logps/rejected": -321.6642150878906, "loss": 230.8218, "losses_ref": -4.063229084014893, "ref_logps/chosen": -279.747314453125, "ref_logps/rejected": -170.372802734375, "rewards/accuracies": 1.0, "rewards/chosen": 0.46877723932266235, "rewards/margins": 1.9816913604736328, "rewards/rejected": -1.5129140615463257, "step": 155, "u": -1.4928115606307983, "weight": 0.05359172821044922 }, { "diff_generated": -154.98220825195312, "epoch": 0.33499084009421615, "grad_norm": 1436.3409054374235, "learning_rate": 1.972668290351084e-06, "logits/chosen": -2.1720938682556152, "logits/rejected": -2.0600266456604004, "logps/chosen": -240.95022583007812, "logps/rejected": -311.90997314453125, "loss": 234.915, "losses_ref": -4.4140777587890625, "ref_logps/chosen": -289.99774169921875, "ref_logps/rejected": -156.92776489257812, "rewards/accuracies": 1.0, "rewards/chosen": 0.4904751777648926, "rewards/margins": 2.040297031402588, "rewards/rejected": -1.5498219728469849, "step": 160, "u": -1.4394853115081787, "weight": 0.04004598781466484 }, { "diff_generated": -144.861572265625, "epoch": 0.34545930384716045, "grad_norm": 1297.629892424431, "learning_rate": 1.968254508991978e-06, "logits/chosen": -2.255429267883301, "logits/rejected": -2.142435073852539, "logps/chosen": -243.08935546875, "logps/rejected": -304.804443359375, "loss": 237.5995, "losses_ref": -2.3130009174346924, "ref_logps/chosen": -284.68487548828125, "ref_logps/rejected": -159.94287109375, "rewards/accuracies": 1.0, "rewards/chosen": 0.4159550666809082, "rewards/margins": 1.8645708560943604, "rewards/rejected": -1.4486157894134521, "step": 165, "u": -1.5542036294937134, "weight": 0.030019784346222878 }, { "diff_generated": -151.61795043945312, "epoch": 0.3559277676001047, "grad_norm": 1297.3953865872961, "learning_rate": 1.9635162068042544e-06, "logits/chosen": -2.119171380996704, "logits/rejected": -2.017618417739868, "logps/chosen": -247.02041625976562, "logps/rejected": -313.6037292480469, "loss": 237.275, "losses_ref": -6.966467380523682, "ref_logps/chosen": -288.6535949707031, "ref_logps/rejected": -161.9857940673828, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": 0.41633161902427673, "rewards/margins": 1.9325110912322998, "rewards/rejected": -1.5161794424057007, "step": 170, "u": -1.2121031284332275, "weight": 0.07038909941911697 }, { "diff_generated": -144.2270050048828, "epoch": 0.36639623135304894, "grad_norm": 1438.8100283748447, "learning_rate": 1.958454971880441e-06, "logits/chosen": -2.147486686706543, "logits/rejected": -2.0490543842315674, "logps/chosen": -268.3631591796875, "logps/rejected": -305.03021240234375, "loss": 251.9562, "losses_ref": -5.818743705749512, "ref_logps/chosen": -313.4308776855469, "ref_logps/rejected": -160.80323791503906, "rewards/accuracies": 1.0, "rewards/chosen": 0.4506770670413971, "rewards/margins": 1.8929469585418701, "rewards/rejected": -1.442270040512085, "step": 175, "u": -1.2561050653457642, "weight": 0.0653764009475708 }, { "diff_generated": -141.9085693359375, "epoch": 0.3768646951059932, "grad_norm": 1207.513295077982, "learning_rate": 1.9530725005474194e-06, "logits/chosen": -2.267883539199829, "logits/rejected": -2.218174457550049, "logps/chosen": -221.9941864013672, "logps/rejected": -298.5855407714844, "loss": 221.5628, "losses_ref": -3.0411601066589355, "ref_logps/chosen": -264.38067626953125, "ref_logps/rejected": -156.677001953125, "rewards/accuracies": 1.0, "rewards/chosen": 0.4238646924495697, "rewards/margins": 1.8429502248764038, "rewards/rejected": -1.4190856218338013, "step": 180, "u": -1.1714732646942139, "weight": 0.05968625098466873 }, { "diff_generated": -150.76657104492188, "epoch": 0.38733315885893743, "grad_norm": 1307.7780975566222, "learning_rate": 1.9473705967978807e-06, "logits/chosen": -2.420961856842041, "logits/rejected": -2.327650547027588, "logps/chosen": -227.6046600341797, "logps/rejected": -303.7978210449219, "loss": 229.0799, "losses_ref": -15.570757865905762, "ref_logps/chosen": -272.23333740234375, "ref_logps/rejected": -153.03126525878906, "rewards/accuracies": 1.0, "rewards/chosen": 0.4462866187095642, "rewards/margins": 1.9539520740509033, "rewards/rejected": -1.5076655149459839, "step": 185, "u": -0.8297923803329468, "weight": 0.09269951283931732 }, { "diff_generated": -147.6534423828125, "epoch": 0.39780162261188173, "grad_norm": 1169.0067686339887, "learning_rate": 1.941351171685697e-06, "logits/chosen": -2.2705044746398926, "logits/rejected": -2.2303287982940674, "logps/chosen": -229.6949920654297, "logps/rejected": -316.17437744140625, "loss": 234.7021, "losses_ref": -5.174070835113525, "ref_logps/chosen": -274.26959228515625, "ref_logps/rejected": -168.52093505859375, "rewards/accuracies": 1.0, "rewards/chosen": 0.4457460343837738, "rewards/margins": 1.9222803115844727, "rewards/rejected": -1.4765344858169556, "step": 190, "u": -1.7719621658325195, "weight": 0.03358909860253334 }, { "diff_generated": -159.57711791992188, "epoch": 0.408270086364826, "grad_norm": 1222.7096009577886, "learning_rate": 1.9350162426854148e-06, "logits/chosen": -2.1345176696777344, "logits/rejected": -2.1815943717956543, "logps/chosen": -195.1034393310547, "logps/rejected": -316.82177734375, "loss": 220.9707, "losses_ref": -4.031326770782471, "ref_logps/chosen": -238.08377075195312, "ref_logps/rejected": -157.2446746826172, "rewards/accuracies": 1.0, "rewards/chosen": 0.42980343103408813, "rewards/margins": 2.025574207305908, "rewards/rejected": -1.595771074295044, "step": 195, "u": -1.546870231628418, "weight": 0.03703851252794266 }, { "diff_generated": -167.23892211914062, "epoch": 0.4187385501177702, "grad_norm": 1368.307859885155, "learning_rate": 1.9283679330160725e-06, "logits/chosen": -2.1258459091186523, "logits/rejected": -2.004584789276123, "logps/chosen": -238.9210205078125, "logps/rejected": -331.30718994140625, "loss": 244.6853, "losses_ref": -4.569379806518555, "ref_logps/chosen": -285.3875732421875, "ref_logps/rejected": -164.0682830810547, "rewards/accuracies": 1.0, "rewards/chosen": 0.4646654725074768, "rewards/margins": 2.137054681777954, "rewards/rejected": -1.6723893880844116, "step": 200, "u": -1.6067603826522827, "weight": 0.04548769071698189 }, { "diff_generated": -156.21780395507812, "epoch": 0.42920701387071447, "grad_norm": 1208.8530669692416, "learning_rate": 1.9214084709295847e-06, "logits/chosen": -2.0831170082092285, "logits/rejected": -1.964040756225586, "logps/chosen": -255.9301300048828, "logps/rejected": -318.99798583984375, "loss": 233.3463, "losses_ref": -5.610936641693115, "ref_logps/chosen": -300.7832946777344, "ref_logps/rejected": -162.78021240234375, "rewards/accuracies": 0.996874988079071, "rewards/chosen": 0.44853147864341736, "rewards/margins": 2.010709524154663, "rewards/rejected": -1.5621780157089233, "step": 205, "u": -1.3661489486694336, "weight": 0.06516700237989426 }, { "diff_generated": -171.98703002929688, "epoch": 0.4396754776236587, "grad_norm": 1215.8559114876498, "learning_rate": 1.9141401889639164e-06, "logits/chosen": -1.9906151294708252, "logits/rejected": -1.9088771343231201, "logps/chosen": -235.02249145507812, "logps/rejected": -345.1544494628906, "loss": 234.6928, "losses_ref": -2.863798141479492, "ref_logps/chosen": -280.8175048828125, "ref_logps/rejected": -173.16738891601562, "rewards/accuracies": 1.0, "rewards/chosen": 0.45795029401779175, "rewards/margins": 2.1778206825256348, "rewards/rejected": -1.7198702096939087, "step": 210, "u": -1.4222519397735596, "weight": 0.044259898364543915 }, { "diff_generated": -168.92660522460938, "epoch": 0.45014394137660296, "grad_norm": 1266.497741976898, "learning_rate": 1.906565523161312e-06, "logits/chosen": -1.9987051486968994, "logits/rejected": -1.9987319707870483, "logps/chosen": -227.54159545898438, "logps/rejected": -331.20281982421875, "loss": 227.5447, "losses_ref": -2.0428645610809326, "ref_logps/chosen": -272.03076171875, "ref_logps/rejected": -162.27622985839844, "rewards/accuracies": 1.0, "rewards/chosen": 0.4448915421962738, "rewards/margins": 2.134157657623291, "rewards/rejected": -1.6892658472061157, "step": 215, "u": -1.699721097946167, "weight": 0.028461579233407974 }, { "diff_generated": -181.3323211669922, "epoch": 0.46061240512954726, "grad_norm": 1409.5627230630107, "learning_rate": 1.8986870122518259e-06, "logits/chosen": -1.996578574180603, "logits/rejected": -1.9339357614517212, "logps/chosen": -241.12069702148438, "logps/rejected": -345.39239501953125, "loss": 250.5986, "losses_ref": -13.413454055786133, "ref_logps/chosen": -284.3638610839844, "ref_logps/rejected": -164.06004333496094, "rewards/accuracies": 1.0, "rewards/chosen": 0.43243154883384705, "rewards/margins": 2.2457549571990967, "rewards/rejected": -1.8133233785629272, "step": 220, "u": -1.559470295906067, "weight": 0.03921313211321831 }, { "diff_generated": -167.23196411132812, "epoch": 0.4710808688824915, "grad_norm": 1439.3131066005014, "learning_rate": 1.8905072968024423e-06, "logits/chosen": -2.0085692405700684, "logits/rejected": -1.9212806224822998, "logps/chosen": -240.53793334960938, "logps/rejected": -324.13519287109375, "loss": 229.6424, "losses_ref": -2.6123085021972656, "ref_logps/chosen": -288.477783203125, "ref_logps/rejected": -156.90321350097656, "rewards/accuracies": 1.0, "rewards/chosen": 0.47939810156822205, "rewards/margins": 2.1517176628112793, "rewards/rejected": -1.6723196506500244, "step": 225, "u": -1.7230523824691772, "weight": 0.04574074223637581 }, { "diff_generated": -159.5584259033203, "epoch": 0.48154933263543576, "grad_norm": 1268.731805706848, "learning_rate": 1.88202911833206e-06, "logits/chosen": -2.006537914276123, "logits/rejected": -2.0306971073150635, "logps/chosen": -209.113037109375, "logps/rejected": -324.5091552734375, "loss": 221.1728, "losses_ref": -2.3901400566101074, "ref_logps/chosen": -255.0234832763672, "ref_logps/rejected": -164.95074462890625, "rewards/accuracies": 0.996874988079071, "rewards/chosen": 0.459104061126709, "rewards/margins": 2.0546882152557373, "rewards/rejected": -1.5955842733383179, "step": 230, "u": -1.3925855159759521, "weight": 0.045756690204143524 }, { "diff_generated": -170.58221435546875, "epoch": 0.49201779638838, "grad_norm": 1315.789025978012, "learning_rate": 1.873255318392644e-06, "logits/chosen": -1.9995191097259521, "logits/rejected": -1.8898826837539673, "logps/chosen": -234.0719757080078, "logps/rejected": -327.0367736816406, "loss": 242.3326, "losses_ref": -4.473931312561035, "ref_logps/chosen": -280.68048095703125, "ref_logps/rejected": -156.4545440673828, "rewards/accuracies": 1.0, "rewards/chosen": 0.46608513593673706, "rewards/margins": 2.1719069480895996, "rewards/rejected": -1.7058223485946655, "step": 235, "u": -1.6257721185684204, "weight": 0.034325193613767624 }, { "diff_generated": -163.37722778320312, "epoch": 0.5024862601413242, "grad_norm": 1285.4823648929914, "learning_rate": 1.8641888376168483e-06, "logits/chosen": -1.9665982723236084, "logits/rejected": -1.9548044204711914, "logps/chosen": -215.7754669189453, "logps/rejected": -326.5556335449219, "loss": 231.7613, "losses_ref": -5.584181308746338, "ref_logps/chosen": -260.7419128417969, "ref_logps/rejected": -163.17840576171875, "rewards/accuracies": 1.0, "rewards/chosen": 0.44966477155685425, "rewards/margins": 2.083436965942383, "rewards/rejected": -1.6337722539901733, "step": 240, "u": -1.2691129446029663, "weight": 0.0609821155667305 }, { "diff_generated": -147.24386596679688, "epoch": 0.5129547238942685, "grad_norm": 1347.3156065591786, "learning_rate": 1.8548327147324312e-06, "logits/chosen": -1.9906165599822998, "logits/rejected": -1.872373342514038, "logps/chosen": -243.5879364013672, "logps/rejected": -304.78204345703125, "loss": 236.4194, "losses_ref": -7.212074279785156, "ref_logps/chosen": -291.9618835449219, "ref_logps/rejected": -157.53817749023438, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.48373931646347046, "rewards/margins": 1.9561779499053955, "rewards/rejected": -1.4724384546279907, "step": 245, "u": -0.7504249811172485, "weight": 0.08246179670095444 }, { "diff_generated": -136.68235778808594, "epoch": 0.5234231876472127, "grad_norm": 1280.557570592857, "learning_rate": 1.8451900855437948e-06, "logits/chosen": -2.0444495677948, "logits/rejected": -1.9412866830825806, "logps/chosen": -237.24496459960938, "logps/rejected": -305.5830078125, "loss": 231.6959, "losses_ref": -4.014006614685059, "ref_logps/chosen": -285.0312805175781, "ref_logps/rejected": -168.90065002441406, "rewards/accuracies": 1.0, "rewards/chosen": 0.47786277532577515, "rewards/margins": 1.8446861505508423, "rewards/rejected": -1.3668235540390015, "step": 250, "u": -1.4464961290359497, "weight": 0.045917607843875885 }, { "diff_generated": -147.02664184570312, "epoch": 0.533891651400157, "grad_norm": 1271.960313695608, "learning_rate": 1.8352641818809846e-06, "logits/chosen": -2.012394428253174, "logits/rejected": -1.9293123483657837, "logps/chosen": -255.23617553710938, "logps/rejected": -305.11065673828125, "loss": 237.2504, "losses_ref": -3.9721827507019043, "ref_logps/chosen": -298.58929443359375, "ref_logps/rejected": -158.0840606689453, "rewards/accuracies": 0.996874988079071, "rewards/chosen": 0.43353086709976196, "rewards/margins": 1.9037971496582031, "rewards/rejected": -1.470266342163086, "step": 255, "u": -1.2067726850509644, "weight": 0.04464394599199295 }, { "diff_generated": -150.324462890625, "epoch": 0.5443601151531012, "grad_norm": 1323.4761845101339, "learning_rate": 1.8250583305165094e-06, "logits/chosen": -1.7699302434921265, "logits/rejected": -1.7340294122695923, "logps/chosen": -232.5556640625, "logps/rejected": -303.0191650390625, "loss": 236.4857, "losses_ref": -3.8249027729034424, "ref_logps/chosen": -277.13360595703125, "ref_logps/rejected": -152.69473266601562, "rewards/accuracies": 1.0, "rewards/chosen": 0.4457794725894928, "rewards/margins": 1.949023962020874, "rewards/rejected": -1.5032446384429932, "step": 260, "u": -1.4394437074661255, "weight": 0.06012386828660965 }, { "diff_generated": -146.3737335205078, "epoch": 0.5548285789060455, "grad_norm": 1232.2132266823505, "learning_rate": 1.8145759520503357e-06, "logits/chosen": -1.836775541305542, "logits/rejected": -1.7096904516220093, "logps/chosen": -242.7677764892578, "logps/rejected": -308.00592041015625, "loss": 219.0433, "losses_ref": -2.2338509559631348, "ref_logps/chosen": -290.8897705078125, "ref_logps/rejected": -161.63217163085938, "rewards/accuracies": 1.0, "rewards/chosen": 0.48121970891952515, "rewards/margins": 1.9449567794799805, "rewards/rejected": -1.4637373685836792, "step": 265, "u": -1.7323522567749023, "weight": 0.03296298533678055 }, { "diff_generated": -160.1627960205078, "epoch": 0.5652970426589898, "grad_norm": 1351.7338122517372, "learning_rate": 1.803820559763439e-06, "logits/chosen": -1.7946879863739014, "logits/rejected": -1.7407840490341187, "logps/chosen": -215.82290649414062, "logps/rejected": -316.18743896484375, "loss": 232.6284, "losses_ref": -3.786867618560791, "ref_logps/chosen": -261.61407470703125, "ref_logps/rejected": -156.02464294433594, "rewards/accuracies": 1.0, "rewards/chosen": 0.45791149139404297, "rewards/margins": 2.059539318084717, "rewards/rejected": -1.6016279458999634, "step": 270, "u": -1.651993751525879, "weight": 0.04032987728714943 }, { "diff_generated": -142.9796905517578, "epoch": 0.575765506411934, "grad_norm": 1181.4870635863472, "learning_rate": 1.7927957584402895e-06, "logits/chosen": -1.875299096107483, "logits/rejected": -1.8068253993988037, "logps/chosen": -228.66781616210938, "logps/rejected": -303.5104064941406, "loss": 224.2237, "losses_ref": -4.741028308868408, "ref_logps/chosen": -272.44915771484375, "ref_logps/rejected": -160.53070068359375, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.43781352043151855, "rewards/margins": 1.8676105737686157, "rewards/rejected": -1.4297969341278076, "step": 275, "u": -1.267141580581665, "weight": 0.07081650197505951 }, { "diff_generated": -147.29513549804688, "epoch": 0.5862339701648783, "grad_norm": 1311.3976945524007, "learning_rate": 1.78150524316067e-06, "logits/chosen": -1.9360460042953491, "logits/rejected": -1.8399826288223267, "logps/chosen": -244.2842559814453, "logps/rejected": -319.99603271484375, "loss": 221.9428, "losses_ref": -5.114128112792969, "ref_logps/chosen": -288.6471252441406, "ref_logps/rejected": -172.7008819580078, "rewards/accuracies": 1.0, "rewards/chosen": 0.4436289668083191, "rewards/margins": 1.9165802001953125, "rewards/rejected": -1.4729512929916382, "step": 280, "u": -1.33490788936615, "weight": 0.05145906284451485 }, { "diff_generated": -165.53073120117188, "epoch": 0.5967024339178225, "grad_norm": 1217.7737895640616, "learning_rate": 1.7699527980612304e-06, "logits/chosen": -2.008852243423462, "logits/rejected": -1.865282416343689, "logps/chosen": -235.48495483398438, "logps/rejected": -324.86236572265625, "loss": 237.0448, "losses_ref": -3.6097474098205566, "ref_logps/chosen": -281.65557861328125, "ref_logps/rejected": -159.3316650390625, "rewards/accuracies": 1.0, "rewards/chosen": 0.4617062509059906, "rewards/margins": 2.117013454437256, "rewards/rejected": -1.655307412147522, "step": 285, "u": -1.5152809619903564, "weight": 0.03953182324767113 }, { "diff_generated": -152.0204315185547, "epoch": 0.6071708976707668, "grad_norm": 1365.967274184474, "learning_rate": 1.758142295067194e-06, "logits/chosen": -1.9733747243881226, "logits/rejected": -1.8123550415039062, "logps/chosen": -253.77774047851562, "logps/rejected": -316.69073486328125, "loss": 236.0956, "losses_ref": -9.393682479858398, "ref_logps/chosen": -299.4283142089844, "ref_logps/rejected": -164.6702880859375, "rewards/accuracies": 0.996874988079071, "rewards/chosen": 0.45650559663772583, "rewards/margins": 1.9767096042633057, "rewards/rejected": -1.5202041864395142, "step": 290, "u": -1.0198547840118408, "weight": 0.07342410832643509 }, { "diff_generated": -156.43539428710938, "epoch": 0.6176393614237111, "grad_norm": 1274.7347074994798, "learning_rate": 1.7460776925946416e-06, "logits/chosen": -2.04952335357666, "logits/rejected": -1.9772619009017944, "logps/chosen": -231.12759399414062, "logps/rejected": -324.58734130859375, "loss": 216.7738, "losses_ref": -3.1922709941864014, "ref_logps/chosen": -275.5738525390625, "ref_logps/rejected": -168.1519317626953, "rewards/accuracies": 1.0, "rewards/chosen": 0.4444626271724701, "rewards/margins": 2.0088164806365967, "rewards/rejected": -1.5643537044525146, "step": 295, "u": -1.7094459533691406, "weight": 0.027651017531752586 }, { "diff_generated": -166.10739135742188, "epoch": 0.6281078251766553, "grad_norm": 1279.289070746857, "learning_rate": 1.7337630342238039e-06, "logits/chosen": -2.0860671997070312, "logits/rejected": -1.9944241046905518, "logps/chosen": -226.953125, "logps/rejected": -329.9337158203125, "loss": 245.769, "losses_ref": -2.491637706756592, "ref_logps/chosen": -276.3335266113281, "ref_logps/rejected": -163.8263397216797, "rewards/accuracies": 1.0, "rewards/chosen": 0.49380379915237427, "rewards/margins": 2.1548776626586914, "rewards/rejected": -1.6610740423202515, "step": 300, "u": -1.8025985956192017, "weight": 0.020761026069521904 }, { "diff_generated": -160.49281311035156, "epoch": 0.6385762889295996, "grad_norm": 1143.8594113545453, "learning_rate": 1.7212024473438145e-06, "logits/chosen": -2.1227848529815674, "logits/rejected": -2.037874698638916, "logps/chosen": -227.2042694091797, "logps/rejected": -324.0436096191406, "loss": 218.3608, "losses_ref": -5.721261978149414, "ref_logps/chosen": -275.447265625, "ref_logps/rejected": -163.55076599121094, "rewards/accuracies": 1.0, "rewards/chosen": 0.4824300706386566, "rewards/margins": 2.0873584747314453, "rewards/rejected": -1.6049282550811768, "step": 305, "u": -1.5689971446990967, "weight": 0.03765694424510002 }, { "diff_generated": -165.9134979248047, "epoch": 0.6490447526825438, "grad_norm": 1216.910104164249, "learning_rate": 1.70840014176937e-06, "logits/chosen": -2.148029327392578, "logits/rejected": -1.9548304080963135, "logps/chosen": -259.4276123046875, "logps/rejected": -335.60723876953125, "loss": 237.5431, "losses_ref": -6.571761131286621, "ref_logps/chosen": -307.9371643066406, "ref_logps/rejected": -169.69369506835938, "rewards/accuracies": 1.0, "rewards/chosen": 0.48509567975997925, "rewards/margins": 2.144230604171753, "rewards/rejected": -1.659135103225708, "step": 310, "u": -1.2953577041625977, "weight": 0.06081492453813553 }, { "diff_generated": -150.7538299560547, "epoch": 0.6595132164354881, "grad_norm": 1285.8252216937017, "learning_rate": 1.6953604083297663e-06, "logits/chosen": -2.0963034629821777, "logits/rejected": -2.005828619003296, "logps/chosen": -238.0185089111328, "logps/rejected": -313.0700988769531, "loss": 232.0059, "losses_ref": -5.998663425445557, "ref_logps/chosen": -286.41973876953125, "ref_logps/rejected": -162.3162841796875, "rewards/accuracies": 1.0, "rewards/chosen": 0.4840126633644104, "rewards/margins": 1.9915508031845093, "rewards/rejected": -1.5075383186340332, "step": 315, "u": -1.022328495979309, "weight": 0.07499580085277557 }, { "diff_generated": -167.22291564941406, "epoch": 0.6699816801884323, "grad_norm": 1393.3558242713107, "learning_rate": 1.6820876174307821e-06, "logits/chosen": -2.0343525409698486, "logits/rejected": -1.9958488941192627, "logps/chosen": -220.11959838867188, "logps/rejected": -324.1341857910156, "loss": 235.1374, "losses_ref": -3.5960795879364014, "ref_logps/chosen": -265.8931579589844, "ref_logps/rejected": -156.91128540039062, "rewards/accuracies": 1.0, "rewards/chosen": 0.45773547887802124, "rewards/margins": 2.129964590072632, "rewards/rejected": -1.6722290515899658, "step": 320, "u": -1.116194486618042, "weight": 0.04208649322390556 }, { "diff_generated": -152.4580078125, "epoch": 0.6804501439413766, "grad_norm": 1260.574816635609, "learning_rate": 1.668586217589889e-06, "logits/chosen": -2.028233051300049, "logits/rejected": -1.943868637084961, "logps/chosen": -252.96224975585938, "logps/rejected": -314.017578125, "loss": 228.4758, "losses_ref": -1.9372276067733765, "ref_logps/chosen": -299.65130615234375, "ref_logps/rejected": -161.55958557128906, "rewards/accuracies": 1.0, "rewards/chosen": 0.46689024567604065, "rewards/margins": 1.9914703369140625, "rewards/rejected": -1.5245802402496338, "step": 325, "u": -1.6558067798614502, "weight": 0.03156626224517822 }, { "diff_generated": -140.8079376220703, "epoch": 0.6909186076943209, "grad_norm": 1201.438854630279, "learning_rate": 1.6548607339452852e-06, "logits/chosen": -2.0895023345947266, "logits/rejected": -2.036318778991699, "logps/chosen": -216.3995361328125, "logps/rejected": -303.2993469238281, "loss": 233.4191, "losses_ref": -2.161651134490967, "ref_logps/chosen": -261.6273498535156, "ref_logps/rejected": -162.49142456054688, "rewards/accuracies": 1.0, "rewards/chosen": 0.4522779583930969, "rewards/margins": 1.8603572845458984, "rewards/rejected": -1.4080793857574463, "step": 330, "u": -1.8709716796875, "weight": 0.017609911039471626 }, { "diff_generated": -143.80978393554688, "epoch": 0.7013870714472651, "grad_norm": 1237.6054714094937, "learning_rate": 1.6409157667392455e-06, "logits/chosen": -2.059278964996338, "logits/rejected": -1.9892032146453857, "logps/chosen": -235.5959930419922, "logps/rejected": -307.551513671875, "loss": 229.4944, "losses_ref": -6.860163688659668, "ref_logps/chosen": -283.805908203125, "ref_logps/rejected": -163.74172973632812, "rewards/accuracies": 1.0, "rewards/chosen": 0.48209866881370544, "rewards/margins": 1.920196771621704, "rewards/rejected": -1.4380979537963867, "step": 335, "u": -0.9188238382339478, "weight": 0.07267802953720093 }, { "diff_generated": -160.1986846923828, "epoch": 0.7118555352002094, "grad_norm": 1143.246546308752, "learning_rate": 1.6267559897763027e-06, "logits/chosen": -1.8168014287948608, "logits/rejected": -1.863437294960022, "logps/chosen": -188.27635192871094, "logps/rejected": -314.9437561035156, "loss": 216.8938, "losses_ref": -1.3188815116882324, "ref_logps/chosen": -237.00216674804688, "ref_logps/rejected": -154.74508666992188, "rewards/accuracies": 0.996874988079071, "rewards/chosen": 0.4872584939002991, "rewards/margins": 2.089244842529297, "rewards/rejected": -1.6019866466522217, "step": 340, "u": -1.303836703300476, "weight": 0.029538637027144432 }, { "diff_generated": -151.67230224609375, "epoch": 0.7223239989531536, "grad_norm": 1267.0562713440388, "learning_rate": 1.6123861488567708e-06, "logits/chosen": -1.9331505298614502, "logits/rejected": -1.7450395822525024, "logps/chosen": -256.15277099609375, "logps/rejected": -316.7372131347656, "loss": 244.0877, "losses_ref": -2.1836702823638916, "ref_logps/chosen": -306.53680419921875, "ref_logps/rejected": -165.06492614746094, "rewards/accuracies": 1.0, "rewards/chosen": 0.5038406848907471, "rewards/margins": 2.0205636024475098, "rewards/rejected": -1.5167229175567627, "step": 345, "u": -1.485855221748352, "weight": 0.0375472754240036 }, { "diff_generated": -147.20364379882812, "epoch": 0.7327924627060979, "grad_norm": 1350.564919328469, "learning_rate": 1.5978110601861409e-06, "logits/chosen": -1.9117012023925781, "logits/rejected": -1.8668915033340454, "logps/chosen": -253.0355224609375, "logps/rejected": -311.43927001953125, "loss": 240.3254, "losses_ref": -2.832030773162842, "ref_logps/chosen": -299.90985107421875, "ref_logps/rejected": -164.23562622070312, "rewards/accuracies": 1.0, "rewards/chosen": 0.46874284744262695, "rewards/margins": 1.9407793283462524, "rewards/rejected": -1.472036361694336, "step": 350, "u": -1.429086685180664, "weight": 0.04381849616765976 }, { "diff_generated": -152.60690307617188, "epoch": 0.7432609264590422, "grad_norm": 1367.6005309235504, "learning_rate": 1.5830356087608763e-06, "logits/chosen": -1.887460708618164, "logits/rejected": -1.8180389404296875, "logps/chosen": -214.82699584960938, "logps/rejected": -321.7936096191406, "loss": 228.0585, "losses_ref": -1.8199619054794312, "ref_logps/chosen": -263.9666748046875, "ref_logps/rejected": -169.18673706054688, "rewards/accuracies": 1.0, "rewards/chosen": 0.49139684438705444, "rewards/margins": 2.017465829849243, "rewards/rejected": -1.526068925857544, "step": 355, "u": -1.6218674182891846, "weight": 0.02579430676996708 }, { "diff_generated": -148.06683349609375, "epoch": 0.7537293902119864, "grad_norm": 1346.2814229526575, "learning_rate": 1.5680647467311555e-06, "logits/chosen": -1.8571285009384155, "logits/rejected": -1.7857725620269775, "logps/chosen": -244.458251953125, "logps/rejected": -319.65484619140625, "loss": 223.1362, "losses_ref": -2.564044237136841, "ref_logps/chosen": -293.27410888671875, "ref_logps/rejected": -171.58799743652344, "rewards/accuracies": 1.0, "rewards/chosen": 0.4881584644317627, "rewards/margins": 1.9688268899917603, "rewards/rejected": -1.480668306350708, "step": 360, "u": -1.7225738763809204, "weight": 0.03375329077243805 }, { "diff_generated": -158.3987579345703, "epoch": 0.7641978539649307, "grad_norm": 1338.295994157702, "learning_rate": 1.552903491741107e-06, "logits/chosen": -1.837961196899414, "logits/rejected": -1.839646577835083, "logps/chosen": -230.9562530517578, "logps/rejected": -320.73455810546875, "loss": 230.7235, "losses_ref": -2.363715648651123, "ref_logps/chosen": -276.13995361328125, "ref_logps/rejected": -162.33580017089844, "rewards/accuracies": 0.996874988079071, "rewards/chosen": 0.45183688402175903, "rewards/margins": 2.0358242988586426, "rewards/rejected": -1.5839874744415283, "step": 365, "u": -1.599321722984314, "weight": 0.03038620948791504 }, { "diff_generated": -143.7940216064453, "epoch": 0.7746663177178749, "grad_norm": 1122.4826063930961, "learning_rate": 1.5375569252470895e-06, "logits/chosen": -1.994361162185669, "logits/rejected": -1.8850581645965576, "logps/chosen": -266.71722412109375, "logps/rejected": -306.846923828125, "loss": 232.9005, "losses_ref": -7.7454657554626465, "ref_logps/chosen": -315.1695251464844, "ref_logps/rejected": -163.05288696289062, "rewards/accuracies": 1.0, "rewards/chosen": 0.48452290892601013, "rewards/margins": 1.922463059425354, "rewards/rejected": -1.437940239906311, "step": 370, "u": -1.2242950201034546, "weight": 0.05640628933906555 }, { "diff_generated": -147.28756713867188, "epoch": 0.7851347814708192, "grad_norm": 1267.5426171485876, "learning_rate": 1.5220301908145903e-06, "logits/chosen": -1.984815001487732, "logits/rejected": -1.8735277652740479, "logps/chosen": -236.86972045898438, "logps/rejected": -316.1020812988281, "loss": 254.4526, "losses_ref": -1.4826844930648804, "ref_logps/chosen": -283.3154296875, "ref_logps/rejected": -168.81448364257812, "rewards/accuracies": 0.996874988079071, "rewards/chosen": 0.4644569754600525, "rewards/margins": 1.9373327493667603, "rewards/rejected": -1.472875714302063, "step": 375, "u": -1.3915516138076782, "weight": 0.03981015831232071 }, { "diff_generated": -141.51336669921875, "epoch": 0.7956032452237635, "grad_norm": 1213.313777968865, "learning_rate": 1.5063284923943028e-06, "logits/chosen": -1.9686000347137451, "logits/rejected": -1.856993317604065, "logps/chosen": -250.8971710205078, "logps/rejected": -304.9432067871094, "loss": 236.0771, "losses_ref": -2.1682116985321045, "ref_logps/chosen": -298.9543762207031, "ref_logps/rejected": -163.42984008789062, "rewards/accuracies": 1.0, "rewards/chosen": 0.4805716872215271, "rewards/margins": 1.895705223083496, "rewards/rejected": -1.4151335954666138, "step": 380, "u": -1.7837505340576172, "weight": 0.02852563187479973 }, { "diff_generated": -156.00753784179688, "epoch": 0.8060717089767077, "grad_norm": 1211.688665180567, "learning_rate": 1.490457092577968e-06, "logits/chosen": -1.9195213317871094, "logits/rejected": -1.8409401178359985, "logps/chosen": -229.5646209716797, "logps/rejected": -317.98406982421875, "loss": 227.1155, "losses_ref": -1.2010728120803833, "ref_logps/chosen": -279.9380798339844, "ref_logps/rejected": -161.97653198242188, "rewards/accuracies": 0.996874988079071, "rewards/chosen": 0.5037345290184021, "rewards/margins": 2.063809871673584, "rewards/rejected": -1.5600755214691162, "step": 385, "u": -1.6141672134399414, "weight": 0.025375287979841232 }, { "diff_generated": -152.0254669189453, "epoch": 0.816540172729652, "grad_norm": 1239.5744414457495, "learning_rate": 1.4744213108345602e-06, "logits/chosen": -2.0957484245300293, "logits/rejected": -1.9671990871429443, "logps/chosen": -254.6474151611328, "logps/rejected": -313.9129333496094, "loss": 233.3016, "losses_ref": -4.944865703582764, "ref_logps/chosen": -304.72125244140625, "ref_logps/rejected": -161.88751220703125, "rewards/accuracies": 1.0, "rewards/chosen": 0.5007385015487671, "rewards/margins": 2.0209929943084717, "rewards/rejected": -1.5202546119689941, "step": 390, "u": -1.7007286548614502, "weight": 0.05083342641592026 }, { "diff_generated": -139.01864624023438, "epoch": 0.8270086364825961, "grad_norm": 1205.2400489160098, "learning_rate": 1.4582265217274103e-06, "logits/chosen": -1.9418761730194092, "logits/rejected": -1.8380733728408813, "logps/chosen": -247.5355682373047, "logps/rejected": -302.6370849609375, "loss": 239.286, "losses_ref": -1.7620617151260376, "ref_logps/chosen": -293.9803161621094, "ref_logps/rejected": -163.61843872070312, "rewards/accuracies": 1.0, "rewards/chosen": 0.464447557926178, "rewards/margins": 1.8546336889266968, "rewards/rejected": -1.3901864290237427, "step": 395, "u": -1.7652490139007568, "weight": 0.021758217364549637 }, { "diff_generated": -157.04232788085938, "epoch": 0.8374771002355405, "grad_norm": 1205.8199372142892, "learning_rate": 1.4418781531128635e-06, "logits/chosen": -2.0544238090515137, "logits/rejected": -2.0346767902374268, "logps/chosen": -234.49368286132812, "logps/rejected": -326.8393249511719, "loss": 233.9242, "losses_ref": -1.8244788646697998, "ref_logps/chosen": -282.6474609375, "ref_logps/rejected": -169.79696655273438, "rewards/accuracies": 1.0, "rewards/chosen": 0.48153790831565857, "rewards/margins": 2.0519611835479736, "rewards/rejected": -1.5704233646392822, "step": 400, "u": -1.6305999755859375, "weight": 0.024077033624053 }, { "diff_generated": -152.27468872070312, "epoch": 0.8479455639884846, "grad_norm": 1197.3605051527013, "learning_rate": 1.4253816843210748e-06, "logits/chosen": -1.9861503839492798, "logits/rejected": -1.8832927942276, "logps/chosen": -244.0829315185547, "logps/rejected": -317.6984558105469, "loss": 237.8302, "losses_ref": -3.3451290130615234, "ref_logps/chosen": -295.3381652832031, "ref_logps/rejected": -165.42379760742188, "rewards/accuracies": 1.0, "rewards/chosen": 0.51255202293396, "rewards/margins": 2.035299062728882, "rewards/rejected": -1.5227469205856323, "step": 405, "u": -1.4654412269592285, "weight": 0.036893170326948166 }, { "diff_generated": -154.22146606445312, "epoch": 0.8584140277414289, "grad_norm": 1201.6414819745964, "learning_rate": 1.4087426443195547e-06, "logits/chosen": -1.9021320343017578, "logits/rejected": -1.8548545837402344, "logps/chosen": -212.048583984375, "logps/rejected": -310.93731689453125, "loss": 223.4945, "losses_ref": -1.363377571105957, "ref_logps/chosen": -261.7601013183594, "ref_logps/rejected": -156.71588134765625, "rewards/accuracies": 0.996874988079071, "rewards/chosen": 0.4971153736114502, "rewards/margins": 2.039330005645752, "rewards/rejected": -1.5422146320343018, "step": 410, "u": -1.499205470085144, "weight": 0.03244508430361748 }, { "diff_generated": -152.22821044921875, "epoch": 0.8688824914943732, "grad_norm": 1267.1758549974509, "learning_rate": 1.391966609860075e-06, "logits/chosen": -1.9990746974945068, "logits/rejected": -1.9241716861724854, "logps/chosen": -235.38150024414062, "logps/rejected": -307.2711181640625, "loss": 229.5926, "losses_ref": -3.3139452934265137, "ref_logps/chosen": -284.34393310546875, "ref_logps/rejected": -155.04290771484375, "rewards/accuracies": 1.0, "rewards/chosen": 0.48962411284446716, "rewards/margins": 2.011906147003174, "rewards/rejected": -1.5222820043563843, "step": 415, "u": -1.4567835330963135, "weight": 0.04380001127719879 }, { "diff_generated": -142.56979370117188, "epoch": 0.8793509552473174, "grad_norm": 1188.388834303343, "learning_rate": 1.3750592036095619e-06, "logits/chosen": -2.0134921073913574, "logits/rejected": -1.8790652751922607, "logps/chosen": -250.85546875, "logps/rejected": -295.076416015625, "loss": 235.3638, "losses_ref": -3.0703201293945312, "ref_logps/chosen": -298.8680725097656, "ref_logps/rejected": -152.5066375732422, "rewards/accuracies": 1.0, "rewards/chosen": 0.4801257252693176, "rewards/margins": 1.9058234691619873, "rewards/rejected": -1.4256978034973145, "step": 420, "u": -1.4335013628005981, "weight": 0.04213564842939377 }, { "diff_generated": -147.99075317382812, "epoch": 0.8898194190002617, "grad_norm": 1470.580405072441, "learning_rate": 1.3580260922655984e-06, "logits/chosen": -1.9547443389892578, "logits/rejected": -1.8864132165908813, "logps/chosen": -229.1260223388672, "logps/rejected": -308.96728515625, "loss": 230.7763, "losses_ref": -7.961075782775879, "ref_logps/chosen": -278.4296875, "ref_logps/rejected": -160.97654724121094, "rewards/accuracies": 0.996874988079071, "rewards/chosen": 0.4930366575717926, "rewards/margins": 1.9729440212249756, "rewards/rejected": -1.4799073934555054, "step": 425, "u": -0.9531173706054688, "weight": 0.06897237151861191 }, { "diff_generated": -150.89932250976562, "epoch": 0.9002878827532059, "grad_norm": 1247.106031502474, "learning_rate": 1.3408729846571713e-06, "logits/chosen": -1.9829527139663696, "logits/rejected": -1.7790740728378296, "logps/chosen": -250.89053344726562, "logps/rejected": -306.49493408203125, "loss": 227.0822, "losses_ref": -3.353726625442505, "ref_logps/chosen": -299.95831298828125, "ref_logps/rejected": -155.59561157226562, "rewards/accuracies": 1.0, "rewards/chosen": 0.49067792296409607, "rewards/margins": 1.9996709823608398, "rewards/rejected": -1.5089929103851318, "step": 430, "u": -1.7451012134552002, "weight": 0.029034754261374474 }, { "diff_generated": -161.92088317871094, "epoch": 0.9107563465061502, "grad_norm": 1215.392487626507, "learning_rate": 1.3236056298312956e-06, "logits/chosen": -1.8760721683502197, "logits/rejected": -1.7741060256958008, "logps/chosen": -230.2984161376953, "logps/rejected": -322.80450439453125, "loss": 219.6414, "losses_ref": -2.6977756023406982, "ref_logps/chosen": -276.49066162109375, "ref_logps/rejected": -160.88360595703125, "rewards/accuracies": 1.0, "rewards/chosen": 0.4619222581386566, "rewards/margins": 2.0811312198638916, "rewards/rejected": -1.6192089319229126, "step": 435, "u": -1.3300695419311523, "weight": 0.049107056111097336 }, { "diff_generated": -169.64144897460938, "epoch": 0.9212248102590945, "grad_norm": 1199.549953331359, "learning_rate": 1.3062298151261591e-06, "logits/chosen": -1.8538296222686768, "logits/rejected": -1.7674894332885742, "logps/chosen": -247.5723114013672, "logps/rejected": -334.99432373046875, "loss": 228.2011, "losses_ref": -3.9634671211242676, "ref_logps/chosen": -293.4337463378906, "ref_logps/rejected": -165.35289001464844, "rewards/accuracies": 1.0, "rewards/chosen": 0.4586148262023926, "rewards/margins": 2.155029296875, "rewards/rejected": -1.696414589881897, "step": 440, "u": -0.911568284034729, "weight": 0.05750606581568718 }, { "diff_generated": -176.77850341796875, "epoch": 0.9316932740120387, "grad_norm": 1134.6331161044943, "learning_rate": 1.2887513642314372e-06, "logits/chosen": -1.7472941875457764, "logits/rejected": -1.6525627374649048, "logps/chosen": -229.4337921142578, "logps/rejected": -337.2396545410156, "loss": 225.3431, "losses_ref": -0.7648504376411438, "ref_logps/chosen": -279.60003662109375, "ref_logps/rejected": -160.4611358642578, "rewards/accuracies": 1.0, "rewards/chosen": 0.5016621947288513, "rewards/margins": 2.2694473266601562, "rewards/rejected": -1.7677850723266602, "step": 445, "u": -1.5227153301239014, "weight": 0.024677513167262077 }, { "diff_generated": -180.25607299804688, "epoch": 0.942161737764983, "grad_norm": 1254.0549089198466, "learning_rate": 1.271176135236417e-06, "logits/chosen": -1.8400166034698486, "logits/rejected": -1.6989673376083374, "logps/chosen": -255.73233032226562, "logps/rejected": -341.2875671386719, "loss": 233.639, "losses_ref": -4.4961042404174805, "ref_logps/chosen": -307.17620849609375, "ref_logps/rejected": -161.03147888183594, "rewards/accuracies": 1.0, "rewards/chosen": 0.5144392251968384, "rewards/margins": 2.317000150680542, "rewards/rejected": -1.8025610446929932, "step": 450, "u": -1.3673655986785889, "weight": 0.044325508177280426 }, { "diff_generated": -192.91705322265625, "epoch": 0.9526302015179272, "grad_norm": 1206.737012082796, "learning_rate": 1.2535100186666e-06, "logits/chosen": -1.808547019958496, "logits/rejected": -1.6920995712280273, "logps/chosen": -254.8017578125, "logps/rejected": -351.9469909667969, "loss": 245.5463, "losses_ref": -0.9527796506881714, "ref_logps/chosen": -304.09619140625, "ref_logps/rejected": -159.02993774414062, "rewards/accuracies": 1.0, "rewards/chosen": 0.4929441809654236, "rewards/margins": 2.422114610671997, "rewards/rejected": -1.9291703701019287, "step": 455, "u": -1.6784477233886719, "weight": 0.02164948359131813 }, { "diff_generated": -185.23989868164062, "epoch": 0.9630986652708715, "grad_norm": 1270.8466354695972, "learning_rate": 1.2357589355094273e-06, "logits/chosen": -1.8315858840942383, "logits/rejected": -1.7088918685913086, "logps/chosen": -269.20538330078125, "logps/rejected": -338.2021179199219, "loss": 246.9693, "losses_ref": -3.263090133666992, "ref_logps/chosen": -319.02618408203125, "ref_logps/rejected": -152.9622039794922, "rewards/accuracies": 1.0, "rewards/chosen": 0.4982084631919861, "rewards/margins": 2.350607395172119, "rewards/rejected": -1.8523988723754883, "step": 460, "u": -1.5151453018188477, "weight": 0.04330545663833618 }, { "diff_generated": -187.97279357910156, "epoch": 0.9735671290238157, "grad_norm": 1262.5411111889684, "learning_rate": 1.2179288352297982e-06, "logits/chosen": -1.7451597452163696, "logits/rejected": -1.6632684469223022, "logps/chosen": -227.63937377929688, "logps/rejected": -355.5631103515625, "loss": 232.7903, "losses_ref": -1.6858165264129639, "ref_logps/chosen": -279.9383544921875, "ref_logps/rejected": -167.59031677246094, "rewards/accuracies": 1.0, "rewards/chosen": 0.5229896903038025, "rewards/margins": 2.4027175903320312, "rewards/rejected": -1.8797279596328735, "step": 465, "u": -1.672014594078064, "weight": 0.022105634212493896 }, { "diff_generated": -206.70443725585938, "epoch": 0.98403559277676, "grad_norm": 1278.739837777923, "learning_rate": 1.2000256937760445e-06, "logits/chosen": -1.570615291595459, "logits/rejected": -1.4970500469207764, "logps/chosen": -237.1439208984375, "logps/rejected": -359.52886962890625, "loss": 239.3887, "losses_ref": -2.77233624458313, "ref_logps/chosen": -285.7524719238281, "ref_logps/rejected": -152.82440185546875, "rewards/accuracies": 1.0, "rewards/chosen": 0.4860858917236328, "rewards/margins": 2.5531301498413086, "rewards/rejected": -2.067044496536255, "step": 470, "u": -1.4401319026947021, "weight": 0.04976705089211464 }, { "diff_generated": -199.87838745117188, "epoch": 0.9945040565297043, "grad_norm": 1152.4542486150297, "learning_rate": 1.1820555115770255e-06, "logits/chosen": -1.4883148670196533, "logits/rejected": -1.505014419555664, "logps/chosen": -225.6768798828125, "logps/rejected": -358.5788269042969, "loss": 226.472, "losses_ref": -4.080103874206543, "ref_logps/chosen": -273.79522705078125, "ref_logps/rejected": -158.70046997070312, "rewards/accuracies": 1.0, "rewards/chosen": 0.48118335008621216, "rewards/margins": 2.4799671173095703, "rewards/rejected": -1.998783826828003, "step": 475, "u": -1.5296900272369385, "weight": 0.05185595899820328 }, { "diff_generated": -208.0040283203125, "epoch": 1.0049725202826485, "grad_norm": 1301.7034712659586, "learning_rate": 1.1640243115310217e-06, "logits/chosen": -1.5732040405273438, "logits/rejected": -1.5068919658660889, "logps/chosen": -223.4159393310547, "logps/rejected": -374.62591552734375, "loss": 226.8136, "losses_ref": -4.88800573348999, "ref_logps/chosen": -293.29400634765625, "ref_logps/rejected": -166.62188720703125, "rewards/accuracies": 1.0, "rewards/chosen": 0.6987806558609009, "rewards/margins": 2.778820514678955, "rewards/rejected": -2.080040454864502, "step": 480, "u": -1.9416106939315796, "weight": 0.03623828664422035 }, { "diff_generated": -215.4559783935547, "epoch": 1.0154409840355927, "grad_norm": 1355.7730407413364, "learning_rate": 1.1459381369870972e-06, "logits/chosen": -1.5292342901229858, "logits/rejected": -1.4070460796356201, "logps/chosen": -192.32717895507812, "logps/rejected": -380.6830139160156, "loss": 181.3888, "losses_ref": -3.105132818222046, "ref_logps/chosen": -294.8918762207031, "ref_logps/rejected": -165.22702026367188, "rewards/accuracies": 1.0, "rewards/chosen": 1.0256469249725342, "rewards/margins": 3.180206775665283, "rewards/rejected": -2.15455961227417, "step": 485, "u": -2.883460521697998, "weight": 0.04116251319646835 }, { "diff_generated": -209.49990844726562, "epoch": 1.025909447788537, "grad_norm": 1391.787637976481, "learning_rate": 1.1278030497196046e-06, "logits/chosen": -1.2669024467468262, "logits/rejected": -1.2282651662826538, "logps/chosen": -166.51095581054688, "logps/rejected": -365.71807861328125, "loss": 180.3994, "losses_ref": -2.922461986541748, "ref_logps/chosen": -264.67388916015625, "ref_logps/rejected": -156.21817016601562, "rewards/accuracies": 1.0, "rewards/chosen": 0.9816292524337769, "rewards/margins": 3.0766279697418213, "rewards/rejected": -2.094998836517334, "step": 490, "u": -3.193206310272217, "weight": 0.029411697760224342 }, { "diff_generated": -208.9027099609375, "epoch": 1.0363779115414813, "grad_norm": 1404.013865827238, "learning_rate": 1.1096251278965172e-06, "logits/chosen": -1.229707956314087, "logits/rejected": -1.2453272342681885, "logps/chosen": -167.49026489257812, "logps/rejected": -368.3177185058594, "loss": 166.3708, "losses_ref": -5.491534233093262, "ref_logps/chosen": -266.75250244140625, "ref_logps/rejected": -159.41500854492188, "rewards/accuracies": 1.0, "rewards/chosen": 0.9926217794418335, "rewards/margins": 3.081648588180542, "rewards/rejected": -2.089026927947998, "step": 495, "u": -1.6660839319229126, "weight": 0.06474236398935318 }, { "diff_generated": -213.4435577392578, "epoch": 1.0468463752944255, "grad_norm": 1314.8398697189618, "learning_rate": 1.0914104640422679e-06, "logits/chosen": -1.391204595565796, "logits/rejected": -1.3654673099517822, "logps/chosen": -161.88082885742188, "logps/rejected": -374.336669921875, "loss": 175.893, "losses_ref": -1.2716583013534546, "ref_logps/chosen": -257.99908447265625, "ref_logps/rejected": -160.89312744140625, "rewards/accuracies": 1.0, "rewards/chosen": 0.9611825942993164, "rewards/margins": 3.0956180095672607, "rewards/rejected": -2.1344354152679443, "step": 500, "u": -3.1927852630615234, "weight": 0.015900352969765663 }, { "diff_generated": -206.3148956298828, "epoch": 1.05731483904737, "grad_norm": 1445.075187818513, "learning_rate": 1.0731651629957721e-06, "logits/chosen": -1.3434970378875732, "logits/rejected": -1.305525541305542, "logps/chosen": -192.31558227539062, "logps/rejected": -378.03851318359375, "loss": 185.1733, "losses_ref": -2.9738333225250244, "ref_logps/chosen": -297.85302734375, "ref_logps/rejected": -171.7236328125, "rewards/accuracies": 1.0, "rewards/chosen": 1.055374264717102, "rewards/margins": 3.118523359298706, "rewards/rejected": -2.0631489753723145, "step": 505, "u": -3.4035236835479736, "weight": 0.024688560515642166 }, { "diff_generated": -186.8843231201172, "epoch": 1.067783302800314, "grad_norm": 1181.3904875833675, "learning_rate": 1.0548953398643274e-06, "logits/chosen": -1.566375970840454, "logits/rejected": -1.4381110668182373, "logps/chosen": -193.49539184570312, "logps/rejected": -350.2602233886719, "loss": 179.7564, "losses_ref": -2.258450984954834, "ref_logps/chosen": -297.76202392578125, "ref_logps/rejected": -163.3759002685547, "rewards/accuracies": 1.0, "rewards/chosen": 1.0426661968231201, "rewards/margins": 2.9115095138549805, "rewards/rejected": -1.8688430786132812, "step": 510, "u": -2.4679007530212402, "weight": 0.044156283140182495 }, { "diff_generated": -200.12066650390625, "epoch": 1.0782517665532583, "grad_norm": 1297.9609792649137, "learning_rate": 1.0366071179740706e-06, "logits/chosen": -1.6367733478546143, "logits/rejected": -1.4493190050125122, "logps/chosen": -209.0851593017578, "logps/rejected": -365.74053955078125, "loss": 186.0993, "losses_ref": -3.8747305870056152, "ref_logps/chosen": -317.296630859375, "ref_logps/rejected": -165.619873046875, "rewards/accuracies": 1.0, "rewards/chosen": 1.0821150541305542, "rewards/margins": 3.0833218097686768, "rewards/rejected": -2.001206636428833, "step": 515, "u": -2.938070297241211, "weight": 0.03331952169537544 }, { "diff_generated": -207.33700561523438, "epoch": 1.0887202303062025, "grad_norm": 1362.4544964162274, "learning_rate": 1.0183066268176775e-06, "logits/chosen": -1.541912317276001, "logits/rejected": -1.406719446182251, "logps/chosen": -204.0404052734375, "logps/rejected": -376.406494140625, "loss": 202.1916, "losses_ref": -0.5564223527908325, "ref_logps/chosen": -307.4422912597656, "ref_logps/rejected": -169.06948852539062, "rewards/accuracies": 1.0, "rewards/chosen": 1.034018874168396, "rewards/margins": 3.107388973236084, "rewards/rejected": -2.0733699798583984, "step": 520, "u": -3.34126353263855, "weight": 0.007363998796790838 }, { "diff_generated": -209.34707641601562, "epoch": 1.0991886940591469, "grad_norm": 1329.928673259645, "learning_rate": 1e-06, "logits/chosen": -1.4774454832077026, "logits/rejected": -1.3976843357086182, "logps/chosen": -190.63027954101562, "logps/rejected": -365.6118469238281, "loss": 191.7027, "losses_ref": -4.4078168869018555, "ref_logps/chosen": -289.65625, "ref_logps/rejected": -156.2647705078125, "rewards/accuracies": 1.0, "rewards/chosen": 0.990260124206543, "rewards/margins": 3.083730936050415, "rewards/rejected": -2.093470811843872, "step": 525, "u": -2.4628920555114746, "weight": 0.034967873245477676 }, { "diff_generated": -219.25039672851562, "epoch": 1.109657157812091, "grad_norm": 1263.6571441007575, "learning_rate": 9.816933731823228e-07, "logits/chosen": -1.48972749710083, "logits/rejected": -1.3531391620635986, "logps/chosen": -184.37472534179688, "logps/rejected": -382.6318359375, "loss": 179.9115, "losses_ref": -4.217190742492676, "ref_logps/chosen": -283.9466857910156, "ref_logps/rejected": -163.38145446777344, "rewards/accuracies": 1.0, "rewards/chosen": 0.995719313621521, "rewards/margins": 3.188223361968994, "rewards/rejected": -2.1925039291381836, "step": 530, "u": -2.7124040126800537, "weight": 0.03560812398791313 }, { "diff_generated": -222.4695587158203, "epoch": 1.1201256215650353, "grad_norm": 1387.0782441687347, "learning_rate": 9.633928820259293e-07, "logits/chosen": -1.2347859144210815, "logits/rejected": -1.2332684993743896, "logps/chosen": -162.6536102294922, "logps/rejected": -388.9007263183594, "loss": 162.1828, "losses_ref": -2.344147205352783, "ref_logps/chosen": -256.69085693359375, "ref_logps/rejected": -166.43115234375, "rewards/accuracies": 1.0, "rewards/chosen": 0.9403725862503052, "rewards/margins": 3.1650681495666504, "rewards/rejected": -2.2246956825256348, "step": 535, "u": -2.9149539470672607, "weight": 0.036711305379867554 }, { "diff_generated": -220.98583984375, "epoch": 1.1305940853179797, "grad_norm": 1297.6784365239848, "learning_rate": 9.451046601356725e-07, "logits/chosen": -1.3270328044891357, "logits/rejected": -1.2543261051177979, "logps/chosen": -174.17941284179688, "logps/rejected": -378.3968200683594, "loss": 168.7943, "losses_ref": -5.623769760131836, "ref_logps/chosen": -267.5427551269531, "ref_logps/rejected": -157.4110107421875, "rewards/accuracies": 1.0, "rewards/chosen": 0.9336336255073547, "rewards/margins": 3.1434922218322754, "rewards/rejected": -2.2098584175109863, "step": 540, "u": -2.444173574447632, "weight": 0.07986775040626526 }, { "diff_generated": -227.32980346679688, "epoch": 1.1410625490709239, "grad_norm": 1205.4234546771595, "learning_rate": 9.268348370042281e-07, "logits/chosen": -1.3813427686691284, "logits/rejected": -1.318725347518921, "logps/chosen": -174.5741424560547, "logps/rejected": -399.95172119140625, "loss": 168.9783, "losses_ref": -3.7193565368652344, "ref_logps/chosen": -273.3332824707031, "ref_logps/rejected": -172.62191772460938, "rewards/accuracies": 1.0, "rewards/chosen": 0.9875916242599487, "rewards/margins": 3.260889768600464, "rewards/rejected": -2.2732982635498047, "step": 545, "u": -3.2109789848327637, "weight": 0.042879991233348846 }, { "diff_generated": -249.11709594726562, "epoch": 1.151531012823868, "grad_norm": 1266.8954047572045, "learning_rate": 9.085895359577323e-07, "logits/chosen": -1.33551824092865, "logits/rejected": -1.3183876276016235, "logps/chosen": -167.4661865234375, "logps/rejected": -403.6305236816406, "loss": 174.6021, "losses_ref": -1.4713778495788574, "ref_logps/chosen": -267.08013916015625, "ref_logps/rejected": -154.513427734375, "rewards/accuracies": 1.0, "rewards/chosen": 0.9961398243904114, "rewards/margins": 3.4873108863830566, "rewards/rejected": -2.491170883178711, "step": 550, "u": -2.983215093612671, "weight": 0.02549784444272518 }, { "diff_generated": -220.6905059814453, "epoch": 1.1619994765768125, "grad_norm": 1242.623006879505, "learning_rate": 8.903748721034826e-07, "logits/chosen": -1.410308599472046, "logits/rejected": -1.3436871767044067, "logps/chosen": -175.43826293945312, "logps/rejected": -392.7843322753906, "loss": 178.3087, "losses_ref": -2.583522081375122, "ref_logps/chosen": -277.257080078125, "ref_logps/rejected": -172.09388732910156, "rewards/accuracies": 1.0, "rewards/chosen": 1.018188238143921, "rewards/margins": 3.225093126296997, "rewards/rejected": -2.206904649734497, "step": 555, "u": -2.3211851119995117, "weight": 0.039024386554956436 }, { "diff_generated": -246.51620483398438, "epoch": 1.1724679403297567, "grad_norm": 1315.7734920897904, "learning_rate": 8.721969502803953e-07, "logits/chosen": -1.4283636808395386, "logits/rejected": -1.4595166444778442, "logps/chosen": -190.37667846679688, "logps/rejected": -401.78594970703125, "loss": 169.0395, "losses_ref": -0.9799969792366028, "ref_logps/chosen": -288.49481201171875, "ref_logps/rejected": -155.26974487304688, "rewards/accuracies": 1.0, "rewards/chosen": 0.9811817407608032, "rewards/margins": 3.4463438987731934, "rewards/rejected": -2.4651618003845215, "step": 560, "u": -3.4851043224334717, "weight": 0.011031994596123695 }, { "diff_generated": -237.05813598632812, "epoch": 1.1829364040827008, "grad_norm": 1324.6420978322203, "learning_rate": 8.540618630129027e-07, "logits/chosen": -1.5112595558166504, "logits/rejected": -1.4447729587554932, "logps/chosen": -197.54592895507812, "logps/rejected": -408.50360107421875, "loss": 180.7105, "losses_ref": -8.419300079345703, "ref_logps/chosen": -298.6998596191406, "ref_logps/rejected": -171.4455108642578, "rewards/accuracies": 1.0, "rewards/chosen": 1.011539340019226, "rewards/margins": 3.3821206092834473, "rewards/rejected": -2.3705811500549316, "step": 565, "u": -3.0135345458984375, "weight": 0.03675166517496109 }, { "diff_generated": -226.0189666748047, "epoch": 1.193404867835645, "grad_norm": 1290.2940777725041, "learning_rate": 8.359756884689783e-07, "logits/chosen": -1.5810168981552124, "logits/rejected": -1.4695533514022827, "logps/chosen": -179.12496948242188, "logps/rejected": -392.3472595214844, "loss": 183.5485, "losses_ref": -1.6247104406356812, "ref_logps/chosen": -278.8708801269531, "ref_logps/rejected": -166.32827758789062, "rewards/accuracies": 1.0, "rewards/chosen": 0.9974590539932251, "rewards/margins": 3.257648468017578, "rewards/rejected": -2.2601895332336426, "step": 570, "u": -3.080786943435669, "weight": 0.019241400063037872 }, { "diff_generated": -213.7429962158203, "epoch": 1.2038733315885894, "grad_norm": 1344.148507837478, "learning_rate": 8.179444884229744e-07, "logits/chosen": -1.4880825281143188, "logits/rejected": -1.502333641052246, "logps/chosen": -189.47103881835938, "logps/rejected": -378.13275146484375, "loss": 171.5777, "losses_ref": -0.9622389674186707, "ref_logps/chosen": -284.98492431640625, "ref_logps/rejected": -164.38975524902344, "rewards/accuracies": 1.0, "rewards/chosen": 0.9551390409469604, "rewards/margins": 3.092568874359131, "rewards/rejected": -2.137429714202881, "step": 575, "u": -3.1298766136169434, "weight": 0.013516530394554138 }, { "diff_generated": -231.77328491210938, "epoch": 1.2143417953415336, "grad_norm": 1304.8866597655287, "learning_rate": 7.999743062239557e-07, "logits/chosen": -1.4784562587738037, "logits/rejected": -1.5664056539535522, "logps/chosen": -176.44296264648438, "logps/rejected": -421.82135009765625, "loss": 181.4369, "losses_ref": -1.1828618049621582, "ref_logps/chosen": -274.30767822265625, "ref_logps/rejected": -190.04803466796875, "rewards/accuracies": 1.0, "rewards/chosen": 0.9786470532417297, "rewards/margins": 3.296379804611206, "rewards/rejected": -2.3177332878112793, "step": 580, "u": -3.0596017837524414, "weight": 0.012155565433204174 }, { "diff_generated": -220.5200653076172, "epoch": 1.2248102590944778, "grad_norm": 1320.435055959004, "learning_rate": 7.820711647702017e-07, "logits/chosen": -1.4778623580932617, "logits/rejected": -1.5001682043075562, "logps/chosen": -168.55393981933594, "logps/rejected": -381.0849304199219, "loss": 177.0697, "losses_ref": -2.307084560394287, "ref_logps/chosen": -260.8992004394531, "ref_logps/rejected": -160.5648651123047, "rewards/accuracies": 1.0, "rewards/chosen": 0.9234523773193359, "rewards/margins": 3.128653049468994, "rewards/rejected": -2.205200672149658, "step": 585, "u": -2.9381721019744873, "weight": 0.03427546098828316 }, { "diff_generated": -207.7950439453125, "epoch": 1.235278722847422, "grad_norm": 1261.3527727961057, "learning_rate": 7.642410644905726e-07, "logits/chosen": -1.4036446809768677, "logits/rejected": -1.4330257177352905, "logps/chosen": -171.85134887695312, "logps/rejected": -370.4696960449219, "loss": 176.7884, "losses_ref": -2.213914394378662, "ref_logps/chosen": -269.0211486816406, "ref_logps/rejected": -162.6746826171875, "rewards/accuracies": 1.0, "rewards/chosen": 0.9716979265213013, "rewards/margins": 3.0496482849121094, "rewards/rejected": -2.0779504776000977, "step": 590, "u": -2.40622878074646, "weight": 0.04459633305668831 }, { "diff_generated": -230.44302368164062, "epoch": 1.2457471866003664, "grad_norm": 1305.239745538134, "learning_rate": 7.464899813334e-07, "logits/chosen": -1.261853575706482, "logits/rejected": -1.2570579051971436, "logps/chosen": -181.5194091796875, "logps/rejected": -393.291259765625, "loss": 177.5223, "losses_ref": -4.516595840454102, "ref_logps/chosen": -278.3271789550781, "ref_logps/rejected": -162.84823608398438, "rewards/accuracies": 1.0, "rewards/chosen": 0.9680774807929993, "rewards/margins": 3.272507905960083, "rewards/rejected": -2.3044302463531494, "step": 595, "u": -2.7008533477783203, "weight": 0.05883873626589775 }, { "diff_generated": -232.184326171875, "epoch": 1.2562156503533106, "grad_norm": 1268.765281131021, "learning_rate": 7.288238647635829e-07, "logits/chosen": -1.4351574182510376, "logits/rejected": -1.2977135181427002, "logps/chosen": -184.0857696533203, "logps/rejected": -400.95361328125, "loss": 177.9198, "losses_ref": -3.803828001022339, "ref_logps/chosen": -284.0093078613281, "ref_logps/rejected": -168.76925659179688, "rewards/accuracies": 1.0, "rewards/chosen": 0.9992351531982422, "rewards/margins": 3.3210787773132324, "rewards/rejected": -2.3218436241149902, "step": 600, "u": -2.89520263671875, "weight": 0.026231110095977783 }, { "diff_generated": -198.3398895263672, "epoch": 1.2666841141062548, "grad_norm": 1202.5442238394803, "learning_rate": 7.112486357685631e-07, "logits/chosen": -1.499137043952942, "logits/rejected": -1.4640613794326782, "logps/chosen": -186.61227416992188, "logps/rejected": -356.29132080078125, "loss": 187.6658, "losses_ref": -4.061453819274902, "ref_logps/chosen": -287.03717041015625, "ref_logps/rejected": -157.95144653320312, "rewards/accuracies": 1.0, "rewards/chosen": 1.004248857498169, "rewards/margins": 2.987647771835327, "rewards/rejected": -1.9833987951278687, "step": 605, "u": -3.231105089187622, "weight": 0.03989076986908913 }, { "diff_generated": -219.8319091796875, "epoch": 1.2771525778591992, "grad_norm": 1294.217440674336, "learning_rate": 6.937701848738407e-07, "logits/chosen": -1.41506028175354, "logits/rejected": -1.4094430208206177, "logps/chosen": -169.46595764160156, "logps/rejected": -384.82025146484375, "loss": 167.9688, "losses_ref": -1.3657054901123047, "ref_logps/chosen": -266.1152648925781, "ref_logps/rejected": -164.98837280273438, "rewards/accuracies": 1.0, "rewards/chosen": 0.9664928317070007, "rewards/margins": 3.1648120880126953, "rewards/rejected": -2.19831919670105, "step": 610, "u": -3.3554062843322754, "weight": 0.01739688031375408 }, { "diff_generated": -217.53970336914062, "epoch": 1.2876210416121434, "grad_norm": 1337.3093609895052, "learning_rate": 6.763943701687045e-07, "logits/chosen": -1.633599877357483, "logits/rejected": -1.5192573070526123, "logps/chosen": -191.32760620117188, "logps/rejected": -387.1475524902344, "loss": 183.0882, "losses_ref": -0.359982430934906, "ref_logps/chosen": -299.5060119628906, "ref_logps/rejected": -169.60787963867188, "rewards/accuracies": 1.0, "rewards/chosen": 1.0817840099334717, "rewards/margins": 3.257181167602539, "rewards/rejected": -2.1753971576690674, "step": 615, "u": -3.340681552886963, "weight": 0.008451832458376884 }, { "diff_generated": -208.6597137451172, "epoch": 1.2980895053650876, "grad_norm": 1324.2598041902163, "learning_rate": 6.591270153428288e-07, "logits/chosen": -1.6454055309295654, "logits/rejected": -1.489946961402893, "logps/chosen": -191.6290283203125, "logps/rejected": -364.0921325683594, "loss": 178.0635, "losses_ref": -2.520381450653076, "ref_logps/chosen": -295.8542785644531, "ref_logps/rejected": -155.4324188232422, "rewards/accuracies": 1.0, "rewards/chosen": 1.042252540588379, "rewards/margins": 3.128849506378174, "rewards/rejected": -2.086596965789795, "step": 620, "u": -2.7204320430755615, "weight": 0.02861974760890007 }, { "diff_generated": -196.55752563476562, "epoch": 1.308557969118032, "grad_norm": 1344.8788218911382, "learning_rate": 6.419739077344016e-07, "logits/chosen": -1.5530303716659546, "logits/rejected": -1.423179030418396, "logps/chosen": -200.18063354492188, "logps/rejected": -360.1055603027344, "loss": 179.8101, "losses_ref": -3.9870200157165527, "ref_logps/chosen": -300.4015197753906, "ref_logps/rejected": -163.5480194091797, "rewards/accuracies": 1.0, "rewards/chosen": 1.0022084712982178, "rewards/margins": 2.9677836894989014, "rewards/rejected": -1.9655752182006836, "step": 625, "u": -2.844027042388916, "weight": 0.038288719952106476 }, { "diff_generated": -198.09622192382812, "epoch": 1.3190264328709762, "grad_norm": 1205.0358284390313, "learning_rate": 6.24940796390438e-07, "logits/chosen": -1.5373382568359375, "logits/rejected": -1.444549322128296, "logps/chosen": -174.25350952148438, "logps/rejected": -362.83953857421875, "loss": 166.5968, "losses_ref": -2.2841248512268066, "ref_logps/chosen": -274.06365966796875, "ref_logps/rejected": -164.74331665039062, "rewards/accuracies": 1.0, "rewards/chosen": 0.9981018900871277, "rewards/margins": 2.9790642261505127, "rewards/rejected": -1.9809621572494507, "step": 630, "u": -2.492745876312256, "weight": 0.01923806592822075 }, { "diff_generated": -214.6193389892578, "epoch": 1.3294948966239204, "grad_norm": 1314.9134741026285, "learning_rate": 6.08033390139925e-07, "logits/chosen": -1.4583691358566284, "logits/rejected": -1.290028691291809, "logps/chosen": -190.0717315673828, "logps/rejected": -369.1700439453125, "loss": 192.5529, "losses_ref": -0.966667652130127, "ref_logps/chosen": -293.4891662597656, "ref_logps/rejected": -154.55068969726562, "rewards/accuracies": 1.0, "rewards/chosen": 1.0341745615005493, "rewards/margins": 3.180367946624756, "rewards/rejected": -2.146193265914917, "step": 635, "u": -2.8957810401916504, "weight": 0.020022699609398842 }, { "diff_generated": -209.5125732421875, "epoch": 1.3399633603768648, "grad_norm": 1311.833771803947, "learning_rate": 5.912573556804452e-07, "logits/chosen": -1.4464821815490723, "logits/rejected": -1.3825037479400635, "logps/chosen": -181.79258728027344, "logps/rejected": -380.0604553222656, "loss": 186.6832, "losses_ref": -2.0217666625976562, "ref_logps/chosen": -283.5255126953125, "ref_logps/rejected": -170.54788208007812, "rewards/accuracies": 1.0, "rewards/chosen": 1.017329216003418, "rewards/margins": 3.112454891204834, "rewards/rejected": -2.095125675201416, "step": 640, "u": -2.146329164505005, "weight": 0.051374662667512894 }, { "diff_generated": -233.44900512695312, "epoch": 1.350431824129809, "grad_norm": 1320.9978857185588, "learning_rate": 5.746183156789252e-07, "logits/chosen": -1.4467910528182983, "logits/rejected": -1.2174046039581299, "logps/chosen": -190.71127319335938, "logps/rejected": -401.9010314941406, "loss": 181.6372, "losses_ref": -1.3231620788574219, "ref_logps/chosen": -301.30584716796875, "ref_logps/rejected": -168.45204162597656, "rewards/accuracies": 1.0, "rewards/chosen": 1.1059458255767822, "rewards/margins": 3.4404358863830566, "rewards/rejected": -2.3344900608062744, "step": 645, "u": -2.5961060523986816, "weight": 0.031209224835038185 }, { "diff_generated": -218.05880737304688, "epoch": 1.3609002878827532, "grad_norm": 1268.0769992434364, "learning_rate": 5.581218468871365e-07, "logits/chosen": -1.2198398113250732, "logits/rejected": -1.3189094066619873, "logps/chosen": -157.86666870117188, "logps/rejected": -376.75433349609375, "loss": 168.9012, "losses_ref": -2.4989333152770996, "ref_logps/chosen": -252.76400756835938, "ref_logps/rejected": -158.69552612304688, "rewards/accuracies": 1.0, "rewards/chosen": 0.9489734768867493, "rewards/margins": 3.129561424255371, "rewards/rejected": -2.1805882453918457, "step": 650, "u": -2.92409086227417, "weight": 0.0428018681704998 }, { "diff_generated": -235.935546875, "epoch": 1.3713687516356974, "grad_norm": 1347.742524924812, "learning_rate": 5.417734782725896e-07, "logits/chosen": -1.2961053848266602, "logits/rejected": -1.261878252029419, "logps/chosen": -177.77523803710938, "logps/rejected": -389.1588134765625, "loss": 179.405, "losses_ref": -1.0838311910629272, "ref_logps/chosen": -277.2697448730469, "ref_logps/rejected": -153.2233123779297, "rewards/accuracies": 1.0, "rewards/chosen": 0.9949450492858887, "rewards/margins": 3.3543007373809814, "rewards/rejected": -2.3593554496765137, "step": 655, "u": -3.1044487953186035, "weight": 0.017367416992783546 }, { "diff_generated": -211.7913055419922, "epoch": 1.3818372153886418, "grad_norm": 1311.034191538654, "learning_rate": 5.255786891654399e-07, "logits/chosen": -1.2746165990829468, "logits/rejected": -1.2540855407714844, "logps/chosen": -170.9514923095703, "logps/rejected": -376.79229736328125, "loss": 174.0495, "losses_ref": -2.949699878692627, "ref_logps/chosen": -268.7665100097656, "ref_logps/rejected": -165.0010223388672, "rewards/accuracies": 1.0, "rewards/chosen": 0.9781501889228821, "rewards/margins": 3.0960631370544434, "rewards/rejected": -2.117912769317627, "step": 660, "u": -1.9965251684188843, "weight": 0.04027215391397476 }, { "diff_generated": -224.0879364013672, "epoch": 1.392305679141586, "grad_norm": 1328.7070235599076, "learning_rate": 5.095429074220319e-07, "logits/chosen": -1.2053465843200684, "logits/rejected": -1.1557897329330444, "logps/chosen": -175.30589294433594, "logps/rejected": -393.43218994140625, "loss": 184.6881, "losses_ref": -3.8794121742248535, "ref_logps/chosen": -274.28826904296875, "ref_logps/rejected": -169.34422302246094, "rewards/accuracies": 1.0, "rewards/chosen": 0.9898236989974976, "rewards/margins": 3.230703353881836, "rewards/rejected": -2.240879535675049, "step": 665, "u": -3.0530405044555664, "weight": 0.03465485945343971 }, { "diff_generated": -240.96484375, "epoch": 1.4027741428945302, "grad_norm": 1353.971116750616, "learning_rate": 4.936715076056974e-07, "logits/chosen": -1.242436408996582, "logits/rejected": -1.24913489818573, "logps/chosen": -183.4954833984375, "logps/rejected": -405.70050048828125, "loss": 171.9422, "losses_ref": -0.8431612253189087, "ref_logps/chosen": -284.3236999511719, "ref_logps/rejected": -164.7356414794922, "rewards/accuracies": 1.0, "rewards/chosen": 1.0082820653915405, "rewards/margins": 3.4179306030273438, "rewards/rejected": -2.4096481800079346, "step": 670, "u": -3.2680907249450684, "weight": 0.007226690649986267 }, { "diff_generated": -230.71115112304688, "epoch": 1.4132426066474744, "grad_norm": 1301.2684317901687, "learning_rate": 4.779698091854098e-07, "logits/chosen": -1.4362276792526245, "logits/rejected": -1.2898997068405151, "logps/chosen": -196.05447387695312, "logps/rejected": -400.25994873046875, "loss": 193.0132, "losses_ref": -0.4112131595611572, "ref_logps/chosen": -306.9317321777344, "ref_logps/rejected": -169.54879760742188, "rewards/accuracies": 1.0, "rewards/chosen": 1.1087725162506104, "rewards/margins": 3.415884494781494, "rewards/rejected": -2.307111978530884, "step": 675, "u": -3.2834739685058594, "weight": 0.006595195736736059 }, { "diff_generated": -205.662841796875, "epoch": 1.4237110704004188, "grad_norm": 1344.3331479958706, "learning_rate": 4.624430747529102e-07, "logits/chosen": -1.3598095178604126, "logits/rejected": -1.158661961555481, "logps/chosen": -205.39236450195312, "logps/rejected": -369.7188720703125, "loss": 181.4401, "losses_ref": -1.5265072584152222, "ref_logps/chosen": -313.2685546875, "ref_logps/rejected": -164.05599975585938, "rewards/accuracies": 1.0, "rewards/chosen": 1.0787618160247803, "rewards/margins": 3.135390520095825, "rewards/rejected": -2.056628465652466, "step": 680, "u": -3.1280694007873535, "weight": 0.024629075080156326 }, { "diff_generated": -223.96005249023438, "epoch": 1.434179534153363, "grad_norm": 1420.0899808408303, "learning_rate": 4.4709650825889277e-07, "logits/chosen": -1.202007532119751, "logits/rejected": -1.1467583179473877, "logps/chosen": -161.4755859375, "logps/rejected": -394.6024475097656, "loss": 181.6898, "losses_ref": -0.6423639059066772, "ref_logps/chosen": -258.74224853515625, "ref_logps/rejected": -170.64236450195312, "rewards/accuracies": 1.0, "rewards/chosen": 0.9726665616035461, "rewards/margins": 3.2122673988342285, "rewards/rejected": -2.239600658416748, "step": 685, "u": -2.6377460956573486, "weight": 0.008045530878007412 }, { "diff_generated": -199.37355041503906, "epoch": 1.4446479979063072, "grad_norm": 1308.6573761420323, "learning_rate": 4.3193525326884426e-07, "logits/chosen": -1.3359885215759277, "logits/rejected": -1.2320592403411865, "logps/chosen": -199.9832000732422, "logps/rejected": -364.55865478515625, "loss": 197.232, "losses_ref": -2.2240054607391357, "ref_logps/chosen": -303.2825927734375, "ref_logps/rejected": -165.18508911132812, "rewards/accuracies": 0.996874988079071, "rewards/chosen": 1.0329937934875488, "rewards/margins": 3.026729106903076, "rewards/rejected": -1.9937355518341064, "step": 690, "u": -3.028186559677124, "weight": 0.02633347176015377 }, { "diff_generated": -224.0160369873047, "epoch": 1.4551164616592516, "grad_norm": 1299.079432778448, "learning_rate": 4.1696439123912406e-07, "logits/chosen": -1.2223880290985107, "logits/rejected": -1.209564447402954, "logps/chosen": -174.464111328125, "logps/rejected": -393.27691650390625, "loss": 178.2965, "losses_ref": -4.651436805725098, "ref_logps/chosen": -266.0323486328125, "ref_logps/rejected": -169.26083374023438, "rewards/accuracies": 1.0, "rewards/chosen": 0.915682315826416, "rewards/margins": 3.1558427810668945, "rewards/rejected": -2.2401604652404785, "step": 695, "u": -2.1582460403442383, "weight": 0.050126731395721436 }, { "diff_generated": -229.42153930664062, "epoch": 1.4655849254121958, "grad_norm": 1181.3986183050397, "learning_rate": 4.0218893981385927e-07, "logits/chosen": -1.2920024394989014, "logits/rejected": -1.2460237741470337, "logps/chosen": -169.0710906982422, "logps/rejected": -389.62451171875, "loss": 185.8502, "losses_ref": -2.0431206226348877, "ref_logps/chosen": -263.98992919921875, "ref_logps/rejected": -160.20298767089844, "rewards/accuracies": 1.0, "rewards/chosen": 0.9491885900497437, "rewards/margins": 3.243403196334839, "rewards/rejected": -2.294214963912964, "step": 700, "u": -2.900634765625, "weight": 0.029426846653223038 }, { "diff_generated": -238.50405883789062, "epoch": 1.47605338916514, "grad_norm": 1395.6758572737517, "learning_rate": 3.87613851143229e-07, "logits/chosen": -1.321358323097229, "logits/rejected": -1.2150487899780273, "logps/chosen": -193.1901397705078, "logps/rejected": -408.7565002441406, "loss": 180.6914, "losses_ref": -7.425305366516113, "ref_logps/chosen": -295.8336486816406, "ref_logps/rejected": -170.25244140625, "rewards/accuracies": 1.0, "rewards/chosen": 1.0264348983764648, "rewards/margins": 3.411475419998169, "rewards/rejected": -2.385040760040283, "step": 705, "u": -2.5025954246520996, "weight": 0.05010579898953438 }, { "diff_generated": -232.7953643798828, "epoch": 1.4865218529180844, "grad_norm": 1298.8055689658759, "learning_rate": 3.7324401022369744e-07, "logits/chosen": -1.322563886642456, "logits/rejected": -1.1327731609344482, "logps/chosen": -194.57736206054688, "logps/rejected": -386.1799011230469, "loss": 178.1232, "losses_ref": -1.3739917278289795, "ref_logps/chosen": -296.6303405761719, "ref_logps/rejected": -153.38453674316406, "rewards/accuracies": 1.0, "rewards/chosen": 1.0205297470092773, "rewards/margins": 3.3484835624694824, "rewards/rejected": -2.327953815460205, "step": 710, "u": -3.248492479324341, "weight": 0.022353414446115494 }, { "diff_generated": -204.0673370361328, "epoch": 1.4969903166710286, "grad_norm": 1434.009703095031, "learning_rate": 3.5908423326075455e-07, "logits/chosen": -1.2674996852874756, "logits/rejected": -1.242331862449646, "logps/chosen": -167.33718872070312, "logps/rejected": -369.3961486816406, "loss": 183.2372, "losses_ref": -1.1576902866363525, "ref_logps/chosen": -261.9808044433594, "ref_logps/rejected": -165.32882690429688, "rewards/accuracies": 1.0, "rewards/chosen": 0.9464362263679504, "rewards/margins": 2.987109661102295, "rewards/rejected": -2.0406734943389893, "step": 715, "u": -2.994257688522339, "weight": 0.022263679653406143 }, { "diff_generated": -233.936767578125, "epoch": 1.5074587804239727, "grad_norm": 1304.2767992641454, "learning_rate": 3.45139266054715e-07, "logits/chosen": -1.318178415298462, "logits/rejected": -1.1334383487701416, "logps/chosen": -197.61227416992188, "logps/rejected": -397.12127685546875, "loss": 183.3899, "losses_ref": -1.6034200191497803, "ref_logps/chosen": -309.3571472167969, "ref_logps/rejected": -163.1844940185547, "rewards/accuracies": 1.0, "rewards/chosen": 1.1174486875534058, "rewards/margins": 3.4568161964416504, "rewards/rejected": -2.3393678665161133, "step": 720, "u": -3.3942806720733643, "weight": 0.019716758280992508 }, { "diff_generated": -244.8833770751953, "epoch": 1.5179272441769172, "grad_norm": 1236.5966034907726, "learning_rate": 3.314137824101111e-07, "logits/chosen": -1.306779384613037, "logits/rejected": -1.1290355920791626, "logps/chosen": -218.06015014648438, "logps/rejected": -403.7084045410156, "loss": 191.625, "losses_ref": -2.257856845855713, "ref_logps/chosen": -318.39056396484375, "ref_logps/rejected": -158.82498168945312, "rewards/accuracies": 1.0, "rewards/chosen": 1.0033042430877686, "rewards/margins": 3.4521377086639404, "rewards/rejected": -2.448833703994751, "step": 725, "u": -3.3267006874084473, "weight": 0.032559871673583984 }, { "diff_generated": -222.01101684570312, "epoch": 1.5283957079298613, "grad_norm": 1211.6699328046157, "learning_rate": 3.179123825692178e-07, "logits/chosen": -1.248240351676941, "logits/rejected": -1.091903805732727, "logps/chosen": -175.27281188964844, "logps/rejected": -383.36309814453125, "loss": 173.3922, "losses_ref": -5.464686393737793, "ref_logps/chosen": -273.9178771972656, "ref_logps/rejected": -161.35206604003906, "rewards/accuracies": 1.0, "rewards/chosen": 0.9864505529403687, "rewards/margins": 3.2065606117248535, "rewards/rejected": -2.2201101779937744, "step": 730, "u": -2.946007013320923, "weight": 0.04170671105384827 }, { "diff_generated": -220.7677764892578, "epoch": 1.5388641716828055, "grad_norm": 1300.7738080642184, "learning_rate": 3.0463959167023335e-07, "logits/chosen": -1.2869834899902344, "logits/rejected": -1.1894266605377197, "logps/chosen": -182.8350372314453, "logps/rejected": -379.5199890136719, "loss": 171.4061, "losses_ref": -4.5947465896606445, "ref_logps/chosen": -284.21063232421875, "ref_logps/rejected": -158.75221252441406, "rewards/accuracies": 1.0, "rewards/chosen": 1.0137560367584229, "rewards/margins": 3.2214341163635254, "rewards/rejected": -2.2076778411865234, "step": 735, "u": -2.485495090484619, "weight": 0.053016532212495804 }, { "diff_generated": -238.62289428710938, "epoch": 1.54933263543575, "grad_norm": 1330.9443663432232, "learning_rate": 2.915998582306299e-07, "logits/chosen": -1.3296325206756592, "logits/rejected": -1.1434093713760376, "logps/chosen": -192.86752319335938, "logps/rejected": -412.7796325683594, "loss": 171.9964, "losses_ref": -0.9953049421310425, "ref_logps/chosen": -298.61065673828125, "ref_logps/rejected": -174.15672302246094, "rewards/accuracies": 1.0, "rewards/chosen": 1.0574313402175903, "rewards/margins": 3.443660259246826, "rewards/rejected": -2.3862290382385254, "step": 740, "u": -3.0645031929016113, "weight": 0.014706036075949669 }, { "diff_generated": -232.26016235351562, "epoch": 1.559801099188694, "grad_norm": 1284.1954470666844, "learning_rate": 2.7879755265618557e-07, "logits/chosen": -1.1518179178237915, "logits/rejected": -1.1568098068237305, "logps/chosen": -160.57080078125, "logps/rejected": -390.94854736328125, "loss": 177.5848, "losses_ref": -0.8798303604125977, "ref_logps/chosen": -254.80648803710938, "ref_logps/rejected": -158.68838500976562, "rewards/accuracies": 1.0, "rewards/chosen": 0.9423569440841675, "rewards/margins": 3.264958620071411, "rewards/rejected": -2.322601556777954, "step": 745, "u": -2.921161413192749, "weight": 0.0170670785009861 }, { "diff_generated": -233.20187377929688, "epoch": 1.5702695629416383, "grad_norm": 1289.4633991370238, "learning_rate": 2.6623696577619625e-07, "logits/chosen": -1.2346287965774536, "logits/rejected": -1.2745471000671387, "logps/chosen": -192.0581512451172, "logps/rejected": -391.81146240234375, "loss": 182.0347, "losses_ref": -1.6879841089248657, "ref_logps/chosen": -290.81500244140625, "ref_logps/rejected": -158.6095428466797, "rewards/accuracies": 1.0, "rewards/chosen": 0.9875686764717102, "rewards/margins": 3.3195877075195312, "rewards/rejected": -2.3320186138153076, "step": 750, "u": -3.0655760765075684, "weight": 0.02230766788125038 }, { "diff_generated": -217.38137817382812, "epoch": 1.5807380266945825, "grad_norm": 1443.1241349727247, "learning_rate": 2.5392230740535846e-07, "logits/chosen": -1.4136921167373657, "logits/rejected": -1.14936363697052, "logps/chosen": -205.6985321044922, "logps/rejected": -384.5464782714844, "loss": 193.6084, "losses_ref": -2.2517495155334473, "ref_logps/chosen": -317.1262512207031, "ref_logps/rejected": -167.1651153564453, "rewards/accuracies": 1.0, "rewards/chosen": 1.1142771244049072, "rewards/margins": 3.288090467453003, "rewards/rejected": -2.173813581466675, "step": 755, "u": -2.5095248222351074, "weight": 0.04148329049348831 }, { "diff_generated": -223.0001220703125, "epoch": 1.5912064904475267, "grad_norm": 1315.9291386894508, "learning_rate": 2.418577049328058e-07, "logits/chosen": -1.6086959838867188, "logits/rejected": -1.2083603143692017, "logps/chosen": -214.2064666748047, "logps/rejected": -383.09918212890625, "loss": 193.4667, "losses_ref": -0.697050929069519, "ref_logps/chosen": -327.52081298828125, "ref_logps/rejected": -160.09909057617188, "rewards/accuracies": 1.0, "rewards/chosen": 1.1331430673599243, "rewards/margins": 3.3631443977355957, "rewards/rejected": -2.2300009727478027, "step": 760, "u": -3.4212822914123535, "weight": 0.02071799524128437 }, { "diff_generated": -240.10708618164062, "epoch": 1.6016749542004711, "grad_norm": 1354.8278973512276, "learning_rate": 2.300472019387697e-07, "logits/chosen": -1.3740001916885376, "logits/rejected": -1.2972242832183838, "logps/chosen": -184.8181915283203, "logps/rejected": -400.69439697265625, "loss": 183.1763, "losses_ref": -5.5122270584106445, "ref_logps/chosen": -284.5431213378906, "ref_logps/rejected": -160.58731079101562, "rewards/accuracies": 1.0, "rewards/chosen": 0.9972493052482605, "rewards/margins": 3.398320436477661, "rewards/rejected": -2.4010708332061768, "step": 765, "u": -2.9249844551086426, "weight": 0.04190880060195923 }, { "diff_generated": -224.24789428710938, "epoch": 1.6121434179534153, "grad_norm": 1294.896383811541, "learning_rate": 2.1849475683932994e-07, "logits/chosen": -1.3714028596878052, "logits/rejected": -1.3127011060714722, "logps/chosen": -184.06544494628906, "logps/rejected": -384.2123107910156, "loss": 179.5198, "losses_ref": -3.6349315643310547, "ref_logps/chosen": -284.44268798828125, "ref_logps/rejected": -159.9644012451172, "rewards/accuracies": 1.0, "rewards/chosen": 1.003772497177124, "rewards/margins": 3.246250867843628, "rewards/rejected": -2.242478847503662, "step": 770, "u": -2.7088732719421387, "weight": 0.04078099876642227 }, { "diff_generated": -228.37911987304688, "epoch": 1.6226118817063595, "grad_norm": 1315.1478573927377, "learning_rate": 2.0720424155971038e-07, "logits/chosen": -1.4367603063583374, "logits/rejected": -1.2870023250579834, "logps/chosen": -201.5555877685547, "logps/rejected": -386.1324157714844, "loss": 176.5013, "losses_ref": -2.8903164863586426, "ref_logps/chosen": -306.54461669921875, "ref_logps/rejected": -157.75328063964844, "rewards/accuracies": 1.0, "rewards/chosen": 1.0498902797698975, "rewards/margins": 3.333681583404541, "rewards/rejected": -2.2837913036346436, "step": 775, "u": -2.703965902328491, "weight": 0.04053039103746414 }, { "diff_generated": -220.4508819580078, "epoch": 1.633080345459304, "grad_norm": 1400.50539955428, "learning_rate": 1.961794402365611e-07, "logits/chosen": -1.4036462306976318, "logits/rejected": -1.2919548749923706, "logps/chosen": -200.26541137695312, "logps/rejected": -386.81597900390625, "loss": 183.6931, "losses_ref": -1.8775193691253662, "ref_logps/chosen": -310.53729248046875, "ref_logps/rejected": -166.36508178710938, "rewards/accuracies": 1.0, "rewards/chosen": 1.1027185916900635, "rewards/margins": 3.307227373123169, "rewards/rejected": -2.2045087814331055, "step": 780, "u": -2.835704803466797, "weight": 0.031143631786108017 }, { "diff_generated": -217.3331298828125, "epoch": 1.643548809212248, "grad_norm": 1301.0844819616188, "learning_rate": 1.8542404794966427e-07, "logits/chosen": -1.4641870260238647, "logits/rejected": -1.3147245645523071, "logps/chosen": -196.31103515625, "logps/rejected": -391.39166259765625, "loss": 178.2437, "losses_ref": -1.2605804204940796, "ref_logps/chosen": -303.4082336425781, "ref_logps/rejected": -174.0585479736328, "rewards/accuracies": 1.0, "rewards/chosen": 1.070972204208374, "rewards/margins": 3.2443034648895264, "rewards/rejected": -2.1733312606811523, "step": 785, "u": -2.590919017791748, "weight": 0.01949651725590229 }, { "diff_generated": -220.4461669921875, "epoch": 1.6540172729651923, "grad_norm": 1297.8083100097251, "learning_rate": 1.7494166948349053e-07, "logits/chosen": -1.3500601053237915, "logits/rejected": -1.411941409111023, "logps/chosen": -159.91616821289062, "logps/rejected": -383.6579284667969, "loss": 166.2805, "losses_ref": -1.1791235208511353, "ref_logps/chosen": -257.8923034667969, "ref_logps/rejected": -163.21176147460938, "rewards/accuracies": 1.0, "rewards/chosen": 0.9797613024711609, "rewards/margins": 3.184222936630249, "rewards/rejected": -2.2044615745544434, "step": 790, "u": -3.492673873901367, "weight": 0.020172851160168648 }, { "diff_generated": -218.501708984375, "epoch": 1.6644857367181367, "grad_norm": 1305.6902286203212, "learning_rate": 1.6473581811901528e-07, "logits/chosen": -1.3759443759918213, "logits/rejected": -1.3116881847381592, "logps/chosen": -175.59524536132812, "logps/rejected": -386.4131774902344, "loss": 166.0248, "losses_ref": -0.9349870681762695, "ref_logps/chosen": -275.24603271484375, "ref_logps/rejected": -167.91146850585938, "rewards/accuracies": 1.0, "rewards/chosen": 0.9965084791183472, "rewards/margins": 3.181525468826294, "rewards/rejected": -2.1850171089172363, "step": 795, "u": -3.1951217651367188, "weight": 0.007377298083156347 }, { "diff_generated": -227.8848114013672, "epoch": 1.674954200471081, "grad_norm": 1377.3447203192195, "learning_rate": 1.5480991445620538e-07, "logits/chosen": -1.3294823169708252, "logits/rejected": -1.3292287588119507, "logps/chosen": -171.1267852783203, "logps/rejected": -383.339111328125, "loss": 179.9502, "losses_ref": -1.4068111181259155, "ref_logps/chosen": -269.3274841308594, "ref_logps/rejected": -155.45433044433594, "rewards/accuracies": 1.0, "rewards/chosen": 0.9820070266723633, "rewards/margins": 3.260855197906494, "rewards/rejected": -2.278848171234131, "step": 800, "u": -3.075801134109497, "weight": 0.02200758084654808 }, { "diff_generated": -223.35498046875, "epoch": 1.685422664224025, "grad_norm": 1269.6278028028526, "learning_rate": 1.4516728526756873e-07, "logits/chosen": -1.4065078496932983, "logits/rejected": -1.2835044860839844, "logps/chosen": -182.1883544921875, "logps/rejected": -374.7066650390625, "loss": 186.9203, "losses_ref": -2.037257671356201, "ref_logps/chosen": -276.4019470214844, "ref_logps/rejected": -151.35165405273438, "rewards/accuracies": 1.0, "rewards/chosen": 0.9421361684799194, "rewards/margins": 3.1756858825683594, "rewards/rejected": -2.2335495948791504, "step": 805, "u": -2.378087282180786, "weight": 0.04411940649151802 }, { "diff_generated": -216.16616821289062, "epoch": 1.6958911279769695, "grad_norm": 1463.8714206417467, "learning_rate": 1.3581116238315194e-07, "logits/chosen": -1.4423078298568726, "logits/rejected": -1.3139569759368896, "logps/chosen": -205.9932098388672, "logps/rejected": -375.70849609375, "loss": 190.2176, "losses_ref": -1.2827723026275635, "ref_logps/chosen": -311.7004699707031, "ref_logps/rejected": -159.54234313964844, "rewards/accuracies": 0.996874988079071, "rewards/chosen": 1.057072639465332, "rewards/margins": 3.2187340259552, "rewards/rejected": -2.161661386489868, "step": 810, "u": -2.773268938064575, "weight": 0.022295668721199036 }, { "diff_generated": -213.4055633544922, "epoch": 1.7063595917299135, "grad_norm": 1352.148808645479, "learning_rate": 1.2674468160735586e-07, "logits/chosen": -1.4077790975570679, "logits/rejected": -1.3166415691375732, "logps/chosen": -177.383544921875, "logps/rejected": -373.1116027832031, "loss": 179.0974, "losses_ref": -3.5087268352508545, "ref_logps/chosen": -279.3453674316406, "ref_logps/rejected": -159.70603942871094, "rewards/accuracies": 1.0, "rewards/chosen": 1.019618034362793, "rewards/margins": 3.1536736488342285, "rewards/rejected": -2.1340556144714355, "step": 815, "u": -2.615370512008667, "weight": 0.0508296899497509 }, { "diff_generated": -237.0354766845703, "epoch": 1.7168280554828579, "grad_norm": 1326.9582054025304, "learning_rate": 1.1797088166794e-07, "logits/chosen": -1.328039288520813, "logits/rejected": -1.2903969287872314, "logps/chosen": -176.13819885253906, "logps/rejected": -401.98748779296875, "loss": 179.7547, "losses_ref": -0.005425100214779377, "ref_logps/chosen": -275.8017883300781, "ref_logps/rejected": -164.95204162597656, "rewards/accuracies": 1.0, "rewards/chosen": 0.9966354370117188, "rewards/margins": 3.366990327835083, "rewards/rejected": -2.370354652404785, "step": 820, "u": -2.696533679962158, "weight": 3.794050280703232e-05 }, { "diff_generated": -219.973876953125, "epoch": 1.7272965192358023, "grad_norm": 1183.8442331623387, "learning_rate": 1.0949270319755766e-07, "logits/chosen": -1.3806655406951904, "logits/rejected": -1.337877631187439, "logps/chosen": -167.13290405273438, "logps/rejected": -381.6925048828125, "loss": 173.9734, "losses_ref": -2.8696396350860596, "ref_logps/chosen": -262.41363525390625, "ref_logps/rejected": -161.7186279296875, "rewards/accuracies": 1.0, "rewards/chosen": 0.9528074264526367, "rewards/margins": 3.1525461673736572, "rewards/rejected": -2.1997389793395996, "step": 825, "u": -2.7405786514282227, "weight": 0.02941594459116459 }, { "diff_generated": -211.63803100585938, "epoch": 1.7377649829887463, "grad_norm": 1227.5586469104078, "learning_rate": 1.013129877481741e-07, "logits/chosen": -1.3626017570495605, "logits/rejected": -1.199372410774231, "logps/chosen": -211.2673797607422, "logps/rejected": -382.85003662109375, "loss": 185.7144, "losses_ref": -5.601190090179443, "ref_logps/chosen": -318.6112060546875, "ref_logps/rejected": -171.2120361328125, "rewards/accuracies": 1.0, "rewards/chosen": 1.0734381675720215, "rewards/margins": 3.1898186206817627, "rewards/rejected": -2.116380214691162, "step": 830, "u": -2.8598952293395996, "weight": 0.029378216713666916 }, { "diff_generated": -230.986328125, "epoch": 1.7482334467416907, "grad_norm": 1290.8717428712873, "learning_rate": 9.343447683868799e-08, "logits/chosen": -1.2116000652313232, "logits/rejected": -1.2751588821411133, "logps/chosen": -169.79380798339844, "logps/rejected": -394.716064453125, "loss": 178.4699, "losses_ref": -0.9493634104728699, "ref_logps/chosen": -262.22894287109375, "ref_logps/rejected": -163.72976684570312, "rewards/accuracies": 1.0, "rewards/chosen": 0.9243512153625488, "rewards/margins": 3.2342143058776855, "rewards/rejected": -2.309863328933716, "step": 835, "u": -2.9001994132995605, "weight": 0.008882230147719383 }, { "diff_generated": -216.50314331054688, "epoch": 1.7587019104946349, "grad_norm": 1335.6008361033998, "learning_rate": 8.585981103608342e-08, "logits/chosen": -1.3362239599227905, "logits/rejected": -1.1397970914840698, "logps/chosen": -206.77511596679688, "logps/rejected": -389.9546203613281, "loss": 191.1818, "losses_ref": -0.28884872794151306, "ref_logps/chosen": -316.14837646484375, "ref_logps/rejected": -173.45150756835938, "rewards/accuracies": 1.0, "rewards/chosen": 1.0937325954437256, "rewards/margins": 3.2587637901306152, "rewards/rejected": -2.1650314331054688, "step": 840, "u": -3.1953749656677246, "weight": 0.004329306539148092 }, { "diff_generated": -202.93106079101562, "epoch": 1.769170374247579, "grad_norm": 1266.3673749218208, "learning_rate": 7.859152907041544e-08, "logits/chosen": -1.354994773864746, "logits/rejected": -1.1393146514892578, "logps/chosen": -199.24710083007812, "logps/rejected": -360.2781677246094, "loss": 176.0576, "losses_ref": -1.7230793237686157, "ref_logps/chosen": -305.8094177246094, "ref_logps/rejected": -157.3471221923828, "rewards/accuracies": 1.0, "rewards/chosen": 1.06562340259552, "rewards/margins": 3.0949339866638184, "rewards/rejected": -2.029310464859009, "step": 845, "u": -2.745694160461426, "weight": 0.0367230661213398 }, { "diff_generated": -211.84765625, "epoch": 1.7796388380005235, "grad_norm": 1302.5644299154628, "learning_rate": 7.163206698392742e-08, "logits/chosen": -1.2949212789535522, "logits/rejected": -1.1885995864868164, "logps/chosen": -185.09088134765625, "logps/rejected": -367.0600891113281, "loss": 184.3042, "losses_ref": -3.0929312705993652, "ref_logps/chosen": -285.8619384765625, "ref_logps/rejected": -155.21240234375, "rewards/accuracies": 1.0, "rewards/chosen": 1.0077106952667236, "rewards/margins": 3.1261868476867676, "rewards/rejected": -2.118476629257202, "step": 850, "u": -3.2007651329040527, "weight": 0.03339768201112747 }, { "diff_generated": -209.816650390625, "epoch": 1.7901073017534677, "grad_norm": 1345.4005628593675, "learning_rate": 6.498375731458527e-08, "logits/chosen": -1.4427772760391235, "logits/rejected": -1.2521936893463135, "logps/chosen": -190.9636688232422, "logps/rejected": -376.20391845703125, "loss": 177.6342, "losses_ref": -2.1869561672210693, "ref_logps/chosen": -298.1745300292969, "ref_logps/rejected": -166.38723754882812, "rewards/accuracies": 1.0, "rewards/chosen": 1.072108507156372, "rewards/margins": 3.1702747344970703, "rewards/rejected": -2.098165988922119, "step": 855, "u": -3.108565330505371, "weight": 0.026614084839820862 }, { "diff_generated": -229.3601837158203, "epoch": 1.8005757655064119, "grad_norm": 1261.2192787306672, "learning_rate": 5.8648828314302735e-08, "logits/chosen": -1.3119590282440186, "logits/rejected": -1.1316639184951782, "logps/chosen": -186.41650390625, "logps/rejected": -386.34906005859375, "loss": 176.3818, "losses_ref": -2.3852286338806152, "ref_logps/chosen": -289.1662292480469, "ref_logps/rejected": -156.98886108398438, "rewards/accuracies": 1.0, "rewards/chosen": 1.0274972915649414, "rewards/margins": 3.3210995197296143, "rewards/rejected": -2.2936015129089355, "step": 860, "u": -2.8182337284088135, "weight": 0.03502316027879715 }, { "diff_generated": -210.816650390625, "epoch": 1.8110442292593563, "grad_norm": 1269.4818113722586, "learning_rate": 5.2629403202119505e-08, "logits/chosen": -1.2412734031677246, "logits/rejected": -1.227634072303772, "logps/chosen": -173.3083953857422, "logps/rejected": -375.7776184082031, "loss": 171.0543, "losses_ref": -0.6853199005126953, "ref_logps/chosen": -271.1694030761719, "ref_logps/rejected": -164.96096801757812, "rewards/accuracies": 1.0, "rewards/chosen": 0.9786099195480347, "rewards/margins": 3.0867760181427, "rewards/rejected": -2.108166217803955, "step": 865, "u": -3.394763231277466, "weight": 0.010494846850633621 }, { "diff_generated": -226.4854736328125, "epoch": 1.8215126930123005, "grad_norm": 1268.655880675009, "learning_rate": 4.692749945258057e-08, "logits/chosen": -1.3430616855621338, "logits/rejected": -1.1744420528411865, "logps/chosen": -195.01589965820312, "logps/rejected": -389.9505920410156, "loss": 186.5683, "losses_ref": -3.434800624847412, "ref_logps/chosen": -299.091796875, "ref_logps/rejected": -163.46514892578125, "rewards/accuracies": 1.0, "rewards/chosen": 1.0407590866088867, "rewards/margins": 3.3056137561798096, "rewards/rejected": -2.2648544311523438, "step": 870, "u": -2.5695509910583496, "weight": 0.046660859137773514 }, { "diff_generated": -236.5417022705078, "epoch": 1.8319811567652446, "grad_norm": 1204.436962297953, "learning_rate": 4.1545028119559066e-08, "logits/chosen": -1.3133630752563477, "logits/rejected": -1.3207045793533325, "logps/chosen": -190.3129425048828, "logps/rejected": -398.3179931640625, "loss": 171.6172, "losses_ref": -1.358794927597046, "ref_logps/chosen": -287.73504638671875, "ref_logps/rejected": -161.77627563476562, "rewards/accuracies": 1.0, "rewards/chosen": 0.974220871925354, "rewards/margins": 3.339637279510498, "rewards/rejected": -2.365417003631592, "step": 875, "u": -2.7414660453796387, "weight": 0.022162286564707756 }, { "diff_generated": -210.89306640625, "epoch": 1.842449620518189, "grad_norm": 1230.1188284044147, "learning_rate": 3.648379319574568e-08, "logits/chosen": -1.383299708366394, "logits/rejected": -1.3287036418914795, "logps/chosen": -190.19691467285156, "logps/rejected": -363.8573913574219, "loss": 168.7976, "losses_ref": -4.205277442932129, "ref_logps/chosen": -291.4452819824219, "ref_logps/rejected": -152.96432495117188, "rewards/accuracies": 1.0, "rewards/chosen": 1.0124839544296265, "rewards/margins": 3.1214146614074707, "rewards/rejected": -2.1089303493499756, "step": 880, "u": -2.69191312789917, "weight": 0.03942141681909561 }, { "diff_generated": -224.0536346435547, "epoch": 1.8529180842711332, "grad_norm": 1317.8866979194424, "learning_rate": 3.17454910080216e-08, "logits/chosen": -1.387369155883789, "logits/rejected": -1.256730318069458, "logps/chosen": -213.5888671875, "logps/rejected": -388.00115966796875, "loss": 200.2688, "losses_ref": -0.6602109670639038, "ref_logps/chosen": -319.39569091796875, "ref_logps/rejected": -163.94747924804688, "rewards/accuracies": 1.0, "rewards/chosen": 1.058068037033081, "rewards/margins": 3.2986044883728027, "rewards/rejected": -2.2405362129211426, "step": 885, "u": -2.7355685234069824, "weight": 0.029411468654870987 }, { "diff_generated": -221.8759765625, "epoch": 1.8633865480240774, "grad_norm": 1306.8555947562052, "learning_rate": 2.733170964891607e-08, "logits/chosen": -1.3195066452026367, "logits/rejected": -1.2867323160171509, "logps/chosen": -170.53369140625, "logps/rejected": -378.52935791015625, "loss": 174.36, "losses_ref": -0.899361252784729, "ref_logps/chosen": -274.72943115234375, "ref_logps/rejected": -156.65335083007812, "rewards/accuracies": 1.0, "rewards/chosen": 1.0419572591781616, "rewards/margins": 3.2607169151306152, "rewards/rejected": -2.218759775161743, "step": 890, "u": -3.3832144737243652, "weight": 0.008660494349896908 }, { "diff_generated": -214.10165405273438, "epoch": 1.8738550117770219, "grad_norm": 1275.7234308585855, "learning_rate": 2.324392844434042e-08, "logits/chosen": -1.3565282821655273, "logits/rejected": -1.344678282737732, "logps/chosen": -192.53738403320312, "logps/rejected": -390.2491149902344, "loss": 191.1614, "losses_ref": -2.9138152599334717, "ref_logps/chosen": -295.70458984375, "ref_logps/rejected": -176.14747619628906, "rewards/accuracies": 1.0, "rewards/chosen": 1.0316721200942993, "rewards/margins": 3.1726887226104736, "rewards/rejected": -2.141016721725464, "step": 895, "u": -3.1400444507598877, "weight": 0.02205641008913517 }, { "diff_generated": -221.6460418701172, "epoch": 1.8843234755299658, "grad_norm": 1242.4363921596732, "learning_rate": 1.9483517457776434e-08, "logits/chosen": -1.1762725114822388, "logits/rejected": -1.3724615573883057, "logps/chosen": -159.86691284179688, "logps/rejected": -381.15081787109375, "loss": 172.6295, "losses_ref": -4.887435436248779, "ref_logps/chosen": -252.33743286132812, "ref_logps/rejected": -159.50479125976562, "rewards/accuracies": 1.0, "rewards/chosen": 0.9247050285339355, "rewards/margins": 3.1411654949188232, "rewards/rejected": -2.216460704803467, "step": 900, "u": -2.2665815353393555, "weight": 0.07192285358905792 }, { "diff_generated": -227.669189453125, "epoch": 1.8947919392829102, "grad_norm": 1323.6799372011517, "learning_rate": 1.6051737031084533e-08, "logits/chosen": -1.2494432926177979, "logits/rejected": -1.1595919132232666, "logps/chosen": -175.1837921142578, "logps/rejected": -384.48175048828125, "loss": 174.3896, "losses_ref": -1.007882833480835, "ref_logps/chosen": -276.83319091796875, "ref_logps/rejected": -156.81253051757812, "rewards/accuracies": 1.0, "rewards/chosen": 1.0164941549301147, "rewards/margins": 3.2931861877441406, "rewards/rejected": -2.2766921520233154, "step": 905, "u": -3.0982091426849365, "weight": 0.01854753866791725 }, { "diff_generated": -222.00784301757812, "epoch": 1.9052604030358546, "grad_norm": 1353.0501425434295, "learning_rate": 1.2949737362087154e-08, "logits/chosen": -1.222752332687378, "logits/rejected": -1.265421986579895, "logps/chosen": -173.27577209472656, "logps/rejected": -388.85797119140625, "loss": 174.8498, "losses_ref": -6.1348981857299805, "ref_logps/chosen": -269.9849853515625, "ref_logps/rejected": -166.85018920898438, "rewards/accuracies": 1.0, "rewards/chosen": 0.9670922160148621, "rewards/margins": 3.1871705055236816, "rewards/rejected": -2.220078229904175, "step": 910, "u": -2.5107998847961426, "weight": 0.06687295436859131 }, { "diff_generated": -211.4918975830078, "epoch": 1.9157288667887986, "grad_norm": 1286.3307044665144, "learning_rate": 1.0178558119067315e-08, "logits/chosen": -1.2266263961791992, "logits/rejected": -1.0511000156402588, "logps/chosen": -177.09149169921875, "logps/rejected": -372.6114807128906, "loss": 175.9135, "losses_ref": -0.7255733609199524, "ref_logps/chosen": -277.30194091796875, "ref_logps/rejected": -161.1195831298828, "rewards/accuracies": 1.0, "rewards/chosen": 1.0021045207977295, "rewards/margins": 3.11702299118042, "rewards/rejected": -2.1149187088012695, "step": 915, "u": -3.0817387104034424, "weight": 0.014788592234253883 }, { "diff_generated": -220.1043701171875, "epoch": 1.926197330541743, "grad_norm": 1287.429219240266, "learning_rate": 7.739128092312918e-09, "logits/chosen": -1.3375459909439087, "logits/rejected": -1.274279236793518, "logps/chosen": -181.00665283203125, "logps/rejected": -377.59088134765625, "loss": 171.8915, "losses_ref": -1.6772384643554688, "ref_logps/chosen": -280.682861328125, "ref_logps/rejected": -157.4865264892578, "rewards/accuracies": 1.0, "rewards/chosen": 0.9967617988586426, "rewards/margins": 3.197805881500244, "rewards/rejected": -2.2010436058044434, "step": 920, "u": -2.880985736846924, "weight": 0.036544255912303925 }, { "diff_generated": -222.374755859375, "epoch": 1.9366657942946872, "grad_norm": 1348.176434591513, "learning_rate": 5.632264882822757e-09, "logits/chosen": -1.3248652219772339, "logits/rejected": -1.2289717197418213, "logps/chosen": -187.19947814941406, "logps/rejected": -382.12860107421875, "loss": 186.23, "losses_ref": -2.8856143951416016, "ref_logps/chosen": -288.6744079589844, "ref_logps/rejected": -159.7538299560547, "rewards/accuracies": 1.0, "rewards/chosen": 1.0147496461868286, "rewards/margins": 3.238497257232666, "rewards/rejected": -2.223747730255127, "step": 925, "u": -2.8917412757873535, "weight": 0.024965789169073105 }, { "diff_generated": -213.1198272705078, "epoch": 1.9471342580476314, "grad_norm": 1395.889446013467, "learning_rate": 3.858674628278824e-09, "logits/chosen": -1.366350531578064, "logits/rejected": -1.119940996170044, "logps/chosen": -188.4399871826172, "logps/rejected": -371.8886413574219, "loss": 182.8113, "losses_ref": -5.109557151794434, "ref_logps/chosen": -294.67010498046875, "ref_logps/rejected": -158.7688446044922, "rewards/accuracies": 1.0, "rewards/chosen": 1.0623013973236084, "rewards/margins": 3.1934995651245117, "rewards/rejected": -2.1311981678009033, "step": 930, "u": -2.426971435546875, "weight": 0.053149282932281494 }, { "diff_generated": -237.7774200439453, "epoch": 1.9576027218005758, "grad_norm": 1267.4031070589224, "learning_rate": 2.418951766376742e-09, "logits/chosen": -1.2219622135162354, "logits/rejected": -1.2400046586990356, "logps/chosen": -167.6567840576172, "logps/rejected": -398.30865478515625, "loss": 180.7217, "losses_ref": -5.82874059677124, "ref_logps/chosen": -267.5108947753906, "ref_logps/rejected": -160.53125, "rewards/accuracies": 1.0, "rewards/chosen": 0.9985405802726746, "rewards/margins": 3.376314878463745, "rewards/rejected": -2.377774238586426, "step": 935, "u": -2.8524553775787354, "weight": 0.05332515761256218 }, { "diff_generated": -221.6811981201172, "epoch": 1.96807118555352, "grad_norm": 1234.1312151479083, "learning_rate": 1.313578835593465e-09, "logits/chosen": -1.3167364597320557, "logits/rejected": -1.0956764221191406, "logps/chosen": -202.79949951171875, "logps/rejected": -389.28814697265625, "loss": 183.0365, "losses_ref": -1.295898199081421, "ref_logps/chosen": -312.72149658203125, "ref_logps/rejected": -167.60696411132812, "rewards/accuracies": 1.0, "rewards/chosen": 1.0992200374603271, "rewards/margins": 3.3160319328308105, "rewards/rejected": -2.2168118953704834, "step": 940, "u": -2.668116569519043, "weight": 0.01472543366253376 }, { "diff_generated": -209.20425415039062, "epoch": 1.9785396493064642, "grad_norm": 1327.2259702611773, "learning_rate": 5.429263134594242e-10, "logits/chosen": -1.298588514328003, "logits/rejected": -1.3200442790985107, "logps/chosen": -177.170654296875, "logps/rejected": -369.2535705566406, "loss": 179.5952, "losses_ref": -4.355043888092041, "ref_logps/chosen": -273.0224304199219, "ref_logps/rejected": -160.04933166503906, "rewards/accuracies": 1.0, "rewards/chosen": 0.9585177302360535, "rewards/margins": 3.050560474395752, "rewards/rejected": -2.0920424461364746, "step": 945, "u": -2.591240406036377, "weight": 0.0508696511387825 }, { "diff_generated": -218.85842895507812, "epoch": 1.9890081130594086, "grad_norm": 1215.4951947566592, "learning_rate": 1.0725249238940915e-10, "logits/chosen": -1.3104689121246338, "logits/rejected": -1.166074514389038, "logps/chosen": -190.97283935546875, "logps/rejected": -377.3951110839844, "loss": 185.4394, "losses_ref": -1.4506399631500244, "ref_logps/chosen": -288.34576416015625, "ref_logps/rejected": -158.53671264648438, "rewards/accuracies": 1.0, "rewards/chosen": 0.9737294316291809, "rewards/margins": 3.162313938140869, "rewards/rejected": -2.188584089279175, "step": 950, "u": -3.3431270122528076, "weight": 0.014722567982971668 } ], "logging_steps": 5, "max_steps": 954, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }