diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4044 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9973828840617638, + "eval_steps": 500, + "global_step": 954, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "diff_generated": 0.0, + "epoch": 0.002093692750588851, + "grad_norm": 4027.4986845337753, + "learning_rate": 2.083333333333333e-08, + "logits/chosen": -2.1441590785980225, + "logits/rejected": -2.0543735027313232, + "logps/chosen": -276.82366943359375, + "logps/rejected": -131.32485961914062, + "loss": 140.2437, + "losses_ref": -131.32485961914062, + "ref_logps/chosen": -276.82366943359375, + "ref_logps/rejected": -131.32485961914062, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "u": 1.4901161193847656e-08, + "weight": 1.0 + }, + { + "diff_generated": 0.004567362368106842, + "epoch": 0.010468463752944255, + "grad_norm": 4012.8373505662616, + "learning_rate": 1.0416666666666667e-07, + "logits/chosen": -2.2097952365875244, + "logits/rejected": -2.1078758239746094, + "logps/chosen": -280.6259460449219, + "logps/rejected": -162.3510284423828, + "loss": 129.4337, + "losses_ref": -163.54556274414062, + "ref_logps/chosen": -280.68133544921875, + "ref_logps/rejected": -162.3555908203125, + "rewards/accuracies": 0.43359375, + "rewards/chosen": 0.000553958467207849, + "rewards/margins": 0.0005082848947495222, + "rewards/rejected": 4.567361975205131e-05, + "step": 5, + "u": 0.01998738758265972, + "weight": 1.0011132955551147 + }, + { + "diff_generated": -0.883712887763977, + "epoch": 0.02093692750588851, + "grad_norm": 3617.405413942307, + "learning_rate": 2.0833333333333333e-07, + "logits/chosen": -2.355677843093872, + "logits/rejected": -2.1583828926086426, + "logps/chosen": -302.09747314453125, + "logps/rejected": -169.69467163085938, + "loss": 157.3847, + "losses_ref": -137.87350463867188, + "ref_logps/chosen": -302.58917236328125, + "ref_logps/rejected": -168.81094360351562, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.004917326383292675, + "rewards/margins": 0.01375445444136858, + "rewards/rejected": -0.00883712898939848, + "step": 10, + "u": -0.573723316192627, + "weight": 0.8237913250923157 + }, + { + "diff_generated": -3.757080078125, + "epoch": 0.031405391258832765, + "grad_norm": 3487.9086553330885, + "learning_rate": 3.1249999999999997e-07, + "logits/chosen": -2.285557270050049, + "logits/rejected": -2.1396851539611816, + "logps/chosen": -299.9487609863281, + "logps/rejected": -166.72817993164062, + "loss": 215.9423, + "losses_ref": -61.32612991333008, + "ref_logps/chosen": -304.54766845703125, + "ref_logps/rejected": -162.97108459472656, + "rewards/accuracies": 0.984375, + "rewards/chosen": 0.0459887757897377, + "rewards/margins": 0.08355957269668579, + "rewards/rejected": -0.03757079690694809, + "step": 15, + "u": -1.074953317642212, + "weight": 0.4649723172187805 + }, + { + "diff_generated": -13.927907943725586, + "epoch": 0.04187385501177702, + "grad_norm": 2892.3287332168984, + "learning_rate": 4.1666666666666667e-07, + "logits/chosen": -2.246702194213867, + "logits/rejected": -2.1279449462890625, + "logps/chosen": -267.1871337890625, + "logps/rejected": -170.03897094726562, + "loss": 233.0012, + "losses_ref": -32.27024459838867, + "ref_logps/chosen": -283.3597106933594, + "ref_logps/rejected": -156.11105346679688, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": 0.16172581911087036, + "rewards/margins": 0.30100491642951965, + "rewards/rejected": -0.1392790973186493, + "step": 20, + "u": -0.5478723049163818, + "weight": 0.3134520649909973 + }, + { + "diff_generated": -26.503625869750977, + "epoch": 0.05234231876472128, + "grad_norm": 2024.707131865886, + "learning_rate": 5.208333333333334e-07, + "logits/chosen": -2.209564447402954, + "logits/rejected": -2.0659689903259277, + "logps/chosen": -255.67092895507812, + "logps/rejected": -183.784423828125, + "loss": 225.1278, + "losses_ref": -30.188289642333984, + "ref_logps/chosen": -280.2396545410156, + "ref_logps/rejected": -157.2808074951172, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.245687335729599, + "rewards/margins": 0.5107235908508301, + "rewards/rejected": -0.2650362551212311, + "step": 25, + "u": 0.18020522594451904, + "weight": 0.2832922041416168 + }, + { + "diff_generated": -51.14558792114258, + "epoch": 0.06281078251766553, + "grad_norm": 1518.1810592262389, + "learning_rate": 6.249999999999999e-07, + "logits/chosen": -2.2818737030029297, + "logits/rejected": -2.199028968811035, + "logps/chosen": -243.2410888671875, + "logps/rejected": -215.5212860107422, + "loss": 229.1218, + "losses_ref": -20.79702377319336, + "ref_logps/chosen": -273.4181823730469, + "ref_logps/rejected": -164.37570190429688, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": 0.30177104473114014, + "rewards/margins": 0.8132268786430359, + "rewards/rejected": -0.5114558935165405, + "step": 30, + "u": -0.08460383862257004, + "weight": 0.19152367115020752 + }, + { + "diff_generated": -66.31632995605469, + "epoch": 0.07327924627060979, + "grad_norm": 1482.6172349050332, + "learning_rate": 7.291666666666666e-07, + "logits/chosen": -2.2653889656066895, + "logits/rejected": -2.1242835521698, + "logps/chosen": -249.3292999267578, + "logps/rejected": -223.139892578125, + "loss": 228.9043, + "losses_ref": -19.583892822265625, + "ref_logps/chosen": -282.82373046875, + "ref_logps/rejected": -156.8235626220703, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": 0.3349445164203644, + "rewards/margins": 0.9981077909469604, + "rewards/rejected": -0.6631633043289185, + "step": 35, + "u": 0.06723131239414215, + "weight": 0.2029893398284912 + }, + { + "diff_generated": -101.70452880859375, + "epoch": 0.08374771002355404, + "grad_norm": 1747.512023088969, + "learning_rate": 8.333333333333333e-07, + "logits/chosen": -2.109070062637329, + "logits/rejected": -2.079871654510498, + "logps/chosen": -237.7236328125, + "logps/rejected": -262.9115905761719, + "loss": 238.8995, + "losses_ref": -15.8267822265625, + "ref_logps/chosen": -272.7063903808594, + "ref_logps/rejected": -161.20706176757812, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.34982770681381226, + "rewards/margins": 1.366873025894165, + "rewards/rejected": -1.017045259475708, + "step": 40, + "u": -1.1587042808532715, + "weight": 0.09851591289043427 + }, + { + "diff_generated": -117.0851058959961, + "epoch": 0.0942161737764983, + "grad_norm": 1667.7557707134451, + "learning_rate": 9.374999999999999e-07, + "logits/chosen": -2.20316219329834, + "logits/rejected": -2.008223295211792, + "logps/chosen": -257.76983642578125, + "logps/rejected": -278.8745422363281, + "loss": 239.9967, + "losses_ref": -20.097864151000977, + "ref_logps/chosen": -293.736083984375, + "ref_logps/rejected": -161.78945922851562, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": 0.3596626818180084, + "rewards/margins": 1.530513882637024, + "rewards/rejected": -1.1708511114120483, + "step": 45, + "u": -0.12792688608169556, + "weight": 0.16130205988883972 + }, + { + "diff_generated": -126.9466781616211, + "epoch": 0.10468463752944256, + "grad_norm": 1521.2094097818665, + "learning_rate": 1.0416666666666667e-06, + "logits/chosen": -2.1982452869415283, + "logits/rejected": -2.1284544467926025, + "logps/chosen": -232.5095977783203, + "logps/rejected": -295.5307922363281, + "loss": 224.3866, + "losses_ref": -21.150318145751953, + "ref_logps/chosen": -270.96405029296875, + "ref_logps/rejected": -168.58413696289062, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": 0.38454434275627136, + "rewards/margins": 1.6540111303329468, + "rewards/rejected": -1.2694666385650635, + "step": 50, + "u": 0.001223707222379744, + "weight": 0.18796880543231964 + }, + { + "diff_generated": -141.50799560546875, + "epoch": 0.11515310128238682, + "grad_norm": 1612.8192197434123, + "learning_rate": 1.1458333333333333e-06, + "logits/chosen": -2.0737013816833496, + "logits/rejected": -1.9873807430267334, + "logps/chosen": -239.891357421875, + "logps/rejected": -311.09619140625, + "loss": 220.8677, + "losses_ref": -7.660050392150879, + "ref_logps/chosen": -280.08502197265625, + "ref_logps/rejected": -169.58819580078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.40193670988082886, + "rewards/margins": 1.8170166015625, + "rewards/rejected": -1.4150798320770264, + "step": 55, + "u": -0.9630683660507202, + "weight": 0.08691856265068054 + }, + { + "diff_generated": -137.93148803710938, + "epoch": 0.12562156503533106, + "grad_norm": 1372.8553226775107, + "learning_rate": 1.2499999999999999e-06, + "logits/chosen": -1.9770643711090088, + "logits/rejected": -1.8704265356063843, + "logps/chosen": -242.3487091064453, + "logps/rejected": -295.7236633300781, + "loss": 226.417, + "losses_ref": -8.987265586853027, + "ref_logps/chosen": -281.4112548828125, + "ref_logps/rejected": -157.79214477539062, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.39062565565109253, + "rewards/margins": 1.7699406147003174, + "rewards/rejected": -1.37931489944458, + "step": 60, + "u": -0.9851242303848267, + "weight": 0.08782722800970078 + }, + { + "diff_generated": -155.2223358154297, + "epoch": 0.1360900287882753, + "grad_norm": 1255.5204766616016, + "learning_rate": 1.3541666666666667e-06, + "logits/chosen": -1.9109680652618408, + "logits/rejected": -1.800903081893921, + "logps/chosen": -251.7116241455078, + "logps/rejected": -313.6351318359375, + "loss": 226.6359, + "losses_ref": -6.898039817810059, + "ref_logps/chosen": -291.105224609375, + "ref_logps/rejected": -158.41278076171875, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3939359784126282, + "rewards/margins": 1.9461593627929688, + "rewards/rejected": -1.5522234439849854, + "step": 65, + "u": -1.2434440851211548, + "weight": 0.07695779949426651 + }, + { + "diff_generated": -131.47259521484375, + "epoch": 0.14655849254121958, + "grad_norm": 1343.9563405956512, + "learning_rate": 1.4583333333333333e-06, + "logits/chosen": -1.8604061603546143, + "logits/rejected": -1.8694736957550049, + "logps/chosen": -233.1003875732422, + "logps/rejected": -294.2840881347656, + "loss": 225.3943, + "losses_ref": -10.19434642791748, + "ref_logps/chosen": -274.62811279296875, + "ref_logps/rejected": -162.8114776611328, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": 0.41527730226516724, + "rewards/margins": 1.7300033569335938, + "rewards/rejected": -1.3147261142730713, + "step": 70, + "u": -1.253035545349121, + "weight": 0.09121803939342499 + }, + { + "diff_generated": -137.2784423828125, + "epoch": 0.15702695629416383, + "grad_norm": 1322.5353000176865, + "learning_rate": 1.5624999999999999e-06, + "logits/chosen": -1.800450086593628, + "logits/rejected": -1.649074912071228, + "logps/chosen": -263.3216247558594, + "logps/rejected": -309.0770568847656, + "loss": 233.1299, + "losses_ref": -10.170949935913086, + "ref_logps/chosen": -306.6936950683594, + "ref_logps/rejected": -171.79859924316406, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4337209165096283, + "rewards/margins": 1.8065054416656494, + "rewards/rejected": -1.3727843761444092, + "step": 75, + "u": -1.3111217021942139, + "weight": 0.08406667411327362 + }, + { + "diff_generated": -128.09861755371094, + "epoch": 0.16749542004710807, + "grad_norm": 1343.7990825981688, + "learning_rate": 1.6666666666666667e-06, + "logits/chosen": -1.6293904781341553, + "logits/rejected": -1.653552770614624, + "logps/chosen": -211.403564453125, + "logps/rejected": -288.49102783203125, + "loss": 223.2768, + "losses_ref": -5.209358215332031, + "ref_logps/chosen": -253.810302734375, + "ref_logps/rejected": -160.39239501953125, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": 0.42406734824180603, + "rewards/margins": 1.7050535678863525, + "rewards/rejected": -1.2809861898422241, + "step": 80, + "u": -1.129665732383728, + "weight": 0.05663755536079407 + }, + { + "diff_generated": -135.20687866210938, + "epoch": 0.17796388380005235, + "grad_norm": 1192.5534502285198, + "learning_rate": 1.7708333333333332e-06, + "logits/chosen": -1.573900818824768, + "logits/rejected": -1.4756534099578857, + "logps/chosen": -239.03305053710938, + "logps/rejected": -300.70184326171875, + "loss": 223.0021, + "losses_ref": -7.026658535003662, + "ref_logps/chosen": -282.1534423828125, + "ref_logps/rejected": -165.49496459960938, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.43120384216308594, + "rewards/margins": 1.7832725048065186, + "rewards/rejected": -1.3520687818527222, + "step": 85, + "u": -0.8892000317573547, + "weight": 0.074161596596241 + }, + { + "diff_generated": -148.8050994873047, + "epoch": 0.1884323475529966, + "grad_norm": 1417.0310516206605, + "learning_rate": 1.8749999999999998e-06, + "logits/chosen": -1.3538436889648438, + "logits/rejected": -1.2718507051467896, + "logps/chosen": -234.74856567382812, + "logps/rejected": -304.4095458984375, + "loss": 232.0073, + "losses_ref": -11.248689651489258, + "ref_logps/chosen": -279.6741638183594, + "ref_logps/rejected": -155.60443115234375, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": 0.4492563307285309, + "rewards/margins": 1.937307596206665, + "rewards/rejected": -1.488051176071167, + "step": 90, + "u": -1.1423507928848267, + "weight": 0.08043224364519119 + }, + { + "diff_generated": -148.7802276611328, + "epoch": 0.19890081130594087, + "grad_norm": 1432.3504082681623, + "learning_rate": 1.9791666666666666e-06, + "logits/chosen": -1.1082611083984375, + "logits/rejected": -1.0555765628814697, + "logps/chosen": -235.3373565673828, + "logps/rejected": -309.65771484375, + "loss": 219.1082, + "losses_ref": -13.706560134887695, + "ref_logps/chosen": -277.9019470214844, + "ref_logps/rejected": -160.87747192382812, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4256461262702942, + "rewards/margins": 1.9134483337402344, + "rewards/rejected": -1.487802267074585, + "step": 95, + "u": -1.0568161010742188, + "weight": 0.09002764523029327 + }, + { + "diff_generated": -158.8511962890625, + "epoch": 0.2093692750588851, + "grad_norm": 1374.8147610024293, + "learning_rate": 1.9998927475076105e-06, + "logits/chosen": -0.9869598150253296, + "logits/rejected": -0.8535524606704712, + "logps/chosen": -238.96426391601562, + "logps/rejected": -322.4727783203125, + "loss": 236.8658, + "losses_ref": -5.802731513977051, + "ref_logps/chosen": -282.0462951660156, + "ref_logps/rejected": -163.62156677246094, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.43082040548324585, + "rewards/margins": 2.0193324089050293, + "rewards/rejected": -1.5885119438171387, + "step": 100, + "u": -1.2527328729629517, + "weight": 0.06292165815830231 + }, + { + "diff_generated": -147.18008422851562, + "epoch": 0.21983773881182936, + "grad_norm": 1625.9248559762682, + "learning_rate": 1.9994570736865402e-06, + "logits/chosen": -1.07206392288208, + "logits/rejected": -0.9393303990364075, + "logps/chosen": -232.5029296875, + "logps/rejected": -308.7837829589844, + "loss": 213.8591, + "losses_ref": -10.191104888916016, + "ref_logps/chosen": -275.3525390625, + "ref_logps/rejected": -161.60366821289062, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4284963011741638, + "rewards/margins": 1.9002971649169922, + "rewards/rejected": -1.4718010425567627, + "step": 105, + "u": -1.0033910274505615, + "weight": 0.10204311460256577 + }, + { + "diff_generated": -128.29922485351562, + "epoch": 0.23030620256477363, + "grad_norm": 1231.2639002533556, + "learning_rate": 1.9986864211644068e-06, + "logits/chosen": -1.1658036708831787, + "logits/rejected": -1.0709865093231201, + "logps/chosen": -231.3977813720703, + "logps/rejected": -283.1410217285156, + "loss": 246.1861, + "losses_ref": -6.052565574645996, + "ref_logps/chosen": -272.9906921386719, + "ref_logps/rejected": -154.841796875, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.41592931747436523, + "rewards/margins": 1.6989214420318604, + "rewards/rejected": -1.2829921245574951, + "step": 110, + "u": -1.3139088153839111, + "weight": 0.07522980868816376 + }, + { + "diff_generated": -133.98553466796875, + "epoch": 0.24077466631771788, + "grad_norm": 1343.0801296451152, + "learning_rate": 1.997581048233623e-06, + "logits/chosen": -1.1396609544754028, + "logits/rejected": -1.1306806802749634, + "logps/chosen": -226.9049835205078, + "logps/rejected": -293.1982421875, + "loss": 230.2171, + "losses_ref": -5.637959957122803, + "ref_logps/chosen": -269.8221130371094, + "ref_logps/rejected": -159.2126922607422, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4291713833808899, + "rewards/margins": 1.769026756286621, + "rewards/rejected": -1.3398553133010864, + "step": 115, + "u": -1.168405294418335, + "weight": 0.05913761258125305 + }, + { + "diff_generated": -123.33839416503906, + "epoch": 0.2512431300706621, + "grad_norm": 1434.3497350520097, + "learning_rate": 1.9961413253717214e-06, + "logits/chosen": -1.5746419429779053, + "logits/rejected": -1.518913984298706, + "logps/chosen": -228.5142822265625, + "logps/rejected": -284.6359558105469, + "loss": 234.8627, + "losses_ref": -9.012969017028809, + "ref_logps/chosen": -274.33917236328125, + "ref_logps/rejected": -161.29759216308594, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": 0.4582485258579254, + "rewards/margins": 1.6916322708129883, + "rewards/rejected": -1.2333838939666748, + "step": 120, + "u": -0.9588969349861145, + "weight": 0.08884967118501663 + }, + { + "diff_generated": -151.09429931640625, + "epoch": 0.26171159382360637, + "grad_norm": 1274.213985322993, + "learning_rate": 1.994367735117177e-06, + "logits/chosen": -1.6689637899398804, + "logits/rejected": -1.6743271350860596, + "logps/chosen": -216.779541015625, + "logps/rejected": -306.51861572265625, + "loss": 226.4779, + "losses_ref": -6.019095420837402, + "ref_logps/chosen": -259.2029724121094, + "ref_logps/rejected": -155.42433166503906, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": 0.42423415184020996, + "rewards/margins": 1.9351768493652344, + "rewards/rejected": -1.5109429359436035, + "step": 125, + "u": -1.222081184387207, + "weight": 0.08297105878591537 + }, + { + "diff_generated": -161.22811889648438, + "epoch": 0.2721800575765506, + "grad_norm": 1337.1173679216238, + "learning_rate": 1.992260871907687e-06, + "logits/chosen": -1.5299973487854004, + "logits/rejected": -1.4785773754119873, + "logps/chosen": -239.4655303955078, + "logps/rejected": -327.7781982421875, + "loss": 242.8888, + "losses_ref": -7.182534694671631, + "ref_logps/chosen": -280.188720703125, + "ref_logps/rejected": -166.550048828125, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": 0.407231867313385, + "rewards/margins": 2.019512891769409, + "rewards/rejected": -1.612281084060669, + "step": 130, + "u": -1.2559138536453247, + "weight": 0.05781525373458862 + }, + { + "diff_generated": -169.7267303466797, + "epoch": 0.2826485213294949, + "grad_norm": 1374.1488593321894, + "learning_rate": 1.9898214418809326e-06, + "logits/chosen": -1.3805739879608154, + "logits/rejected": -1.3600701093673706, + "logps/chosen": -238.9783935546875, + "logps/rejected": -343.4627380371094, + "loss": 242.9051, + "losses_ref": -2.127274990081787, + "ref_logps/chosen": -281.3921203613281, + "ref_logps/rejected": -173.73602294921875, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": 0.4241371750831604, + "rewards/margins": 2.1214041709899902, + "rewards/rejected": -1.6972671747207642, + "step": 135, + "u": -1.7065389156341553, + "weight": 0.033993639051914215 + }, + { + "diff_generated": -151.85092163085938, + "epoch": 0.29311698508243916, + "grad_norm": 1370.477984750469, + "learning_rate": 1.9870502626379126e-06, + "logits/chosen": -1.3134925365447998, + "logits/rejected": -1.3758270740509033, + "logps/chosen": -227.9882049560547, + "logps/rejected": -322.3777770996094, + "loss": 229.547, + "losses_ref": -4.158343315124512, + "ref_logps/chosen": -270.9952392578125, + "ref_logps/rejected": -170.52687072753906, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.43007057905197144, + "rewards/margins": 1.9485795497894287, + "rewards/rejected": -1.518509030342102, + "step": 140, + "u": -1.3956022262573242, + "weight": 0.05143100023269653 + }, + { + "diff_generated": -146.50155639648438, + "epoch": 0.3035854488353834, + "grad_norm": 1794.900701079277, + "learning_rate": 1.983948262968915e-06, + "logits/chosen": -1.5504910945892334, + "logits/rejected": -1.4326040744781494, + "logps/chosen": -259.777587890625, + "logps/rejected": -307.3033752441406, + "loss": 242.1811, + "losses_ref": -2.1557910442352295, + "ref_logps/chosen": -302.7044982910156, + "ref_logps/rejected": -160.8018035888672, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.42926883697509766, + "rewards/margins": 1.8942844867706299, + "rewards/rejected": -1.4650156497955322, + "step": 145, + "u": -1.3577892780303955, + "weight": 0.044694624841213226 + }, + { + "diff_generated": -155.41860961914062, + "epoch": 0.31405391258832765, + "grad_norm": 1420.5558411185323, + "learning_rate": 1.9805164825422237e-06, + "logits/chosen": -2.0522618293762207, + "logits/rejected": -1.9478759765625, + "logps/chosen": -238.4119873046875, + "logps/rejected": -314.91790771484375, + "loss": 224.1883, + "losses_ref": -3.6840145587921143, + "ref_logps/chosen": -281.19158935546875, + "ref_logps/rejected": -159.49932861328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4277961254119873, + "rewards/margins": 1.9819822311401367, + "rewards/rejected": -1.5541859865188599, + "step": 150, + "u": -1.3957250118255615, + "weight": 0.05671170353889465 + }, + { + "diff_generated": -151.29141235351562, + "epoch": 0.3245223763412719, + "grad_norm": 1339.4660772749999, + "learning_rate": 1.9767560715556594e-06, + "logits/chosen": -2.201369524002075, + "logits/rejected": -2.1122801303863525, + "logps/chosen": -232.8695831298828, + "logps/rejected": -321.6642150878906, + "loss": 230.8218, + "losses_ref": -4.063229084014893, + "ref_logps/chosen": -279.747314453125, + "ref_logps/rejected": -170.372802734375, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.46877723932266235, + "rewards/margins": 1.9816913604736328, + "rewards/rejected": -1.5129140615463257, + "step": 155, + "u": -1.4928115606307983, + "weight": 0.05359172821044922 + }, + { + "diff_generated": -154.98220825195312, + "epoch": 0.33499084009421615, + "grad_norm": 1436.3409054374235, + "learning_rate": 1.972668290351084e-06, + "logits/chosen": -2.1720938682556152, + "logits/rejected": -2.0600266456604004, + "logps/chosen": -240.95022583007812, + "logps/rejected": -311.90997314453125, + "loss": 234.915, + "losses_ref": -4.4140777587890625, + "ref_logps/chosen": -289.99774169921875, + "ref_logps/rejected": -156.92776489257812, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4904751777648926, + "rewards/margins": 2.040297031402588, + "rewards/rejected": -1.5498219728469849, + "step": 160, + "u": -1.4394853115081787, + "weight": 0.04004598781466484 + }, + { + "diff_generated": -144.861572265625, + "epoch": 0.34545930384716045, + "grad_norm": 1297.629892424431, + "learning_rate": 1.968254508991978e-06, + "logits/chosen": -2.255429267883301, + "logits/rejected": -2.142435073852539, + "logps/chosen": -243.08935546875, + "logps/rejected": -304.804443359375, + "loss": 237.5995, + "losses_ref": -2.3130009174346924, + "ref_logps/chosen": -284.68487548828125, + "ref_logps/rejected": -159.94287109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4159550666809082, + "rewards/margins": 1.8645708560943604, + "rewards/rejected": -1.4486157894134521, + "step": 165, + "u": -1.5542036294937134, + "weight": 0.030019784346222878 + }, + { + "diff_generated": -151.61795043945312, + "epoch": 0.3559277676001047, + "grad_norm": 1297.3953865872961, + "learning_rate": 1.9635162068042544e-06, + "logits/chosen": -2.119171380996704, + "logits/rejected": -2.017618417739868, + "logps/chosen": -247.02041625976562, + "logps/rejected": -313.6037292480469, + "loss": 237.275, + "losses_ref": -6.966467380523682, + "ref_logps/chosen": -288.6535949707031, + "ref_logps/rejected": -161.9857940673828, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": 0.41633161902427673, + "rewards/margins": 1.9325110912322998, + "rewards/rejected": -1.5161794424057007, + "step": 170, + "u": -1.2121031284332275, + "weight": 0.07038909941911697 + }, + { + "diff_generated": -144.2270050048828, + "epoch": 0.36639623135304894, + "grad_norm": 1438.8100283748447, + "learning_rate": 1.958454971880441e-06, + "logits/chosen": -2.147486686706543, + "logits/rejected": -2.0490543842315674, + "logps/chosen": -268.3631591796875, + "logps/rejected": -305.03021240234375, + "loss": 251.9562, + "losses_ref": -5.818743705749512, + "ref_logps/chosen": -313.4308776855469, + "ref_logps/rejected": -160.80323791503906, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4506770670413971, + "rewards/margins": 1.8929469585418701, + "rewards/rejected": -1.442270040512085, + "step": 175, + "u": -1.2561050653457642, + "weight": 0.0653764009475708 + }, + { + "diff_generated": -141.9085693359375, + "epoch": 0.3768646951059932, + "grad_norm": 1207.513295077982, + "learning_rate": 1.9530725005474194e-06, + "logits/chosen": -2.267883539199829, + "logits/rejected": -2.218174457550049, + "logps/chosen": -221.9941864013672, + "logps/rejected": -298.5855407714844, + "loss": 221.5628, + "losses_ref": -3.0411601066589355, + "ref_logps/chosen": -264.38067626953125, + "ref_logps/rejected": -156.677001953125, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4238646924495697, + "rewards/margins": 1.8429502248764038, + "rewards/rejected": -1.4190856218338013, + "step": 180, + "u": -1.1714732646942139, + "weight": 0.05968625098466873 + }, + { + "diff_generated": -150.76657104492188, + "epoch": 0.38733315885893743, + "grad_norm": 1307.7780975566222, + "learning_rate": 1.9473705967978807e-06, + "logits/chosen": -2.420961856842041, + "logits/rejected": -2.327650547027588, + "logps/chosen": -227.6046600341797, + "logps/rejected": -303.7978210449219, + "loss": 229.0799, + "losses_ref": -15.570757865905762, + "ref_logps/chosen": -272.23333740234375, + "ref_logps/rejected": -153.03126525878906, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4462866187095642, + "rewards/margins": 1.9539520740509033, + "rewards/rejected": -1.5076655149459839, + "step": 185, + "u": -0.8297923803329468, + "weight": 0.09269951283931732 + }, + { + "diff_generated": -147.6534423828125, + "epoch": 0.39780162261188173, + "grad_norm": 1169.0067686339887, + "learning_rate": 1.941351171685697e-06, + "logits/chosen": -2.2705044746398926, + "logits/rejected": -2.2303287982940674, + "logps/chosen": -229.6949920654297, + "logps/rejected": -316.17437744140625, + "loss": 234.7021, + "losses_ref": -5.174070835113525, + "ref_logps/chosen": -274.26959228515625, + "ref_logps/rejected": -168.52093505859375, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4457460343837738, + "rewards/margins": 1.9222803115844727, + "rewards/rejected": -1.4765344858169556, + "step": 190, + "u": -1.7719621658325195, + "weight": 0.03358909860253334 + }, + { + "diff_generated": -159.57711791992188, + "epoch": 0.408270086364826, + "grad_norm": 1222.7096009577886, + "learning_rate": 1.9350162426854148e-06, + "logits/chosen": -2.1345176696777344, + "logits/rejected": -2.1815943717956543, + "logps/chosen": -195.1034393310547, + "logps/rejected": -316.82177734375, + "loss": 220.9707, + "losses_ref": -4.031326770782471, + "ref_logps/chosen": -238.08377075195312, + "ref_logps/rejected": -157.2446746826172, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.42980343103408813, + "rewards/margins": 2.025574207305908, + "rewards/rejected": -1.595771074295044, + "step": 195, + "u": -1.546870231628418, + "weight": 0.03703851252794266 + }, + { + "diff_generated": -167.23892211914062, + "epoch": 0.4187385501177702, + "grad_norm": 1368.307859885155, + "learning_rate": 1.9283679330160725e-06, + "logits/chosen": -2.1258459091186523, + "logits/rejected": -2.004584789276123, + "logps/chosen": -238.9210205078125, + "logps/rejected": -331.30718994140625, + "loss": 244.6853, + "losses_ref": -4.569379806518555, + "ref_logps/chosen": -285.3875732421875, + "ref_logps/rejected": -164.0682830810547, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4646654725074768, + "rewards/margins": 2.137054681777954, + "rewards/rejected": -1.6723893880844116, + "step": 200, + "u": -1.6067603826522827, + "weight": 0.04548769071698189 + }, + { + "diff_generated": -156.21780395507812, + "epoch": 0.42920701387071447, + "grad_norm": 1208.8530669692416, + "learning_rate": 1.9214084709295847e-06, + "logits/chosen": -2.0831170082092285, + "logits/rejected": -1.964040756225586, + "logps/chosen": -255.9301300048828, + "logps/rejected": -318.99798583984375, + "loss": 233.3463, + "losses_ref": -5.610936641693115, + "ref_logps/chosen": -300.7832946777344, + "ref_logps/rejected": -162.78021240234375, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": 0.44853147864341736, + "rewards/margins": 2.010709524154663, + "rewards/rejected": -1.5621780157089233, + "step": 205, + "u": -1.3661489486694336, + "weight": 0.06516700237989426 + }, + { + "diff_generated": -171.98703002929688, + "epoch": 0.4396754776236587, + "grad_norm": 1215.8559114876498, + "learning_rate": 1.9141401889639164e-06, + "logits/chosen": -1.9906151294708252, + "logits/rejected": -1.9088771343231201, + "logps/chosen": -235.02249145507812, + "logps/rejected": -345.1544494628906, + "loss": 234.6928, + "losses_ref": -2.863798141479492, + "ref_logps/chosen": -280.8175048828125, + "ref_logps/rejected": -173.16738891601562, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.45795029401779175, + "rewards/margins": 2.1778206825256348, + "rewards/rejected": -1.7198702096939087, + "step": 210, + "u": -1.4222519397735596, + "weight": 0.044259898364543915 + }, + { + "diff_generated": -168.92660522460938, + "epoch": 0.45014394137660296, + "grad_norm": 1266.497741976898, + "learning_rate": 1.906565523161312e-06, + "logits/chosen": -1.9987051486968994, + "logits/rejected": -1.9987319707870483, + "logps/chosen": -227.54159545898438, + "logps/rejected": -331.20281982421875, + "loss": 227.5447, + "losses_ref": -2.0428645610809326, + "ref_logps/chosen": -272.03076171875, + "ref_logps/rejected": -162.27622985839844, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4448915421962738, + "rewards/margins": 2.134157657623291, + "rewards/rejected": -1.6892658472061157, + "step": 215, + "u": -1.699721097946167, + "weight": 0.028461579233407974 + }, + { + "diff_generated": -181.3323211669922, + "epoch": 0.46061240512954726, + "grad_norm": 1409.5627230630107, + "learning_rate": 1.8986870122518259e-06, + "logits/chosen": -1.996578574180603, + "logits/rejected": -1.9339357614517212, + "logps/chosen": -241.12069702148438, + "logps/rejected": -345.39239501953125, + "loss": 250.5986, + "losses_ref": -13.413454055786133, + "ref_logps/chosen": -284.3638610839844, + "ref_logps/rejected": -164.06004333496094, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.43243154883384705, + "rewards/margins": 2.2457549571990967, + "rewards/rejected": -1.8133233785629272, + "step": 220, + "u": -1.559470295906067, + "weight": 0.03921313211321831 + }, + { + "diff_generated": -167.23196411132812, + "epoch": 0.4710808688824915, + "grad_norm": 1439.3131066005014, + "learning_rate": 1.8905072968024423e-06, + "logits/chosen": -2.0085692405700684, + "logits/rejected": -1.9212806224822998, + "logps/chosen": -240.53793334960938, + "logps/rejected": -324.13519287109375, + "loss": 229.6424, + "losses_ref": -2.6123085021972656, + "ref_logps/chosen": -288.477783203125, + "ref_logps/rejected": -156.90321350097656, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.47939810156822205, + "rewards/margins": 2.1517176628112793, + "rewards/rejected": -1.6723196506500244, + "step": 225, + "u": -1.7230523824691772, + "weight": 0.04574074223637581 + }, + { + "diff_generated": -159.5584259033203, + "epoch": 0.48154933263543576, + "grad_norm": 1268.731805706848, + "learning_rate": 1.88202911833206e-06, + "logits/chosen": -2.006537914276123, + "logits/rejected": -2.0306971073150635, + "logps/chosen": -209.113037109375, + "logps/rejected": -324.5091552734375, + "loss": 221.1728, + "losses_ref": -2.3901400566101074, + "ref_logps/chosen": -255.0234832763672, + "ref_logps/rejected": -164.95074462890625, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": 0.459104061126709, + "rewards/margins": 2.0546882152557373, + "rewards/rejected": -1.5955842733383179, + "step": 230, + "u": -1.3925855159759521, + "weight": 0.045756690204143524 + }, + { + "diff_generated": -170.58221435546875, + "epoch": 0.49201779638838, + "grad_norm": 1315.789025978012, + "learning_rate": 1.873255318392644e-06, + "logits/chosen": -1.9995191097259521, + "logits/rejected": -1.8898826837539673, + "logps/chosen": -234.0719757080078, + "logps/rejected": -327.0367736816406, + "loss": 242.3326, + "losses_ref": -4.473931312561035, + "ref_logps/chosen": -280.68048095703125, + "ref_logps/rejected": -156.4545440673828, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.46608513593673706, + "rewards/margins": 2.1719069480895996, + "rewards/rejected": -1.7058223485946655, + "step": 235, + "u": -1.6257721185684204, + "weight": 0.034325193613767624 + }, + { + "diff_generated": -163.37722778320312, + "epoch": 0.5024862601413242, + "grad_norm": 1285.4823648929914, + "learning_rate": 1.8641888376168483e-06, + "logits/chosen": -1.9665982723236084, + "logits/rejected": -1.9548044204711914, + "logps/chosen": -215.7754669189453, + "logps/rejected": -326.5556335449219, + "loss": 231.7613, + "losses_ref": -5.584181308746338, + "ref_logps/chosen": -260.7419128417969, + "ref_logps/rejected": -163.17840576171875, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.44966477155685425, + "rewards/margins": 2.083436965942383, + "rewards/rejected": -1.6337722539901733, + "step": 240, + "u": -1.2691129446029663, + "weight": 0.0609821155667305 + }, + { + "diff_generated": -147.24386596679688, + "epoch": 0.5129547238942685, + "grad_norm": 1347.3156065591786, + "learning_rate": 1.8548327147324312e-06, + "logits/chosen": -1.9906165599822998, + "logits/rejected": -1.872373342514038, + "logps/chosen": -243.5879364013672, + "logps/rejected": -304.78204345703125, + "loss": 236.4194, + "losses_ref": -7.212074279785156, + "ref_logps/chosen": -291.9618835449219, + "ref_logps/rejected": -157.53817749023438, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": 0.48373931646347046, + "rewards/margins": 1.9561779499053955, + "rewards/rejected": -1.4724384546279907, + "step": 245, + "u": -0.7504249811172485, + "weight": 0.08246179670095444 + }, + { + "diff_generated": -136.68235778808594, + "epoch": 0.5234231876472127, + "grad_norm": 1280.557570592857, + "learning_rate": 1.8451900855437948e-06, + "logits/chosen": -2.0444495677948, + "logits/rejected": -1.9412866830825806, + "logps/chosen": -237.24496459960938, + "logps/rejected": -305.5830078125, + "loss": 231.6959, + "losses_ref": -4.014006614685059, + "ref_logps/chosen": -285.0312805175781, + "ref_logps/rejected": -168.90065002441406, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.47786277532577515, + "rewards/margins": 1.8446861505508423, + "rewards/rejected": -1.3668235540390015, + "step": 250, + "u": -1.4464961290359497, + "weight": 0.045917607843875885 + }, + { + "diff_generated": -147.02664184570312, + "epoch": 0.533891651400157, + "grad_norm": 1271.960313695608, + "learning_rate": 1.8352641818809846e-06, + "logits/chosen": -2.012394428253174, + "logits/rejected": -1.9293123483657837, + "logps/chosen": -255.23617553710938, + "logps/rejected": -305.11065673828125, + "loss": 237.2504, + "losses_ref": -3.9721827507019043, + "ref_logps/chosen": -298.58929443359375, + "ref_logps/rejected": -158.0840606689453, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": 0.43353086709976196, + "rewards/margins": 1.9037971496582031, + "rewards/rejected": -1.470266342163086, + "step": 255, + "u": -1.2067726850509644, + "weight": 0.04464394599199295 + }, + { + "diff_generated": -150.324462890625, + "epoch": 0.5443601151531012, + "grad_norm": 1323.4761845101339, + "learning_rate": 1.8250583305165094e-06, + "logits/chosen": -1.7699302434921265, + "logits/rejected": -1.7340294122695923, + "logps/chosen": -232.5556640625, + "logps/rejected": -303.0191650390625, + "loss": 236.4857, + "losses_ref": -3.8249027729034424, + "ref_logps/chosen": -277.13360595703125, + "ref_logps/rejected": -152.69473266601562, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4457794725894928, + "rewards/margins": 1.949023962020874, + "rewards/rejected": -1.5032446384429932, + "step": 260, + "u": -1.4394437074661255, + "weight": 0.06012386828660965 + }, + { + "diff_generated": -146.3737335205078, + "epoch": 0.5548285789060455, + "grad_norm": 1232.2132266823505, + "learning_rate": 1.8145759520503357e-06, + "logits/chosen": -1.836775541305542, + "logits/rejected": -1.7096904516220093, + "logps/chosen": -242.7677764892578, + "logps/rejected": -308.00592041015625, + "loss": 219.0433, + "losses_ref": -2.2338509559631348, + "ref_logps/chosen": -290.8897705078125, + "ref_logps/rejected": -161.63217163085938, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.48121970891952515, + "rewards/margins": 1.9449567794799805, + "rewards/rejected": -1.4637373685836792, + "step": 265, + "u": -1.7323522567749023, + "weight": 0.03296298533678055 + }, + { + "diff_generated": -160.1627960205078, + "epoch": 0.5652970426589898, + "grad_norm": 1351.7338122517372, + "learning_rate": 1.803820559763439e-06, + "logits/chosen": -1.7946879863739014, + "logits/rejected": -1.7407840490341187, + "logps/chosen": -215.82290649414062, + "logps/rejected": -316.18743896484375, + "loss": 232.6284, + "losses_ref": -3.786867618560791, + "ref_logps/chosen": -261.61407470703125, + "ref_logps/rejected": -156.02464294433594, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.45791149139404297, + "rewards/margins": 2.059539318084717, + "rewards/rejected": -1.6016279458999634, + "step": 270, + "u": -1.651993751525879, + "weight": 0.04032987728714943 + }, + { + "diff_generated": -142.9796905517578, + "epoch": 0.575765506411934, + "grad_norm": 1181.4870635863472, + "learning_rate": 1.7927957584402895e-06, + "logits/chosen": -1.875299096107483, + "logits/rejected": -1.8068253993988037, + "logps/chosen": -228.66781616210938, + "logps/rejected": -303.5104064941406, + "loss": 224.2237, + "losses_ref": -4.741028308868408, + "ref_logps/chosen": -272.44915771484375, + "ref_logps/rejected": -160.53070068359375, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": 0.43781352043151855, + "rewards/margins": 1.8676105737686157, + "rewards/rejected": -1.4297969341278076, + "step": 275, + "u": -1.267141580581665, + "weight": 0.07081650197505951 + }, + { + "diff_generated": -147.29513549804688, + "epoch": 0.5862339701648783, + "grad_norm": 1311.3976945524007, + "learning_rate": 1.78150524316067e-06, + "logits/chosen": -1.9360460042953491, + "logits/rejected": -1.8399826288223267, + "logps/chosen": -244.2842559814453, + "logps/rejected": -319.99603271484375, + "loss": 221.9428, + "losses_ref": -5.114128112792969, + "ref_logps/chosen": -288.6471252441406, + "ref_logps/rejected": -172.7008819580078, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4436289668083191, + "rewards/margins": 1.9165802001953125, + "rewards/rejected": -1.4729512929916382, + "step": 280, + "u": -1.33490788936615, + "weight": 0.05145906284451485 + }, + { + "diff_generated": -165.53073120117188, + "epoch": 0.5967024339178225, + "grad_norm": 1217.7737895640616, + "learning_rate": 1.7699527980612304e-06, + "logits/chosen": -2.008852243423462, + "logits/rejected": -1.865282416343689, + "logps/chosen": -235.48495483398438, + "logps/rejected": -324.86236572265625, + "loss": 237.0448, + "losses_ref": -3.6097474098205566, + "ref_logps/chosen": -281.65557861328125, + "ref_logps/rejected": -159.3316650390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4617062509059906, + "rewards/margins": 2.117013454437256, + "rewards/rejected": -1.655307412147522, + "step": 285, + "u": -1.5152809619903564, + "weight": 0.03953182324767113 + }, + { + "diff_generated": -152.0204315185547, + "epoch": 0.6071708976707668, + "grad_norm": 1365.967274184474, + "learning_rate": 1.758142295067194e-06, + "logits/chosen": -1.9733747243881226, + "logits/rejected": -1.8123550415039062, + "logps/chosen": -253.77774047851562, + "logps/rejected": -316.69073486328125, + "loss": 236.0956, + "losses_ref": -9.393682479858398, + "ref_logps/chosen": -299.4283142089844, + "ref_logps/rejected": -164.6702880859375, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": 0.45650559663772583, + "rewards/margins": 1.9767096042633057, + "rewards/rejected": -1.5202041864395142, + "step": 290, + "u": -1.0198547840118408, + "weight": 0.07342410832643509 + }, + { + "diff_generated": -156.43539428710938, + "epoch": 0.6176393614237111, + "grad_norm": 1274.7347074994798, + "learning_rate": 1.7460776925946416e-06, + "logits/chosen": -2.04952335357666, + "logits/rejected": -1.9772619009017944, + "logps/chosen": -231.12759399414062, + "logps/rejected": -324.58734130859375, + "loss": 216.7738, + "losses_ref": -3.1922709941864014, + "ref_logps/chosen": -275.5738525390625, + "ref_logps/rejected": -168.1519317626953, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4444626271724701, + "rewards/margins": 2.0088164806365967, + "rewards/rejected": -1.5643537044525146, + "step": 295, + "u": -1.7094459533691406, + "weight": 0.027651017531752586 + }, + { + "diff_generated": -166.10739135742188, + "epoch": 0.6281078251766553, + "grad_norm": 1279.289070746857, + "learning_rate": 1.7337630342238039e-06, + "logits/chosen": -2.0860671997070312, + "logits/rejected": -1.9944241046905518, + "logps/chosen": -226.953125, + "logps/rejected": -329.9337158203125, + "loss": 245.769, + "losses_ref": -2.491637706756592, + "ref_logps/chosen": -276.3335266113281, + "ref_logps/rejected": -163.8263397216797, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.49380379915237427, + "rewards/margins": 2.1548776626586914, + "rewards/rejected": -1.6610740423202515, + "step": 300, + "u": -1.8025985956192017, + "weight": 0.020761026069521904 + }, + { + "diff_generated": -160.49281311035156, + "epoch": 0.6385762889295996, + "grad_norm": 1143.8594113545453, + "learning_rate": 1.7212024473438145e-06, + "logits/chosen": -2.1227848529815674, + "logits/rejected": -2.037874698638916, + "logps/chosen": -227.2042694091797, + "logps/rejected": -324.0436096191406, + "loss": 218.3608, + "losses_ref": -5.721261978149414, + "ref_logps/chosen": -275.447265625, + "ref_logps/rejected": -163.55076599121094, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4824300706386566, + "rewards/margins": 2.0873584747314453, + "rewards/rejected": -1.6049282550811768, + "step": 305, + "u": -1.5689971446990967, + "weight": 0.03765694424510002 + }, + { + "diff_generated": -165.9134979248047, + "epoch": 0.6490447526825438, + "grad_norm": 1216.910104164249, + "learning_rate": 1.70840014176937e-06, + "logits/chosen": -2.148029327392578, + "logits/rejected": -1.9548304080963135, + "logps/chosen": -259.4276123046875, + "logps/rejected": -335.60723876953125, + "loss": 237.5431, + "losses_ref": -6.571761131286621, + "ref_logps/chosen": -307.9371643066406, + "ref_logps/rejected": -169.69369506835938, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.48509567975997925, + "rewards/margins": 2.144230604171753, + "rewards/rejected": -1.659135103225708, + "step": 310, + "u": -1.2953577041625977, + "weight": 0.06081492453813553 + }, + { + "diff_generated": -150.7538299560547, + "epoch": 0.6595132164354881, + "grad_norm": 1285.8252216937017, + "learning_rate": 1.6953604083297663e-06, + "logits/chosen": -2.0963034629821777, + "logits/rejected": -2.005828619003296, + "logps/chosen": -238.0185089111328, + "logps/rejected": -313.0700988769531, + "loss": 232.0059, + "losses_ref": -5.998663425445557, + "ref_logps/chosen": -286.41973876953125, + "ref_logps/rejected": -162.3162841796875, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4840126633644104, + "rewards/margins": 1.9915508031845093, + "rewards/rejected": -1.5075383186340332, + "step": 315, + "u": -1.022328495979309, + "weight": 0.07499580085277557 + }, + { + "diff_generated": -167.22291564941406, + "epoch": 0.6699816801884323, + "grad_norm": 1393.3558242713107, + "learning_rate": 1.6820876174307821e-06, + "logits/chosen": -2.0343525409698486, + "logits/rejected": -1.9958488941192627, + "logps/chosen": -220.11959838867188, + "logps/rejected": -324.1341857910156, + "loss": 235.1374, + "losses_ref": -3.5960795879364014, + "ref_logps/chosen": -265.8931579589844, + "ref_logps/rejected": -156.91128540039062, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.45773547887802124, + "rewards/margins": 2.129964590072632, + "rewards/rejected": -1.6722290515899658, + "step": 320, + "u": -1.116194486618042, + "weight": 0.04208649322390556 + }, + { + "diff_generated": -152.4580078125, + "epoch": 0.6804501439413766, + "grad_norm": 1260.574816635609, + "learning_rate": 1.668586217589889e-06, + "logits/chosen": -2.028233051300049, + "logits/rejected": -1.943868637084961, + "logps/chosen": -252.96224975585938, + "logps/rejected": -314.017578125, + "loss": 228.4758, + "losses_ref": -1.9372276067733765, + "ref_logps/chosen": -299.65130615234375, + "ref_logps/rejected": -161.55958557128906, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.46689024567604065, + "rewards/margins": 1.9914703369140625, + "rewards/rejected": -1.5245802402496338, + "step": 325, + "u": -1.6558067798614502, + "weight": 0.03156626224517822 + }, + { + "diff_generated": -140.8079376220703, + "epoch": 0.6909186076943209, + "grad_norm": 1201.438854630279, + "learning_rate": 1.6548607339452852e-06, + "logits/chosen": -2.0895023345947266, + "logits/rejected": -2.036318778991699, + "logps/chosen": -216.3995361328125, + "logps/rejected": -303.2993469238281, + "loss": 233.4191, + "losses_ref": -2.161651134490967, + "ref_logps/chosen": -261.6273498535156, + "ref_logps/rejected": -162.49142456054688, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4522779583930969, + "rewards/margins": 1.8603572845458984, + "rewards/rejected": -1.4080793857574463, + "step": 330, + "u": -1.8709716796875, + "weight": 0.017609911039471626 + }, + { + "diff_generated": -143.80978393554688, + "epoch": 0.7013870714472651, + "grad_norm": 1237.6054714094937, + "learning_rate": 1.6409157667392455e-06, + "logits/chosen": -2.059278964996338, + "logits/rejected": -1.9892032146453857, + "logps/chosen": -235.5959930419922, + "logps/rejected": -307.551513671875, + "loss": 229.4944, + "losses_ref": -6.860163688659668, + "ref_logps/chosen": -283.805908203125, + "ref_logps/rejected": -163.74172973632812, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.48209866881370544, + "rewards/margins": 1.920196771621704, + "rewards/rejected": -1.4380979537963867, + "step": 335, + "u": -0.9188238382339478, + "weight": 0.07267802953720093 + }, + { + "diff_generated": -160.1986846923828, + "epoch": 0.7118555352002094, + "grad_norm": 1143.246546308752, + "learning_rate": 1.6267559897763027e-06, + "logits/chosen": -1.8168014287948608, + "logits/rejected": -1.863437294960022, + "logps/chosen": -188.27635192871094, + "logps/rejected": -314.9437561035156, + "loss": 216.8938, + "losses_ref": -1.3188815116882324, + "ref_logps/chosen": -237.00216674804688, + "ref_logps/rejected": -154.74508666992188, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": 0.4872584939002991, + "rewards/margins": 2.089244842529297, + "rewards/rejected": -1.6019866466522217, + "step": 340, + "u": -1.303836703300476, + "weight": 0.029538637027144432 + }, + { + "diff_generated": -151.67230224609375, + "epoch": 0.7223239989531536, + "grad_norm": 1267.0562713440388, + "learning_rate": 1.6123861488567708e-06, + "logits/chosen": -1.9331505298614502, + "logits/rejected": -1.7450395822525024, + "logps/chosen": -256.15277099609375, + "logps/rejected": -316.7372131347656, + "loss": 244.0877, + "losses_ref": -2.1836702823638916, + "ref_logps/chosen": -306.53680419921875, + "ref_logps/rejected": -165.06492614746094, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5038406848907471, + "rewards/margins": 2.0205636024475098, + "rewards/rejected": -1.5167229175567627, + "step": 345, + "u": -1.485855221748352, + "weight": 0.0375472754240036 + }, + { + "diff_generated": -147.20364379882812, + "epoch": 0.7327924627060979, + "grad_norm": 1350.564919328469, + "learning_rate": 1.5978110601861409e-06, + "logits/chosen": -1.9117012023925781, + "logits/rejected": -1.8668915033340454, + "logps/chosen": -253.0355224609375, + "logps/rejected": -311.43927001953125, + "loss": 240.3254, + "losses_ref": -2.832030773162842, + "ref_logps/chosen": -299.90985107421875, + "ref_logps/rejected": -164.23562622070312, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.46874284744262695, + "rewards/margins": 1.9407793283462524, + "rewards/rejected": -1.472036361694336, + "step": 350, + "u": -1.429086685180664, + "weight": 0.04381849616765976 + }, + { + "diff_generated": -152.60690307617188, + "epoch": 0.7432609264590422, + "grad_norm": 1367.6005309235504, + "learning_rate": 1.5830356087608763e-06, + "logits/chosen": -1.887460708618164, + "logits/rejected": -1.8180389404296875, + "logps/chosen": -214.82699584960938, + "logps/rejected": -321.7936096191406, + "loss": 228.0585, + "losses_ref": -1.8199619054794312, + "ref_logps/chosen": -263.9666748046875, + "ref_logps/rejected": -169.18673706054688, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.49139684438705444, + "rewards/margins": 2.017465829849243, + "rewards/rejected": -1.526068925857544, + "step": 355, + "u": -1.6218674182891846, + "weight": 0.02579430676996708 + }, + { + "diff_generated": -148.06683349609375, + "epoch": 0.7537293902119864, + "grad_norm": 1346.2814229526575, + "learning_rate": 1.5680647467311555e-06, + "logits/chosen": -1.8571285009384155, + "logits/rejected": -1.7857725620269775, + "logps/chosen": -244.458251953125, + "logps/rejected": -319.65484619140625, + "loss": 223.1362, + "losses_ref": -2.564044237136841, + "ref_logps/chosen": -293.27410888671875, + "ref_logps/rejected": -171.58799743652344, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4881584644317627, + "rewards/margins": 1.9688268899917603, + "rewards/rejected": -1.480668306350708, + "step": 360, + "u": -1.7225738763809204, + "weight": 0.03375329077243805 + }, + { + "diff_generated": -158.3987579345703, + "epoch": 0.7641978539649307, + "grad_norm": 1338.295994157702, + "learning_rate": 1.552903491741107e-06, + "logits/chosen": -1.837961196899414, + "logits/rejected": -1.839646577835083, + "logps/chosen": -230.9562530517578, + "logps/rejected": -320.73455810546875, + "loss": 230.7235, + "losses_ref": -2.363715648651123, + "ref_logps/chosen": -276.13995361328125, + "ref_logps/rejected": -162.33580017089844, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": 0.45183688402175903, + "rewards/margins": 2.0358242988586426, + "rewards/rejected": -1.5839874744415283, + "step": 365, + "u": -1.599321722984314, + "weight": 0.03038620948791504 + }, + { + "diff_generated": -143.7940216064453, + "epoch": 0.7746663177178749, + "grad_norm": 1122.4826063930961, + "learning_rate": 1.5375569252470895e-06, + "logits/chosen": -1.994361162185669, + "logits/rejected": -1.8850581645965576, + "logps/chosen": -266.71722412109375, + "logps/rejected": -306.846923828125, + "loss": 232.9005, + "losses_ref": -7.7454657554626465, + "ref_logps/chosen": -315.1695251464844, + "ref_logps/rejected": -163.05288696289062, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.48452290892601013, + "rewards/margins": 1.922463059425354, + "rewards/rejected": -1.437940239906311, + "step": 370, + "u": -1.2242950201034546, + "weight": 0.05640628933906555 + }, + { + "diff_generated": -147.28756713867188, + "epoch": 0.7851347814708192, + "grad_norm": 1267.5426171485876, + "learning_rate": 1.5220301908145903e-06, + "logits/chosen": -1.984815001487732, + "logits/rejected": -1.8735277652740479, + "logps/chosen": -236.86972045898438, + "logps/rejected": -316.1020812988281, + "loss": 254.4526, + "losses_ref": -1.4826844930648804, + "ref_logps/chosen": -283.3154296875, + "ref_logps/rejected": -168.81448364257812, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": 0.4644569754600525, + "rewards/margins": 1.9373327493667603, + "rewards/rejected": -1.472875714302063, + "step": 375, + "u": -1.3915516138076782, + "weight": 0.03981015831232071 + }, + { + "diff_generated": -141.51336669921875, + "epoch": 0.7956032452237635, + "grad_norm": 1213.313777968865, + "learning_rate": 1.5063284923943028e-06, + "logits/chosen": -1.9686000347137451, + "logits/rejected": -1.856993317604065, + "logps/chosen": -250.8971710205078, + "logps/rejected": -304.9432067871094, + "loss": 236.0771, + "losses_ref": -2.1682116985321045, + "ref_logps/chosen": -298.9543762207031, + "ref_logps/rejected": -163.42984008789062, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4805716872215271, + "rewards/margins": 1.895705223083496, + "rewards/rejected": -1.4151335954666138, + "step": 380, + "u": -1.7837505340576172, + "weight": 0.02852563187479973 + }, + { + "diff_generated": -156.00753784179688, + "epoch": 0.8060717089767077, + "grad_norm": 1211.688665180567, + "learning_rate": 1.490457092577968e-06, + "logits/chosen": -1.9195213317871094, + "logits/rejected": -1.8409401178359985, + "logps/chosen": -229.5646209716797, + "logps/rejected": -317.98406982421875, + "loss": 227.1155, + "losses_ref": -1.2010728120803833, + "ref_logps/chosen": -279.9380798339844, + "ref_logps/rejected": -161.97653198242188, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": 0.5037345290184021, + "rewards/margins": 2.063809871673584, + "rewards/rejected": -1.5600755214691162, + "step": 385, + "u": -1.6141672134399414, + "weight": 0.025375287979841232 + }, + { + "diff_generated": -152.0254669189453, + "epoch": 0.816540172729652, + "grad_norm": 1239.5744414457495, + "learning_rate": 1.4744213108345602e-06, + "logits/chosen": -2.0957484245300293, + "logits/rejected": -1.9671990871429443, + "logps/chosen": -254.6474151611328, + "logps/rejected": -313.9129333496094, + "loss": 233.3016, + "losses_ref": -4.944865703582764, + "ref_logps/chosen": -304.72125244140625, + "ref_logps/rejected": -161.88751220703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5007385015487671, + "rewards/margins": 2.0209929943084717, + "rewards/rejected": -1.5202546119689941, + "step": 390, + "u": -1.7007286548614502, + "weight": 0.05083342641592026 + }, + { + "diff_generated": -139.01864624023438, + "epoch": 0.8270086364825961, + "grad_norm": 1205.2400489160098, + "learning_rate": 1.4582265217274103e-06, + "logits/chosen": -1.9418761730194092, + "logits/rejected": -1.8380733728408813, + "logps/chosen": -247.5355682373047, + "logps/rejected": -302.6370849609375, + "loss": 239.286, + "losses_ref": -1.7620617151260376, + "ref_logps/chosen": -293.9803161621094, + "ref_logps/rejected": -163.61843872070312, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.464447557926178, + "rewards/margins": 1.8546336889266968, + "rewards/rejected": -1.3901864290237427, + "step": 395, + "u": -1.7652490139007568, + "weight": 0.021758217364549637 + }, + { + "diff_generated": -157.04232788085938, + "epoch": 0.8374771002355405, + "grad_norm": 1205.8199372142892, + "learning_rate": 1.4418781531128635e-06, + "logits/chosen": -2.0544238090515137, + "logits/rejected": -2.0346767902374268, + "logps/chosen": -234.49368286132812, + "logps/rejected": -326.8393249511719, + "loss": 233.9242, + "losses_ref": -1.8244788646697998, + "ref_logps/chosen": -282.6474609375, + "ref_logps/rejected": -169.79696655273438, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.48153790831565857, + "rewards/margins": 2.0519611835479736, + "rewards/rejected": -1.5704233646392822, + "step": 400, + "u": -1.6305999755859375, + "weight": 0.024077033624053 + }, + { + "diff_generated": -152.27468872070312, + "epoch": 0.8479455639884846, + "grad_norm": 1197.3605051527013, + "learning_rate": 1.4253816843210748e-06, + "logits/chosen": -1.9861503839492798, + "logits/rejected": -1.8832927942276, + "logps/chosen": -244.0829315185547, + "logps/rejected": -317.6984558105469, + "loss": 237.8302, + "losses_ref": -3.3451290130615234, + "ref_logps/chosen": -295.3381652832031, + "ref_logps/rejected": -165.42379760742188, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.51255202293396, + "rewards/margins": 2.035299062728882, + "rewards/rejected": -1.5227469205856323, + "step": 405, + "u": -1.4654412269592285, + "weight": 0.036893170326948166 + }, + { + "diff_generated": -154.22146606445312, + "epoch": 0.8584140277414289, + "grad_norm": 1201.6414819745964, + "learning_rate": 1.4087426443195547e-06, + "logits/chosen": -1.9021320343017578, + "logits/rejected": -1.8548545837402344, + "logps/chosen": -212.048583984375, + "logps/rejected": -310.93731689453125, + "loss": 223.4945, + "losses_ref": -1.363377571105957, + "ref_logps/chosen": -261.7601013183594, + "ref_logps/rejected": -156.71588134765625, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": 0.4971153736114502, + "rewards/margins": 2.039330005645752, + "rewards/rejected": -1.5422146320343018, + "step": 410, + "u": -1.499205470085144, + "weight": 0.03244508430361748 + }, + { + "diff_generated": -152.22821044921875, + "epoch": 0.8688824914943732, + "grad_norm": 1267.1758549974509, + "learning_rate": 1.391966609860075e-06, + "logits/chosen": -1.9990746974945068, + "logits/rejected": -1.9241716861724854, + "logps/chosen": -235.38150024414062, + "logps/rejected": -307.2711181640625, + "loss": 229.5926, + "losses_ref": -3.3139452934265137, + "ref_logps/chosen": -284.34393310546875, + "ref_logps/rejected": -155.04290771484375, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.48962411284446716, + "rewards/margins": 2.011906147003174, + "rewards/rejected": -1.5222820043563843, + "step": 415, + "u": -1.4567835330963135, + "weight": 0.04380001127719879 + }, + { + "diff_generated": -142.56979370117188, + "epoch": 0.8793509552473174, + "grad_norm": 1188.388834303343, + "learning_rate": 1.3750592036095619e-06, + "logits/chosen": -2.0134921073913574, + "logits/rejected": -1.8790652751922607, + "logps/chosen": -250.85546875, + "logps/rejected": -295.076416015625, + "loss": 235.3638, + "losses_ref": -3.0703201293945312, + "ref_logps/chosen": -298.8680725097656, + "ref_logps/rejected": -152.5066375732422, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4801257252693176, + "rewards/margins": 1.9058234691619873, + "rewards/rejected": -1.4256978034973145, + "step": 420, + "u": -1.4335013628005981, + "weight": 0.04213564842939377 + }, + { + "diff_generated": -147.99075317382812, + "epoch": 0.8898194190002617, + "grad_norm": 1470.580405072441, + "learning_rate": 1.3580260922655984e-06, + "logits/chosen": -1.9547443389892578, + "logits/rejected": -1.8864132165908813, + "logps/chosen": -229.1260223388672, + "logps/rejected": -308.96728515625, + "loss": 230.7763, + "losses_ref": -7.961075782775879, + "ref_logps/chosen": -278.4296875, + "ref_logps/rejected": -160.97654724121094, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": 0.4930366575717926, + "rewards/margins": 1.9729440212249756, + "rewards/rejected": -1.4799073934555054, + "step": 425, + "u": -0.9531173706054688, + "weight": 0.06897237151861191 + }, + { + "diff_generated": -150.89932250976562, + "epoch": 0.9002878827532059, + "grad_norm": 1247.106031502474, + "learning_rate": 1.3408729846571713e-06, + "logits/chosen": -1.9829527139663696, + "logits/rejected": -1.7790740728378296, + "logps/chosen": -250.89053344726562, + "logps/rejected": -306.49493408203125, + "loss": 227.0822, + "losses_ref": -3.353726625442505, + "ref_logps/chosen": -299.95831298828125, + "ref_logps/rejected": -155.59561157226562, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.49067792296409607, + "rewards/margins": 1.9996709823608398, + "rewards/rejected": -1.5089929103851318, + "step": 430, + "u": -1.7451012134552002, + "weight": 0.029034754261374474 + }, + { + "diff_generated": -161.92088317871094, + "epoch": 0.9107563465061502, + "grad_norm": 1215.392487626507, + "learning_rate": 1.3236056298312956e-06, + "logits/chosen": -1.8760721683502197, + "logits/rejected": -1.7741060256958008, + "logps/chosen": -230.2984161376953, + "logps/rejected": -322.80450439453125, + "loss": 219.6414, + "losses_ref": -2.6977756023406982, + "ref_logps/chosen": -276.49066162109375, + "ref_logps/rejected": -160.88360595703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4619222581386566, + "rewards/margins": 2.0811312198638916, + "rewards/rejected": -1.6192089319229126, + "step": 435, + "u": -1.3300695419311523, + "weight": 0.049107056111097336 + }, + { + "diff_generated": -169.64144897460938, + "epoch": 0.9212248102590945, + "grad_norm": 1199.549953331359, + "learning_rate": 1.3062298151261591e-06, + "logits/chosen": -1.8538296222686768, + "logits/rejected": -1.7674894332885742, + "logps/chosen": -247.5723114013672, + "logps/rejected": -334.99432373046875, + "loss": 228.2011, + "losses_ref": -3.9634671211242676, + "ref_logps/chosen": -293.4337463378906, + "ref_logps/rejected": -165.35289001464844, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4586148262023926, + "rewards/margins": 2.155029296875, + "rewards/rejected": -1.696414589881897, + "step": 440, + "u": -0.911568284034729, + "weight": 0.05750606581568718 + }, + { + "diff_generated": -176.77850341796875, + "epoch": 0.9316932740120387, + "grad_norm": 1134.6331161044943, + "learning_rate": 1.2887513642314372e-06, + "logits/chosen": -1.7472941875457764, + "logits/rejected": -1.6525627374649048, + "logps/chosen": -229.4337921142578, + "logps/rejected": -337.2396545410156, + "loss": 225.3431, + "losses_ref": -0.7648504376411438, + "ref_logps/chosen": -279.60003662109375, + "ref_logps/rejected": -160.4611358642578, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5016621947288513, + "rewards/margins": 2.2694473266601562, + "rewards/rejected": -1.7677850723266602, + "step": 445, + "u": -1.5227153301239014, + "weight": 0.024677513167262077 + }, + { + "diff_generated": -180.25607299804688, + "epoch": 0.942161737764983, + "grad_norm": 1254.0549089198466, + "learning_rate": 1.271176135236417e-06, + "logits/chosen": -1.8400166034698486, + "logits/rejected": -1.6989673376083374, + "logps/chosen": -255.73233032226562, + "logps/rejected": -341.2875671386719, + "loss": 233.639, + "losses_ref": -4.4961042404174805, + "ref_logps/chosen": -307.17620849609375, + "ref_logps/rejected": -161.03147888183594, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5144392251968384, + "rewards/margins": 2.317000150680542, + "rewards/rejected": -1.8025610446929932, + "step": 450, + "u": -1.3673655986785889, + "weight": 0.044325508177280426 + }, + { + "diff_generated": -192.91705322265625, + "epoch": 0.9526302015179272, + "grad_norm": 1206.737012082796, + "learning_rate": 1.2535100186666e-06, + "logits/chosen": -1.808547019958496, + "logits/rejected": -1.6920995712280273, + "logps/chosen": -254.8017578125, + "logps/rejected": -351.9469909667969, + "loss": 245.5463, + "losses_ref": -0.9527796506881714, + "ref_logps/chosen": -304.09619140625, + "ref_logps/rejected": -159.02993774414062, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4929441809654236, + "rewards/margins": 2.422114610671997, + "rewards/rejected": -1.9291703701019287, + "step": 455, + "u": -1.6784477233886719, + "weight": 0.02164948359131813 + }, + { + "diff_generated": -185.23989868164062, + "epoch": 0.9630986652708715, + "grad_norm": 1270.8466354695972, + "learning_rate": 1.2357589355094273e-06, + "logits/chosen": -1.8315858840942383, + "logits/rejected": -1.7088918685913086, + "logps/chosen": -269.20538330078125, + "logps/rejected": -338.2021179199219, + "loss": 246.9693, + "losses_ref": -3.263090133666992, + "ref_logps/chosen": -319.02618408203125, + "ref_logps/rejected": -152.9622039794922, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4982084631919861, + "rewards/margins": 2.350607395172119, + "rewards/rejected": -1.8523988723754883, + "step": 460, + "u": -1.5151453018188477, + "weight": 0.04330545663833618 + }, + { + "diff_generated": -187.97279357910156, + "epoch": 0.9735671290238157, + "grad_norm": 1262.5411111889684, + "learning_rate": 1.2179288352297982e-06, + "logits/chosen": -1.7451597452163696, + "logits/rejected": -1.6632684469223022, + "logps/chosen": -227.63937377929688, + "logps/rejected": -355.5631103515625, + "loss": 232.7903, + "losses_ref": -1.6858165264129639, + "ref_logps/chosen": -279.9383544921875, + "ref_logps/rejected": -167.59031677246094, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5229896903038025, + "rewards/margins": 2.4027175903320312, + "rewards/rejected": -1.8797279596328735, + "step": 465, + "u": -1.672014594078064, + "weight": 0.022105634212493896 + }, + { + "diff_generated": -206.70443725585938, + "epoch": 0.98403559277676, + "grad_norm": 1278.739837777923, + "learning_rate": 1.2000256937760445e-06, + "logits/chosen": -1.570615291595459, + "logits/rejected": -1.4970500469207764, + "logps/chosen": -237.1439208984375, + "logps/rejected": -359.52886962890625, + "loss": 239.3887, + "losses_ref": -2.77233624458313, + "ref_logps/chosen": -285.7524719238281, + "ref_logps/rejected": -152.82440185546875, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4860858917236328, + "rewards/margins": 2.5531301498413086, + "rewards/rejected": -2.067044496536255, + "step": 470, + "u": -1.4401319026947021, + "weight": 0.04976705089211464 + }, + { + "diff_generated": -199.87838745117188, + "epoch": 0.9945040565297043, + "grad_norm": 1152.4542486150297, + "learning_rate": 1.1820555115770255e-06, + "logits/chosen": -1.4883148670196533, + "logits/rejected": -1.505014419555664, + "logps/chosen": -225.6768798828125, + "logps/rejected": -358.5788269042969, + "loss": 226.472, + "losses_ref": -4.080103874206543, + "ref_logps/chosen": -273.79522705078125, + "ref_logps/rejected": -158.70046997070312, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.48118335008621216, + "rewards/margins": 2.4799671173095703, + "rewards/rejected": -1.998783826828003, + "step": 475, + "u": -1.5296900272369385, + "weight": 0.05185595899820328 + }, + { + "diff_generated": -208.0040283203125, + "epoch": 1.0049725202826485, + "grad_norm": 1301.7034712659586, + "learning_rate": 1.1640243115310217e-06, + "logits/chosen": -1.5732040405273438, + "logits/rejected": -1.5068919658660889, + "logps/chosen": -223.4159393310547, + "logps/rejected": -374.62591552734375, + "loss": 226.8136, + "losses_ref": -4.88800573348999, + "ref_logps/chosen": -293.29400634765625, + "ref_logps/rejected": -166.62188720703125, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6987806558609009, + "rewards/margins": 2.778820514678955, + "rewards/rejected": -2.080040454864502, + "step": 480, + "u": -1.9416106939315796, + "weight": 0.03623828664422035 + }, + { + "diff_generated": -215.4559783935547, + "epoch": 1.0154409840355927, + "grad_norm": 1355.7730407413364, + "learning_rate": 1.1459381369870972e-06, + "logits/chosen": -1.5292342901229858, + "logits/rejected": -1.4070460796356201, + "logps/chosen": -192.32717895507812, + "logps/rejected": -380.6830139160156, + "loss": 181.3888, + "losses_ref": -3.105132818222046, + "ref_logps/chosen": -294.8918762207031, + "ref_logps/rejected": -165.22702026367188, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0256469249725342, + "rewards/margins": 3.180206775665283, + "rewards/rejected": -2.15455961227417, + "step": 485, + "u": -2.883460521697998, + "weight": 0.04116251319646835 + }, + { + "diff_generated": -209.49990844726562, + "epoch": 1.025909447788537, + "grad_norm": 1391.787637976481, + "learning_rate": 1.1278030497196046e-06, + "logits/chosen": -1.2669024467468262, + "logits/rejected": -1.2282651662826538, + "logps/chosen": -166.51095581054688, + "logps/rejected": -365.71807861328125, + "loss": 180.3994, + "losses_ref": -2.922461986541748, + "ref_logps/chosen": -264.67388916015625, + "ref_logps/rejected": -156.21817016601562, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9816292524337769, + "rewards/margins": 3.0766279697418213, + "rewards/rejected": -2.094998836517334, + "step": 490, + "u": -3.193206310272217, + "weight": 0.029411697760224342 + }, + { + "diff_generated": -208.9027099609375, + "epoch": 1.0363779115414813, + "grad_norm": 1404.013865827238, + "learning_rate": 1.1096251278965172e-06, + "logits/chosen": -1.229707956314087, + "logits/rejected": -1.2453272342681885, + "logps/chosen": -167.49026489257812, + "logps/rejected": -368.3177185058594, + "loss": 166.3708, + "losses_ref": -5.491534233093262, + "ref_logps/chosen": -266.75250244140625, + "ref_logps/rejected": -159.41500854492188, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9926217794418335, + "rewards/margins": 3.081648588180542, + "rewards/rejected": -2.089026927947998, + "step": 495, + "u": -1.6660839319229126, + "weight": 0.06474236398935318 + }, + { + "diff_generated": -213.4435577392578, + "epoch": 1.0468463752944255, + "grad_norm": 1314.8398697189618, + "learning_rate": 1.0914104640422679e-06, + "logits/chosen": -1.391204595565796, + "logits/rejected": -1.3654673099517822, + "logps/chosen": -161.88082885742188, + "logps/rejected": -374.336669921875, + "loss": 175.893, + "losses_ref": -1.2716583013534546, + "ref_logps/chosen": -257.99908447265625, + "ref_logps/rejected": -160.89312744140625, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9611825942993164, + "rewards/margins": 3.0956180095672607, + "rewards/rejected": -2.1344354152679443, + "step": 500, + "u": -3.1927852630615234, + "weight": 0.015900352969765663 + }, + { + "diff_generated": -206.3148956298828, + "epoch": 1.05731483904737, + "grad_norm": 1445.075187818513, + "learning_rate": 1.0731651629957721e-06, + "logits/chosen": -1.3434970378875732, + "logits/rejected": -1.305525541305542, + "logps/chosen": -192.31558227539062, + "logps/rejected": -378.03851318359375, + "loss": 185.1733, + "losses_ref": -2.9738333225250244, + "ref_logps/chosen": -297.85302734375, + "ref_logps/rejected": -171.7236328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.055374264717102, + "rewards/margins": 3.118523359298706, + "rewards/rejected": -2.0631489753723145, + "step": 505, + "u": -3.4035236835479736, + "weight": 0.024688560515642166 + }, + { + "diff_generated": -186.8843231201172, + "epoch": 1.067783302800314, + "grad_norm": 1181.3904875833675, + "learning_rate": 1.0548953398643274e-06, + "logits/chosen": -1.566375970840454, + "logits/rejected": -1.4381110668182373, + "logps/chosen": -193.49539184570312, + "logps/rejected": -350.2602233886719, + "loss": 179.7564, + "losses_ref": -2.258450984954834, + "ref_logps/chosen": -297.76202392578125, + "ref_logps/rejected": -163.3759002685547, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0426661968231201, + "rewards/margins": 2.9115095138549805, + "rewards/rejected": -1.8688430786132812, + "step": 510, + "u": -2.4679007530212402, + "weight": 0.044156283140182495 + }, + { + "diff_generated": -200.12066650390625, + "epoch": 1.0782517665532583, + "grad_norm": 1297.9609792649137, + "learning_rate": 1.0366071179740706e-06, + "logits/chosen": -1.6367733478546143, + "logits/rejected": -1.4493190050125122, + "logps/chosen": -209.0851593017578, + "logps/rejected": -365.74053955078125, + "loss": 186.0993, + "losses_ref": -3.8747305870056152, + "ref_logps/chosen": -317.296630859375, + "ref_logps/rejected": -165.619873046875, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0821150541305542, + "rewards/margins": 3.0833218097686768, + "rewards/rejected": -2.001206636428833, + "step": 515, + "u": -2.938070297241211, + "weight": 0.03331952169537544 + }, + { + "diff_generated": -207.33700561523438, + "epoch": 1.0887202303062025, + "grad_norm": 1362.4544964162274, + "learning_rate": 1.0183066268176775e-06, + "logits/chosen": -1.541912317276001, + "logits/rejected": -1.406719446182251, + "logps/chosen": -204.0404052734375, + "logps/rejected": -376.406494140625, + "loss": 202.1916, + "losses_ref": -0.5564223527908325, + "ref_logps/chosen": -307.4422912597656, + "ref_logps/rejected": -169.06948852539062, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.034018874168396, + "rewards/margins": 3.107388973236084, + "rewards/rejected": -2.0733699798583984, + "step": 520, + "u": -3.34126353263855, + "weight": 0.007363998796790838 + }, + { + "diff_generated": -209.34707641601562, + "epoch": 1.0991886940591469, + "grad_norm": 1329.928673259645, + "learning_rate": 1e-06, + "logits/chosen": -1.4774454832077026, + "logits/rejected": -1.3976843357086182, + "logps/chosen": -190.63027954101562, + "logps/rejected": -365.6118469238281, + "loss": 191.7027, + "losses_ref": -4.4078168869018555, + "ref_logps/chosen": -289.65625, + "ref_logps/rejected": -156.2647705078125, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.990260124206543, + "rewards/margins": 3.083730936050415, + "rewards/rejected": -2.093470811843872, + "step": 525, + "u": -2.4628920555114746, + "weight": 0.034967873245477676 + }, + { + "diff_generated": -219.25039672851562, + "epoch": 1.109657157812091, + "grad_norm": 1263.6571441007575, + "learning_rate": 9.816933731823228e-07, + "logits/chosen": -1.48972749710083, + "logits/rejected": -1.3531391620635986, + "logps/chosen": -184.37472534179688, + "logps/rejected": -382.6318359375, + "loss": 179.9115, + "losses_ref": -4.217190742492676, + "ref_logps/chosen": -283.9466857910156, + "ref_logps/rejected": -163.38145446777344, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.995719313621521, + "rewards/margins": 3.188223361968994, + "rewards/rejected": -2.1925039291381836, + "step": 530, + "u": -2.7124040126800537, + "weight": 0.03560812398791313 + }, + { + "diff_generated": -222.4695587158203, + "epoch": 1.1201256215650353, + "grad_norm": 1387.0782441687347, + "learning_rate": 9.633928820259293e-07, + "logits/chosen": -1.2347859144210815, + "logits/rejected": -1.2332684993743896, + "logps/chosen": -162.6536102294922, + "logps/rejected": -388.9007263183594, + "loss": 162.1828, + "losses_ref": -2.344147205352783, + "ref_logps/chosen": -256.69085693359375, + "ref_logps/rejected": -166.43115234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9403725862503052, + "rewards/margins": 3.1650681495666504, + "rewards/rejected": -2.2246956825256348, + "step": 535, + "u": -2.9149539470672607, + "weight": 0.036711305379867554 + }, + { + "diff_generated": -220.98583984375, + "epoch": 1.1305940853179797, + "grad_norm": 1297.6784365239848, + "learning_rate": 9.451046601356725e-07, + "logits/chosen": -1.3270328044891357, + "logits/rejected": -1.2543261051177979, + "logps/chosen": -174.17941284179688, + "logps/rejected": -378.3968200683594, + "loss": 168.7943, + "losses_ref": -5.623769760131836, + "ref_logps/chosen": -267.5427551269531, + "ref_logps/rejected": -157.4110107421875, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9336336255073547, + "rewards/margins": 3.1434922218322754, + "rewards/rejected": -2.2098584175109863, + "step": 540, + "u": -2.444173574447632, + "weight": 0.07986775040626526 + }, + { + "diff_generated": -227.32980346679688, + "epoch": 1.1410625490709239, + "grad_norm": 1205.4234546771595, + "learning_rate": 9.268348370042281e-07, + "logits/chosen": -1.3813427686691284, + "logits/rejected": -1.318725347518921, + "logps/chosen": -174.5741424560547, + "logps/rejected": -399.95172119140625, + "loss": 168.9783, + "losses_ref": -3.7193565368652344, + "ref_logps/chosen": -273.3332824707031, + "ref_logps/rejected": -172.62191772460938, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9875916242599487, + "rewards/margins": 3.260889768600464, + "rewards/rejected": -2.2732982635498047, + "step": 545, + "u": -3.2109789848327637, + "weight": 0.042879991233348846 + }, + { + "diff_generated": -249.11709594726562, + "epoch": 1.151531012823868, + "grad_norm": 1266.8954047572045, + "learning_rate": 9.085895359577323e-07, + "logits/chosen": -1.33551824092865, + "logits/rejected": -1.3183876276016235, + "logps/chosen": -167.4661865234375, + "logps/rejected": -403.6305236816406, + "loss": 174.6021, + "losses_ref": -1.4713778495788574, + "ref_logps/chosen": -267.08013916015625, + "ref_logps/rejected": -154.513427734375, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9961398243904114, + "rewards/margins": 3.4873108863830566, + "rewards/rejected": -2.491170883178711, + "step": 550, + "u": -2.983215093612671, + "weight": 0.02549784444272518 + }, + { + "diff_generated": -220.6905059814453, + "epoch": 1.1619994765768125, + "grad_norm": 1242.623006879505, + "learning_rate": 8.903748721034826e-07, + "logits/chosen": -1.410308599472046, + "logits/rejected": -1.3436871767044067, + "logps/chosen": -175.43826293945312, + "logps/rejected": -392.7843322753906, + "loss": 178.3087, + "losses_ref": -2.583522081375122, + "ref_logps/chosen": -277.257080078125, + "ref_logps/rejected": -172.09388732910156, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.018188238143921, + "rewards/margins": 3.225093126296997, + "rewards/rejected": -2.206904649734497, + "step": 555, + "u": -2.3211851119995117, + "weight": 0.039024386554956436 + }, + { + "diff_generated": -246.51620483398438, + "epoch": 1.1724679403297567, + "grad_norm": 1315.7734920897904, + "learning_rate": 8.721969502803953e-07, + "logits/chosen": -1.4283636808395386, + "logits/rejected": -1.4595166444778442, + "logps/chosen": -190.37667846679688, + "logps/rejected": -401.78594970703125, + "loss": 169.0395, + "losses_ref": -0.9799969792366028, + "ref_logps/chosen": -288.49481201171875, + "ref_logps/rejected": -155.26974487304688, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9811817407608032, + "rewards/margins": 3.4463438987731934, + "rewards/rejected": -2.4651618003845215, + "step": 560, + "u": -3.4851043224334717, + "weight": 0.011031994596123695 + }, + { + "diff_generated": -237.05813598632812, + "epoch": 1.1829364040827008, + "grad_norm": 1324.6420978322203, + "learning_rate": 8.540618630129027e-07, + "logits/chosen": -1.5112595558166504, + "logits/rejected": -1.4447729587554932, + "logps/chosen": -197.54592895507812, + "logps/rejected": -408.50360107421875, + "loss": 180.7105, + "losses_ref": -8.419300079345703, + "ref_logps/chosen": -298.6998596191406, + "ref_logps/rejected": -171.4455108642578, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.011539340019226, + "rewards/margins": 3.3821206092834473, + "rewards/rejected": -2.3705811500549316, + "step": 565, + "u": -3.0135345458984375, + "weight": 0.03675166517496109 + }, + { + "diff_generated": -226.0189666748047, + "epoch": 1.193404867835645, + "grad_norm": 1290.2940777725041, + "learning_rate": 8.359756884689783e-07, + "logits/chosen": -1.5810168981552124, + "logits/rejected": -1.4695533514022827, + "logps/chosen": -179.12496948242188, + "logps/rejected": -392.3472595214844, + "loss": 183.5485, + "losses_ref": -1.6247104406356812, + "ref_logps/chosen": -278.8708801269531, + "ref_logps/rejected": -166.32827758789062, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9974590539932251, + "rewards/margins": 3.257648468017578, + "rewards/rejected": -2.2601895332336426, + "step": 570, + "u": -3.080786943435669, + "weight": 0.019241400063037872 + }, + { + "diff_generated": -213.7429962158203, + "epoch": 1.2038733315885894, + "grad_norm": 1344.148507837478, + "learning_rate": 8.179444884229744e-07, + "logits/chosen": -1.4880825281143188, + "logits/rejected": -1.502333641052246, + "logps/chosen": -189.47103881835938, + "logps/rejected": -378.13275146484375, + "loss": 171.5777, + "losses_ref": -0.9622389674186707, + "ref_logps/chosen": -284.98492431640625, + "ref_logps/rejected": -164.38975524902344, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9551390409469604, + "rewards/margins": 3.092568874359131, + "rewards/rejected": -2.137429714202881, + "step": 575, + "u": -3.1298766136169434, + "weight": 0.013516530394554138 + }, + { + "diff_generated": -231.77328491210938, + "epoch": 1.2143417953415336, + "grad_norm": 1304.8866597655287, + "learning_rate": 7.999743062239557e-07, + "logits/chosen": -1.4784562587738037, + "logits/rejected": -1.5664056539535522, + "logps/chosen": -176.44296264648438, + "logps/rejected": -421.82135009765625, + "loss": 181.4369, + "losses_ref": -1.1828618049621582, + "ref_logps/chosen": -274.30767822265625, + "ref_logps/rejected": -190.04803466796875, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9786470532417297, + "rewards/margins": 3.296379804611206, + "rewards/rejected": -2.3177332878112793, + "step": 580, + "u": -3.0596017837524414, + "weight": 0.012155565433204174 + }, + { + "diff_generated": -220.5200653076172, + "epoch": 1.2248102590944778, + "grad_norm": 1320.435055959004, + "learning_rate": 7.820711647702017e-07, + "logits/chosen": -1.4778623580932617, + "logits/rejected": -1.5001682043075562, + "logps/chosen": -168.55393981933594, + "logps/rejected": -381.0849304199219, + "loss": 177.0697, + "losses_ref": -2.307084560394287, + "ref_logps/chosen": -260.8992004394531, + "ref_logps/rejected": -160.5648651123047, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9234523773193359, + "rewards/margins": 3.128653049468994, + "rewards/rejected": -2.205200672149658, + "step": 585, + "u": -2.9381721019744873, + "weight": 0.03427546098828316 + }, + { + "diff_generated": -207.7950439453125, + "epoch": 1.235278722847422, + "grad_norm": 1261.3527727961057, + "learning_rate": 7.642410644905726e-07, + "logits/chosen": -1.4036446809768677, + "logits/rejected": -1.4330257177352905, + "logps/chosen": -171.85134887695312, + "logps/rejected": -370.4696960449219, + "loss": 176.7884, + "losses_ref": -2.213914394378662, + "ref_logps/chosen": -269.0211486816406, + "ref_logps/rejected": -162.6746826171875, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9716979265213013, + "rewards/margins": 3.0496482849121094, + "rewards/rejected": -2.0779504776000977, + "step": 590, + "u": -2.40622878074646, + "weight": 0.04459633305668831 + }, + { + "diff_generated": -230.44302368164062, + "epoch": 1.2457471866003664, + "grad_norm": 1305.239745538134, + "learning_rate": 7.464899813334e-07, + "logits/chosen": -1.261853575706482, + "logits/rejected": -1.2570579051971436, + "logps/chosen": -181.5194091796875, + "logps/rejected": -393.291259765625, + "loss": 177.5223, + "losses_ref": -4.516595840454102, + "ref_logps/chosen": -278.3271789550781, + "ref_logps/rejected": -162.84823608398438, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9680774807929993, + "rewards/margins": 3.272507905960083, + "rewards/rejected": -2.3044302463531494, + "step": 595, + "u": -2.7008533477783203, + "weight": 0.05883873626589775 + }, + { + "diff_generated": -232.184326171875, + "epoch": 1.2562156503533106, + "grad_norm": 1268.765281131021, + "learning_rate": 7.288238647635829e-07, + "logits/chosen": -1.4351574182510376, + "logits/rejected": -1.2977135181427002, + "logps/chosen": -184.0857696533203, + "logps/rejected": -400.95361328125, + "loss": 177.9198, + "losses_ref": -3.803828001022339, + "ref_logps/chosen": -284.0093078613281, + "ref_logps/rejected": -168.76925659179688, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9992351531982422, + "rewards/margins": 3.3210787773132324, + "rewards/rejected": -2.3218436241149902, + "step": 600, + "u": -2.89520263671875, + "weight": 0.026231110095977783 + }, + { + "diff_generated": -198.3398895263672, + "epoch": 1.2666841141062548, + "grad_norm": 1202.5442238394803, + "learning_rate": 7.112486357685631e-07, + "logits/chosen": -1.499137043952942, + "logits/rejected": -1.4640613794326782, + "logps/chosen": -186.61227416992188, + "logps/rejected": -356.29132080078125, + "loss": 187.6658, + "losses_ref": -4.061453819274902, + "ref_logps/chosen": -287.03717041015625, + "ref_logps/rejected": -157.95144653320312, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.004248857498169, + "rewards/margins": 2.987647771835327, + "rewards/rejected": -1.9833987951278687, + "step": 605, + "u": -3.231105089187622, + "weight": 0.03989076986908913 + }, + { + "diff_generated": -219.8319091796875, + "epoch": 1.2771525778591992, + "grad_norm": 1294.217440674336, + "learning_rate": 6.937701848738407e-07, + "logits/chosen": -1.41506028175354, + "logits/rejected": -1.4094430208206177, + "logps/chosen": -169.46595764160156, + "logps/rejected": -384.82025146484375, + "loss": 167.9688, + "losses_ref": -1.3657054901123047, + "ref_logps/chosen": -266.1152648925781, + "ref_logps/rejected": -164.98837280273438, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9664928317070007, + "rewards/margins": 3.1648120880126953, + "rewards/rejected": -2.19831919670105, + "step": 610, + "u": -3.3554062843322754, + "weight": 0.01739688031375408 + }, + { + "diff_generated": -217.53970336914062, + "epoch": 1.2876210416121434, + "grad_norm": 1337.3093609895052, + "learning_rate": 6.763943701687045e-07, + "logits/chosen": -1.633599877357483, + "logits/rejected": -1.5192573070526123, + "logps/chosen": -191.32760620117188, + "logps/rejected": -387.1475524902344, + "loss": 183.0882, + "losses_ref": -0.359982430934906, + "ref_logps/chosen": -299.5060119628906, + "ref_logps/rejected": -169.60787963867188, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0817840099334717, + "rewards/margins": 3.257181167602539, + "rewards/rejected": -2.1753971576690674, + "step": 615, + "u": -3.340681552886963, + "weight": 0.008451832458376884 + }, + { + "diff_generated": -208.6597137451172, + "epoch": 1.2980895053650876, + "grad_norm": 1324.2598041902163, + "learning_rate": 6.591270153428288e-07, + "logits/chosen": -1.6454055309295654, + "logits/rejected": -1.489946961402893, + "logps/chosen": -191.6290283203125, + "logps/rejected": -364.0921325683594, + "loss": 178.0635, + "losses_ref": -2.520381450653076, + "ref_logps/chosen": -295.8542785644531, + "ref_logps/rejected": -155.4324188232422, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.042252540588379, + "rewards/margins": 3.128849506378174, + "rewards/rejected": -2.086596965789795, + "step": 620, + "u": -2.7204320430755615, + "weight": 0.02861974760890007 + }, + { + "diff_generated": -196.55752563476562, + "epoch": 1.308557969118032, + "grad_norm": 1344.8788218911382, + "learning_rate": 6.419739077344016e-07, + "logits/chosen": -1.5530303716659546, + "logits/rejected": -1.423179030418396, + "logps/chosen": -200.18063354492188, + "logps/rejected": -360.1055603027344, + "loss": 179.8101, + "losses_ref": -3.9870200157165527, + "ref_logps/chosen": -300.4015197753906, + "ref_logps/rejected": -163.5480194091797, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0022084712982178, + "rewards/margins": 2.9677836894989014, + "rewards/rejected": -1.9655752182006836, + "step": 625, + "u": -2.844027042388916, + "weight": 0.038288719952106476 + }, + { + "diff_generated": -198.09622192382812, + "epoch": 1.3190264328709762, + "grad_norm": 1205.0358284390313, + "learning_rate": 6.24940796390438e-07, + "logits/chosen": -1.5373382568359375, + "logits/rejected": -1.444549322128296, + "logps/chosen": -174.25350952148438, + "logps/rejected": -362.83953857421875, + "loss": 166.5968, + "losses_ref": -2.2841248512268066, + "ref_logps/chosen": -274.06365966796875, + "ref_logps/rejected": -164.74331665039062, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9981018900871277, + "rewards/margins": 2.9790642261505127, + "rewards/rejected": -1.9809621572494507, + "step": 630, + "u": -2.492745876312256, + "weight": 0.01923806592822075 + }, + { + "diff_generated": -214.6193389892578, + "epoch": 1.3294948966239204, + "grad_norm": 1314.9134741026285, + "learning_rate": 6.08033390139925e-07, + "logits/chosen": -1.4583691358566284, + "logits/rejected": -1.290028691291809, + "logps/chosen": -190.0717315673828, + "logps/rejected": -369.1700439453125, + "loss": 192.5529, + "losses_ref": -0.966667652130127, + "ref_logps/chosen": -293.4891662597656, + "ref_logps/rejected": -154.55068969726562, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0341745615005493, + "rewards/margins": 3.180367946624756, + "rewards/rejected": -2.146193265914917, + "step": 635, + "u": -2.8957810401916504, + "weight": 0.020022699609398842 + }, + { + "diff_generated": -209.5125732421875, + "epoch": 1.3399633603768648, + "grad_norm": 1311.833771803947, + "learning_rate": 5.912573556804452e-07, + "logits/chosen": -1.4464821815490723, + "logits/rejected": -1.3825037479400635, + "logps/chosen": -181.79258728027344, + "logps/rejected": -380.0604553222656, + "loss": 186.6832, + "losses_ref": -2.0217666625976562, + "ref_logps/chosen": -283.5255126953125, + "ref_logps/rejected": -170.54788208007812, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.017329216003418, + "rewards/margins": 3.112454891204834, + "rewards/rejected": -2.095125675201416, + "step": 640, + "u": -2.146329164505005, + "weight": 0.051374662667512894 + }, + { + "diff_generated": -233.44900512695312, + "epoch": 1.350431824129809, + "grad_norm": 1320.9978857185588, + "learning_rate": 5.746183156789252e-07, + "logits/chosen": -1.4467910528182983, + "logits/rejected": -1.2174046039581299, + "logps/chosen": -190.71127319335938, + "logps/rejected": -401.9010314941406, + "loss": 181.6372, + "losses_ref": -1.3231620788574219, + "ref_logps/chosen": -301.30584716796875, + "ref_logps/rejected": -168.45204162597656, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1059458255767822, + "rewards/margins": 3.4404358863830566, + "rewards/rejected": -2.3344900608062744, + "step": 645, + "u": -2.5961060523986816, + "weight": 0.031209224835038185 + }, + { + "diff_generated": -218.05880737304688, + "epoch": 1.3609002878827532, + "grad_norm": 1268.0769992434364, + "learning_rate": 5.581218468871365e-07, + "logits/chosen": -1.2198398113250732, + "logits/rejected": -1.3189094066619873, + "logps/chosen": -157.86666870117188, + "logps/rejected": -376.75433349609375, + "loss": 168.9012, + "losses_ref": -2.4989333152770996, + "ref_logps/chosen": -252.76400756835938, + "ref_logps/rejected": -158.69552612304688, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9489734768867493, + "rewards/margins": 3.129561424255371, + "rewards/rejected": -2.1805882453918457, + "step": 650, + "u": -2.92409086227417, + "weight": 0.0428018681704998 + }, + { + "diff_generated": -235.935546875, + "epoch": 1.3713687516356974, + "grad_norm": 1347.742524924812, + "learning_rate": 5.417734782725896e-07, + "logits/chosen": -1.2961053848266602, + "logits/rejected": -1.261878252029419, + "logps/chosen": -177.77523803710938, + "logps/rejected": -389.1588134765625, + "loss": 179.405, + "losses_ref": -1.0838311910629272, + "ref_logps/chosen": -277.2697448730469, + "ref_logps/rejected": -153.2233123779297, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9949450492858887, + "rewards/margins": 3.3543007373809814, + "rewards/rejected": -2.3593554496765137, + "step": 655, + "u": -3.1044487953186035, + "weight": 0.017367416992783546 + }, + { + "diff_generated": -211.7913055419922, + "epoch": 1.3818372153886418, + "grad_norm": 1311.034191538654, + "learning_rate": 5.255786891654399e-07, + "logits/chosen": -1.2746165990829468, + "logits/rejected": -1.2540855407714844, + "logps/chosen": -170.9514923095703, + "logps/rejected": -376.79229736328125, + "loss": 174.0495, + "losses_ref": -2.949699878692627, + "ref_logps/chosen": -268.7665100097656, + "ref_logps/rejected": -165.0010223388672, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9781501889228821, + "rewards/margins": 3.0960631370544434, + "rewards/rejected": -2.117912769317627, + "step": 660, + "u": -1.9965251684188843, + "weight": 0.04027215391397476 + }, + { + "diff_generated": -224.0879364013672, + "epoch": 1.392305679141586, + "grad_norm": 1328.7070235599076, + "learning_rate": 5.095429074220319e-07, + "logits/chosen": -1.2053465843200684, + "logits/rejected": -1.1557897329330444, + "logps/chosen": -175.30589294433594, + "logps/rejected": -393.43218994140625, + "loss": 184.6881, + "losses_ref": -3.8794121742248535, + "ref_logps/chosen": -274.28826904296875, + "ref_logps/rejected": -169.34422302246094, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9898236989974976, + "rewards/margins": 3.230703353881836, + "rewards/rejected": -2.240879535675049, + "step": 665, + "u": -3.0530405044555664, + "weight": 0.03465485945343971 + }, + { + "diff_generated": -240.96484375, + "epoch": 1.4027741428945302, + "grad_norm": 1353.971116750616, + "learning_rate": 4.936715076056974e-07, + "logits/chosen": -1.242436408996582, + "logits/rejected": -1.24913489818573, + "logps/chosen": -183.4954833984375, + "logps/rejected": -405.70050048828125, + "loss": 171.9422, + "losses_ref": -0.8431612253189087, + "ref_logps/chosen": -284.3236999511719, + "ref_logps/rejected": -164.7356414794922, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0082820653915405, + "rewards/margins": 3.4179306030273438, + "rewards/rejected": -2.4096481800079346, + "step": 670, + "u": -3.2680907249450684, + "weight": 0.007226690649986267 + }, + { + "diff_generated": -230.71115112304688, + "epoch": 1.4132426066474744, + "grad_norm": 1301.2684317901687, + "learning_rate": 4.779698091854098e-07, + "logits/chosen": -1.4362276792526245, + "logits/rejected": -1.2898997068405151, + "logps/chosen": -196.05447387695312, + "logps/rejected": -400.25994873046875, + "loss": 193.0132, + "losses_ref": -0.4112131595611572, + "ref_logps/chosen": -306.9317321777344, + "ref_logps/rejected": -169.54879760742188, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1087725162506104, + "rewards/margins": 3.415884494781494, + "rewards/rejected": -2.307111978530884, + "step": 675, + "u": -3.2834739685058594, + "weight": 0.006595195736736059 + }, + { + "diff_generated": -205.662841796875, + "epoch": 1.4237110704004188, + "grad_norm": 1344.3331479958706, + "learning_rate": 4.624430747529102e-07, + "logits/chosen": -1.3598095178604126, + "logits/rejected": -1.158661961555481, + "logps/chosen": -205.39236450195312, + "logps/rejected": -369.7188720703125, + "loss": 181.4401, + "losses_ref": -1.5265072584152222, + "ref_logps/chosen": -313.2685546875, + "ref_logps/rejected": -164.05599975585938, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0787618160247803, + "rewards/margins": 3.135390520095825, + "rewards/rejected": -2.056628465652466, + "step": 680, + "u": -3.1280694007873535, + "weight": 0.024629075080156326 + }, + { + "diff_generated": -223.96005249023438, + "epoch": 1.434179534153363, + "grad_norm": 1420.0899808408303, + "learning_rate": 4.4709650825889277e-07, + "logits/chosen": -1.202007532119751, + "logits/rejected": -1.1467583179473877, + "logps/chosen": -161.4755859375, + "logps/rejected": -394.6024475097656, + "loss": 181.6898, + "losses_ref": -0.6423639059066772, + "ref_logps/chosen": -258.74224853515625, + "ref_logps/rejected": -170.64236450195312, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9726665616035461, + "rewards/margins": 3.2122673988342285, + "rewards/rejected": -2.239600658416748, + "step": 685, + "u": -2.6377460956573486, + "weight": 0.008045530878007412 + }, + { + "diff_generated": -199.37355041503906, + "epoch": 1.4446479979063072, + "grad_norm": 1308.6573761420323, + "learning_rate": 4.3193525326884426e-07, + "logits/chosen": -1.3359885215759277, + "logits/rejected": -1.2320592403411865, + "logps/chosen": -199.9832000732422, + "logps/rejected": -364.55865478515625, + "loss": 197.232, + "losses_ref": -2.2240054607391357, + "ref_logps/chosen": -303.2825927734375, + "ref_logps/rejected": -165.18508911132812, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": 1.0329937934875488, + "rewards/margins": 3.026729106903076, + "rewards/rejected": -1.9937355518341064, + "step": 690, + "u": -3.028186559677124, + "weight": 0.02633347176015377 + }, + { + "diff_generated": -224.0160369873047, + "epoch": 1.4551164616592516, + "grad_norm": 1299.079432778448, + "learning_rate": 4.1696439123912406e-07, + "logits/chosen": -1.2223880290985107, + "logits/rejected": -1.209564447402954, + "logps/chosen": -174.464111328125, + "logps/rejected": -393.27691650390625, + "loss": 178.2965, + "losses_ref": -4.651436805725098, + "ref_logps/chosen": -266.0323486328125, + "ref_logps/rejected": -169.26083374023438, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.915682315826416, + "rewards/margins": 3.1558427810668945, + "rewards/rejected": -2.2401604652404785, + "step": 695, + "u": -2.1582460403442383, + "weight": 0.050126731395721436 + }, + { + "diff_generated": -229.42153930664062, + "epoch": 1.4655849254121958, + "grad_norm": 1181.3986183050397, + "learning_rate": 4.0218893981385927e-07, + "logits/chosen": -1.2920024394989014, + "logits/rejected": -1.2460237741470337, + "logps/chosen": -169.0710906982422, + "logps/rejected": -389.62451171875, + "loss": 185.8502, + "losses_ref": -2.0431206226348877, + "ref_logps/chosen": -263.98992919921875, + "ref_logps/rejected": -160.20298767089844, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9491885900497437, + "rewards/margins": 3.243403196334839, + "rewards/rejected": -2.294214963912964, + "step": 700, + "u": -2.900634765625, + "weight": 0.029426846653223038 + }, + { + "diff_generated": -238.50405883789062, + "epoch": 1.47605338916514, + "grad_norm": 1395.6758572737517, + "learning_rate": 3.87613851143229e-07, + "logits/chosen": -1.321358323097229, + "logits/rejected": -1.2150487899780273, + "logps/chosen": -193.1901397705078, + "logps/rejected": -408.7565002441406, + "loss": 180.6914, + "losses_ref": -7.425305366516113, + "ref_logps/chosen": -295.8336486816406, + "ref_logps/rejected": -170.25244140625, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0264348983764648, + "rewards/margins": 3.411475419998169, + "rewards/rejected": -2.385040760040283, + "step": 705, + "u": -2.5025954246520996, + "weight": 0.05010579898953438 + }, + { + "diff_generated": -232.7953643798828, + "epoch": 1.4865218529180844, + "grad_norm": 1298.8055689658759, + "learning_rate": 3.7324401022369744e-07, + "logits/chosen": -1.322563886642456, + "logits/rejected": -1.1327731609344482, + "logps/chosen": -194.57736206054688, + "logps/rejected": -386.1799011230469, + "loss": 178.1232, + "losses_ref": -1.3739917278289795, + "ref_logps/chosen": -296.6303405761719, + "ref_logps/rejected": -153.38453674316406, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0205297470092773, + "rewards/margins": 3.3484835624694824, + "rewards/rejected": -2.327953815460205, + "step": 710, + "u": -3.248492479324341, + "weight": 0.022353414446115494 + }, + { + "diff_generated": -204.0673370361328, + "epoch": 1.4969903166710286, + "grad_norm": 1434.009703095031, + "learning_rate": 3.5908423326075455e-07, + "logits/chosen": -1.2674996852874756, + "logits/rejected": -1.242331862449646, + "logps/chosen": -167.33718872070312, + "logps/rejected": -369.3961486816406, + "loss": 183.2372, + "losses_ref": -1.1576902866363525, + "ref_logps/chosen": -261.9808044433594, + "ref_logps/rejected": -165.32882690429688, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9464362263679504, + "rewards/margins": 2.987109661102295, + "rewards/rejected": -2.0406734943389893, + "step": 715, + "u": -2.994257688522339, + "weight": 0.022263679653406143 + }, + { + "diff_generated": -233.936767578125, + "epoch": 1.5074587804239727, + "grad_norm": 1304.2767992641454, + "learning_rate": 3.45139266054715e-07, + "logits/chosen": -1.318178415298462, + "logits/rejected": -1.1334383487701416, + "logps/chosen": -197.61227416992188, + "logps/rejected": -397.12127685546875, + "loss": 183.3899, + "losses_ref": -1.6034200191497803, + "ref_logps/chosen": -309.3571472167969, + "ref_logps/rejected": -163.1844940185547, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1174486875534058, + "rewards/margins": 3.4568161964416504, + "rewards/rejected": -2.3393678665161133, + "step": 720, + "u": -3.3942806720733643, + "weight": 0.019716758280992508 + }, + { + "diff_generated": -244.8833770751953, + "epoch": 1.5179272441769172, + "grad_norm": 1236.5966034907726, + "learning_rate": 3.314137824101111e-07, + "logits/chosen": -1.306779384613037, + "logits/rejected": -1.1290355920791626, + "logps/chosen": -218.06015014648438, + "logps/rejected": -403.7084045410156, + "loss": 191.625, + "losses_ref": -2.257856845855713, + "ref_logps/chosen": -318.39056396484375, + "ref_logps/rejected": -158.82498168945312, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0033042430877686, + "rewards/margins": 3.4521377086639404, + "rewards/rejected": -2.448833703994751, + "step": 725, + "u": -3.3267006874084473, + "weight": 0.032559871673583984 + }, + { + "diff_generated": -222.01101684570312, + "epoch": 1.5283957079298613, + "grad_norm": 1211.6699328046157, + "learning_rate": 3.179123825692178e-07, + "logits/chosen": -1.248240351676941, + "logits/rejected": -1.091903805732727, + "logps/chosen": -175.27281188964844, + "logps/rejected": -383.36309814453125, + "loss": 173.3922, + "losses_ref": -5.464686393737793, + "ref_logps/chosen": -273.9178771972656, + "ref_logps/rejected": -161.35206604003906, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9864505529403687, + "rewards/margins": 3.2065606117248535, + "rewards/rejected": -2.2201101779937744, + "step": 730, + "u": -2.946007013320923, + "weight": 0.04170671105384827 + }, + { + "diff_generated": -220.7677764892578, + "epoch": 1.5388641716828055, + "grad_norm": 1300.7738080642184, + "learning_rate": 3.0463959167023335e-07, + "logits/chosen": -1.2869834899902344, + "logits/rejected": -1.1894266605377197, + "logps/chosen": -182.8350372314453, + "logps/rejected": -379.5199890136719, + "loss": 171.4061, + "losses_ref": -4.5947465896606445, + "ref_logps/chosen": -284.21063232421875, + "ref_logps/rejected": -158.75221252441406, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0137560367584229, + "rewards/margins": 3.2214341163635254, + "rewards/rejected": -2.2076778411865234, + "step": 735, + "u": -2.485495090484619, + "weight": 0.053016532212495804 + }, + { + "diff_generated": -238.62289428710938, + "epoch": 1.54933263543575, + "grad_norm": 1330.9443663432232, + "learning_rate": 2.915998582306299e-07, + "logits/chosen": -1.3296325206756592, + "logits/rejected": -1.1434093713760376, + "logps/chosen": -192.86752319335938, + "logps/rejected": -412.7796325683594, + "loss": 171.9964, + "losses_ref": -0.9953049421310425, + "ref_logps/chosen": -298.61065673828125, + "ref_logps/rejected": -174.15672302246094, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0574313402175903, + "rewards/margins": 3.443660259246826, + "rewards/rejected": -2.3862290382385254, + "step": 740, + "u": -3.0645031929016113, + "weight": 0.014706036075949669 + }, + { + "diff_generated": -232.26016235351562, + "epoch": 1.559801099188694, + "grad_norm": 1284.1954470666844, + "learning_rate": 2.7879755265618557e-07, + "logits/chosen": -1.1518179178237915, + "logits/rejected": -1.1568098068237305, + "logps/chosen": -160.57080078125, + "logps/rejected": -390.94854736328125, + "loss": 177.5848, + "losses_ref": -0.8798303604125977, + "ref_logps/chosen": -254.80648803710938, + "ref_logps/rejected": -158.68838500976562, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9423569440841675, + "rewards/margins": 3.264958620071411, + "rewards/rejected": -2.322601556777954, + "step": 745, + "u": -2.921161413192749, + "weight": 0.0170670785009861 + }, + { + "diff_generated": -233.20187377929688, + "epoch": 1.5702695629416383, + "grad_norm": 1289.4633991370238, + "learning_rate": 2.6623696577619625e-07, + "logits/chosen": -1.2346287965774536, + "logits/rejected": -1.2745471000671387, + "logps/chosen": -192.0581512451172, + "logps/rejected": -391.81146240234375, + "loss": 182.0347, + "losses_ref": -1.6879841089248657, + "ref_logps/chosen": -290.81500244140625, + "ref_logps/rejected": -158.6095428466797, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9875686764717102, + "rewards/margins": 3.3195877075195312, + "rewards/rejected": -2.3320186138153076, + "step": 750, + "u": -3.0655760765075684, + "weight": 0.02230766788125038 + }, + { + "diff_generated": -217.38137817382812, + "epoch": 1.5807380266945825, + "grad_norm": 1443.1241349727247, + "learning_rate": 2.5392230740535846e-07, + "logits/chosen": -1.4136921167373657, + "logits/rejected": -1.14936363697052, + "logps/chosen": -205.6985321044922, + "logps/rejected": -384.5464782714844, + "loss": 193.6084, + "losses_ref": -2.2517495155334473, + "ref_logps/chosen": -317.1262512207031, + "ref_logps/rejected": -167.1651153564453, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1142771244049072, + "rewards/margins": 3.288090467453003, + "rewards/rejected": -2.173813581466675, + "step": 755, + "u": -2.5095248222351074, + "weight": 0.04148329049348831 + }, + { + "diff_generated": -223.0001220703125, + "epoch": 1.5912064904475267, + "grad_norm": 1315.9291386894508, + "learning_rate": 2.418577049328058e-07, + "logits/chosen": -1.6086959838867188, + "logits/rejected": -1.2083603143692017, + "logps/chosen": -214.2064666748047, + "logps/rejected": -383.09918212890625, + "loss": 193.4667, + "losses_ref": -0.697050929069519, + "ref_logps/chosen": -327.52081298828125, + "ref_logps/rejected": -160.09909057617188, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1331430673599243, + "rewards/margins": 3.3631443977355957, + "rewards/rejected": -2.2300009727478027, + "step": 760, + "u": -3.4212822914123535, + "weight": 0.02071799524128437 + }, + { + "diff_generated": -240.10708618164062, + "epoch": 1.6016749542004711, + "grad_norm": 1354.8278973512276, + "learning_rate": 2.300472019387697e-07, + "logits/chosen": -1.3740001916885376, + "logits/rejected": -1.2972242832183838, + "logps/chosen": -184.8181915283203, + "logps/rejected": -400.69439697265625, + "loss": 183.1763, + "losses_ref": -5.5122270584106445, + "ref_logps/chosen": -284.5431213378906, + "ref_logps/rejected": -160.58731079101562, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9972493052482605, + "rewards/margins": 3.398320436477661, + "rewards/rejected": -2.4010708332061768, + "step": 765, + "u": -2.9249844551086426, + "weight": 0.04190880060195923 + }, + { + "diff_generated": -224.24789428710938, + "epoch": 1.6121434179534153, + "grad_norm": 1294.896383811541, + "learning_rate": 2.1849475683932994e-07, + "logits/chosen": -1.3714028596878052, + "logits/rejected": -1.3127011060714722, + "logps/chosen": -184.06544494628906, + "logps/rejected": -384.2123107910156, + "loss": 179.5198, + "losses_ref": -3.6349315643310547, + "ref_logps/chosen": -284.44268798828125, + "ref_logps/rejected": -159.9644012451172, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.003772497177124, + "rewards/margins": 3.246250867843628, + "rewards/rejected": -2.242478847503662, + "step": 770, + "u": -2.7088732719421387, + "weight": 0.04078099876642227 + }, + { + "diff_generated": -228.37911987304688, + "epoch": 1.6226118817063595, + "grad_norm": 1315.1478573927377, + "learning_rate": 2.0720424155971038e-07, + "logits/chosen": -1.4367603063583374, + "logits/rejected": -1.2870023250579834, + "logps/chosen": -201.5555877685547, + "logps/rejected": -386.1324157714844, + "loss": 176.5013, + "losses_ref": -2.8903164863586426, + "ref_logps/chosen": -306.54461669921875, + "ref_logps/rejected": -157.75328063964844, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0498902797698975, + "rewards/margins": 3.333681583404541, + "rewards/rejected": -2.2837913036346436, + "step": 775, + "u": -2.703965902328491, + "weight": 0.04053039103746414 + }, + { + "diff_generated": -220.4508819580078, + "epoch": 1.633080345459304, + "grad_norm": 1400.50539955428, + "learning_rate": 1.961794402365611e-07, + "logits/chosen": -1.4036462306976318, + "logits/rejected": -1.2919548749923706, + "logps/chosen": -200.26541137695312, + "logps/rejected": -386.81597900390625, + "loss": 183.6931, + "losses_ref": -1.8775193691253662, + "ref_logps/chosen": -310.53729248046875, + "ref_logps/rejected": -166.36508178710938, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1027185916900635, + "rewards/margins": 3.307227373123169, + "rewards/rejected": -2.2045087814331055, + "step": 780, + "u": -2.835704803466797, + "weight": 0.031143631786108017 + }, + { + "diff_generated": -217.3331298828125, + "epoch": 1.643548809212248, + "grad_norm": 1301.0844819616188, + "learning_rate": 1.8542404794966427e-07, + "logits/chosen": -1.4641870260238647, + "logits/rejected": -1.3147245645523071, + "logps/chosen": -196.31103515625, + "logps/rejected": -391.39166259765625, + "loss": 178.2437, + "losses_ref": -1.2605804204940796, + "ref_logps/chosen": -303.4082336425781, + "ref_logps/rejected": -174.0585479736328, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.070972204208374, + "rewards/margins": 3.2443034648895264, + "rewards/rejected": -2.1733312606811523, + "step": 785, + "u": -2.590919017791748, + "weight": 0.01949651725590229 + }, + { + "diff_generated": -220.4461669921875, + "epoch": 1.6540172729651923, + "grad_norm": 1297.8083100097251, + "learning_rate": 1.7494166948349053e-07, + "logits/chosen": -1.3500601053237915, + "logits/rejected": -1.411941409111023, + "logps/chosen": -159.91616821289062, + "logps/rejected": -383.6579284667969, + "loss": 166.2805, + "losses_ref": -1.1791235208511353, + "ref_logps/chosen": -257.8923034667969, + "ref_logps/rejected": -163.21176147460938, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9797613024711609, + "rewards/margins": 3.184222936630249, + "rewards/rejected": -2.2044615745544434, + "step": 790, + "u": -3.492673873901367, + "weight": 0.020172851160168648 + }, + { + "diff_generated": -218.501708984375, + "epoch": 1.6644857367181367, + "grad_norm": 1305.6902286203212, + "learning_rate": 1.6473581811901528e-07, + "logits/chosen": -1.3759443759918213, + "logits/rejected": -1.3116881847381592, + "logps/chosen": -175.59524536132812, + "logps/rejected": -386.4131774902344, + "loss": 166.0248, + "losses_ref": -0.9349870681762695, + "ref_logps/chosen": -275.24603271484375, + "ref_logps/rejected": -167.91146850585938, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9965084791183472, + "rewards/margins": 3.181525468826294, + "rewards/rejected": -2.1850171089172363, + "step": 795, + "u": -3.1951217651367188, + "weight": 0.007377298083156347 + }, + { + "diff_generated": -227.8848114013672, + "epoch": 1.674954200471081, + "grad_norm": 1377.3447203192195, + "learning_rate": 1.5480991445620538e-07, + "logits/chosen": -1.3294823169708252, + "logits/rejected": -1.3292287588119507, + "logps/chosen": -171.1267852783203, + "logps/rejected": -383.339111328125, + "loss": 179.9502, + "losses_ref": -1.4068111181259155, + "ref_logps/chosen": -269.3274841308594, + "ref_logps/rejected": -155.45433044433594, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9820070266723633, + "rewards/margins": 3.260855197906494, + "rewards/rejected": -2.278848171234131, + "step": 800, + "u": -3.075801134109497, + "weight": 0.02200758084654808 + }, + { + "diff_generated": -223.35498046875, + "epoch": 1.685422664224025, + "grad_norm": 1269.6278028028526, + "learning_rate": 1.4516728526756873e-07, + "logits/chosen": -1.4065078496932983, + "logits/rejected": -1.2835044860839844, + "logps/chosen": -182.1883544921875, + "logps/rejected": -374.7066650390625, + "loss": 186.9203, + "losses_ref": -2.037257671356201, + "ref_logps/chosen": -276.4019470214844, + "ref_logps/rejected": -151.35165405273438, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9421361684799194, + "rewards/margins": 3.1756858825683594, + "rewards/rejected": -2.2335495948791504, + "step": 805, + "u": -2.378087282180786, + "weight": 0.04411940649151802 + }, + { + "diff_generated": -216.16616821289062, + "epoch": 1.6958911279769695, + "grad_norm": 1463.8714206417467, + "learning_rate": 1.3581116238315194e-07, + "logits/chosen": -1.4423078298568726, + "logits/rejected": -1.3139569759368896, + "logps/chosen": -205.9932098388672, + "logps/rejected": -375.70849609375, + "loss": 190.2176, + "losses_ref": -1.2827723026275635, + "ref_logps/chosen": -311.7004699707031, + "ref_logps/rejected": -159.54234313964844, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": 1.057072639465332, + "rewards/margins": 3.2187340259552, + "rewards/rejected": -2.161661386489868, + "step": 810, + "u": -2.773268938064575, + "weight": 0.022295668721199036 + }, + { + "diff_generated": -213.4055633544922, + "epoch": 1.7063595917299135, + "grad_norm": 1352.148808645479, + "learning_rate": 1.2674468160735586e-07, + "logits/chosen": -1.4077790975570679, + "logits/rejected": -1.3166415691375732, + "logps/chosen": -177.383544921875, + "logps/rejected": -373.1116027832031, + "loss": 179.0974, + "losses_ref": -3.5087268352508545, + "ref_logps/chosen": -279.3453674316406, + "ref_logps/rejected": -159.70603942871094, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.019618034362793, + "rewards/margins": 3.1536736488342285, + "rewards/rejected": -2.1340556144714355, + "step": 815, + "u": -2.615370512008667, + "weight": 0.0508296899497509 + }, + { + "diff_generated": -237.0354766845703, + "epoch": 1.7168280554828579, + "grad_norm": 1326.9582054025304, + "learning_rate": 1.1797088166794e-07, + "logits/chosen": -1.328039288520813, + "logits/rejected": -1.2903969287872314, + "logps/chosen": -176.13819885253906, + "logps/rejected": -401.98748779296875, + "loss": 179.7547, + "losses_ref": -0.005425100214779377, + "ref_logps/chosen": -275.8017883300781, + "ref_logps/rejected": -164.95204162597656, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9966354370117188, + "rewards/margins": 3.366990327835083, + "rewards/rejected": -2.370354652404785, + "step": 820, + "u": -2.696533679962158, + "weight": 3.794050280703232e-05 + }, + { + "diff_generated": -219.973876953125, + "epoch": 1.7272965192358023, + "grad_norm": 1183.8442331623387, + "learning_rate": 1.0949270319755766e-07, + "logits/chosen": -1.3806655406951904, + "logits/rejected": -1.337877631187439, + "logps/chosen": -167.13290405273438, + "logps/rejected": -381.6925048828125, + "loss": 173.9734, + "losses_ref": -2.8696396350860596, + "ref_logps/chosen": -262.41363525390625, + "ref_logps/rejected": -161.7186279296875, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9528074264526367, + "rewards/margins": 3.1525461673736572, + "rewards/rejected": -2.1997389793395996, + "step": 825, + "u": -2.7405786514282227, + "weight": 0.02941594459116459 + }, + { + "diff_generated": -211.63803100585938, + "epoch": 1.7377649829887463, + "grad_norm": 1227.5586469104078, + "learning_rate": 1.013129877481741e-07, + "logits/chosen": -1.3626017570495605, + "logits/rejected": -1.199372410774231, + "logps/chosen": -211.2673797607422, + "logps/rejected": -382.85003662109375, + "loss": 185.7144, + "losses_ref": -5.601190090179443, + "ref_logps/chosen": -318.6112060546875, + "ref_logps/rejected": -171.2120361328125, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0734381675720215, + "rewards/margins": 3.1898186206817627, + "rewards/rejected": -2.116380214691162, + "step": 830, + "u": -2.8598952293395996, + "weight": 0.029378216713666916 + }, + { + "diff_generated": -230.986328125, + "epoch": 1.7482334467416907, + "grad_norm": 1290.8717428712873, + "learning_rate": 9.343447683868799e-08, + "logits/chosen": -1.2116000652313232, + "logits/rejected": -1.2751588821411133, + "logps/chosen": -169.79380798339844, + "logps/rejected": -394.716064453125, + "loss": 178.4699, + "losses_ref": -0.9493634104728699, + "ref_logps/chosen": -262.22894287109375, + "ref_logps/rejected": -163.72976684570312, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9243512153625488, + "rewards/margins": 3.2342143058776855, + "rewards/rejected": -2.309863328933716, + "step": 835, + "u": -2.9001994132995605, + "weight": 0.008882230147719383 + }, + { + "diff_generated": -216.50314331054688, + "epoch": 1.7587019104946349, + "grad_norm": 1335.6008361033998, + "learning_rate": 8.585981103608342e-08, + "logits/chosen": -1.3362239599227905, + "logits/rejected": -1.1397970914840698, + "logps/chosen": -206.77511596679688, + "logps/rejected": -389.9546203613281, + "loss": 191.1818, + "losses_ref": -0.28884872794151306, + "ref_logps/chosen": -316.14837646484375, + "ref_logps/rejected": -173.45150756835938, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0937325954437256, + "rewards/margins": 3.2587637901306152, + "rewards/rejected": -2.1650314331054688, + "step": 840, + "u": -3.1953749656677246, + "weight": 0.004329306539148092 + }, + { + "diff_generated": -202.93106079101562, + "epoch": 1.769170374247579, + "grad_norm": 1266.3673749218208, + "learning_rate": 7.859152907041544e-08, + "logits/chosen": -1.354994773864746, + "logits/rejected": -1.1393146514892578, + "logps/chosen": -199.24710083007812, + "logps/rejected": -360.2781677246094, + "loss": 176.0576, + "losses_ref": -1.7230793237686157, + "ref_logps/chosen": -305.8094177246094, + "ref_logps/rejected": -157.3471221923828, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.06562340259552, + "rewards/margins": 3.0949339866638184, + "rewards/rejected": -2.029310464859009, + "step": 845, + "u": -2.745694160461426, + "weight": 0.0367230661213398 + }, + { + "diff_generated": -211.84765625, + "epoch": 1.7796388380005235, + "grad_norm": 1302.5644299154628, + "learning_rate": 7.163206698392742e-08, + "logits/chosen": -1.2949212789535522, + "logits/rejected": -1.1885995864868164, + "logps/chosen": -185.09088134765625, + "logps/rejected": -367.0600891113281, + "loss": 184.3042, + "losses_ref": -3.0929312705993652, + "ref_logps/chosen": -285.8619384765625, + "ref_logps/rejected": -155.21240234375, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0077106952667236, + "rewards/margins": 3.1261868476867676, + "rewards/rejected": -2.118476629257202, + "step": 850, + "u": -3.2007651329040527, + "weight": 0.03339768201112747 + }, + { + "diff_generated": -209.816650390625, + "epoch": 1.7901073017534677, + "grad_norm": 1345.4005628593675, + "learning_rate": 6.498375731458527e-08, + "logits/chosen": -1.4427772760391235, + "logits/rejected": -1.2521936893463135, + "logps/chosen": -190.9636688232422, + "logps/rejected": -376.20391845703125, + "loss": 177.6342, + "losses_ref": -2.1869561672210693, + "ref_logps/chosen": -298.1745300292969, + "ref_logps/rejected": -166.38723754882812, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.072108507156372, + "rewards/margins": 3.1702747344970703, + "rewards/rejected": -2.098165988922119, + "step": 855, + "u": -3.108565330505371, + "weight": 0.026614084839820862 + }, + { + "diff_generated": -229.3601837158203, + "epoch": 1.8005757655064119, + "grad_norm": 1261.2192787306672, + "learning_rate": 5.8648828314302735e-08, + "logits/chosen": -1.3119590282440186, + "logits/rejected": -1.1316639184951782, + "logps/chosen": -186.41650390625, + "logps/rejected": -386.34906005859375, + "loss": 176.3818, + "losses_ref": -2.3852286338806152, + "ref_logps/chosen": -289.1662292480469, + "ref_logps/rejected": -156.98886108398438, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0274972915649414, + "rewards/margins": 3.3210995197296143, + "rewards/rejected": -2.2936015129089355, + "step": 860, + "u": -2.8182337284088135, + "weight": 0.03502316027879715 + }, + { + "diff_generated": -210.816650390625, + "epoch": 1.8110442292593563, + "grad_norm": 1269.4818113722586, + "learning_rate": 5.2629403202119505e-08, + "logits/chosen": -1.2412734031677246, + "logits/rejected": -1.227634072303772, + "logps/chosen": -173.3083953857422, + "logps/rejected": -375.7776184082031, + "loss": 171.0543, + "losses_ref": -0.6853199005126953, + "ref_logps/chosen": -271.1694030761719, + "ref_logps/rejected": -164.96096801757812, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9786099195480347, + "rewards/margins": 3.0867760181427, + "rewards/rejected": -2.108166217803955, + "step": 865, + "u": -3.394763231277466, + "weight": 0.010494846850633621 + }, + { + "diff_generated": -226.4854736328125, + "epoch": 1.8215126930123005, + "grad_norm": 1268.655880675009, + "learning_rate": 4.692749945258057e-08, + "logits/chosen": -1.3430616855621338, + "logits/rejected": -1.1744420528411865, + "logps/chosen": -195.01589965820312, + "logps/rejected": -389.9505920410156, + "loss": 186.5683, + "losses_ref": -3.434800624847412, + "ref_logps/chosen": -299.091796875, + "ref_logps/rejected": -163.46514892578125, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0407590866088867, + "rewards/margins": 3.3056137561798096, + "rewards/rejected": -2.2648544311523438, + "step": 870, + "u": -2.5695509910583496, + "weight": 0.046660859137773514 + }, + { + "diff_generated": -236.5417022705078, + "epoch": 1.8319811567652446, + "grad_norm": 1204.436962297953, + "learning_rate": 4.1545028119559066e-08, + "logits/chosen": -1.3133630752563477, + "logits/rejected": -1.3207045793533325, + "logps/chosen": -190.3129425048828, + "logps/rejected": -398.3179931640625, + "loss": 171.6172, + "losses_ref": -1.358794927597046, + "ref_logps/chosen": -287.73504638671875, + "ref_logps/rejected": -161.77627563476562, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.974220871925354, + "rewards/margins": 3.339637279510498, + "rewards/rejected": -2.365417003631592, + "step": 875, + "u": -2.7414660453796387, + "weight": 0.022162286564707756 + }, + { + "diff_generated": -210.89306640625, + "epoch": 1.842449620518189, + "grad_norm": 1230.1188284044147, + "learning_rate": 3.648379319574568e-08, + "logits/chosen": -1.383299708366394, + "logits/rejected": -1.3287036418914795, + "logps/chosen": -190.19691467285156, + "logps/rejected": -363.8573913574219, + "loss": 168.7976, + "losses_ref": -4.205277442932129, + "ref_logps/chosen": -291.4452819824219, + "ref_logps/rejected": -152.96432495117188, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0124839544296265, + "rewards/margins": 3.1214146614074707, + "rewards/rejected": -2.1089303493499756, + "step": 880, + "u": -2.69191312789917, + "weight": 0.03942141681909561 + }, + { + "diff_generated": -224.0536346435547, + "epoch": 1.8529180842711332, + "grad_norm": 1317.8866979194424, + "learning_rate": 3.17454910080216e-08, + "logits/chosen": -1.387369155883789, + "logits/rejected": -1.256730318069458, + "logps/chosen": -213.5888671875, + "logps/rejected": -388.00115966796875, + "loss": 200.2688, + "losses_ref": -0.6602109670639038, + "ref_logps/chosen": -319.39569091796875, + "ref_logps/rejected": -163.94747924804688, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.058068037033081, + "rewards/margins": 3.2986044883728027, + "rewards/rejected": -2.2405362129211426, + "step": 885, + "u": -2.7355685234069824, + "weight": 0.029411468654870987 + }, + { + "diff_generated": -221.8759765625, + "epoch": 1.8633865480240774, + "grad_norm": 1306.8555947562052, + "learning_rate": 2.733170964891607e-08, + "logits/chosen": -1.3195066452026367, + "logits/rejected": -1.2867323160171509, + "logps/chosen": -170.53369140625, + "logps/rejected": -378.52935791015625, + "loss": 174.36, + "losses_ref": -0.899361252784729, + "ref_logps/chosen": -274.72943115234375, + "ref_logps/rejected": -156.65335083007812, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0419572591781616, + "rewards/margins": 3.2607169151306152, + "rewards/rejected": -2.218759775161743, + "step": 890, + "u": -3.3832144737243652, + "weight": 0.008660494349896908 + }, + { + "diff_generated": -214.10165405273438, + "epoch": 1.8738550117770219, + "grad_norm": 1275.7234308585855, + "learning_rate": 2.324392844434042e-08, + "logits/chosen": -1.3565282821655273, + "logits/rejected": -1.344678282737732, + "logps/chosen": -192.53738403320312, + "logps/rejected": -390.2491149902344, + "loss": 191.1614, + "losses_ref": -2.9138152599334717, + "ref_logps/chosen": -295.70458984375, + "ref_logps/rejected": -176.14747619628906, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0316721200942993, + "rewards/margins": 3.1726887226104736, + "rewards/rejected": -2.141016721725464, + "step": 895, + "u": -3.1400444507598877, + "weight": 0.02205641008913517 + }, + { + "diff_generated": -221.6460418701172, + "epoch": 1.8843234755299658, + "grad_norm": 1242.4363921596732, + "learning_rate": 1.9483517457776434e-08, + "logits/chosen": -1.1762725114822388, + "logits/rejected": -1.3724615573883057, + "logps/chosen": -159.86691284179688, + "logps/rejected": -381.15081787109375, + "loss": 172.6295, + "losses_ref": -4.887435436248779, + "ref_logps/chosen": -252.33743286132812, + "ref_logps/rejected": -159.50479125976562, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9247050285339355, + "rewards/margins": 3.1411654949188232, + "rewards/rejected": -2.216460704803467, + "step": 900, + "u": -2.2665815353393555, + "weight": 0.07192285358905792 + }, + { + "diff_generated": -227.669189453125, + "epoch": 1.8947919392829102, + "grad_norm": 1323.6799372011517, + "learning_rate": 1.6051737031084533e-08, + "logits/chosen": -1.2494432926177979, + "logits/rejected": -1.1595919132232666, + "logps/chosen": -175.1837921142578, + "logps/rejected": -384.48175048828125, + "loss": 174.3896, + "losses_ref": -1.007882833480835, + "ref_logps/chosen": -276.83319091796875, + "ref_logps/rejected": -156.81253051757812, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0164941549301147, + "rewards/margins": 3.2931861877441406, + "rewards/rejected": -2.2766921520233154, + "step": 905, + "u": -3.0982091426849365, + "weight": 0.01854753866791725 + }, + { + "diff_generated": -222.00784301757812, + "epoch": 1.9052604030358546, + "grad_norm": 1353.0501425434295, + "learning_rate": 1.2949737362087154e-08, + "logits/chosen": -1.222752332687378, + "logits/rejected": -1.265421986579895, + "logps/chosen": -173.27577209472656, + "logps/rejected": -388.85797119140625, + "loss": 174.8498, + "losses_ref": -6.1348981857299805, + "ref_logps/chosen": -269.9849853515625, + "ref_logps/rejected": -166.85018920898438, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9670922160148621, + "rewards/margins": 3.1871705055236816, + "rewards/rejected": -2.220078229904175, + "step": 910, + "u": -2.5107998847961426, + "weight": 0.06687295436859131 + }, + { + "diff_generated": -211.4918975830078, + "epoch": 1.9157288667887986, + "grad_norm": 1286.3307044665144, + "learning_rate": 1.0178558119067315e-08, + "logits/chosen": -1.2266263961791992, + "logits/rejected": -1.0511000156402588, + "logps/chosen": -177.09149169921875, + "logps/rejected": -372.6114807128906, + "loss": 175.9135, + "losses_ref": -0.7255733609199524, + "ref_logps/chosen": -277.30194091796875, + "ref_logps/rejected": -161.1195831298828, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0021045207977295, + "rewards/margins": 3.11702299118042, + "rewards/rejected": -2.1149187088012695, + "step": 915, + "u": -3.0817387104034424, + "weight": 0.014788592234253883 + }, + { + "diff_generated": -220.1043701171875, + "epoch": 1.926197330541743, + "grad_norm": 1287.429219240266, + "learning_rate": 7.739128092312918e-09, + "logits/chosen": -1.3375459909439087, + "logits/rejected": -1.274279236793518, + "logps/chosen": -181.00665283203125, + "logps/rejected": -377.59088134765625, + "loss": 171.8915, + "losses_ref": -1.6772384643554688, + "ref_logps/chosen": -280.682861328125, + "ref_logps/rejected": -157.4865264892578, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9967617988586426, + "rewards/margins": 3.197805881500244, + "rewards/rejected": -2.2010436058044434, + "step": 920, + "u": -2.880985736846924, + "weight": 0.036544255912303925 + }, + { + "diff_generated": -222.374755859375, + "epoch": 1.9366657942946872, + "grad_norm": 1348.176434591513, + "learning_rate": 5.632264882822757e-09, + "logits/chosen": -1.3248652219772339, + "logits/rejected": -1.2289717197418213, + "logps/chosen": -187.19947814941406, + "logps/rejected": -382.12860107421875, + "loss": 186.23, + "losses_ref": -2.8856143951416016, + "ref_logps/chosen": -288.6744079589844, + "ref_logps/rejected": -159.7538299560547, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0147496461868286, + "rewards/margins": 3.238497257232666, + "rewards/rejected": -2.223747730255127, + "step": 925, + "u": -2.8917412757873535, + "weight": 0.024965789169073105 + }, + { + "diff_generated": -213.1198272705078, + "epoch": 1.9471342580476314, + "grad_norm": 1395.889446013467, + "learning_rate": 3.858674628278824e-09, + "logits/chosen": -1.366350531578064, + "logits/rejected": -1.119940996170044, + "logps/chosen": -188.4399871826172, + "logps/rejected": -371.8886413574219, + "loss": 182.8113, + "losses_ref": -5.109557151794434, + "ref_logps/chosen": -294.67010498046875, + "ref_logps/rejected": -158.7688446044922, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0623013973236084, + "rewards/margins": 3.1934995651245117, + "rewards/rejected": -2.1311981678009033, + "step": 930, + "u": -2.426971435546875, + "weight": 0.053149282932281494 + }, + { + "diff_generated": -237.7774200439453, + "epoch": 1.9576027218005758, + "grad_norm": 1267.4031070589224, + "learning_rate": 2.418951766376742e-09, + "logits/chosen": -1.2219622135162354, + "logits/rejected": -1.2400046586990356, + "logps/chosen": -167.6567840576172, + "logps/rejected": -398.30865478515625, + "loss": 180.7217, + "losses_ref": -5.82874059677124, + "ref_logps/chosen": -267.5108947753906, + "ref_logps/rejected": -160.53125, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9985405802726746, + "rewards/margins": 3.376314878463745, + "rewards/rejected": -2.377774238586426, + "step": 935, + "u": -2.8524553775787354, + "weight": 0.05332515761256218 + }, + { + "diff_generated": -221.6811981201172, + "epoch": 1.96807118555352, + "grad_norm": 1234.1312151479083, + "learning_rate": 1.313578835593465e-09, + "logits/chosen": -1.3167364597320557, + "logits/rejected": -1.0956764221191406, + "logps/chosen": -202.79949951171875, + "logps/rejected": -389.28814697265625, + "loss": 183.0365, + "losses_ref": -1.295898199081421, + "ref_logps/chosen": -312.72149658203125, + "ref_logps/rejected": -167.60696411132812, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0992200374603271, + "rewards/margins": 3.3160319328308105, + "rewards/rejected": -2.2168118953704834, + "step": 940, + "u": -2.668116569519043, + "weight": 0.01472543366253376 + }, + { + "diff_generated": -209.20425415039062, + "epoch": 1.9785396493064642, + "grad_norm": 1327.2259702611773, + "learning_rate": 5.429263134594242e-10, + "logits/chosen": -1.298588514328003, + "logits/rejected": -1.3200442790985107, + "logps/chosen": -177.170654296875, + "logps/rejected": -369.2535705566406, + "loss": 179.5952, + "losses_ref": -4.355043888092041, + "ref_logps/chosen": -273.0224304199219, + "ref_logps/rejected": -160.04933166503906, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9585177302360535, + "rewards/margins": 3.050560474395752, + "rewards/rejected": -2.0920424461364746, + "step": 945, + "u": -2.591240406036377, + "weight": 0.0508696511387825 + }, + { + "diff_generated": -218.85842895507812, + "epoch": 1.9890081130594086, + "grad_norm": 1215.4951947566592, + "learning_rate": 1.0725249238940915e-10, + "logits/chosen": -1.3104689121246338, + "logits/rejected": -1.166074514389038, + "logps/chosen": -190.97283935546875, + "logps/rejected": -377.3951110839844, + "loss": 185.4394, + "losses_ref": -1.4506399631500244, + "ref_logps/chosen": -288.34576416015625, + "ref_logps/rejected": -158.53671264648438, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9737294316291809, + "rewards/margins": 3.162313938140869, + "rewards/rejected": -2.188584089279175, + "step": 950, + "u": -3.3431270122528076, + "weight": 0.014722567982971668 + } + ], + "logging_steps": 5, + "max_steps": 954, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}