{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9973828840617638, "eval_steps": 500, "global_step": 954, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "diff_generated": -1.8149629831314087, "epoch": 0.002093692750588851, "grad_norm": 43.26649304714989, "learning_rate": 2.083333333333333e-08, "logits/chosen": -2.1441590785980225, "logits/rejected": -2.0543735027313232, "logps/chosen": -276.82366943359375, "logps/rejected": -131.32485961914062, "logps_avg/chosen": -1.2310187816619873, "logps_avg/rejected": -0.5444889068603516, "loss": 0.9706, "losses_ref": -0.2554703652858734, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1, "u": -1.129564642906189, "weight": 0.727432131767273 }, { "diff_generated": -2.051100015640259, "epoch": 0.010468463752944255, "grad_norm": 36.895500460127934, "learning_rate": 1.0416666666666667e-07, "logits/chosen": -2.2114098072052, "logits/rejected": -2.10967755317688, "logps/chosen": -280.6037902832031, "logps/rejected": -162.30044555664062, "logps_avg/chosen": -1.178394079208374, "logps_avg/rejected": -0.6153301000595093, "loss": 0.8456, "losses_ref": -0.2878931164741516, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5, "u": -1.3192780017852783, "weight": 0.6589411497116089 }, { "diff_generated": -2.0342957973480225, "epoch": 0.02093692750588851, "grad_norm": 42.24412669427099, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -2.3565850257873535, "logits/rejected": -2.1584813594818115, "logps/chosen": -300.6426086425781, "logps/rejected": -167.40040588378906, "logps_avg/chosen": -1.1184991598129272, "logps_avg/rejected": -0.6102887988090515, "loss": 0.8731, "losses_ref": -0.2850458025932312, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 10, "u": -1.2951091527938843, "weight": 0.6724194884300232 }, { "diff_generated": -1.9851667881011963, "epoch": 0.031405391258832765, "grad_norm": 31.267399626309693, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -2.2946715354919434, "logits/rejected": -2.146397113800049, "logps/chosen": -293.4947509765625, "logps/rejected": -156.3843994140625, "logps_avg/chosen": -1.0986683368682861, "logps_avg/rejected": -0.5955500602722168, "loss": 0.7379, "losses_ref": -0.28325891494750977, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 15, "u": -1.2593215703964233, "weight": 0.6894552111625671 }, { "diff_generated": -2.0035815238952637, "epoch": 0.04187385501177702, "grad_norm": 22.686346023577535, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -2.2586379051208496, "logits/rejected": -2.134080410003662, "logps/chosen": -261.52960205078125, "logps/rejected": -161.9304656982422, "logps_avg/chosen": -0.9046722650527954, "logps_avg/rejected": -0.6010745763778687, "loss": 0.5984, "losses_ref": -0.2947906255722046, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 20, "u": -1.323677659034729, "weight": 0.6650992631912231 }, { "diff_generated": -3.258924961090088, "epoch": 0.05234231876472128, "grad_norm": 15.412617135483135, "learning_rate": 5.208333333333334e-07, "logits/chosen": -2.1527328491210938, "logits/rejected": -2.013265609741211, "logps/chosen": -257.1512756347656, "logps/rejected": -277.85711669921875, "logps_avg/chosen": -0.8043298721313477, "logps_avg/rejected": -0.9776775240898132, "loss": 0.5813, "losses_ref": -0.25987568497657776, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 25, "u": -1.7415921688079834, "weight": 0.4334268569946289 }, { "diff_generated": -6.022626876831055, "epoch": 0.06281078251766553, "grad_norm": 15.25952740077981, "learning_rate": 6.249999999999999e-07, "logits/chosen": -2.1849024295806885, "logits/rejected": -2.1174261569976807, "logps/chosen": -248.16909790039062, "logps/rejected": -534.7174682617188, "logps_avg/chosen": -0.8181886672973633, "logps_avg/rejected": -1.8067880868911743, "loss": 0.667, "losses_ref": -0.1500019133090973, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 30, "u": -2.0229365825653076, "weight": 0.225816011428833 }, { "diff_generated": -9.153361320495605, "epoch": 0.07327924627060979, "grad_norm": 18.48300356782214, "learning_rate": 7.291666666666666e-07, "logits/chosen": -2.2708792686462402, "logits/rejected": -2.130821704864502, "logps/chosen": -255.21701049804688, "logps/rejected": -782.3409423828125, "logps_avg/chosen": -0.7904274463653564, "logps_avg/rejected": -2.7460083961486816, "loss": 0.6695, "losses_ref": -0.1412452608346939, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 35, "u": -2.0066444873809814, "weight": 0.2316206991672516 }, { "diff_generated": -13.209306716918945, "epoch": 0.08374771002355404, "grad_norm": 11.436173876886219, "learning_rate": 8.333333333333333e-07, "logits/chosen": -2.2111456394195557, "logits/rejected": -2.13924241065979, "logps/chosen": -241.15072631835938, "logps/rejected": -1223.218017578125, "logps_avg/chosen": -0.7820993661880493, "logps_avg/rejected": -3.962791919708252, "loss": 0.6798, "losses_ref": -0.09846386313438416, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 40, "u": -2.131727457046509, "weight": 0.1441923826932907 }, { "diff_generated": -14.63012409210205, "epoch": 0.0942161737764983, "grad_norm": 59.29532742939981, "learning_rate": 9.374999999999999e-07, "logits/chosen": -2.298741102218628, "logits/rejected": -2.0653302669525146, "logps/chosen": -264.97357177734375, "logps/rejected": -1320.9332275390625, "logps_avg/chosen": -0.779043436050415, "logps_avg/rejected": -4.389036655426025, "loss": 0.6914, "losses_ref": -0.08891113847494125, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 45, "u": -2.13495135307312, "weight": 0.13693246245384216 }, { "diff_generated": -12.911537170410156, "epoch": 0.10468463752944256, "grad_norm": 8.930786410211843, "learning_rate": 1.0416666666666667e-06, "logits/chosen": -2.302333116531372, "logits/rejected": -2.2043356895446777, "logps/chosen": -241.756103515625, "logps/rejected": -1145.5604248046875, "logps_avg/chosen": -0.7927433252334595, "logps_avg/rejected": -3.8734612464904785, "loss": 0.6993, "losses_ref": -0.10359562933444977, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 50, "u": -2.1199097633361816, "weight": 0.15450677275657654 }, { "diff_generated": -11.095788955688477, "epoch": 0.11515310128238682, "grad_norm": 9.783120635378207, "learning_rate": 1.1458333333333333e-06, "logits/chosen": -2.4609317779541016, "logits/rejected": -2.3575634956359863, "logps/chosen": -245.7393798828125, "logps/rejected": -981.2423095703125, "logps_avg/chosen": -0.8303758502006531, "logps_avg/rejected": -3.3287365436553955, "loss": 0.6926, "losses_ref": -0.08979364484548569, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 55, "u": -2.17197322845459, "weight": 0.11447404325008392 }, { "diff_generated": -13.795969009399414, "epoch": 0.12562156503533106, "grad_norm": 9.420248973366883, "learning_rate": 1.2499999999999999e-06, "logits/chosen": -2.5860393047332764, "logits/rejected": -2.482574939727783, "logps/chosen": -249.44070434570312, "logps/rejected": -1232.59228515625, "logps_avg/chosen": -0.7758530378341675, "logps_avg/rejected": -4.138791084289551, "loss": 0.6815, "losses_ref": -0.0876917839050293, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 60, "u": -2.1477303504943848, "weight": 0.12934879958629608 }, { "diff_generated": -16.25264549255371, "epoch": 0.1360900287882753, "grad_norm": 12.24868452539092, "learning_rate": 1.3541666666666667e-06, "logits/chosen": -2.640986204147339, "logits/rejected": -2.510274648666382, "logps/chosen": -258.56109619140625, "logps/rejected": -1508.2763671875, "logps_avg/chosen": -0.7998191118240356, "logps_avg/rejected": -4.875794410705566, "loss": 0.7039, "losses_ref": -0.07322683185338974, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 65, "u": -2.180368423461914, "weight": 0.1039782166481018 }, { "diff_generated": -16.121641159057617, "epoch": 0.14655849254121958, "grad_norm": 7.905374307014113, "learning_rate": 1.4583333333333333e-06, "logits/chosen": -2.581535816192627, "logits/rejected": -2.4923813343048096, "logps/chosen": -238.9574432373047, "logps/rejected": -1444.403564453125, "logps_avg/chosen": -0.8027188181877136, "logps_avg/rejected": -4.836493015289307, "loss": 0.6907, "losses_ref": -0.0750691220164299, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 70, "u": -2.189579486846924, "weight": 0.09880717098712921 }, { "diff_generated": -16.705251693725586, "epoch": 0.15702695629416383, "grad_norm": 9.573720561122785, "learning_rate": 1.5624999999999999e-06, "logits/chosen": -2.598374128341675, "logits/rejected": -2.446035146713257, "logps/chosen": -270.2249450683594, "logps/rejected": -1517.441650390625, "logps_avg/chosen": -0.7964105606079102, "logps_avg/rejected": -5.011575698852539, "loss": 0.725, "losses_ref": -0.07196028530597687, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 75, "u": -2.21059513092041, "weight": 0.08612708002328873 }, { "diff_generated": -18.304201126098633, "epoch": 0.16749542004710807, "grad_norm": 7.0924424799681, "learning_rate": 1.6666666666666667e-06, "logits/chosen": -2.591045618057251, "logits/rejected": -2.489673376083374, "logps/chosen": -216.99685668945312, "logps/rejected": -1667.5283203125, "logps_avg/chosen": -0.7215350866317749, "logps_avg/rejected": -5.491259574890137, "loss": 0.6699, "losses_ref": -0.06580645591020584, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 80, "u": -2.2118382453918457, "weight": 0.08225957304239273 }, { "diff_generated": -18.906076431274414, "epoch": 0.17796388380005235, "grad_norm": 7.632608732109636, "learning_rate": 1.7708333333333332e-06, "logits/chosen": -2.6046338081359863, "logits/rejected": -2.4658734798431396, "logps/chosen": -244.0012664794922, "logps/rejected": -1689.686767578125, "logps_avg/chosen": -0.7541030049324036, "logps_avg/rejected": -5.671823978424072, "loss": 0.7032, "losses_ref": -0.06257248669862747, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 85, "u": -2.2184884548187256, "weight": 0.07795710116624832 }, { "diff_generated": -22.26788902282715, "epoch": 0.1884323475529966, "grad_norm": 10.332533231863795, "learning_rate": 1.8749999999999998e-06, "logits/chosen": -2.62504243850708, "logits/rejected": -2.4670681953430176, "logps/chosen": -241.73550415039062, "logps/rejected": -1991.0435791015625, "logps_avg/chosen": -0.7270082235336304, "logps_avg/rejected": -6.680366516113281, "loss": 0.689, "losses_ref": -0.06023075059056282, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 90, "u": -2.2096261978149414, "weight": 0.08131252229213715 }, { "diff_generated": -20.242061614990234, "epoch": 0.19890081130594087, "grad_norm": 7.009998646854354, "learning_rate": 1.9791666666666666e-06, "logits/chosen": -2.5733673572540283, "logits/rejected": -2.4526114463806152, "logps/chosen": -241.0827178955078, "logps/rejected": -1833.453369140625, "logps_avg/chosen": -0.7628769278526306, "logps_avg/rejected": -6.07261848449707, "loss": 0.6963, "losses_ref": -0.06475149095058441, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 95, "u": -2.2028064727783203, "weight": 0.0875387191772461 }, { "diff_generated": -20.439355850219727, "epoch": 0.2093692750588851, "grad_norm": 8.018231688525765, "learning_rate": 1.9998927475076105e-06, "logits/chosen": -2.621689558029175, "logits/rejected": -2.470346689224243, "logps/chosen": -245.5767059326172, "logps/rejected": -1799.0728759765625, "logps_avg/chosen": -0.7319446802139282, "logps_avg/rejected": -6.13180685043335, "loss": 0.713, "losses_ref": -0.06253904104232788, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 100, "u": -2.2177913188934326, "weight": 0.07825066894292831 }, { "diff_generated": -20.04744529724121, "epoch": 0.21983773881182936, "grad_norm": 7.248502316485956, "learning_rate": 1.9994570736865402e-06, "logits/chosen": -2.5862081050872803, "logits/rejected": -2.4370968341827393, "logps/chosen": -236.89501953125, "logps/rejected": -1794.0465087890625, "logps_avg/chosen": -0.7266777753829956, "logps_avg/rejected": -6.01423454284668, "loss": 0.6834, "losses_ref": -0.06446884572505951, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 105, "u": -2.198464870452881, "weight": 0.0902954638004303 }, { "diff_generated": -20.10696792602539, "epoch": 0.23030620256477363, "grad_norm": 6.989545794085033, "learning_rate": 1.9986864211644068e-06, "logits/chosen": -2.570603609085083, "logits/rejected": -2.431187391281128, "logps/chosen": -236.31884765625, "logps/rejected": -1773.07421875, "logps_avg/chosen": -0.7348344326019287, "logps_avg/rejected": -6.032090187072754, "loss": 0.6907, "losses_ref": -0.06961078941822052, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 110, "u": -2.2041425704956055, "weight": 0.08867262303829193 }, { "diff_generated": -21.548114776611328, "epoch": 0.24077466631771788, "grad_norm": 8.060053280392543, "learning_rate": 1.997581048233623e-06, "logits/chosen": -2.581951141357422, "logits/rejected": -2.4441328048706055, "logps/chosen": -232.8576202392578, "logps/rejected": -1942.4847412109375, "logps_avg/chosen": -0.7739059329032898, "logps_avg/rejected": -6.4644341468811035, "loss": 0.6817, "losses_ref": -0.062096286565065384, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 115, "u": -2.216289520263672, "weight": 0.07720647752285004 }, { "diff_generated": -20.77760887145996, "epoch": 0.2512431300706621, "grad_norm": 6.53936940072868, "learning_rate": 1.9961413253717214e-06, "logits/chosen": -2.610959768295288, "logits/rejected": -2.4528729915618896, "logps/chosen": -233.8311004638672, "logps/rejected": -1862.2890625, "logps_avg/chosen": -0.7324265837669373, "logps_avg/rejected": -6.233283519744873, "loss": 0.6932, "losses_ref": -0.0750860795378685, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 120, "u": -2.200193405151367, "weight": 0.09466435015201569 }, { "diff_generated": -23.185279846191406, "epoch": 0.26171159382360637, "grad_norm": 7.018169897249557, "learning_rate": 1.994367735117177e-06, "logits/chosen": -2.5702836513519287, "logits/rejected": -2.391747236251831, "logps/chosen": -220.02792358398438, "logps/rejected": -2155.526123046875, "logps_avg/chosen": -0.7447048425674438, "logps_avg/rejected": -6.955584526062012, "loss": 0.7052, "losses_ref": -0.05986471846699715, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 125, "u": -2.1955928802490234, "weight": 0.08941423892974854 }, { "diff_generated": -22.66459846496582, "epoch": 0.2721800575765506, "grad_norm": 31.410489955444024, "learning_rate": 1.992260871907687e-06, "logits/chosen": -2.567049503326416, "logits/rejected": -2.4223153591156006, "logps/chosen": -242.8145751953125, "logps/rejected": -2053.98388671875, "logps_avg/chosen": -0.7978746294975281, "logps_avg/rejected": -6.799378871917725, "loss": 0.7155, "losses_ref": -0.04843521863222122, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 130, "u": -2.239774465560913, "weight": 0.05849189683794975 }, { "diff_generated": -23.263744354248047, "epoch": 0.2826485213294949, "grad_norm": 7.49886026826363, "learning_rate": 1.9898214418809326e-06, "logits/chosen": -2.532973289489746, "logits/rejected": -2.372011423110962, "logps/chosen": -241.5897674560547, "logps/rejected": -2110.734375, "logps_avg/chosen": -0.7454018592834473, "logps_avg/rejected": -6.979123592376709, "loss": 0.6961, "losses_ref": -0.04879006743431091, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 135, "u": -2.2302093505859375, "weight": 0.06326891481876373 }, { "diff_generated": -22.754619598388672, "epoch": 0.29311698508243916, "grad_norm": 7.014311333863948, "learning_rate": 1.9870502626379126e-06, "logits/chosen": -2.488236904144287, "logits/rejected": -2.361851215362549, "logps/chosen": -234.2844696044922, "logps/rejected": -2074.984375, "logps_avg/chosen": -0.7961763143539429, "logps_avg/rejected": -6.826386451721191, "loss": 0.7285, "losses_ref": -0.055333297699689865, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 140, "u": -2.2265305519104004, "weight": 0.06895061582326889 }, { "diff_generated": -20.225128173828125, "epoch": 0.3035854488353834, "grad_norm": 6.7478341009341865, "learning_rate": 1.983948262968915e-06, "logits/chosen": -2.5856704711914062, "logits/rejected": -2.4371695518493652, "logps/chosen": -263.78900146484375, "logps/rejected": -1824.1302490234375, "logps_avg/chosen": -0.7517282366752625, "logps_avg/rejected": -6.067538738250732, "loss": 0.6839, "losses_ref": -0.06395243108272552, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 145, "u": -2.2037534713745117, "weight": 0.08503635227680206 }, { "diff_generated": -24.050996780395508, "epoch": 0.31405391258832765, "grad_norm": 7.353090756036984, "learning_rate": 1.9805164825422237e-06, "logits/chosen": -2.607673168182373, "logits/rejected": -2.408552646636963, "logps/chosen": -241.8136749267578, "logps/rejected": -2169.62353515625, "logps_avg/chosen": -0.7578203082084656, "logps_avg/rejected": -7.215299129486084, "loss": 0.6958, "losses_ref": -0.05395021289587021, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 150, "u": -2.234814167022705, "weight": 0.06379680335521698 }, { "diff_generated": -23.94837188720703, "epoch": 0.3245223763412719, "grad_norm": 7.484499798723553, "learning_rate": 1.9767560715556594e-06, "logits/chosen": -2.5357837677001953, "logits/rejected": -2.3741650581359863, "logps/chosen": -237.78701782226562, "logps/rejected": -2074.5205078125, "logps_avg/chosen": -0.7676432132720947, "logps_avg/rejected": -7.184511661529541, "loss": 0.7199, "losses_ref": -0.044619906693696976, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 155, "u": -2.2300286293029785, "weight": 0.061775337904691696 }, { "diff_generated": -24.210857391357422, "epoch": 0.33499084009421615, "grad_norm": 7.8117370330190115, "learning_rate": 1.972668290351084e-06, "logits/chosen": -2.532038688659668, "logits/rejected": -2.3655738830566406, "logps/chosen": -246.5824432373047, "logps/rejected": -2090.85693359375, "logps_avg/chosen": -0.7575558423995972, "logps_avg/rejected": -7.2632575035095215, "loss": 0.6939, "losses_ref": -0.04590834304690361, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 160, "u": -2.236487865447998, "weight": 0.05844121426343918 }, { "diff_generated": -20.957683563232422, "epoch": 0.34545930384716045, "grad_norm": 7.4058662270815026, "learning_rate": 1.968254508991978e-06, "logits/chosen": -2.6238338947296143, "logits/rejected": -2.4566922187805176, "logps/chosen": -245.81436157226562, "logps/rejected": -1895.0390625, "logps_avg/chosen": -0.7605465054512024, "logps_avg/rejected": -6.2873053550720215, "loss": 0.701, "losses_ref": -0.05409424751996994, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 165, "u": -2.236602783203125, "weight": 0.0619116947054863 }, { "diff_generated": -23.36783218383789, "epoch": 0.3559277676001047, "grad_norm": 7.74288657614709, "learning_rate": 1.9635162068042544e-06, "logits/chosen": -2.5531725883483887, "logits/rejected": -2.385223627090454, "logps/chosen": -250.6099090576172, "logps/rejected": -2106.687744140625, "logps_avg/chosen": -0.7441612482070923, "logps_avg/rejected": -7.010349273681641, "loss": 0.7035, "losses_ref": -0.060589499771595, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 170, "u": -2.218136787414551, "weight": 0.0771271213889122 }, { "diff_generated": -23.426584243774414, "epoch": 0.36639623135304894, "grad_norm": 6.175218562127925, "learning_rate": 1.958454971880441e-06, "logits/chosen": -2.545517683029175, "logits/rejected": -2.3892464637756348, "logps/chosen": -271.62152099609375, "logps/rejected": -2128.689208984375, "logps_avg/chosen": -0.7712885141372681, "logps_avg/rejected": -7.027975559234619, "loss": 0.6768, "losses_ref": -0.059747565537691116, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 175, "u": -2.221135377883911, "weight": 0.07428421080112457 }, { "diff_generated": -23.27652931213379, "epoch": 0.3768646951059932, "grad_norm": 7.602114045248552, "learning_rate": 1.9530725005474194e-06, "logits/chosen": -2.5965559482574463, "logits/rejected": -2.4581873416900635, "logps/chosen": -225.35818481445312, "logps/rejected": -2096.1943359375, "logps_avg/chosen": -0.7377344369888306, "logps_avg/rejected": -6.982959747314453, "loss": 0.6599, "losses_ref": -0.06142450496554375, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 180, "u": -2.224907875061035, "weight": 0.07244168221950531 }, { "diff_generated": -24.591943740844727, "epoch": 0.38733315885893743, "grad_norm": 6.781608060052273, "learning_rate": 1.9473705967978807e-06, "logits/chosen": -2.6047005653381348, "logits/rejected": -2.4540090560913086, "logps/chosen": -231.2947235107422, "logps/rejected": -2179.2568359375, "logps_avg/chosen": -0.689501166343689, "logps_avg/rejected": -7.3775835037231445, "loss": 0.6665, "losses_ref": -0.05740996077656746, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 185, "u": -2.2239882946014404, "weight": 0.07182185351848602 }, { "diff_generated": -25.36248016357422, "epoch": 0.39780162261188173, "grad_norm": 10.641404317565371, "learning_rate": 1.941351171685697e-06, "logits/chosen": -2.5710506439208984, "logits/rejected": -2.4436774253845215, "logps/chosen": -236.1158905029297, "logps/rejected": -2273.37158203125, "logps_avg/chosen": -0.7929750680923462, "logps_avg/rejected": -7.6087446212768555, "loss": 0.7108, "losses_ref": -0.05253469944000244, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 190, "u": -2.239004373550415, "weight": 0.06010523438453674 }, { "diff_generated": -25.077518463134766, "epoch": 0.408270086364826, "grad_norm": 9.470830241427814, "learning_rate": 1.9350162426854148e-06, "logits/chosen": -2.602252244949341, "logits/rejected": -2.4661412239074707, "logps/chosen": -197.88571166992188, "logps/rejected": -2272.28076171875, "logps_avg/chosen": -0.7630836367607117, "logps_avg/rejected": -7.523255348205566, "loss": 0.6999, "losses_ref": -0.04595743492245674, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 195, "u": -2.243717670440674, "weight": 0.054233819246292114 }, { "diff_generated": -24.682910919189453, "epoch": 0.4187385501177702, "grad_norm": 6.269041714690376, "learning_rate": 1.9283679330160725e-06, "logits/chosen": -2.5849337577819824, "logits/rejected": -2.394373655319214, "logps/chosen": -242.97378540039062, "logps/rejected": -2224.541015625, "logps_avg/chosen": -0.7199097871780396, "logps_avg/rejected": -7.404873847961426, "loss": 0.69, "losses_ref": -0.0516563281416893, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 200, "u": -2.2307353019714355, "weight": 0.06507831811904907 }, { "diff_generated": -22.525114059448242, "epoch": 0.42920701387071447, "grad_norm": 6.963251924926938, "learning_rate": 1.9214084709295847e-06, "logits/chosen": -2.6382362842559814, "logits/rejected": -2.4577651023864746, "logps/chosen": -259.39349365234375, "logps/rejected": -2065.585693359375, "logps_avg/chosen": -0.7225343585014343, "logps_avg/rejected": -6.757534027099609, "loss": 0.696, "losses_ref": -0.05577712133526802, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 205, "u": -2.2293906211853027, "weight": 0.0664394274353981 }, { "diff_generated": -22.145648956298828, "epoch": 0.4396754776236587, "grad_norm": 7.299076527075288, "learning_rate": 1.9141401889639164e-06, "logits/chosen": -2.5583319664001465, "logits/rejected": -2.4039664268493652, "logps/chosen": -238.9542694091797, "logps/rejected": -2062.404541015625, "logps_avg/chosen": -0.7716320753097534, "logps_avg/rejected": -6.6436944007873535, "loss": 0.6993, "losses_ref": -0.058913152664899826, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 210, "u": -2.2152769565582275, "weight": 0.07614172250032425 }, { "diff_generated": -23.579111099243164, "epoch": 0.45014394137660296, "grad_norm": 8.50842985439364, "learning_rate": 1.906565523161312e-06, "logits/chosen": -2.600001335144043, "logits/rejected": -2.4590041637420654, "logps/chosen": -231.87673950195312, "logps/rejected": -2083.391357421875, "logps_avg/chosen": -0.7907384634017944, "logps_avg/rejected": -7.073732852935791, "loss": 0.7066, "losses_ref": -0.05489416792988777, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 215, "u": -2.234017848968506, "weight": 0.06394322961568832 }, { "diff_generated": -27.941696166992188, "epoch": 0.46061240512954726, "grad_norm": 6.305196829448204, "learning_rate": 1.8986870122518259e-06, "logits/chosen": -2.6018145084381104, "logits/rejected": -2.436535358428955, "logps/chosen": -245.06005859375, "logps/rejected": -2555.211181640625, "logps_avg/chosen": -0.7695084810256958, "logps_avg/rejected": -8.382509231567383, "loss": 0.7137, "losses_ref": -0.04443511739373207, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 220, "u": -2.2481765747070312, "weight": 0.051543742418289185 }, { "diff_generated": -26.58075523376465, "epoch": 0.4710808688824915, "grad_norm": 6.961879634992629, "learning_rate": 1.8905072968024423e-06, "logits/chosen": -2.567117214202881, "logits/rejected": -2.3789048194885254, "logps/chosen": -244.94296264648438, "logps/rejected": -2428.1923828125, "logps_avg/chosen": -0.7622503042221069, "logps_avg/rejected": -7.974226474761963, "loss": 0.6936, "losses_ref": -0.04088358208537102, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 225, "u": -2.2515604496002197, "weight": 0.04799002408981323 }, { "diff_generated": -24.84002113342285, "epoch": 0.48154933263543576, "grad_norm": 7.1763831101881275, "learning_rate": 1.88202911833206e-06, "logits/chosen": -2.520597219467163, "logits/rejected": -2.395034074783325, "logps/chosen": -213.36929321289062, "logps/rejected": -2192.75390625, "logps_avg/chosen": -0.7349015474319458, "logps_avg/rejected": -7.4520063400268555, "loss": 0.6978, "losses_ref": -0.051292240619659424, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 230, "u": -2.231480360031128, "weight": 0.06503967195749283 }, { "diff_generated": -26.721317291259766, "epoch": 0.49201779638838, "grad_norm": 5.9191842237687835, "learning_rate": 1.873255318392644e-06, "logits/chosen": -2.4896910190582275, "logits/rejected": -2.296112060546875, "logps/chosen": -239.5654296875, "logps/rejected": -2448.593505859375, "logps_avg/chosen": -0.7563043236732483, "logps_avg/rejected": -8.01639461517334, "loss": 0.7163, "losses_ref": -0.05161570757627487, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 235, "u": -2.230043649673462, "weight": 0.06615348160266876 }, { "diff_generated": -22.361705780029297, "epoch": 0.5024862601413242, "grad_norm": 6.264520814093759, "learning_rate": 1.8641888376168483e-06, "logits/chosen": -2.4571125507354736, "logits/rejected": -2.3177151679992676, "logps/chosen": -219.5469207763672, "logps/rejected": -1993.8834228515625, "logps_avg/chosen": -0.7551349997520447, "logps_avg/rejected": -6.708512783050537, "loss": 0.7049, "losses_ref": -0.05244841426610947, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 240, "u": -2.2336738109588623, "weight": 0.06469963490962982 }, { "diff_generated": -19.673988342285156, "epoch": 0.5129547238942685, "grad_norm": 6.373155717275301, "learning_rate": 1.8548327147324312e-06, "logits/chosen": -2.459257125854492, "logits/rejected": -2.273050546646118, "logps/chosen": -248.42935180664062, "logps/rejected": -1772.5706787109375, "logps_avg/chosen": -0.7812148928642273, "logps_avg/rejected": -5.902197360992432, "loss": 0.6961, "losses_ref": -0.0656919851899147, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 245, "u": -2.2100472450256348, "weight": 0.08213107287883759 }, { "diff_generated": -18.21377182006836, "epoch": 0.5234231876472127, "grad_norm": 7.015946672120974, "learning_rate": 1.8451900855437948e-06, "logits/chosen": -2.4628689289093018, "logits/rejected": -2.322192430496216, "logps/chosen": -242.85488891601562, "logps/rejected": -1614.31201171875, "logps_avg/chosen": -0.7260557413101196, "logps_avg/rejected": -5.4641313552856445, "loss": 0.6754, "losses_ref": -0.05365673825144768, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 250, "u": -2.234142303466797, "weight": 0.06292648613452911 }, { "diff_generated": -21.006351470947266, "epoch": 0.533891651400157, "grad_norm": 6.444057235727556, "learning_rate": 1.8352641818809846e-06, "logits/chosen": -2.44881010055542, "logits/rejected": -2.264845371246338, "logps/chosen": -258.3345031738281, "logps/rejected": -1910.637451171875, "logps_avg/chosen": -0.7704434394836426, "logps_avg/rejected": -6.301905155181885, "loss": 0.6922, "losses_ref": -0.05841520428657532, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 255, "u": -2.230961322784424, "weight": 0.06754828989505768 }, { "diff_generated": -21.22915267944336, "epoch": 0.5443601151531012, "grad_norm": 6.052398997240752, "learning_rate": 1.8250583305165094e-06, "logits/chosen": -2.3323371410369873, "logits/rejected": -2.212430477142334, "logps/chosen": -235.18038940429688, "logps/rejected": -1926.814453125, "logps_avg/chosen": -0.6792945861816406, "logps_avg/rejected": -6.368745803833008, "loss": 0.6742, "losses_ref": -0.047284115105867386, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 260, "u": -2.220738172531128, "weight": 0.06843873858451843 }, { "diff_generated": -21.301851272583008, "epoch": 0.5548285789060455, "grad_norm": 6.4499158810515755, "learning_rate": 1.8145759520503357e-06, "logits/chosen": -2.4637808799743652, "logits/rejected": -2.2752346992492676, "logps/chosen": -246.92269897460938, "logps/rejected": -1889.571533203125, "logps_avg/chosen": -0.7389290928840637, "logps_avg/rejected": -6.390555381774902, "loss": 0.6763, "losses_ref": -0.05337480455636978, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 265, "u": -2.2342476844787598, "weight": 0.06287747621536255 }, { "diff_generated": -24.129053115844727, "epoch": 0.5652970426589898, "grad_norm": 6.150486891273085, "learning_rate": 1.803820559763439e-06, "logits/chosen": -2.463932752609253, "logits/rejected": -2.262209415435791, "logps/chosen": -218.674072265625, "logps/rejected": -2158.11376953125, "logps_avg/chosen": -0.7358182072639465, "logps_avg/rejected": -7.238715171813965, "loss": 0.7092, "losses_ref": -0.05700932815670967, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 270, "u": -2.235081911087036, "weight": 0.0649222731590271 }, { "diff_generated": -22.390344619750977, "epoch": 0.575765506411934, "grad_norm": 7.077728369391663, "learning_rate": 1.7927957584402895e-06, "logits/chosen": -2.4641366004943848, "logits/rejected": -2.289757251739502, "logps/chosen": -230.87442016601562, "logps/rejected": -1978.302734375, "logps_avg/chosen": -0.6890340447425842, "logps_avg/rejected": -6.717103004455566, "loss": 0.6762, "losses_ref": -0.05622117966413498, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 275, "u": -2.217959403991699, "weight": 0.07241992652416229 }, { "diff_generated": -21.651906967163086, "epoch": 0.5862339701648783, "grad_norm": 6.269922997412507, "learning_rate": 1.78150524316067e-06, "logits/chosen": -2.512561082839966, "logits/rejected": -2.3291046619415283, "logps/chosen": -247.04129028320312, "logps/rejected": -1997.1549072265625, "logps_avg/chosen": -0.7235974073410034, "logps_avg/rejected": -6.495572566986084, "loss": 0.6702, "losses_ref": -0.04933195561170578, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 280, "u": -2.2435684204101562, "weight": 0.05631055310368538 }, { "diff_generated": -24.519784927368164, "epoch": 0.5967024339178225, "grad_norm": 6.4591538424452475, "learning_rate": 1.7699527980612304e-06, "logits/chosen": -2.533612012863159, "logits/rejected": -2.310060501098633, "logps/chosen": -241.06430053710938, "logps/rejected": -2117.74609375, "logps_avg/chosen": -0.7511512041091919, "logps_avg/rejected": -7.355935573577881, "loss": 0.7064, "losses_ref": -0.0406634621322155, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 285, "u": -2.252286434173584, "weight": 0.04695131629705429 }, { "diff_generated": -22.960649490356445, "epoch": 0.6071708976707668, "grad_norm": 6.2154170319293724, "learning_rate": 1.758142295067194e-06, "logits/chosen": -2.508026123046875, "logits/rejected": -2.2768871784210205, "logps/chosen": -256.1479797363281, "logps/rejected": -2004.0556640625, "logps_avg/chosen": -0.7584555745124817, "logps_avg/rejected": -6.888195037841797, "loss": 0.6642, "losses_ref": -0.05948421359062195, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 290, "u": -2.2216153144836426, "weight": 0.07435683906078339 }, { "diff_generated": -23.191375732421875, "epoch": 0.6176393614237111, "grad_norm": 6.58174772631908, "learning_rate": 1.7460776925946416e-06, "logits/chosen": -2.5151877403259277, "logits/rejected": -2.297478199005127, "logps/chosen": -233.7965087890625, "logps/rejected": -2135.15673828125, "logps_avg/chosen": -0.7887166738510132, "logps_avg/rejected": -6.957413673400879, "loss": 0.6755, "losses_ref": -0.055867087095975876, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 295, "u": -2.237914562225342, "weight": 0.062143467366695404 }, { "diff_generated": -24.709823608398438, "epoch": 0.6281078251766553, "grad_norm": 7.437442244122165, "learning_rate": 1.7337630342238039e-06, "logits/chosen": -2.525470018386841, "logits/rejected": -2.3166513442993164, "logps/chosen": -229.94558715820312, "logps/rejected": -2169.215576171875, "logps_avg/chosen": -0.7630201578140259, "logps_avg/rejected": -7.412947177886963, "loss": 0.7146, "losses_ref": -0.0521920807659626, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 300, "u": -2.238734722137451, "weight": 0.06009601429104805 }, { "diff_generated": -25.142507553100586, "epoch": 0.6385762889295996, "grad_norm": 6.81810983140467, "learning_rate": 1.7212024473438145e-06, "logits/chosen": -2.5295021533966064, "logits/rejected": -2.3437719345092773, "logps/chosen": -230.28018188476562, "logps/rejected": -2279.5810546875, "logps_avg/chosen": -0.6913032531738281, "logps_avg/rejected": -7.54275369644165, "loss": 0.6605, "losses_ref": -0.04879279434680939, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 305, "u": -2.2388291358947754, "weight": 0.058758098632097244 }, { "diff_generated": -24.345029830932617, "epoch": 0.6490447526825438, "grad_norm": 6.09422333137857, "learning_rate": 1.70840014176937e-06, "logits/chosen": -2.496091604232788, "logits/rejected": -2.2605936527252197, "logps/chosen": -264.0978698730469, "logps/rejected": -2208.2470703125, "logps_avg/chosen": -0.7388861179351807, "logps_avg/rejected": -7.303508758544922, "loss": 0.6912, "losses_ref": -0.042303841561079025, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 310, "u": -2.246720552444458, "weight": 0.05154282599687576 }, { "diff_generated": -23.305843353271484, "epoch": 0.6595132164354881, "grad_norm": 6.009874799920644, "learning_rate": 1.6953604083297663e-06, "logits/chosen": -2.5141513347625732, "logits/rejected": -2.3054990768432617, "logps/chosen": -241.82406616210938, "logps/rejected": -2167.42724609375, "logps_avg/chosen": -0.740818202495575, "logps_avg/rejected": -6.991753578186035, "loss": 0.6887, "losses_ref": -0.0596298985183239, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 315, "u": -2.2182247638702393, "weight": 0.07611407339572906 }, { "diff_generated": -27.0042724609375, "epoch": 0.6699816801884323, "grad_norm": 5.920473182891855, "learning_rate": 1.6820876174307821e-06, "logits/chosen": -2.482053279876709, "logits/rejected": -2.2886459827423096, "logps/chosen": -223.24893188476562, "logps/rejected": -2428.3193359375, "logps_avg/chosen": -0.7374002933502197, "logps_avg/rejected": -8.101282119750977, "loss": 0.6816, "losses_ref": -0.049068134278059006, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 320, "u": -2.235114574432373, "weight": 0.06041133403778076 }, { "diff_generated": -25.161632537841797, "epoch": 0.6804501439413766, "grad_norm": 6.759097342452152, "learning_rate": 1.668586217589889e-06, "logits/chosen": -2.4576220512390137, "logits/rejected": -2.2568023204803467, "logps/chosen": -255.9824676513672, "logps/rejected": -2272.87451171875, "logps_avg/chosen": -0.8112290501594543, "logps_avg/rejected": -7.548490047454834, "loss": 0.7034, "losses_ref": -0.04155198484659195, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 325, "u": -2.2597880363464355, "weight": 0.04243909567594528 }, { "diff_generated": -26.866863250732422, "epoch": 0.6909186076943209, "grad_norm": 5.913181146879915, "learning_rate": 1.6548607339452852e-06, "logits/chosen": -2.5034430027008057, "logits/rejected": -2.2873404026031494, "logps/chosen": -219.890625, "logps/rejected": -2450.533203125, "logps_avg/chosen": -0.7192927598953247, "logps_avg/rejected": -8.060060501098633, "loss": 0.679, "losses_ref": -0.04148325324058533, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 330, "u": -2.258359432220459, "weight": 0.04333708435297012 }, { "diff_generated": -26.58041000366211, "epoch": 0.7013870714472651, "grad_norm": 6.258862828154151, "learning_rate": 1.6409157667392455e-06, "logits/chosen": -2.5029423236846924, "logits/rejected": -2.2649450302124023, "logps/chosen": -239.6374969482422, "logps/rejected": -2410.685302734375, "logps_avg/chosen": -0.7706997990608215, "logps_avg/rejected": -7.974122524261475, "loss": 0.7035, "losses_ref": -0.05212752893567085, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 335, "u": -2.2294507026672363, "weight": 0.06685757637023926 }, { "diff_generated": -30.061986923217773, "epoch": 0.7118555352002094, "grad_norm": 7.477206152513725, "learning_rate": 1.6267559897763027e-06, "logits/chosen": -2.3795595169067383, "logits/rejected": -2.18742036819458, "logps/chosen": -192.0414276123047, "logps/rejected": -2674.73486328125, "logps_avg/chosen": -0.7409474849700928, "logps_avg/rejected": -9.018596649169922, "loss": 0.6831, "losses_ref": -0.044330693781375885, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 340, "u": -2.235874652862549, "weight": 0.059127964079380035 }, { "diff_generated": -28.720870971679688, "epoch": 0.7223239989531536, "grad_norm": 5.9002590426162325, "learning_rate": 1.6123861488567708e-06, "logits/chosen": -2.4881465435028076, "logits/rejected": -2.2146873474121094, "logps/chosen": -260.3475341796875, "logps/rejected": -2515.25, "logps_avg/chosen": -0.7461652755737305, "logps_avg/rejected": -8.61626148223877, "loss": 0.6968, "losses_ref": -0.044901080429553986, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 345, "u": -2.2523629665374756, "weight": 0.04855785518884659 }, { "diff_generated": -28.04868507385254, "epoch": 0.7327924627060979, "grad_norm": 15.894199978415127, "learning_rate": 1.5978110601861409e-06, "logits/chosen": -2.471588611602783, "logits/rejected": -2.2580113410949707, "logps/chosen": -255.3411102294922, "logps/rejected": -2506.482666015625, "logps_avg/chosen": -0.7827759385108948, "logps_avg/rejected": -8.414606094360352, "loss": 0.7362, "losses_ref": -0.04014447331428528, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 350, "u": -2.2483315467834473, "weight": 0.04962104931473732 }, { "diff_generated": -27.11871337890625, "epoch": 0.7432609264590422, "grad_norm": 5.4012187487436725, "learning_rate": 1.5830356087608763e-06, "logits/chosen": -2.4285144805908203, "logits/rejected": -2.1649179458618164, "logps/chosen": -218.6619415283203, "logps/rejected": -2413.4892578125, "logps_avg/chosen": -0.7086374163627625, "logps_avg/rejected": -8.135615348815918, "loss": 0.7021, "losses_ref": -0.03781733289361, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 355, "u": -2.2616829872131348, "weight": 0.0397658608853817 }, { "diff_generated": -26.739330291748047, "epoch": 0.7537293902119864, "grad_norm": 6.5263260405852614, "learning_rate": 1.5680647467311555e-06, "logits/chosen": -2.4289963245391846, "logits/rejected": -2.133953332901001, "logps/chosen": -247.11563110351562, "logps/rejected": -2465.95849609375, "logps_avg/chosen": -0.7823926210403442, "logps_avg/rejected": -8.02180004119873, "loss": 0.6853, "losses_ref": -0.0527551993727684, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 360, "u": -2.2420668601989746, "weight": 0.0583949089050293 }, { "diff_generated": -27.283761978149414, "epoch": 0.7641978539649307, "grad_norm": 6.979588218526593, "learning_rate": 1.552903491741107e-06, "logits/chosen": -2.449387550354004, "logits/rejected": -2.1368231773376465, "logps/chosen": -234.6686553955078, "logps/rejected": -2578.747802734375, "logps_avg/chosen": -0.740507185459137, "logps_avg/rejected": -8.185129165649414, "loss": 0.6824, "losses_ref": -0.03961649537086487, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 365, "u": -2.2473196983337402, "weight": 0.05010756850242615 }, { "diff_generated": -26.48313331604004, "epoch": 0.7746663177178749, "grad_norm": 7.697158528726637, "learning_rate": 1.5375569252470895e-06, "logits/chosen": -2.5160136222839355, "logits/rejected": -2.2105443477630615, "logps/chosen": -270.76727294921875, "logps/rejected": -2356.61376953125, "logps_avg/chosen": -0.798203706741333, "logps_avg/rejected": -7.944940090179443, "loss": 0.6956, "losses_ref": -0.03987672179937363, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 370, "u": -2.259321689605713, "weight": 0.042373161762952805 }, { "diff_generated": -25.16873550415039, "epoch": 0.7851347814708192, "grad_norm": 6.394620991151716, "learning_rate": 1.5220301908145903e-06, "logits/chosen": -2.464231491088867, "logits/rejected": -2.1346030235290527, "logps/chosen": -240.89230346679688, "logps/rejected": -2322.256591796875, "logps_avg/chosen": -0.6929277181625366, "logps_avg/rejected": -7.55062198638916, "loss": 0.6924, "losses_ref": -0.04263712465763092, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 375, "u": -2.2447123527526855, "weight": 0.05186506360769272 }, { "diff_generated": -26.598400115966797, "epoch": 0.7956032452237635, "grad_norm": 6.833084085030009, "learning_rate": 1.5063284923943028e-06, "logits/chosen": -2.4700121879577637, "logits/rejected": -2.1623213291168213, "logps/chosen": -255.25228881835938, "logps/rejected": -2325.41162109375, "logps_avg/chosen": -0.7505702376365662, "logps_avg/rejected": -7.97952127456665, "loss": 0.6914, "losses_ref": -0.039328016340732574, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 380, "u": -2.2589406967163086, "weight": 0.04286640137434006 }, { "diff_generated": -29.339923858642578, "epoch": 0.8060717089767077, "grad_norm": 6.446112080414134, "learning_rate": 1.490457092577968e-06, "logits/chosen": -2.4463934898376465, "logits/rejected": -2.0776758193969727, "logps/chosen": -232.91452026367188, "logps/rejected": -2714.375244140625, "logps_avg/chosen": -0.6785185933113098, "logps_avg/rejected": -8.801977157592773, "loss": 0.6865, "losses_ref": -0.04436464607715607, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 385, "u": -2.247189998626709, "weight": 0.05228755623102188 }, { "diff_generated": -27.133153915405273, "epoch": 0.816540172729652, "grad_norm": 5.888520537518448, "learning_rate": 1.4744213108345602e-06, "logits/chosen": -2.5249063968658447, "logits/rejected": -2.1448757648468018, "logps/chosen": -258.61212158203125, "logps/rejected": -2449.294677734375, "logps_avg/chosen": -0.7527631521224976, "logps_avg/rejected": -8.139945983886719, "loss": 0.685, "losses_ref": -0.0589534267783165, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 390, "u": -2.2169764041900635, "weight": 0.0769612193107605 }, { "diff_generated": -27.033132553100586, "epoch": 0.8270086364825961, "grad_norm": 6.211789156823427, "learning_rate": 1.4582265217274103e-06, "logits/chosen": -2.4122936725616455, "logits/rejected": -2.095203161239624, "logps/chosen": -251.5576629638672, "logps/rejected": -2401.735595703125, "logps_avg/chosen": -0.7489043474197388, "logps_avg/rejected": -8.109941482543945, "loss": 0.6753, "losses_ref": -0.048131681978702545, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 395, "u": -2.247305393218994, "weight": 0.053915899246931076 }, { "diff_generated": -30.035808563232422, "epoch": 0.8374771002355405, "grad_norm": 6.698107767192597, "learning_rate": 1.4418781531128635e-06, "logits/chosen": -2.486995220184326, "logits/rejected": -2.131185531616211, "logps/chosen": -239.08642578125, "logps/rejected": -2759.15625, "logps_avg/chosen": -0.7630764245986938, "logps_avg/rejected": -9.010741233825684, "loss": 0.6892, "losses_ref": -0.036631032824516296, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 400, "u": -2.2631499767303467, "weight": 0.038820598274469376 }, { "diff_generated": -27.721935272216797, "epoch": 0.8479455639884846, "grad_norm": 7.031324917308057, "learning_rate": 1.4253816843210748e-06, "logits/chosen": -2.4483680725097656, "logits/rejected": -2.089618444442749, "logps/chosen": -249.0079803466797, "logps/rejected": -2574.352783203125, "logps_avg/chosen": -0.722091019153595, "logps_avg/rejected": -8.316580772399902, "loss": 0.7066, "losses_ref": -0.043711207807064056, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 405, "u": -2.2515838146209717, "weight": 0.048544611781835556 }, { "diff_generated": -30.00594711303711, "epoch": 0.8584140277414289, "grad_norm": 5.878873328550679, "learning_rate": 1.4087426443195547e-06, "logits/chosen": -2.4377264976501465, "logits/rejected": -2.0860629081726074, "logps/chosen": -220.13644409179688, "logps/rejected": -2700.03369140625, "logps_avg/chosen": -0.7378045916557312, "logps_avg/rejected": -9.001784324645996, "loss": 0.6757, "losses_ref": -0.032459113746881485, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 410, "u": -2.2480547428131104, "weight": 0.04561341553926468 }, { "diff_generated": -27.96181297302246, "epoch": 0.8688824914943732, "grad_norm": 6.085121754886306, "learning_rate": 1.391966609860075e-06, "logits/chosen": -2.4773359298706055, "logits/rejected": -2.129520893096924, "logps/chosen": -239.4454803466797, "logps/rejected": -2550.92919921875, "logps_avg/chosen": -0.7163268327713013, "logps_avg/rejected": -8.388544082641602, "loss": 0.6864, "losses_ref": -0.03842215612530708, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 415, "u": -2.2578535079956055, "weight": 0.04306939244270325 }, { "diff_generated": -27.701797485351562, "epoch": 0.8793509552473174, "grad_norm": 6.898834621323108, "learning_rate": 1.3750592036095619e-06, "logits/chosen": -2.4759981632232666, "logits/rejected": -2.1207737922668457, "logps/chosen": -255.3009490966797, "logps/rejected": -2467.61328125, "logps_avg/chosen": -0.7468316555023193, "logps_avg/rejected": -8.310539245605469, "loss": 0.6929, "losses_ref": -0.050536155700683594, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 420, "u": -2.2376935482025146, "weight": 0.05989469215273857 }, { "diff_generated": -27.900798797607422, "epoch": 0.8898194190002617, "grad_norm": 7.318402161699278, "learning_rate": 1.3580260922655984e-06, "logits/chosen": -2.459564685821533, "logits/rejected": -2.133777379989624, "logps/chosen": -232.8207550048828, "logps/rejected": -2438.7041015625, "logps_avg/chosen": -0.7522517442703247, "logps_avg/rejected": -8.370241165161133, "loss": 0.6907, "losses_ref": -0.040023092180490494, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 425, "u": -2.2584662437438965, "weight": 0.043312422931194305 }, { "diff_generated": -29.812658309936523, "epoch": 0.9002878827532059, "grad_norm": 6.38418063766252, "learning_rate": 1.3408729846571713e-06, "logits/chosen": -2.4594979286193848, "logits/rejected": -2.071135997772217, "logps/chosen": -280.634521484375, "logps/rejected": -2652.205322265625, "logps_avg/chosen": -0.7122408747673035, "logps_avg/rejected": -8.943798065185547, "loss": 0.6859, "losses_ref": -0.03510651737451553, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 430, "u": -2.2575087547302246, "weight": 0.04237521067261696 }, { "diff_generated": -25.203630447387695, "epoch": 0.9107563465061502, "grad_norm": 6.588604544150575, "learning_rate": 1.3236056298312956e-06, "logits/chosen": -2.481071949005127, "logits/rejected": -2.1608479022979736, "logps/chosen": -234.13027954101562, "logps/rejected": -2276.569580078125, "logps_avg/chosen": -0.7077358365058899, "logps_avg/rejected": -7.561089992523193, "loss": 0.6722, "losses_ref": -0.04718080908060074, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 435, "u": -2.249262809753418, "weight": 0.05173084884881973 }, { "diff_generated": -23.8907470703125, "epoch": 0.9212248102590945, "grad_norm": 6.857956310477159, "learning_rate": 1.3062298151261591e-06, "logits/chosen": -2.5335617065429688, "logits/rejected": -2.219560146331787, "logps/chosen": -250.57705688476562, "logps/rejected": -2190.95947265625, "logps_avg/chosen": -0.6971117854118347, "logps_avg/rejected": -7.167223930358887, "loss": 0.6666, "losses_ref": -0.04138738289475441, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 440, "u": -2.250927686691284, "weight": 0.04870566353201866 }, { "diff_generated": -24.81663703918457, "epoch": 0.9316932740120387, "grad_norm": 7.035268937333438, "learning_rate": 1.2887513642314372e-06, "logits/chosen": -2.466610908508301, "logits/rejected": -2.1418159008026123, "logps/chosen": -234.072021484375, "logps/rejected": -2254.32177734375, "logps_avg/chosen": -0.7226396203041077, "logps_avg/rejected": -7.4449920654296875, "loss": 0.6772, "losses_ref": -0.02925349771976471, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 445, "u": -2.261955738067627, "weight": 0.036579299718141556 }, { "diff_generated": -23.858409881591797, "epoch": 0.942161737764983, "grad_norm": 5.8496221029871895, "learning_rate": 1.271176135236417e-06, "logits/chosen": -2.5474791526794434, "logits/rejected": -2.2467288970947266, "logps/chosen": -259.63043212890625, "logps/rejected": -2068.02978515625, "logps_avg/chosen": -0.7590965032577515, "logps_avg/rejected": -7.157523155212402, "loss": 0.6926, "losses_ref": -0.04666949436068535, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 450, "u": -2.24082612991333, "weight": 0.057170577347278595 }, { "diff_generated": -21.257368087768555, "epoch": 0.9526302015179272, "grad_norm": 9.579263194990599, "learning_rate": 1.2535100186666e-06, "logits/chosen": -2.5334389209747314, "logits/rejected": -2.2800872325897217, "logps/chosen": -258.4393615722656, "logps/rejected": -1949.274658203125, "logps_avg/chosen": -0.7667442560195923, "logps_avg/rejected": -6.377211093902588, "loss": 0.7272, "losses_ref": -0.04685154929757118, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 455, "u": -2.239955186843872, "weight": 0.05661741644144058 }, { "diff_generated": -21.260351181030273, "epoch": 0.9630986652708715, "grad_norm": 7.19097418251884, "learning_rate": 1.2357589355094273e-06, "logits/chosen": -2.5235114097595215, "logits/rejected": -2.2688846588134766, "logps/chosen": -274.0472106933594, "logps/rejected": -1854.4193115234375, "logps_avg/chosen": -0.7401561141014099, "logps_avg/rejected": -6.378105163574219, "loss": 0.6996, "losses_ref": -0.04187412187457085, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 460, "u": -2.255484104156494, "weight": 0.04560910537838936 }, { "diff_generated": -20.870580673217773, "epoch": 0.9735671290238157, "grad_norm": 5.873029901097039, "learning_rate": 1.2179288352297982e-06, "logits/chosen": -2.5459725856781006, "logits/rejected": -2.300191879272461, "logps/chosen": -233.07363891601562, "logps/rejected": -1780.218505859375, "logps_avg/chosen": -0.676838755607605, "logps_avg/rejected": -6.26117467880249, "loss": 0.701, "losses_ref": -0.035965751856565475, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 465, "u": -2.2623190879821777, "weight": 0.03852839767932892 }, { "diff_generated": -20.314434051513672, "epoch": 0.98403559277676, "grad_norm": 6.047640955364439, "learning_rate": 1.2000256937760445e-06, "logits/chosen": -2.478569746017456, "logits/rejected": -2.2165324687957764, "logps/chosen": -241.59115600585938, "logps/rejected": -1793.3131103515625, "logps_avg/chosen": -0.7300271987915039, "logps_avg/rejected": -6.094330787658691, "loss": 0.6661, "losses_ref": -0.03309565782546997, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 470, "u": -2.2623355388641357, "weight": 0.03777972236275673 }, { "diff_generated": -20.79926109313965, "epoch": 0.9945040565297043, "grad_norm": 8.164412498048108, "learning_rate": 1.1820555115770255e-06, "logits/chosen": -2.5342564582824707, "logits/rejected": -2.2890594005584717, "logps/chosen": -230.3572235107422, "logps/rejected": -1833.0390625, "logps_avg/chosen": -0.751907467842102, "logps_avg/rejected": -6.239778995513916, "loss": 0.6895, "losses_ref": -0.03975466638803482, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 475, "u": -2.2550594806671143, "weight": 0.04479961842298508 }, { "diff_generated": -20.89034080505371, "epoch": 1.0049725202826485, "grad_norm": 9.91162629957212, "learning_rate": 1.1640243115310217e-06, "logits/chosen": -2.515481948852539, "logits/rejected": -2.238800525665283, "logps/chosen": -238.7968292236328, "logps/rejected": -1904.6226806640625, "logps_avg/chosen": -0.730613112449646, "logps_avg/rejected": -6.2671027183532715, "loss": 0.6185, "losses_ref": -0.0886848121881485, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 480, "u": -3.230264186859131, "weight": 0.0876741111278534 }, { "diff_generated": -22.67684555053711, "epoch": 1.0154409840355927, "grad_norm": 8.949481189927978, "learning_rate": 1.1459381369870972e-06, "logits/chosen": -2.4899425506591797, "logits/rejected": -2.1274173259735107, "logps/chosen": -239.3141632080078, "logps/rejected": -2098.4287109375, "logps_avg/chosen": -0.6295738816261292, "logps_avg/rejected": -6.8030548095703125, "loss": 0.5199, "losses_ref": -0.09897326678037643, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 485, "u": -4.378706932067871, "weight": 0.08350441604852676 }, { "diff_generated": -24.163660049438477, "epoch": 1.025909447788537, "grad_norm": 8.708694233875605, "learning_rate": 1.1278030497196046e-06, "logits/chosen": -2.448932409286499, "logits/rejected": -2.0961108207702637, "logps/chosen": -197.19461059570312, "logps/rejected": -2133.96630859375, "logps_avg/chosen": -0.5785419940948486, "logps_avg/rejected": -7.2490973472595215, "loss": 0.5111, "losses_ref": -0.12583398818969727, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 490, "u": -4.304468631744385, "weight": 0.12433832883834839 }, { "diff_generated": -25.089040756225586, "epoch": 1.0363779115414813, "grad_norm": 8.538618246529412, "learning_rate": 1.1096251278965172e-06, "logits/chosen": -2.4840457439422607, "logits/rejected": -2.1427814960479736, "logps/chosen": -202.72528076171875, "logps/rejected": -2115.415283203125, "logps_avg/chosen": -0.5992251038551331, "logps_avg/rejected": -7.526711940765381, "loss": 0.4987, "losses_ref": -0.10639525949954987, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 495, "u": -4.343268394470215, "weight": 0.10977953672409058 }, { "diff_generated": -24.132022857666016, "epoch": 1.0468463752944255, "grad_norm": 7.67811116418592, "learning_rate": 1.0914104640422679e-06, "logits/chosen": -2.4932920932769775, "logits/rejected": -2.1089999675750732, "logps/chosen": -199.10342407226562, "logps/rejected": -2176.26318359375, "logps_avg/chosen": -0.6183401346206665, "logps_avg/rejected": -7.2396063804626465, "loss": 0.5202, "losses_ref": -0.12012694031000137, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 500, "u": -4.302992820739746, "weight": 0.11936072260141373 }, { "diff_generated": -23.451740264892578, "epoch": 1.05731483904737, "grad_norm": 20.37435210253164, "learning_rate": 1.0731651629957721e-06, "logits/chosen": -2.42221736907959, "logits/rejected": -2.1496148109436035, "logps/chosen": -229.11068725585938, "logps/rejected": -2094.52197265625, "logps_avg/chosen": -0.6533752679824829, "logps_avg/rejected": -7.035521507263184, "loss": 0.5184, "losses_ref": -0.1230870932340622, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 505, "u": -4.369751930236816, "weight": 0.1066075786948204 }, { "diff_generated": -22.74098777770996, "epoch": 1.067783302800314, "grad_norm": 7.268444145722818, "learning_rate": 1.0548953398643274e-06, "logits/chosen": -2.4076297283172607, "logits/rejected": -2.0819380283355713, "logps/chosen": -233.77938842773438, "logps/rejected": -2035.225830078125, "logps_avg/chosen": -0.6575011014938354, "logps_avg/rejected": -6.822296142578125, "loss": 0.4947, "losses_ref": -0.14097091555595398, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 510, "u": -4.253186225891113, "weight": 0.1333218514919281 }, { "diff_generated": -25.923725128173828, "epoch": 1.0782517665532583, "grad_norm": 8.062661700192072, "learning_rate": 1.0366071179740706e-06, "logits/chosen": -2.4787120819091797, "logits/rejected": -2.12414288520813, "logps/chosen": -257.2312927246094, "logps/rejected": -2302.900634765625, "logps_avg/chosen": -0.6627689003944397, "logps_avg/rejected": -7.777116298675537, "loss": 0.5085, "losses_ref": -0.10705102980136871, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 515, "u": -4.345104217529297, "weight": 0.10459395498037338 }, { "diff_generated": -27.071746826171875, "epoch": 1.0887202303062025, "grad_norm": 7.3598703596101975, "learning_rate": 1.0183066268176775e-06, "logits/chosen": -2.436248779296875, "logits/rejected": -2.075679063796997, "logps/chosen": -244.1257781982422, "logps/rejected": -2375.113525390625, "logps_avg/chosen": -0.6157761812210083, "logps_avg/rejected": -8.1215238571167, "loss": 0.5683, "losses_ref": -0.08251279592514038, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 520, "u": -4.396883010864258, "weight": 0.06908340752124786 }, { "diff_generated": -26.481449127197266, "epoch": 1.0991886940591469, "grad_norm": 8.892060607648993, "learning_rate": 1e-06, "logits/chosen": -2.4646589756011963, "logits/rejected": -2.096703290939331, "logps/chosen": -226.17453002929688, "logps/rejected": -2343.119384765625, "logps_avg/chosen": -0.6375609040260315, "logps_avg/rejected": -7.944435119628906, "loss": 0.5652, "losses_ref": -0.08028392493724823, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 525, "u": -4.399170398712158, "weight": 0.07661790400743484 }, { "diff_generated": -25.77886962890625, "epoch": 1.109657157812091, "grad_norm": 11.93280848823974, "learning_rate": 9.816933731823228e-07, "logits/chosen": -2.4755985736846924, "logits/rejected": -2.1236746311187744, "logps/chosen": -219.5588836669922, "logps/rejected": -2258.547119140625, "logps_avg/chosen": -0.6109720468521118, "logps_avg/rejected": -7.733660697937012, "loss": 0.5032, "losses_ref": -0.09919899702072144, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 530, "u": -4.327098846435547, "weight": 0.0968068465590477 }, { "diff_generated": -26.962757110595703, "epoch": 1.1201256215650353, "grad_norm": 11.74024044453861, "learning_rate": 9.633928820259293e-07, "logits/chosen": -2.382981777191162, "logits/rejected": -1.9988247156143188, "logps/chosen": -198.56578063964844, "logps/rejected": -2398.09326171875, "logps_avg/chosen": -0.6096338033676147, "logps_avg/rejected": -8.088827133178711, "loss": 0.5305, "losses_ref": -0.06856809556484222, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 535, "u": -4.411980152130127, "weight": 0.06424126774072647 }, { "diff_generated": -26.22715187072754, "epoch": 1.1305940853179797, "grad_norm": 11.054487118285914, "learning_rate": 9.451046601356725e-07, "logits/chosen": -2.4410181045532227, "logits/rejected": -2.095543146133423, "logps/chosen": -207.6184844970703, "logps/rejected": -2253.38623046875, "logps_avg/chosen": -0.6336568593978882, "logps_avg/rejected": -7.868145942687988, "loss": 0.5357, "losses_ref": -0.0955720990896225, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 540, "u": -4.348645210266113, "weight": 0.09204810112714767 }, { "diff_generated": -25.266141891479492, "epoch": 1.1410625490709239, "grad_norm": 8.805909515635294, "learning_rate": 9.268348370042281e-07, "logits/chosen": -2.4485838413238525, "logits/rejected": -2.1053905487060547, "logps/chosen": -216.48910522460938, "logps/rejected": -2250.44775390625, "logps_avg/chosen": -0.588961124420166, "logps_avg/rejected": -7.579843044281006, "loss": 0.5159, "losses_ref": -0.09172032028436661, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 545, "u": -4.326230525970459, "weight": 0.09086887538433075 }, { "diff_generated": -26.917110443115234, "epoch": 1.151531012823868, "grad_norm": 10.666064793677686, "learning_rate": 9.085895359577323e-07, "logits/chosen": -2.404174566268921, "logits/rejected": -2.037463665008545, "logps/chosen": -205.3460235595703, "logps/rejected": -2429.36279296875, "logps_avg/chosen": -0.5989923477172852, "logps_avg/rejected": -8.07513427734375, "loss": 0.5332, "losses_ref": -0.06065789982676506, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 550, "u": -4.387241363525391, "weight": 0.0559367910027504 }, { "diff_generated": -25.942188262939453, "epoch": 1.1619994765768125, "grad_norm": 10.199822581929254, "learning_rate": 8.903748721034826e-07, "logits/chosen": -2.432077407836914, "logits/rejected": -2.0631113052368164, "logps/chosen": -209.88076782226562, "logps/rejected": -2297.24853515625, "logps_avg/chosen": -0.6222396492958069, "logps_avg/rejected": -7.782655239105225, "loss": 0.5436, "losses_ref": -0.053764212876558304, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 555, "u": -4.413111209869385, "weight": 0.05243021994829178 }, { "diff_generated": -26.842655181884766, "epoch": 1.1724679403297567, "grad_norm": 9.055623269790141, "learning_rate": 8.721969502803953e-07, "logits/chosen": -2.4761881828308105, "logits/rejected": -2.037745952606201, "logps/chosen": -228.0619659423828, "logps/rejected": -2454.422607421875, "logps_avg/chosen": -0.6156254410743713, "logps_avg/rejected": -8.052797317504883, "loss": 0.4938, "losses_ref": -0.06194459646940231, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 560, "u": -4.417675018310547, "weight": 0.05182374641299248 }, { "diff_generated": -25.78971290588379, "epoch": 1.1829364040827008, "grad_norm": 11.397081928275703, "learning_rate": 8.540618630129027e-07, "logits/chosen": -2.4368996620178223, "logits/rejected": -2.0613627433776855, "logps/chosen": -244.33059692382812, "logps/rejected": -2314.3056640625, "logps_avg/chosen": -0.6685888171195984, "logps_avg/rejected": -7.736914157867432, "loss": 0.5495, "losses_ref": -0.07071459293365479, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 565, "u": -4.390778064727783, "weight": 0.06269918382167816 }, { "diff_generated": -26.82694435119629, "epoch": 1.193404867835645, "grad_norm": 9.221832000440747, "learning_rate": 8.359756884689783e-07, "logits/chosen": -2.497908115386963, "logits/rejected": -2.125258207321167, "logps/chosen": -215.4803009033203, "logps/rejected": -2407.225830078125, "logps_avg/chosen": -0.6236811876296997, "logps_avg/rejected": -8.048083305358887, "loss": 0.5244, "losses_ref": -0.08507435768842697, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 570, "u": -4.405519485473633, "weight": 0.07415871322154999 }, { "diff_generated": -27.44614601135254, "epoch": 1.2038733315885894, "grad_norm": 14.484772212758768, "learning_rate": 8.179444884229744e-07, "logits/chosen": -2.415398597717285, "logits/rejected": -2.0458593368530273, "logps/chosen": -224.60482788085938, "logps/rejected": -2476.796142578125, "logps_avg/chosen": -0.6788522601127625, "logps_avg/rejected": -8.233844757080078, "loss": 0.5625, "losses_ref": -0.05934012681245804, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 575, "u": -4.461823463439941, "weight": 0.044574279338121414 }, { "diff_generated": -29.135217666625977, "epoch": 1.2143417953415336, "grad_norm": 18.01394064023352, "learning_rate": 7.999743062239557e-07, "logits/chosen": -2.4544944763183594, "logits/rejected": -2.104241371154785, "logps/chosen": -210.87893676757812, "logps/rejected": -2643.50390625, "logps_avg/chosen": -0.6716314554214478, "logps_avg/rejected": -8.740565299987793, "loss": 0.5555, "losses_ref": -0.056417226791381836, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 580, "u": -4.44529914855957, "weight": 0.04976346716284752 }, { "diff_generated": -27.484622955322266, "epoch": 1.2248102590944778, "grad_norm": 10.29630717051048, "learning_rate": 7.820711647702017e-07, "logits/chosen": -2.4541475772857666, "logits/rejected": -2.0904035568237305, "logps/chosen": -202.5820770263672, "logps/rejected": -2515.11962890625, "logps_avg/chosen": -0.5754384994506836, "logps_avg/rejected": -8.245387077331543, "loss": 0.5346, "losses_ref": -0.08221448957920074, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 585, "u": -4.365923881530762, "weight": 0.07960718125104904 }, { "diff_generated": -26.950695037841797, "epoch": 1.235278722847422, "grad_norm": 10.223108898541343, "learning_rate": 7.642410644905726e-07, "logits/chosen": -2.3840575218200684, "logits/rejected": -2.0544769763946533, "logps/chosen": -205.935546875, "logps/rejected": -2364.6396484375, "logps_avg/chosen": -0.5895050764083862, "logps_avg/rejected": -8.08520793914795, "loss": 0.5503, "losses_ref": -0.10383725166320801, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 590, "u": -4.365530490875244, "weight": 0.09789486229419708 }, { "diff_generated": -29.25247573852539, "epoch": 1.2457471866003664, "grad_norm": 12.09100466478698, "learning_rate": 7.464899813334e-07, "logits/chosen": -2.3943965435028076, "logits/rejected": -2.067821979522705, "logps/chosen": -215.44094848632812, "logps/rejected": -2522.196533203125, "logps_avg/chosen": -0.6099680662155151, "logps_avg/rejected": -8.77574348449707, "loss": 0.5325, "losses_ref": -0.07746943831443787, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 595, "u": -4.356167793273926, "weight": 0.07601340860128403 }, { "diff_generated": -27.34578514099121, "epoch": 1.2562156503533106, "grad_norm": 8.052346731222642, "learning_rate": 7.288238647635829e-07, "logits/chosen": -2.435148239135742, "logits/rejected": -2.1030170917510986, "logps/chosen": -226.7269744873047, "logps/rejected": -2427.451171875, "logps_avg/chosen": -0.6252392530441284, "logps_avg/rejected": -8.2037353515625, "loss": 0.5356, "losses_ref": -0.06464961916208267, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 600, "u": -4.406424522399902, "weight": 0.061459980905056 }, { "diff_generated": -27.118465423583984, "epoch": 1.2666841141062548, "grad_norm": 11.655006277757288, "learning_rate": 7.112486357685631e-07, "logits/chosen": -2.450383424758911, "logits/rejected": -2.0887584686279297, "logps/chosen": -222.7769012451172, "logps/rejected": -2357.30712890625, "logps_avg/chosen": -0.6189793348312378, "logps_avg/rejected": -8.135540008544922, "loss": 0.5517, "losses_ref": -0.08965682238340378, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 605, "u": -4.366550445556641, "weight": 0.09028217941522598 }, { "diff_generated": -27.826339721679688, "epoch": 1.2771525778591992, "grad_norm": 8.355569379147827, "learning_rate": 6.937701848738407e-07, "logits/chosen": -2.4444997310638428, "logits/rejected": -2.103099822998047, "logps/chosen": -200.1586151123047, "logps/rejected": -2441.192138671875, "logps_avg/chosen": -0.5492798089981079, "logps_avg/rejected": -8.347902297973633, "loss": 0.5273, "losses_ref": -0.05201203376054764, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 610, "u": -4.470887660980225, "weight": 0.04102148860692978 }, { "diff_generated": -27.140499114990234, "epoch": 1.2876210416121434, "grad_norm": 19.07484346081228, "learning_rate": 6.763943701687045e-07, "logits/chosen": -2.4840033054351807, "logits/rejected": -2.0714080333709717, "logps/chosen": -237.1542510986328, "logps/rejected": -2492.620849609375, "logps_avg/chosen": -0.6195243000984192, "logps_avg/rejected": -8.142149925231934, "loss": 0.5249, "losses_ref": -0.07448837906122208, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 615, "u": -4.392305374145508, "weight": 0.06138737127184868 }, { "diff_generated": -29.331090927124023, "epoch": 1.2980895053650876, "grad_norm": 14.350296949575641, "learning_rate": 6.591270153428288e-07, "logits/chosen": -2.5314509868621826, "logits/rejected": -2.1232359409332275, "logps/chosen": -230.3607940673828, "logps/rejected": -2496.131103515625, "logps_avg/chosen": -0.6086186170578003, "logps_avg/rejected": -8.799327850341797, "loss": 0.5301, "losses_ref": -0.05894411355257034, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 620, "u": -4.37734842300415, "weight": 0.053639549762010574 }, { "diff_generated": -26.76749038696289, "epoch": 1.308557969118032, "grad_norm": 8.772096019129755, "learning_rate": 6.419739077344016e-07, "logits/chosen": -2.517256259918213, "logits/rejected": -2.158301591873169, "logps/chosen": -236.55648803710938, "logps/rejected": -2372.91796875, "logps_avg/chosen": -0.6213998794555664, "logps_avg/rejected": -8.030247688293457, "loss": 0.544, "losses_ref": -0.09482914954423904, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 625, "u": -4.376565456390381, "weight": 0.07662535458803177 }, { "diff_generated": -28.416824340820312, "epoch": 1.3190264328709762, "grad_norm": 7.8466631670725935, "learning_rate": 6.24940796390438e-07, "logits/chosen": -2.4629857540130615, "logits/rejected": -2.0768308639526367, "logps/chosen": -214.29360961914062, "logps/rejected": -2455.93115234375, "logps_avg/chosen": -0.6123236417770386, "logps_avg/rejected": -8.52504825592041, "loss": 0.5392, "losses_ref": -0.059877872467041016, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 630, "u": -4.431421756744385, "weight": 0.043088506907224655 }, { "diff_generated": -29.8402099609375, "epoch": 1.3294948966239204, "grad_norm": 20.160929381759352, "learning_rate": 6.08033390139925e-07, "logits/chosen": -2.4479854106903076, "logits/rejected": -2.0140042304992676, "logps/chosen": -228.12948608398438, "logps/rejected": -2645.977294921875, "logps_avg/chosen": -0.6280118227005005, "logps_avg/rejected": -8.95206356048584, "loss": 0.5647, "losses_ref": -0.0805547907948494, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 635, "u": -4.419759750366211, "weight": 0.06746160984039307 }, { "diff_generated": -29.193140029907227, "epoch": 1.3399633603768648, "grad_norm": 17.984653220174852, "learning_rate": 5.912573556804452e-07, "logits/chosen": -2.4721744060516357, "logits/rejected": -2.0706074237823486, "logps/chosen": -219.49658203125, "logps/rejected": -2600.13525390625, "logps_avg/chosen": -0.5888947248458862, "logps_avg/rejected": -8.757942199707031, "loss": 0.5708, "losses_ref": -0.06751363724470139, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 640, "u": -4.354660511016846, "weight": 0.08399678766727448 }, { "diff_generated": -29.59097671508789, "epoch": 1.350431824129809, "grad_norm": 8.832363301034992, "learning_rate": 5.746183156789252e-07, "logits/chosen": -2.522441864013672, "logits/rejected": -2.069122076034546, "logps/chosen": -234.3195343017578, "logps/rejected": -2680.282470703125, "logps_avg/chosen": -0.6104280352592468, "logps_avg/rejected": -8.877291679382324, "loss": 0.5457, "losses_ref": -0.05418990179896355, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 645, "u": -4.4123215675354, "weight": 0.058007679879665375 }, { "diff_generated": -28.265172958374023, "epoch": 1.3609002878827532, "grad_norm": 12.218786161167232, "learning_rate": 5.581218468871365e-07, "logits/chosen": -2.4173598289489746, "logits/rejected": -2.0515952110290527, "logps/chosen": -190.7438507080078, "logps/rejected": -2539.76953125, "logps_avg/chosen": -0.5876272320747375, "logps_avg/rejected": -8.479551315307617, "loss": 0.5169, "losses_ref": -0.08093442767858505, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 650, "u": -4.324867248535156, "weight": 0.08522786945104599 }, { "diff_generated": -31.353778839111328, "epoch": 1.3713687516356974, "grad_norm": 10.11440836146207, "learning_rate": 5.417734782725896e-07, "logits/chosen": -2.459190845489502, "logits/rejected": -2.060859203338623, "logps/chosen": -211.8318634033203, "logps/rejected": -2672.73583984375, "logps_avg/chosen": -0.5790122151374817, "logps_avg/rejected": -9.406133651733398, "loss": 0.5603, "losses_ref": -0.038860172033309937, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 655, "u": -4.468893051147461, "weight": 0.03133354336023331 }, { "diff_generated": -28.588571548461914, "epoch": 1.3818372153886418, "grad_norm": 12.583696879491457, "learning_rate": 5.255786891654399e-07, "logits/chosen": -2.4734246730804443, "logits/rejected": -2.0776007175445557, "logps/chosen": -203.22389221191406, "logps/rejected": -2578.066162109375, "logps_avg/chosen": -0.6348826289176941, "logps_avg/rejected": -8.57657241821289, "loss": 0.5486, "losses_ref": -0.06403845548629761, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 660, "u": -4.37404203414917, "weight": 0.06924913823604584 }, { "diff_generated": -29.78804588317871, "epoch": 1.392305679141586, "grad_norm": 13.032538343695713, "learning_rate": 5.095429074220319e-07, "logits/chosen": -2.4960551261901855, "logits/rejected": -2.1090826988220215, "logps/chosen": -213.1850128173828, "logps/rejected": -2626.316162109375, "logps_avg/chosen": -0.6238334774971008, "logps_avg/rejected": -8.93641471862793, "loss": 0.5533, "losses_ref": -0.06042981147766113, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 665, "u": -4.39785099029541, "weight": 0.05738676339387894 }, { "diff_generated": -31.395706176757812, "epoch": 1.4027741428945302, "grad_norm": 29.282292978403014, "learning_rate": 4.936715076056974e-07, "logits/chosen": -2.519998073577881, "logits/rejected": -2.1003477573394775, "logps/chosen": -227.49972534179688, "logps/rejected": -2841.53759765625, "logps_avg/chosen": -0.6322627067565918, "logps_avg/rejected": -9.418710708618164, "loss": 0.545, "losses_ref": -0.04599471017718315, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 670, "u": -4.469276428222656, "weight": 0.033300966024398804 }, { "diff_generated": -32.34383010864258, "epoch": 1.4132426066474744, "grad_norm": 21.235357659003228, "learning_rate": 4.779698091854098e-07, "logits/chosen": -2.5733542442321777, "logits/rejected": -2.1177892684936523, "logps/chosen": -241.3948516845703, "logps/rejected": -2941.85205078125, "logps_avg/chosen": -0.634663999080658, "logps_avg/rejected": -9.70314884185791, "loss": 0.5578, "losses_ref": -0.03548940271139145, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 675, "u": -4.454364776611328, "weight": 0.025668436661362648 }, { "diff_generated": -29.166423797607422, "epoch": 1.4237110704004188, "grad_norm": 9.728306873667183, "learning_rate": 4.624430747529102e-07, "logits/chosen": -2.5310111045837402, "logits/rejected": -2.1089558601379395, "logps/chosen": -245.45083618164062, "logps/rejected": -2643.77001953125, "logps_avg/chosen": -0.6183468699455261, "logps_avg/rejected": -8.749927520751953, "loss": 0.5228, "losses_ref": -0.08980627357959747, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 680, "u": -4.334953308105469, "weight": 0.07751224935054779 }, { "diff_generated": -33.88722610473633, "epoch": 1.434179534153363, "grad_norm": 14.616844426526761, "learning_rate": 4.4709650825889277e-07, "logits/chosen": -2.460334300994873, "logits/rejected": -2.0326919555664062, "logps/chosen": -193.82003784179688, "logps/rejected": -2947.883544921875, "logps_avg/chosen": -0.5843343138694763, "logps_avg/rejected": -10.166168212890625, "loss": 0.5694, "losses_ref": -0.03547119349241257, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 685, "u": -4.463663578033447, "weight": 0.030837317928671837 }, { "diff_generated": -30.6027774810791, "epoch": 1.4446479979063072, "grad_norm": 11.081953598678401, "learning_rate": 4.3193525326884426e-07, "logits/chosen": -2.5122551918029785, "logits/rejected": -2.0895779132843018, "logps/chosen": -238.4690704345703, "logps/rejected": -2627.096435546875, "logps_avg/chosen": -0.6726236343383789, "logps_avg/rejected": -9.180832862854004, "loss": 0.587, "losses_ref": -0.05756605789065361, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 690, "u": -4.38980770111084, "weight": 0.0510624423623085 }, { "diff_generated": -32.015716552734375, "epoch": 1.4551164616592516, "grad_norm": 11.608639050571856, "learning_rate": 4.1696439123912406e-07, "logits/chosen": -2.4778366088867188, "logits/rejected": -2.0454444885253906, "logps/chosen": -205.8911590576172, "logps/rejected": -2957.13525390625, "logps_avg/chosen": -0.6116452217102051, "logps_avg/rejected": -9.604714393615723, "loss": 0.5502, "losses_ref": -0.05736450105905533, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 695, "u": -4.357982635498047, "weight": 0.05952075123786926 }, { "diff_generated": -35.234153747558594, "epoch": 1.4655849254121958, "grad_norm": 8.17712308208093, "learning_rate": 4.0218893981385927e-07, "logits/chosen": -2.485691547393799, "logits/rejected": -2.046220064163208, "logps/chosen": -200.62582397460938, "logps/rejected": -3101.075439453125, "logps_avg/chosen": -0.5734541416168213, "logps_avg/rejected": -10.570245742797852, "loss": 0.5729, "losses_ref": -0.028310665860772133, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 700, "u": -4.45696496963501, "weight": 0.023738497868180275 }, { "diff_generated": -35.26641082763672, "epoch": 1.47605338916514, "grad_norm": 16.950355166034456, "learning_rate": 3.87613851143229e-07, "logits/chosen": -2.494295597076416, "logits/rejected": -2.00370717048645, "logps/chosen": -230.57400512695312, "logps/rejected": -3109.327392578125, "logps_avg/chosen": -0.6209388971328735, "logps_avg/rejected": -10.57992172241211, "loss": 0.5466, "losses_ref": -0.0546514168381691, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 705, "u": -4.406841278076172, "weight": 0.04961226135492325 }, { "diff_generated": -34.927207946777344, "epoch": 1.4865218529180844, "grad_norm": 9.208840009036596, "learning_rate": 3.7324401022369744e-07, "logits/chosen": -2.4626827239990234, "logits/rejected": -1.9565467834472656, "logps/chosen": -232.802001953125, "logps/rejected": -3108.4921875, "logps_avg/chosen": -0.6169513463973999, "logps_avg/rejected": -10.47816276550293, "loss": 0.5383, "losses_ref": -0.051527369767427444, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 710, "u": -4.410109043121338, "weight": 0.04789410158991814 }, { "diff_generated": -31.93350601196289, "epoch": 1.4969903166710286, "grad_norm": 8.74366239695945, "learning_rate": 3.5908423326075455e-07, "logits/chosen": -2.470921039581299, "logits/rejected": -2.028719425201416, "logps/chosen": -197.37814331054688, "logps/rejected": -2799.31396484375, "logps_avg/chosen": -0.5950369834899902, "logps_avg/rejected": -9.580052375793457, "loss": 0.5627, "losses_ref": -0.05724947527050972, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 715, "u": -4.411328315734863, "weight": 0.047762464731931686 }, { "diff_generated": -33.14401626586914, "epoch": 1.5074587804239727, "grad_norm": 8.842328295664547, "learning_rate": 3.45139266054715e-07, "logits/chosen": -2.5109152793884277, "logits/rejected": -2.010921001434326, "logps/chosen": -247.7344207763672, "logps/rejected": -3127.861328125, "logps_avg/chosen": -0.6326244473457336, "logps_avg/rejected": -9.943206787109375, "loss": 0.5529, "losses_ref": -0.05398111790418625, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 720, "u": -4.407879829406738, "weight": 0.051137275993824005 }, { "diff_generated": -34.548439025878906, "epoch": 1.5179272441769172, "grad_norm": 9.975694420372704, "learning_rate": 3.314137824101111e-07, "logits/chosen": -2.5249905586242676, "logits/rejected": -2.0087645053863525, "logps/chosen": -254.705322265625, "logps/rejected": -3178.156494140625, "logps_avg/chosen": -0.6393792033195496, "logps_avg/rejected": -10.364530563354492, "loss": 0.5512, "losses_ref": -0.05713530257344246, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 725, "u": -4.388852119445801, "weight": 0.061693333089351654 }, { "diff_generated": -32.73970413208008, "epoch": 1.5283957079298613, "grad_norm": 11.767533184902167, "learning_rate": 3.179123825692178e-07, "logits/chosen": -2.47417950630188, "logits/rejected": -2.016237497329712, "logps/chosen": -209.87802124023438, "logps/rejected": -2884.9580078125, "logps_avg/chosen": -0.5899583101272583, "logps_avg/rejected": -9.821910858154297, "loss": 0.5576, "losses_ref": -0.05416392162442207, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 730, "u": -4.406733989715576, "weight": 0.052076805382966995 }, { "diff_generated": -32.37422561645508, "epoch": 1.5388641716828055, "grad_norm": 9.47936945913295, "learning_rate": 3.0463959167023335e-07, "logits/chosen": -2.5015838146209717, "logits/rejected": -2.069798231124878, "logps/chosen": -217.7288055419922, "logps/rejected": -2870.407958984375, "logps_avg/chosen": -0.6165660619735718, "logps_avg/rejected": -9.712267875671387, "loss": 0.5285, "losses_ref": -0.08272585272789001, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 735, "u": -4.363903999328613, "weight": 0.07859805971384048 }, { "diff_generated": -32.28863525390625, "epoch": 1.54933263543575, "grad_norm": 9.124308513157976, "learning_rate": 2.915998582306299e-07, "logits/chosen": -2.5220367908477783, "logits/rejected": -2.038191318511963, "logps/chosen": -229.7245330810547, "logps/rejected": -2982.073486328125, "logps_avg/chosen": -0.617731511592865, "logps_avg/rejected": -9.686590194702148, "loss": 0.5329, "losses_ref": -0.05901874229311943, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 740, "u": -4.457033634185791, "weight": 0.051646940410137177 }, { "diff_generated": -31.57918357849121, "epoch": 1.559801099188694, "grad_norm": 8.788334428443942, "learning_rate": 2.7879755265618557e-07, "logits/chosen": -2.385359287261963, "logits/rejected": -2.0353574752807617, "logps/chosen": -191.27542114257812, "logps/rejected": -2743.20849609375, "logps_avg/chosen": -0.5724462270736694, "logps_avg/rejected": -9.473755836486816, "loss": 0.5301, "losses_ref": -0.06048304960131645, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 745, "u": -4.434350967407227, "weight": 0.051485490053892136 }, { "diff_generated": -33.260643005371094, "epoch": 1.5702695629416383, "grad_norm": 13.597985798817346, "learning_rate": 2.6623696577619625e-07, "logits/chosen": -2.498661518096924, "logits/rejected": -2.070701837539673, "logps/chosen": -227.7393035888672, "logps/rejected": -2963.530517578125, "logps_avg/chosen": -0.6551213264465332, "logps_avg/rejected": -9.978193283081055, "loss": 0.5837, "losses_ref": -0.03624705597758293, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 750, "u": -4.466372966766357, "weight": 0.028057094663381577 }, { "diff_generated": -29.464405059814453, "epoch": 1.5807380266945825, "grad_norm": 9.250307778356563, "learning_rate": 2.5392230740535846e-07, "logits/chosen": -2.5032472610473633, "logits/rejected": -2.06776762008667, "logps/chosen": -251.3708953857422, "logps/rejected": -2650.0810546875, "logps_avg/chosen": -0.6423950791358948, "logps_avg/rejected": -8.839322090148926, "loss": 0.5765, "losses_ref": -0.052409954369068146, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 755, "u": -4.408892631530762, "weight": 0.05609407275915146 }, { "diff_generated": -29.876062393188477, "epoch": 1.5912064904475267, "grad_norm": 12.686799097235559, "learning_rate": 2.418577049328058e-07, "logits/chosen": -2.5676896572113037, "logits/rejected": -2.1377835273742676, "logps/chosen": -265.7136535644531, "logps/rejected": -2646.18896484375, "logps_avg/chosen": -0.665650486946106, "logps_avg/rejected": -8.962818145751953, "loss": 0.5887, "losses_ref": -0.06443095207214355, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 760, "u": -4.407828330993652, "weight": 0.06109876185655594 }, { "diff_generated": -33.68701171875, "epoch": 1.6016749542004711, "grad_norm": 10.274482248605684, "learning_rate": 2.300472019387697e-07, "logits/chosen": -2.469991683959961, "logits/rejected": -2.029064893722534, "logps/chosen": -220.9040985107422, "logps/rejected": -3017.740234375, "logps_avg/chosen": -0.6078630685806274, "logps_avg/rejected": -10.10610294342041, "loss": 0.5524, "losses_ref": -0.04078926518559456, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 765, "u": -4.417876243591309, "weight": 0.03704729676246643 }, { "diff_generated": -31.825037002563477, "epoch": 1.6121434179534153, "grad_norm": 11.839464542057028, "learning_rate": 2.1849475683932994e-07, "logits/chosen": -2.4939956665039062, "logits/rejected": -2.1075644493103027, "logps/chosen": -223.6890869140625, "logps/rejected": -2828.83447265625, "logps_avg/chosen": -0.6260048747062683, "logps_avg/rejected": -9.547511100769043, "loss": 0.5492, "losses_ref": -0.05019731447100639, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 770, "u": -4.3696393966674805, "weight": 0.05455632880330086 }, { "diff_generated": -30.594751358032227, "epoch": 1.6226118817063595, "grad_norm": 9.146985127674856, "learning_rate": 2.0720424155971038e-07, "logits/chosen": -2.4665775299072266, "logits/rejected": -2.0385656356811523, "logps/chosen": -238.6437530517578, "logps/rejected": -2788.4453125, "logps_avg/chosen": -0.6432589292526245, "logps_avg/rejected": -9.178424835205078, "loss": 0.5603, "losses_ref": -0.060744620859622955, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 775, "u": -4.3677592277526855, "weight": 0.054513733834028244 }, { "diff_generated": -30.54671859741211, "epoch": 1.633080345459304, "grad_norm": 12.431506597181475, "learning_rate": 1.961794402365611e-07, "logits/chosen": -2.48872971534729, "logits/rejected": -2.045698404312134, "logps/chosen": -238.8667755126953, "logps/rejected": -2746.897705078125, "logps_avg/chosen": -0.6708707809448242, "logps_avg/rejected": -9.16401481628418, "loss": 0.5942, "losses_ref": -0.043663203716278076, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 780, "u": -4.429020881652832, "weight": 0.0392422154545784 }, { "diff_generated": -30.78244400024414, "epoch": 1.643548809212248, "grad_norm": 14.0111361325287, "learning_rate": 1.8542404794966427e-07, "logits/chosen": -2.5275959968566895, "logits/rejected": -2.0743932723999023, "logps/chosen": -236.8502655029297, "logps/rejected": -2726.872802734375, "logps_avg/chosen": -0.6049509644508362, "logps_avg/rejected": -9.234731674194336, "loss": 0.5559, "losses_ref": -0.040397271513938904, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 785, "u": -4.439243316650391, "weight": 0.034041326493024826 }, { "diff_generated": -30.46352767944336, "epoch": 1.6540172729651923, "grad_norm": 13.778205091571524, "learning_rate": 1.7494166948349053e-07, "logits/chosen": -2.4739108085632324, "logits/rejected": -2.0248847007751465, "logps/chosen": -188.06265258789062, "logps/rejected": -2811.63427734375, "logps_avg/chosen": -0.58104407787323, "logps_avg/rejected": -9.139059066772461, "loss": 0.5279, "losses_ref": -0.0705099031329155, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 790, "u": -4.394803047180176, "weight": 0.06850212812423706 }, { "diff_generated": -31.430471420288086, "epoch": 1.6644857367181367, "grad_norm": 7.6385064901749775, "learning_rate": 1.6473581811901528e-07, "logits/chosen": -2.465888500213623, "logits/rejected": -2.0527515411376953, "logps/chosen": -210.7668914794922, "logps/rejected": -2648.2431640625, "logps_avg/chosen": -0.6304226517677307, "logps_avg/rejected": -9.429141998291016, "loss": 0.5656, "losses_ref": -0.035576872527599335, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 795, "u": -4.473604679107666, "weight": 0.025509512051939964 }, { "diff_generated": -31.38290023803711, "epoch": 1.674954200471081, "grad_norm": 10.762504453960963, "learning_rate": 1.5480991445620538e-07, "logits/chosen": -2.458466053009033, "logits/rejected": -2.0299301147460938, "logps/chosen": -205.1313018798828, "logps/rejected": -2810.052001953125, "logps_avg/chosen": -0.5803036093711853, "logps_avg/rejected": -9.414871215820312, "loss": 0.5407, "losses_ref": -0.06857903301715851, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 800, "u": -4.383907318115234, "weight": 0.07058969140052795 }, { "diff_generated": -32.339012145996094, "epoch": 1.685422664224025, "grad_norm": 12.623391530366172, "learning_rate": 1.4516728526756873e-07, "logits/chosen": -2.4743473529815674, "logits/rejected": -2.0498290061950684, "logps/chosen": -213.2050018310547, "logps/rejected": -2888.50927734375, "logps_avg/chosen": -0.5934925079345703, "logps_avg/rejected": -9.701704025268555, "loss": 0.5501, "losses_ref": -0.061614394187927246, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 805, "u": -4.301063537597656, "weight": 0.06795644760131836 }, { "diff_generated": -29.015087127685547, "epoch": 1.6958911279769695, "grad_norm": 17.58977680719491, "learning_rate": 1.3581116238315194e-07, "logits/chosen": -2.4904446601867676, "logits/rejected": -2.050494909286499, "logps/chosen": -245.46176147460938, "logps/rejected": -2670.2060546875, "logps_avg/chosen": -0.6670945882797241, "logps_avg/rejected": -8.704526901245117, "loss": 0.5769, "losses_ref": -0.05934567004442215, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 810, "u": -4.374115943908691, "weight": 0.05290456861257553 }, { "diff_generated": -31.670734405517578, "epoch": 1.7063595917299135, "grad_norm": 20.41492239134003, "learning_rate": 1.2674468160735586e-07, "logits/chosen": -2.5279009342193604, "logits/rejected": -2.089564800262451, "logps/chosen": -219.30712890625, "logps/rejected": -2705.98193359375, "logps_avg/chosen": -0.6055987477302551, "logps_avg/rejected": -9.501219749450684, "loss": 0.5913, "losses_ref": -0.04426007717847824, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 815, "u": -4.457495212554932, "weight": 0.04348568618297577 }, { "diff_generated": -32.43749237060547, "epoch": 1.7168280554828579, "grad_norm": 8.725588658168348, "learning_rate": 1.1797088166794e-07, "logits/chosen": -2.479827880859375, "logits/rejected": -2.0322813987731934, "logps/chosen": -209.2858428955078, "logps/rejected": -2927.29150390625, "logps_avg/chosen": -0.5941019058227539, "logps_avg/rejected": -9.731245994567871, "loss": 0.5891, "losses_ref": -0.03500083088874817, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 820, "u": -4.411776065826416, "weight": 0.028576117008924484 }, { "diff_generated": -29.760284423828125, "epoch": 1.7272965192358023, "grad_norm": 7.224696592212977, "learning_rate": 1.0949270319755766e-07, "logits/chosen": -2.5083603858947754, "logits/rejected": -2.0863795280456543, "logps/chosen": -206.98812866210938, "logps/rejected": -2673.796875, "logps_avg/chosen": -0.5425812005996704, "logps_avg/rejected": -8.928085327148438, "loss": 0.5471, "losses_ref": -0.040049560368061066, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 825, "u": -4.422328472137451, "weight": 0.03419359400868416 }, { "diff_generated": -30.241125106811523, "epoch": 1.7377649829887463, "grad_norm": 11.359999539925766, "learning_rate": 1.013129877481741e-07, "logits/chosen": -2.4465301036834717, "logits/rejected": -2.0786962509155273, "logps/chosen": -251.66110229492188, "logps/rejected": -2615.54248046875, "logps_avg/chosen": -0.6354495286941528, "logps_avg/rejected": -9.07233715057373, "loss": 0.5595, "losses_ref": -0.038409143686294556, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 830, "u": -4.411218643188477, "weight": 0.03372519463300705 }, { "diff_generated": -31.309673309326172, "epoch": 1.7482334467416907, "grad_norm": 10.689212774701963, "learning_rate": 9.343447683868799e-08, "logits/chosen": -2.459969997406006, "logits/rejected": -2.0669496059417725, "logps/chosen": -197.42056274414062, "logps/rejected": -2780.952392578125, "logps_avg/chosen": -0.5673859715461731, "logps_avg/rejected": -9.392901420593262, "loss": 0.5517, "losses_ref": -0.03770770505070686, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 835, "u": -4.412764549255371, "weight": 0.03573904559016228 }, { "diff_generated": -30.0009765625, "epoch": 1.7587019104946349, "grad_norm": 13.800508017129163, "learning_rate": 8.585981103608342e-08, "logits/chosen": -2.48380184173584, "logits/rejected": -2.0376243591308594, "logps/chosen": -247.1182861328125, "logps/rejected": -2758.78857421875, "logps_avg/chosen": -0.6514982581138611, "logps_avg/rejected": -9.000292778015137, "loss": 0.5682, "losses_ref": -0.04732600972056389, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 840, "u": -4.44762659072876, "weight": 0.04196245223283768 }, { "diff_generated": -30.065624237060547, "epoch": 1.769170374247579, "grad_norm": 13.143185887862547, "learning_rate": 7.859152907041544e-08, "logits/chosen": -2.4641730785369873, "logits/rejected": -2.0567100048065186, "logps/chosen": -236.99148559570312, "logps/rejected": -2573.870849609375, "logps_avg/chosen": -0.6164765357971191, "logps_avg/rejected": -9.019688606262207, "loss": 0.5526, "losses_ref": -0.05898575857281685, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 845, "u": -4.398618698120117, "weight": 0.060839347541332245 }, { "diff_generated": -30.66835594177246, "epoch": 1.7796388380005235, "grad_norm": 17.88344708080126, "learning_rate": 7.163206698392742e-08, "logits/chosen": -2.4754815101623535, "logits/rejected": -2.077538251876831, "logps/chosen": -222.5938262939453, "logps/rejected": -2694.906494140625, "logps_avg/chosen": -0.6013268232345581, "logps_avg/rejected": -9.200507164001465, "loss": 0.5739, "losses_ref": -0.05739979073405266, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 850, "u": -4.392508506774902, "weight": 0.04965168982744217 }, { "diff_generated": -29.592029571533203, "epoch": 1.7901073017534677, "grad_norm": 13.06278922990348, "learning_rate": 6.498375731458527e-08, "logits/chosen": -2.514953136444092, "logits/rejected": -2.096156597137451, "logps/chosen": -233.39132690429688, "logps/rejected": -2654.203857421875, "logps_avg/chosen": -0.6016189455986023, "logps_avg/rejected": -8.877609252929688, "loss": 0.5566, "losses_ref": -0.04416666924953461, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 855, "u": -4.415837287902832, "weight": 0.03559427708387375 }, { "diff_generated": -32.24101638793945, "epoch": 1.8005757655064119, "grad_norm": 12.360220474861023, "learning_rate": 5.8648828314302735e-08, "logits/chosen": -2.4461560249328613, "logits/rejected": -2.015535354614258, "logps/chosen": -225.93533325195312, "logps/rejected": -2782.87255859375, "logps_avg/chosen": -0.5964374542236328, "logps_avg/rejected": -9.6723051071167, "loss": 0.5666, "losses_ref": -0.03670288249850273, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 860, "u": -4.398609161376953, "weight": 0.03543057292699814 }, { "diff_generated": -31.262531280517578, "epoch": 1.8110442292593563, "grad_norm": 16.164691771356388, "learning_rate": 5.2629403202119505e-08, "logits/chosen": -2.4537065029144287, "logits/rejected": -2.062150716781616, "logps/chosen": -204.52587890625, "logps/rejected": -2741.170654296875, "logps_avg/chosen": -0.5822928547859192, "logps_avg/rejected": -9.378759384155273, "loss": 0.5402, "losses_ref": -0.03764919191598892, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 865, "u": -4.42946720123291, "weight": 0.031336475163698196 }, { "diff_generated": -30.04671859741211, "epoch": 1.8215126930123005, "grad_norm": 10.013135246955365, "learning_rate": 4.692749945258057e-08, "logits/chosen": -2.4766173362731934, "logits/rejected": -2.0611166954040527, "logps/chosen": -236.82284545898438, "logps/rejected": -2744.845458984375, "logps_avg/chosen": -0.6182196736335754, "logps_avg/rejected": -9.014015197753906, "loss": 0.5905, "losses_ref": -0.07179991900920868, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 870, "u": -4.407280921936035, "weight": 0.07166210561990738 }, { "diff_generated": -32.90989303588867, "epoch": 1.8319811567652446, "grad_norm": 9.513246816083905, "learning_rate": 4.1545028119559066e-08, "logits/chosen": -2.4886152744293213, "logits/rejected": -2.066333770751953, "logps/chosen": -223.5939483642578, "logps/rejected": -2896.932373046875, "logps_avg/chosen": -0.6256131529808044, "logps_avg/rejected": -9.872968673706055, "loss": 0.5458, "losses_ref": -0.0590200200676918, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 875, "u": -4.408657073974609, "weight": 0.056226253509521484 }, { "diff_generated": -31.053930282592773, "epoch": 1.842449620518189, "grad_norm": 94.13052968470578, "learning_rate": 3.648379319574568e-08, "logits/chosen": -2.528390407562256, "logits/rejected": -2.073420524597168, "logps/chosen": -222.608642578125, "logps/rejected": -2745.4130859375, "logps_avg/chosen": -0.6137613654136658, "logps_avg/rejected": -9.316179275512695, "loss": 0.5237, "losses_ref": -0.06711964309215546, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 880, "u": -4.3957600593566895, "weight": 0.06412933766841888 }, { "diff_generated": -30.842365264892578, "epoch": 1.8529180842711332, "grad_norm": 8.312877021027528, "learning_rate": 3.17454910080216e-08, "logits/chosen": -2.5333809852600098, "logits/rejected": -2.1170499324798584, "logps/chosen": -253.5600128173828, "logps/rejected": -2778.802001953125, "logps_avg/chosen": -0.6801126599311829, "logps_avg/rejected": -9.25270938873291, "loss": 0.5709, "losses_ref": -0.0633564293384552, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 885, "u": -4.409802436828613, "weight": 0.05907650664448738 }, { "diff_generated": -30.68337631225586, "epoch": 1.8633865480240774, "grad_norm": 8.793473948703046, "learning_rate": 2.733170964891607e-08, "logits/chosen": -2.46742582321167, "logits/rejected": -2.0830397605895996, "logps/chosen": -204.62625122070312, "logps/rejected": -2726.16552734375, "logps_avg/chosen": -0.5727981328964233, "logps_avg/rejected": -9.205012321472168, "loss": 0.5596, "losses_ref": -0.05169714242219925, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 890, "u": -4.437578201293945, "weight": 0.043870192021131516 }, { "diff_generated": -30.3818302154541, "epoch": 1.8738550117770219, "grad_norm": 6.648166332075938, "learning_rate": 2.324392844434042e-08, "logits/chosen": -2.491211414337158, "logits/rejected": -2.0470757484436035, "logps/chosen": -229.8271026611328, "logps/rejected": -2785.03076171875, "logps_avg/chosen": -0.6076307892799377, "logps_avg/rejected": -9.11454963684082, "loss": 0.5638, "losses_ref": -0.032108329236507416, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 895, "u": -4.453085899353027, "weight": 0.02507254108786583 }, { "diff_generated": -31.61887550354004, "epoch": 1.8843234755299658, "grad_norm": 18.51567409544646, "learning_rate": 1.9483517457776434e-08, "logits/chosen": -2.4359021186828613, "logits/rejected": -2.096619129180908, "logps/chosen": -188.21896362304688, "logps/rejected": -2806.19921875, "logps_avg/chosen": -0.5758072733879089, "logps_avg/rejected": -9.485663414001465, "loss": 0.5343, "losses_ref": -0.08278501033782959, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 900, "u": -4.343811988830566, "weight": 0.08570453524589539 }, { "diff_generated": -31.200836181640625, "epoch": 1.8947919392829102, "grad_norm": 16.452754098885247, "learning_rate": 1.6051737031084533e-08, "logits/chosen": -2.453563690185547, "logits/rejected": -2.0280988216400146, "logps/chosen": -214.77395629882812, "logps/rejected": -2817.1669921875, "logps_avg/chosen": -0.5827924013137817, "logps_avg/rejected": -9.360250473022461, "loss": 0.5565, "losses_ref": -0.0487370602786541, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 905, "u": -4.410248756408691, "weight": 0.04455076903104782 }, { "diff_generated": -30.942846298217773, "epoch": 1.9052604030358546, "grad_norm": 20.55170462638644, "learning_rate": 1.2949737362087154e-08, "logits/chosen": -2.467200756072998, "logits/rejected": -2.096820831298828, "logps/chosen": -206.9503936767578, "logps/rejected": -2817.215087890625, "logps_avg/chosen": -0.6169668436050415, "logps_avg/rejected": -9.282854080200195, "loss": 0.5886, "losses_ref": -0.0511205717921257, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 910, "u": -4.404674053192139, "weight": 0.04643367975950241 }, { "diff_generated": -30.169301986694336, "epoch": 1.9157288667887986, "grad_norm": 8.053020444587133, "learning_rate": 1.0178558119067315e-08, "logits/chosen": -2.4181623458862305, "logits/rejected": -2.028630018234253, "logps/chosen": -212.6619873046875, "logps/rejected": -2651.956787109375, "logps_avg/chosen": -0.5928919315338135, "logps_avg/rejected": -9.050790786743164, "loss": 0.5551, "losses_ref": -0.05854606628417969, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 915, "u": -4.413174629211426, "weight": 0.0472232848405838 }, { "diff_generated": -32.18278121948242, "epoch": 1.926197330541743, "grad_norm": 24.431507328322112, "learning_rate": 7.739128092312918e-09, "logits/chosen": -2.4973015785217285, "logits/rejected": -2.0860588550567627, "logps/chosen": -216.73666381835938, "logps/rejected": -2769.303955078125, "logps_avg/chosen": -0.6046438813209534, "logps_avg/rejected": -9.654834747314453, "loss": 0.5467, "losses_ref": -0.06063861399888992, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 920, "u": -4.384333610534668, "weight": 0.07002799212932587 }, { "diff_generated": -33.56671142578125, "epoch": 1.9366657942946872, "grad_norm": 12.304529486565588, "learning_rate": 5.632264882822757e-09, "logits/chosen": -2.499455451965332, "logits/rejected": -2.059584140777588, "logps/chosen": -228.59640502929688, "logps/rejected": -2900.51123046875, "logps_avg/chosen": -0.6097213625907898, "logps_avg/rejected": -10.070013046264648, "loss": 0.5799, "losses_ref": -0.0342455692589283, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 925, "u": -4.438467979431152, "weight": 0.030456313863396645 }, { "diff_generated": -30.743816375732422, "epoch": 1.9471342580476314, "grad_norm": 11.65318893393544, "learning_rate": 3.858674628278824e-09, "logits/chosen": -2.4831936359405518, "logits/rejected": -2.0906691551208496, "logps/chosen": -230.875, "logps/rejected": -2670.49755859375, "logps_avg/chosen": -0.603253960609436, "logps_avg/rejected": -9.223145484924316, "loss": 0.5642, "losses_ref": -0.05138419196009636, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 930, "u": -4.435812473297119, "weight": 0.0458533950150013 }, { "diff_generated": -31.753076553344727, "epoch": 1.9576027218005758, "grad_norm": 16.58166205034555, "learning_rate": 2.418951766376742e-09, "logits/chosen": -2.4695091247558594, "logits/rejected": -2.0497422218322754, "logps/chosen": -205.1109619140625, "logps/rejected": -2825.771484375, "logps_avg/chosen": -0.5685989260673523, "logps_avg/rejected": -9.525922775268555, "loss": 0.554, "losses_ref": -0.05179325491189957, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 935, "u": -4.395134925842285, "weight": 0.04630660265684128 }, { "diff_generated": -31.87947654724121, "epoch": 1.96807118555352, "grad_norm": 26.35143781539668, "learning_rate": 1.313578835593465e-09, "logits/chosen": -2.4483304023742676, "logits/rejected": -2.004983425140381, "logps/chosen": -241.7947998046875, "logps/rejected": -2828.03173828125, "logps_avg/chosen": -0.6296852827072144, "logps_avg/rejected": -9.5638427734375, "loss": 0.5603, "losses_ref": -0.03613152354955673, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 940, "u": -4.452606678009033, "weight": 0.026990771293640137 }, { "diff_generated": -29.87912940979004, "epoch": 1.9785396493064642, "grad_norm": 15.814334066391242, "learning_rate": 5.429263134594242e-10, "logits/chosen": -2.4958741664886475, "logits/rejected": -2.101313591003418, "logps/chosen": -207.99179077148438, "logps/rejected": -2708.303466796875, "logps_avg/chosen": -0.5728383660316467, "logps_avg/rejected": -8.963739395141602, "loss": 0.5538, "losses_ref": -0.06295718252658844, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 945, "u": -4.400644302368164, "weight": 0.05401432514190674 }, { "diff_generated": -31.941226959228516, "epoch": 1.9890081130594086, "grad_norm": 8.580957108117007, "learning_rate": 1.0725249238940915e-10, "logits/chosen": -2.4698963165283203, "logits/rejected": -2.0529587268829346, "logps/chosen": -231.325927734375, "logps/rejected": -2804.859619140625, "logps_avg/chosen": -0.6270388960838318, "logps_avg/rejected": -9.582367897033691, "loss": 0.5563, "losses_ref": -0.029423978179693222, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 950, "u": -4.438694477081299, "weight": 0.025990551337599754 } ], "logging_steps": 5, "max_steps": 954, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }