Mistral-7B-DFT2 / trainer_state.json
siqi00's picture
change model hyperparameters
77d3f36
raw
history blame
138 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9973828840617638,
"eval_steps": 500,
"global_step": 954,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"diff_generated": -1.8149629831314087,
"epoch": 0.002093692750588851,
"grad_norm": 43.26649304714989,
"learning_rate": 2.083333333333333e-08,
"logits/chosen": -2.1441590785980225,
"logits/rejected": -2.0543735027313232,
"logps/chosen": -276.82366943359375,
"logps/rejected": -131.32485961914062,
"logps_avg/chosen": -1.2310187816619873,
"logps_avg/rejected": -0.5444889068603516,
"loss": 0.9706,
"losses_ref": -0.2554703652858734,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1,
"u": -1.129564642906189,
"weight": 0.727432131767273
},
{
"diff_generated": -2.051100015640259,
"epoch": 0.010468463752944255,
"grad_norm": 36.895500460127934,
"learning_rate": 1.0416666666666667e-07,
"logits/chosen": -2.2114098072052,
"logits/rejected": -2.10967755317688,
"logps/chosen": -280.6037902832031,
"logps/rejected": -162.30044555664062,
"logps_avg/chosen": -1.178394079208374,
"logps_avg/rejected": -0.6153301000595093,
"loss": 0.8456,
"losses_ref": -0.2878931164741516,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 5,
"u": -1.3192780017852783,
"weight": 0.6589411497116089
},
{
"diff_generated": -2.0342957973480225,
"epoch": 0.02093692750588851,
"grad_norm": 42.24412669427099,
"learning_rate": 2.0833333333333333e-07,
"logits/chosen": -2.3565850257873535,
"logits/rejected": -2.1584813594818115,
"logps/chosen": -300.6426086425781,
"logps/rejected": -167.40040588378906,
"logps_avg/chosen": -1.1184991598129272,
"logps_avg/rejected": -0.6102887988090515,
"loss": 0.8731,
"losses_ref": -0.2850458025932312,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 10,
"u": -1.2951091527938843,
"weight": 0.6724194884300232
},
{
"diff_generated": -1.9851667881011963,
"epoch": 0.031405391258832765,
"grad_norm": 31.267399626309693,
"learning_rate": 3.1249999999999997e-07,
"logits/chosen": -2.2946715354919434,
"logits/rejected": -2.146397113800049,
"logps/chosen": -293.4947509765625,
"logps/rejected": -156.3843994140625,
"logps_avg/chosen": -1.0986683368682861,
"logps_avg/rejected": -0.5955500602722168,
"loss": 0.7379,
"losses_ref": -0.28325891494750977,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 15,
"u": -1.2593215703964233,
"weight": 0.6894552111625671
},
{
"diff_generated": -2.0035815238952637,
"epoch": 0.04187385501177702,
"grad_norm": 22.686346023577535,
"learning_rate": 4.1666666666666667e-07,
"logits/chosen": -2.2586379051208496,
"logits/rejected": -2.134080410003662,
"logps/chosen": -261.52960205078125,
"logps/rejected": -161.9304656982422,
"logps_avg/chosen": -0.9046722650527954,
"logps_avg/rejected": -0.6010745763778687,
"loss": 0.5984,
"losses_ref": -0.2947906255722046,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 20,
"u": -1.323677659034729,
"weight": 0.6650992631912231
},
{
"diff_generated": -3.258924961090088,
"epoch": 0.05234231876472128,
"grad_norm": 15.412617135483135,
"learning_rate": 5.208333333333334e-07,
"logits/chosen": -2.1527328491210938,
"logits/rejected": -2.013265609741211,
"logps/chosen": -257.1512756347656,
"logps/rejected": -277.85711669921875,
"logps_avg/chosen": -0.8043298721313477,
"logps_avg/rejected": -0.9776775240898132,
"loss": 0.5813,
"losses_ref": -0.25987568497657776,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 25,
"u": -1.7415921688079834,
"weight": 0.4334268569946289
},
{
"diff_generated": -6.022626876831055,
"epoch": 0.06281078251766553,
"grad_norm": 15.25952740077981,
"learning_rate": 6.249999999999999e-07,
"logits/chosen": -2.1849024295806885,
"logits/rejected": -2.1174261569976807,
"logps/chosen": -248.16909790039062,
"logps/rejected": -534.7174682617188,
"logps_avg/chosen": -0.8181886672973633,
"logps_avg/rejected": -1.8067880868911743,
"loss": 0.667,
"losses_ref": -0.1500019133090973,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 30,
"u": -2.0229365825653076,
"weight": 0.225816011428833
},
{
"diff_generated": -9.153361320495605,
"epoch": 0.07327924627060979,
"grad_norm": 18.48300356782214,
"learning_rate": 7.291666666666666e-07,
"logits/chosen": -2.2708792686462402,
"logits/rejected": -2.130821704864502,
"logps/chosen": -255.21701049804688,
"logps/rejected": -782.3409423828125,
"logps_avg/chosen": -0.7904274463653564,
"logps_avg/rejected": -2.7460083961486816,
"loss": 0.6695,
"losses_ref": -0.1412452608346939,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 35,
"u": -2.0066444873809814,
"weight": 0.2316206991672516
},
{
"diff_generated": -13.209306716918945,
"epoch": 0.08374771002355404,
"grad_norm": 11.436173876886219,
"learning_rate": 8.333333333333333e-07,
"logits/chosen": -2.2111456394195557,
"logits/rejected": -2.13924241065979,
"logps/chosen": -241.15072631835938,
"logps/rejected": -1223.218017578125,
"logps_avg/chosen": -0.7820993661880493,
"logps_avg/rejected": -3.962791919708252,
"loss": 0.6798,
"losses_ref": -0.09846386313438416,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 40,
"u": -2.131727457046509,
"weight": 0.1441923826932907
},
{
"diff_generated": -14.63012409210205,
"epoch": 0.0942161737764983,
"grad_norm": 59.29532742939981,
"learning_rate": 9.374999999999999e-07,
"logits/chosen": -2.298741102218628,
"logits/rejected": -2.0653302669525146,
"logps/chosen": -264.97357177734375,
"logps/rejected": -1320.9332275390625,
"logps_avg/chosen": -0.779043436050415,
"logps_avg/rejected": -4.389036655426025,
"loss": 0.6914,
"losses_ref": -0.08891113847494125,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 45,
"u": -2.13495135307312,
"weight": 0.13693246245384216
},
{
"diff_generated": -12.911537170410156,
"epoch": 0.10468463752944256,
"grad_norm": 8.930786410211843,
"learning_rate": 1.0416666666666667e-06,
"logits/chosen": -2.302333116531372,
"logits/rejected": -2.2043356895446777,
"logps/chosen": -241.756103515625,
"logps/rejected": -1145.5604248046875,
"logps_avg/chosen": -0.7927433252334595,
"logps_avg/rejected": -3.8734612464904785,
"loss": 0.6993,
"losses_ref": -0.10359562933444977,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 50,
"u": -2.1199097633361816,
"weight": 0.15450677275657654
},
{
"diff_generated": -11.095788955688477,
"epoch": 0.11515310128238682,
"grad_norm": 9.783120635378207,
"learning_rate": 1.1458333333333333e-06,
"logits/chosen": -2.4609317779541016,
"logits/rejected": -2.3575634956359863,
"logps/chosen": -245.7393798828125,
"logps/rejected": -981.2423095703125,
"logps_avg/chosen": -0.8303758502006531,
"logps_avg/rejected": -3.3287365436553955,
"loss": 0.6926,
"losses_ref": -0.08979364484548569,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 55,
"u": -2.17197322845459,
"weight": 0.11447404325008392
},
{
"diff_generated": -13.795969009399414,
"epoch": 0.12562156503533106,
"grad_norm": 9.420248973366883,
"learning_rate": 1.2499999999999999e-06,
"logits/chosen": -2.5860393047332764,
"logits/rejected": -2.482574939727783,
"logps/chosen": -249.44070434570312,
"logps/rejected": -1232.59228515625,
"logps_avg/chosen": -0.7758530378341675,
"logps_avg/rejected": -4.138791084289551,
"loss": 0.6815,
"losses_ref": -0.0876917839050293,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 60,
"u": -2.1477303504943848,
"weight": 0.12934879958629608
},
{
"diff_generated": -16.25264549255371,
"epoch": 0.1360900287882753,
"grad_norm": 12.24868452539092,
"learning_rate": 1.3541666666666667e-06,
"logits/chosen": -2.640986204147339,
"logits/rejected": -2.510274648666382,
"logps/chosen": -258.56109619140625,
"logps/rejected": -1508.2763671875,
"logps_avg/chosen": -0.7998191118240356,
"logps_avg/rejected": -4.875794410705566,
"loss": 0.7039,
"losses_ref": -0.07322683185338974,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 65,
"u": -2.180368423461914,
"weight": 0.1039782166481018
},
{
"diff_generated": -16.121641159057617,
"epoch": 0.14655849254121958,
"grad_norm": 7.905374307014113,
"learning_rate": 1.4583333333333333e-06,
"logits/chosen": -2.581535816192627,
"logits/rejected": -2.4923813343048096,
"logps/chosen": -238.9574432373047,
"logps/rejected": -1444.403564453125,
"logps_avg/chosen": -0.8027188181877136,
"logps_avg/rejected": -4.836493015289307,
"loss": 0.6907,
"losses_ref": -0.0750691220164299,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 70,
"u": -2.189579486846924,
"weight": 0.09880717098712921
},
{
"diff_generated": -16.705251693725586,
"epoch": 0.15702695629416383,
"grad_norm": 9.573720561122785,
"learning_rate": 1.5624999999999999e-06,
"logits/chosen": -2.598374128341675,
"logits/rejected": -2.446035146713257,
"logps/chosen": -270.2249450683594,
"logps/rejected": -1517.441650390625,
"logps_avg/chosen": -0.7964105606079102,
"logps_avg/rejected": -5.011575698852539,
"loss": 0.725,
"losses_ref": -0.07196028530597687,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 75,
"u": -2.21059513092041,
"weight": 0.08612708002328873
},
{
"diff_generated": -18.304201126098633,
"epoch": 0.16749542004710807,
"grad_norm": 7.0924424799681,
"learning_rate": 1.6666666666666667e-06,
"logits/chosen": -2.591045618057251,
"logits/rejected": -2.489673376083374,
"logps/chosen": -216.99685668945312,
"logps/rejected": -1667.5283203125,
"logps_avg/chosen": -0.7215350866317749,
"logps_avg/rejected": -5.491259574890137,
"loss": 0.6699,
"losses_ref": -0.06580645591020584,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 80,
"u": -2.2118382453918457,
"weight": 0.08225957304239273
},
{
"diff_generated": -18.906076431274414,
"epoch": 0.17796388380005235,
"grad_norm": 7.632608732109636,
"learning_rate": 1.7708333333333332e-06,
"logits/chosen": -2.6046338081359863,
"logits/rejected": -2.4658734798431396,
"logps/chosen": -244.0012664794922,
"logps/rejected": -1689.686767578125,
"logps_avg/chosen": -0.7541030049324036,
"logps_avg/rejected": -5.671823978424072,
"loss": 0.7032,
"losses_ref": -0.06257248669862747,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 85,
"u": -2.2184884548187256,
"weight": 0.07795710116624832
},
{
"diff_generated": -22.26788902282715,
"epoch": 0.1884323475529966,
"grad_norm": 10.332533231863795,
"learning_rate": 1.8749999999999998e-06,
"logits/chosen": -2.62504243850708,
"logits/rejected": -2.4670681953430176,
"logps/chosen": -241.73550415039062,
"logps/rejected": -1991.0435791015625,
"logps_avg/chosen": -0.7270082235336304,
"logps_avg/rejected": -6.680366516113281,
"loss": 0.689,
"losses_ref": -0.06023075059056282,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 90,
"u": -2.2096261978149414,
"weight": 0.08131252229213715
},
{
"diff_generated": -20.242061614990234,
"epoch": 0.19890081130594087,
"grad_norm": 7.009998646854354,
"learning_rate": 1.9791666666666666e-06,
"logits/chosen": -2.5733673572540283,
"logits/rejected": -2.4526114463806152,
"logps/chosen": -241.0827178955078,
"logps/rejected": -1833.453369140625,
"logps_avg/chosen": -0.7628769278526306,
"logps_avg/rejected": -6.07261848449707,
"loss": 0.6963,
"losses_ref": -0.06475149095058441,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 95,
"u": -2.2028064727783203,
"weight": 0.0875387191772461
},
{
"diff_generated": -20.439355850219727,
"epoch": 0.2093692750588851,
"grad_norm": 8.018231688525765,
"learning_rate": 1.9998927475076105e-06,
"logits/chosen": -2.621689558029175,
"logits/rejected": -2.470346689224243,
"logps/chosen": -245.5767059326172,
"logps/rejected": -1799.0728759765625,
"logps_avg/chosen": -0.7319446802139282,
"logps_avg/rejected": -6.13180685043335,
"loss": 0.713,
"losses_ref": -0.06253904104232788,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 100,
"u": -2.2177913188934326,
"weight": 0.07825066894292831
},
{
"diff_generated": -20.04744529724121,
"epoch": 0.21983773881182936,
"grad_norm": 7.248502316485956,
"learning_rate": 1.9994570736865402e-06,
"logits/chosen": -2.5862081050872803,
"logits/rejected": -2.4370968341827393,
"logps/chosen": -236.89501953125,
"logps/rejected": -1794.0465087890625,
"logps_avg/chosen": -0.7266777753829956,
"logps_avg/rejected": -6.01423454284668,
"loss": 0.6834,
"losses_ref": -0.06446884572505951,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 105,
"u": -2.198464870452881,
"weight": 0.0902954638004303
},
{
"diff_generated": -20.10696792602539,
"epoch": 0.23030620256477363,
"grad_norm": 6.989545794085033,
"learning_rate": 1.9986864211644068e-06,
"logits/chosen": -2.570603609085083,
"logits/rejected": -2.431187391281128,
"logps/chosen": -236.31884765625,
"logps/rejected": -1773.07421875,
"logps_avg/chosen": -0.7348344326019287,
"logps_avg/rejected": -6.032090187072754,
"loss": 0.6907,
"losses_ref": -0.06961078941822052,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 110,
"u": -2.2041425704956055,
"weight": 0.08867262303829193
},
{
"diff_generated": -21.548114776611328,
"epoch": 0.24077466631771788,
"grad_norm": 8.060053280392543,
"learning_rate": 1.997581048233623e-06,
"logits/chosen": -2.581951141357422,
"logits/rejected": -2.4441328048706055,
"logps/chosen": -232.8576202392578,
"logps/rejected": -1942.4847412109375,
"logps_avg/chosen": -0.7739059329032898,
"logps_avg/rejected": -6.4644341468811035,
"loss": 0.6817,
"losses_ref": -0.062096286565065384,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 115,
"u": -2.216289520263672,
"weight": 0.07720647752285004
},
{
"diff_generated": -20.77760887145996,
"epoch": 0.2512431300706621,
"grad_norm": 6.53936940072868,
"learning_rate": 1.9961413253717214e-06,
"logits/chosen": -2.610959768295288,
"logits/rejected": -2.4528729915618896,
"logps/chosen": -233.8311004638672,
"logps/rejected": -1862.2890625,
"logps_avg/chosen": -0.7324265837669373,
"logps_avg/rejected": -6.233283519744873,
"loss": 0.6932,
"losses_ref": -0.0750860795378685,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 120,
"u": -2.200193405151367,
"weight": 0.09466435015201569
},
{
"diff_generated": -23.185279846191406,
"epoch": 0.26171159382360637,
"grad_norm": 7.018169897249557,
"learning_rate": 1.994367735117177e-06,
"logits/chosen": -2.5702836513519287,
"logits/rejected": -2.391747236251831,
"logps/chosen": -220.02792358398438,
"logps/rejected": -2155.526123046875,
"logps_avg/chosen": -0.7447048425674438,
"logps_avg/rejected": -6.955584526062012,
"loss": 0.7052,
"losses_ref": -0.05986471846699715,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 125,
"u": -2.1955928802490234,
"weight": 0.08941423892974854
},
{
"diff_generated": -22.66459846496582,
"epoch": 0.2721800575765506,
"grad_norm": 31.410489955444024,
"learning_rate": 1.992260871907687e-06,
"logits/chosen": -2.567049503326416,
"logits/rejected": -2.4223153591156006,
"logps/chosen": -242.8145751953125,
"logps/rejected": -2053.98388671875,
"logps_avg/chosen": -0.7978746294975281,
"logps_avg/rejected": -6.799378871917725,
"loss": 0.7155,
"losses_ref": -0.04843521863222122,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 130,
"u": -2.239774465560913,
"weight": 0.05849189683794975
},
{
"diff_generated": -23.263744354248047,
"epoch": 0.2826485213294949,
"grad_norm": 7.49886026826363,
"learning_rate": 1.9898214418809326e-06,
"logits/chosen": -2.532973289489746,
"logits/rejected": -2.372011423110962,
"logps/chosen": -241.5897674560547,
"logps/rejected": -2110.734375,
"logps_avg/chosen": -0.7454018592834473,
"logps_avg/rejected": -6.979123592376709,
"loss": 0.6961,
"losses_ref": -0.04879006743431091,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 135,
"u": -2.2302093505859375,
"weight": 0.06326891481876373
},
{
"diff_generated": -22.754619598388672,
"epoch": 0.29311698508243916,
"grad_norm": 7.014311333863948,
"learning_rate": 1.9870502626379126e-06,
"logits/chosen": -2.488236904144287,
"logits/rejected": -2.361851215362549,
"logps/chosen": -234.2844696044922,
"logps/rejected": -2074.984375,
"logps_avg/chosen": -0.7961763143539429,
"logps_avg/rejected": -6.826386451721191,
"loss": 0.7285,
"losses_ref": -0.055333297699689865,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 140,
"u": -2.2265305519104004,
"weight": 0.06895061582326889
},
{
"diff_generated": -20.225128173828125,
"epoch": 0.3035854488353834,
"grad_norm": 6.7478341009341865,
"learning_rate": 1.983948262968915e-06,
"logits/chosen": -2.5856704711914062,
"logits/rejected": -2.4371695518493652,
"logps/chosen": -263.78900146484375,
"logps/rejected": -1824.1302490234375,
"logps_avg/chosen": -0.7517282366752625,
"logps_avg/rejected": -6.067538738250732,
"loss": 0.6839,
"losses_ref": -0.06395243108272552,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 145,
"u": -2.2037534713745117,
"weight": 0.08503635227680206
},
{
"diff_generated": -24.050996780395508,
"epoch": 0.31405391258832765,
"grad_norm": 7.353090756036984,
"learning_rate": 1.9805164825422237e-06,
"logits/chosen": -2.607673168182373,
"logits/rejected": -2.408552646636963,
"logps/chosen": -241.8136749267578,
"logps/rejected": -2169.62353515625,
"logps_avg/chosen": -0.7578203082084656,
"logps_avg/rejected": -7.215299129486084,
"loss": 0.6958,
"losses_ref": -0.05395021289587021,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 150,
"u": -2.234814167022705,
"weight": 0.06379680335521698
},
{
"diff_generated": -23.94837188720703,
"epoch": 0.3245223763412719,
"grad_norm": 7.484499798723553,
"learning_rate": 1.9767560715556594e-06,
"logits/chosen": -2.5357837677001953,
"logits/rejected": -2.3741650581359863,
"logps/chosen": -237.78701782226562,
"logps/rejected": -2074.5205078125,
"logps_avg/chosen": -0.7676432132720947,
"logps_avg/rejected": -7.184511661529541,
"loss": 0.7199,
"losses_ref": -0.044619906693696976,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 155,
"u": -2.2300286293029785,
"weight": 0.061775337904691696
},
{
"diff_generated": -24.210857391357422,
"epoch": 0.33499084009421615,
"grad_norm": 7.8117370330190115,
"learning_rate": 1.972668290351084e-06,
"logits/chosen": -2.532038688659668,
"logits/rejected": -2.3655738830566406,
"logps/chosen": -246.5824432373047,
"logps/rejected": -2090.85693359375,
"logps_avg/chosen": -0.7575558423995972,
"logps_avg/rejected": -7.2632575035095215,
"loss": 0.6939,
"losses_ref": -0.04590834304690361,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 160,
"u": -2.236487865447998,
"weight": 0.05844121426343918
},
{
"diff_generated": -20.957683563232422,
"epoch": 0.34545930384716045,
"grad_norm": 7.4058662270815026,
"learning_rate": 1.968254508991978e-06,
"logits/chosen": -2.6238338947296143,
"logits/rejected": -2.4566922187805176,
"logps/chosen": -245.81436157226562,
"logps/rejected": -1895.0390625,
"logps_avg/chosen": -0.7605465054512024,
"logps_avg/rejected": -6.2873053550720215,
"loss": 0.701,
"losses_ref": -0.05409424751996994,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 165,
"u": -2.236602783203125,
"weight": 0.0619116947054863
},
{
"diff_generated": -23.36783218383789,
"epoch": 0.3559277676001047,
"grad_norm": 7.74288657614709,
"learning_rate": 1.9635162068042544e-06,
"logits/chosen": -2.5531725883483887,
"logits/rejected": -2.385223627090454,
"logps/chosen": -250.6099090576172,
"logps/rejected": -2106.687744140625,
"logps_avg/chosen": -0.7441612482070923,
"logps_avg/rejected": -7.010349273681641,
"loss": 0.7035,
"losses_ref": -0.060589499771595,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 170,
"u": -2.218136787414551,
"weight": 0.0771271213889122
},
{
"diff_generated": -23.426584243774414,
"epoch": 0.36639623135304894,
"grad_norm": 6.175218562127925,
"learning_rate": 1.958454971880441e-06,
"logits/chosen": -2.545517683029175,
"logits/rejected": -2.3892464637756348,
"logps/chosen": -271.62152099609375,
"logps/rejected": -2128.689208984375,
"logps_avg/chosen": -0.7712885141372681,
"logps_avg/rejected": -7.027975559234619,
"loss": 0.6768,
"losses_ref": -0.059747565537691116,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 175,
"u": -2.221135377883911,
"weight": 0.07428421080112457
},
{
"diff_generated": -23.27652931213379,
"epoch": 0.3768646951059932,
"grad_norm": 7.602114045248552,
"learning_rate": 1.9530725005474194e-06,
"logits/chosen": -2.5965559482574463,
"logits/rejected": -2.4581873416900635,
"logps/chosen": -225.35818481445312,
"logps/rejected": -2096.1943359375,
"logps_avg/chosen": -0.7377344369888306,
"logps_avg/rejected": -6.982959747314453,
"loss": 0.6599,
"losses_ref": -0.06142450496554375,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 180,
"u": -2.224907875061035,
"weight": 0.07244168221950531
},
{
"diff_generated": -24.591943740844727,
"epoch": 0.38733315885893743,
"grad_norm": 6.781608060052273,
"learning_rate": 1.9473705967978807e-06,
"logits/chosen": -2.6047005653381348,
"logits/rejected": -2.4540090560913086,
"logps/chosen": -231.2947235107422,
"logps/rejected": -2179.2568359375,
"logps_avg/chosen": -0.689501166343689,
"logps_avg/rejected": -7.3775835037231445,
"loss": 0.6665,
"losses_ref": -0.05740996077656746,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 185,
"u": -2.2239882946014404,
"weight": 0.07182185351848602
},
{
"diff_generated": -25.36248016357422,
"epoch": 0.39780162261188173,
"grad_norm": 10.641404317565371,
"learning_rate": 1.941351171685697e-06,
"logits/chosen": -2.5710506439208984,
"logits/rejected": -2.4436774253845215,
"logps/chosen": -236.1158905029297,
"logps/rejected": -2273.37158203125,
"logps_avg/chosen": -0.7929750680923462,
"logps_avg/rejected": -7.6087446212768555,
"loss": 0.7108,
"losses_ref": -0.05253469944000244,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 190,
"u": -2.239004373550415,
"weight": 0.06010523438453674
},
{
"diff_generated": -25.077518463134766,
"epoch": 0.408270086364826,
"grad_norm": 9.470830241427814,
"learning_rate": 1.9350162426854148e-06,
"logits/chosen": -2.602252244949341,
"logits/rejected": -2.4661412239074707,
"logps/chosen": -197.88571166992188,
"logps/rejected": -2272.28076171875,
"logps_avg/chosen": -0.7630836367607117,
"logps_avg/rejected": -7.523255348205566,
"loss": 0.6999,
"losses_ref": -0.04595743492245674,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 195,
"u": -2.243717670440674,
"weight": 0.054233819246292114
},
{
"diff_generated": -24.682910919189453,
"epoch": 0.4187385501177702,
"grad_norm": 6.269041714690376,
"learning_rate": 1.9283679330160725e-06,
"logits/chosen": -2.5849337577819824,
"logits/rejected": -2.394373655319214,
"logps/chosen": -242.97378540039062,
"logps/rejected": -2224.541015625,
"logps_avg/chosen": -0.7199097871780396,
"logps_avg/rejected": -7.404873847961426,
"loss": 0.69,
"losses_ref": -0.0516563281416893,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 200,
"u": -2.2307353019714355,
"weight": 0.06507831811904907
},
{
"diff_generated": -22.525114059448242,
"epoch": 0.42920701387071447,
"grad_norm": 6.963251924926938,
"learning_rate": 1.9214084709295847e-06,
"logits/chosen": -2.6382362842559814,
"logits/rejected": -2.4577651023864746,
"logps/chosen": -259.39349365234375,
"logps/rejected": -2065.585693359375,
"logps_avg/chosen": -0.7225343585014343,
"logps_avg/rejected": -6.757534027099609,
"loss": 0.696,
"losses_ref": -0.05577712133526802,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 205,
"u": -2.2293906211853027,
"weight": 0.0664394274353981
},
{
"diff_generated": -22.145648956298828,
"epoch": 0.4396754776236587,
"grad_norm": 7.299076527075288,
"learning_rate": 1.9141401889639164e-06,
"logits/chosen": -2.5583319664001465,
"logits/rejected": -2.4039664268493652,
"logps/chosen": -238.9542694091797,
"logps/rejected": -2062.404541015625,
"logps_avg/chosen": -0.7716320753097534,
"logps_avg/rejected": -6.6436944007873535,
"loss": 0.6993,
"losses_ref": -0.058913152664899826,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 210,
"u": -2.2152769565582275,
"weight": 0.07614172250032425
},
{
"diff_generated": -23.579111099243164,
"epoch": 0.45014394137660296,
"grad_norm": 8.50842985439364,
"learning_rate": 1.906565523161312e-06,
"logits/chosen": -2.600001335144043,
"logits/rejected": -2.4590041637420654,
"logps/chosen": -231.87673950195312,
"logps/rejected": -2083.391357421875,
"logps_avg/chosen": -0.7907384634017944,
"logps_avg/rejected": -7.073732852935791,
"loss": 0.7066,
"losses_ref": -0.05489416792988777,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 215,
"u": -2.234017848968506,
"weight": 0.06394322961568832
},
{
"diff_generated": -27.941696166992188,
"epoch": 0.46061240512954726,
"grad_norm": 6.305196829448204,
"learning_rate": 1.8986870122518259e-06,
"logits/chosen": -2.6018145084381104,
"logits/rejected": -2.436535358428955,
"logps/chosen": -245.06005859375,
"logps/rejected": -2555.211181640625,
"logps_avg/chosen": -0.7695084810256958,
"logps_avg/rejected": -8.382509231567383,
"loss": 0.7137,
"losses_ref": -0.04443511739373207,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 220,
"u": -2.2481765747070312,
"weight": 0.051543742418289185
},
{
"diff_generated": -26.58075523376465,
"epoch": 0.4710808688824915,
"grad_norm": 6.961879634992629,
"learning_rate": 1.8905072968024423e-06,
"logits/chosen": -2.567117214202881,
"logits/rejected": -2.3789048194885254,
"logps/chosen": -244.94296264648438,
"logps/rejected": -2428.1923828125,
"logps_avg/chosen": -0.7622503042221069,
"logps_avg/rejected": -7.974226474761963,
"loss": 0.6936,
"losses_ref": -0.04088358208537102,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 225,
"u": -2.2515604496002197,
"weight": 0.04799002408981323
},
{
"diff_generated": -24.84002113342285,
"epoch": 0.48154933263543576,
"grad_norm": 7.1763831101881275,
"learning_rate": 1.88202911833206e-06,
"logits/chosen": -2.520597219467163,
"logits/rejected": -2.395034074783325,
"logps/chosen": -213.36929321289062,
"logps/rejected": -2192.75390625,
"logps_avg/chosen": -0.7349015474319458,
"logps_avg/rejected": -7.4520063400268555,
"loss": 0.6978,
"losses_ref": -0.051292240619659424,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 230,
"u": -2.231480360031128,
"weight": 0.06503967195749283
},
{
"diff_generated": -26.721317291259766,
"epoch": 0.49201779638838,
"grad_norm": 5.9191842237687835,
"learning_rate": 1.873255318392644e-06,
"logits/chosen": -2.4896910190582275,
"logits/rejected": -2.296112060546875,
"logps/chosen": -239.5654296875,
"logps/rejected": -2448.593505859375,
"logps_avg/chosen": -0.7563043236732483,
"logps_avg/rejected": -8.01639461517334,
"loss": 0.7163,
"losses_ref": -0.05161570757627487,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 235,
"u": -2.230043649673462,
"weight": 0.06615348160266876
},
{
"diff_generated": -22.361705780029297,
"epoch": 0.5024862601413242,
"grad_norm": 6.264520814093759,
"learning_rate": 1.8641888376168483e-06,
"logits/chosen": -2.4571125507354736,
"logits/rejected": -2.3177151679992676,
"logps/chosen": -219.5469207763672,
"logps/rejected": -1993.8834228515625,
"logps_avg/chosen": -0.7551349997520447,
"logps_avg/rejected": -6.708512783050537,
"loss": 0.7049,
"losses_ref": -0.05244841426610947,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 240,
"u": -2.2336738109588623,
"weight": 0.06469963490962982
},
{
"diff_generated": -19.673988342285156,
"epoch": 0.5129547238942685,
"grad_norm": 6.373155717275301,
"learning_rate": 1.8548327147324312e-06,
"logits/chosen": -2.459257125854492,
"logits/rejected": -2.273050546646118,
"logps/chosen": -248.42935180664062,
"logps/rejected": -1772.5706787109375,
"logps_avg/chosen": -0.7812148928642273,
"logps_avg/rejected": -5.902197360992432,
"loss": 0.6961,
"losses_ref": -0.0656919851899147,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 245,
"u": -2.2100472450256348,
"weight": 0.08213107287883759
},
{
"diff_generated": -18.21377182006836,
"epoch": 0.5234231876472127,
"grad_norm": 7.015946672120974,
"learning_rate": 1.8451900855437948e-06,
"logits/chosen": -2.4628689289093018,
"logits/rejected": -2.322192430496216,
"logps/chosen": -242.85488891601562,
"logps/rejected": -1614.31201171875,
"logps_avg/chosen": -0.7260557413101196,
"logps_avg/rejected": -5.4641313552856445,
"loss": 0.6754,
"losses_ref": -0.05365673825144768,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 250,
"u": -2.234142303466797,
"weight": 0.06292648613452911
},
{
"diff_generated": -21.006351470947266,
"epoch": 0.533891651400157,
"grad_norm": 6.444057235727556,
"learning_rate": 1.8352641818809846e-06,
"logits/chosen": -2.44881010055542,
"logits/rejected": -2.264845371246338,
"logps/chosen": -258.3345031738281,
"logps/rejected": -1910.637451171875,
"logps_avg/chosen": -0.7704434394836426,
"logps_avg/rejected": -6.301905155181885,
"loss": 0.6922,
"losses_ref": -0.05841520428657532,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 255,
"u": -2.230961322784424,
"weight": 0.06754828989505768
},
{
"diff_generated": -21.22915267944336,
"epoch": 0.5443601151531012,
"grad_norm": 6.052398997240752,
"learning_rate": 1.8250583305165094e-06,
"logits/chosen": -2.3323371410369873,
"logits/rejected": -2.212430477142334,
"logps/chosen": -235.18038940429688,
"logps/rejected": -1926.814453125,
"logps_avg/chosen": -0.6792945861816406,
"logps_avg/rejected": -6.368745803833008,
"loss": 0.6742,
"losses_ref": -0.047284115105867386,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 260,
"u": -2.220738172531128,
"weight": 0.06843873858451843
},
{
"diff_generated": -21.301851272583008,
"epoch": 0.5548285789060455,
"grad_norm": 6.4499158810515755,
"learning_rate": 1.8145759520503357e-06,
"logits/chosen": -2.4637808799743652,
"logits/rejected": -2.2752346992492676,
"logps/chosen": -246.92269897460938,
"logps/rejected": -1889.571533203125,
"logps_avg/chosen": -0.7389290928840637,
"logps_avg/rejected": -6.390555381774902,
"loss": 0.6763,
"losses_ref": -0.05337480455636978,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 265,
"u": -2.2342476844787598,
"weight": 0.06287747621536255
},
{
"diff_generated": -24.129053115844727,
"epoch": 0.5652970426589898,
"grad_norm": 6.150486891273085,
"learning_rate": 1.803820559763439e-06,
"logits/chosen": -2.463932752609253,
"logits/rejected": -2.262209415435791,
"logps/chosen": -218.674072265625,
"logps/rejected": -2158.11376953125,
"logps_avg/chosen": -0.7358182072639465,
"logps_avg/rejected": -7.238715171813965,
"loss": 0.7092,
"losses_ref": -0.05700932815670967,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 270,
"u": -2.235081911087036,
"weight": 0.0649222731590271
},
{
"diff_generated": -22.390344619750977,
"epoch": 0.575765506411934,
"grad_norm": 7.077728369391663,
"learning_rate": 1.7927957584402895e-06,
"logits/chosen": -2.4641366004943848,
"logits/rejected": -2.289757251739502,
"logps/chosen": -230.87442016601562,
"logps/rejected": -1978.302734375,
"logps_avg/chosen": -0.6890340447425842,
"logps_avg/rejected": -6.717103004455566,
"loss": 0.6762,
"losses_ref": -0.05622117966413498,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 275,
"u": -2.217959403991699,
"weight": 0.07241992652416229
},
{
"diff_generated": -21.651906967163086,
"epoch": 0.5862339701648783,
"grad_norm": 6.269922997412507,
"learning_rate": 1.78150524316067e-06,
"logits/chosen": -2.512561082839966,
"logits/rejected": -2.3291046619415283,
"logps/chosen": -247.04129028320312,
"logps/rejected": -1997.1549072265625,
"logps_avg/chosen": -0.7235974073410034,
"logps_avg/rejected": -6.495572566986084,
"loss": 0.6702,
"losses_ref": -0.04933195561170578,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 280,
"u": -2.2435684204101562,
"weight": 0.05631055310368538
},
{
"diff_generated": -24.519784927368164,
"epoch": 0.5967024339178225,
"grad_norm": 6.4591538424452475,
"learning_rate": 1.7699527980612304e-06,
"logits/chosen": -2.533612012863159,
"logits/rejected": -2.310060501098633,
"logps/chosen": -241.06430053710938,
"logps/rejected": -2117.74609375,
"logps_avg/chosen": -0.7511512041091919,
"logps_avg/rejected": -7.355935573577881,
"loss": 0.7064,
"losses_ref": -0.0406634621322155,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 285,
"u": -2.252286434173584,
"weight": 0.04695131629705429
},
{
"diff_generated": -22.960649490356445,
"epoch": 0.6071708976707668,
"grad_norm": 6.2154170319293724,
"learning_rate": 1.758142295067194e-06,
"logits/chosen": -2.508026123046875,
"logits/rejected": -2.2768871784210205,
"logps/chosen": -256.1479797363281,
"logps/rejected": -2004.0556640625,
"logps_avg/chosen": -0.7584555745124817,
"logps_avg/rejected": -6.888195037841797,
"loss": 0.6642,
"losses_ref": -0.05948421359062195,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 290,
"u": -2.2216153144836426,
"weight": 0.07435683906078339
},
{
"diff_generated": -23.191375732421875,
"epoch": 0.6176393614237111,
"grad_norm": 6.58174772631908,
"learning_rate": 1.7460776925946416e-06,
"logits/chosen": -2.5151877403259277,
"logits/rejected": -2.297478199005127,
"logps/chosen": -233.7965087890625,
"logps/rejected": -2135.15673828125,
"logps_avg/chosen": -0.7887166738510132,
"logps_avg/rejected": -6.957413673400879,
"loss": 0.6755,
"losses_ref": -0.055867087095975876,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 295,
"u": -2.237914562225342,
"weight": 0.062143467366695404
},
{
"diff_generated": -24.709823608398438,
"epoch": 0.6281078251766553,
"grad_norm": 7.437442244122165,
"learning_rate": 1.7337630342238039e-06,
"logits/chosen": -2.525470018386841,
"logits/rejected": -2.3166513442993164,
"logps/chosen": -229.94558715820312,
"logps/rejected": -2169.215576171875,
"logps_avg/chosen": -0.7630201578140259,
"logps_avg/rejected": -7.412947177886963,
"loss": 0.7146,
"losses_ref": -0.0521920807659626,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 300,
"u": -2.238734722137451,
"weight": 0.06009601429104805
},
{
"diff_generated": -25.142507553100586,
"epoch": 0.6385762889295996,
"grad_norm": 6.81810983140467,
"learning_rate": 1.7212024473438145e-06,
"logits/chosen": -2.5295021533966064,
"logits/rejected": -2.3437719345092773,
"logps/chosen": -230.28018188476562,
"logps/rejected": -2279.5810546875,
"logps_avg/chosen": -0.6913032531738281,
"logps_avg/rejected": -7.54275369644165,
"loss": 0.6605,
"losses_ref": -0.04879279434680939,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 305,
"u": -2.2388291358947754,
"weight": 0.058758098632097244
},
{
"diff_generated": -24.345029830932617,
"epoch": 0.6490447526825438,
"grad_norm": 6.09422333137857,
"learning_rate": 1.70840014176937e-06,
"logits/chosen": -2.496091604232788,
"logits/rejected": -2.2605936527252197,
"logps/chosen": -264.0978698730469,
"logps/rejected": -2208.2470703125,
"logps_avg/chosen": -0.7388861179351807,
"logps_avg/rejected": -7.303508758544922,
"loss": 0.6912,
"losses_ref": -0.042303841561079025,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 310,
"u": -2.246720552444458,
"weight": 0.05154282599687576
},
{
"diff_generated": -23.305843353271484,
"epoch": 0.6595132164354881,
"grad_norm": 6.009874799920644,
"learning_rate": 1.6953604083297663e-06,
"logits/chosen": -2.5141513347625732,
"logits/rejected": -2.3054990768432617,
"logps/chosen": -241.82406616210938,
"logps/rejected": -2167.42724609375,
"logps_avg/chosen": -0.740818202495575,
"logps_avg/rejected": -6.991753578186035,
"loss": 0.6887,
"losses_ref": -0.0596298985183239,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 315,
"u": -2.2182247638702393,
"weight": 0.07611407339572906
},
{
"diff_generated": -27.0042724609375,
"epoch": 0.6699816801884323,
"grad_norm": 5.920473182891855,
"learning_rate": 1.6820876174307821e-06,
"logits/chosen": -2.482053279876709,
"logits/rejected": -2.2886459827423096,
"logps/chosen": -223.24893188476562,
"logps/rejected": -2428.3193359375,
"logps_avg/chosen": -0.7374002933502197,
"logps_avg/rejected": -8.101282119750977,
"loss": 0.6816,
"losses_ref": -0.049068134278059006,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 320,
"u": -2.235114574432373,
"weight": 0.06041133403778076
},
{
"diff_generated": -25.161632537841797,
"epoch": 0.6804501439413766,
"grad_norm": 6.759097342452152,
"learning_rate": 1.668586217589889e-06,
"logits/chosen": -2.4576220512390137,
"logits/rejected": -2.2568023204803467,
"logps/chosen": -255.9824676513672,
"logps/rejected": -2272.87451171875,
"logps_avg/chosen": -0.8112290501594543,
"logps_avg/rejected": -7.548490047454834,
"loss": 0.7034,
"losses_ref": -0.04155198484659195,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 325,
"u": -2.2597880363464355,
"weight": 0.04243909567594528
},
{
"diff_generated": -26.866863250732422,
"epoch": 0.6909186076943209,
"grad_norm": 5.913181146879915,
"learning_rate": 1.6548607339452852e-06,
"logits/chosen": -2.5034430027008057,
"logits/rejected": -2.2873404026031494,
"logps/chosen": -219.890625,
"logps/rejected": -2450.533203125,
"logps_avg/chosen": -0.7192927598953247,
"logps_avg/rejected": -8.060060501098633,
"loss": 0.679,
"losses_ref": -0.04148325324058533,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 330,
"u": -2.258359432220459,
"weight": 0.04333708435297012
},
{
"diff_generated": -26.58041000366211,
"epoch": 0.7013870714472651,
"grad_norm": 6.258862828154151,
"learning_rate": 1.6409157667392455e-06,
"logits/chosen": -2.5029423236846924,
"logits/rejected": -2.2649450302124023,
"logps/chosen": -239.6374969482422,
"logps/rejected": -2410.685302734375,
"logps_avg/chosen": -0.7706997990608215,
"logps_avg/rejected": -7.974122524261475,
"loss": 0.7035,
"losses_ref": -0.05212752893567085,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 335,
"u": -2.2294507026672363,
"weight": 0.06685757637023926
},
{
"diff_generated": -30.061986923217773,
"epoch": 0.7118555352002094,
"grad_norm": 7.477206152513725,
"learning_rate": 1.6267559897763027e-06,
"logits/chosen": -2.3795595169067383,
"logits/rejected": -2.18742036819458,
"logps/chosen": -192.0414276123047,
"logps/rejected": -2674.73486328125,
"logps_avg/chosen": -0.7409474849700928,
"logps_avg/rejected": -9.018596649169922,
"loss": 0.6831,
"losses_ref": -0.044330693781375885,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 340,
"u": -2.235874652862549,
"weight": 0.059127964079380035
},
{
"diff_generated": -28.720870971679688,
"epoch": 0.7223239989531536,
"grad_norm": 5.9002590426162325,
"learning_rate": 1.6123861488567708e-06,
"logits/chosen": -2.4881465435028076,
"logits/rejected": -2.2146873474121094,
"logps/chosen": -260.3475341796875,
"logps/rejected": -2515.25,
"logps_avg/chosen": -0.7461652755737305,
"logps_avg/rejected": -8.61626148223877,
"loss": 0.6968,
"losses_ref": -0.044901080429553986,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 345,
"u": -2.2523629665374756,
"weight": 0.04855785518884659
},
{
"diff_generated": -28.04868507385254,
"epoch": 0.7327924627060979,
"grad_norm": 15.894199978415127,
"learning_rate": 1.5978110601861409e-06,
"logits/chosen": -2.471588611602783,
"logits/rejected": -2.2580113410949707,
"logps/chosen": -255.3411102294922,
"logps/rejected": -2506.482666015625,
"logps_avg/chosen": -0.7827759385108948,
"logps_avg/rejected": -8.414606094360352,
"loss": 0.7362,
"losses_ref": -0.04014447331428528,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 350,
"u": -2.2483315467834473,
"weight": 0.04962104931473732
},
{
"diff_generated": -27.11871337890625,
"epoch": 0.7432609264590422,
"grad_norm": 5.4012187487436725,
"learning_rate": 1.5830356087608763e-06,
"logits/chosen": -2.4285144805908203,
"logits/rejected": -2.1649179458618164,
"logps/chosen": -218.6619415283203,
"logps/rejected": -2413.4892578125,
"logps_avg/chosen": -0.7086374163627625,
"logps_avg/rejected": -8.135615348815918,
"loss": 0.7021,
"losses_ref": -0.03781733289361,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 355,
"u": -2.2616829872131348,
"weight": 0.0397658608853817
},
{
"diff_generated": -26.739330291748047,
"epoch": 0.7537293902119864,
"grad_norm": 6.5263260405852614,
"learning_rate": 1.5680647467311555e-06,
"logits/chosen": -2.4289963245391846,
"logits/rejected": -2.133953332901001,
"logps/chosen": -247.11563110351562,
"logps/rejected": -2465.95849609375,
"logps_avg/chosen": -0.7823926210403442,
"logps_avg/rejected": -8.02180004119873,
"loss": 0.6853,
"losses_ref": -0.0527551993727684,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 360,
"u": -2.2420668601989746,
"weight": 0.0583949089050293
},
{
"diff_generated": -27.283761978149414,
"epoch": 0.7641978539649307,
"grad_norm": 6.979588218526593,
"learning_rate": 1.552903491741107e-06,
"logits/chosen": -2.449387550354004,
"logits/rejected": -2.1368231773376465,
"logps/chosen": -234.6686553955078,
"logps/rejected": -2578.747802734375,
"logps_avg/chosen": -0.740507185459137,
"logps_avg/rejected": -8.185129165649414,
"loss": 0.6824,
"losses_ref": -0.03961649537086487,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 365,
"u": -2.2473196983337402,
"weight": 0.05010756850242615
},
{
"diff_generated": -26.48313331604004,
"epoch": 0.7746663177178749,
"grad_norm": 7.697158528726637,
"learning_rate": 1.5375569252470895e-06,
"logits/chosen": -2.5160136222839355,
"logits/rejected": -2.2105443477630615,
"logps/chosen": -270.76727294921875,
"logps/rejected": -2356.61376953125,
"logps_avg/chosen": -0.798203706741333,
"logps_avg/rejected": -7.944940090179443,
"loss": 0.6956,
"losses_ref": -0.03987672179937363,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 370,
"u": -2.259321689605713,
"weight": 0.042373161762952805
},
{
"diff_generated": -25.16873550415039,
"epoch": 0.7851347814708192,
"grad_norm": 6.394620991151716,
"learning_rate": 1.5220301908145903e-06,
"logits/chosen": -2.464231491088867,
"logits/rejected": -2.1346030235290527,
"logps/chosen": -240.89230346679688,
"logps/rejected": -2322.256591796875,
"logps_avg/chosen": -0.6929277181625366,
"logps_avg/rejected": -7.55062198638916,
"loss": 0.6924,
"losses_ref": -0.04263712465763092,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 375,
"u": -2.2447123527526855,
"weight": 0.05186506360769272
},
{
"diff_generated": -26.598400115966797,
"epoch": 0.7956032452237635,
"grad_norm": 6.833084085030009,
"learning_rate": 1.5063284923943028e-06,
"logits/chosen": -2.4700121879577637,
"logits/rejected": -2.1623213291168213,
"logps/chosen": -255.25228881835938,
"logps/rejected": -2325.41162109375,
"logps_avg/chosen": -0.7505702376365662,
"logps_avg/rejected": -7.97952127456665,
"loss": 0.6914,
"losses_ref": -0.039328016340732574,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 380,
"u": -2.2589406967163086,
"weight": 0.04286640137434006
},
{
"diff_generated": -29.339923858642578,
"epoch": 0.8060717089767077,
"grad_norm": 6.446112080414134,
"learning_rate": 1.490457092577968e-06,
"logits/chosen": -2.4463934898376465,
"logits/rejected": -2.0776758193969727,
"logps/chosen": -232.91452026367188,
"logps/rejected": -2714.375244140625,
"logps_avg/chosen": -0.6785185933113098,
"logps_avg/rejected": -8.801977157592773,
"loss": 0.6865,
"losses_ref": -0.04436464607715607,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 385,
"u": -2.247189998626709,
"weight": 0.05228755623102188
},
{
"diff_generated": -27.133153915405273,
"epoch": 0.816540172729652,
"grad_norm": 5.888520537518448,
"learning_rate": 1.4744213108345602e-06,
"logits/chosen": -2.5249063968658447,
"logits/rejected": -2.1448757648468018,
"logps/chosen": -258.61212158203125,
"logps/rejected": -2449.294677734375,
"logps_avg/chosen": -0.7527631521224976,
"logps_avg/rejected": -8.139945983886719,
"loss": 0.685,
"losses_ref": -0.0589534267783165,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 390,
"u": -2.2169764041900635,
"weight": 0.0769612193107605
},
{
"diff_generated": -27.033132553100586,
"epoch": 0.8270086364825961,
"grad_norm": 6.211789156823427,
"learning_rate": 1.4582265217274103e-06,
"logits/chosen": -2.4122936725616455,
"logits/rejected": -2.095203161239624,
"logps/chosen": -251.5576629638672,
"logps/rejected": -2401.735595703125,
"logps_avg/chosen": -0.7489043474197388,
"logps_avg/rejected": -8.109941482543945,
"loss": 0.6753,
"losses_ref": -0.048131681978702545,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 395,
"u": -2.247305393218994,
"weight": 0.053915899246931076
},
{
"diff_generated": -30.035808563232422,
"epoch": 0.8374771002355405,
"grad_norm": 6.698107767192597,
"learning_rate": 1.4418781531128635e-06,
"logits/chosen": -2.486995220184326,
"logits/rejected": -2.131185531616211,
"logps/chosen": -239.08642578125,
"logps/rejected": -2759.15625,
"logps_avg/chosen": -0.7630764245986938,
"logps_avg/rejected": -9.010741233825684,
"loss": 0.6892,
"losses_ref": -0.036631032824516296,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 400,
"u": -2.2631499767303467,
"weight": 0.038820598274469376
},
{
"diff_generated": -27.721935272216797,
"epoch": 0.8479455639884846,
"grad_norm": 7.031324917308057,
"learning_rate": 1.4253816843210748e-06,
"logits/chosen": -2.4483680725097656,
"logits/rejected": -2.089618444442749,
"logps/chosen": -249.0079803466797,
"logps/rejected": -2574.352783203125,
"logps_avg/chosen": -0.722091019153595,
"logps_avg/rejected": -8.316580772399902,
"loss": 0.7066,
"losses_ref": -0.043711207807064056,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 405,
"u": -2.2515838146209717,
"weight": 0.048544611781835556
},
{
"diff_generated": -30.00594711303711,
"epoch": 0.8584140277414289,
"grad_norm": 5.878873328550679,
"learning_rate": 1.4087426443195547e-06,
"logits/chosen": -2.4377264976501465,
"logits/rejected": -2.0860629081726074,
"logps/chosen": -220.13644409179688,
"logps/rejected": -2700.03369140625,
"logps_avg/chosen": -0.7378045916557312,
"logps_avg/rejected": -9.001784324645996,
"loss": 0.6757,
"losses_ref": -0.032459113746881485,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 410,
"u": -2.2480547428131104,
"weight": 0.04561341553926468
},
{
"diff_generated": -27.96181297302246,
"epoch": 0.8688824914943732,
"grad_norm": 6.085121754886306,
"learning_rate": 1.391966609860075e-06,
"logits/chosen": -2.4773359298706055,
"logits/rejected": -2.129520893096924,
"logps/chosen": -239.4454803466797,
"logps/rejected": -2550.92919921875,
"logps_avg/chosen": -0.7163268327713013,
"logps_avg/rejected": -8.388544082641602,
"loss": 0.6864,
"losses_ref": -0.03842215612530708,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 415,
"u": -2.2578535079956055,
"weight": 0.04306939244270325
},
{
"diff_generated": -27.701797485351562,
"epoch": 0.8793509552473174,
"grad_norm": 6.898834621323108,
"learning_rate": 1.3750592036095619e-06,
"logits/chosen": -2.4759981632232666,
"logits/rejected": -2.1207737922668457,
"logps/chosen": -255.3009490966797,
"logps/rejected": -2467.61328125,
"logps_avg/chosen": -0.7468316555023193,
"logps_avg/rejected": -8.310539245605469,
"loss": 0.6929,
"losses_ref": -0.050536155700683594,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 420,
"u": -2.2376935482025146,
"weight": 0.05989469215273857
},
{
"diff_generated": -27.900798797607422,
"epoch": 0.8898194190002617,
"grad_norm": 7.318402161699278,
"learning_rate": 1.3580260922655984e-06,
"logits/chosen": -2.459564685821533,
"logits/rejected": -2.133777379989624,
"logps/chosen": -232.8207550048828,
"logps/rejected": -2438.7041015625,
"logps_avg/chosen": -0.7522517442703247,
"logps_avg/rejected": -8.370241165161133,
"loss": 0.6907,
"losses_ref": -0.040023092180490494,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 425,
"u": -2.2584662437438965,
"weight": 0.043312422931194305
},
{
"diff_generated": -29.812658309936523,
"epoch": 0.9002878827532059,
"grad_norm": 6.38418063766252,
"learning_rate": 1.3408729846571713e-06,
"logits/chosen": -2.4594979286193848,
"logits/rejected": -2.071135997772217,
"logps/chosen": -280.634521484375,
"logps/rejected": -2652.205322265625,
"logps_avg/chosen": -0.7122408747673035,
"logps_avg/rejected": -8.943798065185547,
"loss": 0.6859,
"losses_ref": -0.03510651737451553,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 430,
"u": -2.2575087547302246,
"weight": 0.04237521067261696
},
{
"diff_generated": -25.203630447387695,
"epoch": 0.9107563465061502,
"grad_norm": 6.588604544150575,
"learning_rate": 1.3236056298312956e-06,
"logits/chosen": -2.481071949005127,
"logits/rejected": -2.1608479022979736,
"logps/chosen": -234.13027954101562,
"logps/rejected": -2276.569580078125,
"logps_avg/chosen": -0.7077358365058899,
"logps_avg/rejected": -7.561089992523193,
"loss": 0.6722,
"losses_ref": -0.04718080908060074,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 435,
"u": -2.249262809753418,
"weight": 0.05173084884881973
},
{
"diff_generated": -23.8907470703125,
"epoch": 0.9212248102590945,
"grad_norm": 6.857956310477159,
"learning_rate": 1.3062298151261591e-06,
"logits/chosen": -2.5335617065429688,
"logits/rejected": -2.219560146331787,
"logps/chosen": -250.57705688476562,
"logps/rejected": -2190.95947265625,
"logps_avg/chosen": -0.6971117854118347,
"logps_avg/rejected": -7.167223930358887,
"loss": 0.6666,
"losses_ref": -0.04138738289475441,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 440,
"u": -2.250927686691284,
"weight": 0.04870566353201866
},
{
"diff_generated": -24.81663703918457,
"epoch": 0.9316932740120387,
"grad_norm": 7.035268937333438,
"learning_rate": 1.2887513642314372e-06,
"logits/chosen": -2.466610908508301,
"logits/rejected": -2.1418159008026123,
"logps/chosen": -234.072021484375,
"logps/rejected": -2254.32177734375,
"logps_avg/chosen": -0.7226396203041077,
"logps_avg/rejected": -7.4449920654296875,
"loss": 0.6772,
"losses_ref": -0.02925349771976471,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 445,
"u": -2.261955738067627,
"weight": 0.036579299718141556
},
{
"diff_generated": -23.858409881591797,
"epoch": 0.942161737764983,
"grad_norm": 5.8496221029871895,
"learning_rate": 1.271176135236417e-06,
"logits/chosen": -2.5474791526794434,
"logits/rejected": -2.2467288970947266,
"logps/chosen": -259.63043212890625,
"logps/rejected": -2068.02978515625,
"logps_avg/chosen": -0.7590965032577515,
"logps_avg/rejected": -7.157523155212402,
"loss": 0.6926,
"losses_ref": -0.04666949436068535,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 450,
"u": -2.24082612991333,
"weight": 0.057170577347278595
},
{
"diff_generated": -21.257368087768555,
"epoch": 0.9526302015179272,
"grad_norm": 9.579263194990599,
"learning_rate": 1.2535100186666e-06,
"logits/chosen": -2.5334389209747314,
"logits/rejected": -2.2800872325897217,
"logps/chosen": -258.4393615722656,
"logps/rejected": -1949.274658203125,
"logps_avg/chosen": -0.7667442560195923,
"logps_avg/rejected": -6.377211093902588,
"loss": 0.7272,
"losses_ref": -0.04685154929757118,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 455,
"u": -2.239955186843872,
"weight": 0.05661741644144058
},
{
"diff_generated": -21.260351181030273,
"epoch": 0.9630986652708715,
"grad_norm": 7.19097418251884,
"learning_rate": 1.2357589355094273e-06,
"logits/chosen": -2.5235114097595215,
"logits/rejected": -2.2688846588134766,
"logps/chosen": -274.0472106933594,
"logps/rejected": -1854.4193115234375,
"logps_avg/chosen": -0.7401561141014099,
"logps_avg/rejected": -6.378105163574219,
"loss": 0.6996,
"losses_ref": -0.04187412187457085,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 460,
"u": -2.255484104156494,
"weight": 0.04560910537838936
},
{
"diff_generated": -20.870580673217773,
"epoch": 0.9735671290238157,
"grad_norm": 5.873029901097039,
"learning_rate": 1.2179288352297982e-06,
"logits/chosen": -2.5459725856781006,
"logits/rejected": -2.300191879272461,
"logps/chosen": -233.07363891601562,
"logps/rejected": -1780.218505859375,
"logps_avg/chosen": -0.676838755607605,
"logps_avg/rejected": -6.26117467880249,
"loss": 0.701,
"losses_ref": -0.035965751856565475,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 465,
"u": -2.2623190879821777,
"weight": 0.03852839767932892
},
{
"diff_generated": -20.314434051513672,
"epoch": 0.98403559277676,
"grad_norm": 6.047640955364439,
"learning_rate": 1.2000256937760445e-06,
"logits/chosen": -2.478569746017456,
"logits/rejected": -2.2165324687957764,
"logps/chosen": -241.59115600585938,
"logps/rejected": -1793.3131103515625,
"logps_avg/chosen": -0.7300271987915039,
"logps_avg/rejected": -6.094330787658691,
"loss": 0.6661,
"losses_ref": -0.03309565782546997,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 470,
"u": -2.2623355388641357,
"weight": 0.03777972236275673
},
{
"diff_generated": -20.79926109313965,
"epoch": 0.9945040565297043,
"grad_norm": 8.164412498048108,
"learning_rate": 1.1820555115770255e-06,
"logits/chosen": -2.5342564582824707,
"logits/rejected": -2.2890594005584717,
"logps/chosen": -230.3572235107422,
"logps/rejected": -1833.0390625,
"logps_avg/chosen": -0.751907467842102,
"logps_avg/rejected": -6.239778995513916,
"loss": 0.6895,
"losses_ref": -0.03975466638803482,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 475,
"u": -2.2550594806671143,
"weight": 0.04479961842298508
},
{
"diff_generated": -20.89034080505371,
"epoch": 1.0049725202826485,
"grad_norm": 9.91162629957212,
"learning_rate": 1.1640243115310217e-06,
"logits/chosen": -2.515481948852539,
"logits/rejected": -2.238800525665283,
"logps/chosen": -238.7968292236328,
"logps/rejected": -1904.6226806640625,
"logps_avg/chosen": -0.730613112449646,
"logps_avg/rejected": -6.2671027183532715,
"loss": 0.6185,
"losses_ref": -0.0886848121881485,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 480,
"u": -3.230264186859131,
"weight": 0.0876741111278534
},
{
"diff_generated": -22.67684555053711,
"epoch": 1.0154409840355927,
"grad_norm": 8.949481189927978,
"learning_rate": 1.1459381369870972e-06,
"logits/chosen": -2.4899425506591797,
"logits/rejected": -2.1274173259735107,
"logps/chosen": -239.3141632080078,
"logps/rejected": -2098.4287109375,
"logps_avg/chosen": -0.6295738816261292,
"logps_avg/rejected": -6.8030548095703125,
"loss": 0.5199,
"losses_ref": -0.09897326678037643,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 485,
"u": -4.378706932067871,
"weight": 0.08350441604852676
},
{
"diff_generated": -24.163660049438477,
"epoch": 1.025909447788537,
"grad_norm": 8.708694233875605,
"learning_rate": 1.1278030497196046e-06,
"logits/chosen": -2.448932409286499,
"logits/rejected": -2.0961108207702637,
"logps/chosen": -197.19461059570312,
"logps/rejected": -2133.96630859375,
"logps_avg/chosen": -0.5785419940948486,
"logps_avg/rejected": -7.2490973472595215,
"loss": 0.5111,
"losses_ref": -0.12583398818969727,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 490,
"u": -4.304468631744385,
"weight": 0.12433832883834839
},
{
"diff_generated": -25.089040756225586,
"epoch": 1.0363779115414813,
"grad_norm": 8.538618246529412,
"learning_rate": 1.1096251278965172e-06,
"logits/chosen": -2.4840457439422607,
"logits/rejected": -2.1427814960479736,
"logps/chosen": -202.72528076171875,
"logps/rejected": -2115.415283203125,
"logps_avg/chosen": -0.5992251038551331,
"logps_avg/rejected": -7.526711940765381,
"loss": 0.4987,
"losses_ref": -0.10639525949954987,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 495,
"u": -4.343268394470215,
"weight": 0.10977953672409058
},
{
"diff_generated": -24.132022857666016,
"epoch": 1.0468463752944255,
"grad_norm": 7.67811116418592,
"learning_rate": 1.0914104640422679e-06,
"logits/chosen": -2.4932920932769775,
"logits/rejected": -2.1089999675750732,
"logps/chosen": -199.10342407226562,
"logps/rejected": -2176.26318359375,
"logps_avg/chosen": -0.6183401346206665,
"logps_avg/rejected": -7.2396063804626465,
"loss": 0.5202,
"losses_ref": -0.12012694031000137,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 500,
"u": -4.302992820739746,
"weight": 0.11936072260141373
},
{
"diff_generated": -23.451740264892578,
"epoch": 1.05731483904737,
"grad_norm": 20.37435210253164,
"learning_rate": 1.0731651629957721e-06,
"logits/chosen": -2.42221736907959,
"logits/rejected": -2.1496148109436035,
"logps/chosen": -229.11068725585938,
"logps/rejected": -2094.52197265625,
"logps_avg/chosen": -0.6533752679824829,
"logps_avg/rejected": -7.035521507263184,
"loss": 0.5184,
"losses_ref": -0.1230870932340622,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 505,
"u": -4.369751930236816,
"weight": 0.1066075786948204
},
{
"diff_generated": -22.74098777770996,
"epoch": 1.067783302800314,
"grad_norm": 7.268444145722818,
"learning_rate": 1.0548953398643274e-06,
"logits/chosen": -2.4076297283172607,
"logits/rejected": -2.0819380283355713,
"logps/chosen": -233.77938842773438,
"logps/rejected": -2035.225830078125,
"logps_avg/chosen": -0.6575011014938354,
"logps_avg/rejected": -6.822296142578125,
"loss": 0.4947,
"losses_ref": -0.14097091555595398,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 510,
"u": -4.253186225891113,
"weight": 0.1333218514919281
},
{
"diff_generated": -25.923725128173828,
"epoch": 1.0782517665532583,
"grad_norm": 8.062661700192072,
"learning_rate": 1.0366071179740706e-06,
"logits/chosen": -2.4787120819091797,
"logits/rejected": -2.12414288520813,
"logps/chosen": -257.2312927246094,
"logps/rejected": -2302.900634765625,
"logps_avg/chosen": -0.6627689003944397,
"logps_avg/rejected": -7.777116298675537,
"loss": 0.5085,
"losses_ref": -0.10705102980136871,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 515,
"u": -4.345104217529297,
"weight": 0.10459395498037338
},
{
"diff_generated": -27.071746826171875,
"epoch": 1.0887202303062025,
"grad_norm": 7.3598703596101975,
"learning_rate": 1.0183066268176775e-06,
"logits/chosen": -2.436248779296875,
"logits/rejected": -2.075679063796997,
"logps/chosen": -244.1257781982422,
"logps/rejected": -2375.113525390625,
"logps_avg/chosen": -0.6157761812210083,
"logps_avg/rejected": -8.1215238571167,
"loss": 0.5683,
"losses_ref": -0.08251279592514038,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 520,
"u": -4.396883010864258,
"weight": 0.06908340752124786
},
{
"diff_generated": -26.481449127197266,
"epoch": 1.0991886940591469,
"grad_norm": 8.892060607648993,
"learning_rate": 1e-06,
"logits/chosen": -2.4646589756011963,
"logits/rejected": -2.096703290939331,
"logps/chosen": -226.17453002929688,
"logps/rejected": -2343.119384765625,
"logps_avg/chosen": -0.6375609040260315,
"logps_avg/rejected": -7.944435119628906,
"loss": 0.5652,
"losses_ref": -0.08028392493724823,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 525,
"u": -4.399170398712158,
"weight": 0.07661790400743484
},
{
"diff_generated": -25.77886962890625,
"epoch": 1.109657157812091,
"grad_norm": 11.93280848823974,
"learning_rate": 9.816933731823228e-07,
"logits/chosen": -2.4755985736846924,
"logits/rejected": -2.1236746311187744,
"logps/chosen": -219.5588836669922,
"logps/rejected": -2258.547119140625,
"logps_avg/chosen": -0.6109720468521118,
"logps_avg/rejected": -7.733660697937012,
"loss": 0.5032,
"losses_ref": -0.09919899702072144,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 530,
"u": -4.327098846435547,
"weight": 0.0968068465590477
},
{
"diff_generated": -26.962757110595703,
"epoch": 1.1201256215650353,
"grad_norm": 11.74024044453861,
"learning_rate": 9.633928820259293e-07,
"logits/chosen": -2.382981777191162,
"logits/rejected": -1.9988247156143188,
"logps/chosen": -198.56578063964844,
"logps/rejected": -2398.09326171875,
"logps_avg/chosen": -0.6096338033676147,
"logps_avg/rejected": -8.088827133178711,
"loss": 0.5305,
"losses_ref": -0.06856809556484222,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 535,
"u": -4.411980152130127,
"weight": 0.06424126774072647
},
{
"diff_generated": -26.22715187072754,
"epoch": 1.1305940853179797,
"grad_norm": 11.054487118285914,
"learning_rate": 9.451046601356725e-07,
"logits/chosen": -2.4410181045532227,
"logits/rejected": -2.095543146133423,
"logps/chosen": -207.6184844970703,
"logps/rejected": -2253.38623046875,
"logps_avg/chosen": -0.6336568593978882,
"logps_avg/rejected": -7.868145942687988,
"loss": 0.5357,
"losses_ref": -0.0955720990896225,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 540,
"u": -4.348645210266113,
"weight": 0.09204810112714767
},
{
"diff_generated": -25.266141891479492,
"epoch": 1.1410625490709239,
"grad_norm": 8.805909515635294,
"learning_rate": 9.268348370042281e-07,
"logits/chosen": -2.4485838413238525,
"logits/rejected": -2.1053905487060547,
"logps/chosen": -216.48910522460938,
"logps/rejected": -2250.44775390625,
"logps_avg/chosen": -0.588961124420166,
"logps_avg/rejected": -7.579843044281006,
"loss": 0.5159,
"losses_ref": -0.09172032028436661,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 545,
"u": -4.326230525970459,
"weight": 0.09086887538433075
},
{
"diff_generated": -26.917110443115234,
"epoch": 1.151531012823868,
"grad_norm": 10.666064793677686,
"learning_rate": 9.085895359577323e-07,
"logits/chosen": -2.404174566268921,
"logits/rejected": -2.037463665008545,
"logps/chosen": -205.3460235595703,
"logps/rejected": -2429.36279296875,
"logps_avg/chosen": -0.5989923477172852,
"logps_avg/rejected": -8.07513427734375,
"loss": 0.5332,
"losses_ref": -0.06065789982676506,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 550,
"u": -4.387241363525391,
"weight": 0.0559367910027504
},
{
"diff_generated": -25.942188262939453,
"epoch": 1.1619994765768125,
"grad_norm": 10.199822581929254,
"learning_rate": 8.903748721034826e-07,
"logits/chosen": -2.432077407836914,
"logits/rejected": -2.0631113052368164,
"logps/chosen": -209.88076782226562,
"logps/rejected": -2297.24853515625,
"logps_avg/chosen": -0.6222396492958069,
"logps_avg/rejected": -7.782655239105225,
"loss": 0.5436,
"losses_ref": -0.053764212876558304,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 555,
"u": -4.413111209869385,
"weight": 0.05243021994829178
},
{
"diff_generated": -26.842655181884766,
"epoch": 1.1724679403297567,
"grad_norm": 9.055623269790141,
"learning_rate": 8.721969502803953e-07,
"logits/chosen": -2.4761881828308105,
"logits/rejected": -2.037745952606201,
"logps/chosen": -228.0619659423828,
"logps/rejected": -2454.422607421875,
"logps_avg/chosen": -0.6156254410743713,
"logps_avg/rejected": -8.052797317504883,
"loss": 0.4938,
"losses_ref": -0.06194459646940231,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 560,
"u": -4.417675018310547,
"weight": 0.05182374641299248
},
{
"diff_generated": -25.78971290588379,
"epoch": 1.1829364040827008,
"grad_norm": 11.397081928275703,
"learning_rate": 8.540618630129027e-07,
"logits/chosen": -2.4368996620178223,
"logits/rejected": -2.0613627433776855,
"logps/chosen": -244.33059692382812,
"logps/rejected": -2314.3056640625,
"logps_avg/chosen": -0.6685888171195984,
"logps_avg/rejected": -7.736914157867432,
"loss": 0.5495,
"losses_ref": -0.07071459293365479,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 565,
"u": -4.390778064727783,
"weight": 0.06269918382167816
},
{
"diff_generated": -26.82694435119629,
"epoch": 1.193404867835645,
"grad_norm": 9.221832000440747,
"learning_rate": 8.359756884689783e-07,
"logits/chosen": -2.497908115386963,
"logits/rejected": -2.125258207321167,
"logps/chosen": -215.4803009033203,
"logps/rejected": -2407.225830078125,
"logps_avg/chosen": -0.6236811876296997,
"logps_avg/rejected": -8.048083305358887,
"loss": 0.5244,
"losses_ref": -0.08507435768842697,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 570,
"u": -4.405519485473633,
"weight": 0.07415871322154999
},
{
"diff_generated": -27.44614601135254,
"epoch": 1.2038733315885894,
"grad_norm": 14.484772212758768,
"learning_rate": 8.179444884229744e-07,
"logits/chosen": -2.415398597717285,
"logits/rejected": -2.0458593368530273,
"logps/chosen": -224.60482788085938,
"logps/rejected": -2476.796142578125,
"logps_avg/chosen": -0.6788522601127625,
"logps_avg/rejected": -8.233844757080078,
"loss": 0.5625,
"losses_ref": -0.05934012681245804,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 575,
"u": -4.461823463439941,
"weight": 0.044574279338121414
},
{
"diff_generated": -29.135217666625977,
"epoch": 1.2143417953415336,
"grad_norm": 18.01394064023352,
"learning_rate": 7.999743062239557e-07,
"logits/chosen": -2.4544944763183594,
"logits/rejected": -2.104241371154785,
"logps/chosen": -210.87893676757812,
"logps/rejected": -2643.50390625,
"logps_avg/chosen": -0.6716314554214478,
"logps_avg/rejected": -8.740565299987793,
"loss": 0.5555,
"losses_ref": -0.056417226791381836,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 580,
"u": -4.44529914855957,
"weight": 0.04976346716284752
},
{
"diff_generated": -27.484622955322266,
"epoch": 1.2248102590944778,
"grad_norm": 10.29630717051048,
"learning_rate": 7.820711647702017e-07,
"logits/chosen": -2.4541475772857666,
"logits/rejected": -2.0904035568237305,
"logps/chosen": -202.5820770263672,
"logps/rejected": -2515.11962890625,
"logps_avg/chosen": -0.5754384994506836,
"logps_avg/rejected": -8.245387077331543,
"loss": 0.5346,
"losses_ref": -0.08221448957920074,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 585,
"u": -4.365923881530762,
"weight": 0.07960718125104904
},
{
"diff_generated": -26.950695037841797,
"epoch": 1.235278722847422,
"grad_norm": 10.223108898541343,
"learning_rate": 7.642410644905726e-07,
"logits/chosen": -2.3840575218200684,
"logits/rejected": -2.0544769763946533,
"logps/chosen": -205.935546875,
"logps/rejected": -2364.6396484375,
"logps_avg/chosen": -0.5895050764083862,
"logps_avg/rejected": -8.08520793914795,
"loss": 0.5503,
"losses_ref": -0.10383725166320801,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 590,
"u": -4.365530490875244,
"weight": 0.09789486229419708
},
{
"diff_generated": -29.25247573852539,
"epoch": 1.2457471866003664,
"grad_norm": 12.09100466478698,
"learning_rate": 7.464899813334e-07,
"logits/chosen": -2.3943965435028076,
"logits/rejected": -2.067821979522705,
"logps/chosen": -215.44094848632812,
"logps/rejected": -2522.196533203125,
"logps_avg/chosen": -0.6099680662155151,
"logps_avg/rejected": -8.77574348449707,
"loss": 0.5325,
"losses_ref": -0.07746943831443787,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 595,
"u": -4.356167793273926,
"weight": 0.07601340860128403
},
{
"diff_generated": -27.34578514099121,
"epoch": 1.2562156503533106,
"grad_norm": 8.052346731222642,
"learning_rate": 7.288238647635829e-07,
"logits/chosen": -2.435148239135742,
"logits/rejected": -2.1030170917510986,
"logps/chosen": -226.7269744873047,
"logps/rejected": -2427.451171875,
"logps_avg/chosen": -0.6252392530441284,
"logps_avg/rejected": -8.2037353515625,
"loss": 0.5356,
"losses_ref": -0.06464961916208267,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 600,
"u": -4.406424522399902,
"weight": 0.061459980905056
},
{
"diff_generated": -27.118465423583984,
"epoch": 1.2666841141062548,
"grad_norm": 11.655006277757288,
"learning_rate": 7.112486357685631e-07,
"logits/chosen": -2.450383424758911,
"logits/rejected": -2.0887584686279297,
"logps/chosen": -222.7769012451172,
"logps/rejected": -2357.30712890625,
"logps_avg/chosen": -0.6189793348312378,
"logps_avg/rejected": -8.135540008544922,
"loss": 0.5517,
"losses_ref": -0.08965682238340378,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 605,
"u": -4.366550445556641,
"weight": 0.09028217941522598
},
{
"diff_generated": -27.826339721679688,
"epoch": 1.2771525778591992,
"grad_norm": 8.355569379147827,
"learning_rate": 6.937701848738407e-07,
"logits/chosen": -2.4444997310638428,
"logits/rejected": -2.103099822998047,
"logps/chosen": -200.1586151123047,
"logps/rejected": -2441.192138671875,
"logps_avg/chosen": -0.5492798089981079,
"logps_avg/rejected": -8.347902297973633,
"loss": 0.5273,
"losses_ref": -0.05201203376054764,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 610,
"u": -4.470887660980225,
"weight": 0.04102148860692978
},
{
"diff_generated": -27.140499114990234,
"epoch": 1.2876210416121434,
"grad_norm": 19.07484346081228,
"learning_rate": 6.763943701687045e-07,
"logits/chosen": -2.4840033054351807,
"logits/rejected": -2.0714080333709717,
"logps/chosen": -237.1542510986328,
"logps/rejected": -2492.620849609375,
"logps_avg/chosen": -0.6195243000984192,
"logps_avg/rejected": -8.142149925231934,
"loss": 0.5249,
"losses_ref": -0.07448837906122208,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 615,
"u": -4.392305374145508,
"weight": 0.06138737127184868
},
{
"diff_generated": -29.331090927124023,
"epoch": 1.2980895053650876,
"grad_norm": 14.350296949575641,
"learning_rate": 6.591270153428288e-07,
"logits/chosen": -2.5314509868621826,
"logits/rejected": -2.1232359409332275,
"logps/chosen": -230.3607940673828,
"logps/rejected": -2496.131103515625,
"logps_avg/chosen": -0.6086186170578003,
"logps_avg/rejected": -8.799327850341797,
"loss": 0.5301,
"losses_ref": -0.05894411355257034,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 620,
"u": -4.37734842300415,
"weight": 0.053639549762010574
},
{
"diff_generated": -26.76749038696289,
"epoch": 1.308557969118032,
"grad_norm": 8.772096019129755,
"learning_rate": 6.419739077344016e-07,
"logits/chosen": -2.517256259918213,
"logits/rejected": -2.158301591873169,
"logps/chosen": -236.55648803710938,
"logps/rejected": -2372.91796875,
"logps_avg/chosen": -0.6213998794555664,
"logps_avg/rejected": -8.030247688293457,
"loss": 0.544,
"losses_ref": -0.09482914954423904,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 625,
"u": -4.376565456390381,
"weight": 0.07662535458803177
},
{
"diff_generated": -28.416824340820312,
"epoch": 1.3190264328709762,
"grad_norm": 7.8466631670725935,
"learning_rate": 6.24940796390438e-07,
"logits/chosen": -2.4629857540130615,
"logits/rejected": -2.0768308639526367,
"logps/chosen": -214.29360961914062,
"logps/rejected": -2455.93115234375,
"logps_avg/chosen": -0.6123236417770386,
"logps_avg/rejected": -8.52504825592041,
"loss": 0.5392,
"losses_ref": -0.059877872467041016,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 630,
"u": -4.431421756744385,
"weight": 0.043088506907224655
},
{
"diff_generated": -29.8402099609375,
"epoch": 1.3294948966239204,
"grad_norm": 20.160929381759352,
"learning_rate": 6.08033390139925e-07,
"logits/chosen": -2.4479854106903076,
"logits/rejected": -2.0140042304992676,
"logps/chosen": -228.12948608398438,
"logps/rejected": -2645.977294921875,
"logps_avg/chosen": -0.6280118227005005,
"logps_avg/rejected": -8.95206356048584,
"loss": 0.5647,
"losses_ref": -0.0805547907948494,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 635,
"u": -4.419759750366211,
"weight": 0.06746160984039307
},
{
"diff_generated": -29.193140029907227,
"epoch": 1.3399633603768648,
"grad_norm": 17.984653220174852,
"learning_rate": 5.912573556804452e-07,
"logits/chosen": -2.4721744060516357,
"logits/rejected": -2.0706074237823486,
"logps/chosen": -219.49658203125,
"logps/rejected": -2600.13525390625,
"logps_avg/chosen": -0.5888947248458862,
"logps_avg/rejected": -8.757942199707031,
"loss": 0.5708,
"losses_ref": -0.06751363724470139,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 640,
"u": -4.354660511016846,
"weight": 0.08399678766727448
},
{
"diff_generated": -29.59097671508789,
"epoch": 1.350431824129809,
"grad_norm": 8.832363301034992,
"learning_rate": 5.746183156789252e-07,
"logits/chosen": -2.522441864013672,
"logits/rejected": -2.069122076034546,
"logps/chosen": -234.3195343017578,
"logps/rejected": -2680.282470703125,
"logps_avg/chosen": -0.6104280352592468,
"logps_avg/rejected": -8.877291679382324,
"loss": 0.5457,
"losses_ref": -0.05418990179896355,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 645,
"u": -4.4123215675354,
"weight": 0.058007679879665375
},
{
"diff_generated": -28.265172958374023,
"epoch": 1.3609002878827532,
"grad_norm": 12.218786161167232,
"learning_rate": 5.581218468871365e-07,
"logits/chosen": -2.4173598289489746,
"logits/rejected": -2.0515952110290527,
"logps/chosen": -190.7438507080078,
"logps/rejected": -2539.76953125,
"logps_avg/chosen": -0.5876272320747375,
"logps_avg/rejected": -8.479551315307617,
"loss": 0.5169,
"losses_ref": -0.08093442767858505,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 650,
"u": -4.324867248535156,
"weight": 0.08522786945104599
},
{
"diff_generated": -31.353778839111328,
"epoch": 1.3713687516356974,
"grad_norm": 10.11440836146207,
"learning_rate": 5.417734782725896e-07,
"logits/chosen": -2.459190845489502,
"logits/rejected": -2.060859203338623,
"logps/chosen": -211.8318634033203,
"logps/rejected": -2672.73583984375,
"logps_avg/chosen": -0.5790122151374817,
"logps_avg/rejected": -9.406133651733398,
"loss": 0.5603,
"losses_ref": -0.038860172033309937,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 655,
"u": -4.468893051147461,
"weight": 0.03133354336023331
},
{
"diff_generated": -28.588571548461914,
"epoch": 1.3818372153886418,
"grad_norm": 12.583696879491457,
"learning_rate": 5.255786891654399e-07,
"logits/chosen": -2.4734246730804443,
"logits/rejected": -2.0776007175445557,
"logps/chosen": -203.22389221191406,
"logps/rejected": -2578.066162109375,
"logps_avg/chosen": -0.6348826289176941,
"logps_avg/rejected": -8.57657241821289,
"loss": 0.5486,
"losses_ref": -0.06403845548629761,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 660,
"u": -4.37404203414917,
"weight": 0.06924913823604584
},
{
"diff_generated": -29.78804588317871,
"epoch": 1.392305679141586,
"grad_norm": 13.032538343695713,
"learning_rate": 5.095429074220319e-07,
"logits/chosen": -2.4960551261901855,
"logits/rejected": -2.1090826988220215,
"logps/chosen": -213.1850128173828,
"logps/rejected": -2626.316162109375,
"logps_avg/chosen": -0.6238334774971008,
"logps_avg/rejected": -8.93641471862793,
"loss": 0.5533,
"losses_ref": -0.06042981147766113,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 665,
"u": -4.39785099029541,
"weight": 0.05738676339387894
},
{
"diff_generated": -31.395706176757812,
"epoch": 1.4027741428945302,
"grad_norm": 29.282292978403014,
"learning_rate": 4.936715076056974e-07,
"logits/chosen": -2.519998073577881,
"logits/rejected": -2.1003477573394775,
"logps/chosen": -227.49972534179688,
"logps/rejected": -2841.53759765625,
"logps_avg/chosen": -0.6322627067565918,
"logps_avg/rejected": -9.418710708618164,
"loss": 0.545,
"losses_ref": -0.04599471017718315,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 670,
"u": -4.469276428222656,
"weight": 0.033300966024398804
},
{
"diff_generated": -32.34383010864258,
"epoch": 1.4132426066474744,
"grad_norm": 21.235357659003228,
"learning_rate": 4.779698091854098e-07,
"logits/chosen": -2.5733542442321777,
"logits/rejected": -2.1177892684936523,
"logps/chosen": -241.3948516845703,
"logps/rejected": -2941.85205078125,
"logps_avg/chosen": -0.634663999080658,
"logps_avg/rejected": -9.70314884185791,
"loss": 0.5578,
"losses_ref": -0.03548940271139145,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 675,
"u": -4.454364776611328,
"weight": 0.025668436661362648
},
{
"diff_generated": -29.166423797607422,
"epoch": 1.4237110704004188,
"grad_norm": 9.728306873667183,
"learning_rate": 4.624430747529102e-07,
"logits/chosen": -2.5310111045837402,
"logits/rejected": -2.1089558601379395,
"logps/chosen": -245.45083618164062,
"logps/rejected": -2643.77001953125,
"logps_avg/chosen": -0.6183468699455261,
"logps_avg/rejected": -8.749927520751953,
"loss": 0.5228,
"losses_ref": -0.08980627357959747,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 680,
"u": -4.334953308105469,
"weight": 0.07751224935054779
},
{
"diff_generated": -33.88722610473633,
"epoch": 1.434179534153363,
"grad_norm": 14.616844426526761,
"learning_rate": 4.4709650825889277e-07,
"logits/chosen": -2.460334300994873,
"logits/rejected": -2.0326919555664062,
"logps/chosen": -193.82003784179688,
"logps/rejected": -2947.883544921875,
"logps_avg/chosen": -0.5843343138694763,
"logps_avg/rejected": -10.166168212890625,
"loss": 0.5694,
"losses_ref": -0.03547119349241257,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 685,
"u": -4.463663578033447,
"weight": 0.030837317928671837
},
{
"diff_generated": -30.6027774810791,
"epoch": 1.4446479979063072,
"grad_norm": 11.081953598678401,
"learning_rate": 4.3193525326884426e-07,
"logits/chosen": -2.5122551918029785,
"logits/rejected": -2.0895779132843018,
"logps/chosen": -238.4690704345703,
"logps/rejected": -2627.096435546875,
"logps_avg/chosen": -0.6726236343383789,
"logps_avg/rejected": -9.180832862854004,
"loss": 0.587,
"losses_ref": -0.05756605789065361,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 690,
"u": -4.38980770111084,
"weight": 0.0510624423623085
},
{
"diff_generated": -32.015716552734375,
"epoch": 1.4551164616592516,
"grad_norm": 11.608639050571856,
"learning_rate": 4.1696439123912406e-07,
"logits/chosen": -2.4778366088867188,
"logits/rejected": -2.0454444885253906,
"logps/chosen": -205.8911590576172,
"logps/rejected": -2957.13525390625,
"logps_avg/chosen": -0.6116452217102051,
"logps_avg/rejected": -9.604714393615723,
"loss": 0.5502,
"losses_ref": -0.05736450105905533,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 695,
"u": -4.357982635498047,
"weight": 0.05952075123786926
},
{
"diff_generated": -35.234153747558594,
"epoch": 1.4655849254121958,
"grad_norm": 8.17712308208093,
"learning_rate": 4.0218893981385927e-07,
"logits/chosen": -2.485691547393799,
"logits/rejected": -2.046220064163208,
"logps/chosen": -200.62582397460938,
"logps/rejected": -3101.075439453125,
"logps_avg/chosen": -0.5734541416168213,
"logps_avg/rejected": -10.570245742797852,
"loss": 0.5729,
"losses_ref": -0.028310665860772133,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 700,
"u": -4.45696496963501,
"weight": 0.023738497868180275
},
{
"diff_generated": -35.26641082763672,
"epoch": 1.47605338916514,
"grad_norm": 16.950355166034456,
"learning_rate": 3.87613851143229e-07,
"logits/chosen": -2.494295597076416,
"logits/rejected": -2.00370717048645,
"logps/chosen": -230.57400512695312,
"logps/rejected": -3109.327392578125,
"logps_avg/chosen": -0.6209388971328735,
"logps_avg/rejected": -10.57992172241211,
"loss": 0.5466,
"losses_ref": -0.0546514168381691,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 705,
"u": -4.406841278076172,
"weight": 0.04961226135492325
},
{
"diff_generated": -34.927207946777344,
"epoch": 1.4865218529180844,
"grad_norm": 9.208840009036596,
"learning_rate": 3.7324401022369744e-07,
"logits/chosen": -2.4626827239990234,
"logits/rejected": -1.9565467834472656,
"logps/chosen": -232.802001953125,
"logps/rejected": -3108.4921875,
"logps_avg/chosen": -0.6169513463973999,
"logps_avg/rejected": -10.47816276550293,
"loss": 0.5383,
"losses_ref": -0.051527369767427444,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 710,
"u": -4.410109043121338,
"weight": 0.04789410158991814
},
{
"diff_generated": -31.93350601196289,
"epoch": 1.4969903166710286,
"grad_norm": 8.74366239695945,
"learning_rate": 3.5908423326075455e-07,
"logits/chosen": -2.470921039581299,
"logits/rejected": -2.028719425201416,
"logps/chosen": -197.37814331054688,
"logps/rejected": -2799.31396484375,
"logps_avg/chosen": -0.5950369834899902,
"logps_avg/rejected": -9.580052375793457,
"loss": 0.5627,
"losses_ref": -0.05724947527050972,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 715,
"u": -4.411328315734863,
"weight": 0.047762464731931686
},
{
"diff_generated": -33.14401626586914,
"epoch": 1.5074587804239727,
"grad_norm": 8.842328295664547,
"learning_rate": 3.45139266054715e-07,
"logits/chosen": -2.5109152793884277,
"logits/rejected": -2.010921001434326,
"logps/chosen": -247.7344207763672,
"logps/rejected": -3127.861328125,
"logps_avg/chosen": -0.6326244473457336,
"logps_avg/rejected": -9.943206787109375,
"loss": 0.5529,
"losses_ref": -0.05398111790418625,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 720,
"u": -4.407879829406738,
"weight": 0.051137275993824005
},
{
"diff_generated": -34.548439025878906,
"epoch": 1.5179272441769172,
"grad_norm": 9.975694420372704,
"learning_rate": 3.314137824101111e-07,
"logits/chosen": -2.5249905586242676,
"logits/rejected": -2.0087645053863525,
"logps/chosen": -254.705322265625,
"logps/rejected": -3178.156494140625,
"logps_avg/chosen": -0.6393792033195496,
"logps_avg/rejected": -10.364530563354492,
"loss": 0.5512,
"losses_ref": -0.05713530257344246,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 725,
"u": -4.388852119445801,
"weight": 0.061693333089351654
},
{
"diff_generated": -32.73970413208008,
"epoch": 1.5283957079298613,
"grad_norm": 11.767533184902167,
"learning_rate": 3.179123825692178e-07,
"logits/chosen": -2.47417950630188,
"logits/rejected": -2.016237497329712,
"logps/chosen": -209.87802124023438,
"logps/rejected": -2884.9580078125,
"logps_avg/chosen": -0.5899583101272583,
"logps_avg/rejected": -9.821910858154297,
"loss": 0.5576,
"losses_ref": -0.05416392162442207,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 730,
"u": -4.406733989715576,
"weight": 0.052076805382966995
},
{
"diff_generated": -32.37422561645508,
"epoch": 1.5388641716828055,
"grad_norm": 9.47936945913295,
"learning_rate": 3.0463959167023335e-07,
"logits/chosen": -2.5015838146209717,
"logits/rejected": -2.069798231124878,
"logps/chosen": -217.7288055419922,
"logps/rejected": -2870.407958984375,
"logps_avg/chosen": -0.6165660619735718,
"logps_avg/rejected": -9.712267875671387,
"loss": 0.5285,
"losses_ref": -0.08272585272789001,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 735,
"u": -4.363903999328613,
"weight": 0.07859805971384048
},
{
"diff_generated": -32.28863525390625,
"epoch": 1.54933263543575,
"grad_norm": 9.124308513157976,
"learning_rate": 2.915998582306299e-07,
"logits/chosen": -2.5220367908477783,
"logits/rejected": -2.038191318511963,
"logps/chosen": -229.7245330810547,
"logps/rejected": -2982.073486328125,
"logps_avg/chosen": -0.617731511592865,
"logps_avg/rejected": -9.686590194702148,
"loss": 0.5329,
"losses_ref": -0.05901874229311943,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 740,
"u": -4.457033634185791,
"weight": 0.051646940410137177
},
{
"diff_generated": -31.57918357849121,
"epoch": 1.559801099188694,
"grad_norm": 8.788334428443942,
"learning_rate": 2.7879755265618557e-07,
"logits/chosen": -2.385359287261963,
"logits/rejected": -2.0353574752807617,
"logps/chosen": -191.27542114257812,
"logps/rejected": -2743.20849609375,
"logps_avg/chosen": -0.5724462270736694,
"logps_avg/rejected": -9.473755836486816,
"loss": 0.5301,
"losses_ref": -0.06048304960131645,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 745,
"u": -4.434350967407227,
"weight": 0.051485490053892136
},
{
"diff_generated": -33.260643005371094,
"epoch": 1.5702695629416383,
"grad_norm": 13.597985798817346,
"learning_rate": 2.6623696577619625e-07,
"logits/chosen": -2.498661518096924,
"logits/rejected": -2.070701837539673,
"logps/chosen": -227.7393035888672,
"logps/rejected": -2963.530517578125,
"logps_avg/chosen": -0.6551213264465332,
"logps_avg/rejected": -9.978193283081055,
"loss": 0.5837,
"losses_ref": -0.03624705597758293,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 750,
"u": -4.466372966766357,
"weight": 0.028057094663381577
},
{
"diff_generated": -29.464405059814453,
"epoch": 1.5807380266945825,
"grad_norm": 9.250307778356563,
"learning_rate": 2.5392230740535846e-07,
"logits/chosen": -2.5032472610473633,
"logits/rejected": -2.06776762008667,
"logps/chosen": -251.3708953857422,
"logps/rejected": -2650.0810546875,
"logps_avg/chosen": -0.6423950791358948,
"logps_avg/rejected": -8.839322090148926,
"loss": 0.5765,
"losses_ref": -0.052409954369068146,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 755,
"u": -4.408892631530762,
"weight": 0.05609407275915146
},
{
"diff_generated": -29.876062393188477,
"epoch": 1.5912064904475267,
"grad_norm": 12.686799097235559,
"learning_rate": 2.418577049328058e-07,
"logits/chosen": -2.5676896572113037,
"logits/rejected": -2.1377835273742676,
"logps/chosen": -265.7136535644531,
"logps/rejected": -2646.18896484375,
"logps_avg/chosen": -0.665650486946106,
"logps_avg/rejected": -8.962818145751953,
"loss": 0.5887,
"losses_ref": -0.06443095207214355,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 760,
"u": -4.407828330993652,
"weight": 0.06109876185655594
},
{
"diff_generated": -33.68701171875,
"epoch": 1.6016749542004711,
"grad_norm": 10.274482248605684,
"learning_rate": 2.300472019387697e-07,
"logits/chosen": -2.469991683959961,
"logits/rejected": -2.029064893722534,
"logps/chosen": -220.9040985107422,
"logps/rejected": -3017.740234375,
"logps_avg/chosen": -0.6078630685806274,
"logps_avg/rejected": -10.10610294342041,
"loss": 0.5524,
"losses_ref": -0.04078926518559456,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 765,
"u": -4.417876243591309,
"weight": 0.03704729676246643
},
{
"diff_generated": -31.825037002563477,
"epoch": 1.6121434179534153,
"grad_norm": 11.839464542057028,
"learning_rate": 2.1849475683932994e-07,
"logits/chosen": -2.4939956665039062,
"logits/rejected": -2.1075644493103027,
"logps/chosen": -223.6890869140625,
"logps/rejected": -2828.83447265625,
"logps_avg/chosen": -0.6260048747062683,
"logps_avg/rejected": -9.547511100769043,
"loss": 0.5492,
"losses_ref": -0.05019731447100639,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 770,
"u": -4.3696393966674805,
"weight": 0.05455632880330086
},
{
"diff_generated": -30.594751358032227,
"epoch": 1.6226118817063595,
"grad_norm": 9.146985127674856,
"learning_rate": 2.0720424155971038e-07,
"logits/chosen": -2.4665775299072266,
"logits/rejected": -2.0385656356811523,
"logps/chosen": -238.6437530517578,
"logps/rejected": -2788.4453125,
"logps_avg/chosen": -0.6432589292526245,
"logps_avg/rejected": -9.178424835205078,
"loss": 0.5603,
"losses_ref": -0.060744620859622955,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 775,
"u": -4.3677592277526855,
"weight": 0.054513733834028244
},
{
"diff_generated": -30.54671859741211,
"epoch": 1.633080345459304,
"grad_norm": 12.431506597181475,
"learning_rate": 1.961794402365611e-07,
"logits/chosen": -2.48872971534729,
"logits/rejected": -2.045698404312134,
"logps/chosen": -238.8667755126953,
"logps/rejected": -2746.897705078125,
"logps_avg/chosen": -0.6708707809448242,
"logps_avg/rejected": -9.16401481628418,
"loss": 0.5942,
"losses_ref": -0.043663203716278076,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 780,
"u": -4.429020881652832,
"weight": 0.0392422154545784
},
{
"diff_generated": -30.78244400024414,
"epoch": 1.643548809212248,
"grad_norm": 14.0111361325287,
"learning_rate": 1.8542404794966427e-07,
"logits/chosen": -2.5275959968566895,
"logits/rejected": -2.0743932723999023,
"logps/chosen": -236.8502655029297,
"logps/rejected": -2726.872802734375,
"logps_avg/chosen": -0.6049509644508362,
"logps_avg/rejected": -9.234731674194336,
"loss": 0.5559,
"losses_ref": -0.040397271513938904,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 785,
"u": -4.439243316650391,
"weight": 0.034041326493024826
},
{
"diff_generated": -30.46352767944336,
"epoch": 1.6540172729651923,
"grad_norm": 13.778205091571524,
"learning_rate": 1.7494166948349053e-07,
"logits/chosen": -2.4739108085632324,
"logits/rejected": -2.0248847007751465,
"logps/chosen": -188.06265258789062,
"logps/rejected": -2811.63427734375,
"logps_avg/chosen": -0.58104407787323,
"logps_avg/rejected": -9.139059066772461,
"loss": 0.5279,
"losses_ref": -0.0705099031329155,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 790,
"u": -4.394803047180176,
"weight": 0.06850212812423706
},
{
"diff_generated": -31.430471420288086,
"epoch": 1.6644857367181367,
"grad_norm": 7.6385064901749775,
"learning_rate": 1.6473581811901528e-07,
"logits/chosen": -2.465888500213623,
"logits/rejected": -2.0527515411376953,
"logps/chosen": -210.7668914794922,
"logps/rejected": -2648.2431640625,
"logps_avg/chosen": -0.6304226517677307,
"logps_avg/rejected": -9.429141998291016,
"loss": 0.5656,
"losses_ref": -0.035576872527599335,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 795,
"u": -4.473604679107666,
"weight": 0.025509512051939964
},
{
"diff_generated": -31.38290023803711,
"epoch": 1.674954200471081,
"grad_norm": 10.762504453960963,
"learning_rate": 1.5480991445620538e-07,
"logits/chosen": -2.458466053009033,
"logits/rejected": -2.0299301147460938,
"logps/chosen": -205.1313018798828,
"logps/rejected": -2810.052001953125,
"logps_avg/chosen": -0.5803036093711853,
"logps_avg/rejected": -9.414871215820312,
"loss": 0.5407,
"losses_ref": -0.06857903301715851,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 800,
"u": -4.383907318115234,
"weight": 0.07058969140052795
},
{
"diff_generated": -32.339012145996094,
"epoch": 1.685422664224025,
"grad_norm": 12.623391530366172,
"learning_rate": 1.4516728526756873e-07,
"logits/chosen": -2.4743473529815674,
"logits/rejected": -2.0498290061950684,
"logps/chosen": -213.2050018310547,
"logps/rejected": -2888.50927734375,
"logps_avg/chosen": -0.5934925079345703,
"logps_avg/rejected": -9.701704025268555,
"loss": 0.5501,
"losses_ref": -0.061614394187927246,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 805,
"u": -4.301063537597656,
"weight": 0.06795644760131836
},
{
"diff_generated": -29.015087127685547,
"epoch": 1.6958911279769695,
"grad_norm": 17.58977680719491,
"learning_rate": 1.3581116238315194e-07,
"logits/chosen": -2.4904446601867676,
"logits/rejected": -2.050494909286499,
"logps/chosen": -245.46176147460938,
"logps/rejected": -2670.2060546875,
"logps_avg/chosen": -0.6670945882797241,
"logps_avg/rejected": -8.704526901245117,
"loss": 0.5769,
"losses_ref": -0.05934567004442215,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 810,
"u": -4.374115943908691,
"weight": 0.05290456861257553
},
{
"diff_generated": -31.670734405517578,
"epoch": 1.7063595917299135,
"grad_norm": 20.41492239134003,
"learning_rate": 1.2674468160735586e-07,
"logits/chosen": -2.5279009342193604,
"logits/rejected": -2.089564800262451,
"logps/chosen": -219.30712890625,
"logps/rejected": -2705.98193359375,
"logps_avg/chosen": -0.6055987477302551,
"logps_avg/rejected": -9.501219749450684,
"loss": 0.5913,
"losses_ref": -0.04426007717847824,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 815,
"u": -4.457495212554932,
"weight": 0.04348568618297577
},
{
"diff_generated": -32.43749237060547,
"epoch": 1.7168280554828579,
"grad_norm": 8.725588658168348,
"learning_rate": 1.1797088166794e-07,
"logits/chosen": -2.479827880859375,
"logits/rejected": -2.0322813987731934,
"logps/chosen": -209.2858428955078,
"logps/rejected": -2927.29150390625,
"logps_avg/chosen": -0.5941019058227539,
"logps_avg/rejected": -9.731245994567871,
"loss": 0.5891,
"losses_ref": -0.03500083088874817,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 820,
"u": -4.411776065826416,
"weight": 0.028576117008924484
},
{
"diff_generated": -29.760284423828125,
"epoch": 1.7272965192358023,
"grad_norm": 7.224696592212977,
"learning_rate": 1.0949270319755766e-07,
"logits/chosen": -2.5083603858947754,
"logits/rejected": -2.0863795280456543,
"logps/chosen": -206.98812866210938,
"logps/rejected": -2673.796875,
"logps_avg/chosen": -0.5425812005996704,
"logps_avg/rejected": -8.928085327148438,
"loss": 0.5471,
"losses_ref": -0.040049560368061066,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 825,
"u": -4.422328472137451,
"weight": 0.03419359400868416
},
{
"diff_generated": -30.241125106811523,
"epoch": 1.7377649829887463,
"grad_norm": 11.359999539925766,
"learning_rate": 1.013129877481741e-07,
"logits/chosen": -2.4465301036834717,
"logits/rejected": -2.0786962509155273,
"logps/chosen": -251.66110229492188,
"logps/rejected": -2615.54248046875,
"logps_avg/chosen": -0.6354495286941528,
"logps_avg/rejected": -9.07233715057373,
"loss": 0.5595,
"losses_ref": -0.038409143686294556,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 830,
"u": -4.411218643188477,
"weight": 0.03372519463300705
},
{
"diff_generated": -31.309673309326172,
"epoch": 1.7482334467416907,
"grad_norm": 10.689212774701963,
"learning_rate": 9.343447683868799e-08,
"logits/chosen": -2.459969997406006,
"logits/rejected": -2.0669496059417725,
"logps/chosen": -197.42056274414062,
"logps/rejected": -2780.952392578125,
"logps_avg/chosen": -0.5673859715461731,
"logps_avg/rejected": -9.392901420593262,
"loss": 0.5517,
"losses_ref": -0.03770770505070686,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 835,
"u": -4.412764549255371,
"weight": 0.03573904559016228
},
{
"diff_generated": -30.0009765625,
"epoch": 1.7587019104946349,
"grad_norm": 13.800508017129163,
"learning_rate": 8.585981103608342e-08,
"logits/chosen": -2.48380184173584,
"logits/rejected": -2.0376243591308594,
"logps/chosen": -247.1182861328125,
"logps/rejected": -2758.78857421875,
"logps_avg/chosen": -0.6514982581138611,
"logps_avg/rejected": -9.000292778015137,
"loss": 0.5682,
"losses_ref": -0.04732600972056389,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 840,
"u": -4.44762659072876,
"weight": 0.04196245223283768
},
{
"diff_generated": -30.065624237060547,
"epoch": 1.769170374247579,
"grad_norm": 13.143185887862547,
"learning_rate": 7.859152907041544e-08,
"logits/chosen": -2.4641730785369873,
"logits/rejected": -2.0567100048065186,
"logps/chosen": -236.99148559570312,
"logps/rejected": -2573.870849609375,
"logps_avg/chosen": -0.6164765357971191,
"logps_avg/rejected": -9.019688606262207,
"loss": 0.5526,
"losses_ref": -0.05898575857281685,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 845,
"u": -4.398618698120117,
"weight": 0.060839347541332245
},
{
"diff_generated": -30.66835594177246,
"epoch": 1.7796388380005235,
"grad_norm": 17.88344708080126,
"learning_rate": 7.163206698392742e-08,
"logits/chosen": -2.4754815101623535,
"logits/rejected": -2.077538251876831,
"logps/chosen": -222.5938262939453,
"logps/rejected": -2694.906494140625,
"logps_avg/chosen": -0.6013268232345581,
"logps_avg/rejected": -9.200507164001465,
"loss": 0.5739,
"losses_ref": -0.05739979073405266,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 850,
"u": -4.392508506774902,
"weight": 0.04965168982744217
},
{
"diff_generated": -29.592029571533203,
"epoch": 1.7901073017534677,
"grad_norm": 13.06278922990348,
"learning_rate": 6.498375731458527e-08,
"logits/chosen": -2.514953136444092,
"logits/rejected": -2.096156597137451,
"logps/chosen": -233.39132690429688,
"logps/rejected": -2654.203857421875,
"logps_avg/chosen": -0.6016189455986023,
"logps_avg/rejected": -8.877609252929688,
"loss": 0.5566,
"losses_ref": -0.04416666924953461,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 855,
"u": -4.415837287902832,
"weight": 0.03559427708387375
},
{
"diff_generated": -32.24101638793945,
"epoch": 1.8005757655064119,
"grad_norm": 12.360220474861023,
"learning_rate": 5.8648828314302735e-08,
"logits/chosen": -2.4461560249328613,
"logits/rejected": -2.015535354614258,
"logps/chosen": -225.93533325195312,
"logps/rejected": -2782.87255859375,
"logps_avg/chosen": -0.5964374542236328,
"logps_avg/rejected": -9.6723051071167,
"loss": 0.5666,
"losses_ref": -0.03670288249850273,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 860,
"u": -4.398609161376953,
"weight": 0.03543057292699814
},
{
"diff_generated": -31.262531280517578,
"epoch": 1.8110442292593563,
"grad_norm": 16.164691771356388,
"learning_rate": 5.2629403202119505e-08,
"logits/chosen": -2.4537065029144287,
"logits/rejected": -2.062150716781616,
"logps/chosen": -204.52587890625,
"logps/rejected": -2741.170654296875,
"logps_avg/chosen": -0.5822928547859192,
"logps_avg/rejected": -9.378759384155273,
"loss": 0.5402,
"losses_ref": -0.03764919191598892,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 865,
"u": -4.42946720123291,
"weight": 0.031336475163698196
},
{
"diff_generated": -30.04671859741211,
"epoch": 1.8215126930123005,
"grad_norm": 10.013135246955365,
"learning_rate": 4.692749945258057e-08,
"logits/chosen": -2.4766173362731934,
"logits/rejected": -2.0611166954040527,
"logps/chosen": -236.82284545898438,
"logps/rejected": -2744.845458984375,
"logps_avg/chosen": -0.6182196736335754,
"logps_avg/rejected": -9.014015197753906,
"loss": 0.5905,
"losses_ref": -0.07179991900920868,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 870,
"u": -4.407280921936035,
"weight": 0.07166210561990738
},
{
"diff_generated": -32.90989303588867,
"epoch": 1.8319811567652446,
"grad_norm": 9.513246816083905,
"learning_rate": 4.1545028119559066e-08,
"logits/chosen": -2.4886152744293213,
"logits/rejected": -2.066333770751953,
"logps/chosen": -223.5939483642578,
"logps/rejected": -2896.932373046875,
"logps_avg/chosen": -0.6256131529808044,
"logps_avg/rejected": -9.872968673706055,
"loss": 0.5458,
"losses_ref": -0.0590200200676918,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 875,
"u": -4.408657073974609,
"weight": 0.056226253509521484
},
{
"diff_generated": -31.053930282592773,
"epoch": 1.842449620518189,
"grad_norm": 94.13052968470578,
"learning_rate": 3.648379319574568e-08,
"logits/chosen": -2.528390407562256,
"logits/rejected": -2.073420524597168,
"logps/chosen": -222.608642578125,
"logps/rejected": -2745.4130859375,
"logps_avg/chosen": -0.6137613654136658,
"logps_avg/rejected": -9.316179275512695,
"loss": 0.5237,
"losses_ref": -0.06711964309215546,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 880,
"u": -4.3957600593566895,
"weight": 0.06412933766841888
},
{
"diff_generated": -30.842365264892578,
"epoch": 1.8529180842711332,
"grad_norm": 8.312877021027528,
"learning_rate": 3.17454910080216e-08,
"logits/chosen": -2.5333809852600098,
"logits/rejected": -2.1170499324798584,
"logps/chosen": -253.5600128173828,
"logps/rejected": -2778.802001953125,
"logps_avg/chosen": -0.6801126599311829,
"logps_avg/rejected": -9.25270938873291,
"loss": 0.5709,
"losses_ref": -0.0633564293384552,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 885,
"u": -4.409802436828613,
"weight": 0.05907650664448738
},
{
"diff_generated": -30.68337631225586,
"epoch": 1.8633865480240774,
"grad_norm": 8.793473948703046,
"learning_rate": 2.733170964891607e-08,
"logits/chosen": -2.46742582321167,
"logits/rejected": -2.0830397605895996,
"logps/chosen": -204.62625122070312,
"logps/rejected": -2726.16552734375,
"logps_avg/chosen": -0.5727981328964233,
"logps_avg/rejected": -9.205012321472168,
"loss": 0.5596,
"losses_ref": -0.05169714242219925,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 890,
"u": -4.437578201293945,
"weight": 0.043870192021131516
},
{
"diff_generated": -30.3818302154541,
"epoch": 1.8738550117770219,
"grad_norm": 6.648166332075938,
"learning_rate": 2.324392844434042e-08,
"logits/chosen": -2.491211414337158,
"logits/rejected": -2.0470757484436035,
"logps/chosen": -229.8271026611328,
"logps/rejected": -2785.03076171875,
"logps_avg/chosen": -0.6076307892799377,
"logps_avg/rejected": -9.11454963684082,
"loss": 0.5638,
"losses_ref": -0.032108329236507416,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 895,
"u": -4.453085899353027,
"weight": 0.02507254108786583
},
{
"diff_generated": -31.61887550354004,
"epoch": 1.8843234755299658,
"grad_norm": 18.51567409544646,
"learning_rate": 1.9483517457776434e-08,
"logits/chosen": -2.4359021186828613,
"logits/rejected": -2.096619129180908,
"logps/chosen": -188.21896362304688,
"logps/rejected": -2806.19921875,
"logps_avg/chosen": -0.5758072733879089,
"logps_avg/rejected": -9.485663414001465,
"loss": 0.5343,
"losses_ref": -0.08278501033782959,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 900,
"u": -4.343811988830566,
"weight": 0.08570453524589539
},
{
"diff_generated": -31.200836181640625,
"epoch": 1.8947919392829102,
"grad_norm": 16.452754098885247,
"learning_rate": 1.6051737031084533e-08,
"logits/chosen": -2.453563690185547,
"logits/rejected": -2.0280988216400146,
"logps/chosen": -214.77395629882812,
"logps/rejected": -2817.1669921875,
"logps_avg/chosen": -0.5827924013137817,
"logps_avg/rejected": -9.360250473022461,
"loss": 0.5565,
"losses_ref": -0.0487370602786541,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 905,
"u": -4.410248756408691,
"weight": 0.04455076903104782
},
{
"diff_generated": -30.942846298217773,
"epoch": 1.9052604030358546,
"grad_norm": 20.55170462638644,
"learning_rate": 1.2949737362087154e-08,
"logits/chosen": -2.467200756072998,
"logits/rejected": -2.096820831298828,
"logps/chosen": -206.9503936767578,
"logps/rejected": -2817.215087890625,
"logps_avg/chosen": -0.6169668436050415,
"logps_avg/rejected": -9.282854080200195,
"loss": 0.5886,
"losses_ref": -0.0511205717921257,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 910,
"u": -4.404674053192139,
"weight": 0.04643367975950241
},
{
"diff_generated": -30.169301986694336,
"epoch": 1.9157288667887986,
"grad_norm": 8.053020444587133,
"learning_rate": 1.0178558119067315e-08,
"logits/chosen": -2.4181623458862305,
"logits/rejected": -2.028630018234253,
"logps/chosen": -212.6619873046875,
"logps/rejected": -2651.956787109375,
"logps_avg/chosen": -0.5928919315338135,
"logps_avg/rejected": -9.050790786743164,
"loss": 0.5551,
"losses_ref": -0.05854606628417969,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 915,
"u": -4.413174629211426,
"weight": 0.0472232848405838
},
{
"diff_generated": -32.18278121948242,
"epoch": 1.926197330541743,
"grad_norm": 24.431507328322112,
"learning_rate": 7.739128092312918e-09,
"logits/chosen": -2.4973015785217285,
"logits/rejected": -2.0860588550567627,
"logps/chosen": -216.73666381835938,
"logps/rejected": -2769.303955078125,
"logps_avg/chosen": -0.6046438813209534,
"logps_avg/rejected": -9.654834747314453,
"loss": 0.5467,
"losses_ref": -0.06063861399888992,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 920,
"u": -4.384333610534668,
"weight": 0.07002799212932587
},
{
"diff_generated": -33.56671142578125,
"epoch": 1.9366657942946872,
"grad_norm": 12.304529486565588,
"learning_rate": 5.632264882822757e-09,
"logits/chosen": -2.499455451965332,
"logits/rejected": -2.059584140777588,
"logps/chosen": -228.59640502929688,
"logps/rejected": -2900.51123046875,
"logps_avg/chosen": -0.6097213625907898,
"logps_avg/rejected": -10.070013046264648,
"loss": 0.5799,
"losses_ref": -0.0342455692589283,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 925,
"u": -4.438467979431152,
"weight": 0.030456313863396645
},
{
"diff_generated": -30.743816375732422,
"epoch": 1.9471342580476314,
"grad_norm": 11.65318893393544,
"learning_rate": 3.858674628278824e-09,
"logits/chosen": -2.4831936359405518,
"logits/rejected": -2.0906691551208496,
"logps/chosen": -230.875,
"logps/rejected": -2670.49755859375,
"logps_avg/chosen": -0.603253960609436,
"logps_avg/rejected": -9.223145484924316,
"loss": 0.5642,
"losses_ref": -0.05138419196009636,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 930,
"u": -4.435812473297119,
"weight": 0.0458533950150013
},
{
"diff_generated": -31.753076553344727,
"epoch": 1.9576027218005758,
"grad_norm": 16.58166205034555,
"learning_rate": 2.418951766376742e-09,
"logits/chosen": -2.4695091247558594,
"logits/rejected": -2.0497422218322754,
"logps/chosen": -205.1109619140625,
"logps/rejected": -2825.771484375,
"logps_avg/chosen": -0.5685989260673523,
"logps_avg/rejected": -9.525922775268555,
"loss": 0.554,
"losses_ref": -0.05179325491189957,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 935,
"u": -4.395134925842285,
"weight": 0.04630660265684128
},
{
"diff_generated": -31.87947654724121,
"epoch": 1.96807118555352,
"grad_norm": 26.35143781539668,
"learning_rate": 1.313578835593465e-09,
"logits/chosen": -2.4483304023742676,
"logits/rejected": -2.004983425140381,
"logps/chosen": -241.7947998046875,
"logps/rejected": -2828.03173828125,
"logps_avg/chosen": -0.6296852827072144,
"logps_avg/rejected": -9.5638427734375,
"loss": 0.5603,
"losses_ref": -0.03613152354955673,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 940,
"u": -4.452606678009033,
"weight": 0.026990771293640137
},
{
"diff_generated": -29.87912940979004,
"epoch": 1.9785396493064642,
"grad_norm": 15.814334066391242,
"learning_rate": 5.429263134594242e-10,
"logits/chosen": -2.4958741664886475,
"logits/rejected": -2.101313591003418,
"logps/chosen": -207.99179077148438,
"logps/rejected": -2708.303466796875,
"logps_avg/chosen": -0.5728383660316467,
"logps_avg/rejected": -8.963739395141602,
"loss": 0.5538,
"losses_ref": -0.06295718252658844,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 945,
"u": -4.400644302368164,
"weight": 0.05401432514190674
},
{
"diff_generated": -31.941226959228516,
"epoch": 1.9890081130594086,
"grad_norm": 8.580957108117007,
"learning_rate": 1.0725249238940915e-10,
"logits/chosen": -2.4698963165283203,
"logits/rejected": -2.0529587268829346,
"logps/chosen": -231.325927734375,
"logps/rejected": -2804.859619140625,
"logps_avg/chosen": -0.6270388960838318,
"logps_avg/rejected": -9.582367897033691,
"loss": 0.5563,
"losses_ref": -0.029423978179693222,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 950,
"u": -4.438694477081299,
"weight": 0.025990551337599754
}
],
"logging_steps": 5,
"max_steps": 954,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}