{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5808212812917465, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005808212812917465, "grad_norm": 306.3320007324219, "learning_rate": 4.998547356188263e-06, "logits/chosen": -0.7514113187789917, "logits/rejected": -0.6686298251152039, "logps/chosen": -75.72093200683594, "logps/rejected": -73.8106918334961, "loss": 13.8629, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.001161642562583493, "grad_norm": 301.2439270019531, "learning_rate": 4.997094712376526e-06, "logits/chosen": -0.794822096824646, "logits/rejected": -0.7371929287910461, "logps/chosen": -72.30989074707031, "logps/rejected": -67.51399993896484, "loss": 13.9577, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.0054689692333340645, "rewards/margins": -0.009410643950104713, "rewards/rejected": 0.003941674716770649, "step": 2 }, { "epoch": 0.0017424638438752395, "grad_norm": 296.5701904296875, "learning_rate": 4.995642068564789e-06, "logits/chosen": -0.8363990783691406, "logits/rejected": -0.8187875747680664, "logps/chosen": -71.92262268066406, "logps/rejected": -72.27050018310547, "loss": 13.7847, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0027333260513842106, "rewards/margins": 0.008102846331894398, "rewards/rejected": -0.0053695198148489, "step": 3 }, { "epoch": 0.002323285125166986, "grad_norm": 345.2494201660156, "learning_rate": 4.9941894247530506e-06, "logits/chosen": -0.7175111174583435, "logits/rejected": -0.7101837396621704, "logps/chosen": -77.45024108886719, "logps/rejected": -76.15581512451172, "loss": 13.9709, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.01066683791577816, "rewards/margins": -0.01057196594774723, "rewards/rejected": -9.487159695709124e-05, "step": 4 }, { "epoch": 0.0029041064064587326, "grad_norm": 295.31768798828125, "learning_rate": 4.992736780941313e-06, "logits/chosen": -0.7365175485610962, "logits/rejected": -0.7447739839553833, "logps/chosen": -76.33888244628906, "logps/rejected": -61.07477951049805, "loss": 13.8975, "rewards/accuracies": 0.5, "rewards/chosen": 0.0010636046063154936, "rewards/margins": -0.0031540922354906797, "rewards/rejected": 0.004217695910483599, "step": 5 }, { "epoch": 0.003484927687750479, "grad_norm": 312.8099060058594, "learning_rate": 4.991284137129576e-06, "logits/chosen": -0.875682532787323, "logits/rejected": -0.8198660016059875, "logps/chosen": -79.96182250976562, "logps/rejected": -77.87804412841797, "loss": 13.6993, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.01062581967562437, "rewards/margins": 0.016683798283338547, "rewards/rejected": -0.0060579776763916016, "step": 6 }, { "epoch": 0.004065748969042226, "grad_norm": 317.39288330078125, "learning_rate": 4.989831493317839e-06, "logits/chosen": -0.6050316095352173, "logits/rejected": -0.6816262602806091, "logps/chosen": -70.26258850097656, "logps/rejected": -75.84834289550781, "loss": 13.8254, "rewards/accuracies": 0.5, "rewards/chosen": 0.010981644503772259, "rewards/margins": 0.003875770838931203, "rewards/rejected": 0.007105874828994274, "step": 7 }, { "epoch": 0.004646570250333972, "grad_norm": 321.52996826171875, "learning_rate": 4.9883788495061015e-06, "logits/chosen": -0.8232254981994629, "logits/rejected": -0.7795180082321167, "logps/chosen": -72.38011169433594, "logps/rejected": -67.78025817871094, "loss": 13.6941, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.017304277047514915, "rewards/margins": 0.01702544279396534, "rewards/rejected": 0.00027883489383384585, "step": 8 }, { "epoch": 0.005227391531625719, "grad_norm": 311.1319885253906, "learning_rate": 4.986926205694364e-06, "logits/chosen": -0.7957251667976379, "logits/rejected": -0.748576819896698, "logps/chosen": -76.44227600097656, "logps/rejected": -72.32237243652344, "loss": 13.8266, "rewards/accuracies": 0.5, "rewards/chosen": 0.007577553391456604, "rewards/margins": 0.003815555479377508, "rewards/rejected": 0.0037619969807565212, "step": 9 }, { "epoch": 0.005808212812917465, "grad_norm": 290.04388427734375, "learning_rate": 4.985473561882627e-06, "logits/chosen": -0.851279079914093, "logits/rejected": -0.8175627589225769, "logps/chosen": -62.6363639831543, "logps/rejected": -66.80535125732422, "loss": 13.8568, "rewards/accuracies": 0.5, "rewards/chosen": 0.001977672567591071, "rewards/margins": 0.0007607266306877136, "rewards/rejected": 0.0012169458204880357, "step": 10 }, { "epoch": 0.006389034094209212, "grad_norm": 477.3677673339844, "learning_rate": 4.984020918070889e-06, "logits/chosen": -0.7694743871688843, "logits/rejected": -0.7370525598526001, "logps/chosen": -68.05220794677734, "logps/rejected": -73.65959167480469, "loss": 13.9053, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.0020100977271795273, "rewards/margins": -0.004024811089038849, "rewards/rejected": 0.002014713129028678, "step": 11 }, { "epoch": 0.006969855375500958, "grad_norm": 339.96746826171875, "learning_rate": 4.982568274259152e-06, "logits/chosen": -0.4584922194480896, "logits/rejected": -0.4653104245662689, "logps/chosen": -68.20719146728516, "logps/rejected": -76.61314392089844, "loss": 13.789, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.010751953348517418, "rewards/margins": 0.007597408257424831, "rewards/rejected": 0.003154544625431299, "step": 12 }, { "epoch": 0.0075506766567927045, "grad_norm": 307.5588684082031, "learning_rate": 4.9811156304474144e-06, "logits/chosen": -0.6956412196159363, "logits/rejected": -0.5891402959823608, "logps/chosen": -66.07670593261719, "logps/rejected": -77.52650451660156, "loss": 13.8012, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.0009560681646689773, "rewards/margins": 0.006629952695220709, "rewards/rejected": -0.0075860219076275826, "step": 13 }, { "epoch": 0.008131497938084452, "grad_norm": 324.5359802246094, "learning_rate": 4.979662986635677e-06, "logits/chosen": -0.5961264371871948, "logits/rejected": -0.6395691633224487, "logps/chosen": -76.06859588623047, "logps/rejected": -75.65780639648438, "loss": 13.8604, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.0018664164235815406, "rewards/margins": 0.0006155198207125068, "rewards/rejected": -0.0024819376412779093, "step": 14 }, { "epoch": 0.008712319219376197, "grad_norm": 321.2744445800781, "learning_rate": 4.97821034282394e-06, "logits/chosen": -0.609241247177124, "logits/rejected": -0.6541947722434998, "logps/chosen": -72.74789428710938, "logps/rejected": -78.14617156982422, "loss": 13.6613, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.007168769836425781, "rewards/margins": 0.020653218030929565, "rewards/rejected": -0.013484450057148933, "step": 15 }, { "epoch": 0.009293140500667945, "grad_norm": 321.91412353515625, "learning_rate": 4.976757699012203e-06, "logits/chosen": -0.8816198110580444, "logits/rejected": -1.0502017736434937, "logps/chosen": -81.1220474243164, "logps/rejected": -78.71932220458984, "loss": 14.1071, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.012946806848049164, "rewards/margins": -0.023911792784929276, "rewards/rejected": 0.010964984074234962, "step": 16 }, { "epoch": 0.00987396178195969, "grad_norm": 285.6275939941406, "learning_rate": 4.9753050552004654e-06, "logits/chosen": -0.7112148404121399, "logits/rejected": -0.6412473917007446, "logps/chosen": -65.52027893066406, "logps/rejected": -71.49274444580078, "loss": 13.7638, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.007086105644702911, "rewards/margins": 0.010165892541408539, "rewards/rejected": -0.00307978643104434, "step": 17 }, { "epoch": 0.010454783063251438, "grad_norm": 328.0557556152344, "learning_rate": 4.973852411388727e-06, "logits/chosen": -0.756773829460144, "logits/rejected": -0.853185772895813, "logps/chosen": -71.16859436035156, "logps/rejected": -70.41301727294922, "loss": 13.7103, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.006009445525705814, "rewards/margins": 0.016000624746084213, "rewards/rejected": -0.022010069340467453, "step": 18 }, { "epoch": 0.011035604344543185, "grad_norm": 286.8959045410156, "learning_rate": 4.97239976757699e-06, "logits/chosen": -0.7466567754745483, "logits/rejected": -0.8737386465072632, "logps/chosen": -70.04942321777344, "logps/rejected": -69.66856384277344, "loss": 13.9803, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.00499460194259882, "rewards/margins": -0.011114511638879776, "rewards/rejected": 0.006119909696280956, "step": 19 }, { "epoch": 0.01161642562583493, "grad_norm": 322.2734069824219, "learning_rate": 4.970947123765253e-06, "logits/chosen": -0.7975467443466187, "logits/rejected": -0.9792510271072388, "logps/chosen": -76.38298034667969, "logps/rejected": -82.6656265258789, "loss": 13.9642, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.015522710978984833, "rewards/margins": -0.009529724717140198, "rewards/rejected": -0.005992984864860773, "step": 20 }, { "epoch": 0.012197246907126678, "grad_norm": 316.05517578125, "learning_rate": 4.9694944799535164e-06, "logits/chosen": -0.9200956225395203, "logits/rejected": -0.8602321743965149, "logps/chosen": -69.63390350341797, "logps/rejected": -80.1779556274414, "loss": 13.8359, "rewards/accuracies": 0.5, "rewards/chosen": -0.017169209197163582, "rewards/margins": 0.004035423509776592, "rewards/rejected": -0.0212046317756176, "step": 21 }, { "epoch": 0.012778068188418423, "grad_norm": 335.1014404296875, "learning_rate": 4.968041836141778e-06, "logits/chosen": -0.787733256816864, "logits/rejected": -0.7936286926269531, "logps/chosen": -79.13917541503906, "logps/rejected": -66.25904083251953, "loss": 14.2123, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.030913371592760086, "rewards/margins": -0.03371546417474747, "rewards/rejected": 0.0028020956087857485, "step": 22 }, { "epoch": 0.01335888946971017, "grad_norm": 294.2160339355469, "learning_rate": 4.966589192330041e-06, "logits/chosen": -0.6786571741104126, "logits/rejected": -0.7551315426826477, "logps/chosen": -69.29032897949219, "logps/rejected": -69.82914733886719, "loss": 13.8587, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.006731729954481125, "rewards/margins": 0.0010073954472318292, "rewards/rejected": -0.00773912388831377, "step": 23 }, { "epoch": 0.013939710751001916, "grad_norm": 323.32000732421875, "learning_rate": 4.965136548518304e-06, "logits/chosen": -0.9059945940971375, "logits/rejected": -0.7469512224197388, "logps/chosen": -78.02278137207031, "logps/rejected": -66.72486877441406, "loss": 13.9736, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.030265387147665024, "rewards/margins": -0.009611329063773155, "rewards/rejected": -0.02065405808389187, "step": 24 }, { "epoch": 0.014520532032293663, "grad_norm": 333.2950744628906, "learning_rate": 4.963683904706567e-06, "logits/chosen": -0.8123146891593933, "logits/rejected": -0.7186424136161804, "logps/chosen": -76.54938507080078, "logps/rejected": -67.63246154785156, "loss": 13.8885, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.013149453327059746, "rewards/margins": -0.0020573907531797886, "rewards/rejected": -0.01109206210821867, "step": 25 }, { "epoch": 0.015101353313585409, "grad_norm": 357.38775634765625, "learning_rate": 4.962231260894829e-06, "logits/chosen": -0.8334075808525085, "logits/rejected": -0.8764799237251282, "logps/chosen": -77.63631439208984, "logps/rejected": -77.82304382324219, "loss": 14.058, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.015404301695525646, "rewards/margins": -0.01881382428109646, "rewards/rejected": 0.0034095190931111574, "step": 26 }, { "epoch": 0.015682174594877155, "grad_norm": 422.6893615722656, "learning_rate": 4.960778617083092e-06, "logits/chosen": -0.9417294263839722, "logits/rejected": -1.0054762363433838, "logps/chosen": -73.45500183105469, "logps/rejected": -71.62086486816406, "loss": 13.9493, "rewards/accuracies": 0.5, "rewards/chosen": -0.02026461809873581, "rewards/margins": -0.007518300320953131, "rewards/rejected": -0.012746316380798817, "step": 27 }, { "epoch": 0.016262995876168904, "grad_norm": 302.3515625, "learning_rate": 4.959325973271355e-06, "logits/chosen": -0.7729172110557556, "logits/rejected": -0.9286600947380066, "logps/chosen": -74.0387191772461, "logps/rejected": -74.87562561035156, "loss": 13.7489, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.001342860283330083, "rewards/margins": 0.012399435043334961, "rewards/rejected": -0.0137422950938344, "step": 28 }, { "epoch": 0.01684381715746065, "grad_norm": 311.303466796875, "learning_rate": 4.957873329459617e-06, "logits/chosen": -0.7707468271255493, "logits/rejected": -0.7816058993339539, "logps/chosen": -80.28218078613281, "logps/rejected": -76.27729034423828, "loss": 13.7301, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.017700577154755592, "rewards/margins": 0.01469388883560896, "rewards/rejected": -0.03239446505904198, "step": 29 }, { "epoch": 0.017424638438752395, "grad_norm": 272.25091552734375, "learning_rate": 4.9564206856478795e-06, "logits/chosen": -0.9470396041870117, "logits/rejected": -1.0160866975784302, "logps/chosen": -63.82170867919922, "logps/rejected": -67.61204528808594, "loss": 13.7505, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0032949731685221195, "rewards/margins": 0.011799006722867489, "rewards/rejected": -0.008504033088684082, "step": 30 }, { "epoch": 0.018005459720044144, "grad_norm": 320.925048828125, "learning_rate": 4.954968041836142e-06, "logits/chosen": -0.8708783388137817, "logits/rejected": -0.8632427453994751, "logps/chosen": -73.67878723144531, "logps/rejected": -74.83086395263672, "loss": 13.6099, "rewards/accuracies": 0.75, "rewards/chosen": -0.002552690450102091, "rewards/margins": 0.026570502668619156, "rewards/rejected": -0.02912319265305996, "step": 31 }, { "epoch": 0.01858628100133589, "grad_norm": 303.17852783203125, "learning_rate": 4.953515398024405e-06, "logits/chosen": -0.7567359805107117, "logits/rejected": -0.8445581197738647, "logps/chosen": -66.88877868652344, "logps/rejected": -71.22685241699219, "loss": 13.845, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.01957935281097889, "rewards/margins": 0.0028636164497584105, "rewards/rejected": -0.022442970424890518, "step": 32 }, { "epoch": 0.019167102282627635, "grad_norm": 317.728515625, "learning_rate": 4.952062754212668e-06, "logits/chosen": -0.917065441608429, "logits/rejected": -0.8019243478775024, "logps/chosen": -72.54550170898438, "logps/rejected": -81.64886474609375, "loss": 13.6166, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.011156129650771618, "rewards/margins": 0.025838393718004227, "rewards/rejected": -0.03699452430009842, "step": 33 }, { "epoch": 0.01974792356391938, "grad_norm": 295.3751525878906, "learning_rate": 4.9506101104009305e-06, "logits/chosen": -0.8145904541015625, "logits/rejected": -0.7906870245933533, "logps/chosen": -64.89479064941406, "logps/rejected": -76.6470947265625, "loss": 13.7107, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.01140755694359541, "rewards/margins": 0.017068836838006973, "rewards/rejected": -0.02847639098763466, "step": 34 }, { "epoch": 0.02032874484521113, "grad_norm": 305.5505676269531, "learning_rate": 4.949157466589193e-06, "logits/chosen": -0.7340711355209351, "logits/rejected": -0.7537750005722046, "logps/chosen": -75.50181579589844, "logps/rejected": -70.16544342041016, "loss": 14.1136, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.028647294268012047, "rewards/margins": -0.023600969463586807, "rewards/rejected": -0.005046320613473654, "step": 35 }, { "epoch": 0.020909566126502875, "grad_norm": 315.96038818359375, "learning_rate": 4.947704822777455e-06, "logits/chosen": -0.8396091461181641, "logits/rejected": -0.9497518539428711, "logps/chosen": -72.89092254638672, "logps/rejected": -83.45375061035156, "loss": 13.8082, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.02720179595053196, "rewards/margins": 0.006764040794223547, "rewards/rejected": -0.033965837210416794, "step": 36 }, { "epoch": 0.02149038740779462, "grad_norm": 320.8994445800781, "learning_rate": 4.946252178965718e-06, "logits/chosen": -0.7043382525444031, "logits/rejected": -0.8497918248176575, "logps/chosen": -80.10662841796875, "logps/rejected": -80.18870544433594, "loss": 13.8802, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.04296343773603439, "rewards/margins": 0.0006133742863312364, "rewards/rejected": -0.043576814234256744, "step": 37 }, { "epoch": 0.02207120868908637, "grad_norm": 323.31488037109375, "learning_rate": 4.944799535153981e-06, "logits/chosen": -0.9256412386894226, "logits/rejected": -0.9272140264511108, "logps/chosen": -79.0914306640625, "logps/rejected": -79.18199157714844, "loss": 13.6203, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.016189271584153175, "rewards/margins": 0.02659023180603981, "rewards/rejected": -0.042779501527547836, "step": 38 }, { "epoch": 0.022652029970378115, "grad_norm": 309.1510314941406, "learning_rate": 4.943346891342243e-06, "logits/chosen": -0.7942522168159485, "logits/rejected": -0.9100838899612427, "logps/chosen": -78.5674057006836, "logps/rejected": -69.88294982910156, "loss": 13.854, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.028528030961751938, "rewards/margins": 0.0029734233394265175, "rewards/rejected": -0.03150145336985588, "step": 39 }, { "epoch": 0.02323285125166986, "grad_norm": 326.2637023925781, "learning_rate": 4.941894247530506e-06, "logits/chosen": -0.8673677444458008, "logits/rejected": -0.8385285139083862, "logps/chosen": -85.15605926513672, "logps/rejected": -84.39887237548828, "loss": 13.801, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.04420323297381401, "rewards/margins": 0.006766452454030514, "rewards/rejected": -0.0509696826338768, "step": 40 }, { "epoch": 0.023813672532961606, "grad_norm": 309.16204833984375, "learning_rate": 4.940441603718769e-06, "logits/chosen": -0.8228281736373901, "logits/rejected": -0.8817359209060669, "logps/chosen": -76.8722915649414, "logps/rejected": -74.45531463623047, "loss": 13.6428, "rewards/accuracies": 0.5, "rewards/chosen": -0.009386795572936535, "rewards/margins": 0.024640636518597603, "rewards/rejected": -0.03402743488550186, "step": 41 }, { "epoch": 0.024394493814253355, "grad_norm": 322.296630859375, "learning_rate": 4.938988959907032e-06, "logits/chosen": -0.7055200338363647, "logits/rejected": -0.779880166053772, "logps/chosen": -75.47132873535156, "logps/rejected": -75.7305908203125, "loss": 13.6296, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.015317773446440697, "rewards/margins": 0.025457333773374557, "rewards/rejected": -0.0407751128077507, "step": 42 }, { "epoch": 0.0249753150955451, "grad_norm": 334.68182373046875, "learning_rate": 4.9375363160952935e-06, "logits/chosen": -0.7710026502609253, "logits/rejected": -0.82500159740448, "logps/chosen": -77.85375213623047, "logps/rejected": -74.20629119873047, "loss": 13.6993, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.007363433483988047, "rewards/margins": 0.018297644332051277, "rewards/rejected": -0.025661081075668335, "step": 43 }, { "epoch": 0.025556136376836847, "grad_norm": 317.21905517578125, "learning_rate": 4.936083672283556e-06, "logits/chosen": -0.988287627696991, "logits/rejected": -0.9961759448051453, "logps/chosen": -77.48571014404297, "logps/rejected": -74.44812774658203, "loss": 14.0204, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.04125748947262764, "rewards/margins": -0.01405995525419712, "rewards/rejected": -0.02719753421843052, "step": 44 }, { "epoch": 0.026136957658128592, "grad_norm": 315.16363525390625, "learning_rate": 4.934631028471819e-06, "logits/chosen": -0.8442651629447937, "logits/rejected": -0.9505764842033386, "logps/chosen": -74.02120208740234, "logps/rejected": -72.55667114257812, "loss": 13.8445, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.009219218045473099, "rewards/margins": 0.003767718095332384, "rewards/rejected": -0.01298693846911192, "step": 45 }, { "epoch": 0.02671777893942034, "grad_norm": 333.017578125, "learning_rate": 4.933178384660082e-06, "logits/chosen": -0.8707895278930664, "logits/rejected": -0.8848034739494324, "logps/chosen": -82.85108947753906, "logps/rejected": -82.15937805175781, "loss": 14.1048, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.05764341354370117, "rewards/margins": -0.019157161936163902, "rewards/rejected": -0.03848625347018242, "step": 46 }, { "epoch": 0.027298600220712087, "grad_norm": 328.5982971191406, "learning_rate": 4.9317257408483445e-06, "logits/chosen": -0.8642932176589966, "logits/rejected": -0.8195087313652039, "logps/chosen": -75.42310333251953, "logps/rejected": -78.54690551757812, "loss": 13.8605, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.027368813753128052, "rewards/margins": 0.0027305304538458586, "rewards/rejected": -0.03009933792054653, "step": 47 }, { "epoch": 0.027879421502003832, "grad_norm": 332.7618713378906, "learning_rate": 4.930273097036607e-06, "logits/chosen": -0.8657892346382141, "logits/rejected": -0.9026430249214172, "logps/chosen": -86.77098083496094, "logps/rejected": -70.10636901855469, "loss": 13.9202, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.03694070875644684, "rewards/margins": -0.0033356398344039917, "rewards/rejected": -0.033605072647333145, "step": 48 }, { "epoch": 0.02846024278329558, "grad_norm": 326.447265625, "learning_rate": 4.928820453224869e-06, "logits/chosen": -0.931209921836853, "logits/rejected": -0.7586521506309509, "logps/chosen": -78.26404571533203, "logps/rejected": -75.22278594970703, "loss": 13.6775, "rewards/accuracies": 0.75, "rewards/chosen": -0.009883576072752476, "rewards/margins": 0.02028750441968441, "rewards/rejected": -0.030171077698469162, "step": 49 }, { "epoch": 0.029041064064587327, "grad_norm": 323.82501220703125, "learning_rate": 4.927367809413132e-06, "logits/chosen": -0.8577003479003906, "logits/rejected": -0.9445897340774536, "logps/chosen": -80.17283630371094, "logps/rejected": -73.15702819824219, "loss": 13.8828, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.04805426672101021, "rewards/margins": 6.009042408550158e-05, "rewards/rejected": -0.0481143593788147, "step": 50 }, { "epoch": 0.029621885345879072, "grad_norm": 345.8555603027344, "learning_rate": 4.925915165601395e-06, "logits/chosen": -0.8427948951721191, "logits/rejected": -0.6843789219856262, "logps/chosen": -67.90852355957031, "logps/rejected": -78.57261657714844, "loss": 13.9293, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.03557172790169716, "rewards/margins": -0.005063153803348541, "rewards/rejected": -0.030508574098348618, "step": 51 }, { "epoch": 0.030202706627170818, "grad_norm": 330.11419677734375, "learning_rate": 4.924462521789657e-06, "logits/chosen": -0.8312528729438782, "logits/rejected": -0.8623727560043335, "logps/chosen": -73.96318054199219, "logps/rejected": -74.79844665527344, "loss": 14.0308, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.040969304740428925, "rewards/margins": -0.014493905007839203, "rewards/rejected": -0.02647540345788002, "step": 52 }, { "epoch": 0.030783527908462567, "grad_norm": 439.39111328125, "learning_rate": 4.92300987797792e-06, "logits/chosen": -0.8176994323730469, "logits/rejected": -0.7079430818557739, "logps/chosen": -72.11351013183594, "logps/rejected": -75.1050033569336, "loss": 13.9598, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.011684712953865528, "rewards/margins": -0.007551603019237518, "rewards/rejected": -0.004133109003305435, "step": 53 }, { "epoch": 0.03136434918975431, "grad_norm": 299.9485778808594, "learning_rate": 4.921557234166183e-06, "logits/chosen": -0.7416559457778931, "logits/rejected": -0.7767287492752075, "logps/chosen": -70.76741790771484, "logps/rejected": -73.53133392333984, "loss": 13.7678, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0056773098185658455, "rewards/margins": 0.010591240599751472, "rewards/rejected": -0.016268549486994743, "step": 54 }, { "epoch": 0.03194517047104606, "grad_norm": 294.43988037109375, "learning_rate": 4.920104590354446e-06, "logits/chosen": -0.6667272448539734, "logits/rejected": -0.8636151552200317, "logps/chosen": -63.9805793762207, "logps/rejected": -74.08900451660156, "loss": 13.9136, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.028553063049912453, "rewards/margins": -0.0023740821052342653, "rewards/rejected": -0.026178985834121704, "step": 55 }, { "epoch": 0.03252599175233781, "grad_norm": 315.2623596191406, "learning_rate": 4.9186519465427075e-06, "logits/chosen": -0.9049865007400513, "logits/rejected": -0.9067096710205078, "logps/chosen": -71.81194305419922, "logps/rejected": -69.87062072753906, "loss": 13.6316, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.010484697297215462, "rewards/margins": 0.025315653532743454, "rewards/rejected": -0.03580035641789436, "step": 56 }, { "epoch": 0.03310681303362955, "grad_norm": 317.02972412109375, "learning_rate": 4.91719930273097e-06, "logits/chosen": -0.8822166323661804, "logits/rejected": -0.8712530136108398, "logps/chosen": -73.8038330078125, "logps/rejected": -70.23847198486328, "loss": 13.7197, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0062989904545247555, "rewards/margins": 0.016949044540524483, "rewards/rejected": -0.010650052689015865, "step": 57 }, { "epoch": 0.0336876343149213, "grad_norm": 379.47076416015625, "learning_rate": 4.915746658919233e-06, "logits/chosen": -0.9493061304092407, "logits/rejected": -0.8905242681503296, "logps/chosen": -77.11824798583984, "logps/rejected": -74.96271514892578, "loss": 13.7691, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.024838361889123917, "rewards/margins": 0.011607111431658268, "rewards/rejected": -0.03644547611474991, "step": 58 }, { "epoch": 0.03426845559621305, "grad_norm": 306.8603210449219, "learning_rate": 4.914294015107496e-06, "logits/chosen": -0.8163889050483704, "logits/rejected": -0.8286741971969604, "logps/chosen": -65.88087463378906, "logps/rejected": -69.69749450683594, "loss": 14.0756, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.025371646508574486, "rewards/margins": -0.01930541917681694, "rewards/rejected": -0.006066230591386557, "step": 59 }, { "epoch": 0.03484927687750479, "grad_norm": 313.75762939453125, "learning_rate": 4.9128413712957585e-06, "logits/chosen": -0.8723942041397095, "logits/rejected": -0.852526068687439, "logps/chosen": -69.1073989868164, "logps/rejected": -78.30496978759766, "loss": 13.6396, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.007079926319420338, "rewards/margins": 0.024318810552358627, "rewards/rejected": -0.03139873221516609, "step": 60 }, { "epoch": 0.03543009815879654, "grad_norm": 301.9634704589844, "learning_rate": 4.911388727484021e-06, "logits/chosen": -0.9334823489189148, "logits/rejected": -0.8579393625259399, "logps/chosen": -68.89993286132812, "logps/rejected": -71.83196258544922, "loss": 13.8795, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.02495257556438446, "rewards/margins": 0.0009910565568134189, "rewards/rejected": -0.025943631306290627, "step": 61 }, { "epoch": 0.03601091944008829, "grad_norm": 312.14678955078125, "learning_rate": 4.909936083672284e-06, "logits/chosen": -1.0733639001846313, "logits/rejected": -0.9336859583854675, "logps/chosen": -71.59821319580078, "logps/rejected": -83.17411804199219, "loss": 13.7566, "rewards/accuracies": 0.5, "rewards/chosen": -0.03083074651658535, "rewards/margins": 0.01191837340593338, "rewards/rejected": -0.04274912178516388, "step": 62 }, { "epoch": 0.03659174072138003, "grad_norm": 427.48895263671875, "learning_rate": 4.908483439860547e-06, "logits/chosen": -0.9465745091438293, "logits/rejected": -0.9409465789794922, "logps/chosen": -77.52535247802734, "logps/rejected": -77.35426330566406, "loss": 13.8253, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.04855041205883026, "rewards/margins": 0.005750913638621569, "rewards/rejected": -0.05430132895708084, "step": 63 }, { "epoch": 0.03717256200267178, "grad_norm": 329.34857177734375, "learning_rate": 4.9070307960488095e-06, "logits/chosen": -0.8759803771972656, "logits/rejected": -0.9234689474105835, "logps/chosen": -62.841156005859375, "logps/rejected": -66.76082611083984, "loss": 13.9698, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.018310727551579475, "rewards/margins": -0.008804955519735813, "rewards/rejected": -0.009505772963166237, "step": 64 }, { "epoch": 0.03775338328396353, "grad_norm": 290.18682861328125, "learning_rate": 4.905578152237072e-06, "logits/chosen": -0.9876300692558289, "logits/rejected": -0.9204443097114563, "logps/chosen": -70.43892669677734, "logps/rejected": -69.47483825683594, "loss": 13.7851, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.017851095646619797, "rewards/margins": 0.011618509888648987, "rewards/rejected": -0.029469609260559082, "step": 65 }, { "epoch": 0.03833420456525527, "grad_norm": 331.4013977050781, "learning_rate": 4.904125508425335e-06, "logits/chosen": -0.8038953542709351, "logits/rejected": -0.7992674112319946, "logps/chosen": -67.48692321777344, "logps/rejected": -85.4039077758789, "loss": 13.6661, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.02300168201327324, "rewards/margins": 0.022488538175821304, "rewards/rejected": -0.045490216463804245, "step": 66 }, { "epoch": 0.03891502584654702, "grad_norm": 324.8753356933594, "learning_rate": 4.902672864613598e-06, "logits/chosen": -0.7837721705436707, "logits/rejected": -0.9071874618530273, "logps/chosen": -69.61351013183594, "logps/rejected": -68.86196899414062, "loss": 13.79, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.02017301693558693, "rewards/margins": 0.008808745071291924, "rewards/rejected": -0.028981763869524002, "step": 67 }, { "epoch": 0.03949584712783876, "grad_norm": 317.4655456542969, "learning_rate": 4.90122022080186e-06, "logits/chosen": -0.8771296739578247, "logits/rejected": -0.8569726943969727, "logps/chosen": -73.78062438964844, "logps/rejected": -68.98119354248047, "loss": 13.5934, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.009509051218628883, "rewards/margins": 0.02956991270184517, "rewards/rejected": -0.0390789620578289, "step": 68 }, { "epoch": 0.04007666840913051, "grad_norm": 349.1176452636719, "learning_rate": 4.8997675769901224e-06, "logits/chosen": -0.8722847700119019, "logits/rejected": -0.8638531565666199, "logps/chosen": -74.6583251953125, "logps/rejected": -73.25667572021484, "loss": 13.7595, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.01801307685673237, "rewards/margins": 0.012483290396630764, "rewards/rejected": -0.030496370047330856, "step": 69 }, { "epoch": 0.04065748969042226, "grad_norm": 321.077880859375, "learning_rate": 4.898314933178385e-06, "logits/chosen": -0.6883363723754883, "logits/rejected": -0.6323266625404358, "logps/chosen": -73.32432556152344, "logps/rejected": -87.47152709960938, "loss": 13.6591, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0362272784113884, "rewards/margins": 0.024491379037499428, "rewards/rejected": -0.060718655586242676, "step": 70 }, { "epoch": 0.041238310971714, "grad_norm": 306.60955810546875, "learning_rate": 4.896862289366648e-06, "logits/chosen": -0.9396616816520691, "logits/rejected": -0.8107419013977051, "logps/chosen": -75.45014953613281, "logps/rejected": -74.94744110107422, "loss": 13.6692, "rewards/accuracies": 0.75, "rewards/chosen": -0.030824948102235794, "rewards/margins": 0.020616179332137108, "rewards/rejected": -0.051441121846437454, "step": 71 }, { "epoch": 0.04181913225300575, "grad_norm": 309.03094482421875, "learning_rate": 4.895409645554911e-06, "logits/chosen": -1.0361931324005127, "logits/rejected": -0.920698344707489, "logps/chosen": -74.54027557373047, "logps/rejected": -66.71863555908203, "loss": 14.2171, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.06477208435535431, "rewards/margins": -0.031630050390958786, "rewards/rejected": -0.03314203768968582, "step": 72 }, { "epoch": 0.0423999535342975, "grad_norm": 336.6788635253906, "learning_rate": 4.893957001743173e-06, "logits/chosen": -0.9846957921981812, "logits/rejected": -0.9497137069702148, "logps/chosen": -73.15393829345703, "logps/rejected": -73.5201416015625, "loss": 13.9208, "rewards/accuracies": 0.5, "rewards/chosen": -0.019983595237135887, "rewards/margins": -0.003768919501453638, "rewards/rejected": -0.016214676201343536, "step": 73 }, { "epoch": 0.04298077481558924, "grad_norm": 328.0151062011719, "learning_rate": 4.892504357931436e-06, "logits/chosen": -0.9070509672164917, "logits/rejected": -0.890802264213562, "logps/chosen": -74.13055419921875, "logps/rejected": -74.56624603271484, "loss": 14.2509, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.03255216404795647, "rewards/margins": -0.03526480868458748, "rewards/rejected": 0.0027126409113407135, "step": 74 }, { "epoch": 0.04356159609688099, "grad_norm": 312.4993591308594, "learning_rate": 4.891051714119698e-06, "logits/chosen": -0.8348041772842407, "logits/rejected": -0.8501715660095215, "logps/chosen": -76.722900390625, "logps/rejected": -71.301025390625, "loss": 14.0253, "rewards/accuracies": 0.5, "rewards/chosen": -0.024392826482653618, "rewards/margins": -0.01172790676355362, "rewards/rejected": -0.012664918787777424, "step": 75 }, { "epoch": 0.04414241737817274, "grad_norm": 308.2872314453125, "learning_rate": 4.889599070307961e-06, "logits/chosen": -0.5838706493377686, "logits/rejected": -0.637297511100769, "logps/chosen": -72.34232330322266, "logps/rejected": -66.2028579711914, "loss": 14.1823, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.019541073590517044, "rewards/margins": -0.02966020628809929, "rewards/rejected": 0.010119132697582245, "step": 76 }, { "epoch": 0.04472323865946448, "grad_norm": 305.4457702636719, "learning_rate": 4.8881464264962236e-06, "logits/chosen": -0.973824143409729, "logits/rejected": -0.9775202870368958, "logps/chosen": -72.74274444580078, "logps/rejected": -73.25370788574219, "loss": 14.0823, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.02193290740251541, "rewards/margins": -0.019592974334955215, "rewards/rejected": -0.0023399335332214832, "step": 77 }, { "epoch": 0.04530405994075623, "grad_norm": 483.3565368652344, "learning_rate": 4.886693782684486e-06, "logits/chosen": -0.8660491704940796, "logits/rejected": -0.8472667932510376, "logps/chosen": -79.67647552490234, "logps/rejected": -76.70512390136719, "loss": 14.0415, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.035908013582229614, "rewards/margins": -0.01337387878447771, "rewards/rejected": -0.02253413386642933, "step": 78 }, { "epoch": 0.04588488122204797, "grad_norm": 315.49755859375, "learning_rate": 4.885241138872749e-06, "logits/chosen": -0.6490969061851501, "logits/rejected": -0.7820181250572205, "logps/chosen": -74.25728607177734, "logps/rejected": -73.80535125732422, "loss": 13.4779, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.014750528149306774, "rewards/margins": 0.043530743569135666, "rewards/rejected": -0.058281272649765015, "step": 79 }, { "epoch": 0.04646570250333972, "grad_norm": 315.58837890625, "learning_rate": 4.883788495061012e-06, "logits/chosen": -0.8913451433181763, "logits/rejected": -0.8876463770866394, "logps/chosen": -73.64271545410156, "logps/rejected": -69.37626647949219, "loss": 13.957, "rewards/accuracies": 0.5, "rewards/chosen": -0.01223234087228775, "rewards/margins": -0.005636455025523901, "rewards/rejected": -0.006595888640731573, "step": 80 }, { "epoch": 0.04704652378463147, "grad_norm": 313.813232421875, "learning_rate": 4.882335851249274e-06, "logits/chosen": -0.8722120523452759, "logits/rejected": -0.7989141345024109, "logps/chosen": -70.44036865234375, "logps/rejected": -70.73589324951172, "loss": 14.1477, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.026375526562333107, "rewards/margins": -0.026495525613427162, "rewards/rejected": 0.00012000077549600974, "step": 81 }, { "epoch": 0.04762734506592321, "grad_norm": 314.9659729003906, "learning_rate": 4.8808832074375365e-06, "logits/chosen": -0.7730615735054016, "logits/rejected": -0.8082054257392883, "logps/chosen": -69.96324157714844, "logps/rejected": -77.90928649902344, "loss": 13.778, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.014157066121697426, "rewards/margins": 0.010449771769344807, "rewards/rejected": -0.024606838822364807, "step": 82 }, { "epoch": 0.04820816634721496, "grad_norm": 303.14599609375, "learning_rate": 4.879430563625799e-06, "logits/chosen": -0.9874809980392456, "logits/rejected": -1.0569359064102173, "logps/chosen": -74.83180236816406, "logps/rejected": -76.95542907714844, "loss": 13.6937, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0425366535782814, "rewards/margins": 0.021705975756049156, "rewards/rejected": -0.06424263119697571, "step": 83 }, { "epoch": 0.04878898762850671, "grad_norm": 317.16961669921875, "learning_rate": 4.877977919814062e-06, "logits/chosen": -0.9083768725395203, "logits/rejected": -0.9116488695144653, "logps/chosen": -80.80293273925781, "logps/rejected": -72.5535888671875, "loss": 14.2559, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.06587149202823639, "rewards/margins": -0.036308903247117996, "rewards/rejected": -0.029562586918473244, "step": 84 }, { "epoch": 0.04936980890979845, "grad_norm": 301.5, "learning_rate": 4.876525276002325e-06, "logits/chosen": -0.9549547433853149, "logits/rejected": -0.9115845561027527, "logps/chosen": -75.24485778808594, "logps/rejected": -68.90257263183594, "loss": 14.065, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.015530148521065712, "rewards/margins": -0.0176745243370533, "rewards/rejected": 0.0021443753503262997, "step": 85 }, { "epoch": 0.0499506301910902, "grad_norm": 307.9525451660156, "learning_rate": 4.8750726321905875e-06, "logits/chosen": -0.9091174006462097, "logits/rejected": -0.9419649839401245, "logps/chosen": -76.40791320800781, "logps/rejected": -77.6815185546875, "loss": 13.7705, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.015883130952715874, "rewards/margins": 0.01122428011149168, "rewards/rejected": -0.02710741199553013, "step": 86 }, { "epoch": 0.05053145147238195, "grad_norm": 321.51507568359375, "learning_rate": 4.87361998837885e-06, "logits/chosen": -0.9909089803695679, "logits/rejected": -0.9910022616386414, "logps/chosen": -77.63959503173828, "logps/rejected": -72.30487823486328, "loss": 13.8477, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.006782358977943659, "rewards/margins": 0.003400030778720975, "rewards/rejected": -0.010182389989495277, "step": 87 }, { "epoch": 0.05111227275367369, "grad_norm": 321.3612060546875, "learning_rate": 4.872167344567112e-06, "logits/chosen": -0.7301020622253418, "logits/rejected": -0.5697265267372131, "logps/chosen": -70.13436126708984, "logps/rejected": -77.76301574707031, "loss": 13.7418, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.015415447764098644, "rewards/margins": 0.013852661475539207, "rewards/rejected": 0.0015627862885594368, "step": 88 }, { "epoch": 0.05169309403496544, "grad_norm": 322.7413330078125, "learning_rate": 4.870714700755375e-06, "logits/chosen": -0.8953542709350586, "logits/rejected": -0.8882652521133423, "logps/chosen": -69.642333984375, "logps/rejected": -66.21636962890625, "loss": 13.8948, "rewards/accuracies": 0.5, "rewards/chosen": -0.015242251567542553, "rewards/margins": -0.00036549606011249125, "rewards/rejected": -0.014876757748425007, "step": 89 }, { "epoch": 0.052273915316257184, "grad_norm": 316.21905517578125, "learning_rate": 4.869262056943638e-06, "logits/chosen": -0.826396107673645, "logits/rejected": -0.8409526944160461, "logps/chosen": -84.0012435913086, "logps/rejected": -73.15267944335938, "loss": 14.2303, "rewards/accuracies": 0.25, "rewards/chosen": -0.03959153965115547, "rewards/margins": -0.03513690084218979, "rewards/rejected": -0.004454641602933407, "step": 90 }, { "epoch": 0.05285473659754893, "grad_norm": 314.6103515625, "learning_rate": 4.8678094131319e-06, "logits/chosen": -0.9043526649475098, "logits/rejected": -0.9540117979049683, "logps/chosen": -85.2854995727539, "logps/rejected": -74.1194076538086, "loss": 14.256, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.05172392725944519, "rewards/margins": -0.03560470789670944, "rewards/rejected": -0.016119223088026047, "step": 91 }, { "epoch": 0.05343555787884068, "grad_norm": 298.3580322265625, "learning_rate": 4.866356769320163e-06, "logits/chosen": -0.83808434009552, "logits/rejected": -0.858515739440918, "logps/chosen": -67.17420959472656, "logps/rejected": -84.23038482666016, "loss": 13.4033, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0225966926664114, "rewards/margins": 0.05055863410234451, "rewards/rejected": -0.027961939573287964, "step": 92 }, { "epoch": 0.054016379160132424, "grad_norm": 293.33282470703125, "learning_rate": 4.864904125508426e-06, "logits/chosen": -0.9872828722000122, "logits/rejected": -1.1029959917068481, "logps/chosen": -72.00138854980469, "logps/rejected": -69.6135025024414, "loss": 13.7105, "rewards/accuracies": 0.5, "rewards/chosen": -0.0009010225767269731, "rewards/margins": 0.017413010820746422, "rewards/rejected": -0.01831403188407421, "step": 93 }, { "epoch": 0.054597200441424174, "grad_norm": 356.63946533203125, "learning_rate": 4.863451481696689e-06, "logits/chosen": -0.8439090847969055, "logits/rejected": -0.8643622398376465, "logps/chosen": -70.82389831542969, "logps/rejected": -75.1737289428711, "loss": 13.8326, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0158695038408041, "rewards/margins": 0.0050217146053910255, "rewards/rejected": -0.0208912193775177, "step": 94 }, { "epoch": 0.05517802172271592, "grad_norm": 304.1072998046875, "learning_rate": 4.8619988378849505e-06, "logits/chosen": -1.0255842208862305, "logits/rejected": -1.041534662246704, "logps/chosen": -68.80192565917969, "logps/rejected": -67.79802703857422, "loss": 13.6247, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.01212473027408123, "rewards/margins": 0.02532658539712429, "rewards/rejected": -0.01320185698568821, "step": 95 }, { "epoch": 0.055758843004007665, "grad_norm": 314.3151550292969, "learning_rate": 4.860546194073213e-06, "logits/chosen": -0.8595132827758789, "logits/rejected": -0.8583124279975891, "logps/chosen": -72.1944808959961, "logps/rejected": -72.77186584472656, "loss": 13.9049, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.031448423862457275, "rewards/margins": -0.0011334316805005074, "rewards/rejected": -0.030314987525343895, "step": 96 }, { "epoch": 0.056339664285299414, "grad_norm": 320.2240905761719, "learning_rate": 4.859093550261476e-06, "logits/chosen": -0.749364972114563, "logits/rejected": -0.8103491067886353, "logps/chosen": -79.1120376586914, "logps/rejected": -75.91130065917969, "loss": 13.9032, "rewards/accuracies": 0.5, "rewards/chosen": -0.03181576728820801, "rewards/margins": -0.0017790347337722778, "rewards/rejected": -0.03003673627972603, "step": 97 }, { "epoch": 0.05692048556659116, "grad_norm": 314.99896240234375, "learning_rate": 4.857640906449739e-06, "logits/chosen": -0.8008295297622681, "logits/rejected": -0.847716212272644, "logps/chosen": -79.81649017333984, "logps/rejected": -68.53919982910156, "loss": 14.0979, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.021854501217603683, "rewards/margins": -0.02203025482594967, "rewards/rejected": 0.00017575845413375646, "step": 98 }, { "epoch": 0.057501306847882905, "grad_norm": 315.1274108886719, "learning_rate": 4.8561882626380015e-06, "logits/chosen": -0.8115674257278442, "logits/rejected": -0.8901892900466919, "logps/chosen": -72.59163665771484, "logps/rejected": -78.07933044433594, "loss": 13.7458, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.000810365192592144, "rewards/margins": 0.014865818433463573, "rewards/rejected": -0.01405545137822628, "step": 99 }, { "epoch": 0.058082128129174654, "grad_norm": 318.1490173339844, "learning_rate": 4.854735618826264e-06, "logits/chosen": -0.9334144592285156, "logits/rejected": -0.9743694067001343, "logps/chosen": -74.79545593261719, "logps/rejected": -80.59342193603516, "loss": 13.5047, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.00012166835222160444, "rewards/margins": 0.038393907248973846, "rewards/rejected": -0.03827223926782608, "step": 100 }, { "epoch": 0.0586629494104664, "grad_norm": 312.1184997558594, "learning_rate": 4.853282975014527e-06, "logits/chosen": -0.8218280076980591, "logits/rejected": -0.848983645439148, "logps/chosen": -77.64916229248047, "logps/rejected": -77.53215026855469, "loss": 13.6869, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0032627955079078674, "rewards/margins": 0.022136736661195755, "rewards/rejected": -0.025399532169103622, "step": 101 }, { "epoch": 0.059243770691758145, "grad_norm": 314.2687072753906, "learning_rate": 4.851830331202789e-06, "logits/chosen": -0.8590647578239441, "logits/rejected": -0.8945513963699341, "logps/chosen": -78.69733428955078, "logps/rejected": -68.73847198486328, "loss": 13.7324, "rewards/accuracies": 0.5, "rewards/chosen": -0.010157021693885326, "rewards/margins": 0.016794661059975624, "rewards/rejected": -0.026951681822538376, "step": 102 }, { "epoch": 0.059824591973049894, "grad_norm": 322.7899475097656, "learning_rate": 4.850377687391052e-06, "logits/chosen": -0.9431624412536621, "logits/rejected": -0.8844934701919556, "logps/chosen": -71.08782958984375, "logps/rejected": -76.45503234863281, "loss": 13.7423, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0048022083938121796, "rewards/margins": 0.014014339074492455, "rewards/rejected": -0.018816547468304634, "step": 103 }, { "epoch": 0.060405413254341636, "grad_norm": 328.67974853515625, "learning_rate": 4.848925043579314e-06, "logits/chosen": -0.7548056840896606, "logits/rejected": -0.7886452078819275, "logps/chosen": -74.668212890625, "logps/rejected": -74.39926147460938, "loss": 13.3898, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.050213318318128586, "rewards/margins": 0.04900939390063286, "rewards/rejected": 0.0012039269786328077, "step": 104 }, { "epoch": 0.060986234535633385, "grad_norm": 306.2090148925781, "learning_rate": 4.847472399767578e-06, "logits/chosen": -0.7457195520401001, "logits/rejected": -0.7296000123023987, "logps/chosen": -66.86776733398438, "logps/rejected": -68.6021499633789, "loss": 14.2794, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.01881163753569126, "rewards/margins": -0.039370544254779816, "rewards/rejected": 0.020558910444378853, "step": 105 }, { "epoch": 0.061567055816925134, "grad_norm": 429.5802917480469, "learning_rate": 4.84601975595584e-06, "logits/chosen": -0.7268589735031128, "logits/rejected": -0.824454665184021, "logps/chosen": -73.6258773803711, "logps/rejected": -73.26065826416016, "loss": 13.8393, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.019226079806685448, "rewards/margins": 0.005577159114181995, "rewards/rejected": -0.02480323798954487, "step": 106 }, { "epoch": 0.062147877098216876, "grad_norm": 310.4759521484375, "learning_rate": 4.844567112144103e-06, "logits/chosen": -0.7717695236206055, "logits/rejected": -0.6804165244102478, "logps/chosen": -73.10393524169922, "logps/rejected": -71.20040893554688, "loss": 13.5079, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.018317507579922676, "rewards/margins": 0.03789940103888512, "rewards/rejected": -0.05621690675616264, "step": 107 }, { "epoch": 0.06272869837950862, "grad_norm": 313.2925720214844, "learning_rate": 4.843114468332365e-06, "logits/chosen": -0.7994370460510254, "logits/rejected": -0.9286397099494934, "logps/chosen": -73.3687515258789, "logps/rejected": -73.391357421875, "loss": 13.7167, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0021995368879288435, "rewards/margins": 0.017970655113458633, "rewards/rejected": -0.020170193165540695, "step": 108 }, { "epoch": 0.06330951966080037, "grad_norm": 298.9688415527344, "learning_rate": 4.841661824520628e-06, "logits/chosen": -0.783902108669281, "logits/rejected": -0.8706483840942383, "logps/chosen": -72.7656021118164, "logps/rejected": -67.734619140625, "loss": 14.0378, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.010019464418292046, "rewards/margins": -0.016036922112107277, "rewards/rejected": 0.006017456762492657, "step": 109 }, { "epoch": 0.06389034094209212, "grad_norm": 292.80926513671875, "learning_rate": 4.840209180708891e-06, "logits/chosen": -0.867302417755127, "logits/rejected": -0.934320330619812, "logps/chosen": -69.18145751953125, "logps/rejected": -74.14984893798828, "loss": 13.9161, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.04971027001738548, "rewards/margins": -0.003587187733501196, "rewards/rejected": -0.04612307995557785, "step": 110 }, { "epoch": 0.06447116222338387, "grad_norm": 316.4573059082031, "learning_rate": 4.838756536897154e-06, "logits/chosen": -0.7158193588256836, "logits/rejected": -0.703850269317627, "logps/chosen": -72.75650024414062, "logps/rejected": -74.89552307128906, "loss": 13.7563, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.02667677029967308, "rewards/margins": 0.013364049606025219, "rewards/rejected": 0.013312721624970436, "step": 111 }, { "epoch": 0.06505198350467561, "grad_norm": 310.0760498046875, "learning_rate": 4.837303893085416e-06, "logits/chosen": -0.7871710658073425, "logits/rejected": -0.7631909847259521, "logps/chosen": -72.4153823852539, "logps/rejected": -69.98908233642578, "loss": 14.0832, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.027912933379411697, "rewards/margins": -0.020552167668938637, "rewards/rejected": -0.007360764779150486, "step": 112 }, { "epoch": 0.06563280478596736, "grad_norm": 298.72796630859375, "learning_rate": 4.835851249273678e-06, "logits/chosen": -0.8587129712104797, "logits/rejected": -0.828883171081543, "logps/chosen": -76.20726013183594, "logps/rejected": -70.39952087402344, "loss": 13.7695, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.01639840006828308, "rewards/margins": 0.01160570327192545, "rewards/rejected": 0.004792699124664068, "step": 113 }, { "epoch": 0.0662136260672591, "grad_norm": 306.0551452636719, "learning_rate": 4.834398605461941e-06, "logits/chosen": -0.7858158349990845, "logits/rejected": -0.8138400912284851, "logps/chosen": -71.29947662353516, "logps/rejected": -74.6938247680664, "loss": 13.6944, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.009441891685128212, "rewards/margins": 0.020461464300751686, "rewards/rejected": -0.011019574478268623, "step": 114 }, { "epoch": 0.06679444734855085, "grad_norm": 299.12298583984375, "learning_rate": 4.832945961650204e-06, "logits/chosen": -0.8294021487236023, "logits/rejected": -0.9068562388420105, "logps/chosen": -72.25274658203125, "logps/rejected": -74.01802062988281, "loss": 13.8028, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.007599563803523779, "rewards/margins": 0.009034421294927597, "rewards/rejected": -0.01663398928940296, "step": 115 }, { "epoch": 0.0673752686298426, "grad_norm": 316.1508483886719, "learning_rate": 4.8314933178384665e-06, "logits/chosen": -0.7372664213180542, "logits/rejected": -0.6621009707450867, "logps/chosen": -71.98650360107422, "logps/rejected": -73.5616226196289, "loss": 14.0602, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.040212083607912064, "rewards/margins": -0.01578274928033352, "rewards/rejected": -0.024429330602288246, "step": 116 }, { "epoch": 0.06795608991113435, "grad_norm": 311.5640869140625, "learning_rate": 4.830040674026729e-06, "logits/chosen": -0.8206332325935364, "logits/rejected": -0.8395715951919556, "logps/chosen": -71.33819580078125, "logps/rejected": -67.67076110839844, "loss": 14.0576, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.04526720941066742, "rewards/margins": -0.017988480627536774, "rewards/rejected": -0.027278726920485497, "step": 117 }, { "epoch": 0.0685369111924261, "grad_norm": 319.0810852050781, "learning_rate": 4.828588030214992e-06, "logits/chosen": -0.75376957654953, "logits/rejected": -0.7071677446365356, "logps/chosen": -79.94001770019531, "logps/rejected": -73.17815399169922, "loss": 14.162, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.015543567016720772, "rewards/margins": -0.02663475275039673, "rewards/rejected": 0.011091184802353382, "step": 118 }, { "epoch": 0.06911773247371784, "grad_norm": 296.51690673828125, "learning_rate": 4.827135386403255e-06, "logits/chosen": -0.8419657945632935, "logits/rejected": -0.7793577313423157, "logps/chosen": -72.96027374267578, "logps/rejected": -71.80851745605469, "loss": 13.2528, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.016827832907438278, "rewards/margins": 0.06467505544424057, "rewards/rejected": -0.047847211360931396, "step": 119 }, { "epoch": 0.06969855375500958, "grad_norm": 305.2845458984375, "learning_rate": 4.825682742591517e-06, "logits/chosen": -0.7169400453567505, "logits/rejected": -0.7394998073577881, "logps/chosen": -69.85813903808594, "logps/rejected": -76.11322784423828, "loss": 13.4805, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0008814238244667649, "rewards/margins": 0.041327688843011856, "rewards/rejected": -0.04220911115407944, "step": 120 }, { "epoch": 0.07027937503630133, "grad_norm": 329.4168395996094, "learning_rate": 4.824230098779779e-06, "logits/chosen": -0.8370596170425415, "logits/rejected": -0.8346614837646484, "logps/chosen": -76.3817138671875, "logps/rejected": -69.7574691772461, "loss": 14.3016, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.017085228115320206, "rewards/margins": -0.039855439215898514, "rewards/rejected": 0.02277020923793316, "step": 121 }, { "epoch": 0.07086019631759308, "grad_norm": 328.85821533203125, "learning_rate": 4.822777454968042e-06, "logits/chosen": -0.8888875246047974, "logits/rejected": -0.8640028834342957, "logps/chosen": -81.20426940917969, "logps/rejected": -73.75362396240234, "loss": 14.2056, "rewards/accuracies": 0.5, "rewards/chosen": -0.05541349574923515, "rewards/margins": -0.030969763174653053, "rewards/rejected": -0.02444373071193695, "step": 122 }, { "epoch": 0.07144101759888483, "grad_norm": 329.655517578125, "learning_rate": 4.821324811156305e-06, "logits/chosen": -0.8150280714035034, "logits/rejected": -0.7874841094017029, "logps/chosen": -74.91133880615234, "logps/rejected": -76.65450286865234, "loss": 13.9383, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.04368335008621216, "rewards/margins": -0.005534342024475336, "rewards/rejected": -0.03814900666475296, "step": 123 }, { "epoch": 0.07202183888017658, "grad_norm": 287.1902770996094, "learning_rate": 4.819872167344568e-06, "logits/chosen": -0.9967269897460938, "logits/rejected": -1.0370653867721558, "logps/chosen": -73.49361419677734, "logps/rejected": -76.58966827392578, "loss": 13.7119, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.034154172986745834, "rewards/margins": 0.019786100834608078, "rewards/rejected": -0.05394027382135391, "step": 124 }, { "epoch": 0.07260266016146831, "grad_norm": 321.77142333984375, "learning_rate": 4.81841952353283e-06, "logits/chosen": -0.8000070452690125, "logits/rejected": -0.8691812753677368, "logps/chosen": -77.01860809326172, "logps/rejected": -73.41801452636719, "loss": 13.4661, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.00704178074374795, "rewards/margins": 0.04257983714342117, "rewards/rejected": -0.04962162673473358, "step": 125 }, { "epoch": 0.07318348144276006, "grad_norm": 329.365478515625, "learning_rate": 4.816966879721093e-06, "logits/chosen": -0.6257106065750122, "logits/rejected": -0.7529661059379578, "logps/chosen": -70.44479370117188, "logps/rejected": -71.65373229980469, "loss": 13.9067, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.030417144298553467, "rewards/margins": -0.0017956402152776718, "rewards/rejected": -0.028621505945920944, "step": 126 }, { "epoch": 0.07376430272405181, "grad_norm": 305.7592468261719, "learning_rate": 4.815514235909355e-06, "logits/chosen": -0.8345580101013184, "logits/rejected": -0.809880256652832, "logps/chosen": -75.27732849121094, "logps/rejected": -76.76701354980469, "loss": 13.7657, "rewards/accuracies": 0.5, "rewards/chosen": -0.02379775047302246, "rewards/margins": 0.011454248800873756, "rewards/rejected": -0.03525200113654137, "step": 127 }, { "epoch": 0.07434512400534356, "grad_norm": 326.51171875, "learning_rate": 4.814061592097618e-06, "logits/chosen": -0.8751303553581238, "logits/rejected": -0.7346702814102173, "logps/chosen": -71.24671936035156, "logps/rejected": -76.22618103027344, "loss": 13.7425, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.002272081095725298, "rewards/margins": 0.013138952665030956, "rewards/rejected": -0.010866871103644371, "step": 128 }, { "epoch": 0.0749259452866353, "grad_norm": 331.1041259765625, "learning_rate": 4.8126089482858805e-06, "logits/chosen": -0.840943455696106, "logits/rejected": -0.8373934626579285, "logps/chosen": -74.03942108154297, "logps/rejected": -69.12279510498047, "loss": 14.2463, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.0546981580555439, "rewards/margins": -0.03357243537902832, "rewards/rejected": -0.02112571895122528, "step": 129 }, { "epoch": 0.07550676656792706, "grad_norm": 312.5914611816406, "learning_rate": 4.811156304474143e-06, "logits/chosen": -0.8838016390800476, "logits/rejected": -0.9450550079345703, "logps/chosen": -79.87310028076172, "logps/rejected": -80.9448013305664, "loss": 13.7435, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.03348521143198013, "rewards/margins": 0.014457473531365395, "rewards/rejected": -0.04794268682599068, "step": 130 }, { "epoch": 0.07608758784921879, "grad_norm": 291.64898681640625, "learning_rate": 4.809703660662406e-06, "logits/chosen": -0.9270390272140503, "logits/rejected": -1.0079596042633057, "logps/chosen": -68.3660659790039, "logps/rejected": -75.16285705566406, "loss": 13.761, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.01317480206489563, "rewards/margins": 0.013511622324585915, "rewards/rejected": -0.026686420664191246, "step": 131 }, { "epoch": 0.07666840913051054, "grad_norm": 325.22186279296875, "learning_rate": 4.808251016850669e-06, "logits/chosen": -0.793510913848877, "logits/rejected": -0.8536527752876282, "logps/chosen": -79.37129211425781, "logps/rejected": -81.64805603027344, "loss": 13.9288, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.04622223600745201, "rewards/margins": -0.0038099661469459534, "rewards/rejected": -0.042412273585796356, "step": 132 }, { "epoch": 0.07724923041180229, "grad_norm": 332.0003662109375, "learning_rate": 4.8067983730389315e-06, "logits/chosen": -0.863021731376648, "logits/rejected": -0.7807949185371399, "logps/chosen": -70.98688507080078, "logps/rejected": -74.82456970214844, "loss": 13.6363, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.03255782276391983, "rewards/margins": 0.027058040723204613, "rewards/rejected": -0.059615861624479294, "step": 133 }, { "epoch": 0.07783005169309404, "grad_norm": 419.6676025390625, "learning_rate": 4.8053457292271934e-06, "logits/chosen": -0.6968336701393127, "logits/rejected": -0.8155566453933716, "logps/chosen": -76.53175354003906, "logps/rejected": -80.69217681884766, "loss": 13.5714, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.003611106425523758, "rewards/margins": 0.0325997918844223, "rewards/rejected": -0.03621090203523636, "step": 134 }, { "epoch": 0.07841087297438579, "grad_norm": 318.14093017578125, "learning_rate": 4.803893085415456e-06, "logits/chosen": -0.8911747932434082, "logits/rejected": -0.8688879013061523, "logps/chosen": -78.88302612304688, "logps/rejected": -72.34574890136719, "loss": 13.7806, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.009762173518538475, "rewards/margins": 0.012868879362940788, "rewards/rejected": -0.022631052881479263, "step": 135 }, { "epoch": 0.07899169425567752, "grad_norm": 301.05181884765625, "learning_rate": 4.802440441603719e-06, "logits/chosen": -0.9665401577949524, "logits/rejected": -1.0596462488174438, "logps/chosen": -69.1599349975586, "logps/rejected": -72.39261627197266, "loss": 13.9663, "rewards/accuracies": 0.5, "rewards/chosen": -0.006733216345310211, "rewards/margins": -0.007116011343896389, "rewards/rejected": 0.00038279517320916057, "step": 136 }, { "epoch": 0.07957251553696927, "grad_norm": 318.62030029296875, "learning_rate": 4.800987797791982e-06, "logits/chosen": -0.7195813059806824, "logits/rejected": -0.7145063281059265, "logps/chosen": -79.5794448852539, "logps/rejected": -84.13545227050781, "loss": 13.4993, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.002331190975382924, "rewards/margins": 0.03869814798235893, "rewards/rejected": -0.04102934151887894, "step": 137 }, { "epoch": 0.08015333681826102, "grad_norm": 425.3757019042969, "learning_rate": 4.7995351539802444e-06, "logits/chosen": -0.8645333051681519, "logits/rejected": -0.7608574032783508, "logps/chosen": -69.97611999511719, "logps/rejected": -85.6728515625, "loss": 13.1834, "rewards/accuracies": 0.75, "rewards/chosen": 0.011998976580798626, "rewards/margins": 0.07343053817749023, "rewards/rejected": -0.06143154948949814, "step": 138 }, { "epoch": 0.08073415809955277, "grad_norm": 312.2695007324219, "learning_rate": 4.798082510168507e-06, "logits/chosen": -0.7758678197860718, "logits/rejected": -0.7573307752609253, "logps/chosen": -76.74276733398438, "logps/rejected": -70.28133392333984, "loss": 13.8316, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0051659406162798405, "rewards/margins": 0.006639555096626282, "rewards/rejected": -0.0014736183220520616, "step": 139 }, { "epoch": 0.08131497938084452, "grad_norm": 315.1054992675781, "learning_rate": 4.79662986635677e-06, "logits/chosen": -0.718429684638977, "logits/rejected": -0.8549942970275879, "logps/chosen": -70.87870788574219, "logps/rejected": -68.09158325195312, "loss": 13.8946, "rewards/accuracies": 0.5, "rewards/chosen": -0.027258872985839844, "rewards/margins": -0.0010386653011664748, "rewards/rejected": -0.026220208033919334, "step": 140 }, { "epoch": 0.08189580066213627, "grad_norm": 315.59136962890625, "learning_rate": 4.795177222545032e-06, "logits/chosen": -0.8170045614242554, "logits/rejected": -0.7605774402618408, "logps/chosen": -69.022705078125, "logps/rejected": -81.53439331054688, "loss": 13.625, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.006630831863731146, "rewards/margins": 0.02589798904955387, "rewards/rejected": -0.019267160445451736, "step": 141 }, { "epoch": 0.082476621943428, "grad_norm": 303.656982421875, "learning_rate": 4.793724578733295e-06, "logits/chosen": -0.9443836212158203, "logits/rejected": -0.944778561592102, "logps/chosen": -77.53956604003906, "logps/rejected": -78.73460388183594, "loss": 13.8955, "rewards/accuracies": 0.5, "rewards/chosen": -0.02790834940969944, "rewards/margins": -0.0015594146680086851, "rewards/rejected": -0.026348933577537537, "step": 142 }, { "epoch": 0.08305744322471975, "grad_norm": 316.501220703125, "learning_rate": 4.792271934921557e-06, "logits/chosen": -0.7522753477096558, "logits/rejected": -0.8681309819221497, "logps/chosen": -77.05738830566406, "logps/rejected": -70.8436508178711, "loss": 14.4085, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.031116142868995667, "rewards/margins": -0.05178617313504219, "rewards/rejected": 0.020670032128691673, "step": 143 }, { "epoch": 0.0836382645060115, "grad_norm": 673.7465209960938, "learning_rate": 4.79081929110982e-06, "logits/chosen": -0.8983888626098633, "logits/rejected": -0.8548039197921753, "logps/chosen": -78.78826141357422, "logps/rejected": -71.03697204589844, "loss": 13.8593, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.004244650714099407, "rewards/margins": 0.004877415020018816, "rewards/rejected": -0.00912206619977951, "step": 144 }, { "epoch": 0.08421908578730325, "grad_norm": 308.432861328125, "learning_rate": 4.789366647298083e-06, "logits/chosen": -0.8701874613761902, "logits/rejected": -0.8173721432685852, "logps/chosen": -73.85729217529297, "logps/rejected": -74.20492553710938, "loss": 13.6175, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.01854265108704567, "rewards/margins": 0.02803659439086914, "rewards/rejected": -0.04657924920320511, "step": 145 }, { "epoch": 0.084799907068595, "grad_norm": 295.6097717285156, "learning_rate": 4.787914003486346e-06, "logits/chosen": -0.8499331474304199, "logits/rejected": -0.8638921976089478, "logps/chosen": -66.98426818847656, "logps/rejected": -65.90644836425781, "loss": 13.8752, "rewards/accuracies": 0.5, "rewards/chosen": -0.0066559212282299995, "rewards/margins": 0.0012805939186364412, "rewards/rejected": -0.007936513982713223, "step": 146 }, { "epoch": 0.08538072834988673, "grad_norm": 331.2653503417969, "learning_rate": 4.786461359674608e-06, "logits/chosen": -0.9896368980407715, "logits/rejected": -1.1149613857269287, "logps/chosen": -77.21143341064453, "logps/rejected": -85.37650299072266, "loss": 13.5235, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.003070878330618143, "rewards/margins": 0.03570512682199478, "rewards/rejected": -0.038776006549596786, "step": 147 }, { "epoch": 0.08596154963117848, "grad_norm": 356.3915710449219, "learning_rate": 4.785008715862871e-06, "logits/chosen": -0.8506426811218262, "logits/rejected": -0.7640115022659302, "logps/chosen": -67.30644989013672, "logps/rejected": -71.46052551269531, "loss": 13.6259, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.010771533474326134, "rewards/margins": 0.026811867952346802, "rewards/rejected": -0.037583399564027786, "step": 148 }, { "epoch": 0.08654237091247023, "grad_norm": 303.959716796875, "learning_rate": 4.783556072051134e-06, "logits/chosen": -0.9652125239372253, "logits/rejected": -1.0568349361419678, "logps/chosen": -78.24043273925781, "logps/rejected": -82.4593734741211, "loss": 13.4476, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0065461816266179085, "rewards/margins": 0.049500562250614166, "rewards/rejected": -0.04295438155531883, "step": 149 }, { "epoch": 0.08712319219376198, "grad_norm": 301.6387939453125, "learning_rate": 4.7821034282393966e-06, "logits/chosen": -0.9109539985656738, "logits/rejected": -0.8469412922859192, "logps/chosen": -71.34504699707031, "logps/rejected": -70.11753845214844, "loss": 14.0276, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.013672858476638794, "rewards/margins": -0.013126525096595287, "rewards/rejected": -0.0005463305860757828, "step": 150 }, { "epoch": 0.08770401347505373, "grad_norm": 385.97100830078125, "learning_rate": 4.780650784427659e-06, "logits/chosen": -0.974000096321106, "logits/rejected": -0.8754386901855469, "logps/chosen": -78.10624694824219, "logps/rejected": -71.39083099365234, "loss": 13.8936, "rewards/accuracies": 0.5, "rewards/chosen": 0.005910863634198904, "rewards/margins": -0.0005791831645183265, "rewards/rejected": 0.006490050349384546, "step": 151 }, { "epoch": 0.08828483475634548, "grad_norm": 288.5458679199219, "learning_rate": 4.779198140615921e-06, "logits/chosen": -0.8864970207214355, "logits/rejected": -0.8748113512992859, "logps/chosen": -74.03097534179688, "logps/rejected": -68.74441528320312, "loss": 13.8866, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.009058129973709583, "rewards/margins": 0.003120752517133951, "rewards/rejected": 0.005937379319220781, "step": 152 }, { "epoch": 0.08886565603763721, "grad_norm": 335.7358093261719, "learning_rate": 4.777745496804184e-06, "logits/chosen": -0.7208220362663269, "logits/rejected": -0.7323756217956543, "logps/chosen": -71.64644622802734, "logps/rejected": -80.21810913085938, "loss": 13.6955, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.004168490879237652, "rewards/margins": 0.020404014736413956, "rewards/rejected": -0.01623552106320858, "step": 153 }, { "epoch": 0.08944647731892896, "grad_norm": 302.51763916015625, "learning_rate": 4.776292852992447e-06, "logits/chosen": -0.6920525431632996, "logits/rejected": -0.7309268712997437, "logps/chosen": -69.67924499511719, "logps/rejected": -76.42180633544922, "loss": 13.5794, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.005118966568261385, "rewards/margins": 0.031105687841773033, "rewards/rejected": -0.03622465208172798, "step": 154 }, { "epoch": 0.09002729860022071, "grad_norm": 316.7055358886719, "learning_rate": 4.7748402091807095e-06, "logits/chosen": -0.7559612989425659, "logits/rejected": -0.6078428626060486, "logps/chosen": -70.20340728759766, "logps/rejected": -66.18345642089844, "loss": 13.9598, "rewards/accuracies": 0.5, "rewards/chosen": -0.0274501983076334, "rewards/margins": -0.007842998020350933, "rewards/rejected": -0.019607199355959892, "step": 155 }, { "epoch": 0.09060811988151246, "grad_norm": 328.9933166503906, "learning_rate": 4.773387565368972e-06, "logits/chosen": -0.7114741802215576, "logits/rejected": -0.8936127424240112, "logps/chosen": -72.7677993774414, "logps/rejected": -77.62763214111328, "loss": 13.5946, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.008942861109972, "rewards/margins": 0.030701924115419388, "rewards/rejected": -0.03964478522539139, "step": 156 }, { "epoch": 0.09118894116280421, "grad_norm": 337.06060791015625, "learning_rate": 4.771934921557235e-06, "logits/chosen": -0.86445152759552, "logits/rejected": -0.8468233942985535, "logps/chosen": -84.74078369140625, "logps/rejected": -83.00550079345703, "loss": 13.9382, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.03650820627808571, "rewards/margins": -0.002918243408203125, "rewards/rejected": -0.03358996659517288, "step": 157 }, { "epoch": 0.09176976244409595, "grad_norm": 318.7173767089844, "learning_rate": 4.770482277745498e-06, "logits/chosen": -0.7902041673660278, "logits/rejected": -0.8102203607559204, "logps/chosen": -72.93890380859375, "logps/rejected": -77.42249298095703, "loss": 13.745, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.01063698623329401, "rewards/margins": 0.013827304355800152, "rewards/rejected": -0.02446429245173931, "step": 158 }, { "epoch": 0.0923505837253877, "grad_norm": 347.96240234375, "learning_rate": 4.76902963393376e-06, "logits/chosen": -0.8171346783638, "logits/rejected": -0.8012442588806152, "logps/chosen": -69.07205963134766, "logps/rejected": -80.80284118652344, "loss": 13.7243, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.05406276509165764, "rewards/margins": 0.016692936420440674, "rewards/rejected": -0.07075570523738861, "step": 159 }, { "epoch": 0.09293140500667944, "grad_norm": 312.94622802734375, "learning_rate": 4.767576990122022e-06, "logits/chosen": -0.8780553936958313, "logits/rejected": -0.7555993795394897, "logps/chosen": -75.97615814208984, "logps/rejected": -73.26390838623047, "loss": 13.8975, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.01142085064202547, "rewards/margins": -0.0006685241824015975, "rewards/rejected": -0.010752325877547264, "step": 160 }, { "epoch": 0.09351222628797119, "grad_norm": 319.6703186035156, "learning_rate": 4.766124346310285e-06, "logits/chosen": -0.9523868560791016, "logits/rejected": -0.9787250757217407, "logps/chosen": -78.85665130615234, "logps/rejected": -76.03120422363281, "loss": 13.7211, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.00805568601936102, "rewards/margins": 0.019266730174422264, "rewards/rejected": -0.027322417125105858, "step": 161 }, { "epoch": 0.09409304756926294, "grad_norm": 304.69439697265625, "learning_rate": 4.764671702498548e-06, "logits/chosen": -0.7667674422264099, "logits/rejected": -0.8059636354446411, "logps/chosen": -71.68733215332031, "logps/rejected": -74.61915588378906, "loss": 13.3629, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.03330715745687485, "rewards/margins": 0.05361800268292427, "rewards/rejected": -0.02031084895133972, "step": 162 }, { "epoch": 0.09467386885055469, "grad_norm": 302.20050048828125, "learning_rate": 4.763219058686811e-06, "logits/chosen": -0.7947182655334473, "logits/rejected": -0.8342369198799133, "logps/chosen": -71.64237213134766, "logps/rejected": -70.30358123779297, "loss": 13.4606, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.010867975652217865, "rewards/margins": 0.04301146790385246, "rewards/rejected": -0.053879447281360626, "step": 163 }, { "epoch": 0.09525469013184643, "grad_norm": 289.6075134277344, "learning_rate": 4.761766414875073e-06, "logits/chosen": -0.7613986134529114, "logits/rejected": -0.8836091160774231, "logps/chosen": -67.74214935302734, "logps/rejected": -78.71183776855469, "loss": 13.2522, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0031068138778209686, "rewards/margins": 0.06571229547262192, "rewards/rejected": -0.06881911307573318, "step": 164 }, { "epoch": 0.09583551141313817, "grad_norm": 326.1537780761719, "learning_rate": 4.760313771063336e-06, "logits/chosen": -0.8384913206100464, "logits/rejected": -0.7940191030502319, "logps/chosen": -66.64979553222656, "logps/rejected": -69.57666015625, "loss": 13.5813, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.01473635621368885, "rewards/margins": 0.02965986728668213, "rewards/rejected": -0.014923503622412682, "step": 165 }, { "epoch": 0.09641633269442992, "grad_norm": 317.4963684082031, "learning_rate": 4.758861127251598e-06, "logits/chosen": -0.7010576128959656, "logits/rejected": -0.7300230860710144, "logps/chosen": -76.84666442871094, "logps/rejected": -84.0490493774414, "loss": 13.9696, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.04205438122153282, "rewards/margins": -0.005721705500036478, "rewards/rejected": -0.03633267432451248, "step": 166 }, { "epoch": 0.09699715397572167, "grad_norm": 318.7678527832031, "learning_rate": 4.757408483439861e-06, "logits/chosen": -0.9276505708694458, "logits/rejected": -0.9561537504196167, "logps/chosen": -71.64440155029297, "logps/rejected": -80.91627502441406, "loss": 13.4072, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0023384944070130587, "rewards/margins": 0.04981740936636925, "rewards/rejected": -0.05215590447187424, "step": 167 }, { "epoch": 0.09757797525701342, "grad_norm": 287.4025573730469, "learning_rate": 4.7559558396281235e-06, "logits/chosen": -0.7570281624794006, "logits/rejected": -0.8512780070304871, "logps/chosen": -68.49506378173828, "logps/rejected": -70.23898315429688, "loss": 13.8651, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.007149294018745422, "rewards/margins": 0.0015381794655695558, "rewards/rejected": -0.008687476627528667, "step": 168 }, { "epoch": 0.09815879653830516, "grad_norm": 310.3047790527344, "learning_rate": 4.754503195816386e-06, "logits/chosen": -0.8154839277267456, "logits/rejected": -1.0209014415740967, "logps/chosen": -78.32124328613281, "logps/rejected": -68.3904037475586, "loss": 13.368, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.016737347468733788, "rewards/margins": 0.0557125024497509, "rewards/rejected": -0.038975149393081665, "step": 169 }, { "epoch": 0.0987396178195969, "grad_norm": 303.63385009765625, "learning_rate": 4.753050552004649e-06, "logits/chosen": -0.7536464333534241, "logits/rejected": -0.7967413663864136, "logps/chosen": -73.55670928955078, "logps/rejected": -75.25071716308594, "loss": 13.9022, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.022630508989095688, "rewards/margins": -0.0017447940772399306, "rewards/rejected": -0.020885715261101723, "step": 170 }, { "epoch": 0.09932043910088866, "grad_norm": 328.81494140625, "learning_rate": 4.751597908192912e-06, "logits/chosen": -0.9858807325363159, "logits/rejected": -0.9747235178947449, "logps/chosen": -79.53309631347656, "logps/rejected": -64.26655578613281, "loss": 14.1977, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.03506975993514061, "rewards/margins": -0.029723864048719406, "rewards/rejected": -0.005345895420759916, "step": 171 }, { "epoch": 0.0999012603821804, "grad_norm": 303.1661682128906, "learning_rate": 4.750145264381174e-06, "logits/chosen": -0.8304456472396851, "logits/rejected": -0.754758358001709, "logps/chosen": -79.57183837890625, "logps/rejected": -73.25392150878906, "loss": 13.3539, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0045074475929141045, "rewards/margins": 0.05535992234945297, "rewards/rejected": -0.059867363423109055, "step": 172 }, { "epoch": 0.10048208166347215, "grad_norm": 285.4842224121094, "learning_rate": 4.748692620569436e-06, "logits/chosen": -0.7988349199295044, "logits/rejected": -0.8758390545845032, "logps/chosen": -69.04563903808594, "logps/rejected": -63.191673278808594, "loss": 13.8072, "rewards/accuracies": 0.5, "rewards/chosen": -0.02564077451825142, "rewards/margins": 0.009737257845699787, "rewards/rejected": -0.03537803143262863, "step": 173 }, { "epoch": 0.1010629029447639, "grad_norm": 316.76220703125, "learning_rate": 4.747239976757699e-06, "logits/chosen": -0.8752381205558777, "logits/rejected": -0.9098547697067261, "logps/chosen": -71.7885513305664, "logps/rejected": -77.3386001586914, "loss": 13.8798, "rewards/accuracies": 0.5, "rewards/chosen": -0.03445696085691452, "rewards/margins": 0.00015344536222983152, "rewards/rejected": -0.03461039811372757, "step": 174 }, { "epoch": 0.10164372422605564, "grad_norm": 285.7683410644531, "learning_rate": 4.745787332945962e-06, "logits/chosen": -0.8894672393798828, "logits/rejected": -1.066870927810669, "logps/chosen": -71.97786712646484, "logps/rejected": -70.2954330444336, "loss": 13.8723, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.027886096388101578, "rewards/margins": 0.003271915018558502, "rewards/rejected": -0.03115800954401493, "step": 175 }, { "epoch": 0.10222454550734739, "grad_norm": 314.2114562988281, "learning_rate": 4.744334689134225e-06, "logits/chosen": -0.9301921725273132, "logits/rejected": -0.9640763401985168, "logps/chosen": -68.97576904296875, "logps/rejected": -73.05828857421875, "loss": 13.546, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.00339788431301713, "rewards/margins": 0.03368956595659256, "rewards/rejected": -0.03708745166659355, "step": 176 }, { "epoch": 0.10280536678863914, "grad_norm": 350.662353515625, "learning_rate": 4.742882045322487e-06, "logits/chosen": -0.9687705039978027, "logits/rejected": -0.8688680529594421, "logps/chosen": -76.81888580322266, "logps/rejected": -75.65296936035156, "loss": 14.2808, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.07494613528251648, "rewards/margins": -0.03318053483963013, "rewards/rejected": -0.04176560416817665, "step": 177 }, { "epoch": 0.10338618806993088, "grad_norm": 335.7806396484375, "learning_rate": 4.74142940151075e-06, "logits/chosen": -0.7820795178413391, "logits/rejected": -0.8843638300895691, "logps/chosen": -83.46534729003906, "logps/rejected": -73.49069213867188, "loss": 13.5938, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.011855507269501686, "rewards/margins": 0.030727148056030273, "rewards/rejected": -0.04258265346288681, "step": 178 }, { "epoch": 0.10396700935122263, "grad_norm": 298.5807800292969, "learning_rate": 4.739976757699012e-06, "logits/chosen": -0.9966332316398621, "logits/rejected": -0.8384010195732117, "logps/chosen": -75.45916748046875, "logps/rejected": -79.22760009765625, "loss": 13.5549, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.045658472925424576, "rewards/margins": 0.03411347419023514, "rewards/rejected": -0.07977195084095001, "step": 179 }, { "epoch": 0.10454783063251437, "grad_norm": 308.5312805175781, "learning_rate": 4.738524113887275e-06, "logits/chosen": -1.1542797088623047, "logits/rejected": -1.1540412902832031, "logps/chosen": -71.09564971923828, "logps/rejected": -66.72710418701172, "loss": 13.8693, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.01030341349542141, "rewards/margins": 0.0030408282764256, "rewards/rejected": -0.013344240374863148, "step": 180 }, { "epoch": 0.10512865191380612, "grad_norm": 382.357666015625, "learning_rate": 4.7370714700755375e-06, "logits/chosen": -0.8503016233444214, "logits/rejected": -0.8578587770462036, "logps/chosen": -67.1768798828125, "logps/rejected": -74.30778503417969, "loss": 13.5419, "rewards/accuracies": 0.5, "rewards/chosen": -0.024283096194267273, "rewards/margins": 0.036231573671102524, "rewards/rejected": -0.0605146698653698, "step": 181 }, { "epoch": 0.10570947319509787, "grad_norm": 316.55859375, "learning_rate": 4.7356188262638e-06, "logits/chosen": -0.8488529324531555, "logits/rejected": -0.8070418238639832, "logps/chosen": -75.79255676269531, "logps/rejected": -76.4787826538086, "loss": 13.6775, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.01909615471959114, "rewards/margins": 0.02385520003736019, "rewards/rejected": -0.04295135289430618, "step": 182 }, { "epoch": 0.10629029447638962, "grad_norm": 309.40887451171875, "learning_rate": 4.734166182452063e-06, "logits/chosen": -0.9303333163261414, "logits/rejected": -0.900216281414032, "logps/chosen": -74.81334686279297, "logps/rejected": -76.99653625488281, "loss": 14.0831, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.08230753988027573, "rewards/margins": -0.01739531196653843, "rewards/rejected": -0.06491222977638245, "step": 183 }, { "epoch": 0.10687111575768136, "grad_norm": 308.8415832519531, "learning_rate": 4.732713538640326e-06, "logits/chosen": -0.9111288785934448, "logits/rejected": -0.8835655450820923, "logps/chosen": -82.20323181152344, "logps/rejected": -69.86083221435547, "loss": 13.6755, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.03169438987970352, "rewards/margins": 0.023478079587221146, "rewards/rejected": -0.055172473192214966, "step": 184 }, { "epoch": 0.10745193703897311, "grad_norm": 294.7135925292969, "learning_rate": 4.7312608948285885e-06, "logits/chosen": -0.7559576630592346, "logits/rejected": -0.7548641562461853, "logps/chosen": -72.80931091308594, "logps/rejected": -68.47846221923828, "loss": 13.6632, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.03249110281467438, "rewards/margins": 0.026273757219314575, "rewards/rejected": -0.05876486748456955, "step": 185 }, { "epoch": 0.10803275832026485, "grad_norm": 300.0313720703125, "learning_rate": 4.7298082510168504e-06, "logits/chosen": -0.7012637853622437, "logits/rejected": -0.7670835256576538, "logps/chosen": -69.26997375488281, "logps/rejected": -75.66789245605469, "loss": 13.8065, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.05606495216488838, "rewards/margins": 0.008019248023629189, "rewards/rejected": -0.06408419460058212, "step": 186 }, { "epoch": 0.1086135796015566, "grad_norm": 305.9514465332031, "learning_rate": 4.728355607205113e-06, "logits/chosen": -0.848249614238739, "logits/rejected": -0.9618538022041321, "logps/chosen": -76.71751403808594, "logps/rejected": -75.86561584472656, "loss": 13.9142, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.058838047087192535, "rewards/margins": -0.0012072951067239046, "rewards/rejected": -0.057630755007267, "step": 187 }, { "epoch": 0.10919440088284835, "grad_norm": 299.03106689453125, "learning_rate": 4.726902963393376e-06, "logits/chosen": -0.7864362001419067, "logits/rejected": -0.9607146382331848, "logps/chosen": -73.52447509765625, "logps/rejected": -83.42733001708984, "loss": 13.0054, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.0022229477763175964, "rewards/margins": 0.09174972772598267, "rewards/rejected": -0.08952677994966507, "step": 188 }, { "epoch": 0.1097752221641401, "grad_norm": 301.38275146484375, "learning_rate": 4.7254503195816395e-06, "logits/chosen": -0.9863910675048828, "logits/rejected": -0.9701216816902161, "logps/chosen": -73.6039047241211, "logps/rejected": -74.17659759521484, "loss": 13.5522, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.027939602732658386, "rewards/margins": 0.033790357410907745, "rewards/rejected": -0.06172995641827583, "step": 189 }, { "epoch": 0.11035604344543185, "grad_norm": 321.6877136230469, "learning_rate": 4.723997675769902e-06, "logits/chosen": -0.796249508857727, "logits/rejected": -0.7661502957344055, "logps/chosen": -78.24516296386719, "logps/rejected": -74.38370513916016, "loss": 13.5199, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.038561441004276276, "rewards/margins": 0.039519913494586945, "rewards/rejected": -0.07808135449886322, "step": 190 }, { "epoch": 0.11093686472672358, "grad_norm": 317.3888854980469, "learning_rate": 4.722545031958164e-06, "logits/chosen": -0.9007613062858582, "logits/rejected": -0.9852052927017212, "logps/chosen": -82.073486328125, "logps/rejected": -72.91219329833984, "loss": 13.6764, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.03767850995063782, "rewards/margins": 0.023520758375525475, "rewards/rejected": -0.06119927018880844, "step": 191 }, { "epoch": 0.11151768600801533, "grad_norm": 316.7236328125, "learning_rate": 4.721092388146427e-06, "logits/chosen": -0.983284592628479, "logits/rejected": -0.8711267709732056, "logps/chosen": -76.91285705566406, "logps/rejected": -67.65292358398438, "loss": 14.1306, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.03306933492422104, "rewards/margins": -0.022902240976691246, "rewards/rejected": -0.010167093947529793, "step": 192 }, { "epoch": 0.11209850728930708, "grad_norm": 313.7123718261719, "learning_rate": 4.71963974433469e-06, "logits/chosen": -0.9627790451049805, "logits/rejected": -0.8592319488525391, "logps/chosen": -76.01704406738281, "logps/rejected": -72.71672058105469, "loss": 14.0736, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.049292050302028656, "rewards/margins": -0.01814861036837101, "rewards/rejected": -0.031143436208367348, "step": 193 }, { "epoch": 0.11267932857059883, "grad_norm": 303.0953063964844, "learning_rate": 4.718187100522952e-06, "logits/chosen": -0.8993155360221863, "logits/rejected": -0.8849833607673645, "logps/chosen": -71.47299194335938, "logps/rejected": -75.4497299194336, "loss": 13.3576, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.004056071862578392, "rewards/margins": 0.05458641052246094, "rewards/rejected": -0.050530336797237396, "step": 194 }, { "epoch": 0.11326014985189058, "grad_norm": 311.2488098144531, "learning_rate": 4.716734456711215e-06, "logits/chosen": -0.9742434620857239, "logits/rejected": -0.9584578275680542, "logps/chosen": -75.96842956542969, "logps/rejected": -71.90623474121094, "loss": 13.6582, "rewards/accuracies": 0.5, "rewards/chosen": -0.01095439214259386, "rewards/margins": 0.023807067424058914, "rewards/rejected": -0.0347614586353302, "step": 195 }, { "epoch": 0.11384097113318233, "grad_norm": 300.0875549316406, "learning_rate": 4.715281812899478e-06, "logits/chosen": -0.594025194644928, "logits/rejected": -0.6777793765068054, "logps/chosen": -74.22691345214844, "logps/rejected": -74.91129302978516, "loss": 13.5482, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0018765062559396029, "rewards/margins": 0.03627584129571915, "rewards/rejected": -0.03439933806657791, "step": 196 }, { "epoch": 0.11442179241447406, "grad_norm": 299.1294860839844, "learning_rate": 4.713829169087741e-06, "logits/chosen": -0.7969453930854797, "logits/rejected": -0.8590051531791687, "logps/chosen": -74.24263000488281, "logps/rejected": -80.28329467773438, "loss": 13.9374, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.029978638514876366, "rewards/margins": -0.0027882575523108244, "rewards/rejected": -0.027190377935767174, "step": 197 }, { "epoch": 0.11500261369576581, "grad_norm": 320.9808044433594, "learning_rate": 4.7123765252760026e-06, "logits/chosen": -0.9466894865036011, "logits/rejected": -1.0433050394058228, "logps/chosen": -72.66864776611328, "logps/rejected": -75.08808135986328, "loss": 14.3219, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.033920880407094955, "rewards/margins": -0.04234874248504639, "rewards/rejected": 0.008427867665886879, "step": 198 }, { "epoch": 0.11558343497705756, "grad_norm": 308.2682189941406, "learning_rate": 4.710923881464265e-06, "logits/chosen": -0.7543210387229919, "logits/rejected": -0.8157938718795776, "logps/chosen": -81.17222595214844, "logps/rejected": -72.63612365722656, "loss": 14.0541, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.022938327863812447, "rewards/margins": -0.015443983487784863, "rewards/rejected": -0.007494345307350159, "step": 199 }, { "epoch": 0.11616425625834931, "grad_norm": 312.9922180175781, "learning_rate": 4.709471237652528e-06, "logits/chosen": -0.6948032975196838, "logits/rejected": -0.6942026615142822, "logps/chosen": -73.12284851074219, "logps/rejected": -77.82487487792969, "loss": 13.9199, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.029393082484602928, "rewards/margins": -0.0029410452116280794, "rewards/rejected": -0.02645203471183777, "step": 200 }, { "epoch": 0.11674507753964106, "grad_norm": 311.9326477050781, "learning_rate": 4.708018593840791e-06, "logits/chosen": -1.1453566551208496, "logits/rejected": -1.1129220724105835, "logps/chosen": -68.6366958618164, "logps/rejected": -73.55378723144531, "loss": 13.8228, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.010440112091600895, "rewards/margins": 0.009912310168147087, "rewards/rejected": -0.020352421328425407, "step": 201 }, { "epoch": 0.1173258988209328, "grad_norm": 308.12060546875, "learning_rate": 4.7065659500290536e-06, "logits/chosen": -0.8134487271308899, "logits/rejected": -0.7991577386856079, "logps/chosen": -70.990966796875, "logps/rejected": -78.31260681152344, "loss": 13.2819, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.004076024051755667, "rewards/margins": 0.06265188753604889, "rewards/rejected": -0.06672791391611099, "step": 202 }, { "epoch": 0.11790672010222454, "grad_norm": 287.57794189453125, "learning_rate": 4.705113306217316e-06, "logits/chosen": -0.9393211603164673, "logits/rejected": -0.9954279065132141, "logps/chosen": -76.2332534790039, "logps/rejected": -70.47115325927734, "loss": 13.2255, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.00021018953702878207, "rewards/margins": 0.06832081824541092, "rewards/rejected": -0.06853101402521133, "step": 203 }, { "epoch": 0.11848754138351629, "grad_norm": 305.3521423339844, "learning_rate": 4.703660662405578e-06, "logits/chosen": -0.9575725793838501, "logits/rejected": -1.0199588537216187, "logps/chosen": -80.7159423828125, "logps/rejected": -67.89506530761719, "loss": 14.0192, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.06569384038448334, "rewards/margins": -0.01254147756844759, "rewards/rejected": -0.053152360022068024, "step": 204 }, { "epoch": 0.11906836266480804, "grad_norm": 278.6866760253906, "learning_rate": 4.702208018593841e-06, "logits/chosen": -0.7608988881111145, "logits/rejected": -0.7628680467605591, "logps/chosen": -70.3031005859375, "logps/rejected": -70.50173950195312, "loss": 13.39, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.021148493513464928, "rewards/margins": 0.052839674055576324, "rewards/rejected": -0.0739881694316864, "step": 205 }, { "epoch": 0.11964918394609979, "grad_norm": 304.92913818359375, "learning_rate": 4.700755374782104e-06, "logits/chosen": -0.8696325421333313, "logits/rejected": -0.9427449107170105, "logps/chosen": -74.74107360839844, "logps/rejected": -75.46221160888672, "loss": 13.7106, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.051121801137924194, "rewards/margins": 0.018347539007663727, "rewards/rejected": -0.06946934014558792, "step": 206 }, { "epoch": 0.12023000522739154, "grad_norm": 315.7526550292969, "learning_rate": 4.6993027309703665e-06, "logits/chosen": -0.8430768251419067, "logits/rejected": -0.7843033075332642, "logps/chosen": -71.7156982421875, "logps/rejected": -74.4084243774414, "loss": 13.8928, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.024820514023303986, "rewards/margins": -0.0010401479667052627, "rewards/rejected": -0.023780368268489838, "step": 207 }, { "epoch": 0.12081082650868327, "grad_norm": 380.44561767578125, "learning_rate": 4.697850087158629e-06, "logits/chosen": -0.8150386810302734, "logits/rejected": -0.7826833724975586, "logps/chosen": -76.09224700927734, "logps/rejected": -83.32121276855469, "loss": 13.609, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.032852984964847565, "rewards/margins": 0.02871485985815525, "rewards/rejected": -0.061567842960357666, "step": 208 }, { "epoch": 0.12139164778997502, "grad_norm": 330.4120788574219, "learning_rate": 4.696397443346892e-06, "logits/chosen": -1.0164332389831543, "logits/rejected": -0.9415372610092163, "logps/chosen": -78.38670349121094, "logps/rejected": -79.13145446777344, "loss": 14.0359, "rewards/accuracies": 0.5, "rewards/chosen": -0.061431754380464554, "rewards/margins": -0.01327196042984724, "rewards/rejected": -0.04815979301929474, "step": 209 }, { "epoch": 0.12197246907126677, "grad_norm": 336.34759521484375, "learning_rate": 4.694944799535155e-06, "logits/chosen": -0.9332435727119446, "logits/rejected": -0.9620237350463867, "logps/chosen": -78.70216369628906, "logps/rejected": -90.57441711425781, "loss": 13.5176, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.00978778675198555, "rewards/margins": 0.0402628593146801, "rewards/rejected": -0.050050653517246246, "step": 210 }, { "epoch": 0.12255329035255852, "grad_norm": 303.8833923339844, "learning_rate": 4.693492155723417e-06, "logits/chosen": -0.9775910377502441, "logits/rejected": -0.9736326932907104, "logps/chosen": -78.42801666259766, "logps/rejected": -73.2563705444336, "loss": 14.1172, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.021739017218351364, "rewards/margins": -0.02232358045876026, "rewards/rejected": 0.0005845635896548629, "step": 211 }, { "epoch": 0.12313411163385027, "grad_norm": 307.4227294921875, "learning_rate": 4.692039511911679e-06, "logits/chosen": -0.8911228179931641, "logits/rejected": -1.0791237354278564, "logps/chosen": -65.61485290527344, "logps/rejected": -75.7813720703125, "loss": 13.3959, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.014038696885108948, "rewards/margins": 0.05425255373120308, "rewards/rejected": -0.04021385312080383, "step": 212 }, { "epoch": 0.12371493291514202, "grad_norm": 287.40045166015625, "learning_rate": 4.690586868099942e-06, "logits/chosen": -0.8995906710624695, "logits/rejected": -0.8471341133117676, "logps/chosen": -66.9594497680664, "logps/rejected": -71.04551696777344, "loss": 13.6394, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.01811062917113304, "rewards/margins": 0.027448922395706177, "rewards/rejected": -0.04555954784154892, "step": 213 }, { "epoch": 0.12429575419643375, "grad_norm": 306.50128173828125, "learning_rate": 4.689134224288205e-06, "logits/chosen": -0.8109323382377625, "logits/rejected": -0.7679190635681152, "logps/chosen": -66.3537826538086, "logps/rejected": -78.93473815917969, "loss": 13.4859, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.04216752201318741, "rewards/margins": 0.0444951057434082, "rewards/rejected": -0.08666262775659561, "step": 214 }, { "epoch": 0.1248765754777255, "grad_norm": 327.2640075683594, "learning_rate": 4.687681580476468e-06, "logits/chosen": -0.895319938659668, "logits/rejected": -0.9245772361755371, "logps/chosen": -69.65949249267578, "logps/rejected": -71.36875915527344, "loss": 14.5911, "rewards/accuracies": 0.25, "rewards/chosen": -0.08888588845729828, "rewards/margins": -0.06758525222539902, "rewards/rejected": -0.021300649270415306, "step": 215 }, { "epoch": 0.12545739675901724, "grad_norm": 322.3681640625, "learning_rate": 4.68622893666473e-06, "logits/chosen": -0.8954951167106628, "logits/rejected": -0.81683349609375, "logps/chosen": -82.0992431640625, "logps/rejected": -71.04183197021484, "loss": 14.0854, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0422581322491169, "rewards/margins": -0.01680077239871025, "rewards/rejected": -0.025457357987761497, "step": 216 }, { "epoch": 0.12603821804030899, "grad_norm": 332.36920166015625, "learning_rate": 4.684776292852993e-06, "logits/chosen": -0.887475311756134, "logits/rejected": -0.7960731983184814, "logps/chosen": -74.29244995117188, "logps/rejected": -81.73658752441406, "loss": 14.184, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.09383013099431992, "rewards/margins": -0.027784889563918114, "rewards/rejected": -0.06604524701833725, "step": 217 }, { "epoch": 0.12661903932160073, "grad_norm": 329.1742858886719, "learning_rate": 4.683323649041255e-06, "logits/chosen": -0.7891820073127747, "logits/rejected": -1.006798505783081, "logps/chosen": -77.01458740234375, "logps/rejected": -71.24549102783203, "loss": 14.3551, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.07353498041629791, "rewards/margins": -0.045538146048784256, "rewards/rejected": -0.02799682877957821, "step": 218 }, { "epoch": 0.12719986060289248, "grad_norm": 300.3213806152344, "learning_rate": 4.681871005229518e-06, "logits/chosen": -0.9636018872261047, "logits/rejected": -1.0088518857955933, "logps/chosen": -64.98380279541016, "logps/rejected": -71.98371887207031, "loss": 13.5981, "rewards/accuracies": 0.5, "rewards/chosen": -0.009727184660732746, "rewards/margins": 0.030616506934165955, "rewards/rejected": -0.04034368693828583, "step": 219 }, { "epoch": 0.12778068188418423, "grad_norm": 316.0754699707031, "learning_rate": 4.6804183614177805e-06, "logits/chosen": -0.8850613832473755, "logits/rejected": -0.9165294766426086, "logps/chosen": -77.82759857177734, "logps/rejected": -73.77207946777344, "loss": 13.467, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.020776253193616867, "rewards/margins": 0.04324489086866379, "rewards/rejected": -0.022468645125627518, "step": 220 }, { "epoch": 0.12836150316547598, "grad_norm": 298.15130615234375, "learning_rate": 4.678965717606043e-06, "logits/chosen": -0.9083736538887024, "logits/rejected": -0.9166957139968872, "logps/chosen": -64.89097595214844, "logps/rejected": -71.65401458740234, "loss": 13.4666, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.004524010233581066, "rewards/margins": 0.045348040759563446, "rewards/rejected": -0.049872055649757385, "step": 221 }, { "epoch": 0.12894232444676773, "grad_norm": 328.2380065917969, "learning_rate": 4.677513073794306e-06, "logits/chosen": -0.8878594636917114, "logits/rejected": -0.8982254862785339, "logps/chosen": -74.40402221679688, "logps/rejected": -67.42572021484375, "loss": 14.4278, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.07353280484676361, "rewards/margins": -0.05194491147994995, "rewards/rejected": -0.02158789522945881, "step": 222 }, { "epoch": 0.12952314572805948, "grad_norm": 312.8938903808594, "learning_rate": 4.676060429982569e-06, "logits/chosen": -0.9499589204788208, "logits/rejected": -0.9511539340019226, "logps/chosen": -74.2853012084961, "logps/rejected": -74.38381958007812, "loss": 13.8183, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.04615163058042526, "rewards/margins": 0.010002289898693562, "rewards/rejected": -0.05615391582250595, "step": 223 }, { "epoch": 0.13010396700935123, "grad_norm": 308.1676330566406, "learning_rate": 4.6746077861708315e-06, "logits/chosen": -0.9691025614738464, "logits/rejected": -1.0638432502746582, "logps/chosen": -68.75132751464844, "logps/rejected": -64.51249694824219, "loss": 14.5132, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.0783403068780899, "rewards/margins": -0.058567456901073456, "rewards/rejected": -0.019772853702306747, "step": 224 }, { "epoch": 0.13068478829064298, "grad_norm": 300.42218017578125, "learning_rate": 4.673155142359093e-06, "logits/chosen": -0.9048269391059875, "logits/rejected": -0.9264401197433472, "logps/chosen": -73.9933090209961, "logps/rejected": -80.50711822509766, "loss": 13.5877, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.017903203144669533, "rewards/margins": 0.03073006495833397, "rewards/rejected": -0.04863326996564865, "step": 225 }, { "epoch": 0.13126560957193473, "grad_norm": 344.2493896484375, "learning_rate": 4.671702498547356e-06, "logits/chosen": -0.8985874056816101, "logits/rejected": -0.843769371509552, "logps/chosen": -72.87957000732422, "logps/rejected": -78.47367095947266, "loss": 13.6586, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.01365675963461399, "rewards/margins": 0.0262027345597744, "rewards/rejected": -0.03985949605703354, "step": 226 }, { "epoch": 0.13184643085322645, "grad_norm": 314.46856689453125, "learning_rate": 4.670249854735619e-06, "logits/chosen": -0.9261584281921387, "logits/rejected": -1.041046142578125, "logps/chosen": -76.67170715332031, "logps/rejected": -66.8155517578125, "loss": 13.9811, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.03505679965019226, "rewards/margins": -0.007680593524128199, "rewards/rejected": -0.0273762084543705, "step": 227 }, { "epoch": 0.1324272521345182, "grad_norm": 341.5300598144531, "learning_rate": 4.668797210923882e-06, "logits/chosen": -0.8199512362480164, "logits/rejected": -0.8454850316047668, "logps/chosen": -63.34907150268555, "logps/rejected": -67.3141098022461, "loss": 13.2625, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.009090743958950043, "rewards/margins": 0.06578489392995834, "rewards/rejected": -0.0566941499710083, "step": 228 }, { "epoch": 0.13300807341580995, "grad_norm": 301.3994445800781, "learning_rate": 4.667344567112144e-06, "logits/chosen": -1.106350302696228, "logits/rejected": -1.0245457887649536, "logps/chosen": -72.74774932861328, "logps/rejected": -74.6384506225586, "loss": 14.0679, "rewards/accuracies": 0.5, "rewards/chosen": -0.011247700080275536, "rewards/margins": -0.01573079079389572, "rewards/rejected": 0.004483087919652462, "step": 229 }, { "epoch": 0.1335888946971017, "grad_norm": 309.1897277832031, "learning_rate": 4.665891923300407e-06, "logits/chosen": -0.9697766304016113, "logits/rejected": -1.008512258529663, "logps/chosen": -78.22880554199219, "logps/rejected": -62.13957977294922, "loss": 13.917, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.030580034479498863, "rewards/margins": -0.0008186303311958909, "rewards/rejected": -0.029761407524347305, "step": 230 }, { "epoch": 0.13416971597839344, "grad_norm": 332.3008117675781, "learning_rate": 4.66443927948867e-06, "logits/chosen": -0.9368025064468384, "logits/rejected": -0.8751896023750305, "logps/chosen": -80.72710418701172, "logps/rejected": -75.58308410644531, "loss": 13.9649, "rewards/accuracies": 0.5, "rewards/chosen": -0.060019601136446, "rewards/margins": -0.0033089532516896725, "rewards/rejected": -0.05671064928174019, "step": 231 }, { "epoch": 0.1347505372596852, "grad_norm": 326.7151794433594, "learning_rate": 4.662986635676933e-06, "logits/chosen": -0.9234104156494141, "logits/rejected": -0.9875878095626831, "logps/chosen": -79.9911117553711, "logps/rejected": -80.50080871582031, "loss": 13.995, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.05462227389216423, "rewards/margins": -0.009203016757965088, "rewards/rejected": -0.04541926831007004, "step": 232 }, { "epoch": 0.13533135854097694, "grad_norm": 297.00897216796875, "learning_rate": 4.661533991865195e-06, "logits/chosen": -0.6637614965438843, "logits/rejected": -0.8283042907714844, "logps/chosen": -71.94387817382812, "logps/rejected": -65.46293640136719, "loss": 14.1861, "rewards/accuracies": 0.5, "rewards/chosen": -0.0613144226372242, "rewards/margins": -0.02849789522588253, "rewards/rejected": -0.03281652554869652, "step": 233 }, { "epoch": 0.1359121798222687, "grad_norm": 308.1539306640625, "learning_rate": 4.660081348053458e-06, "logits/chosen": -1.0005970001220703, "logits/rejected": -1.005382776260376, "logps/chosen": -75.04243469238281, "logps/rejected": -70.30781555175781, "loss": 13.5591, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.005779562052339315, "rewards/margins": 0.03400464728474617, "rewards/rejected": -0.03978421539068222, "step": 234 }, { "epoch": 0.13649300110356044, "grad_norm": 314.4703674316406, "learning_rate": 4.658628704241721e-06, "logits/chosen": -0.8988968729972839, "logits/rejected": -0.9323342442512512, "logps/chosen": -70.89219665527344, "logps/rejected": -72.37486267089844, "loss": 13.8039, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.03381185606122017, "rewards/margins": 0.010169675573706627, "rewards/rejected": -0.04398152977228165, "step": 235 }, { "epoch": 0.1370738223848522, "grad_norm": 318.1397705078125, "learning_rate": 4.657176060429983e-06, "logits/chosen": -0.9645525217056274, "logits/rejected": -0.8109456896781921, "logps/chosen": -74.8411865234375, "logps/rejected": -73.68878936767578, "loss": 14.1379, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.045103732496500015, "rewards/margins": -0.02444007806479931, "rewards/rejected": -0.020663652569055557, "step": 236 }, { "epoch": 0.13765464366614394, "grad_norm": 297.63604736328125, "learning_rate": 4.6557234166182455e-06, "logits/chosen": -0.9129088521003723, "logits/rejected": -0.9517717361450195, "logps/chosen": -68.26078033447266, "logps/rejected": -78.68707275390625, "loss": 13.3907, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.013581380248069763, "rewards/margins": 0.05378935858607292, "rewards/rejected": -0.06737073510885239, "step": 237 }, { "epoch": 0.1382354649474357, "grad_norm": 322.44964599609375, "learning_rate": 4.654270772806508e-06, "logits/chosen": -1.0076889991760254, "logits/rejected": -0.9823756217956543, "logps/chosen": -70.8648681640625, "logps/rejected": -75.48307037353516, "loss": 13.5039, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.021284928545355797, "rewards/margins": 0.039048969745635986, "rewards/rejected": -0.06033390760421753, "step": 238 }, { "epoch": 0.1388162862287274, "grad_norm": 320.17193603515625, "learning_rate": 4.652818128994771e-06, "logits/chosen": -0.9012085199356079, "logits/rejected": -1.0353925228118896, "logps/chosen": -77.64144134521484, "logps/rejected": -72.55008697509766, "loss": 13.9087, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.0533623993396759, "rewards/margins": -0.0019609176088124514, "rewards/rejected": -0.05140148475766182, "step": 239 }, { "epoch": 0.13939710751001916, "grad_norm": 291.0409851074219, "learning_rate": 4.651365485183034e-06, "logits/chosen": -0.7694844603538513, "logits/rejected": -0.7582476735115051, "logps/chosen": -66.96981048583984, "logps/rejected": -69.53636169433594, "loss": 13.8481, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.029136648401618004, "rewards/margins": 0.006323431618511677, "rewards/rejected": -0.035460080951452255, "step": 240 }, { "epoch": 0.1399779287913109, "grad_norm": 309.0951843261719, "learning_rate": 4.6499128413712965e-06, "logits/chosen": -1.09603750705719, "logits/rejected": -1.1345731019973755, "logps/chosen": -69.8927001953125, "logps/rejected": -76.84407806396484, "loss": 13.2567, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.028979092836380005, "rewards/margins": 0.06684218347072601, "rewards/rejected": -0.03786309435963631, "step": 241 }, { "epoch": 0.14055875007260266, "grad_norm": 341.45458984375, "learning_rate": 4.648460197559559e-06, "logits/chosen": -0.7778705358505249, "logits/rejected": -0.9100780487060547, "logps/chosen": -83.03035736083984, "logps/rejected": -76.86042785644531, "loss": 13.7437, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.028774719685316086, "rewards/margins": 0.01645464077591896, "rewards/rejected": -0.04522935673594475, "step": 242 }, { "epoch": 0.1411395713538944, "grad_norm": 292.6492919921875, "learning_rate": 4.647007553747821e-06, "logits/chosen": -1.0126192569732666, "logits/rejected": -0.8217967748641968, "logps/chosen": -78.85697937011719, "logps/rejected": -69.64402770996094, "loss": 13.8864, "rewards/accuracies": 0.5, "rewards/chosen": -0.031996484845876694, "rewards/margins": 0.002992384135723114, "rewards/rejected": -0.034988872706890106, "step": 243 }, { "epoch": 0.14172039263518615, "grad_norm": 312.7855529785156, "learning_rate": 4.645554909936084e-06, "logits/chosen": -0.9456573724746704, "logits/rejected": -0.9571585655212402, "logps/chosen": -74.84808349609375, "logps/rejected": -71.9338150024414, "loss": 14.0443, "rewards/accuracies": 0.5, "rewards/chosen": -0.046040650457143784, "rewards/margins": -0.014010493643581867, "rewards/rejected": -0.03203015774488449, "step": 244 }, { "epoch": 0.1423012139164779, "grad_norm": 318.9042053222656, "learning_rate": 4.644102266124347e-06, "logits/chosen": -0.7081719636917114, "logits/rejected": -0.8551041483879089, "logps/chosen": -77.06927490234375, "logps/rejected": -80.43268585205078, "loss": 13.833, "rewards/accuracies": 0.5, "rewards/chosen": -0.04912562295794487, "rewards/margins": 0.007158022373914719, "rewards/rejected": -0.05628364533185959, "step": 245 }, { "epoch": 0.14288203519776965, "grad_norm": 333.2130126953125, "learning_rate": 4.642649622312609e-06, "logits/chosen": -1.046338438987732, "logits/rejected": -1.0311440229415894, "logps/chosen": -76.91915893554688, "logps/rejected": -76.76275634765625, "loss": 13.7829, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.005943517200648785, "rewards/margins": 0.011601355858147144, "rewards/rejected": -0.0056578353978693485, "step": 246 }, { "epoch": 0.1434628564790614, "grad_norm": 315.692626953125, "learning_rate": 4.641196978500872e-06, "logits/chosen": -0.7881887555122375, "logits/rejected": -0.8423610925674438, "logps/chosen": -72.17633056640625, "logps/rejected": -80.27653503417969, "loss": 13.5325, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.007096032612025738, "rewards/margins": 0.035623110830783844, "rewards/rejected": -0.042719148099422455, "step": 247 }, { "epoch": 0.14404367776035315, "grad_norm": 324.49945068359375, "learning_rate": 4.639744334689135e-06, "logits/chosen": -0.9588478803634644, "logits/rejected": -0.9462132453918457, "logps/chosen": -77.13792419433594, "logps/rejected": -72.57328796386719, "loss": 13.6316, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0016011804109439254, "rewards/margins": 0.02814745344221592, "rewards/rejected": -0.02974862977862358, "step": 248 }, { "epoch": 0.1446244990416449, "grad_norm": 299.9615478515625, "learning_rate": 4.638291690877398e-06, "logits/chosen": -0.857986569404602, "logits/rejected": -0.9681297540664673, "logps/chosen": -73.25657653808594, "logps/rejected": -80.2601547241211, "loss": 13.4933, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.00817027036100626, "rewards/margins": 0.04146546870470047, "rewards/rejected": -0.04963573440909386, "step": 249 }, { "epoch": 0.14520532032293662, "grad_norm": 302.1356201171875, "learning_rate": 4.6368390470656596e-06, "logits/chosen": -0.9270285367965698, "logits/rejected": -1.053038239479065, "logps/chosen": -65.88447570800781, "logps/rejected": -72.62715148925781, "loss": 14.0395, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.032028716057538986, "rewards/margins": -0.014700132422149181, "rewards/rejected": -0.01732858456671238, "step": 250 }, { "epoch": 0.14578614160422837, "grad_norm": 296.97564697265625, "learning_rate": 4.635386403253922e-06, "logits/chosen": -0.8831275105476379, "logits/rejected": -0.8724244832992554, "logps/chosen": -66.05812072753906, "logps/rejected": -67.20362854003906, "loss": 13.9448, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0004748806240968406, "rewards/margins": -0.0028558894991874695, "rewards/rejected": 0.0033307753037661314, "step": 251 }, { "epoch": 0.14636696288552012, "grad_norm": 315.11810302734375, "learning_rate": 4.633933759442185e-06, "logits/chosen": -1.0385617017745972, "logits/rejected": -1.0641318559646606, "logps/chosen": -72.60139465332031, "logps/rejected": -70.97787475585938, "loss": 13.838, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.012739667668938637, "rewards/margins": 0.007387256715446711, "rewards/rejected": -0.02012692391872406, "step": 252 }, { "epoch": 0.14694778416681187, "grad_norm": 315.47259521484375, "learning_rate": 4.632481115630448e-06, "logits/chosen": -0.8671859502792358, "logits/rejected": -0.8790918588638306, "logps/chosen": -80.53587341308594, "logps/rejected": -78.8593978881836, "loss": 13.9845, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.034586913883686066, "rewards/margins": -0.007644425146281719, "rewards/rejected": -0.02694249153137207, "step": 253 }, { "epoch": 0.14752860544810362, "grad_norm": 310.9654541015625, "learning_rate": 4.6310284718187105e-06, "logits/chosen": -0.8506280183792114, "logits/rejected": -0.9469968676567078, "logps/chosen": -83.29402923583984, "logps/rejected": -84.33534240722656, "loss": 13.4946, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.028773630037903786, "rewards/margins": 0.0417972095310688, "rewards/rejected": -0.07057084143161774, "step": 254 }, { "epoch": 0.14810942672939537, "grad_norm": 304.36968994140625, "learning_rate": 4.629575828006973e-06, "logits/chosen": -0.7221881151199341, "logits/rejected": -0.8106364011764526, "logps/chosen": -73.52366638183594, "logps/rejected": -76.93601989746094, "loss": 13.6697, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.017531022429466248, "rewards/margins": 0.022295940667390823, "rewards/rejected": -0.03982696682214737, "step": 255 }, { "epoch": 0.14869024801068711, "grad_norm": 731.6671752929688, "learning_rate": 4.628123184195236e-06, "logits/chosen": -1.0281916856765747, "logits/rejected": -1.0932250022888184, "logps/chosen": -73.1572036743164, "logps/rejected": -76.2696304321289, "loss": 13.9093, "rewards/accuracies": 0.5, "rewards/chosen": -0.009217451326549053, "rewards/margins": -0.0013483152724802494, "rewards/rejected": -0.007869137451052666, "step": 256 }, { "epoch": 0.14927106929197886, "grad_norm": 317.27630615234375, "learning_rate": 4.626670540383498e-06, "logits/chosen": -0.7391899824142456, "logits/rejected": -0.8331745862960815, "logps/chosen": -70.43421936035156, "logps/rejected": -76.86418151855469, "loss": 13.9393, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.015171433798968792, "rewards/margins": -0.0031314187217503786, "rewards/rejected": 0.018302852287888527, "step": 257 }, { "epoch": 0.1498518905732706, "grad_norm": 323.65386962890625, "learning_rate": 4.625217896571761e-06, "logits/chosen": -0.8975432515144348, "logits/rejected": -1.0589958429336548, "logps/chosen": -75.39137268066406, "logps/rejected": -71.28562927246094, "loss": 13.5174, "rewards/accuracies": 0.75, "rewards/chosen": -0.009832754731178284, "rewards/margins": 0.03878428786993027, "rewards/rejected": -0.04861704260110855, "step": 258 }, { "epoch": 0.15043271185456236, "grad_norm": 304.92144775390625, "learning_rate": 4.6237652527600234e-06, "logits/chosen": -0.7835612297058105, "logits/rejected": -0.8767670392990112, "logps/chosen": -73.34691619873047, "logps/rejected": -70.6404037475586, "loss": 14.2398, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.04404080659151077, "rewards/margins": -0.032345980405807495, "rewards/rejected": -0.011694823391735554, "step": 259 }, { "epoch": 0.1510135331358541, "grad_norm": 324.8614196777344, "learning_rate": 4.622312608948286e-06, "logits/chosen": -0.6987928152084351, "logits/rejected": -0.765534520149231, "logps/chosen": -72.92799377441406, "logps/rejected": -69.84391784667969, "loss": 13.887, "rewards/accuracies": 0.5, "rewards/chosen": -0.027519341558218002, "rewards/margins": 0.0016624340787529945, "rewards/rejected": -0.02918177843093872, "step": 260 }, { "epoch": 0.15159435441714583, "grad_norm": 311.1237487792969, "learning_rate": 4.620859965136549e-06, "logits/chosen": -0.7852567434310913, "logits/rejected": -0.9455236196517944, "logps/chosen": -72.65814971923828, "logps/rejected": -79.49076080322266, "loss": 13.4405, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.02082516811788082, "rewards/margins": 0.046132300049066544, "rewards/rejected": -0.06695746630430222, "step": 261 }, { "epoch": 0.15217517569843758, "grad_norm": 312.7984924316406, "learning_rate": 4.619407321324812e-06, "logits/chosen": -0.9117915034294128, "logits/rejected": -0.9733545184135437, "logps/chosen": -77.04357147216797, "logps/rejected": -66.96760559082031, "loss": 14.3348, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.0389360710978508, "rewards/margins": -0.043142665177583694, "rewards/rejected": 0.0042065903544425964, "step": 262 }, { "epoch": 0.15275599697972933, "grad_norm": 338.8244934082031, "learning_rate": 4.6179546775130744e-06, "logits/chosen": -0.8544296026229858, "logits/rejected": -0.8088324666023254, "logps/chosen": -76.08251190185547, "logps/rejected": -74.12071990966797, "loss": 13.896, "rewards/accuracies": 0.5, "rewards/chosen": -0.017584245651960373, "rewards/margins": -0.000663819897454232, "rewards/rejected": -0.0169204268604517, "step": 263 }, { "epoch": 0.15333681826102108, "grad_norm": 286.11407470703125, "learning_rate": 4.616502033701336e-06, "logits/chosen": -0.8225248456001282, "logits/rejected": -0.8068425059318542, "logps/chosen": -75.96072387695312, "logps/rejected": -70.82938385009766, "loss": 13.5013, "rewards/accuracies": 0.75, "rewards/chosen": 0.02497541345655918, "rewards/margins": 0.04040759429335594, "rewards/rejected": -0.01543218083679676, "step": 264 }, { "epoch": 0.15391763954231283, "grad_norm": 319.89794921875, "learning_rate": 4.615049389889599e-06, "logits/chosen": -0.9208014607429504, "logits/rejected": -0.9296444654464722, "logps/chosen": -75.55086517333984, "logps/rejected": -79.83537292480469, "loss": 13.6912, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0359053872525692, "rewards/margins": 0.024030333384871483, "rewards/rejected": -0.05993572622537613, "step": 265 }, { "epoch": 0.15449846082360458, "grad_norm": 360.5916442871094, "learning_rate": 4.613596746077862e-06, "logits/chosen": -0.745102047920227, "logits/rejected": -0.7995504140853882, "logps/chosen": -75.1971206665039, "logps/rejected": -68.94092559814453, "loss": 13.8718, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.02639504335820675, "rewards/margins": 0.0018470294307917356, "rewards/rejected": -0.028242075815796852, "step": 266 }, { "epoch": 0.15507928210489633, "grad_norm": 306.31646728515625, "learning_rate": 4.612144102266125e-06, "logits/chosen": -0.7601212859153748, "logits/rejected": -0.8015406727790833, "logps/chosen": -79.32325744628906, "logps/rejected": -73.96139526367188, "loss": 14.0699, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.05416913703083992, "rewards/margins": -0.017042648047208786, "rewards/rejected": -0.037126488983631134, "step": 267 }, { "epoch": 0.15566010338618808, "grad_norm": 371.703125, "learning_rate": 4.610691458454387e-06, "logits/chosen": -0.9163106679916382, "logits/rejected": -1.0520881414413452, "logps/chosen": -78.73876953125, "logps/rejected": -79.40687561035156, "loss": 13.5618, "rewards/accuracies": 0.5, "rewards/chosen": -0.009102868847548962, "rewards/margins": 0.032982636243104935, "rewards/rejected": -0.04208550602197647, "step": 268 }, { "epoch": 0.15624092466747982, "grad_norm": 309.1504211425781, "learning_rate": 4.60923881464265e-06, "logits/chosen": -0.9318111538887024, "logits/rejected": -0.9711005091667175, "logps/chosen": -74.52928924560547, "logps/rejected": -78.32316589355469, "loss": 13.5172, "rewards/accuracies": 0.5, "rewards/chosen": -0.03735839203000069, "rewards/margins": 0.03915861248970032, "rewards/rejected": -0.0765170007944107, "step": 269 }, { "epoch": 0.15682174594877157, "grad_norm": 297.4718017578125, "learning_rate": 4.607786170830912e-06, "logits/chosen": -0.9283093214035034, "logits/rejected": -0.9299993515014648, "logps/chosen": -73.22312927246094, "logps/rejected": -69.17174530029297, "loss": 13.9256, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0193580724298954, "rewards/margins": -0.0019154436886310577, "rewards/rejected": -0.017442626878619194, "step": 270 }, { "epoch": 0.15740256723006332, "grad_norm": 315.2553405761719, "learning_rate": 4.606333527019175e-06, "logits/chosen": -0.8597780466079712, "logits/rejected": -0.811455249786377, "logps/chosen": -67.54701232910156, "logps/rejected": -77.47270965576172, "loss": 13.5603, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0011511326301842928, "rewards/margins": 0.03523620590567589, "rewards/rejected": -0.03638733923435211, "step": 271 }, { "epoch": 0.15798338851135504, "grad_norm": 788.3867797851562, "learning_rate": 4.6048808832074375e-06, "logits/chosen": -0.8214397430419922, "logits/rejected": -0.791312575340271, "logps/chosen": -69.90260314941406, "logps/rejected": -70.34420776367188, "loss": 13.6075, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.010176298208534718, "rewards/margins": 0.028864461928606033, "rewards/rejected": -0.03904075548052788, "step": 272 }, { "epoch": 0.1585642097926468, "grad_norm": 301.57794189453125, "learning_rate": 4.603428239395701e-06, "logits/chosen": -0.8692893981933594, "logits/rejected": -0.8743621706962585, "logps/chosen": -68.12577056884766, "logps/rejected": -66.91731262207031, "loss": 13.7415, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.013723989017307758, "rewards/margins": 0.015195539221167564, "rewards/rejected": -0.028919529169797897, "step": 273 }, { "epoch": 0.15914503107393854, "grad_norm": 290.8895263671875, "learning_rate": 4.601975595583964e-06, "logits/chosen": -0.8542564511299133, "logits/rejected": -0.8347529172897339, "logps/chosen": -73.254150390625, "logps/rejected": -68.77384948730469, "loss": 13.5118, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0077955336309969425, "rewards/margins": 0.03801111876964569, "rewards/rejected": -0.030215587466955185, "step": 274 }, { "epoch": 0.1597258523552303, "grad_norm": 299.75982666015625, "learning_rate": 4.600522951772226e-06, "logits/chosen": -0.9737634658813477, "logits/rejected": -1.0128427743911743, "logps/chosen": -72.66883850097656, "logps/rejected": -72.12713623046875, "loss": 14.1505, "rewards/accuracies": 0.5, "rewards/chosen": -0.04277346283197403, "rewards/margins": -0.02521767094731331, "rewards/rejected": -0.01755579002201557, "step": 275 }, { "epoch": 0.16030667363652204, "grad_norm": 317.3355712890625, "learning_rate": 4.5990703079604885e-06, "logits/chosen": -0.7819968461990356, "logits/rejected": -0.7388423681259155, "logps/chosen": -68.65778350830078, "logps/rejected": -69.62420654296875, "loss": 14.0828, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.014692326076328754, "rewards/margins": -0.017574016004800797, "rewards/rejected": 0.03226633742451668, "step": 276 }, { "epoch": 0.1608874949178138, "grad_norm": 300.1111145019531, "learning_rate": 4.597617664148751e-06, "logits/chosen": -0.8221060037612915, "logits/rejected": -0.8107389211654663, "logps/chosen": -64.23518371582031, "logps/rejected": -74.13258361816406, "loss": 13.5993, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.01461729221045971, "rewards/margins": 0.028885364532470703, "rewards/rejected": -0.014268075115978718, "step": 277 }, { "epoch": 0.16146831619910554, "grad_norm": 324.15899658203125, "learning_rate": 4.596165020337014e-06, "logits/chosen": -0.8910449147224426, "logits/rejected": -1.0317879915237427, "logps/chosen": -72.35569763183594, "logps/rejected": -86.64521789550781, "loss": 13.3693, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.04237177222967148, "rewards/margins": 0.05554167553782463, "rewards/rejected": -0.09791344404220581, "step": 278 }, { "epoch": 0.1620491374803973, "grad_norm": 335.16619873046875, "learning_rate": 4.594712376525277e-06, "logits/chosen": -0.8945894241333008, "logits/rejected": -0.9022412300109863, "logps/chosen": -72.68575286865234, "logps/rejected": -77.0567626953125, "loss": 13.5153, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0022069262340664864, "rewards/margins": 0.03883010149002075, "rewards/rejected": -0.03662317246198654, "step": 279 }, { "epoch": 0.16262995876168904, "grad_norm": 296.8701477050781, "learning_rate": 4.5932597327135395e-06, "logits/chosen": -0.8403556942939758, "logits/rejected": -0.8022698163986206, "logps/chosen": -68.22055053710938, "logps/rejected": -68.78074645996094, "loss": 13.5981, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.01986340805888176, "rewards/margins": 0.03163480758666992, "rewards/rejected": -0.05149821564555168, "step": 280 }, { "epoch": 0.16321078004298079, "grad_norm": 305.6988830566406, "learning_rate": 4.591807088901802e-06, "logits/chosen": -0.8042858839035034, "logits/rejected": -0.7476822137832642, "logps/chosen": -64.76155090332031, "logps/rejected": -72.77494812011719, "loss": 13.7642, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.013462590985000134, "rewards/margins": 0.014226732775568962, "rewards/rejected": -0.02768932655453682, "step": 281 }, { "epoch": 0.16379160132427253, "grad_norm": 317.730712890625, "learning_rate": 4.590354445090064e-06, "logits/chosen": -0.8070831298828125, "logits/rejected": -0.7661058306694031, "logps/chosen": -69.77413940429688, "logps/rejected": -69.04270935058594, "loss": 13.623, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.008160887286067009, "rewards/margins": 0.029747169464826584, "rewards/rejected": -0.021586284041404724, "step": 282 }, { "epoch": 0.16437242260556426, "grad_norm": 313.9194641113281, "learning_rate": 4.588901801278327e-06, "logits/chosen": -0.8869432210922241, "logits/rejected": -1.0530153512954712, "logps/chosen": -76.52225494384766, "logps/rejected": -70.056884765625, "loss": 13.6882, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.044385336339473724, "rewards/margins": 0.02604733407497406, "rewards/rejected": -0.07043267786502838, "step": 283 }, { "epoch": 0.164953243886856, "grad_norm": 326.5664978027344, "learning_rate": 4.58744915746659e-06, "logits/chosen": -0.7581676244735718, "logits/rejected": -0.7644235491752625, "logps/chosen": -75.07527160644531, "logps/rejected": -67.26031494140625, "loss": 14.2293, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.024589911103248596, "rewards/margins": -0.03345941752195358, "rewards/rejected": 0.008869504556059837, "step": 284 }, { "epoch": 0.16553406516814775, "grad_norm": 331.2121276855469, "learning_rate": 4.585996513654852e-06, "logits/chosen": -0.6432844400405884, "logits/rejected": -0.6698486804962158, "logps/chosen": -79.6335220336914, "logps/rejected": -84.1219253540039, "loss": 13.6497, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.007236090488731861, "rewards/margins": 0.02394806034862995, "rewards/rejected": -0.03118414804339409, "step": 285 }, { "epoch": 0.1661148864494395, "grad_norm": 325.1278381347656, "learning_rate": 4.584543869843115e-06, "logits/chosen": -0.805732250213623, "logits/rejected": -0.810941219329834, "logps/chosen": -81.93310546875, "logps/rejected": -72.44585418701172, "loss": 14.3678, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.03649924322962761, "rewards/margins": -0.046101175248622894, "rewards/rejected": 0.009601928293704987, "step": 286 }, { "epoch": 0.16669570773073125, "grad_norm": 304.5572814941406, "learning_rate": 4.583091226031378e-06, "logits/chosen": -0.7995644211769104, "logits/rejected": -0.8172422647476196, "logps/chosen": -72.41849517822266, "logps/rejected": -67.6212387084961, "loss": 13.8286, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.016089126467704773, "rewards/margins": 0.00835293810814619, "rewards/rejected": 0.007736186496913433, "step": 287 }, { "epoch": 0.167276529012023, "grad_norm": 338.3421936035156, "learning_rate": 4.581638582219641e-06, "logits/chosen": -0.7788872122764587, "logits/rejected": -0.8800986409187317, "logps/chosen": -77.22239685058594, "logps/rejected": -84.37377166748047, "loss": 13.6912, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.01143337320536375, "rewards/margins": 0.02296261489391327, "rewards/rejected": -0.034395989030599594, "step": 288 }, { "epoch": 0.16785735029331475, "grad_norm": 303.7924499511719, "learning_rate": 4.5801859384079025e-06, "logits/chosen": -0.8267248868942261, "logits/rejected": -0.8682713508605957, "logps/chosen": -76.82869720458984, "logps/rejected": -75.02543640136719, "loss": 13.9469, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.009327888488769531, "rewards/margins": -0.004273596219718456, "rewards/rejected": 0.013601483777165413, "step": 289 }, { "epoch": 0.1684381715746065, "grad_norm": 310.5744323730469, "learning_rate": 4.578733294596165e-06, "logits/chosen": -0.8373514413833618, "logits/rejected": -0.8355886340141296, "logps/chosen": -78.4135971069336, "logps/rejected": -68.44206237792969, "loss": 13.3873, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.01929556205868721, "rewards/margins": 0.05108966678380966, "rewards/rejected": -0.07038523256778717, "step": 290 }, { "epoch": 0.16901899285589825, "grad_norm": 294.162353515625, "learning_rate": 4.577280650784428e-06, "logits/chosen": -0.8757956624031067, "logits/rejected": -0.9635500907897949, "logps/chosen": -79.70155334472656, "logps/rejected": -72.04487609863281, "loss": 13.3423, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.015520468354225159, "rewards/margins": 0.056552886962890625, "rewards/rejected": -0.041032422333955765, "step": 291 }, { "epoch": 0.16959981413719, "grad_norm": 406.17034912109375, "learning_rate": 4.575828006972691e-06, "logits/chosen": -0.712080717086792, "logits/rejected": -0.6416457891464233, "logps/chosen": -74.07041931152344, "logps/rejected": -75.76925659179688, "loss": 13.8944, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.009814701043069363, "rewards/margins": 0.0012444716412574053, "rewards/rejected": -0.0110591696575284, "step": 292 }, { "epoch": 0.17018063541848175, "grad_norm": 319.25677490234375, "learning_rate": 4.5743753631609535e-06, "logits/chosen": -0.978374183177948, "logits/rejected": -0.9963283538818359, "logps/chosen": -68.7942886352539, "logps/rejected": -71.56004333496094, "loss": 13.9775, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.004177446011453867, "rewards/margins": -0.006889923010021448, "rewards/rejected": 0.002712479094043374, "step": 293 }, { "epoch": 0.17076145669977347, "grad_norm": 333.2608642578125, "learning_rate": 4.572922719349216e-06, "logits/chosen": -0.9167648553848267, "logits/rejected": -0.8717905282974243, "logps/chosen": -70.68975067138672, "logps/rejected": -77.90010070800781, "loss": 13.5974, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.002308635041117668, "rewards/margins": 0.027850687503814697, "rewards/rejected": -0.02554205060005188, "step": 294 }, { "epoch": 0.17134227798106522, "grad_norm": 312.9326477050781, "learning_rate": 4.571470075537478e-06, "logits/chosen": -0.7617141008377075, "logits/rejected": -0.8908483386039734, "logps/chosen": -70.78837585449219, "logps/rejected": -72.64606475830078, "loss": 14.292, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.008612537756562233, "rewards/margins": -0.03856853395700455, "rewards/rejected": 0.029956001788377762, "step": 295 }, { "epoch": 0.17192309926235697, "grad_norm": 308.8948669433594, "learning_rate": 4.570017431725741e-06, "logits/chosen": -0.913497269153595, "logits/rejected": -0.8606641888618469, "logps/chosen": -70.07915496826172, "logps/rejected": -82.69215393066406, "loss": 13.7394, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.011818322353065014, "rewards/margins": 0.017899369820952415, "rewards/rejected": -0.029717693105340004, "step": 296 }, { "epoch": 0.17250392054364871, "grad_norm": 291.6385803222656, "learning_rate": 4.568564787914004e-06, "logits/chosen": -0.8296724557876587, "logits/rejected": -0.9198773503303528, "logps/chosen": -67.22807312011719, "logps/rejected": -78.60759735107422, "loss": 13.3225, "rewards/accuracies": 0.75, "rewards/chosen": -0.0034950252156704664, "rewards/margins": 0.06311126798391342, "rewards/rejected": -0.0666062980890274, "step": 297 }, { "epoch": 0.17308474182494046, "grad_norm": 291.3044738769531, "learning_rate": 4.567112144102266e-06, "logits/chosen": -0.8050098419189453, "logits/rejected": -0.7676219940185547, "logps/chosen": -67.16458129882812, "logps/rejected": -78.90528869628906, "loss": 13.2096, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.02983633242547512, "rewards/margins": 0.07233790308237076, "rewards/rejected": -0.042501576244831085, "step": 298 }, { "epoch": 0.1736655631062322, "grad_norm": 314.0892333984375, "learning_rate": 4.565659500290529e-06, "logits/chosen": -1.0540732145309448, "logits/rejected": -0.9160014986991882, "logps/chosen": -71.54045104980469, "logps/rejected": -75.05534362792969, "loss": 13.7491, "rewards/accuracies": 0.5, "rewards/chosen": 0.009140492416918278, "rewards/margins": 0.015866661444306374, "rewards/rejected": -0.006726170424371958, "step": 299 }, { "epoch": 0.17424638438752396, "grad_norm": 310.53985595703125, "learning_rate": 4.564206856478792e-06, "logits/chosen": -0.8692498207092285, "logits/rejected": -0.8690904378890991, "logps/chosen": -71.8248062133789, "logps/rejected": -82.47134399414062, "loss": 12.8867, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.015918642282485962, "rewards/margins": 0.10448668152093887, "rewards/rejected": -0.08856804668903351, "step": 300 }, { "epoch": 0.1748272056688157, "grad_norm": 298.11541748046875, "learning_rate": 4.562754212667055e-06, "logits/chosen": -0.8415626287460327, "logits/rejected": -0.8000283241271973, "logps/chosen": -69.10453033447266, "logps/rejected": -65.39994812011719, "loss": 13.946, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.024037543684244156, "rewards/margins": -0.00543618481606245, "rewards/rejected": -0.018601354211568832, "step": 301 }, { "epoch": 0.17540802695010746, "grad_norm": 286.9059753417969, "learning_rate": 4.5613015688553165e-06, "logits/chosen": -0.8321945071220398, "logits/rejected": -0.8791966438293457, "logps/chosen": -66.0924301147461, "logps/rejected": -70.165283203125, "loss": 13.9943, "rewards/accuracies": 0.5, "rewards/chosen": -0.0016370766097679734, "rewards/margins": -0.008972769603133202, "rewards/rejected": 0.007335691247135401, "step": 302 }, { "epoch": 0.1759888482313992, "grad_norm": 324.43927001953125, "learning_rate": 4.559848925043579e-06, "logits/chosen": -0.8788312077522278, "logits/rejected": -0.8677918314933777, "logps/chosen": -70.78472137451172, "logps/rejected": -70.58416748046875, "loss": 13.756, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.015381020493805408, "rewards/margins": 0.013850994408130646, "rewards/rejected": -0.02923201583325863, "step": 303 }, { "epoch": 0.17656966951269096, "grad_norm": 333.6717224121094, "learning_rate": 4.558396281231842e-06, "logits/chosen": -0.9329641461372375, "logits/rejected": -0.8686521649360657, "logps/chosen": -73.89896392822266, "logps/rejected": -78.4896011352539, "loss": 13.8188, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.00456998823210597, "rewards/margins": 0.008396768942475319, "rewards/rejected": -0.0038267814088612795, "step": 304 }, { "epoch": 0.17715049079398268, "grad_norm": 306.7808532714844, "learning_rate": 4.556943637420105e-06, "logits/chosen": -0.8817129135131836, "logits/rejected": -0.7859727144241333, "logps/chosen": -70.53137969970703, "logps/rejected": -73.36188507080078, "loss": 13.6795, "rewards/accuracies": 0.5, "rewards/chosen": 0.0007974056643433869, "rewards/margins": 0.025630056858062744, "rewards/rejected": -0.024832649156451225, "step": 305 }, { "epoch": 0.17773131207527443, "grad_norm": 299.79339599609375, "learning_rate": 4.5554909936083675e-06, "logits/chosen": -0.8397809863090515, "logits/rejected": -0.962628960609436, "logps/chosen": -74.5028305053711, "logps/rejected": -72.23627471923828, "loss": 14.1074, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.013882589526474476, "rewards/margins": -0.022180356085300446, "rewards/rejected": 0.03606294468045235, "step": 306 }, { "epoch": 0.17831213335656618, "grad_norm": 305.6503601074219, "learning_rate": 4.55403834979663e-06, "logits/chosen": -0.9371621012687683, "logits/rejected": -0.8676670789718628, "logps/chosen": -68.56401824951172, "logps/rejected": -66.05886840820312, "loss": 13.7674, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.023868542164564133, "rewards/margins": 0.013151749968528748, "rewards/rejected": 0.010716790333390236, "step": 307 }, { "epoch": 0.17889295463785793, "grad_norm": 316.7124328613281, "learning_rate": 4.552585705984893e-06, "logits/chosen": -0.795932412147522, "logits/rejected": -0.8041373491287231, "logps/chosen": -75.10924530029297, "logps/rejected": -74.57756042480469, "loss": 14.0814, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.026643935590982437, "rewards/margins": -0.018288511782884598, "rewards/rejected": -0.008355428464710712, "step": 308 }, { "epoch": 0.17947377591914968, "grad_norm": 316.54193115234375, "learning_rate": 4.551133062173155e-06, "logits/chosen": -0.7379294037818909, "logits/rejected": -0.8033340573310852, "logps/chosen": -70.75437927246094, "logps/rejected": -68.13688659667969, "loss": 14.334, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.04628816246986389, "rewards/margins": -0.04288000240921974, "rewards/rejected": -0.0034081649500876665, "step": 309 }, { "epoch": 0.18005459720044142, "grad_norm": 320.3726501464844, "learning_rate": 4.549680418361418e-06, "logits/chosen": -0.8026307225227356, "logits/rejected": -0.7256749272346497, "logps/chosen": -77.1530532836914, "logps/rejected": -71.47651672363281, "loss": 14.3986, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.03423710912466049, "rewards/margins": -0.04747765138745308, "rewards/rejected": 0.013240538537502289, "step": 310 }, { "epoch": 0.18063541848173317, "grad_norm": 333.2999572753906, "learning_rate": 4.5482277745496804e-06, "logits/chosen": -0.9229456186294556, "logits/rejected": -0.9455530047416687, "logps/chosen": -80.14155578613281, "logps/rejected": -68.71078491210938, "loss": 14.274, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.03716535493731499, "rewards/margins": -0.035925768315792084, "rewards/rejected": -0.001239586272276938, "step": 311 }, { "epoch": 0.18121623976302492, "grad_norm": 344.1839904785156, "learning_rate": 4.546775130737943e-06, "logits/chosen": -0.7100509405136108, "logits/rejected": -0.7682685852050781, "logps/chosen": -70.23936462402344, "logps/rejected": -70.01929473876953, "loss": 13.9029, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0003937946748919785, "rewards/margins": -7.176501821959391e-05, "rewards/rejected": -0.0003220273065380752, "step": 312 }, { "epoch": 0.18179706104431667, "grad_norm": 311.8206787109375, "learning_rate": 4.545322486926206e-06, "logits/chosen": -0.786939263343811, "logits/rejected": -0.7103012800216675, "logps/chosen": -73.2970962524414, "logps/rejected": -68.45187377929688, "loss": 14.034, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.01686491258442402, "rewards/margins": -0.013414248824119568, "rewards/rejected": -0.0034506588708609343, "step": 313 }, { "epoch": 0.18237788232560842, "grad_norm": 290.2313537597656, "learning_rate": 4.543869843114469e-06, "logits/chosen": -0.8251843452453613, "logits/rejected": -0.8637291193008423, "logps/chosen": -76.97846984863281, "logps/rejected": -66.21952819824219, "loss": 13.9921, "rewards/accuracies": 0.5, "rewards/chosen": -0.003126763505861163, "rewards/margins": -0.010613595135509968, "rewards/rejected": 0.007486830465495586, "step": 314 }, { "epoch": 0.18295870360690017, "grad_norm": 357.3050842285156, "learning_rate": 4.542417199302731e-06, "logits/chosen": -0.7790865898132324, "logits/rejected": -0.8820877075195312, "logps/chosen": -69.9126968383789, "logps/rejected": -73.85781860351562, "loss": 13.243, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.004135083872824907, "rewards/margins": 0.07195592671632767, "rewards/rejected": -0.06782083958387375, "step": 315 }, { "epoch": 0.1835395248881919, "grad_norm": 304.0280456542969, "learning_rate": 4.540964555490994e-06, "logits/chosen": -0.8775313496589661, "logits/rejected": -0.966948390007019, "logps/chosen": -73.7503890991211, "logps/rejected": -71.78192138671875, "loss": 13.2604, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.02027755416929722, "rewards/margins": 0.07092587649822235, "rewards/rejected": -0.050648320466279984, "step": 316 }, { "epoch": 0.18412034616948364, "grad_norm": 307.97015380859375, "learning_rate": 4.539511911679257e-06, "logits/chosen": -0.8565672636032104, "logits/rejected": -0.9156128764152527, "logps/chosen": -73.54744720458984, "logps/rejected": -73.94361877441406, "loss": 13.7971, "rewards/accuracies": 0.5, "rewards/chosen": -0.005255700554698706, "rewards/margins": 0.008574297651648521, "rewards/rejected": -0.013830000534653664, "step": 317 }, { "epoch": 0.1847011674507754, "grad_norm": 312.7808837890625, "learning_rate": 4.53805926786752e-06, "logits/chosen": -0.8373235464096069, "logits/rejected": -0.8614371418952942, "logps/chosen": -71.75985717773438, "logps/rejected": -80.21531677246094, "loss": 13.8805, "rewards/accuracies": 0.5, "rewards/chosen": 0.002429866697639227, "rewards/margins": 0.001796521246433258, "rewards/rejected": 0.0006333448109216988, "step": 318 }, { "epoch": 0.18528198873206714, "grad_norm": 322.60736083984375, "learning_rate": 4.536606624055782e-06, "logits/chosen": -0.8135954141616821, "logits/rejected": -0.8624083399772644, "logps/chosen": -67.81532287597656, "logps/rejected": -81.07484436035156, "loss": 13.4658, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.008448420092463493, "rewards/margins": 0.04661915823817253, "rewards/rejected": -0.055067580193281174, "step": 319 }, { "epoch": 0.1858628100133589, "grad_norm": 289.1242370605469, "learning_rate": 4.535153980244045e-06, "logits/chosen": -0.7294256687164307, "logits/rejected": -0.7239043116569519, "logps/chosen": -70.69343566894531, "logps/rejected": -76.38871765136719, "loss": 13.2891, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.04026845842599869, "rewards/margins": 0.06055659055709839, "rewards/rejected": -0.020288124680519104, "step": 320 }, { "epoch": 0.18644363129465064, "grad_norm": 316.1600341796875, "learning_rate": 4.533701336432307e-06, "logits/chosen": -0.7641903162002563, "logits/rejected": -0.8571245074272156, "logps/chosen": -72.52635192871094, "logps/rejected": -72.56694030761719, "loss": 13.0747, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.040480513125658035, "rewards/margins": 0.08481685817241669, "rewards/rejected": -0.04433634877204895, "step": 321 }, { "epoch": 0.18702445257594238, "grad_norm": 331.61419677734375, "learning_rate": 4.53224869262057e-06, "logits/chosen": -0.7687914967536926, "logits/rejected": -0.862636387348175, "logps/chosen": -69.64253997802734, "logps/rejected": -72.28370666503906, "loss": 13.9979, "rewards/accuracies": 0.5, "rewards/chosen": -0.030815565958619118, "rewards/margins": -0.009632897563278675, "rewards/rejected": -0.021182667464017868, "step": 322 }, { "epoch": 0.18760527385723413, "grad_norm": 294.9023742675781, "learning_rate": 4.5307960488088326e-06, "logits/chosen": -0.7634553909301758, "logits/rejected": -0.8270981907844543, "logps/chosen": -72.29499816894531, "logps/rejected": -74.75524139404297, "loss": 13.8647, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.014828644692897797, "rewards/margins": 0.004178445786237717, "rewards/rejected": -0.019007090479135513, "step": 323 }, { "epoch": 0.18818609513852588, "grad_norm": 303.8163757324219, "learning_rate": 4.529343404997095e-06, "logits/chosen": -0.7258373498916626, "logits/rejected": -0.7167456746101379, "logps/chosen": -71.8295669555664, "logps/rejected": -68.2274169921875, "loss": 14.0144, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.04376727342605591, "rewards/margins": -0.011802466586232185, "rewards/rejected": -0.03196480870246887, "step": 324 }, { "epoch": 0.18876691641981763, "grad_norm": 304.3807678222656, "learning_rate": 4.527890761185358e-06, "logits/chosen": -0.8415002822875977, "logits/rejected": -0.8635972738265991, "logps/chosen": -76.11531829833984, "logps/rejected": -74.70405578613281, "loss": 14.0264, "rewards/accuracies": 0.5, "rewards/chosen": -0.02958356961607933, "rewards/margins": -0.012393072247505188, "rewards/rejected": -0.017190497368574142, "step": 325 }, { "epoch": 0.18934773770110938, "grad_norm": 315.16876220703125, "learning_rate": 4.526438117373621e-06, "logits/chosen": -0.7613179683685303, "logits/rejected": -0.776989221572876, "logps/chosen": -67.979248046875, "logps/rejected": -69.2055435180664, "loss": 13.738, "rewards/accuracies": 0.5, "rewards/chosen": 0.007391949184238911, "rewards/margins": 0.015842467546463013, "rewards/rejected": -0.008450517430901527, "step": 326 }, { "epoch": 0.1899285589824011, "grad_norm": 310.1661682128906, "learning_rate": 4.524985473561883e-06, "logits/chosen": -0.9094734191894531, "logits/rejected": -0.8339530825614929, "logps/chosen": -76.08878326416016, "logps/rejected": -70.17434692382812, "loss": 13.821, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.020542889833450317, "rewards/margins": 0.006295947823673487, "rewards/rejected": 0.014246943406760693, "step": 327 }, { "epoch": 0.19050938026369285, "grad_norm": 313.35858154296875, "learning_rate": 4.5235328297501455e-06, "logits/chosen": -0.7620694041252136, "logits/rejected": -0.7337735295295715, "logps/chosen": -68.53215026855469, "logps/rejected": -75.737548828125, "loss": 13.5514, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0018144982168450952, "rewards/margins": 0.035731058567762375, "rewards/rejected": -0.03391656279563904, "step": 328 }, { "epoch": 0.1910902015449846, "grad_norm": 371.4503479003906, "learning_rate": 4.522080185938408e-06, "logits/chosen": -0.8343189358711243, "logits/rejected": -0.887412428855896, "logps/chosen": -69.50465393066406, "logps/rejected": -78.40193939208984, "loss": 13.6258, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.025033259764313698, "rewards/margins": 0.02790558710694313, "rewards/rejected": -0.05293884128332138, "step": 329 }, { "epoch": 0.19167102282627635, "grad_norm": 368.6702575683594, "learning_rate": 4.520627542126671e-06, "logits/chosen": -0.8924549221992493, "logits/rejected": -0.88835608959198, "logps/chosen": -67.6229019165039, "logps/rejected": -74.09444427490234, "loss": 13.7015, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.01569213904440403, "rewards/margins": 0.018520962446928024, "rewards/rejected": -0.0028288268949836493, "step": 330 }, { "epoch": 0.1922518441075681, "grad_norm": 324.3641052246094, "learning_rate": 4.519174898314934e-06, "logits/chosen": -0.8418426513671875, "logits/rejected": -0.8403279185295105, "logps/chosen": -72.23628234863281, "logps/rejected": -74.98982238769531, "loss": 13.8397, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.013874625787138939, "rewards/margins": 0.007325878832489252, "rewards/rejected": -0.021200504153966904, "step": 331 }, { "epoch": 0.19283266538885985, "grad_norm": 321.1864929199219, "learning_rate": 4.5177222545031964e-06, "logits/chosen": -0.8301184773445129, "logits/rejected": -0.748178243637085, "logps/chosen": -72.51194763183594, "logps/rejected": -72.02323913574219, "loss": 13.6885, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.00831710733473301, "rewards/margins": 0.020771954208612442, "rewards/rejected": -0.02908906899392605, "step": 332 }, { "epoch": 0.1934134866701516, "grad_norm": 302.070556640625, "learning_rate": 4.516269610691459e-06, "logits/chosen": -0.8225947618484497, "logits/rejected": -0.7799954414367676, "logps/chosen": -73.186279296875, "logps/rejected": -67.39868927001953, "loss": 14.105, "rewards/accuracies": 0.5, "rewards/chosen": -0.012917375192046165, "rewards/margins": -0.019778212532401085, "rewards/rejected": 0.006860838737338781, "step": 333 }, { "epoch": 0.19399430795144335, "grad_norm": 320.9998779296875, "learning_rate": 4.514816966879721e-06, "logits/chosen": -0.8660848736763, "logits/rejected": -0.8538210988044739, "logps/chosen": -77.54426574707031, "logps/rejected": -70.16163635253906, "loss": 14.0522, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.008558349683880806, "rewards/margins": -0.00849790871143341, "rewards/rejected": -6.044358087820001e-05, "step": 334 }, { "epoch": 0.1945751292327351, "grad_norm": 321.6021423339844, "learning_rate": 4.513364323067984e-06, "logits/chosen": -0.8346333503723145, "logits/rejected": -0.88763827085495, "logps/chosen": -74.38595581054688, "logps/rejected": -65.50772094726562, "loss": 14.0522, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.004940299782902002, "rewards/margins": -0.015277815982699394, "rewards/rejected": 0.010337515734136105, "step": 335 }, { "epoch": 0.19515595051402684, "grad_norm": 329.2897033691406, "learning_rate": 4.511911679256247e-06, "logits/chosen": -0.9041665196418762, "logits/rejected": -0.8650445938110352, "logps/chosen": -77.61279296875, "logps/rejected": -78.71620178222656, "loss": 13.5759, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0006745524588041008, "rewards/margins": 0.03203847259283066, "rewards/rejected": -0.03136391565203667, "step": 336 }, { "epoch": 0.1957367717953186, "grad_norm": 314.5374450683594, "learning_rate": 4.510459035444509e-06, "logits/chosen": -0.7889400720596313, "logits/rejected": -0.7860768437385559, "logps/chosen": -73.5044174194336, "logps/rejected": -74.55040740966797, "loss": 13.7341, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.00419281842187047, "rewards/margins": 0.017519472166895866, "rewards/rejected": -0.013326652348041534, "step": 337 }, { "epoch": 0.19631759307661031, "grad_norm": 303.45733642578125, "learning_rate": 4.509006391632772e-06, "logits/chosen": -0.9617490768432617, "logits/rejected": -0.9624162912368774, "logps/chosen": -72.66578674316406, "logps/rejected": -73.77256774902344, "loss": 13.46, "rewards/accuracies": 0.5, "rewards/chosen": 0.024341020733118057, "rewards/margins": 0.044455137103796005, "rewards/rejected": -0.020114116370677948, "step": 338 }, { "epoch": 0.19689841435790206, "grad_norm": 297.1424560546875, "learning_rate": 4.507553747821035e-06, "logits/chosen": -0.8083046078681946, "logits/rejected": -0.8331934213638306, "logps/chosen": -71.33175659179688, "logps/rejected": -72.65750885009766, "loss": 13.8942, "rewards/accuracies": 0.5, "rewards/chosen": 0.017249945551156998, "rewards/margins": 0.002586670219898224, "rewards/rejected": 0.014663276262581348, "step": 339 }, { "epoch": 0.1974792356391938, "grad_norm": 312.68145751953125, "learning_rate": 4.506101104009298e-06, "logits/chosen": -0.861495316028595, "logits/rejected": -0.8806974291801453, "logps/chosen": -73.86710357666016, "logps/rejected": -74.84933471679688, "loss": 13.6776, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.015142458491027355, "rewards/margins": 0.02246389351785183, "rewards/rejected": -0.03760635107755661, "step": 340 }, { "epoch": 0.19806005692048556, "grad_norm": 306.0162353515625, "learning_rate": 4.5046484601975595e-06, "logits/chosen": -0.7776416540145874, "logits/rejected": -0.7664206624031067, "logps/chosen": -74.73188018798828, "logps/rejected": -71.7195053100586, "loss": 13.8777, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.01987052895128727, "rewards/margins": 0.001284526428207755, "rewards/rejected": -0.02115505374968052, "step": 341 }, { "epoch": 0.1986408782017773, "grad_norm": 318.2908935546875, "learning_rate": 4.503195816385822e-06, "logits/chosen": -0.9286810159683228, "logits/rejected": -0.8470717668533325, "logps/chosen": -77.0329360961914, "logps/rejected": -79.6993637084961, "loss": 13.9516, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.03609747439622879, "rewards/margins": -0.0019962512888014317, "rewards/rejected": -0.0341012142598629, "step": 342 }, { "epoch": 0.19922169948306906, "grad_norm": 343.0354309082031, "learning_rate": 4.501743172574085e-06, "logits/chosen": -0.6656073331832886, "logits/rejected": -0.6898230314254761, "logps/chosen": -75.619384765625, "logps/rejected": -77.71598052978516, "loss": 13.666, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0036884776782244444, "rewards/margins": 0.024708479642868042, "rewards/rejected": -0.028396958485245705, "step": 343 }, { "epoch": 0.1998025207643608, "grad_norm": 318.9757995605469, "learning_rate": 4.500290528762348e-06, "logits/chosen": -0.9170076251029968, "logits/rejected": -0.9125305414199829, "logps/chosen": -61.814781188964844, "logps/rejected": -73.81932830810547, "loss": 13.8999, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.009178831242024899, "rewards/margins": 0.0020302850753068924, "rewards/rejected": -0.011209115386009216, "step": 344 }, { "epoch": 0.20038334204565256, "grad_norm": 301.6319580078125, "learning_rate": 4.4988378849506105e-06, "logits/chosen": -0.8031547665596008, "logits/rejected": -0.8383834958076477, "logps/chosen": -71.64036560058594, "logps/rejected": -69.63984680175781, "loss": 14.3238, "rewards/accuracies": 0.5, "rewards/chosen": -0.024623652920126915, "rewards/margins": -0.042028360068798065, "rewards/rejected": 0.0174047090113163, "step": 345 }, { "epoch": 0.2009641633269443, "grad_norm": 312.9844055175781, "learning_rate": 4.497385241138873e-06, "logits/chosen": -0.9153481721878052, "logits/rejected": -0.880916953086853, "logps/chosen": -75.21138000488281, "logps/rejected": -69.07037353515625, "loss": 13.5771, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0012881949078291655, "rewards/margins": 0.031341440975666046, "rewards/rejected": -0.03262963145971298, "step": 346 }, { "epoch": 0.20154498460823606, "grad_norm": 349.6585388183594, "learning_rate": 4.495932597327136e-06, "logits/chosen": -0.7509949803352356, "logits/rejected": -0.6910394430160522, "logps/chosen": -70.14628601074219, "logps/rejected": -71.41559600830078, "loss": 14.3833, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.01079236064106226, "rewards/margins": -0.048026543110609055, "rewards/rejected": 0.03723418340086937, "step": 347 }, { "epoch": 0.2021258058895278, "grad_norm": 291.7655029296875, "learning_rate": 4.494479953515398e-06, "logits/chosen": -0.8026542663574219, "logits/rejected": -0.803175151348114, "logps/chosen": -71.43840789794922, "logps/rejected": -71.1689224243164, "loss": 13.6799, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0039477264508605, "rewards/margins": 0.021708643063902855, "rewards/rejected": -0.01776091754436493, "step": 348 }, { "epoch": 0.20270662717081953, "grad_norm": 293.5408935546875, "learning_rate": 4.493027309703661e-06, "logits/chosen": -0.8886486887931824, "logits/rejected": -0.8125056028366089, "logps/chosen": -77.56742095947266, "logps/rejected": -66.32499694824219, "loss": 13.7167, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.02104121446609497, "rewards/margins": 0.019321372732520103, "rewards/rejected": 0.001719839172437787, "step": 349 }, { "epoch": 0.20328744845211127, "grad_norm": 305.96697998046875, "learning_rate": 4.491574665891923e-06, "logits/chosen": -0.7537021636962891, "logits/rejected": -0.7224977016448975, "logps/chosen": -71.26932525634766, "logps/rejected": -69.65036010742188, "loss": 13.7664, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.008614836260676384, "rewards/margins": 0.012173959985375404, "rewards/rejected": -0.02078879624605179, "step": 350 }, { "epoch": 0.20386826973340302, "grad_norm": 304.3200988769531, "learning_rate": 4.490122022080186e-06, "logits/chosen": -0.7840813398361206, "logits/rejected": -0.7836223840713501, "logps/chosen": -67.91246032714844, "logps/rejected": -72.22390747070312, "loss": 13.389, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.016956433653831482, "rewards/margins": 0.051059335470199585, "rewards/rejected": -0.034102894365787506, "step": 351 }, { "epoch": 0.20444909101469477, "grad_norm": 291.3819885253906, "learning_rate": 4.488669378268449e-06, "logits/chosen": -0.7469109296798706, "logits/rejected": -0.7488623857498169, "logps/chosen": -69.54528045654297, "logps/rejected": -72.68141174316406, "loss": 13.5492, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.017030106857419014, "rewards/margins": 0.03490540385246277, "rewards/rejected": -0.017875295132398605, "step": 352 }, { "epoch": 0.20502991229598652, "grad_norm": 311.4941711425781, "learning_rate": 4.487216734456712e-06, "logits/chosen": -0.7565064430236816, "logits/rejected": -0.814095675945282, "logps/chosen": -77.25660705566406, "logps/rejected": -70.50426483154297, "loss": 13.9861, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.023350324481725693, "rewards/margins": -0.00980983767658472, "rewards/rejected": -0.013540486805140972, "step": 353 }, { "epoch": 0.20561073357727827, "grad_norm": 317.1282958984375, "learning_rate": 4.485764090644974e-06, "logits/chosen": -0.6474046111106873, "logits/rejected": -0.5795416235923767, "logps/chosen": -73.16572570800781, "logps/rejected": -73.50746154785156, "loss": 13.6889, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.022182445973157883, "rewards/margins": 0.024161117151379585, "rewards/rejected": -0.0019786744378507137, "step": 354 }, { "epoch": 0.20619155485857002, "grad_norm": 353.6116638183594, "learning_rate": 4.484311446833236e-06, "logits/chosen": -0.6704899072647095, "logits/rejected": -0.7246066331863403, "logps/chosen": -75.37571716308594, "logps/rejected": -77.72221374511719, "loss": 14.1555, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.020454417914152145, "rewards/margins": -0.026207536458969116, "rewards/rejected": 0.005753117147833109, "step": 355 }, { "epoch": 0.20677237613986177, "grad_norm": 306.08367919921875, "learning_rate": 4.482858803021499e-06, "logits/chosen": -0.8278299570083618, "logits/rejected": -0.7702374458312988, "logps/chosen": -79.121337890625, "logps/rejected": -79.2762680053711, "loss": 13.3845, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.017661219462752342, "rewards/margins": 0.05229003354907036, "rewards/rejected": -0.034628815948963165, "step": 356 }, { "epoch": 0.20735319742115352, "grad_norm": 301.5957946777344, "learning_rate": 4.481406159209763e-06, "logits/chosen": -0.7846859693527222, "logits/rejected": -0.6674401164054871, "logps/chosen": -70.37995910644531, "logps/rejected": -72.95897674560547, "loss": 13.5715, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.012851124629378319, "rewards/margins": 0.033614885061979294, "rewards/rejected": -0.020763758569955826, "step": 357 }, { "epoch": 0.20793401870244527, "grad_norm": 292.0884094238281, "learning_rate": 4.479953515398025e-06, "logits/chosen": -0.7394891381263733, "logits/rejected": -0.7185255289077759, "logps/chosen": -69.58491516113281, "logps/rejected": -70.53492736816406, "loss": 13.4921, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.008961153216660023, "rewards/margins": 0.04007042199373245, "rewards/rejected": -0.031109267845749855, "step": 358 }, { "epoch": 0.20851483998373702, "grad_norm": 307.0670471191406, "learning_rate": 4.478500871586287e-06, "logits/chosen": -0.7576676607131958, "logits/rejected": -0.8148140907287598, "logps/chosen": -73.82097625732422, "logps/rejected": -76.19036102294922, "loss": 13.6695, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.014283361844718456, "rewards/margins": 0.0231223963201046, "rewards/rejected": -0.008839035406708717, "step": 359 }, { "epoch": 0.20909566126502874, "grad_norm": 300.3365173339844, "learning_rate": 4.47704822777455e-06, "logits/chosen": -0.8952158093452454, "logits/rejected": -0.7517813444137573, "logps/chosen": -67.8691635131836, "logps/rejected": -75.82969665527344, "loss": 14.1246, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.004317197948694229, "rewards/margins": -0.022498121485114098, "rewards/rejected": 0.01818092353641987, "step": 360 }, { "epoch": 0.2096764825463205, "grad_norm": 322.0217590332031, "learning_rate": 4.475595583962813e-06, "logits/chosen": -0.9054155349731445, "logits/rejected": -0.7519516348838806, "logps/chosen": -73.81487274169922, "logps/rejected": -73.7669906616211, "loss": 13.4128, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.025307711213827133, "rewards/margins": 0.04915634170174599, "rewards/rejected": -0.023848628625273705, "step": 361 }, { "epoch": 0.21025730382761224, "grad_norm": 305.5541687011719, "learning_rate": 4.4741429401510755e-06, "logits/chosen": -0.7097185254096985, "logits/rejected": -0.7574303150177002, "logps/chosen": -71.90687561035156, "logps/rejected": -68.58987426757812, "loss": 13.4067, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.00799614004790783, "rewards/margins": 0.051417239010334015, "rewards/rejected": -0.059413373470306396, "step": 362 }, { "epoch": 0.21083812510890398, "grad_norm": 353.3551025390625, "learning_rate": 4.472690296339338e-06, "logits/chosen": -0.925432026386261, "logits/rejected": -0.7938982844352722, "logps/chosen": -65.05897521972656, "logps/rejected": -73.69389343261719, "loss": 13.7132, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.015448915772140026, "rewards/margins": 0.017804250121116638, "rewards/rejected": -0.03325316682457924, "step": 363 }, { "epoch": 0.21141894639019573, "grad_norm": 295.042236328125, "learning_rate": 4.471237652527601e-06, "logits/chosen": -0.5853012800216675, "logits/rejected": -0.6785317659378052, "logps/chosen": -70.28089904785156, "logps/rejected": -81.96311950683594, "loss": 13.2481, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.01614985801279545, "rewards/margins": 0.06508197635412216, "rewards/rejected": -0.04893212765455246, "step": 364 }, { "epoch": 0.21199976767148748, "grad_norm": 317.24664306640625, "learning_rate": 4.469785008715864e-06, "logits/chosen": -0.7670639157295227, "logits/rejected": -0.732401967048645, "logps/chosen": -73.05061340332031, "logps/rejected": -82.63640594482422, "loss": 13.4454, "rewards/accuracies": 0.5, "rewards/chosen": -0.009124317206442356, "rewards/margins": 0.047481339424848557, "rewards/rejected": -0.05660565569996834, "step": 365 }, { "epoch": 0.21258058895277923, "grad_norm": 308.0080871582031, "learning_rate": 4.468332364904126e-06, "logits/chosen": -0.7194598913192749, "logits/rejected": -0.6728766560554504, "logps/chosen": -73.11915588378906, "logps/rejected": -68.853759765625, "loss": 13.9253, "rewards/accuracies": 0.5, "rewards/chosen": -0.021355021744966507, "rewards/margins": -0.003295204136520624, "rewards/rejected": -0.01805981807410717, "step": 366 }, { "epoch": 0.21316141023407098, "grad_norm": 327.4511413574219, "learning_rate": 4.466879721092388e-06, "logits/chosen": -0.7333757281303406, "logits/rejected": -0.7388908863067627, "logps/chosen": -81.80522155761719, "logps/rejected": -69.60041046142578, "loss": 13.8993, "rewards/accuracies": 0.5, "rewards/chosen": -0.011159257963299751, "rewards/margins": 0.005005787592381239, "rewards/rejected": -0.016165047883987427, "step": 367 }, { "epoch": 0.21374223151536273, "grad_norm": 298.0936279296875, "learning_rate": 4.465427077280651e-06, "logits/chosen": -0.9420258402824402, "logits/rejected": -0.7203485369682312, "logps/chosen": -78.59254455566406, "logps/rejected": -70.5693130493164, "loss": 13.4892, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.02526816353201866, "rewards/margins": 0.04102920740842819, "rewards/rejected": -0.01576104201376438, "step": 368 }, { "epoch": 0.21432305279665448, "grad_norm": 297.84136962890625, "learning_rate": 4.463974433468914e-06, "logits/chosen": -0.8287162780761719, "logits/rejected": -0.8401540517807007, "logps/chosen": -70.97822570800781, "logps/rejected": -73.44509887695312, "loss": 13.4517, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.03007066808640957, "rewards/margins": 0.047326039522886276, "rewards/rejected": -0.017255373299121857, "step": 369 }, { "epoch": 0.21490387407794623, "grad_norm": 322.31854248046875, "learning_rate": 4.462521789657177e-06, "logits/chosen": -0.936688244342804, "logits/rejected": -0.8096345663070679, "logps/chosen": -78.00273132324219, "logps/rejected": -76.44258117675781, "loss": 13.933, "rewards/accuracies": 0.5, "rewards/chosen": -0.024149218574166298, "rewards/margins": 0.0015145957004278898, "rewards/rejected": -0.025663817301392555, "step": 370 }, { "epoch": 0.21548469535923795, "grad_norm": 316.048583984375, "learning_rate": 4.461069145845439e-06, "logits/chosen": -0.7057862877845764, "logits/rejected": -0.6469615697860718, "logps/chosen": -77.95980834960938, "logps/rejected": -83.23860931396484, "loss": 13.6137, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.009251176379621029, "rewards/margins": 0.03268672153353691, "rewards/rejected": -0.04193788766860962, "step": 371 }, { "epoch": 0.2160655166405297, "grad_norm": 306.4630432128906, "learning_rate": 4.459616502033702e-06, "logits/chosen": -0.4769328534603119, "logits/rejected": -0.44898924231529236, "logps/chosen": -77.04486846923828, "logps/rejected": -78.56315612792969, "loss": 13.6158, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.02271147631108761, "rewards/margins": 0.02890445664525032, "rewards/rejected": -0.05161593109369278, "step": 372 }, { "epoch": 0.21664633792182145, "grad_norm": 304.4960021972656, "learning_rate": 4.458163858221964e-06, "logits/chosen": -0.7167657017707825, "logits/rejected": -0.6918506622314453, "logps/chosen": -71.01347351074219, "logps/rejected": -72.31254577636719, "loss": 13.732, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.000307169568259269, "rewards/margins": 0.01947268471121788, "rewards/rejected": -0.0191655196249485, "step": 373 }, { "epoch": 0.2172271592031132, "grad_norm": 296.60205078125, "learning_rate": 4.456711214410227e-06, "logits/chosen": -0.7357224225997925, "logits/rejected": -0.6462749242782593, "logps/chosen": -75.4621810913086, "logps/rejected": -70.76265716552734, "loss": 14.1072, "rewards/accuracies": 0.5, "rewards/chosen": -0.03969653695821762, "rewards/margins": -0.01930340752005577, "rewards/rejected": -0.02039313316345215, "step": 374 }, { "epoch": 0.21780798048440494, "grad_norm": 299.0235595703125, "learning_rate": 4.4552585705984895e-06, "logits/chosen": -0.9737260937690735, "logits/rejected": -0.883916974067688, "logps/chosen": -76.01069641113281, "logps/rejected": -64.29400634765625, "loss": 13.8576, "rewards/accuracies": 0.5, "rewards/chosen": 0.0021567821968346834, "rewards/margins": 0.005682178307324648, "rewards/rejected": -0.003525395644828677, "step": 375 }, { "epoch": 0.2183888017656967, "grad_norm": 323.23663330078125, "learning_rate": 4.453805926786752e-06, "logits/chosen": -0.7908447980880737, "logits/rejected": -0.7856351137161255, "logps/chosen": -73.55455017089844, "logps/rejected": -75.9771957397461, "loss": 14.1184, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.018270045518875122, "rewards/margins": -0.020638611167669296, "rewards/rejected": 0.0023685642518103123, "step": 376 }, { "epoch": 0.21896962304698844, "grad_norm": 322.6252746582031, "learning_rate": 4.452353282975015e-06, "logits/chosen": -0.8486648797988892, "logits/rejected": -0.8361449241638184, "logps/chosen": -71.75032043457031, "logps/rejected": -70.97474670410156, "loss": 13.4519, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0086447075009346, "rewards/margins": 0.04460224509239197, "rewards/rejected": -0.05324694514274597, "step": 377 }, { "epoch": 0.2195504443282802, "grad_norm": 533.611083984375, "learning_rate": 4.450900639163278e-06, "logits/chosen": -0.7714306116104126, "logits/rejected": -0.7245787382125854, "logps/chosen": -73.3573226928711, "logps/rejected": -72.71539306640625, "loss": 13.7809, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.01556567382067442, "rewards/margins": 0.01385035365819931, "rewards/rejected": -0.029416028410196304, "step": 378 }, { "epoch": 0.22013126560957194, "grad_norm": 287.4798583984375, "learning_rate": 4.4494479953515405e-06, "logits/chosen": -0.8422917127609253, "logits/rejected": -0.7723952531814575, "logps/chosen": -67.02293395996094, "logps/rejected": -76.67664337158203, "loss": 13.2566, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.013728572055697441, "rewards/margins": 0.0658256933093071, "rewards/rejected": -0.052097123116254807, "step": 379 }, { "epoch": 0.2207120868908637, "grad_norm": 333.2846374511719, "learning_rate": 4.4479953515398024e-06, "logits/chosen": -0.7608442306518555, "logits/rejected": -0.7422316670417786, "logps/chosen": -72.54397583007812, "logps/rejected": -69.54707336425781, "loss": 14.3318, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.04744737595319748, "rewards/margins": -0.040470145642757416, "rewards/rejected": -0.00697722565382719, "step": 380 }, { "epoch": 0.22129290817215544, "grad_norm": 315.6219177246094, "learning_rate": 4.446542707728065e-06, "logits/chosen": -0.7969782948493958, "logits/rejected": -0.7673382759094238, "logps/chosen": -76.81396484375, "logps/rejected": -74.59119415283203, "loss": 14.2554, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.04644475132226944, "rewards/margins": -0.03383153676986694, "rewards/rejected": -0.012613209895789623, "step": 381 }, { "epoch": 0.22187372945344716, "grad_norm": 360.37274169921875, "learning_rate": 4.445090063916328e-06, "logits/chosen": -0.7555001974105835, "logits/rejected": -0.8732713460922241, "logps/chosen": -80.61076354980469, "logps/rejected": -67.64311981201172, "loss": 14.0055, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.014537744224071503, "rewards/margins": -0.010739547200500965, "rewards/rejected": -0.0037982002831995487, "step": 382 }, { "epoch": 0.2224545507347389, "grad_norm": 305.7255554199219, "learning_rate": 4.443637420104591e-06, "logits/chosen": -0.6732084155082703, "logits/rejected": -0.6309961080551147, "logps/chosen": -69.0905532836914, "logps/rejected": -72.41564178466797, "loss": 13.2323, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.053167276084423065, "rewards/margins": 0.06811966747045517, "rewards/rejected": -0.014952393248677254, "step": 383 }, { "epoch": 0.22303537201603066, "grad_norm": 318.80755615234375, "learning_rate": 4.4421847762928534e-06, "logits/chosen": -0.8548835515975952, "logits/rejected": -0.7376150488853455, "logps/chosen": -74.19415283203125, "logps/rejected": -83.93922424316406, "loss": 14.1736, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.04532115161418915, "rewards/margins": -0.02556682750582695, "rewards/rejected": -0.01975431852042675, "step": 384 }, { "epoch": 0.2236161932973224, "grad_norm": 298.5427551269531, "learning_rate": 4.440732132481116e-06, "logits/chosen": -0.7442782521247864, "logits/rejected": -0.7569425702095032, "logps/chosen": -70.04218292236328, "logps/rejected": -76.72520446777344, "loss": 13.364, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.015433462336659431, "rewards/margins": 0.052092112600803375, "rewards/rejected": -0.03665865212678909, "step": 385 }, { "epoch": 0.22419701457861416, "grad_norm": 309.5966491699219, "learning_rate": 4.439279488669379e-06, "logits/chosen": -0.6254906058311462, "logits/rejected": -0.6205364465713501, "logps/chosen": -72.59516906738281, "logps/rejected": -75.21646881103516, "loss": 13.6555, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.004701803904026747, "rewards/margins": 0.02681594528257847, "rewards/rejected": -0.031517744064331055, "step": 386 }, { "epoch": 0.2247778358599059, "grad_norm": 288.473876953125, "learning_rate": 4.437826844857641e-06, "logits/chosen": -0.8325392603874207, "logits/rejected": -1.0077170133590698, "logps/chosen": -72.17142486572266, "logps/rejected": -77.3849105834961, "loss": 13.1622, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.030657533556222916, "rewards/margins": 0.08042553812265396, "rewards/rejected": -0.049768008291721344, "step": 387 }, { "epoch": 0.22535865714119765, "grad_norm": 337.0882263183594, "learning_rate": 4.436374201045904e-06, "logits/chosen": -0.6630915999412537, "logits/rejected": -0.7251507639884949, "logps/chosen": -79.76948547363281, "logps/rejected": -77.0470199584961, "loss": 14.1512, "rewards/accuracies": 0.5, "rewards/chosen": -0.01571335829794407, "rewards/margins": -0.025171738117933273, "rewards/rejected": 0.009458379819989204, "step": 388 }, { "epoch": 0.2259394784224894, "grad_norm": 312.39605712890625, "learning_rate": 4.434921557234166e-06, "logits/chosen": -0.7051008343696594, "logits/rejected": -0.7591395974159241, "logps/chosen": -70.51537322998047, "logps/rejected": -74.8718032836914, "loss": 14.1159, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.05321550369262695, "rewards/margins": -0.019576644524931908, "rewards/rejected": -0.033638857305049896, "step": 389 }, { "epoch": 0.22652029970378115, "grad_norm": 318.12457275390625, "learning_rate": 4.433468913422429e-06, "logits/chosen": -0.7092422246932983, "logits/rejected": -0.7855595946311951, "logps/chosen": -73.6144790649414, "logps/rejected": -70.25662231445312, "loss": 13.7536, "rewards/accuracies": 0.5, "rewards/chosen": -0.015159961767494678, "rewards/margins": 0.01655631884932518, "rewards/rejected": -0.03171628341078758, "step": 390 }, { "epoch": 0.2271011209850729, "grad_norm": 297.23565673828125, "learning_rate": 4.432016269610692e-06, "logits/chosen": -0.6911486387252808, "logits/rejected": -0.5650784373283386, "logps/chosen": -61.7591438293457, "logps/rejected": -79.15550231933594, "loss": 13.4395, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.007602124474942684, "rewards/margins": 0.0479809045791626, "rewards/rejected": -0.04037877544760704, "step": 391 }, { "epoch": 0.22768194226636465, "grad_norm": 315.382080078125, "learning_rate": 4.4305636257989546e-06, "logits/chosen": -0.7994186282157898, "logits/rejected": -0.8166016340255737, "logps/chosen": -81.44297790527344, "logps/rejected": -74.20890808105469, "loss": 13.8923, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.06704780459403992, "rewards/margins": 0.012969402596354485, "rewards/rejected": -0.08001720905303955, "step": 392 }, { "epoch": 0.2282627635476564, "grad_norm": 310.18865966796875, "learning_rate": 4.4291109819872165e-06, "logits/chosen": -0.6670494079589844, "logits/rejected": -0.7397955656051636, "logps/chosen": -69.39032745361328, "logps/rejected": -77.76183319091797, "loss": 13.8815, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.033610161393880844, "rewards/margins": 0.0040161325596272945, "rewards/rejected": -0.037626296281814575, "step": 393 }, { "epoch": 0.22884358482894812, "grad_norm": 1028.6781005859375, "learning_rate": 4.427658338175479e-06, "logits/chosen": -0.6542873978614807, "logits/rejected": -0.6277574300765991, "logps/chosen": -71.77285766601562, "logps/rejected": -82.8670654296875, "loss": 13.6692, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.003381280694156885, "rewards/margins": 0.02418811246752739, "rewards/rejected": -0.02756938897073269, "step": 394 }, { "epoch": 0.22942440611023987, "grad_norm": 300.6793212890625, "learning_rate": 4.426205694363742e-06, "logits/chosen": -0.657829761505127, "logits/rejected": -0.6191960573196411, "logps/chosen": -71.67803192138672, "logps/rejected": -71.7804183959961, "loss": 13.7474, "rewards/accuracies": 0.75, "rewards/chosen": 0.0207084771245718, "rewards/margins": 0.01424718089401722, "rewards/rejected": 0.006461297161877155, "step": 395 }, { "epoch": 0.23000522739153162, "grad_norm": 336.28759765625, "learning_rate": 4.424753050552005e-06, "logits/chosen": -0.7785569429397583, "logits/rejected": -0.8487392663955688, "logps/chosen": -82.74131774902344, "logps/rejected": -74.78742980957031, "loss": 14.0335, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.03686489909887314, "rewards/margins": -0.003692910075187683, "rewards/rejected": -0.03317200019955635, "step": 396 }, { "epoch": 0.23058604867282337, "grad_norm": 300.73101806640625, "learning_rate": 4.4233004067402675e-06, "logits/chosen": -0.7787965536117554, "logits/rejected": -0.7510574460029602, "logps/chosen": -72.66011047363281, "logps/rejected": -75.55070495605469, "loss": 13.6585, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.015632599592208862, "rewards/margins": 0.02679685689508915, "rewards/rejected": -0.04242945462465286, "step": 397 }, { "epoch": 0.23116686995411512, "grad_norm": 297.7178955078125, "learning_rate": 4.42184776292853e-06, "logits/chosen": -0.6219455003738403, "logits/rejected": -0.619706928730011, "logps/chosen": -72.49166107177734, "logps/rejected": -64.84117126464844, "loss": 13.6924, "rewards/accuracies": 0.5, "rewards/chosen": -0.011766968294978142, "rewards/margins": 0.021194420754909515, "rewards/rejected": -0.03296138718724251, "step": 398 }, { "epoch": 0.23174769123540687, "grad_norm": 1181.5919189453125, "learning_rate": 4.420395119116793e-06, "logits/chosen": -0.7124764323234558, "logits/rejected": -0.7599982023239136, "logps/chosen": -82.6731185913086, "logps/rejected": -77.68875885009766, "loss": 13.8524, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.004232340957969427, "rewards/margins": 0.005291967652738094, "rewards/rejected": -0.001059626811183989, "step": 399 }, { "epoch": 0.23232851251669862, "grad_norm": 291.1778869628906, "learning_rate": 4.418942475305056e-06, "logits/chosen": -0.707445502281189, "logits/rejected": -0.7544277310371399, "logps/chosen": -67.69174194335938, "logps/rejected": -73.61653137207031, "loss": 13.4031, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.02160044014453888, "rewards/margins": 0.05302933603525162, "rewards/rejected": -0.03142889216542244, "step": 400 }, { "epoch": 0.23290933379799036, "grad_norm": 308.5126037597656, "learning_rate": 4.4174898314933185e-06, "logits/chosen": -0.7360697984695435, "logits/rejected": -0.6314154863357544, "logps/chosen": -73.9341049194336, "logps/rejected": -75.5310287475586, "loss": 13.7204, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -7.766112685203552e-05, "rewards/margins": 0.02340327948331833, "rewards/rejected": -0.023480940610170364, "step": 401 }, { "epoch": 0.2334901550792821, "grad_norm": 297.6388244628906, "learning_rate": 4.416037187681581e-06, "logits/chosen": -0.6773185729980469, "logits/rejected": -0.6966699957847595, "logps/chosen": -71.93721008300781, "logps/rejected": -73.58920288085938, "loss": 13.8046, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.014304883778095245, "rewards/margins": 0.012389096431434155, "rewards/rejected": 0.0019157860660925508, "step": 402 }, { "epoch": 0.23407097636057386, "grad_norm": 319.0065612792969, "learning_rate": 4.414584543869844e-06, "logits/chosen": -0.6499051451683044, "logits/rejected": -0.9121743440628052, "logps/chosen": -81.61251831054688, "logps/rejected": -78.27536010742188, "loss": 13.3141, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0065872580744326115, "rewards/margins": 0.07194839417934418, "rewards/rejected": -0.07853565365076065, "step": 403 }, { "epoch": 0.2346517976418656, "grad_norm": 398.7077941894531, "learning_rate": 4.413131900058107e-06, "logits/chosen": -0.6521409749984741, "logits/rejected": -0.7569887638092041, "logps/chosen": -71.24628448486328, "logps/rejected": -66.81242370605469, "loss": 13.7869, "rewards/accuracies": 0.5, "rewards/chosen": 0.0004381166654638946, "rewards/margins": 0.012258688919246197, "rewards/rejected": -0.011820574291050434, "step": 404 }, { "epoch": 0.23523261892315733, "grad_norm": 296.407470703125, "learning_rate": 4.411679256246369e-06, "logits/chosen": -0.8323850631713867, "logits/rejected": -0.7685847282409668, "logps/chosen": -70.29356384277344, "logps/rejected": -73.77702331542969, "loss": 13.6071, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.008858191780745983, "rewards/margins": 0.034311629831790924, "rewards/rejected": -0.025453437119722366, "step": 405 }, { "epoch": 0.23581344020444908, "grad_norm": 316.6470031738281, "learning_rate": 4.410226612434631e-06, "logits/chosen": -0.8853529691696167, "logits/rejected": -0.8087629079818726, "logps/chosen": -74.18810272216797, "logps/rejected": -75.40741729736328, "loss": 13.8599, "rewards/accuracies": 0.5, "rewards/chosen": 0.02046876773238182, "rewards/margins": 0.006527154240757227, "rewards/rejected": 0.013941613025963306, "step": 406 }, { "epoch": 0.23639426148574083, "grad_norm": 307.0223388671875, "learning_rate": 4.408773968622894e-06, "logits/chosen": -0.5616191625595093, "logits/rejected": -0.5830402970314026, "logps/chosen": -73.06590270996094, "logps/rejected": -75.1976318359375, "loss": 13.5981, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.030349424108862877, "rewards/margins": 0.031421925872564316, "rewards/rejected": -0.0010725029278546572, "step": 407 }, { "epoch": 0.23697508276703258, "grad_norm": 306.7025146484375, "learning_rate": 4.407321324811157e-06, "logits/chosen": -0.6714197993278503, "logits/rejected": -0.7746154069900513, "logps/chosen": -76.53195190429688, "logps/rejected": -69.44393920898438, "loss": 13.6952, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0025624283589422703, "rewards/margins": 0.02050391212105751, "rewards/rejected": -0.023066340014338493, "step": 408 }, { "epoch": 0.23755590404832433, "grad_norm": 315.3130187988281, "learning_rate": 4.40586868099942e-06, "logits/chosen": -0.6545313000679016, "logits/rejected": -0.6212127804756165, "logps/chosen": -75.73979187011719, "logps/rejected": -75.5727310180664, "loss": 13.0526, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.03893669322133064, "rewards/margins": 0.08733826875686646, "rewards/rejected": -0.04840157553553581, "step": 409 }, { "epoch": 0.23813672532961608, "grad_norm": 319.98065185546875, "learning_rate": 4.404416037187682e-06, "logits/chosen": -0.6923006772994995, "logits/rejected": -0.5738077163696289, "logps/chosen": -72.50543975830078, "logps/rejected": -73.67253112792969, "loss": 13.8894, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.011189279146492481, "rewards/margins": 0.001272419816814363, "rewards/rejected": -0.012461700476706028, "step": 410 }, { "epoch": 0.23871754661090783, "grad_norm": 353.4020080566406, "learning_rate": 4.402963393375945e-06, "logits/chosen": -0.6882558465003967, "logits/rejected": -0.6723198890686035, "logps/chosen": -75.6532211303711, "logps/rejected": -70.86685943603516, "loss": 13.8372, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0034420304000377655, "rewards/margins": 0.008326428011059761, "rewards/rejected": -0.004884395748376846, "step": 411 }, { "epoch": 0.23929836789219958, "grad_norm": 305.5731506347656, "learning_rate": 4.401510749564207e-06, "logits/chosen": -0.7900495529174805, "logits/rejected": -0.7300332188606262, "logps/chosen": -69.94303894042969, "logps/rejected": -73.22996520996094, "loss": 13.4269, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.033657319843769073, "rewards/margins": 0.048354774713516235, "rewards/rejected": -0.014697456732392311, "step": 412 }, { "epoch": 0.23987918917349133, "grad_norm": 337.9052734375, "learning_rate": 4.40005810575247e-06, "logits/chosen": -0.7297667264938354, "logits/rejected": -0.8206753730773926, "logps/chosen": -79.34073638916016, "logps/rejected": -79.64215850830078, "loss": 13.3179, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.033630721271038055, "rewards/margins": 0.06482113897800446, "rewards/rejected": -0.03119041956961155, "step": 413 }, { "epoch": 0.24046001045478307, "grad_norm": 328.78985595703125, "learning_rate": 4.3986054619407325e-06, "logits/chosen": -0.5856087803840637, "logits/rejected": -0.6253767013549805, "logps/chosen": -76.09996032714844, "logps/rejected": -76.34832763671875, "loss": 14.0925, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.030036872252821922, "rewards/margins": -0.014562776312232018, "rewards/rejected": -0.015474090352654457, "step": 414 }, { "epoch": 0.24104083173607482, "grad_norm": 297.6679382324219, "learning_rate": 4.397152818128995e-06, "logits/chosen": -0.8083240389823914, "logits/rejected": -0.7946100234985352, "logps/chosen": -72.3720703125, "logps/rejected": -68.49833679199219, "loss": 14.016, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.012605169788002968, "rewards/margins": -0.009788742288947105, "rewards/rejected": -0.0028164288960397243, "step": 415 }, { "epoch": 0.24162165301736654, "grad_norm": 310.3009338378906, "learning_rate": 4.395700174317258e-06, "logits/chosen": -0.6257360577583313, "logits/rejected": -0.6403561234474182, "logps/chosen": -76.71080017089844, "logps/rejected": -77.48155212402344, "loss": 13.5, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.04251420125365257, "rewards/margins": 0.04183372110128403, "rewards/rejected": 0.0006804756703786552, "step": 416 }, { "epoch": 0.2422024742986583, "grad_norm": 325.7873840332031, "learning_rate": 4.394247530505521e-06, "logits/chosen": -0.6726334691047668, "logits/rejected": -0.6855700016021729, "logps/chosen": -75.29472351074219, "logps/rejected": -71.83308410644531, "loss": 14.0017, "rewards/accuracies": 0.5, "rewards/chosen": 0.018920384347438812, "rewards/margins": -0.009463165886700153, "rewards/rejected": 0.02838354930281639, "step": 417 }, { "epoch": 0.24278329557995004, "grad_norm": 542.8521728515625, "learning_rate": 4.3927948866937835e-06, "logits/chosen": -0.5671173334121704, "logits/rejected": -0.5828499794006348, "logps/chosen": -80.865966796875, "logps/rejected": -77.36320495605469, "loss": 14.1966, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.010454867966473103, "rewards/margins": -0.0281839556992054, "rewards/rejected": 0.01772909425199032, "step": 418 }, { "epoch": 0.2433641168612418, "grad_norm": 313.10650634765625, "learning_rate": 4.391342242882045e-06, "logits/chosen": -0.5604667663574219, "logits/rejected": -0.6470273733139038, "logps/chosen": -72.81732177734375, "logps/rejected": -70.16290283203125, "loss": 13.2515, "rewards/accuracies": 0.5, "rewards/chosen": 0.017285270616412163, "rewards/margins": 0.07715705037117004, "rewards/rejected": -0.05987178534269333, "step": 419 }, { "epoch": 0.24394493814253354, "grad_norm": 322.7060241699219, "learning_rate": 4.389889599070308e-06, "logits/chosen": -0.7095221281051636, "logits/rejected": -0.70353764295578, "logps/chosen": -72.71652221679688, "logps/rejected": -67.59053802490234, "loss": 14.0589, "rewards/accuracies": 0.5, "rewards/chosen": -0.017042094841599464, "rewards/margins": -0.016451817005872726, "rewards/rejected": -0.0005902774864807725, "step": 420 }, { "epoch": 0.2445257594238253, "grad_norm": 285.8162536621094, "learning_rate": 4.388436955258571e-06, "logits/chosen": -0.8471421003341675, "logits/rejected": -0.7050653696060181, "logps/chosen": -71.2718505859375, "logps/rejected": -66.46906280517578, "loss": 13.5971, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.004547786898910999, "rewards/margins": 0.030440161004662514, "rewards/rejected": -0.02589237317442894, "step": 421 }, { "epoch": 0.24510658070511704, "grad_norm": 312.5526428222656, "learning_rate": 4.386984311446834e-06, "logits/chosen": -0.6499220728874207, "logits/rejected": -0.7068791389465332, "logps/chosen": -70.94161987304688, "logps/rejected": -67.37171936035156, "loss": 13.7637, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.014181653037667274, "rewards/margins": 0.019841155037283897, "rewards/rejected": -0.005659504793584347, "step": 422 }, { "epoch": 0.2456874019864088, "grad_norm": 306.18194580078125, "learning_rate": 4.385531667635096e-06, "logits/chosen": -0.6181563138961792, "logits/rejected": -0.7241290807723999, "logps/chosen": -63.1754035949707, "logps/rejected": -76.5521469116211, "loss": 13.3346, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.06029351428151131, "rewards/margins": 0.05993475764989853, "rewards/rejected": 0.00035875439061783254, "step": 423 }, { "epoch": 0.24626822326770054, "grad_norm": 307.78765869140625, "learning_rate": 4.384079023823359e-06, "logits/chosen": -0.488178014755249, "logits/rejected": -0.6498016119003296, "logps/chosen": -70.04771423339844, "logps/rejected": -67.56314849853516, "loss": 13.8241, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.002901268657296896, "rewards/margins": 0.014369833283126354, "rewards/rejected": -0.011468565091490746, "step": 424 }, { "epoch": 0.24684904454899229, "grad_norm": 321.4908142089844, "learning_rate": 4.382626380011621e-06, "logits/chosen": -0.7453001737594604, "logits/rejected": -0.7693713903427124, "logps/chosen": -69.00504302978516, "logps/rejected": -69.38874816894531, "loss": 14.0894, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.03128911182284355, "rewards/margins": -0.019808074459433556, "rewards/rejected": -0.011481037363409996, "step": 425 }, { "epoch": 0.24742986583028403, "grad_norm": 306.6564025878906, "learning_rate": 4.381173736199884e-06, "logits/chosen": -0.6882332563400269, "logits/rejected": -0.666528582572937, "logps/chosen": -71.15715026855469, "logps/rejected": -72.64927673339844, "loss": 13.5929, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.045943137258291245, "rewards/margins": 0.03135029971599579, "rewards/rejected": 0.01459283847361803, "step": 426 }, { "epoch": 0.24801068711157576, "grad_norm": 319.85552978515625, "learning_rate": 4.3797210923881465e-06, "logits/chosen": -0.6911331415176392, "logits/rejected": -0.6865822076797485, "logps/chosen": -70.95181274414062, "logps/rejected": -69.65785217285156, "loss": 13.6944, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.00031525566009804606, "rewards/margins": 0.025617409497499466, "rewards/rejected": -0.025302153080701828, "step": 427 }, { "epoch": 0.2485915083928675, "grad_norm": 305.7183532714844, "learning_rate": 4.378268448576409e-06, "logits/chosen": -0.5291553735733032, "logits/rejected": -0.5490658283233643, "logps/chosen": -69.1864242553711, "logps/rejected": -74.73960876464844, "loss": 13.8674, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.005592182278633118, "rewards/margins": 0.003548829350620508, "rewards/rejected": 0.002043351763859391, "step": 428 }, { "epoch": 0.24917232967415925, "grad_norm": 297.0389404296875, "learning_rate": 4.376815804764672e-06, "logits/chosen": -0.6868799924850464, "logits/rejected": -0.71811842918396, "logps/chosen": -67.46659088134766, "logps/rejected": -66.69960021972656, "loss": 13.6168, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0253940187394619, "rewards/margins": 0.02877502143383026, "rewards/rejected": -0.0033809959422796965, "step": 429 }, { "epoch": 0.249753150955451, "grad_norm": 304.1397705078125, "learning_rate": 4.375363160952935e-06, "logits/chosen": -0.6851626634597778, "logits/rejected": -0.7611708641052246, "logps/chosen": -66.39443969726562, "logps/rejected": -74.42601013183594, "loss": 14.2112, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.003943081013858318, "rewards/margins": -0.02984614297747612, "rewards/rejected": 0.03378922492265701, "step": 430 }, { "epoch": 0.25033397223674275, "grad_norm": 294.4915771484375, "learning_rate": 4.3739105171411975e-06, "logits/chosen": -0.6287668943405151, "logits/rejected": -0.6079251170158386, "logps/chosen": -75.12745666503906, "logps/rejected": -70.1561508178711, "loss": 13.7044, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.015175809152424335, "rewards/margins": 0.02054971642792225, "rewards/rejected": -0.005373907275497913, "step": 431 }, { "epoch": 0.2509147935180345, "grad_norm": 331.4616394042969, "learning_rate": 4.3724578733294594e-06, "logits/chosen": -0.6616209149360657, "logits/rejected": -0.6820401549339294, "logps/chosen": -65.7193374633789, "logps/rejected": -67.92347717285156, "loss": 13.6835, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.032937806099653244, "rewards/margins": 0.024601612240076065, "rewards/rejected": 0.008336210623383522, "step": 432 }, { "epoch": 0.25149561479932625, "grad_norm": 302.4757385253906, "learning_rate": 4.371005229517722e-06, "logits/chosen": -0.5871397852897644, "logits/rejected": -0.6894339323043823, "logps/chosen": -70.20496368408203, "logps/rejected": -75.09275817871094, "loss": 13.6636, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0664600282907486, "rewards/margins": 0.022688765078783035, "rewards/rejected": 0.04377124831080437, "step": 433 }, { "epoch": 0.25207643608061797, "grad_norm": 301.0609436035156, "learning_rate": 4.369552585705985e-06, "logits/chosen": -0.72092604637146, "logits/rejected": -0.7814493775367737, "logps/chosen": -69.15807342529297, "logps/rejected": -69.16731262207031, "loss": 13.3789, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.03117241896688938, "rewards/margins": 0.05374479293823242, "rewards/rejected": -0.02257237769663334, "step": 434 }, { "epoch": 0.25265725736190975, "grad_norm": 310.7433776855469, "learning_rate": 4.368099941894248e-06, "logits/chosen": -0.5796935558319092, "logits/rejected": -0.6400626301765442, "logps/chosen": -74.19486236572266, "logps/rejected": -78.7645263671875, "loss": 13.3789, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.040846049785614014, "rewards/margins": 0.05530615895986557, "rewards/rejected": -0.014460104517638683, "step": 435 }, { "epoch": 0.25323807864320147, "grad_norm": 320.4769287109375, "learning_rate": 4.36664729808251e-06, "logits/chosen": -0.7232745289802551, "logits/rejected": -0.850020706653595, "logps/chosen": -66.91292572021484, "logps/rejected": -63.138587951660156, "loss": 14.3721, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.009416877292096615, "rewards/margins": -0.04486392065882683, "rewards/rejected": 0.03544704243540764, "step": 436 }, { "epoch": 0.25381889992449325, "grad_norm": 322.4789733886719, "learning_rate": 4.365194654270773e-06, "logits/chosen": -0.7586982846260071, "logits/rejected": -0.8501695394515991, "logps/chosen": -74.4521255493164, "logps/rejected": -72.19835662841797, "loss": 13.5591, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.026751860976219177, "rewards/margins": 0.03630850464105606, "rewards/rejected": -0.009556648321449757, "step": 437 }, { "epoch": 0.25439972120578497, "grad_norm": 301.9642639160156, "learning_rate": 4.363742010459036e-06, "logits/chosen": -0.7936211228370667, "logits/rejected": -0.8182106018066406, "logps/chosen": -68.75806427001953, "logps/rejected": -67.1334457397461, "loss": 13.7116, "rewards/accuracies": 0.5, "rewards/chosen": 0.03210698813199997, "rewards/margins": 0.019230013713240623, "rewards/rejected": 0.012876978144049644, "step": 438 }, { "epoch": 0.25498054248707674, "grad_norm": 317.6552429199219, "learning_rate": 4.362289366647298e-06, "logits/chosen": -0.6283025741577148, "logits/rejected": -0.5947784781455994, "logps/chosen": -72.60492706298828, "logps/rejected": -81.98297882080078, "loss": 13.6121, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.029752427712082863, "rewards/margins": 0.03360595554113388, "rewards/rejected": -0.003853529691696167, "step": 439 }, { "epoch": 0.25556136376836847, "grad_norm": 301.3587951660156, "learning_rate": 4.3608367228355606e-06, "logits/chosen": -0.521526575088501, "logits/rejected": -0.6442473530769348, "logps/chosen": -72.14176177978516, "logps/rejected": -79.62040710449219, "loss": 13.238, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.06225038319826126, "rewards/margins": 0.06984097510576248, "rewards/rejected": -0.007590600289404392, "step": 440 }, { "epoch": 0.25614218504966024, "grad_norm": 301.3314514160156, "learning_rate": 4.359384079023824e-06, "logits/chosen": -0.5870726108551025, "logits/rejected": -0.561357855796814, "logps/chosen": -69.20145416259766, "logps/rejected": -71.50432586669922, "loss": 13.8883, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.029437389224767685, "rewards/margins": 0.002829763339832425, "rewards/rejected": 0.026607628911733627, "step": 441 }, { "epoch": 0.25672300633095196, "grad_norm": 300.83056640625, "learning_rate": 4.357931435212087e-06, "logits/chosen": -0.6320289969444275, "logits/rejected": -0.6687763333320618, "logps/chosen": -69.56461334228516, "logps/rejected": -73.77947998046875, "loss": 13.5317, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.04403435066342354, "rewards/margins": 0.036295950412750244, "rewards/rejected": 0.007738398853689432, "step": 442 }, { "epoch": 0.2573038276122437, "grad_norm": 299.238525390625, "learning_rate": 4.35647879140035e-06, "logits/chosen": -0.7340787053108215, "logits/rejected": -0.8285503387451172, "logps/chosen": -69.81163024902344, "logps/rejected": -69.9961929321289, "loss": 13.6267, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.020665764808654785, "rewards/margins": 0.035551171749830246, "rewards/rejected": -0.014885407872498035, "step": 443 }, { "epoch": 0.25788464889353546, "grad_norm": 332.7782897949219, "learning_rate": 4.3550261475886116e-06, "logits/chosen": -0.6751303672790527, "logits/rejected": -0.7156798243522644, "logps/chosen": -74.23109436035156, "logps/rejected": -77.84270477294922, "loss": 14.0507, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0388188436627388, "rewards/margins": -0.01138681173324585, "rewards/rejected": 0.05020565912127495, "step": 444 }, { "epoch": 0.2584654701748272, "grad_norm": 296.38922119140625, "learning_rate": 4.353573503776874e-06, "logits/chosen": -0.8190110921859741, "logits/rejected": -0.6782156229019165, "logps/chosen": -73.74525451660156, "logps/rejected": -72.44633483886719, "loss": 13.3661, "rewards/accuracies": 0.75, "rewards/chosen": 0.07304216921329498, "rewards/margins": 0.05356328561902046, "rewards/rejected": 0.019478892907500267, "step": 445 }, { "epoch": 0.25904629145611896, "grad_norm": 310.6565246582031, "learning_rate": 4.352120859965137e-06, "logits/chosen": -0.5905870795249939, "logits/rejected": -0.6286332011222839, "logps/chosen": -73.00715637207031, "logps/rejected": -72.94271850585938, "loss": 13.9914, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.0388491153717041, "rewards/margins": -0.007616178598254919, "rewards/rejected": 0.04646529257297516, "step": 446 }, { "epoch": 0.2596271127374107, "grad_norm": 352.046875, "learning_rate": 4.3506682161534e-06, "logits/chosen": -0.7802606225013733, "logits/rejected": -0.7054556608200073, "logps/chosen": -85.91800689697266, "logps/rejected": -72.05784606933594, "loss": 14.0159, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.019779253751039505, "rewards/margins": -0.008547335863113403, "rewards/rejected": 0.02832658588886261, "step": 447 }, { "epoch": 0.26020793401870246, "grad_norm": 362.30511474609375, "learning_rate": 4.3492155723416626e-06, "logits/chosen": -0.6785917282104492, "logits/rejected": -0.6494299173355103, "logps/chosen": -78.44164276123047, "logps/rejected": -72.85713958740234, "loss": 14.2555, "rewards/accuracies": 0.5, "rewards/chosen": 0.010164814069867134, "rewards/margins": -0.03138233348727226, "rewards/rejected": 0.04154714569449425, "step": 448 }, { "epoch": 0.2607887552999942, "grad_norm": 317.1739196777344, "learning_rate": 4.347762928529925e-06, "logits/chosen": -0.6515554189682007, "logits/rejected": -0.6316680908203125, "logps/chosen": -70.87915802001953, "logps/rejected": -73.50413513183594, "loss": 13.6688, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.02080373838543892, "rewards/margins": 0.025081777945160866, "rewards/rejected": -0.0042780423536896706, "step": 449 }, { "epoch": 0.26136957658128596, "grad_norm": 315.79498291015625, "learning_rate": 4.346310284718187e-06, "logits/chosen": -0.6134510040283203, "logits/rejected": -0.5557273626327515, "logps/chosen": -74.24983215332031, "logps/rejected": -73.60564422607422, "loss": 13.8965, "rewards/accuracies": 0.5, "rewards/chosen": 0.027813846245408058, "rewards/margins": 0.00493066618219018, "rewards/rejected": 0.022883176803588867, "step": 450 }, { "epoch": 0.2619503978625777, "grad_norm": 313.5321350097656, "learning_rate": 4.34485764090645e-06, "logits/chosen": -0.6806867718696594, "logits/rejected": -0.6897571086883545, "logps/chosen": -70.46429443359375, "logps/rejected": -72.20818328857422, "loss": 14.0127, "rewards/accuracies": 0.5, "rewards/chosen": 0.0054542249999940395, "rewards/margins": -0.009584503248333931, "rewards/rejected": 0.015038728713989258, "step": 451 }, { "epoch": 0.26253121914386945, "grad_norm": 314.8046875, "learning_rate": 4.343404997094713e-06, "logits/chosen": -0.5489099621772766, "logits/rejected": -0.6308473348617554, "logps/chosen": -73.40983581542969, "logps/rejected": -69.65528106689453, "loss": 13.9858, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.021905479952692986, "rewards/margins": -0.0018030557548627257, "rewards/rejected": 0.023708533495664597, "step": 452 }, { "epoch": 0.2631120404251612, "grad_norm": 567.4598388671875, "learning_rate": 4.3419523532829754e-06, "logits/chosen": -0.6974093914031982, "logits/rejected": -0.7631121873855591, "logps/chosen": -70.07582092285156, "logps/rejected": -81.98683166503906, "loss": 13.3815, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.05924884229898453, "rewards/margins": 0.05474211648106575, "rewards/rejected": 0.004506723489612341, "step": 453 }, { "epoch": 0.2636928617064529, "grad_norm": 343.7535400390625, "learning_rate": 4.340499709471238e-06, "logits/chosen": -0.6991898417472839, "logits/rejected": -0.6066843867301941, "logps/chosen": -75.21280670166016, "logps/rejected": -74.64125061035156, "loss": 14.3191, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.0069407084956765175, "rewards/margins": -0.04137198626995087, "rewards/rejected": 0.04831269383430481, "step": 454 }, { "epoch": 0.2642736829877447, "grad_norm": 317.2054138183594, "learning_rate": 4.339047065659501e-06, "logits/chosen": -0.7418617010116577, "logits/rejected": -0.7775954008102417, "logps/chosen": -69.52796936035156, "logps/rejected": -70.95177459716797, "loss": 13.4347, "rewards/accuracies": 0.5, "rewards/chosen": 0.04331202059984207, "rewards/margins": 0.04723655804991722, "rewards/rejected": -0.00392454257234931, "step": 455 }, { "epoch": 0.2648545042690364, "grad_norm": 315.6234130859375, "learning_rate": 4.337594421847764e-06, "logits/chosen": -0.7921939492225647, "logits/rejected": -0.9472505450248718, "logps/chosen": -69.9919662475586, "logps/rejected": -82.42518615722656, "loss": 13.9178, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.0388253815472126, "rewards/margins": 0.01137163583189249, "rewards/rejected": 0.027453750371932983, "step": 456 }, { "epoch": 0.26543532555032817, "grad_norm": 308.3517150878906, "learning_rate": 4.336141778036026e-06, "logits/chosen": -0.6653806567192078, "logits/rejected": -0.5663945078849792, "logps/chosen": -73.91642761230469, "logps/rejected": -70.58480834960938, "loss": 13.8471, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.057994864881038666, "rewards/margins": 0.004966372158378363, "rewards/rejected": 0.05302848666906357, "step": 457 }, { "epoch": 0.2660161468316199, "grad_norm": 325.29302978515625, "learning_rate": 4.334689134224288e-06, "logits/chosen": -0.5894922018051147, "logits/rejected": -0.6214415431022644, "logps/chosen": -74.24130249023438, "logps/rejected": -80.94917297363281, "loss": 14.5074, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.005856601521372795, "rewards/margins": -0.0573112778365612, "rewards/rejected": 0.06316788494586945, "step": 458 }, { "epoch": 0.26659696811291167, "grad_norm": 283.90478515625, "learning_rate": 4.333236490412551e-06, "logits/chosen": -0.657735288143158, "logits/rejected": -0.6617435216903687, "logps/chosen": -76.9471435546875, "logps/rejected": -68.36034393310547, "loss": 13.1023, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.07249955832958221, "rewards/margins": 0.08381040394306183, "rewards/rejected": -0.011310835368931293, "step": 459 }, { "epoch": 0.2671777893942034, "grad_norm": 302.0826416015625, "learning_rate": 4.331783846600814e-06, "logits/chosen": -0.7745707035064697, "logits/rejected": -0.6950326561927795, "logps/chosen": -69.80528259277344, "logps/rejected": -73.91972351074219, "loss": 13.9191, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.027396023273468018, "rewards/margins": -0.00236126477830112, "rewards/rejected": 0.029757294803857803, "step": 460 }, { "epoch": 0.26775861067549517, "grad_norm": 312.1777648925781, "learning_rate": 4.330331202789077e-06, "logits/chosen": -0.6571930050849915, "logits/rejected": -0.7029620409011841, "logps/chosen": -78.16141510009766, "logps/rejected": -80.50270080566406, "loss": 13.3549, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.04237108677625656, "rewards/margins": 0.05985882878303528, "rewards/rejected": -0.01748773828148842, "step": 461 }, { "epoch": 0.2683394319567869, "grad_norm": 314.3011169433594, "learning_rate": 4.328878558977339e-06, "logits/chosen": -0.6543633937835693, "logits/rejected": -0.6337400674819946, "logps/chosen": -67.81694030761719, "logps/rejected": -76.94734954833984, "loss": 13.7697, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.04975013807415962, "rewards/margins": 0.016220813617110252, "rewards/rejected": 0.03352931886911392, "step": 462 }, { "epoch": 0.26892025323807867, "grad_norm": 287.1407775878906, "learning_rate": 4.327425915165602e-06, "logits/chosen": -0.7276408076286316, "logits/rejected": -0.7486574649810791, "logps/chosen": -69.2403335571289, "logps/rejected": -60.5196533203125, "loss": 13.1441, "rewards/accuracies": 0.75, "rewards/chosen": 0.06597266346216202, "rewards/margins": 0.07840771973133087, "rewards/rejected": -0.01243506371974945, "step": 463 }, { "epoch": 0.2695010745193704, "grad_norm": 307.4203796386719, "learning_rate": 4.325973271353864e-06, "logits/chosen": -0.7927466630935669, "logits/rejected": -0.8723392486572266, "logps/chosen": -71.81956481933594, "logps/rejected": -71.75994110107422, "loss": 13.1263, "rewards/accuracies": 0.75, "rewards/chosen": 0.05850011110305786, "rewards/margins": 0.08116643130779266, "rewards/rejected": -0.02266632579267025, "step": 464 }, { "epoch": 0.27008189580066216, "grad_norm": 299.2955017089844, "learning_rate": 4.324520627542127e-06, "logits/chosen": -0.6696484088897705, "logits/rejected": -0.6967477798461914, "logps/chosen": -64.9903564453125, "logps/rejected": -69.08036041259766, "loss": 13.864, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.025707131251692772, "rewards/margins": 0.0107659213244915, "rewards/rejected": 0.014941206201910973, "step": 465 }, { "epoch": 0.2706627170819539, "grad_norm": 328.59283447265625, "learning_rate": 4.3230679837303895e-06, "logits/chosen": -0.8180822134017944, "logits/rejected": -0.7941353917121887, "logps/chosen": -77.0251693725586, "logps/rejected": -79.32392883300781, "loss": 12.6981, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.08409181982278824, "rewards/margins": 0.12518665194511414, "rewards/rejected": -0.0410948283970356, "step": 466 }, { "epoch": 0.2712435383632456, "grad_norm": 302.0883483886719, "learning_rate": 4.321615339918652e-06, "logits/chosen": -0.6744131445884705, "logits/rejected": -0.7126671671867371, "logps/chosen": -82.62992858886719, "logps/rejected": -68.52928161621094, "loss": 13.8315, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.04055127874016762, "rewards/margins": 0.00927782617509365, "rewards/rejected": 0.03127345070242882, "step": 467 }, { "epoch": 0.2718243596445374, "grad_norm": 296.6047668457031, "learning_rate": 4.320162696106915e-06, "logits/chosen": -0.6205381155014038, "logits/rejected": -0.6734114289283752, "logps/chosen": -72.38980865478516, "logps/rejected": -68.17768859863281, "loss": 13.3424, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.059192102402448654, "rewards/margins": 0.06497955322265625, "rewards/rejected": -0.005787448026239872, "step": 468 }, { "epoch": 0.2724051809258291, "grad_norm": 293.75018310546875, "learning_rate": 4.318710052295178e-06, "logits/chosen": -0.8345297574996948, "logits/rejected": -0.8630663752555847, "logps/chosen": -66.68868255615234, "logps/rejected": -69.19453430175781, "loss": 13.0063, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.06119300797581673, "rewards/margins": 0.09242288768291473, "rewards/rejected": -0.031229889020323753, "step": 469 }, { "epoch": 0.2729860022071209, "grad_norm": 330.23846435546875, "learning_rate": 4.3172574084834405e-06, "logits/chosen": -0.6324206590652466, "logits/rejected": -0.6938502192497253, "logps/chosen": -74.11239624023438, "logps/rejected": -73.95758056640625, "loss": 14.0991, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.0044637094251811504, "rewards/margins": -0.018323037773370743, "rewards/rejected": 0.01385932881385088, "step": 470 }, { "epoch": 0.2735668234884126, "grad_norm": 291.638671875, "learning_rate": 4.315804764671702e-06, "logits/chosen": -0.6697665452957153, "logits/rejected": -0.6289907693862915, "logps/chosen": -72.07844543457031, "logps/rejected": -68.84980773925781, "loss": 13.8797, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.03210974857211113, "rewards/margins": 0.0021261770743876696, "rewards/rejected": 0.02998356893658638, "step": 471 }, { "epoch": 0.2741476447697044, "grad_norm": 326.5478515625, "learning_rate": 4.314352120859965e-06, "logits/chosen": -0.6647511124610901, "logits/rejected": -0.6528132557868958, "logps/chosen": -72.49028778076172, "logps/rejected": -72.65093231201172, "loss": 14.1427, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.015104440040886402, "rewards/margins": -0.022749561816453934, "rewards/rejected": 0.037853993475437164, "step": 472 }, { "epoch": 0.2747284660509961, "grad_norm": 330.836181640625, "learning_rate": 4.312899477048228e-06, "logits/chosen": -0.5921165347099304, "logits/rejected": -0.6261785626411438, "logps/chosen": -83.95507049560547, "logps/rejected": -69.69046020507812, "loss": 13.5342, "rewards/accuracies": 0.5, "rewards/chosen": 0.049983587116003036, "rewards/margins": 0.041167039424180984, "rewards/rejected": 0.008816548623144627, "step": 473 }, { "epoch": 0.2753092873322879, "grad_norm": 286.28839111328125, "learning_rate": 4.311446833236491e-06, "logits/chosen": -0.5407285690307617, "logits/rejected": -0.5518749356269836, "logps/chosen": -69.38809967041016, "logps/rejected": -64.3993911743164, "loss": 13.7355, "rewards/accuracies": 0.5, "rewards/chosen": 0.03605775907635689, "rewards/margins": 0.015156927518546581, "rewards/rejected": 0.02090083435177803, "step": 474 }, { "epoch": 0.2758901086135796, "grad_norm": 321.68450927734375, "learning_rate": 4.309994189424753e-06, "logits/chosen": -0.7276217341423035, "logits/rejected": -0.8036016225814819, "logps/chosen": -70.2737045288086, "logps/rejected": -66.9681625366211, "loss": 13.9062, "rewards/accuracies": 0.5, "rewards/chosen": 0.0001468472182750702, "rewards/margins": -0.00017342269711662084, "rewards/rejected": 0.00032026879489421844, "step": 475 }, { "epoch": 0.2764709298948714, "grad_norm": 306.3502502441406, "learning_rate": 4.308541545613016e-06, "logits/chosen": -0.7900197505950928, "logits/rejected": -0.6438810229301453, "logps/chosen": -69.17347717285156, "logps/rejected": -62.60564422607422, "loss": 13.6038, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.04666885733604431, "rewards/margins": 0.03157535940408707, "rewards/rejected": 0.015093490481376648, "step": 476 }, { "epoch": 0.2770517511761631, "grad_norm": 382.4345397949219, "learning_rate": 4.307088901801279e-06, "logits/chosen": -0.8302785158157349, "logits/rejected": -0.830335795879364, "logps/chosen": -83.38411712646484, "logps/rejected": -72.17436981201172, "loss": 13.7722, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.02929898537695408, "rewards/margins": 0.015593672171235085, "rewards/rejected": 0.013705313205718994, "step": 477 }, { "epoch": 0.2776325724574548, "grad_norm": 292.42779541015625, "learning_rate": 4.305636257989541e-06, "logits/chosen": -0.6505283713340759, "logits/rejected": -0.5827508568763733, "logps/chosen": -64.67672729492188, "logps/rejected": -68.16065979003906, "loss": 13.2237, "rewards/accuracies": 0.75, "rewards/chosen": 0.05520979315042496, "rewards/margins": 0.0683043897151947, "rewards/rejected": -0.013094606809318066, "step": 478 }, { "epoch": 0.2782133937387466, "grad_norm": 321.1835632324219, "learning_rate": 4.3041836141778035e-06, "logits/chosen": -0.4862455725669861, "logits/rejected": -0.4693034589290619, "logps/chosen": -70.4814224243164, "logps/rejected": -84.19317626953125, "loss": 13.6491, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.016184937208890915, "rewards/margins": 0.03135867789387703, "rewards/rejected": -0.015173738822340965, "step": 479 }, { "epoch": 0.2787942150200383, "grad_norm": 311.97564697265625, "learning_rate": 4.302730970366066e-06, "logits/chosen": -0.554233193397522, "logits/rejected": -0.5739291310310364, "logps/chosen": -71.45389556884766, "logps/rejected": -76.47578430175781, "loss": 13.6573, "rewards/accuracies": 0.5, "rewards/chosen": 0.028831113129854202, "rewards/margins": 0.03090267814695835, "rewards/rejected": -0.0020715640857815742, "step": 480 }, { "epoch": 0.2793750363013301, "grad_norm": 309.7761535644531, "learning_rate": 4.301278326554329e-06, "logits/chosen": -0.6836757659912109, "logits/rejected": -0.7563328742980957, "logps/chosen": -65.9247817993164, "logps/rejected": -76.11883544921875, "loss": 14.0186, "rewards/accuracies": 0.5, "rewards/chosen": 0.06773559749126434, "rewards/margins": -0.002912606345489621, "rewards/rejected": 0.0706482082605362, "step": 481 }, { "epoch": 0.2799558575826218, "grad_norm": 301.5060729980469, "learning_rate": 4.299825682742592e-06, "logits/chosen": -0.7972155809402466, "logits/rejected": -0.7350047826766968, "logps/chosen": -67.74440002441406, "logps/rejected": -71.73310089111328, "loss": 13.6848, "rewards/accuracies": 0.5, "rewards/chosen": 0.025806616991758347, "rewards/margins": 0.022161057218909264, "rewards/rejected": 0.0036455602385103703, "step": 482 }, { "epoch": 0.2805366788639136, "grad_norm": 301.0857849121094, "learning_rate": 4.2983730389308545e-06, "logits/chosen": -0.6307353973388672, "logits/rejected": -0.5791727900505066, "logps/chosen": -70.24183654785156, "logps/rejected": -69.36378479003906, "loss": 13.4857, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.06295748054981232, "rewards/margins": 0.043685734272003174, "rewards/rejected": 0.019271746277809143, "step": 483 }, { "epoch": 0.2811175001452053, "grad_norm": 685.1707763671875, "learning_rate": 4.296920395119117e-06, "logits/chosen": -0.7495703101158142, "logits/rejected": -0.678167998790741, "logps/chosen": -77.27181243896484, "logps/rejected": -72.3003158569336, "loss": 13.8755, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.010486298240721226, "rewards/margins": 0.006247940473258495, "rewards/rejected": 0.004238357301801443, "step": 484 }, { "epoch": 0.2816983214264971, "grad_norm": 294.2120056152344, "learning_rate": 4.29546775130738e-06, "logits/chosen": -0.5442952513694763, "logits/rejected": -0.6088670492172241, "logps/chosen": -73.0514907836914, "logps/rejected": -73.78404235839844, "loss": 13.6341, "rewards/accuracies": 0.5, "rewards/chosen": -0.0040179165080189705, "rewards/margins": 0.027353759855031967, "rewards/rejected": -0.03137167543172836, "step": 485 }, { "epoch": 0.2822791427077888, "grad_norm": 306.35723876953125, "learning_rate": 4.294015107495643e-06, "logits/chosen": -0.6691688299179077, "logits/rejected": -0.7110737562179565, "logps/chosen": -68.50287628173828, "logps/rejected": -65.523193359375, "loss": 13.5978, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.014910449273884296, "rewards/margins": 0.03183292597532272, "rewards/rejected": -0.016922477632761, "step": 486 }, { "epoch": 0.2828599639890806, "grad_norm": 314.25897216796875, "learning_rate": 4.2925624636839055e-06, "logits/chosen": -0.6867062449455261, "logits/rejected": -0.6559956073760986, "logps/chosen": -86.4985122680664, "logps/rejected": -67.22306060791016, "loss": 13.7589, "rewards/accuracies": 0.5, "rewards/chosen": 0.01645682193338871, "rewards/margins": 0.017809275537729263, "rewards/rejected": -0.001352452440187335, "step": 487 }, { "epoch": 0.2834407852703723, "grad_norm": 288.4145812988281, "learning_rate": 4.291109819872168e-06, "logits/chosen": -0.6363990902900696, "logits/rejected": -0.6217866539955139, "logps/chosen": -72.58358764648438, "logps/rejected": -71.9157485961914, "loss": 13.0157, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.0661824643611908, "rewards/margins": 0.09238552302122116, "rewards/rejected": -0.026203066110610962, "step": 488 }, { "epoch": 0.28402160655166403, "grad_norm": 304.28765869140625, "learning_rate": 4.28965717606043e-06, "logits/chosen": -0.7047585248947144, "logits/rejected": -0.6908853054046631, "logps/chosen": -69.95263671875, "logps/rejected": -71.68666076660156, "loss": 13.3886, "rewards/accuracies": 0.5, "rewards/chosen": 0.04382898285984993, "rewards/margins": 0.059452660381793976, "rewards/rejected": -0.015623673796653748, "step": 489 }, { "epoch": 0.2846024278329558, "grad_norm": 314.20196533203125, "learning_rate": 4.288204532248693e-06, "logits/chosen": -0.7270024418830872, "logits/rejected": -0.7925176620483398, "logps/chosen": -73.37562561035156, "logps/rejected": -85.60382843017578, "loss": 13.1235, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.03009481355547905, "rewards/margins": 0.0827663242816925, "rewards/rejected": -0.052671510726213455, "step": 490 }, { "epoch": 0.28518324911424753, "grad_norm": 296.90582275390625, "learning_rate": 4.286751888436956e-06, "logits/chosen": -0.6854633092880249, "logits/rejected": -0.7035878896713257, "logps/chosen": -68.50250244140625, "logps/rejected": -67.49859619140625, "loss": 13.7267, "rewards/accuracies": 0.5, "rewards/chosen": 0.030942970886826515, "rewards/margins": 0.019369639456272125, "rewards/rejected": 0.011573335155844688, "step": 491 }, { "epoch": 0.2857640703955393, "grad_norm": 292.898681640625, "learning_rate": 4.285299244625218e-06, "logits/chosen": -0.5533262491226196, "logits/rejected": -0.6104485392570496, "logps/chosen": -72.7857666015625, "logps/rejected": -67.43394470214844, "loss": 13.9817, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.005316922441124916, "rewards/margins": -0.0064797671511769295, "rewards/rejected": 0.0011628434294834733, "step": 492 }, { "epoch": 0.286344891676831, "grad_norm": 323.1135559082031, "learning_rate": 4.283846600813481e-06, "logits/chosen": -0.5882741808891296, "logits/rejected": -0.6077271699905396, "logps/chosen": -78.02590942382812, "logps/rejected": -83.53060150146484, "loss": 13.4642, "rewards/accuracies": 0.5, "rewards/chosen": 0.007745922543108463, "rewards/margins": 0.050111640244722366, "rewards/rejected": -0.04236571118235588, "step": 493 }, { "epoch": 0.2869257129581228, "grad_norm": 377.0531921386719, "learning_rate": 4.282393957001744e-06, "logits/chosen": -0.659203827381134, "logits/rejected": -0.6870671510696411, "logps/chosen": -68.08976745605469, "logps/rejected": -78.85200500488281, "loss": 13.4473, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.050796620547771454, "rewards/margins": 0.04922497272491455, "rewards/rejected": 0.0015716440975666046, "step": 494 }, { "epoch": 0.2875065342394145, "grad_norm": 575.7816162109375, "learning_rate": 4.280941313190007e-06, "logits/chosen": -0.6494874954223633, "logits/rejected": -0.6947387456893921, "logps/chosen": -73.44721984863281, "logps/rejected": -77.6927719116211, "loss": 14.4423, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.00039952099905349314, "rewards/margins": -0.050260018557310104, "rewards/rejected": 0.05065953731536865, "step": 495 }, { "epoch": 0.2880873555207063, "grad_norm": 292.5953369140625, "learning_rate": 4.2794886693782685e-06, "logits/chosen": -0.745873749256134, "logits/rejected": -0.6551159024238586, "logps/chosen": -68.93890380859375, "logps/rejected": -70.55396270751953, "loss": 13.5985, "rewards/accuracies": 0.5, "rewards/chosen": 0.01787867583334446, "rewards/margins": 0.032074641436338425, "rewards/rejected": -0.01419596653431654, "step": 496 }, { "epoch": 0.288668176801998, "grad_norm": 317.0806884765625, "learning_rate": 4.278036025566531e-06, "logits/chosen": -0.6900259256362915, "logits/rejected": -0.6669089198112488, "logps/chosen": -72.48905944824219, "logps/rejected": -75.67109680175781, "loss": 13.8434, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.016828546300530434, "rewards/margins": 0.006076467223465443, "rewards/rejected": 0.010752077214419842, "step": 497 }, { "epoch": 0.2892489980832898, "grad_norm": 450.179931640625, "learning_rate": 4.276583381754794e-06, "logits/chosen": -0.7485173344612122, "logits/rejected": -0.8365989923477173, "logps/chosen": -72.33958435058594, "logps/rejected": -69.51062774658203, "loss": 13.3385, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0609331838786602, "rewards/margins": 0.060974687337875366, "rewards/rejected": -4.150420500081964e-05, "step": 498 }, { "epoch": 0.2898298193645815, "grad_norm": 319.4369201660156, "learning_rate": 4.275130737943057e-06, "logits/chosen": -0.659496545791626, "logits/rejected": -0.6515854001045227, "logps/chosen": -70.12030029296875, "logps/rejected": -75.98362731933594, "loss": 13.3334, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.026825930923223495, "rewards/margins": 0.0615021288394928, "rewards/rejected": -0.03467618674039841, "step": 499 }, { "epoch": 0.29041064064587324, "grad_norm": 317.2661437988281, "learning_rate": 4.2736780941313195e-06, "logits/chosen": -0.6833754777908325, "logits/rejected": -0.6367133855819702, "logps/chosen": -81.86927795410156, "logps/rejected": -78.07447814941406, "loss": 13.1256, "rewards/accuracies": 0.75, "rewards/chosen": 0.07352960854768753, "rewards/margins": 0.08241195976734161, "rewards/rejected": -0.008882349357008934, "step": 500 }, { "epoch": 0.290991461927165, "grad_norm": 318.4466247558594, "learning_rate": 4.272225450319582e-06, "logits/chosen": -1.007275104522705, "logits/rejected": -0.8824909329414368, "logps/chosen": -70.7936782836914, "logps/rejected": -71.09184265136719, "loss": 13.608, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.039610691368579865, "rewards/margins": 0.032731592655181885, "rewards/rejected": 0.006879097782075405, "step": 501 }, { "epoch": 0.29157228320845674, "grad_norm": 293.3976745605469, "learning_rate": 4.270772806507845e-06, "logits/chosen": -0.6601558923721313, "logits/rejected": -0.6545599699020386, "logps/chosen": -71.0650405883789, "logps/rejected": -72.53407287597656, "loss": 13.7676, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.00019608512229751796, "rewards/margins": 0.01757933758199215, "rewards/rejected": -0.01777542382478714, "step": 502 }, { "epoch": 0.2921531044897485, "grad_norm": 310.49322509765625, "learning_rate": 4.269320162696107e-06, "logits/chosen": -0.5019787549972534, "logits/rejected": -0.5980736017227173, "logps/chosen": -74.40044403076172, "logps/rejected": -67.54830169677734, "loss": 13.3456, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.03700960800051689, "rewards/margins": 0.0595683753490448, "rewards/rejected": -0.02255876362323761, "step": 503 }, { "epoch": 0.29273392577104024, "grad_norm": 292.01019287109375, "learning_rate": 4.26786751888437e-06, "logits/chosen": -0.5457882881164551, "logits/rejected": -0.7330330610275269, "logps/chosen": -66.13023376464844, "logps/rejected": -67.7120132446289, "loss": 13.7292, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.0030907634645700455, "rewards/margins": 0.03074917569756508, "rewards/rejected": -0.033839933574199677, "step": 504 }, { "epoch": 0.293314747052332, "grad_norm": 314.1589050292969, "learning_rate": 4.2664148750726324e-06, "logits/chosen": -0.6293397545814514, "logits/rejected": -0.7415448427200317, "logps/chosen": -75.83900451660156, "logps/rejected": -84.2178726196289, "loss": 13.2021, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.055055178701877594, "rewards/margins": 0.0833793580532074, "rewards/rejected": -0.0283241868019104, "step": 505 }, { "epoch": 0.29389556833362374, "grad_norm": 288.65093994140625, "learning_rate": 4.264962231260895e-06, "logits/chosen": -0.5864494442939758, "logits/rejected": -0.7309472560882568, "logps/chosen": -67.95295715332031, "logps/rejected": -80.39786529541016, "loss": 13.0597, "rewards/accuracies": 0.75, "rewards/chosen": 0.03946956992149353, "rewards/margins": 0.09237784147262573, "rewards/rejected": -0.0529082827270031, "step": 506 }, { "epoch": 0.2944763896149155, "grad_norm": 323.42449951171875, "learning_rate": 4.263509587449158e-06, "logits/chosen": -0.49582844972610474, "logits/rejected": -0.6143544912338257, "logps/chosen": -74.20238494873047, "logps/rejected": -66.28921508789062, "loss": 13.5348, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.014446879737079144, "rewards/margins": 0.040076714009046555, "rewards/rejected": -0.025629838928580284, "step": 507 }, { "epoch": 0.29505721089620723, "grad_norm": 304.49407958984375, "learning_rate": 4.262056943637421e-06, "logits/chosen": -0.550538957118988, "logits/rejected": -0.5864927172660828, "logps/chosen": -68.0365982055664, "logps/rejected": -74.60637664794922, "loss": 13.7157, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.017872009426355362, "rewards/margins": 0.020814960822463036, "rewards/rejected": -0.002942953957244754, "step": 508 }, { "epoch": 0.295638032177499, "grad_norm": 396.60736083984375, "learning_rate": 4.2606042998256834e-06, "logits/chosen": -0.7205631732940674, "logits/rejected": -0.6172083616256714, "logps/chosen": -69.93831634521484, "logps/rejected": -71.62530517578125, "loss": 13.0703, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.06218590587377548, "rewards/margins": 0.08707333356142044, "rewards/rejected": -0.02488742023706436, "step": 509 }, { "epoch": 0.29621885345879073, "grad_norm": 340.6492919921875, "learning_rate": 4.259151656013945e-06, "logits/chosen": -0.7098259925842285, "logits/rejected": -0.8209171295166016, "logps/chosen": -79.39151000976562, "logps/rejected": -73.8201904296875, "loss": 13.3678, "rewards/accuracies": 0.5, "rewards/chosen": 0.02004718780517578, "rewards/margins": 0.06106545776128769, "rewards/rejected": -0.04101826995611191, "step": 510 }, { "epoch": 0.29679967474008245, "grad_norm": 304.18780517578125, "learning_rate": 4.257699012202208e-06, "logits/chosen": -0.6429446339607239, "logits/rejected": -0.8323208093643188, "logps/chosen": -70.1782455444336, "logps/rejected": -70.72254943847656, "loss": 13.6496, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.007325345184653997, "rewards/margins": 0.03689438849687576, "rewards/rejected": -0.02956903912127018, "step": 511 }, { "epoch": 0.29738049602137423, "grad_norm": 292.5777587890625, "learning_rate": 4.256246368390471e-06, "logits/chosen": -0.6322038173675537, "logits/rejected": -0.7191158533096313, "logps/chosen": -71.34642028808594, "logps/rejected": -73.82039642333984, "loss": 13.4242, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.007254153490066528, "rewards/margins": 0.05077975243330002, "rewards/rejected": -0.04352560266852379, "step": 512 }, { "epoch": 0.29796131730266595, "grad_norm": 288.7735595703125, "learning_rate": 4.2547937245787336e-06, "logits/chosen": -0.7022128701210022, "logits/rejected": -0.7011411786079407, "logps/chosen": -65.72151184082031, "logps/rejected": -66.15309143066406, "loss": 13.7914, "rewards/accuracies": 0.5, "rewards/chosen": 0.0430249348282814, "rewards/margins": 0.011196841485798359, "rewards/rejected": 0.03182809054851532, "step": 513 }, { "epoch": 0.29854213858395773, "grad_norm": 330.7345275878906, "learning_rate": 4.253341080766996e-06, "logits/chosen": -0.6906915307044983, "logits/rejected": -0.7374723553657532, "logps/chosen": -78.03739929199219, "logps/rejected": -93.80352783203125, "loss": 13.7291, "rewards/accuracies": 0.5, "rewards/chosen": -0.022170495241880417, "rewards/margins": 0.025943556800484657, "rewards/rejected": -0.048114050179719925, "step": 514 }, { "epoch": 0.29912295986524945, "grad_norm": 322.0339050292969, "learning_rate": 4.251888436955259e-06, "logits/chosen": -0.6764459609985352, "logits/rejected": -0.5635499954223633, "logps/chosen": -76.67605590820312, "logps/rejected": -72.45964813232422, "loss": 13.7177, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0028892538975924253, "rewards/margins": 0.023371810093522072, "rewards/rejected": -0.020482560619711876, "step": 515 }, { "epoch": 0.2997037811465412, "grad_norm": 321.15399169921875, "learning_rate": 4.250435793143521e-06, "logits/chosen": -0.6874633431434631, "logits/rejected": -0.7325695157051086, "logps/chosen": -72.60140228271484, "logps/rejected": -78.3973159790039, "loss": 14.8322, "rewards/accuracies": 0.15000000596046448, "rewards/chosen": -0.08024777472019196, "rewards/margins": -0.08848480880260468, "rewards/rejected": 0.008237037807703018, "step": 516 }, { "epoch": 0.30028460242783295, "grad_norm": 309.9378967285156, "learning_rate": 4.248983149331784e-06, "logits/chosen": -0.6688744425773621, "logits/rejected": -0.6749259233474731, "logps/chosen": -77.09115600585938, "logps/rejected": -67.28768157958984, "loss": 13.9949, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.02630709670484066, "rewards/margins": -0.007807762827724218, "rewards/rejected": 0.034114859998226166, "step": 517 }, { "epoch": 0.3008654237091247, "grad_norm": 319.2854919433594, "learning_rate": 4.2475305055200465e-06, "logits/chosen": -0.6039861440658569, "logits/rejected": -0.6350539922714233, "logps/chosen": -80.51014709472656, "logps/rejected": -76.9443130493164, "loss": 13.3171, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.047122977674007416, "rewards/margins": 0.07121749967336655, "rewards/rejected": -0.024094533175230026, "step": 518 }, { "epoch": 0.30144624499041645, "grad_norm": 296.62738037109375, "learning_rate": 4.246077861708309e-06, "logits/chosen": -0.6995037794113159, "logits/rejected": -0.6872465014457703, "logps/chosen": -77.89250946044922, "logps/rejected": -75.6382827758789, "loss": 13.3481, "rewards/accuracies": 0.75, "rewards/chosen": 0.011319695971906185, "rewards/margins": 0.056264109909534454, "rewards/rejected": -0.044944409281015396, "step": 519 }, { "epoch": 0.3020270662717082, "grad_norm": 344.6690979003906, "learning_rate": 4.244625217896572e-06, "logits/chosen": -0.7577833533287048, "logits/rejected": -0.7368292808532715, "logps/chosen": -80.45695495605469, "logps/rejected": -76.34242248535156, "loss": 14.2532, "rewards/accuracies": 0.5, "rewards/chosen": 0.03447725996375084, "rewards/margins": -0.027280423790216446, "rewards/rejected": 0.061757683753967285, "step": 520 }, { "epoch": 0.30260788755299994, "grad_norm": 307.37774658203125, "learning_rate": 4.243172574084835e-06, "logits/chosen": -0.7661414742469788, "logits/rejected": -0.7345612645149231, "logps/chosen": -71.1924057006836, "logps/rejected": -78.25225830078125, "loss": 12.5541, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.10229741036891937, "rewards/margins": 0.14318832755088806, "rewards/rejected": -0.04089091718196869, "step": 521 }, { "epoch": 0.30318870883429166, "grad_norm": 336.8166809082031, "learning_rate": 4.2417199302730975e-06, "logits/chosen": -0.9011874198913574, "logits/rejected": -0.9594923257827759, "logps/chosen": -84.79351806640625, "logps/rejected": -76.28190612792969, "loss": 13.2933, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.008011607453227043, "rewards/margins": 0.06244700402021408, "rewards/rejected": -0.05443539097905159, "step": 522 }, { "epoch": 0.30376953011558344, "grad_norm": 305.69476318359375, "learning_rate": 4.240267286461359e-06, "logits/chosen": -0.41301050782203674, "logits/rejected": -0.5842508673667908, "logps/chosen": -73.02799987792969, "logps/rejected": -72.72782897949219, "loss": 13.5815, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.01740368828177452, "rewards/margins": 0.06376995146274567, "rewards/rejected": -0.046366266906261444, "step": 523 }, { "epoch": 0.30435035139687516, "grad_norm": 359.3056640625, "learning_rate": 4.238814642649622e-06, "logits/chosen": -0.8724691271781921, "logits/rejected": -0.8090575933456421, "logps/chosen": -80.69252014160156, "logps/rejected": -70.90336608886719, "loss": 13.8932, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.0029185772873461246, "rewards/margins": 0.007794302888214588, "rewards/rejected": -0.0048757269978523254, "step": 524 }, { "epoch": 0.30493117267816694, "grad_norm": 307.8243408203125, "learning_rate": 4.237361998837886e-06, "logits/chosen": -0.6465967893600464, "logits/rejected": -0.5403602123260498, "logps/chosen": -72.42932891845703, "logps/rejected": -76.02727508544922, "loss": 13.8421, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.019543122500181198, "rewards/margins": 0.0053735459223389626, "rewards/rejected": 0.014169578440487385, "step": 525 }, { "epoch": 0.30551199395945866, "grad_norm": 317.7290344238281, "learning_rate": 4.2359093550261485e-06, "logits/chosen": -0.7441942691802979, "logits/rejected": -0.739268958568573, "logps/chosen": -70.67054748535156, "logps/rejected": -73.28771209716797, "loss": 13.9572, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.008803701028227806, "rewards/margins": -0.006234263069927692, "rewards/rejected": 0.015037964098155499, "step": 526 }, { "epoch": 0.30609281524075044, "grad_norm": 321.7218017578125, "learning_rate": 4.234456711214411e-06, "logits/chosen": -0.5514706373214722, "logits/rejected": -0.6030277609825134, "logps/chosen": -80.82585144042969, "logps/rejected": -68.28355407714844, "loss": 13.5752, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.025033999234437943, "rewards/margins": 0.037477243691682816, "rewards/rejected": -0.012443247251212597, "step": 527 }, { "epoch": 0.30667363652204216, "grad_norm": 317.5115661621094, "learning_rate": 4.233004067402673e-06, "logits/chosen": -0.6700873374938965, "logits/rejected": -0.6243543028831482, "logps/chosen": -71.29347229003906, "logps/rejected": -69.8293228149414, "loss": 13.4781, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.019291725009679794, "rewards/margins": 0.047863397747278214, "rewards/rejected": -0.02857167460024357, "step": 528 }, { "epoch": 0.30725445780333394, "grad_norm": 304.30230712890625, "learning_rate": 4.231551423590936e-06, "logits/chosen": -0.6290581226348877, "logits/rejected": -0.7105584144592285, "logps/chosen": -76.74227142333984, "logps/rejected": -70.03807067871094, "loss": 13.8268, "rewards/accuracies": 0.5, "rewards/chosen": -0.030986541882157326, "rewards/margins": 0.015768680721521378, "rewards/rejected": -0.046755217015743256, "step": 529 }, { "epoch": 0.30783527908462566, "grad_norm": 308.83099365234375, "learning_rate": 4.230098779779199e-06, "logits/chosen": -0.7084294557571411, "logits/rejected": -0.6327206492424011, "logps/chosen": -79.52645874023438, "logps/rejected": -72.7778549194336, "loss": 13.9131, "rewards/accuracies": 0.5, "rewards/chosen": 0.0055747805163264275, "rewards/margins": 0.0061087412759661674, "rewards/rejected": -0.0005339615163393319, "step": 530 }, { "epoch": 0.30841610036591743, "grad_norm": 309.6053466796875, "learning_rate": 4.228646135967461e-06, "logits/chosen": -0.6223837733268738, "logits/rejected": -0.6086059808731079, "logps/chosen": -72.55455017089844, "logps/rejected": -67.36306762695312, "loss": 13.8436, "rewards/accuracies": 0.5, "rewards/chosen": 0.02795691415667534, "rewards/margins": 0.01112289633601904, "rewards/rejected": 0.016834020614624023, "step": 531 }, { "epoch": 0.30899692164720916, "grad_norm": 313.63336181640625, "learning_rate": 4.227193492155724e-06, "logits/chosen": -0.7594996690750122, "logits/rejected": -0.7488009333610535, "logps/chosen": -72.03538513183594, "logps/rejected": -79.82178497314453, "loss": 13.0208, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.010113591328263283, "rewards/margins": 0.09955569356679916, "rewards/rejected": -0.08944210410118103, "step": 532 }, { "epoch": 0.3095777429285009, "grad_norm": 306.1441650390625, "learning_rate": 4.225740848343987e-06, "logits/chosen": -0.7741316556930542, "logits/rejected": -0.8882448077201843, "logps/chosen": -73.84909057617188, "logps/rejected": -75.96207427978516, "loss": 13.5817, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.004449290223419666, "rewards/margins": 0.04892003908753395, "rewards/rejected": -0.04447074979543686, "step": 533 }, { "epoch": 0.31015856420979265, "grad_norm": 289.4524230957031, "learning_rate": 4.22428820453225e-06, "logits/chosen": -0.5990484356880188, "logits/rejected": -0.7283953428268433, "logps/chosen": -68.75221252441406, "logps/rejected": -70.73362731933594, "loss": 13.4645, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.03096533939242363, "rewards/margins": 0.04442495107650757, "rewards/rejected": -0.013459615409374237, "step": 534 }, { "epoch": 0.3107393854910844, "grad_norm": 306.3465576171875, "learning_rate": 4.2228355607205115e-06, "logits/chosen": -0.7504904866218567, "logits/rejected": -0.740320086479187, "logps/chosen": -69.62543487548828, "logps/rejected": -78.7964859008789, "loss": 13.2895, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.06250744313001633, "rewards/margins": 0.06913135945796967, "rewards/rejected": -0.006623915396630764, "step": 535 }, { "epoch": 0.31132020677237615, "grad_norm": 313.4544982910156, "learning_rate": 4.221382916908774e-06, "logits/chosen": -0.6303409337997437, "logits/rejected": -0.6835408210754395, "logps/chosen": -78.17256164550781, "logps/rejected": -71.18514251708984, "loss": 13.3348, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.02659953013062477, "rewards/margins": 0.06356380879878998, "rewards/rejected": -0.03696427494287491, "step": 536 }, { "epoch": 0.3119010280536679, "grad_norm": 337.2955627441406, "learning_rate": 4.219930273097037e-06, "logits/chosen": -0.8487906455993652, "logits/rejected": -0.872395396232605, "logps/chosen": -77.28082275390625, "logps/rejected": -82.19414520263672, "loss": 13.4067, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.012853232212364674, "rewards/margins": 0.053110599517822266, "rewards/rejected": -0.04025736823678017, "step": 537 }, { "epoch": 0.31248184933495965, "grad_norm": 286.7469482421875, "learning_rate": 4.2184776292853e-06, "logits/chosen": -0.6404751539230347, "logits/rejected": -0.6587169766426086, "logps/chosen": -74.94679260253906, "logps/rejected": -73.58045959472656, "loss": 12.8176, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.037719763815402985, "rewards/margins": 0.11207199096679688, "rewards/rejected": -0.07435222715139389, "step": 538 }, { "epoch": 0.31306267061625137, "grad_norm": 303.54852294921875, "learning_rate": 4.2170249854735625e-06, "logits/chosen": -0.5835073590278625, "logits/rejected": -0.5741956830024719, "logps/chosen": -71.42955017089844, "logps/rejected": -68.7015151977539, "loss": 13.2086, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.06677176058292389, "rewards/margins": 0.07689666002988815, "rewards/rejected": -0.010124899446964264, "step": 539 }, { "epoch": 0.31364349189754315, "grad_norm": 299.73089599609375, "learning_rate": 4.215572341661825e-06, "logits/chosen": -0.8799319267272949, "logits/rejected": -0.8263736963272095, "logps/chosen": -71.5801010131836, "logps/rejected": -74.73466491699219, "loss": 13.3193, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.056644029915332794, "rewards/margins": 0.0625743716955185, "rewards/rejected": -0.005930337123572826, "step": 540 }, { "epoch": 0.31422431317883487, "grad_norm": 348.8278503417969, "learning_rate": 4.214119697850088e-06, "logits/chosen": -0.6225399971008301, "logits/rejected": -0.6066278219223022, "logps/chosen": -67.95753479003906, "logps/rejected": -70.62969207763672, "loss": 14.2989, "rewards/accuracies": 0.5, "rewards/chosen": -0.025478297844529152, "rewards/margins": -0.03527476638555527, "rewards/rejected": 0.009796475991606712, "step": 541 }, { "epoch": 0.31480513446012665, "grad_norm": 320.58349609375, "learning_rate": 4.21266705403835e-06, "logits/chosen": -0.6626735925674438, "logits/rejected": -0.7045550346374512, "logps/chosen": -68.6723403930664, "logps/rejected": -76.953125, "loss": 13.6882, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.05210607126355171, "rewards/margins": 0.027962008491158485, "rewards/rejected": 0.02414405718445778, "step": 542 }, { "epoch": 0.31538595574141837, "grad_norm": 294.38104248046875, "learning_rate": 4.211214410226613e-06, "logits/chosen": -0.6852847337722778, "logits/rejected": -0.7603325843811035, "logps/chosen": -71.50347137451172, "logps/rejected": -73.55186462402344, "loss": 13.2244, "rewards/accuracies": 0.75, "rewards/chosen": 0.03986804932355881, "rewards/margins": 0.07143954932689667, "rewards/rejected": -0.03157149627804756, "step": 543 }, { "epoch": 0.3159667770227101, "grad_norm": 292.24896240234375, "learning_rate": 4.209761766414875e-06, "logits/chosen": -0.6846610307693481, "logits/rejected": -0.7384731769561768, "logps/chosen": -69.51171112060547, "logps/rejected": -70.13493347167969, "loss": 13.8235, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0011608407367020845, "rewards/margins": 0.007756291422992945, "rewards/rejected": -0.008917133323848248, "step": 544 }, { "epoch": 0.31654759830400186, "grad_norm": 303.1371765136719, "learning_rate": 4.208309122603138e-06, "logits/chosen": -0.6676656007766724, "logits/rejected": -0.6589547395706177, "logps/chosen": -68.53245544433594, "logps/rejected": -65.3570556640625, "loss": 13.4721, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.04229654744267464, "rewards/margins": 0.047294534742832184, "rewards/rejected": -0.0049979896284639835, "step": 545 }, { "epoch": 0.3171284195852936, "grad_norm": 304.0082092285156, "learning_rate": 4.206856478791401e-06, "logits/chosen": -0.7934265732765198, "logits/rejected": -0.7060130834579468, "logps/chosen": -65.05821990966797, "logps/rejected": -70.21916198730469, "loss": 14.4302, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.013832311145961285, "rewards/margins": -0.04456400126218796, "rewards/rejected": 0.058396317064762115, "step": 546 }, { "epoch": 0.31770924086658536, "grad_norm": 318.7176513671875, "learning_rate": 4.205403834979664e-06, "logits/chosen": -0.6346458196640015, "logits/rejected": -0.7181236147880554, "logps/chosen": -77.43191528320312, "logps/rejected": -72.55345153808594, "loss": 13.3574, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.015245789662003517, "rewards/margins": 0.06777335703372955, "rewards/rejected": -0.08301915228366852, "step": 547 }, { "epoch": 0.3182900621478771, "grad_norm": 307.634521484375, "learning_rate": 4.2039511911679255e-06, "logits/chosen": -0.79632169008255, "logits/rejected": -0.7473276853561401, "logps/chosen": -77.22640991210938, "logps/rejected": -75.24634552001953, "loss": 13.5692, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.03856954723596573, "rewards/margins": 0.03735596686601639, "rewards/rejected": 0.0012135781580582261, "step": 548 }, { "epoch": 0.31887088342916886, "grad_norm": 299.0860595703125, "learning_rate": 4.202498547356188e-06, "logits/chosen": -0.5579553842544556, "logits/rejected": -0.5251676440238953, "logps/chosen": -66.94325256347656, "logps/rejected": -70.40986633300781, "loss": 13.9272, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.008126763626933098, "rewards/margins": 0.0029366514645516872, "rewards/rejected": -0.011063413694500923, "step": 549 }, { "epoch": 0.3194517047104606, "grad_norm": 294.1581726074219, "learning_rate": 4.201045903544451e-06, "logits/chosen": -0.6971367001533508, "logits/rejected": -0.6503037214279175, "logps/chosen": -72.15474700927734, "logps/rejected": -72.1407470703125, "loss": 13.4412, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.04408375173807144, "rewards/margins": 0.05293964222073555, "rewards/rejected": -0.00885589700192213, "step": 550 }, { "epoch": 0.32003252599175236, "grad_norm": 1313.975830078125, "learning_rate": 4.199593259732714e-06, "logits/chosen": -0.8329079747200012, "logits/rejected": -0.7704097032546997, "logps/chosen": -72.53883361816406, "logps/rejected": -70.23878479003906, "loss": 13.435, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.07676436007022858, "rewards/margins": 0.04872946813702583, "rewards/rejected": 0.028034895658493042, "step": 551 }, { "epoch": 0.3206133472730441, "grad_norm": 331.4753112792969, "learning_rate": 4.1981406159209765e-06, "logits/chosen": -0.7476204633712769, "logits/rejected": -0.7286363840103149, "logps/chosen": -69.5918960571289, "logps/rejected": -63.991371154785156, "loss": 14.1313, "rewards/accuracies": 0.5, "rewards/chosen": -0.013785046525299549, "rewards/margins": -0.01810169778764248, "rewards/rejected": 0.004316650331020355, "step": 552 }, { "epoch": 0.32119416855433586, "grad_norm": 332.4710388183594, "learning_rate": 4.196687972109239e-06, "logits/chosen": -0.6895079612731934, "logits/rejected": -0.7061454653739929, "logps/chosen": -77.14973449707031, "logps/rejected": -76.41380310058594, "loss": 13.5007, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.059791963547468185, "rewards/margins": 0.047647859901189804, "rewards/rejected": 0.012144106440246105, "step": 553 }, { "epoch": 0.3217749898356276, "grad_norm": 306.3582763671875, "learning_rate": 4.195235328297502e-06, "logits/chosen": -0.7227594256401062, "logits/rejected": -0.6854357123374939, "logps/chosen": -75.45829010009766, "logps/rejected": -71.11771392822266, "loss": 13.5619, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.03397790342569351, "rewards/margins": 0.04335654899477959, "rewards/rejected": -0.009378653019666672, "step": 554 }, { "epoch": 0.3223558111169193, "grad_norm": 313.3603210449219, "learning_rate": 4.193782684485764e-06, "logits/chosen": -0.788299024105072, "logits/rejected": -0.7475656270980835, "logps/chosen": -68.71570587158203, "logps/rejected": -70.02986145019531, "loss": 14.0656, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.042551252990961075, "rewards/margins": -0.008566422387957573, "rewards/rejected": 0.0511176697909832, "step": 555 }, { "epoch": 0.3229366323982111, "grad_norm": 335.6896667480469, "learning_rate": 4.192330040674027e-06, "logits/chosen": -0.7440964579582214, "logits/rejected": -0.7291234731674194, "logps/chosen": -82.71476745605469, "logps/rejected": -77.9942398071289, "loss": 13.8855, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.008972082287073135, "rewards/margins": 0.009817652404308319, "rewards/rejected": -0.000845567905344069, "step": 556 }, { "epoch": 0.3235174536795028, "grad_norm": 351.78411865234375, "learning_rate": 4.190877396862289e-06, "logits/chosen": -0.7939974069595337, "logits/rejected": -0.7254185080528259, "logps/chosen": -91.87419128417969, "logps/rejected": -90.27113342285156, "loss": 12.7765, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.044230926781892776, "rewards/margins": 0.12266170978546143, "rewards/rejected": -0.07843078672885895, "step": 557 }, { "epoch": 0.3240982749607946, "grad_norm": 295.10577392578125, "learning_rate": 4.189424753050552e-06, "logits/chosen": -0.532805323600769, "logits/rejected": -0.6035286784172058, "logps/chosen": -69.48627471923828, "logps/rejected": -70.66758728027344, "loss": 13.5119, "rewards/accuracies": 0.5, "rewards/chosen": 0.034174613654613495, "rewards/margins": 0.04042964056134224, "rewards/rejected": -0.006255028303712606, "step": 558 }, { "epoch": 0.3246790962420863, "grad_norm": 288.3070373535156, "learning_rate": 4.187972109238815e-06, "logits/chosen": -0.6821750998497009, "logits/rejected": -0.7404053807258606, "logps/chosen": -70.35401916503906, "logps/rejected": -72.6601791381836, "loss": 13.8817, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.017289195209741592, "rewards/margins": 0.004057192709296942, "rewards/rejected": -0.02134638838469982, "step": 559 }, { "epoch": 0.3252599175233781, "grad_norm": 320.47100830078125, "learning_rate": 4.186519465427078e-06, "logits/chosen": -0.6497399210929871, "logits/rejected": -0.607397735118866, "logps/chosen": -74.75005340576172, "logps/rejected": -78.32804870605469, "loss": 13.5264, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.04527430608868599, "rewards/margins": 0.04255044087767601, "rewards/rejected": 0.00272386590950191, "step": 560 }, { "epoch": 0.3258407388046698, "grad_norm": 301.3592834472656, "learning_rate": 4.18506682161534e-06, "logits/chosen": -0.6746230721473694, "logits/rejected": -0.6951724886894226, "logps/chosen": -71.5419921875, "logps/rejected": -73.84922790527344, "loss": 13.1544, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.03311220183968544, "rewards/margins": 0.08440899848937988, "rewards/rejected": -0.05129680037498474, "step": 561 }, { "epoch": 0.32642156008596157, "grad_norm": 313.3168029785156, "learning_rate": 4.183614177803602e-06, "logits/chosen": -0.6119886636734009, "logits/rejected": -0.6854387521743774, "logps/chosen": -70.71475982666016, "logps/rejected": -73.73226165771484, "loss": 13.9581, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0057665156200528145, "rewards/margins": -0.0022575571201741695, "rewards/rejected": 0.008024071343243122, "step": 562 }, { "epoch": 0.3270023813672533, "grad_norm": 295.86163330078125, "learning_rate": 4.182161533991865e-06, "logits/chosen": -0.5245779752731323, "logits/rejected": -0.589116096496582, "logps/chosen": -70.53639221191406, "logps/rejected": -73.27334594726562, "loss": 13.6625, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.033234186470508575, "rewards/margins": 0.02967524155974388, "rewards/rejected": 0.0035589407198131084, "step": 563 }, { "epoch": 0.32758320264854507, "grad_norm": 292.6914978027344, "learning_rate": 4.180708890180128e-06, "logits/chosen": -0.7187098264694214, "logits/rejected": -0.7310807108879089, "logps/chosen": -70.57524871826172, "logps/rejected": -71.39189147949219, "loss": 13.7081, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.041135989129543304, "rewards/margins": 0.019587086513638496, "rewards/rejected": 0.021548902615904808, "step": 564 }, { "epoch": 0.3281640239298368, "grad_norm": 298.4253234863281, "learning_rate": 4.1792562463683906e-06, "logits/chosen": -0.7534239292144775, "logits/rejected": -0.7867181897163391, "logps/chosen": -72.37812805175781, "logps/rejected": -69.36402130126953, "loss": 13.7307, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.06334944069385529, "rewards/margins": 0.022134965285658836, "rewards/rejected": 0.0412144735455513, "step": 565 }, { "epoch": 0.3287448452111285, "grad_norm": 305.56884765625, "learning_rate": 4.177803602556653e-06, "logits/chosen": -0.8273922204971313, "logits/rejected": -0.6768549084663391, "logps/chosen": -72.92654418945312, "logps/rejected": -68.28416442871094, "loss": 13.5351, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.06688196212053299, "rewards/margins": 0.040909625589847565, "rewards/rejected": 0.025972336530685425, "step": 566 }, { "epoch": 0.3293256664924203, "grad_norm": 317.9597473144531, "learning_rate": 4.176350958744916e-06, "logits/chosen": -0.6657764315605164, "logits/rejected": -0.632070779800415, "logps/chosen": -74.51126861572266, "logps/rejected": -81.23558807373047, "loss": 14.0378, "rewards/accuracies": 0.5, "rewards/chosen": 0.016053617000579834, "rewards/margins": -0.010346947237849236, "rewards/rejected": 0.02640056610107422, "step": 567 }, { "epoch": 0.329906487773712, "grad_norm": 296.336669921875, "learning_rate": 4.174898314933179e-06, "logits/chosen": -0.44092756509780884, "logits/rejected": -0.4833409786224365, "logps/chosen": -79.0365982055664, "logps/rejected": -65.05804443359375, "loss": 13.3725, "rewards/accuracies": 0.5, "rewards/chosen": 0.02982897125184536, "rewards/margins": 0.061524223536252975, "rewards/rejected": -0.031695254147052765, "step": 568 }, { "epoch": 0.3304873090550038, "grad_norm": 301.1853942871094, "learning_rate": 4.1734456711214416e-06, "logits/chosen": -0.7429706454277039, "logits/rejected": -0.7991895079612732, "logps/chosen": -72.23892974853516, "logps/rejected": -74.65027618408203, "loss": 13.5499, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.04059157520532608, "rewards/margins": 0.03457489609718323, "rewards/rejected": 0.00601667445152998, "step": 569 }, { "epoch": 0.3310681303362955, "grad_norm": 318.2872009277344, "learning_rate": 4.171993027309704e-06, "logits/chosen": -0.8133655786514282, "logits/rejected": -0.7936081886291504, "logps/chosen": -75.30345153808594, "logps/rejected": -74.62310791015625, "loss": 13.7668, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.02610892988741398, "rewards/margins": 0.020346611738204956, "rewards/rejected": 0.0057623242028057575, "step": 570 }, { "epoch": 0.3316489516175873, "grad_norm": 339.5540466308594, "learning_rate": 4.170540383497967e-06, "logits/chosen": -0.6674878001213074, "logits/rejected": -0.7041738629341125, "logps/chosen": -81.82828521728516, "logps/rejected": -85.7889175415039, "loss": 14.0594, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.014918116852641106, "rewards/margins": -0.00931711494922638, "rewards/rejected": 0.024235233664512634, "step": 571 }, { "epoch": 0.332229772898879, "grad_norm": 291.53070068359375, "learning_rate": 4.16908773968623e-06, "logits/chosen": -0.5750855803489685, "logits/rejected": -0.6670510172843933, "logps/chosen": -70.49334716796875, "logps/rejected": -79.74695587158203, "loss": 13.5056, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.000834398262668401, "rewards/margins": 0.04568032547831535, "rewards/rejected": -0.04484592750668526, "step": 572 }, { "epoch": 0.3328105941801708, "grad_norm": 338.2127990722656, "learning_rate": 4.167635095874492e-06, "logits/chosen": -0.6680286526679993, "logits/rejected": -0.6583356857299805, "logps/chosen": -74.26518249511719, "logps/rejected": -76.2101821899414, "loss": 13.9376, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.06941697746515274, "rewards/margins": -0.0016284823650494218, "rewards/rejected": 0.07104545831680298, "step": 573 }, { "epoch": 0.3333914154614625, "grad_norm": 324.9139709472656, "learning_rate": 4.1661824520627544e-06, "logits/chosen": -0.6829845309257507, "logits/rejected": -0.7503547668457031, "logps/chosen": -79.72300720214844, "logps/rejected": -77.9569320678711, "loss": 13.4892, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.05273396894335747, "rewards/margins": 0.041454561054706573, "rewards/rejected": 0.011279400438070297, "step": 574 }, { "epoch": 0.3339722367427543, "grad_norm": 294.2154235839844, "learning_rate": 4.164729808251017e-06, "logits/chosen": -0.6569613814353943, "logits/rejected": -0.6229342222213745, "logps/chosen": -72.42926025390625, "logps/rejected": -68.15298461914062, "loss": 14.7445, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.039252303540706635, "rewards/margins": -0.08114682883024216, "rewards/rejected": 0.04189452528953552, "step": 575 }, { "epoch": 0.334553058024046, "grad_norm": 335.7951965332031, "learning_rate": 4.16327716443928e-06, "logits/chosen": -0.6173557639122009, "logits/rejected": -0.7220408916473389, "logps/chosen": -69.47132873535156, "logps/rejected": -70.61470031738281, "loss": 12.8536, "rewards/accuracies": 0.75, "rewards/chosen": 0.047998152673244476, "rewards/margins": 0.11041458696126938, "rewards/rejected": -0.062416426837444305, "step": 576 }, { "epoch": 0.3351338793053377, "grad_norm": 305.68243408203125, "learning_rate": 4.161824520627543e-06, "logits/chosen": -0.6545889377593994, "logits/rejected": -0.6952469348907471, "logps/chosen": -82.79303741455078, "logps/rejected": -67.61451721191406, "loss": 14.1696, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.011021709069609642, "rewards/margins": -0.020772431045770645, "rewards/rejected": 0.009750718250870705, "step": 577 }, { "epoch": 0.3357147005866295, "grad_norm": 306.0280456542969, "learning_rate": 4.1603718768158054e-06, "logits/chosen": -0.6899808645248413, "logits/rejected": -0.6691157221794128, "logps/chosen": -75.88886260986328, "logps/rejected": -70.24473571777344, "loss": 13.5661, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.05026114732027054, "rewards/margins": 0.04536793380975723, "rewards/rejected": 0.004893226083368063, "step": 578 }, { "epoch": 0.3362955218679212, "grad_norm": 314.8397521972656, "learning_rate": 4.158919233004068e-06, "logits/chosen": -0.5284186601638794, "logits/rejected": -0.5781315565109253, "logps/chosen": -71.56379699707031, "logps/rejected": -67.72520446777344, "loss": 14.1152, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.008697940036654472, "rewards/margins": -0.018017660826444626, "rewards/rejected": 0.009319724515080452, "step": 579 }, { "epoch": 0.336876343149213, "grad_norm": 312.38427734375, "learning_rate": 4.15746658919233e-06, "logits/chosen": -0.7509424686431885, "logits/rejected": -0.8194720149040222, "logps/chosen": -68.1953125, "logps/rejected": -79.12274169921875, "loss": 14.2305, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.021652111783623695, "rewards/margins": -0.0276623722165823, "rewards/rejected": 0.0493144765496254, "step": 580 }, { "epoch": 0.3374571644305047, "grad_norm": 296.4921875, "learning_rate": 4.156013945380593e-06, "logits/chosen": -0.5414460897445679, "logits/rejected": -0.579011082649231, "logps/chosen": -71.58470153808594, "logps/rejected": -77.13636016845703, "loss": 13.9073, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.022373903542757034, "rewards/margins": 0.008005992509424686, "rewards/rejected": 0.014367911033332348, "step": 581 }, { "epoch": 0.3380379857117965, "grad_norm": 295.32965087890625, "learning_rate": 4.154561301568856e-06, "logits/chosen": -0.8184449076652527, "logits/rejected": -0.946854293346405, "logps/chosen": -65.59197235107422, "logps/rejected": -69.1659927368164, "loss": 13.6964, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.02355037070810795, "rewards/margins": 0.024608146399259567, "rewards/rejected": -0.0010577782522886992, "step": 582 }, { "epoch": 0.3386188069930882, "grad_norm": 316.63592529296875, "learning_rate": 4.153108657757118e-06, "logits/chosen": -0.6438121199607849, "logits/rejected": -0.7025144696235657, "logps/chosen": -77.89268493652344, "logps/rejected": -75.57435607910156, "loss": 13.9757, "rewards/accuracies": 0.5, "rewards/chosen": 0.03375036641955376, "rewards/margins": 0.004329390823841095, "rewards/rejected": 0.02942098118364811, "step": 583 }, { "epoch": 0.33919962827438, "grad_norm": 303.33251953125, "learning_rate": 4.151656013945381e-06, "logits/chosen": -0.6150007247924805, "logits/rejected": -0.6166488528251648, "logps/chosen": -73.3652572631836, "logps/rejected": -78.53809356689453, "loss": 13.1565, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.04595949128270149, "rewards/margins": 0.08009113371372223, "rewards/rejected": -0.03413163870573044, "step": 584 }, { "epoch": 0.3397804495556717, "grad_norm": 301.8458557128906, "learning_rate": 4.150203370133644e-06, "logits/chosen": -0.6662709712982178, "logits/rejected": -0.8405240774154663, "logps/chosen": -68.98908996582031, "logps/rejected": -80.71705627441406, "loss": 13.6482, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0004700004938058555, "rewards/margins": 0.030774127691984177, "rewards/rejected": -0.03030412830412388, "step": 585 }, { "epoch": 0.3403612708369635, "grad_norm": 315.66986083984375, "learning_rate": 4.148750726321907e-06, "logits/chosen": -0.6200467348098755, "logits/rejected": -0.6849344968795776, "logps/chosen": -72.97185516357422, "logps/rejected": -66.83480834960938, "loss": 13.8744, "rewards/accuracies": 0.5, "rewards/chosen": 0.03314518183469772, "rewards/margins": 0.006186559796333313, "rewards/rejected": 0.02695862017571926, "step": 586 }, { "epoch": 0.3409420921182552, "grad_norm": 316.5364074707031, "learning_rate": 4.1472980825101685e-06, "logits/chosen": -0.6957502365112305, "logits/rejected": -0.77708899974823, "logps/chosen": -73.742919921875, "logps/rejected": -72.82991027832031, "loss": 14.2651, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.02233811281621456, "rewards/margins": -0.03094848431646824, "rewards/rejected": 0.008610370568931103, "step": 587 }, { "epoch": 0.34152291339954693, "grad_norm": 300.6039733886719, "learning_rate": 4.145845438698431e-06, "logits/chosen": -0.5576103925704956, "logits/rejected": -0.4413486421108246, "logps/chosen": -76.01347351074219, "logps/rejected": -71.07667541503906, "loss": 13.5076, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.014086894690990448, "rewards/margins": 0.0441095307469368, "rewards/rejected": -0.03002263978123665, "step": 588 }, { "epoch": 0.3421037346808387, "grad_norm": 312.4639587402344, "learning_rate": 4.144392794886694e-06, "logits/chosen": -0.7106123566627502, "logits/rejected": -0.6808815002441406, "logps/chosen": -72.44802856445312, "logps/rejected": -76.73309326171875, "loss": 13.6203, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.022707967087626457, "rewards/margins": 0.03484155982732773, "rewards/rejected": -0.01213359646499157, "step": 589 }, { "epoch": 0.34268455596213043, "grad_norm": 302.58642578125, "learning_rate": 4.142940151074957e-06, "logits/chosen": -0.6447229981422424, "logits/rejected": -0.75730299949646, "logps/chosen": -73.82032775878906, "logps/rejected": -72.29612731933594, "loss": 13.4369, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.04192671552300453, "rewards/margins": 0.04805769771337509, "rewards/rejected": -0.006130979862064123, "step": 590 }, { "epoch": 0.3432653772434222, "grad_norm": 427.1463928222656, "learning_rate": 4.1414875072632195e-06, "logits/chosen": -0.6302953958511353, "logits/rejected": -0.7209846377372742, "logps/chosen": -71.4793930053711, "logps/rejected": -66.0015640258789, "loss": 13.3978, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.03149376064538956, "rewards/margins": 0.05494686961174011, "rewards/rejected": -0.02345309965312481, "step": 591 }, { "epoch": 0.34384619852471393, "grad_norm": 305.7786560058594, "learning_rate": 4.140034863451482e-06, "logits/chosen": -0.6327844858169556, "logits/rejected": -0.7896640300750732, "logps/chosen": -73.62690734863281, "logps/rejected": -71.11927795410156, "loss": 14.0315, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.018691398203372955, "rewards/margins": -0.008565169759094715, "rewards/rejected": 0.027256567031145096, "step": 592 }, { "epoch": 0.3444270198060057, "grad_norm": 291.48944091796875, "learning_rate": 4.138582219639745e-06, "logits/chosen": -0.6120963096618652, "logits/rejected": -0.6668158769607544, "logps/chosen": -71.43421936035156, "logps/rejected": -73.65585327148438, "loss": 13.6873, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.009499874897301197, "rewards/margins": 0.033064182847738266, "rewards/rejected": -0.04256405681371689, "step": 593 }, { "epoch": 0.34500784108729743, "grad_norm": 283.23406982421875, "learning_rate": 4.137129575828007e-06, "logits/chosen": -0.69483482837677, "logits/rejected": -0.8557512164115906, "logps/chosen": -71.92134094238281, "logps/rejected": -74.01726531982422, "loss": 12.8864, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.01784166321158409, "rewards/margins": 0.10965867340564728, "rewards/rejected": -0.09181700646877289, "step": 594 }, { "epoch": 0.3455886623685892, "grad_norm": 281.31951904296875, "learning_rate": 4.13567693201627e-06, "logits/chosen": -0.702051043510437, "logits/rejected": -0.7037491202354431, "logps/chosen": -69.55567932128906, "logps/rejected": -65.29097747802734, "loss": 13.4975, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.025551121681928635, "rewards/margins": 0.045175038278102875, "rewards/rejected": -0.07072616368532181, "step": 595 }, { "epoch": 0.3461694836498809, "grad_norm": 310.319091796875, "learning_rate": 4.134224288204532e-06, "logits/chosen": -0.604898989200592, "logits/rejected": -0.5756534337997437, "logps/chosen": -72.46310424804688, "logps/rejected": -75.34378051757812, "loss": 13.812, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.018788345158100128, "rewards/margins": 0.01757490262389183, "rewards/rejected": 0.0012134440476074815, "step": 596 }, { "epoch": 0.3467503049311727, "grad_norm": 329.1076354980469, "learning_rate": 4.132771644392795e-06, "logits/chosen": -0.6654219031333923, "logits/rejected": -0.5940228700637817, "logps/chosen": -77.3206558227539, "logps/rejected": -70.39866638183594, "loss": 14.0139, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.008175027556717396, "rewards/margins": -0.004936867859214544, "rewards/rejected": 0.013111898675560951, "step": 597 }, { "epoch": 0.3473311262124644, "grad_norm": 356.73992919921875, "learning_rate": 4.131319000581058e-06, "logits/chosen": -0.6939708590507507, "logits/rejected": -0.7016677856445312, "logps/chosen": -78.16722869873047, "logps/rejected": -69.61579895019531, "loss": 13.4649, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.014713537879288197, "rewards/margins": 0.04950027912855148, "rewards/rejected": -0.03478673845529556, "step": 598 }, { "epoch": 0.34791194749375615, "grad_norm": 335.206298828125, "learning_rate": 4.129866356769321e-06, "logits/chosen": -0.5904199481010437, "logits/rejected": -0.603577196598053, "logps/chosen": -79.11907196044922, "logps/rejected": -83.3130874633789, "loss": 13.9855, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.0011968526523560286, "rewards/margins": 0.002481907606124878, "rewards/rejected": -0.0012850582133978605, "step": 599 }, { "epoch": 0.3484927687750479, "grad_norm": 318.6190490722656, "learning_rate": 4.128413712957583e-06, "logits/chosen": -0.572067379951477, "logits/rejected": -0.647987961769104, "logps/chosen": -85.52361297607422, "logps/rejected": -79.0700454711914, "loss": 13.2384, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.010197488591074944, "rewards/margins": 0.08274447917938232, "rewards/rejected": -0.09294196218252182, "step": 600 }, { "epoch": 0.34907359005633964, "grad_norm": 317.1435241699219, "learning_rate": 4.126961069145845e-06, "logits/chosen": -0.7126006484031677, "logits/rejected": -0.7353073358535767, "logps/chosen": -68.36890411376953, "logps/rejected": -76.00941467285156, "loss": 14.2786, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.012436933815479279, "rewards/margins": -0.03413182869553566, "rewards/rejected": 0.02169489674270153, "step": 601 }, { "epoch": 0.3496544113376314, "grad_norm": 303.97283935546875, "learning_rate": 4.125508425334108e-06, "logits/chosen": -0.5907556414604187, "logits/rejected": -0.6256478428840637, "logps/chosen": -76.39266204833984, "logps/rejected": -74.95109558105469, "loss": 12.4707, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.04735926538705826, "rewards/margins": 0.1522711217403412, "rewards/rejected": -0.10491186380386353, "step": 602 }, { "epoch": 0.35023523261892314, "grad_norm": 313.0004577636719, "learning_rate": 4.124055781522371e-06, "logits/chosen": -0.6598242521286011, "logits/rejected": -0.6428278684616089, "logps/chosen": -64.64137268066406, "logps/rejected": -74.44950866699219, "loss": 13.724, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.016485584899783134, "rewards/margins": 0.02066924050450325, "rewards/rejected": -0.004183652810752392, "step": 603 }, { "epoch": 0.3508160539002149, "grad_norm": 367.84808349609375, "learning_rate": 4.1226031377106335e-06, "logits/chosen": -0.5852268934249878, "logits/rejected": -0.6519114971160889, "logps/chosen": -69.59758758544922, "logps/rejected": -69.01988983154297, "loss": 13.7946, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.04319590702652931, "rewards/margins": 0.01627770997583866, "rewards/rejected": 0.0269182026386261, "step": 604 }, { "epoch": 0.35139687518150664, "grad_norm": 316.7720947265625, "learning_rate": 4.121150493898896e-06, "logits/chosen": -0.6948081851005554, "logits/rejected": -0.7896615266799927, "logps/chosen": -79.47821807861328, "logps/rejected": -79.4210433959961, "loss": 12.9175, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.08956082910299301, "rewards/margins": 0.10826803743839264, "rewards/rejected": -0.018707215785980225, "step": 605 }, { "epoch": 0.3519776964627984, "grad_norm": 320.3518371582031, "learning_rate": 4.119697850087159e-06, "logits/chosen": -0.6471167802810669, "logits/rejected": -0.5845264196395874, "logps/chosen": -72.15476989746094, "logps/rejected": -66.76841735839844, "loss": 14.2575, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.012289335019886494, "rewards/margins": -0.03404436260461807, "rewards/rejected": 0.021755026653409004, "step": 606 }, { "epoch": 0.35255851774409014, "grad_norm": 307.3863525390625, "learning_rate": 4.118245206275422e-06, "logits/chosen": -0.5887011289596558, "logits/rejected": -0.6126518249511719, "logps/chosen": -81.18836975097656, "logps/rejected": -77.85643005371094, "loss": 13.6217, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.024307237938046455, "rewards/margins": 0.029000231996178627, "rewards/rejected": -0.004692991729825735, "step": 607 }, { "epoch": 0.3531393390253819, "grad_norm": 281.9200744628906, "learning_rate": 4.116792562463684e-06, "logits/chosen": -0.8018544912338257, "logits/rejected": -0.7951962351799011, "logps/chosen": -73.1496810913086, "logps/rejected": -74.61567687988281, "loss": 13.2176, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.009971092455089092, "rewards/margins": 0.08315368741750717, "rewards/rejected": -0.0731825903058052, "step": 608 }, { "epoch": 0.35372016030667364, "grad_norm": 303.743896484375, "learning_rate": 4.115339918651947e-06, "logits/chosen": -0.7372728586196899, "logits/rejected": -0.8942030668258667, "logps/chosen": -74.2212905883789, "logps/rejected": -72.51829528808594, "loss": 12.8897, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.10641467571258545, "rewards/margins": 0.13827785849571228, "rewards/rejected": -0.03186319023370743, "step": 609 }, { "epoch": 0.35430098158796536, "grad_norm": 293.53631591796875, "learning_rate": 4.11388727484021e-06, "logits/chosen": -0.8246177434921265, "logits/rejected": -0.7705774903297424, "logps/chosen": -70.34178161621094, "logps/rejected": -73.58016204833984, "loss": 13.598, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.030447527766227722, "rewards/margins": 0.03866151347756386, "rewards/rejected": -0.008213978260755539, "step": 610 }, { "epoch": 0.35488180286925713, "grad_norm": 307.77294921875, "learning_rate": 4.112434631028473e-06, "logits/chosen": -0.7274325489997864, "logits/rejected": -0.7559345364570618, "logps/chosen": -76.30690002441406, "logps/rejected": -75.15937042236328, "loss": 13.8293, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.01979799196124077, "rewards/margins": 0.014523372054100037, "rewards/rejected": -0.0343213714659214, "step": 611 }, { "epoch": 0.35546262415054886, "grad_norm": 314.6964416503906, "learning_rate": 4.110981987216735e-06, "logits/chosen": -0.7580679655075073, "logits/rejected": -0.6873196363449097, "logps/chosen": -72.52796173095703, "logps/rejected": -71.6462631225586, "loss": 13.9075, "rewards/accuracies": 0.5, "rewards/chosen": 0.024890460073947906, "rewards/margins": 0.01054720301181078, "rewards/rejected": 0.01434325985610485, "step": 612 }, { "epoch": 0.35604344543184063, "grad_norm": 315.9555358886719, "learning_rate": 4.109529343404997e-06, "logits/chosen": -0.779710590839386, "logits/rejected": -0.726919412612915, "logps/chosen": -77.7830810546875, "logps/rejected": -71.8597183227539, "loss": 14.1527, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.022592689841985703, "rewards/margins": -0.015459999442100525, "rewards/rejected": -0.0071326917968690395, "step": 613 }, { "epoch": 0.35662426671313235, "grad_norm": 301.09161376953125, "learning_rate": 4.10807669959326e-06, "logits/chosen": -0.7886669635772705, "logits/rejected": -0.8101975321769714, "logps/chosen": -73.55926513671875, "logps/rejected": -79.67219543457031, "loss": 13.0973, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.06979019194841385, "rewards/margins": 0.08688534796237946, "rewards/rejected": -0.017095154151320457, "step": 614 }, { "epoch": 0.35720508799442413, "grad_norm": 290.6911926269531, "learning_rate": 4.106624055781523e-06, "logits/chosen": -0.721502423286438, "logits/rejected": -0.7246500253677368, "logps/chosen": -69.41618347167969, "logps/rejected": -73.62068176269531, "loss": 13.3402, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.04512856528162956, "rewards/margins": 0.0591517873108387, "rewards/rejected": -0.014023219235241413, "step": 615 }, { "epoch": 0.35778590927571585, "grad_norm": 284.9295959472656, "learning_rate": 4.105171411969786e-06, "logits/chosen": -0.6129464507102966, "logits/rejected": -0.6507026553153992, "logps/chosen": -75.41661071777344, "logps/rejected": -68.95967102050781, "loss": 12.9987, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.029339304193854332, "rewards/margins": 0.09721994400024414, "rewards/rejected": -0.06788064539432526, "step": 616 }, { "epoch": 0.35836673055700763, "grad_norm": 318.4294738769531, "learning_rate": 4.103718768158048e-06, "logits/chosen": -0.594603419303894, "logits/rejected": -0.542455792427063, "logps/chosen": -76.71803283691406, "logps/rejected": -76.74525451660156, "loss": 13.5157, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.04475555941462517, "rewards/margins": 0.04647917300462723, "rewards/rejected": -0.00172361359000206, "step": 617 }, { "epoch": 0.35894755183829935, "grad_norm": 307.6109924316406, "learning_rate": 4.102266124346311e-06, "logits/chosen": -0.6024777889251709, "logits/rejected": -0.6433155536651611, "logps/chosen": -70.24485778808594, "logps/rejected": -69.94918823242188, "loss": 13.3803, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.00755837420001626, "rewards/margins": 0.064852774143219, "rewards/rejected": -0.05729439854621887, "step": 618 }, { "epoch": 0.3595283731195911, "grad_norm": 329.19183349609375, "learning_rate": 4.100813480534573e-06, "logits/chosen": -0.521633505821228, "logits/rejected": -0.5646509528160095, "logps/chosen": -78.86627197265625, "logps/rejected": -71.89064025878906, "loss": 14.1275, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.00912876520305872, "rewards/margins": -0.02030937746167183, "rewards/rejected": 0.011180608533322811, "step": 619 }, { "epoch": 0.36010919440088285, "grad_norm": 306.20867919921875, "learning_rate": 4.099360836722836e-06, "logits/chosen": -0.6866437792778015, "logits/rejected": -0.7564648389816284, "logps/chosen": -74.31228637695312, "logps/rejected": -65.19970703125, "loss": 13.9814, "rewards/accuracies": 0.5, "rewards/chosen": -0.00010943338565994054, "rewards/margins": -0.00688022980466485, "rewards/rejected": 0.00677079102024436, "step": 620 }, { "epoch": 0.36069001568217457, "grad_norm": 307.6351318359375, "learning_rate": 4.0979081929110985e-06, "logits/chosen": -0.6264073252677917, "logits/rejected": -0.6101059913635254, "logps/chosen": -79.4601821899414, "logps/rejected": -68.94159698486328, "loss": 13.2178, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.00908391922712326, "rewards/margins": 0.07702421396970749, "rewards/rejected": -0.06794029474258423, "step": 621 }, { "epoch": 0.36127083696346635, "grad_norm": 309.8197326660156, "learning_rate": 4.096455549099361e-06, "logits/chosen": -0.6580110788345337, "logits/rejected": -0.8408079147338867, "logps/chosen": -73.57799530029297, "logps/rejected": -81.4070816040039, "loss": 14.01, "rewards/accuracies": 0.5, "rewards/chosen": -0.03712352365255356, "rewards/margins": -0.004934538155794144, "rewards/rejected": -0.03218898922204971, "step": 622 }, { "epoch": 0.36185165824475807, "grad_norm": 293.77130126953125, "learning_rate": 4.095002905287624e-06, "logits/chosen": -0.6701300740242004, "logits/rejected": -0.7274219393730164, "logps/chosen": -68.0548095703125, "logps/rejected": -68.81280517578125, "loss": 13.7275, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.0037030756939202547, "rewards/margins": 0.027376368641853333, "rewards/rejected": -0.031079450622200966, "step": 623 }, { "epoch": 0.36243247952604984, "grad_norm": 288.8678894042969, "learning_rate": 4.093550261475887e-06, "logits/chosen": -0.7441659569740295, "logits/rejected": -0.7965889573097229, "logps/chosen": -70.99964904785156, "logps/rejected": -71.14028930664062, "loss": 13.362, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.02267099916934967, "rewards/margins": 0.05958705022931099, "rewards/rejected": -0.03691605478525162, "step": 624 }, { "epoch": 0.36301330080734157, "grad_norm": 373.985595703125, "learning_rate": 4.0920976176641495e-06, "logits/chosen": -0.783255398273468, "logits/rejected": -0.7588286995887756, "logps/chosen": -74.08125305175781, "logps/rejected": -80.33110809326172, "loss": 13.2311, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.038171131163835526, "rewards/margins": 0.07666865736246109, "rewards/rejected": -0.03849751502275467, "step": 625 }, { "epoch": 0.36359412208863334, "grad_norm": 332.46234130859375, "learning_rate": 4.0906449738524114e-06, "logits/chosen": -0.6622897982597351, "logits/rejected": -0.5771545767784119, "logps/chosen": -78.99581909179688, "logps/rejected": -71.4215316772461, "loss": 14.3235, "rewards/accuracies": 0.5, "rewards/chosen": -0.07639651745557785, "rewards/margins": -0.0335925929248333, "rewards/rejected": -0.04280392453074455, "step": 626 }, { "epoch": 0.36417494336992506, "grad_norm": 339.90875244140625, "learning_rate": 4.089192330040674e-06, "logits/chosen": -0.7199238538742065, "logits/rejected": -0.7270562052726746, "logps/chosen": -76.53150939941406, "logps/rejected": -88.87334442138672, "loss": 13.3714, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.010254562832415104, "rewards/margins": 0.06578972935676575, "rewards/rejected": -0.05553516745567322, "step": 627 }, { "epoch": 0.36475576465121684, "grad_norm": 318.9749755859375, "learning_rate": 4.087739686228937e-06, "logits/chosen": -0.6694357395172119, "logits/rejected": -0.7174801826477051, "logps/chosen": -69.3748779296875, "logps/rejected": -77.28684997558594, "loss": 14.3935, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.024402162060141563, "rewards/margins": -0.044836897403001785, "rewards/rejected": 0.020434733480215073, "step": 628 }, { "epoch": 0.36533658593250856, "grad_norm": 308.3586120605469, "learning_rate": 4.0862870424172e-06, "logits/chosen": -0.4848438799381256, "logits/rejected": -0.47951608896255493, "logps/chosen": -66.2980728149414, "logps/rejected": -80.6948471069336, "loss": 13.5319, "rewards/accuracies": 0.5, "rewards/chosen": -0.0488138273358345, "rewards/margins": 0.04704555124044418, "rewards/rejected": -0.09585938602685928, "step": 629 }, { "epoch": 0.36591740721380034, "grad_norm": 329.3526306152344, "learning_rate": 4.0848343986054624e-06, "logits/chosen": -0.5832070112228394, "logits/rejected": -0.6127845644950867, "logps/chosen": -77.78378295898438, "logps/rejected": -67.96507263183594, "loss": 14.4786, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.00027630740078166127, "rewards/margins": -0.04388640448451042, "rewards/rejected": 0.04416271299123764, "step": 630 }, { "epoch": 0.36649822849509206, "grad_norm": 282.7289733886719, "learning_rate": 4.083381754793725e-06, "logits/chosen": -0.6684743165969849, "logits/rejected": -0.7016184329986572, "logps/chosen": -69.01554870605469, "logps/rejected": -68.71202087402344, "loss": 13.2268, "rewards/accuracies": 0.5, "rewards/chosen": 0.04856640845537186, "rewards/margins": 0.07856104522943497, "rewards/rejected": -0.029994633048772812, "step": 631 }, { "epoch": 0.3670790497763838, "grad_norm": 1237.3873291015625, "learning_rate": 4.081929110981988e-06, "logits/chosen": -0.5690406560897827, "logits/rejected": -0.6339157819747925, "logps/chosen": -79.32388305664062, "logps/rejected": -73.99163818359375, "loss": 13.3389, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.02145705372095108, "rewards/margins": 0.06668312102556229, "rewards/rejected": -0.045226071029901505, "step": 632 }, { "epoch": 0.36765987105767556, "grad_norm": 309.4449157714844, "learning_rate": 4.08047646717025e-06, "logits/chosen": -0.7146767377853394, "logits/rejected": -0.7952632904052734, "logps/chosen": -71.60173797607422, "logps/rejected": -70.31050872802734, "loss": 13.4012, "rewards/accuracies": 0.5, "rewards/chosen": -0.01931237056851387, "rewards/margins": 0.07422088086605072, "rewards/rejected": -0.09353326261043549, "step": 633 }, { "epoch": 0.3682406923389673, "grad_norm": 321.6402282714844, "learning_rate": 4.0790238233585126e-06, "logits/chosen": -0.4627179503440857, "logits/rejected": -0.49417465925216675, "logps/chosen": -68.27827453613281, "logps/rejected": -68.9576187133789, "loss": 14.036, "rewards/accuracies": 0.5, "rewards/chosen": -0.0748579353094101, "rewards/margins": -0.010526349768042564, "rewards/rejected": -0.06433158367872238, "step": 634 }, { "epoch": 0.36882151362025906, "grad_norm": 315.15155029296875, "learning_rate": 4.077571179546775e-06, "logits/chosen": -0.4579242765903473, "logits/rejected": -0.4775335192680359, "logps/chosen": -66.7532958984375, "logps/rejected": -72.78587341308594, "loss": 13.4604, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.024356910958886147, "rewards/margins": 0.052337825298309326, "rewards/rejected": -0.07669473439455032, "step": 635 }, { "epoch": 0.3694023349015508, "grad_norm": 314.5369567871094, "learning_rate": 4.076118535735038e-06, "logits/chosen": -0.7437750101089478, "logits/rejected": -0.7035426497459412, "logps/chosen": -74.56098937988281, "logps/rejected": -82.19285583496094, "loss": 13.6072, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.027040233835577965, "rewards/margins": 0.03697334975004196, "rewards/rejected": -0.00993311870843172, "step": 636 }, { "epoch": 0.36998315618284255, "grad_norm": 316.7352294921875, "learning_rate": 4.074665891923301e-06, "logits/chosen": -0.7988698482513428, "logits/rejected": -0.6878986358642578, "logps/chosen": -68.0035629272461, "logps/rejected": -67.69490051269531, "loss": 13.5846, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0013492100406438112, "rewards/margins": 0.03704356402158737, "rewards/rejected": -0.038392774760723114, "step": 637 }, { "epoch": 0.3705639774641343, "grad_norm": 332.4465637207031, "learning_rate": 4.0732132481115636e-06, "logits/chosen": -0.432908833026886, "logits/rejected": -0.49958959221839905, "logps/chosen": -74.47289276123047, "logps/rejected": -73.33644104003906, "loss": 13.719, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.03480667248368263, "rewards/margins": 0.02545056864619255, "rewards/rejected": 0.009356101974844933, "step": 638 }, { "epoch": 0.37114479874542605, "grad_norm": 306.9449768066406, "learning_rate": 4.0717606042998255e-06, "logits/chosen": -0.6738191843032837, "logits/rejected": -0.6642774343490601, "logps/chosen": -71.95086669921875, "logps/rejected": -81.12242126464844, "loss": 13.009, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.056968025863170624, "rewards/margins": 0.0969221368432045, "rewards/rejected": -0.039954110980033875, "step": 639 }, { "epoch": 0.3717256200267178, "grad_norm": 278.2969970703125, "learning_rate": 4.070307960488088e-06, "logits/chosen": -0.680871844291687, "logits/rejected": -0.6395547389984131, "logps/chosen": -64.48146057128906, "logps/rejected": -70.27690124511719, "loss": 12.5765, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.04453757777810097, "rewards/margins": 0.14197415113449097, "rewards/rejected": -0.0974365621805191, "step": 640 }, { "epoch": 0.37230644130800955, "grad_norm": 297.1010437011719, "learning_rate": 4.068855316676351e-06, "logits/chosen": -0.6302939653396606, "logits/rejected": -0.7064687609672546, "logps/chosen": -71.5230941772461, "logps/rejected": -68.34951782226562, "loss": 12.9383, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.04373275116086006, "rewards/margins": 0.10687211900949478, "rewards/rejected": -0.06313937157392502, "step": 641 }, { "epoch": 0.37288726258930127, "grad_norm": 310.2055969238281, "learning_rate": 4.067402672864614e-06, "logits/chosen": -0.8405004739761353, "logits/rejected": -0.8176850080490112, "logps/chosen": -71.03034210205078, "logps/rejected": -77.72269439697266, "loss": 13.2716, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.011298018507659435, "rewards/margins": 0.076210156083107, "rewards/rejected": -0.06491214036941528, "step": 642 }, { "epoch": 0.373468083870593, "grad_norm": 288.0509033203125, "learning_rate": 4.0659500290528765e-06, "logits/chosen": -0.807684063911438, "logits/rejected": -0.8896517753601074, "logps/chosen": -71.74691009521484, "logps/rejected": -67.34007263183594, "loss": 13.4595, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.03239784389734268, "rewards/margins": 0.04948094114661217, "rewards/rejected": -0.017083102837204933, "step": 643 }, { "epoch": 0.37404890515188477, "grad_norm": 295.96917724609375, "learning_rate": 4.064497385241139e-06, "logits/chosen": -0.6759235858917236, "logits/rejected": -0.5782681703567505, "logps/chosen": -70.57847595214844, "logps/rejected": -74.41899108886719, "loss": 13.0928, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.05720372125506401, "rewards/margins": 0.09038721024990082, "rewards/rejected": -0.03318347781896591, "step": 644 }, { "epoch": 0.3746297264331765, "grad_norm": 301.96588134765625, "learning_rate": 4.063044741429402e-06, "logits/chosen": -0.7978429198265076, "logits/rejected": -0.8465709686279297, "logps/chosen": -70.97563171386719, "logps/rejected": -74.35396575927734, "loss": 13.3645, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.030164528638124466, "rewards/margins": 0.06472036987543106, "rewards/rejected": -0.03455584496259689, "step": 645 }, { "epoch": 0.37521054771446827, "grad_norm": 307.4619140625, "learning_rate": 4.061592097617664e-06, "logits/chosen": -0.6767681837081909, "logits/rejected": -0.7364662885665894, "logps/chosen": -75.87703704833984, "logps/rejected": -73.02762603759766, "loss": 13.3096, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.02578345499932766, "rewards/margins": 0.06501609086990356, "rewards/rejected": -0.039232634007930756, "step": 646 }, { "epoch": 0.37579136899576, "grad_norm": 302.466552734375, "learning_rate": 4.060139453805927e-06, "logits/chosen": -0.7271603941917419, "logits/rejected": -0.8793846368789673, "logps/chosen": -72.50651550292969, "logps/rejected": -75.49837493896484, "loss": 13.0485, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.03700340539216995, "rewards/margins": 0.09631631523370743, "rewards/rejected": -0.05931291729211807, "step": 647 }, { "epoch": 0.37637219027705177, "grad_norm": 305.48046875, "learning_rate": 4.058686809994189e-06, "logits/chosen": -0.803033709526062, "logits/rejected": -0.7026882171630859, "logps/chosen": -72.45674133300781, "logps/rejected": -76.67012786865234, "loss": 12.9947, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.007967616431415081, "rewards/margins": 0.09880019724369049, "rewards/rejected": -0.09083259105682373, "step": 648 }, { "epoch": 0.3769530115583435, "grad_norm": 332.3240051269531, "learning_rate": 4.057234166182452e-06, "logits/chosen": -0.6038522720336914, "logits/rejected": -0.555856466293335, "logps/chosen": -76.23857116699219, "logps/rejected": -73.07332611083984, "loss": 13.5949, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.037921372801065445, "rewards/margins": 0.046288467943668365, "rewards/rejected": -0.008367091417312622, "step": 649 }, { "epoch": 0.37753383283963526, "grad_norm": 415.86383056640625, "learning_rate": 4.055781522370715e-06, "logits/chosen": -0.7161723375320435, "logits/rejected": -0.7207925915718079, "logps/chosen": -68.08644104003906, "logps/rejected": -69.88211059570312, "loss": 13.6038, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.022032026201486588, "rewards/margins": 0.038105495274066925, "rewards/rejected": -0.016073474660515785, "step": 650 }, { "epoch": 0.378114654120927, "grad_norm": 314.11492919921875, "learning_rate": 4.054328878558978e-06, "logits/chosen": -0.7593899369239807, "logits/rejected": -0.7028077840805054, "logps/chosen": -72.53091430664062, "logps/rejected": -77.84877014160156, "loss": 13.655, "rewards/accuracies": 0.5, "rewards/chosen": 0.016945505514740944, "rewards/margins": 0.02826940454542637, "rewards/rejected": -0.011323900893330574, "step": 651 }, { "epoch": 0.37869547540221876, "grad_norm": 311.8132019042969, "learning_rate": 4.05287623474724e-06, "logits/chosen": -0.7351340651512146, "logits/rejected": -0.7461525797843933, "logps/chosen": -79.3344497680664, "logps/rejected": -68.52467346191406, "loss": 13.3454, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.013737152330577374, "rewards/margins": 0.07491800934076309, "rewards/rejected": -0.08865516632795334, "step": 652 }, { "epoch": 0.3792762966835105, "grad_norm": 321.1096496582031, "learning_rate": 4.051423590935503e-06, "logits/chosen": -0.6296363472938538, "logits/rejected": -0.6592835783958435, "logps/chosen": -74.88213348388672, "logps/rejected": -78.33106231689453, "loss": 13.6806, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.038465339690446854, "rewards/margins": 0.029468858614563942, "rewards/rejected": -0.06793420016765594, "step": 653 }, { "epoch": 0.3798571179648022, "grad_norm": 299.4139404296875, "learning_rate": 4.049970947123766e-06, "logits/chosen": -0.7071855664253235, "logits/rejected": -0.749565839767456, "logps/chosen": -75.56929016113281, "logps/rejected": -70.52806854248047, "loss": 13.3743, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.03643953427672386, "rewards/margins": 0.06584478914737701, "rewards/rejected": -0.10228432714939117, "step": 654 }, { "epoch": 0.380437939246094, "grad_norm": 334.07879638671875, "learning_rate": 4.048518303312029e-06, "logits/chosen": -0.68670654296875, "logits/rejected": -0.893659234046936, "logps/chosen": -72.4461898803711, "logps/rejected": -86.50956726074219, "loss": 13.315, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.010635538958013058, "rewards/margins": 0.09464681148529053, "rewards/rejected": -0.08401128649711609, "step": 655 }, { "epoch": 0.3810187605273857, "grad_norm": 315.373779296875, "learning_rate": 4.047065659500291e-06, "logits/chosen": -0.6514602899551392, "logits/rejected": -0.7396805286407471, "logps/chosen": -77.65937805175781, "logps/rejected": -70.10638427734375, "loss": 14.1567, "rewards/accuracies": 0.5, "rewards/chosen": -0.06592129915952682, "rewards/margins": -0.020820606499910355, "rewards/rejected": -0.04510069265961647, "step": 656 }, { "epoch": 0.3815995818086775, "grad_norm": 321.6916198730469, "learning_rate": 4.045613015688554e-06, "logits/chosen": -0.7136542201042175, "logits/rejected": -0.7592549920082092, "logps/chosen": -76.73454284667969, "logps/rejected": -73.61782836914062, "loss": 13.8061, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.009097671136260033, "rewards/margins": 0.025288304314017296, "rewards/rejected": -0.03438597545027733, "step": 657 }, { "epoch": 0.3821804030899692, "grad_norm": 296.6112365722656, "learning_rate": 4.044160371876816e-06, "logits/chosen": -0.6855738759040833, "logits/rejected": -0.5937049984931946, "logps/chosen": -68.712158203125, "logps/rejected": -74.79548645019531, "loss": 13.4851, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.006267908029258251, "rewards/margins": 0.046390462666749954, "rewards/rejected": -0.05265836790204048, "step": 658 }, { "epoch": 0.382761224371261, "grad_norm": 322.6234130859375, "learning_rate": 4.042707728065079e-06, "logits/chosen": -0.8190171122550964, "logits/rejected": -0.8573258519172668, "logps/chosen": -72.13541412353516, "logps/rejected": -65.77295684814453, "loss": 14.5607, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.004783238284289837, "rewards/margins": -0.06286787241697311, "rewards/rejected": 0.06765110790729523, "step": 659 }, { "epoch": 0.3833420456525527, "grad_norm": 326.6965026855469, "learning_rate": 4.0412550842533415e-06, "logits/chosen": -0.6807196736335754, "logits/rejected": -0.7124016284942627, "logps/chosen": -80.24552917480469, "logps/rejected": -75.31925964355469, "loss": 14.4393, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.10739662498235703, "rewards/margins": -0.041134439408779144, "rewards/rejected": -0.06626218557357788, "step": 660 }, { "epoch": 0.3839228669338445, "grad_norm": 310.8951721191406, "learning_rate": 4.039802440441604e-06, "logits/chosen": -0.7667916417121887, "logits/rejected": -0.7749382257461548, "logps/chosen": -78.43385314941406, "logps/rejected": -76.484130859375, "loss": 14.1133, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.033638693392276764, "rewards/margins": -0.00871949177235365, "rewards/rejected": -0.024919208139181137, "step": 661 }, { "epoch": 0.3845036882151362, "grad_norm": 332.6522216796875, "learning_rate": 4.038349796629867e-06, "logits/chosen": -0.6729914546012878, "logits/rejected": -0.8632933497428894, "logps/chosen": -80.35395812988281, "logps/rejected": -80.90164947509766, "loss": 13.6627, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.049274541437625885, "rewards/margins": 0.03929256647825241, "rewards/rejected": -0.0885671079158783, "step": 662 }, { "epoch": 0.385084509496428, "grad_norm": 323.0083312988281, "learning_rate": 4.03689715281813e-06, "logits/chosen": -0.9102508425712585, "logits/rejected": -0.8042302131652832, "logps/chosen": -72.66389465332031, "logps/rejected": -71.84483337402344, "loss": 14.1066, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.07181964069604874, "rewards/margins": -0.017825614660978317, "rewards/rejected": -0.05399402976036072, "step": 663 }, { "epoch": 0.3856653307777197, "grad_norm": 340.7964782714844, "learning_rate": 4.0354445090063925e-06, "logits/chosen": -0.7347263097763062, "logits/rejected": -0.774405300617218, "logps/chosen": -86.1633529663086, "logps/rejected": -79.38214874267578, "loss": 14.2786, "rewards/accuracies": 0.5, "rewards/chosen": -0.020557690411806107, "rewards/margins": -0.019774410873651505, "rewards/rejected": -0.0007832825067453086, "step": 664 }, { "epoch": 0.3862461520590114, "grad_norm": 345.67315673828125, "learning_rate": 4.033991865194654e-06, "logits/chosen": -0.6604214906692505, "logits/rejected": -0.7178537249565125, "logps/chosen": -72.80216979980469, "logps/rejected": -72.74732971191406, "loss": 14.3951, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.07244611531496048, "rewards/margins": -0.021248834207654, "rewards/rejected": -0.05119727924466133, "step": 665 }, { "epoch": 0.3868269733403032, "grad_norm": 332.7640686035156, "learning_rate": 4.032539221382917e-06, "logits/chosen": -0.7640831470489502, "logits/rejected": -0.7370232343673706, "logps/chosen": -70.40046691894531, "logps/rejected": -69.4461898803711, "loss": 13.6271, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.01473932433873415, "rewards/margins": 0.044324107468128204, "rewards/rejected": -0.05906342715024948, "step": 666 }, { "epoch": 0.3874077946215949, "grad_norm": 334.3003845214844, "learning_rate": 4.03108657757118e-06, "logits/chosen": -0.7360302209854126, "logits/rejected": -0.7869777679443359, "logps/chosen": -85.70533752441406, "logps/rejected": -72.21440887451172, "loss": 13.4059, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.04054093360900879, "rewards/margins": 0.05683417245745659, "rewards/rejected": -0.016293242573738098, "step": 667 }, { "epoch": 0.3879886159028867, "grad_norm": 288.31170654296875, "learning_rate": 4.029633933759443e-06, "logits/chosen": -0.7544078826904297, "logits/rejected": -0.7067466974258423, "logps/chosen": -72.47239685058594, "logps/rejected": -64.64818572998047, "loss": 12.8853, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.06467044353485107, "rewards/margins": 0.11924330145120621, "rewards/rejected": -0.05457286164164543, "step": 668 }, { "epoch": 0.3885694371841784, "grad_norm": 319.8020935058594, "learning_rate": 4.028181289947705e-06, "logits/chosen": -0.51985102891922, "logits/rejected": -0.6410090327262878, "logps/chosen": -72.35859680175781, "logps/rejected": -74.97998809814453, "loss": 13.5131, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.022069012746214867, "rewards/margins": 0.04354357719421387, "rewards/rejected": -0.021474560722708702, "step": 669 }, { "epoch": 0.3891502584654702, "grad_norm": 306.6014709472656, "learning_rate": 4.026728646135968e-06, "logits/chosen": -0.6216963529586792, "logits/rejected": -0.6753697395324707, "logps/chosen": -77.22586822509766, "logps/rejected": -70.61882019042969, "loss": 14.0923, "rewards/accuracies": 0.5, "rewards/chosen": -0.01738358661532402, "rewards/margins": -0.0009111147373914719, "rewards/rejected": -0.01647247187793255, "step": 670 }, { "epoch": 0.3897310797467619, "grad_norm": 307.93670654296875, "learning_rate": 4.02527600232423e-06, "logits/chosen": -0.7877734303474426, "logits/rejected": -0.8353005647659302, "logps/chosen": -79.74364471435547, "logps/rejected": -78.38971710205078, "loss": 13.4594, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.013755464926362038, "rewards/margins": 0.05986330658197403, "rewards/rejected": -0.07361876964569092, "step": 671 }, { "epoch": 0.3903119010280537, "grad_norm": 317.7315368652344, "learning_rate": 4.023823358512493e-06, "logits/chosen": -0.6735803484916687, "logits/rejected": -0.732440173625946, "logps/chosen": -76.98451232910156, "logps/rejected": -80.71636962890625, "loss": 13.8575, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.008590728044509888, "rewards/margins": 0.008975962176918983, "rewards/rejected": -0.01756669022142887, "step": 672 }, { "epoch": 0.3908927223093454, "grad_norm": 311.2159729003906, "learning_rate": 4.0223707147007555e-06, "logits/chosen": -0.7430940866470337, "logits/rejected": -0.7209886312484741, "logps/chosen": -72.24745178222656, "logps/rejected": -72.40113830566406, "loss": 14.7028, "rewards/accuracies": 0.5, "rewards/chosen": -0.05548017472028732, "rewards/margins": -0.061144404113292694, "rewards/rejected": 0.005664219614118338, "step": 673 }, { "epoch": 0.3914735435906372, "grad_norm": 308.31805419921875, "learning_rate": 4.020918070889018e-06, "logits/chosen": -0.5726041793823242, "logits/rejected": -0.6990719437599182, "logps/chosen": -68.8240737915039, "logps/rejected": -69.23278045654297, "loss": 14.2499, "rewards/accuracies": 0.5, "rewards/chosen": -0.0369163416326046, "rewards/margins": -0.02730955183506012, "rewards/rejected": -0.009606788866221905, "step": 674 }, { "epoch": 0.3920543648719289, "grad_norm": 313.16650390625, "learning_rate": 4.019465427077281e-06, "logits/chosen": -0.6481889486312866, "logits/rejected": -0.6497616171836853, "logps/chosen": -73.71761322021484, "logps/rejected": -70.89939880371094, "loss": 13.6169, "rewards/accuracies": 0.5, "rewards/chosen": 0.017801934853196144, "rewards/margins": 0.03877174109220505, "rewards/rejected": -0.02096981182694435, "step": 675 }, { "epoch": 0.39263518615322063, "grad_norm": 313.9520263671875, "learning_rate": 4.018012783265544e-06, "logits/chosen": -0.7253094911575317, "logits/rejected": -0.6935799717903137, "logps/chosen": -74.28784942626953, "logps/rejected": -76.39752197265625, "loss": 14.0332, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.01669791154563427, "rewards/margins": -0.0043237642385065556, "rewards/rejected": -0.012374145910143852, "step": 676 }, { "epoch": 0.3932160074345124, "grad_norm": 327.840576171875, "learning_rate": 4.0165601394538065e-06, "logits/chosen": -0.5964494943618774, "logits/rejected": -0.8218010663986206, "logps/chosen": -76.28643035888672, "logps/rejected": -73.67375183105469, "loss": 13.48, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.00023033171601127833, "rewards/margins": 0.07958341389894485, "rewards/rejected": -0.07935307919979095, "step": 677 }, { "epoch": 0.3937968287158041, "grad_norm": 469.1325378417969, "learning_rate": 4.015107495642068e-06, "logits/chosen": -0.7487168312072754, "logits/rejected": -0.7056422829627991, "logps/chosen": -71.22139739990234, "logps/rejected": -77.16386413574219, "loss": 13.3125, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.013796500861644745, "rewards/margins": 0.06354296207427979, "rewards/rejected": -0.04974645376205444, "step": 678 }, { "epoch": 0.3943776499970959, "grad_norm": 298.0015869140625, "learning_rate": 4.013654851830331e-06, "logits/chosen": -0.7079442739486694, "logits/rejected": -0.9590281248092651, "logps/chosen": -74.9395980834961, "logps/rejected": -75.6585922241211, "loss": 12.8306, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.03988940268754959, "rewards/margins": 0.14326909184455872, "rewards/rejected": -0.10337970405817032, "step": 679 }, { "epoch": 0.3949584712783876, "grad_norm": 318.6708984375, "learning_rate": 4.012202208018594e-06, "logits/chosen": -0.5922810435295105, "logits/rejected": -0.5826822519302368, "logps/chosen": -72.672119140625, "logps/rejected": -74.83512115478516, "loss": 13.6615, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.07369053363800049, "rewards/margins": 0.028689509257674217, "rewards/rejected": 0.045001011341810226, "step": 680 }, { "epoch": 0.3955392925596794, "grad_norm": 296.9277648925781, "learning_rate": 4.010749564206857e-06, "logits/chosen": -0.6584534049034119, "logits/rejected": -0.6963584423065186, "logps/chosen": -72.95672607421875, "logps/rejected": -71.49995422363281, "loss": 13.4348, "rewards/accuracies": 0.5, "rewards/chosen": 0.031108522787690163, "rewards/margins": 0.05346295237541199, "rewards/rejected": -0.022354427725076675, "step": 681 }, { "epoch": 0.3961201138409711, "grad_norm": 296.8775329589844, "learning_rate": 4.009296920395119e-06, "logits/chosen": -0.7727608680725098, "logits/rejected": -0.753118634223938, "logps/chosen": -75.11225128173828, "logps/rejected": -68.10901641845703, "loss": 13.6226, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.010256567969918251, "rewards/margins": 0.03369814157485962, "rewards/rejected": -0.023441573604941368, "step": 682 }, { "epoch": 0.3967009351222629, "grad_norm": 304.0804748535156, "learning_rate": 4.007844276583382e-06, "logits/chosen": -0.6444306373596191, "logits/rejected": -0.7014695405960083, "logps/chosen": -67.61308288574219, "logps/rejected": -67.55976104736328, "loss": 14.3106, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.028560440987348557, "rewards/margins": -0.03599892184138298, "rewards/rejected": 0.007438488304615021, "step": 683 }, { "epoch": 0.3972817564035546, "grad_norm": 345.8822021484375, "learning_rate": 4.006391632771645e-06, "logits/chosen": -0.5845493078231812, "logits/rejected": -0.6856909990310669, "logps/chosen": -68.45945739746094, "logps/rejected": -67.73657989501953, "loss": 13.9901, "rewards/accuracies": 0.5, "rewards/chosen": -0.026378994807600975, "rewards/margins": 0.004468211438506842, "rewards/rejected": -0.03084721229970455, "step": 684 }, { "epoch": 0.3978625776848464, "grad_norm": 308.62628173828125, "learning_rate": 4.004938988959907e-06, "logits/chosen": -0.6908701062202454, "logits/rejected": -0.7285133600234985, "logps/chosen": -68.93060302734375, "logps/rejected": -76.362548828125, "loss": 13.9869, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.020935364067554474, "rewards/margins": 0.005568015389144421, "rewards/rejected": -0.02650338038802147, "step": 685 }, { "epoch": 0.3984433989661381, "grad_norm": 307.853271484375, "learning_rate": 4.0034863451481696e-06, "logits/chosen": -0.6336562633514404, "logits/rejected": -0.601762592792511, "logps/chosen": -73.2461929321289, "logps/rejected": -64.52156066894531, "loss": 13.7892, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.021032018586993217, "rewards/margins": 0.01943250373005867, "rewards/rejected": 0.0015995114808902144, "step": 686 }, { "epoch": 0.39902422024742984, "grad_norm": 304.0710754394531, "learning_rate": 4.002033701336432e-06, "logits/chosen": -0.8108295202255249, "logits/rejected": -0.8165252804756165, "logps/chosen": -68.44810485839844, "logps/rejected": -71.20729064941406, "loss": 13.428, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.06437697261571884, "rewards/margins": 0.05376725271344185, "rewards/rejected": 0.010609723627567291, "step": 687 }, { "epoch": 0.3996050415287216, "grad_norm": 302.7625732421875, "learning_rate": 4.000581057524695e-06, "logits/chosen": -0.7618246078491211, "logits/rejected": -0.7047253251075745, "logps/chosen": -69.78257751464844, "logps/rejected": -78.85441589355469, "loss": 13.4723, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.013262261636555195, "rewards/margins": 0.05348924919962883, "rewards/rejected": -0.06675150990486145, "step": 688 }, { "epoch": 0.40018586281001334, "grad_norm": 326.1325988769531, "learning_rate": 3.999128413712958e-06, "logits/chosen": -0.7000328302383423, "logits/rejected": -0.7599672675132751, "logps/chosen": -79.16525268554688, "logps/rejected": -72.787109375, "loss": 14.4071, "rewards/accuracies": 0.5, "rewards/chosen": -0.010734537616372108, "rewards/margins": -0.04592505842447281, "rewards/rejected": 0.03519051522016525, "step": 689 }, { "epoch": 0.4007666840913051, "grad_norm": 398.7939758300781, "learning_rate": 3.9976757699012206e-06, "logits/chosen": -0.6336180567741394, "logits/rejected": -0.5584805607795715, "logps/chosen": -79.39990234375, "logps/rejected": -70.52845764160156, "loss": 13.6773, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.02765648439526558, "rewards/margins": 0.03300664573907852, "rewards/rejected": -0.005350158549845219, "step": 690 }, { "epoch": 0.40134750537259684, "grad_norm": 316.0892028808594, "learning_rate": 3.996223126089483e-06, "logits/chosen": -0.7683295011520386, "logits/rejected": -0.5641528964042664, "logps/chosen": -81.93292236328125, "logps/rejected": -74.35237121582031, "loss": 14.2073, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.032339874655008316, "rewards/margins": -0.020309090614318848, "rewards/rejected": -0.01203078217804432, "step": 691 }, { "epoch": 0.4019283266538886, "grad_norm": 322.30126953125, "learning_rate": 3.994770482277745e-06, "logits/chosen": -0.7082656025886536, "logits/rejected": -0.7538729906082153, "logps/chosen": -74.92417907714844, "logps/rejected": -71.00489807128906, "loss": 14.2258, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.02695903740823269, "rewards/margins": -0.026692455634474754, "rewards/rejected": -0.0002665802894625813, "step": 692 }, { "epoch": 0.40250914793518033, "grad_norm": 325.52874755859375, "learning_rate": 3.993317838466009e-06, "logits/chosen": -0.7899920344352722, "logits/rejected": -0.846684455871582, "logps/chosen": -75.40380859375, "logps/rejected": -71.71354675292969, "loss": 13.1936, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.058164648711681366, "rewards/margins": 0.08477498590946198, "rewards/rejected": -0.026610326021909714, "step": 693 }, { "epoch": 0.4030899692164721, "grad_norm": 336.93310546875, "learning_rate": 3.9918651946542715e-06, "logits/chosen": -0.6738449335098267, "logits/rejected": -0.6702437400817871, "logps/chosen": -68.76301574707031, "logps/rejected": -81.96266174316406, "loss": 12.6865, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.04724300652742386, "rewards/margins": 0.13397014141082764, "rewards/rejected": -0.08672711998224258, "step": 694 }, { "epoch": 0.40367079049776383, "grad_norm": 276.9514465332031, "learning_rate": 3.990412550842534e-06, "logits/chosen": -0.7575210332870483, "logits/rejected": -0.6729222536087036, "logps/chosen": -70.20577239990234, "logps/rejected": -74.03892517089844, "loss": 13.151, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.061166275292634964, "rewards/margins": 0.08962966501712799, "rewards/rejected": -0.02846338413655758, "step": 695 }, { "epoch": 0.4042516117790556, "grad_norm": 319.8224792480469, "learning_rate": 3.988959907030796e-06, "logits/chosen": -0.575911819934845, "logits/rejected": -0.5608241558074951, "logps/chosen": -70.46253967285156, "logps/rejected": -82.12687683105469, "loss": 13.5207, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.048672787845134735, "rewards/margins": 0.048407696187496185, "rewards/rejected": 0.0002650946262292564, "step": 696 }, { "epoch": 0.40483243306034733, "grad_norm": 325.2471618652344, "learning_rate": 3.987507263219059e-06, "logits/chosen": -0.6701699495315552, "logits/rejected": -0.6926618218421936, "logps/chosen": -79.00935363769531, "logps/rejected": -78.64802551269531, "loss": 13.9549, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.009344990365207195, "rewards/margins": -0.002072087023407221, "rewards/rejected": 0.01141707319766283, "step": 697 }, { "epoch": 0.40541325434163905, "grad_norm": 319.7401428222656, "learning_rate": 3.986054619407322e-06, "logits/chosen": -0.606377124786377, "logits/rejected": -0.6375631093978882, "logps/chosen": -74.11344909667969, "logps/rejected": -65.52316284179688, "loss": 13.5328, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.07139308750629425, "rewards/margins": 0.043192584067583084, "rewards/rejected": 0.02820049785077572, "step": 698 }, { "epoch": 0.40599407562293083, "grad_norm": 290.7586669921875, "learning_rate": 3.9846019755955844e-06, "logits/chosen": -0.6408273577690125, "logits/rejected": -0.6153540015220642, "logps/chosen": -74.5358657836914, "logps/rejected": -74.61564636230469, "loss": 13.6657, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.011930713430047035, "rewards/margins": 0.03009539470076561, "rewards/rejected": -0.018164681270718575, "step": 699 }, { "epoch": 0.40657489690422255, "grad_norm": 324.70916748046875, "learning_rate": 3.983149331783847e-06, "logits/chosen": -0.6709033250808716, "logits/rejected": -0.7083495259284973, "logps/chosen": -77.2063980102539, "logps/rejected": -75.69718170166016, "loss": 13.9698, "rewards/accuracies": 0.5, "rewards/chosen": 0.004046584479510784, "rewards/margins": 0.004778122063726187, "rewards/rejected": -0.0007315344992093742, "step": 700 }, { "epoch": 0.4071557181855143, "grad_norm": 308.9340515136719, "learning_rate": 3.98169668797211e-06, "logits/chosen": -0.603878378868103, "logits/rejected": -0.657692015171051, "logps/chosen": -74.08457946777344, "logps/rejected": -67.98051452636719, "loss": 13.4656, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.017448043450713158, "rewards/margins": 0.04989555850625038, "rewards/rejected": -0.03244751691818237, "step": 701 }, { "epoch": 0.40773653946680605, "grad_norm": 304.8455505371094, "learning_rate": 3.980244044160373e-06, "logits/chosen": -0.6844228506088257, "logits/rejected": -0.7053964138031006, "logps/chosen": -69.97819519042969, "logps/rejected": -72.03315734863281, "loss": 13.7659, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.02907332219183445, "rewards/margins": 0.025167405605316162, "rewards/rejected": 0.003905917750671506, "step": 702 }, { "epoch": 0.4083173607480978, "grad_norm": 308.1776428222656, "learning_rate": 3.978791400348635e-06, "logits/chosen": -0.5791782140731812, "logits/rejected": -0.646981954574585, "logps/chosen": -82.76728820800781, "logps/rejected": -72.67459869384766, "loss": 13.6071, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.035702355206012726, "rewards/margins": 0.03648295998573303, "rewards/rejected": -0.0007806119392625988, "step": 703 }, { "epoch": 0.40889818202938955, "grad_norm": 313.1890869140625, "learning_rate": 3.977338756536897e-06, "logits/chosen": -0.7870761156082153, "logits/rejected": -0.7041381001472473, "logps/chosen": -74.79627227783203, "logps/rejected": -77.46690368652344, "loss": 13.0616, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.07957975566387177, "rewards/margins": 0.09422776103019714, "rewards/rejected": -0.014648010022938251, "step": 704 }, { "epoch": 0.4094790033106813, "grad_norm": 324.8765869140625, "learning_rate": 3.97588611272516e-06, "logits/chosen": -0.8196079134941101, "logits/rejected": -0.7677024602890015, "logps/chosen": -68.56859588623047, "logps/rejected": -76.06591033935547, "loss": 13.521, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.03140668943524361, "rewards/margins": 0.04369328171014786, "rewards/rejected": -0.01228659600019455, "step": 705 }, { "epoch": 0.41005982459197304, "grad_norm": 334.3929138183594, "learning_rate": 3.974433468913423e-06, "logits/chosen": -0.6489372849464417, "logits/rejected": -0.6309608221054077, "logps/chosen": -76.56379699707031, "logps/rejected": -82.5246353149414, "loss": 14.4875, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.03186134248971939, "rewards/margins": -0.04680603742599487, "rewards/rejected": 0.014944696798920631, "step": 706 }, { "epoch": 0.4106406458732648, "grad_norm": 277.21441650390625, "learning_rate": 3.972980825101686e-06, "logits/chosen": -0.6851536631584167, "logits/rejected": -0.6442720890045166, "logps/chosen": -71.58158874511719, "logps/rejected": -65.97663116455078, "loss": 12.4476, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.09448430687189102, "rewards/margins": 0.1556243896484375, "rewards/rejected": -0.06114007905125618, "step": 707 }, { "epoch": 0.41122146715455654, "grad_norm": 304.2071533203125, "learning_rate": 3.971528181289948e-06, "logits/chosen": -0.6629344820976257, "logits/rejected": -0.6967271566390991, "logps/chosen": -73.93403625488281, "logps/rejected": -69.1995849609375, "loss": 13.107, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.025395860895514488, "rewards/margins": 0.08553169667720795, "rewards/rejected": -0.06013583019375801, "step": 708 }, { "epoch": 0.41180228843584826, "grad_norm": 300.8368835449219, "learning_rate": 3.970075537478211e-06, "logits/chosen": -0.5319725871086121, "logits/rejected": -0.6029818654060364, "logps/chosen": -74.1766128540039, "logps/rejected": -71.83555603027344, "loss": 13.7521, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.042827047407627106, "rewards/margins": 0.020685147494077682, "rewards/rejected": 0.022141892462968826, "step": 709 }, { "epoch": 0.41238310971714004, "grad_norm": 386.9114685058594, "learning_rate": 3.968622893666473e-06, "logits/chosen": -0.7132548093795776, "logits/rejected": -0.5851150751113892, "logps/chosen": -79.25279235839844, "logps/rejected": -73.51028442382812, "loss": 14.7161, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.05419192835688591, "rewards/margins": -0.07584583759307861, "rewards/rejected": 0.021653901785612106, "step": 710 }, { "epoch": 0.41296393099843176, "grad_norm": 322.0220947265625, "learning_rate": 3.967170249854736e-06, "logits/chosen": -0.6925621628761292, "logits/rejected": -0.7238684892654419, "logps/chosen": -75.48567199707031, "logps/rejected": -68.52146911621094, "loss": 13.8128, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.054952871054410934, "rewards/margins": 0.009693034924566746, "rewards/rejected": 0.045259833335876465, "step": 711 }, { "epoch": 0.41354475227972354, "grad_norm": 332.9398498535156, "learning_rate": 3.9657176060429985e-06, "logits/chosen": -0.6430686712265015, "logits/rejected": -0.7087141275405884, "logps/chosen": -68.45174407958984, "logps/rejected": -67.28535461425781, "loss": 14.5686, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.015753645449876785, "rewards/margins": -0.05839390307664871, "rewards/rejected": 0.0741475448012352, "step": 712 }, { "epoch": 0.41412557356101526, "grad_norm": 313.2064514160156, "learning_rate": 3.964264962231261e-06, "logits/chosen": -0.6678661108016968, "logits/rejected": -0.6293448209762573, "logps/chosen": -81.44425201416016, "logps/rejected": -72.38615417480469, "loss": 13.5039, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.019158076494932175, "rewards/margins": 0.047387607395648956, "rewards/rejected": -0.028229529038071632, "step": 713 }, { "epoch": 0.41470639484230704, "grad_norm": 301.8228454589844, "learning_rate": 3.962812318419524e-06, "logits/chosen": -0.6319082975387573, "logits/rejected": -0.6024832129478455, "logps/chosen": -62.35749053955078, "logps/rejected": -71.47017669677734, "loss": 13.4994, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0663997158408165, "rewards/margins": 0.04610716551542282, "rewards/rejected": 0.02029253914952278, "step": 714 }, { "epoch": 0.41528721612359876, "grad_norm": 345.0201110839844, "learning_rate": 3.961359674607787e-06, "logits/chosen": -0.6447056531906128, "logits/rejected": -0.6276105642318726, "logps/chosen": -79.46293640136719, "logps/rejected": -75.56305694580078, "loss": 14.4818, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.03468679264187813, "rewards/margins": -0.044044964015483856, "rewards/rejected": 0.009358169510960579, "step": 715 }, { "epoch": 0.41586803740489053, "grad_norm": 341.76220703125, "learning_rate": 3.9599070307960495e-06, "logits/chosen": -0.5557613372802734, "logits/rejected": -0.6789140701293945, "logps/chosen": -79.2572021484375, "logps/rejected": -89.22966003417969, "loss": 14.2231, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.06551501154899597, "rewards/margins": -0.027714461088180542, "rewards/rejected": -0.03780054301023483, "step": 716 }, { "epoch": 0.41644885868618225, "grad_norm": 282.11700439453125, "learning_rate": 3.958454386984311e-06, "logits/chosen": -0.6314088106155396, "logits/rejected": -0.7064960598945618, "logps/chosen": -65.93815612792969, "logps/rejected": -75.64500427246094, "loss": 13.2297, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.01477456372231245, "rewards/margins": 0.07601601630449295, "rewards/rejected": -0.06124146655201912, "step": 717 }, { "epoch": 0.41702967996747403, "grad_norm": 273.31231689453125, "learning_rate": 3.957001743172574e-06, "logits/chosen": -0.659504771232605, "logits/rejected": -0.8665167689323425, "logps/chosen": -73.95689392089844, "logps/rejected": -73.23133087158203, "loss": 12.612, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.08090134710073471, "rewards/margins": 0.15707240998744965, "rewards/rejected": -0.07617107778787613, "step": 718 }, { "epoch": 0.41761050124876575, "grad_norm": 314.1131591796875, "learning_rate": 3.955549099360837e-06, "logits/chosen": -0.6384779214859009, "logits/rejected": -0.6633267998695374, "logps/chosen": -71.73622131347656, "logps/rejected": -75.96559143066406, "loss": 13.3543, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.05409403517842293, "rewards/margins": 0.060982413589954376, "rewards/rejected": -0.0068883770145475864, "step": 719 }, { "epoch": 0.4181913225300575, "grad_norm": 316.5398864746094, "learning_rate": 3.9540964555491e-06, "logits/chosen": -0.7581623196601868, "logits/rejected": -0.6908892393112183, "logps/chosen": -78.83635711669922, "logps/rejected": -73.35903930664062, "loss": 13.6177, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.0010162651306018233, "rewards/margins": 0.038523249328136444, "rewards/rejected": -0.03750698268413544, "step": 720 }, { "epoch": 0.41877214381134925, "grad_norm": 289.4864807128906, "learning_rate": 3.952643811737362e-06, "logits/chosen": -0.6493207216262817, "logits/rejected": -0.6764460802078247, "logps/chosen": -75.92554473876953, "logps/rejected": -68.46549224853516, "loss": 14.2656, "rewards/accuracies": 0.5, "rewards/chosen": -0.07545448839664459, "rewards/margins": -0.029577041044831276, "rewards/rejected": -0.045877449214458466, "step": 721 }, { "epoch": 0.419352965092641, "grad_norm": 307.9372253417969, "learning_rate": 3.951191167925625e-06, "logits/chosen": -0.6669474244117737, "logits/rejected": -0.8115142583847046, "logps/chosen": -67.41337585449219, "logps/rejected": -64.86238098144531, "loss": 13.7911, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.005310344509780407, "rewards/margins": 0.02083405666053295, "rewards/rejected": -0.015523704700171947, "step": 722 }, { "epoch": 0.41993378637393275, "grad_norm": 283.71624755859375, "learning_rate": 3.949738524113888e-06, "logits/chosen": -0.7168434858322144, "logits/rejected": -0.7291404008865356, "logps/chosen": -73.59349060058594, "logps/rejected": -68.3053207397461, "loss": 13.7698, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.002716714981943369, "rewards/margins": 0.01622510887682438, "rewards/rejected": -0.013508396223187447, "step": 723 }, { "epoch": 0.42051460765522447, "grad_norm": 349.4669494628906, "learning_rate": 3.94828588030215e-06, "logits/chosen": -0.7875105738639832, "logits/rejected": -0.7584863901138306, "logps/chosen": -72.06592559814453, "logps/rejected": -77.07530212402344, "loss": 14.4762, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.032033637166023254, "rewards/margins": -0.050473153591156006, "rewards/rejected": 0.0184395220130682, "step": 724 }, { "epoch": 0.42109542893651625, "grad_norm": 359.53485107421875, "learning_rate": 3.9468332364904125e-06, "logits/chosen": -0.6558941602706909, "logits/rejected": -0.5819476842880249, "logps/chosen": -68.26545715332031, "logps/rejected": -73.47137451171875, "loss": 14.0153, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.005019567906856537, "rewards/margins": 0.0016324628377333283, "rewards/rejected": 0.0033871070481836796, "step": 725 }, { "epoch": 0.42167625021780797, "grad_norm": 323.96868896484375, "learning_rate": 3.945380592678675e-06, "logits/chosen": -0.6153490543365479, "logits/rejected": -0.6308630704879761, "logps/chosen": -72.20024108886719, "logps/rejected": -67.79884338378906, "loss": 13.4744, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.002849989803507924, "rewards/margins": 0.07088102400302887, "rewards/rejected": -0.06803102791309357, "step": 726 }, { "epoch": 0.42225707149909975, "grad_norm": 448.19921875, "learning_rate": 3.943927948866938e-06, "logits/chosen": -0.7376095056533813, "logits/rejected": -0.6571551561355591, "logps/chosen": -76.57099151611328, "logps/rejected": -74.70809173583984, "loss": 14.4323, "rewards/accuracies": 0.5, "rewards/chosen": -0.01626337133347988, "rewards/margins": -0.03361039236187935, "rewards/rejected": 0.01734701171517372, "step": 727 }, { "epoch": 0.42283789278039147, "grad_norm": 301.0350646972656, "learning_rate": 3.942475305055201e-06, "logits/chosen": -0.7029098272323608, "logits/rejected": -0.7672086358070374, "logps/chosen": -74.73590850830078, "logps/rejected": -72.44758605957031, "loss": 13.2734, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.021909218281507492, "rewards/margins": 0.06785444915294647, "rewards/rejected": -0.04594522714614868, "step": 728 }, { "epoch": 0.42341871406168324, "grad_norm": 323.43463134765625, "learning_rate": 3.9410226612434635e-06, "logits/chosen": -0.70557701587677, "logits/rejected": -0.7261152267456055, "logps/chosen": -77.63336181640625, "logps/rejected": -71.49602508544922, "loss": 14.8529, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.05248032882809639, "rewards/margins": -0.09009293466806412, "rewards/rejected": 0.037612609565258026, "step": 729 }, { "epoch": 0.42399953534297496, "grad_norm": 330.1050720214844, "learning_rate": 3.939570017431726e-06, "logits/chosen": -0.5593830347061157, "logits/rejected": -0.6348336935043335, "logps/chosen": -84.4022445678711, "logps/rejected": -81.31836700439453, "loss": 13.0719, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0025830883532762527, "rewards/margins": 0.09452484548091888, "rewards/rejected": -0.09710792452096939, "step": 730 }, { "epoch": 0.4245803566242667, "grad_norm": 317.7475280761719, "learning_rate": 3.938117373619988e-06, "logits/chosen": -0.4515294134616852, "logits/rejected": -0.38896283507347107, "logps/chosen": -69.86441040039062, "logps/rejected": -78.91719055175781, "loss": 14.0736, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.04327527433633804, "rewards/margins": -0.009526817128062248, "rewards/rejected": -0.033748455345630646, "step": 731 }, { "epoch": 0.42516117790555846, "grad_norm": 319.0084533691406, "learning_rate": 3.936664729808251e-06, "logits/chosen": -0.6351519823074341, "logits/rejected": -0.6803760528564453, "logps/chosen": -69.61072540283203, "logps/rejected": -71.7401351928711, "loss": 14.1246, "rewards/accuracies": 0.5, "rewards/chosen": -0.05607505515217781, "rewards/margins": -0.01812058314681053, "rewards/rejected": -0.03795447573065758, "step": 732 }, { "epoch": 0.4257419991868502, "grad_norm": 335.4173889160156, "learning_rate": 3.935212085996514e-06, "logits/chosen": -0.7924081087112427, "logits/rejected": -0.7546756267547607, "logps/chosen": -70.29083251953125, "logps/rejected": -76.52457427978516, "loss": 14.3631, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.028643876314163208, "rewards/margins": -0.033245109021663666, "rewards/rejected": 0.004601232707500458, "step": 733 }, { "epoch": 0.42632282046814196, "grad_norm": 302.1365051269531, "learning_rate": 3.933759442184776e-06, "logits/chosen": -0.7318800687789917, "logits/rejected": -0.7138211727142334, "logps/chosen": -74.42980194091797, "logps/rejected": -76.93816375732422, "loss": 14.0023, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.018741881474852562, "rewards/margins": -0.004794149659574032, "rewards/rejected": -0.013947735540568829, "step": 734 }, { "epoch": 0.4269036417494337, "grad_norm": 304.879638671875, "learning_rate": 3.932306798373039e-06, "logits/chosen": -0.6316365003585815, "logits/rejected": -0.6677497029304504, "logps/chosen": -71.40536499023438, "logps/rejected": -72.45552825927734, "loss": 13.5122, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.06338571012020111, "rewards/margins": 0.04115435481071472, "rewards/rejected": 0.02223135158419609, "step": 735 }, { "epoch": 0.42748446303072546, "grad_norm": 298.0213317871094, "learning_rate": 3.930854154561302e-06, "logits/chosen": -0.7024403214454651, "logits/rejected": -0.7682080864906311, "logps/chosen": -71.77532196044922, "logps/rejected": -69.44412994384766, "loss": 13.3706, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.03764911741018295, "rewards/margins": 0.06105116754770279, "rewards/rejected": -0.023402050137519836, "step": 736 }, { "epoch": 0.4280652843120172, "grad_norm": 280.1487121582031, "learning_rate": 3.929401510749565e-06, "logits/chosen": -0.9538941383361816, "logits/rejected": -0.8766900300979614, "logps/chosen": -66.9570541381836, "logps/rejected": -72.75699615478516, "loss": 12.845, "rewards/accuracies": 0.75, "rewards/chosen": 0.06486214697360992, "rewards/margins": 0.11776605993509293, "rewards/rejected": -0.05290389060974121, "step": 737 }, { "epoch": 0.42864610559330896, "grad_norm": 325.70440673828125, "learning_rate": 3.927948866937827e-06, "logits/chosen": -0.6374528408050537, "logits/rejected": -0.5448911190032959, "logps/chosen": -74.01681518554688, "logps/rejected": -62.33977127075195, "loss": 13.9094, "rewards/accuracies": 0.5, "rewards/chosen": 0.0036867789458483458, "rewards/margins": 0.0037749551702290773, "rewards/rejected": -8.817538764560595e-05, "step": 738 }, { "epoch": 0.4292269268746007, "grad_norm": 324.1231689453125, "learning_rate": 3.92649622312609e-06, "logits/chosen": -0.7090293169021606, "logits/rejected": -0.7085317373275757, "logps/chosen": -80.50122833251953, "logps/rejected": -66.37286376953125, "loss": 14.212, "rewards/accuracies": 0.5, "rewards/chosen": -0.005439291708171368, "rewards/margins": -0.020895807072520256, "rewards/rejected": 0.015456515364348888, "step": 739 }, { "epoch": 0.42980774815589246, "grad_norm": 310.73968505859375, "learning_rate": 3.925043579314353e-06, "logits/chosen": -0.6998378038406372, "logits/rejected": -0.649873673915863, "logps/chosen": -75.84722137451172, "logps/rejected": -73.048095703125, "loss": 13.2256, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.017816346138715744, "rewards/margins": 0.0821092426776886, "rewards/rejected": -0.06429289281368256, "step": 740 }, { "epoch": 0.4303885694371842, "grad_norm": 297.516845703125, "learning_rate": 3.923590935502616e-06, "logits/chosen": -0.6497145891189575, "logits/rejected": -0.6620525121688843, "logps/chosen": -80.6782455444336, "logps/rejected": -72.28392028808594, "loss": 13.2971, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0035281956661492586, "rewards/margins": 0.07035094499588013, "rewards/rejected": -0.07387915253639221, "step": 741 }, { "epoch": 0.4309693907184759, "grad_norm": 322.9284973144531, "learning_rate": 3.9221382916908775e-06, "logits/chosen": -0.5986420512199402, "logits/rejected": -0.5791078209877014, "logps/chosen": -68.31107330322266, "logps/rejected": -71.18339538574219, "loss": 13.9564, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.011520462110638618, "rewards/margins": 0.00581027427688241, "rewards/rejected": -0.017330732196569443, "step": 742 }, { "epoch": 0.4315502119997677, "grad_norm": 309.1309814453125, "learning_rate": 3.92068564787914e-06, "logits/chosen": -0.7334194183349609, "logits/rejected": -0.5711901783943176, "logps/chosen": -75.96092224121094, "logps/rejected": -72.91703796386719, "loss": 13.3731, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.012258807197213173, "rewards/margins": 0.057714782655239105, "rewards/rejected": -0.06997359544038773, "step": 743 }, { "epoch": 0.4321310332810594, "grad_norm": 342.9898376464844, "learning_rate": 3.919233004067403e-06, "logits/chosen": -0.7844117283821106, "logits/rejected": -0.7111651301383972, "logps/chosen": -79.83903503417969, "logps/rejected": -80.42810821533203, "loss": 14.6354, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.030741948634386063, "rewards/margins": -0.07022452354431152, "rewards/rejected": 0.03948257490992546, "step": 744 }, { "epoch": 0.4327118545623512, "grad_norm": 298.7255859375, "learning_rate": 3.917780360255666e-06, "logits/chosen": -0.7623270750045776, "logits/rejected": -0.8348603248596191, "logps/chosen": -74.46296691894531, "logps/rejected": -72.27961730957031, "loss": 13.4668, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.03371516615152359, "rewards/margins": 0.04851061850786209, "rewards/rejected": -0.08222578465938568, "step": 745 }, { "epoch": 0.4332926758436429, "grad_norm": 294.17254638671875, "learning_rate": 3.9163277164439285e-06, "logits/chosen": -0.7279725670814514, "logits/rejected": -0.739454984664917, "logps/chosen": -70.19053649902344, "logps/rejected": -70.75625610351562, "loss": 13.4564, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.03082049824297428, "rewards/margins": 0.059337783604860306, "rewards/rejected": -0.09015828371047974, "step": 746 }, { "epoch": 0.43387349712493467, "grad_norm": 307.3867492675781, "learning_rate": 3.914875072632191e-06, "logits/chosen": -0.6892760396003723, "logits/rejected": -0.7267829179763794, "logps/chosen": -69.66451263427734, "logps/rejected": -73.37333679199219, "loss": 13.2647, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.07057398557662964, "rewards/margins": 0.07388408482074738, "rewards/rejected": -0.0033100892324000597, "step": 747 }, { "epoch": 0.4344543184062264, "grad_norm": 334.6365661621094, "learning_rate": 3.913422428820454e-06, "logits/chosen": -0.5453966856002808, "logits/rejected": -0.6484876871109009, "logps/chosen": -75.01347351074219, "logps/rejected": -84.60029602050781, "loss": 13.4385, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.02981463447213173, "rewards/margins": 0.05149499326944351, "rewards/rejected": -0.021680355072021484, "step": 748 }, { "epoch": 0.43503513968751817, "grad_norm": 293.3527526855469, "learning_rate": 3.911969785008716e-06, "logits/chosen": -0.6956106424331665, "logits/rejected": -0.7478946447372437, "logps/chosen": -69.3331069946289, "logps/rejected": -75.3602523803711, "loss": 13.4493, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.025318946689367294, "rewards/margins": 0.05369370058178902, "rewards/rejected": -0.028374750167131424, "step": 749 }, { "epoch": 0.4356159609688099, "grad_norm": 312.72625732421875, "learning_rate": 3.910517141196979e-06, "logits/chosen": -0.6957664489746094, "logits/rejected": -0.6999244689941406, "logps/chosen": -76.08050537109375, "logps/rejected": -71.7776870727539, "loss": 13.3181, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.011510677635669708, "rewards/margins": 0.061960190534591675, "rewards/rejected": -0.07347087562084198, "step": 750 }, { "epoch": 0.43619678225010167, "grad_norm": 322.7213134765625, "learning_rate": 3.9090644973852414e-06, "logits/chosen": -0.7363187074661255, "logits/rejected": -0.7484365105628967, "logps/chosen": -77.98309326171875, "logps/rejected": -74.41910552978516, "loss": 13.3354, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.03591137006878853, "rewards/margins": 0.07070378959178925, "rewards/rejected": -0.034792426973581314, "step": 751 }, { "epoch": 0.4367776035313934, "grad_norm": 308.99310302734375, "learning_rate": 3.907611853573504e-06, "logits/chosen": -0.6841967701911926, "logits/rejected": -0.7418473362922668, "logps/chosen": -74.3568115234375, "logps/rejected": -72.77391052246094, "loss": 12.8602, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.026743004098534584, "rewards/margins": 0.11576484143733978, "rewards/rejected": -0.08902183920145035, "step": 752 }, { "epoch": 0.4373584248126851, "grad_norm": 327.3843994140625, "learning_rate": 3.906159209761767e-06, "logits/chosen": -0.7423384189605713, "logits/rejected": -0.7147141098976135, "logps/chosen": -70.23924255371094, "logps/rejected": -68.62517547607422, "loss": 13.8643, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.012057294137775898, "rewards/margins": 0.011633175425231457, "rewards/rejected": 0.00042412133188918233, "step": 753 }, { "epoch": 0.4379392460939769, "grad_norm": 318.1657409667969, "learning_rate": 3.90470656595003e-06, "logits/chosen": -0.823145866394043, "logits/rejected": -0.8383482694625854, "logps/chosen": -73.15534973144531, "logps/rejected": -74.34989929199219, "loss": 14.5024, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.057351987808942795, "rewards/margins": -0.05695341154932976, "rewards/rejected": -0.0003985777439083904, "step": 754 }, { "epoch": 0.4385200673752686, "grad_norm": 308.12493896484375, "learning_rate": 3.903253922138292e-06, "logits/chosen": -0.7838844060897827, "logits/rejected": -0.7964550256729126, "logps/chosen": -69.17980194091797, "logps/rejected": -77.6092758178711, "loss": 13.97, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.05046354979276657, "rewards/margins": 0.004970133304595947, "rewards/rejected": -0.05543368309736252, "step": 755 }, { "epoch": 0.4391008886565604, "grad_norm": 328.02618408203125, "learning_rate": 3.901801278326554e-06, "logits/chosen": -0.6274627447128296, "logits/rejected": -0.7087150812149048, "logps/chosen": -67.25186920166016, "logps/rejected": -69.48518371582031, "loss": 14.5885, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.02703443542122841, "rewards/margins": -0.061674535274505615, "rewards/rejected": 0.03464009612798691, "step": 756 }, { "epoch": 0.4396817099378521, "grad_norm": 345.1214904785156, "learning_rate": 3.900348634514817e-06, "logits/chosen": -0.6962085962295532, "logits/rejected": -0.6879252195358276, "logps/chosen": -74.38581848144531, "logps/rejected": -74.284423828125, "loss": 14.0462, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.004917553160339594, "rewards/margins": -0.009653748944401741, "rewards/rejected": 0.00473619531840086, "step": 757 }, { "epoch": 0.4402625312191439, "grad_norm": 360.69549560546875, "learning_rate": 3.89889599070308e-06, "logits/chosen": -0.770926833152771, "logits/rejected": -0.7564770579338074, "logps/chosen": -71.93128204345703, "logps/rejected": -76.0265121459961, "loss": 14.132, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.017853304743766785, "rewards/margins": -0.016330275684595108, "rewards/rejected": -0.0015230312710627913, "step": 758 }, { "epoch": 0.4408433525004356, "grad_norm": 325.82342529296875, "learning_rate": 3.8974433468913426e-06, "logits/chosen": -0.7091246843338013, "logits/rejected": -0.7488458752632141, "logps/chosen": -67.05516815185547, "logps/rejected": -79.63505554199219, "loss": 13.8617, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.04035795480012894, "rewards/margins": 0.01930631510913372, "rewards/rejected": -0.0596642792224884, "step": 759 }, { "epoch": 0.4414241737817274, "grad_norm": 324.606201171875, "learning_rate": 3.895990703079605e-06, "logits/chosen": -0.8655685186386108, "logits/rejected": -0.8269084095954895, "logps/chosen": -71.62133026123047, "logps/rejected": -81.47102355957031, "loss": 13.8222, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.03722354397177696, "rewards/margins": 0.020576555281877518, "rewards/rejected": -0.05780010297894478, "step": 760 }, { "epoch": 0.4420049950630191, "grad_norm": 311.0895690917969, "learning_rate": 3.894538059267868e-06, "logits/chosen": -0.5948249697685242, "logits/rejected": -0.6009396314620972, "logps/chosen": -71.13340759277344, "logps/rejected": -74.0931625366211, "loss": 13.6398, "rewards/accuracies": 0.5, "rewards/chosen": -0.004656590521335602, "rewards/margins": 0.03076646290719509, "rewards/rejected": -0.035423047840595245, "step": 761 }, { "epoch": 0.4425858163443109, "grad_norm": 348.3786315917969, "learning_rate": 3.89308541545613e-06, "logits/chosen": -0.6613047122955322, "logits/rejected": -0.7789413332939148, "logps/chosen": -80.16218566894531, "logps/rejected": -78.0029296875, "loss": 13.6857, "rewards/accuracies": 0.5, "rewards/chosen": 0.03409738838672638, "rewards/margins": 0.029163142666220665, "rewards/rejected": 0.004934241063892841, "step": 762 }, { "epoch": 0.4431666376256026, "grad_norm": 273.7310485839844, "learning_rate": 3.891632771644393e-06, "logits/chosen": -0.8774948120117188, "logits/rejected": -0.8197600245475769, "logps/chosen": -67.24043273925781, "logps/rejected": -73.55924987792969, "loss": 12.5638, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.040607668459415436, "rewards/margins": 0.15121008455753326, "rewards/rejected": -0.11060241609811783, "step": 763 }, { "epoch": 0.4437474589068943, "grad_norm": 289.9358825683594, "learning_rate": 3.8901801278326555e-06, "logits/chosen": -0.7543174028396606, "logits/rejected": -0.7527318596839905, "logps/chosen": -70.50340270996094, "logps/rejected": -71.85820007324219, "loss": 12.648, "rewards/accuracies": 0.75, "rewards/chosen": 0.0302159134298563, "rewards/margins": 0.13685820996761322, "rewards/rejected": -0.10664232075214386, "step": 764 }, { "epoch": 0.4443282801881861, "grad_norm": 310.6090393066406, "learning_rate": 3.888727484020918e-06, "logits/chosen": -0.7520283460617065, "logits/rejected": -0.8482815027236938, "logps/chosen": -80.78767395019531, "logps/rejected": -71.63502502441406, "loss": 13.935, "rewards/accuracies": 0.5, "rewards/chosen": -0.02459750324487686, "rewards/margins": 7.411837577819824e-05, "rewards/rejected": -0.02467162348330021, "step": 765 }, { "epoch": 0.4449091014694778, "grad_norm": 320.955322265625, "learning_rate": 3.887274840209181e-06, "logits/chosen": -0.6493512988090515, "logits/rejected": -0.7719279527664185, "logps/chosen": -71.09584045410156, "logps/rejected": -79.13584899902344, "loss": 13.7825, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.004698459059000015, "rewards/margins": 0.015006857924163342, "rewards/rejected": -0.01970531791448593, "step": 766 }, { "epoch": 0.4454899227507696, "grad_norm": 420.12744140625, "learning_rate": 3.885822196397444e-06, "logits/chosen": -0.5989329218864441, "logits/rejected": -0.5799925923347473, "logps/chosen": -67.3458251953125, "logps/rejected": -70.0871810913086, "loss": 14.2179, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.024614132940769196, "rewards/margins": -0.01728874072432518, "rewards/rejected": -0.00732539314776659, "step": 767 }, { "epoch": 0.4460707440320613, "grad_norm": 310.5950927734375, "learning_rate": 3.8843695525857065e-06, "logits/chosen": -0.6708532571792603, "logits/rejected": -0.6406176686286926, "logps/chosen": -67.67707824707031, "logps/rejected": -73.09920501708984, "loss": 13.3177, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.022770235314965248, "rewards/margins": 0.07169272005558014, "rewards/rejected": -0.04892248287796974, "step": 768 }, { "epoch": 0.4466515653133531, "grad_norm": 346.4231872558594, "learning_rate": 3.882916908773968e-06, "logits/chosen": -0.6678956151008606, "logits/rejected": -0.7639406323432922, "logps/chosen": -76.6402359008789, "logps/rejected": -70.21337127685547, "loss": 13.5121, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.009954368695616722, "rewards/margins": 0.04720446467399597, "rewards/rejected": -0.057158827781677246, "step": 769 }, { "epoch": 0.4472323865946448, "grad_norm": 315.72998046875, "learning_rate": 3.881464264962231e-06, "logits/chosen": -0.6422103643417358, "logits/rejected": -0.6924742460250854, "logps/chosen": -77.5414047241211, "logps/rejected": -85.16349792480469, "loss": 13.3908, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.02027234062552452, "rewards/margins": 0.06246469169855118, "rewards/rejected": -0.04219234734773636, "step": 770 }, { "epoch": 0.4478132078759366, "grad_norm": 325.33782958984375, "learning_rate": 3.880011621150494e-06, "logits/chosen": -0.8680871725082397, "logits/rejected": -0.8178070187568665, "logps/chosen": -77.91289520263672, "logps/rejected": -76.54703521728516, "loss": 13.9257, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.004537890199571848, "rewards/margins": 0.007611017674207687, "rewards/rejected": -0.012148907408118248, "step": 771 }, { "epoch": 0.4483940291572283, "grad_norm": 332.87200927734375, "learning_rate": 3.878558977338757e-06, "logits/chosen": -0.7658424377441406, "logits/rejected": -0.7849222421646118, "logps/chosen": -72.97148132324219, "logps/rejected": -69.93640899658203, "loss": 14.4375, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.02565593644976616, "rewards/margins": -0.046471305191516876, "rewards/rejected": 0.020815376192331314, "step": 772 }, { "epoch": 0.4489748504385201, "grad_norm": 298.5015563964844, "learning_rate": 3.877106333527019e-06, "logits/chosen": -0.715753436088562, "logits/rejected": -0.7763512134552002, "logps/chosen": -69.95814514160156, "logps/rejected": -67.6918716430664, "loss": 13.4113, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.04581587761640549, "rewards/margins": 0.05313152074813843, "rewards/rejected": -0.007315640337765217, "step": 773 }, { "epoch": 0.4495556717198118, "grad_norm": 325.48419189453125, "learning_rate": 3.875653689715282e-06, "logits/chosen": -0.6810447573661804, "logits/rejected": -0.6709048748016357, "logps/chosen": -88.08033752441406, "logps/rejected": -82.59185791015625, "loss": 14.0073, "rewards/accuracies": 0.5, "rewards/chosen": -0.03442404791712761, "rewards/margins": -0.004874535836279392, "rewards/rejected": -0.02954951487481594, "step": 774 }, { "epoch": 0.4501364930011036, "grad_norm": 316.40673828125, "learning_rate": 3.874201045903545e-06, "logits/chosen": -0.648461639881134, "logits/rejected": -0.6868349313735962, "logps/chosen": -74.5299301147461, "logps/rejected": -67.671875, "loss": 13.9253, "rewards/accuracies": 0.5, "rewards/chosen": 0.010094478726387024, "rewards/margins": 0.0021387473680078983, "rewards/rejected": 0.007955733686685562, "step": 775 }, { "epoch": 0.4507173142823953, "grad_norm": 306.03436279296875, "learning_rate": 3.872748402091807e-06, "logits/chosen": -0.6466223001480103, "logits/rejected": -0.6228175759315491, "logps/chosen": -72.484619140625, "logps/rejected": -65.22151947021484, "loss": 13.3478, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.02175520732998848, "rewards/margins": 0.06977047026157379, "rewards/rejected": -0.04801527410745621, "step": 776 }, { "epoch": 0.45129813556368703, "grad_norm": 303.6878662109375, "learning_rate": 3.87129575828007e-06, "logits/chosen": -0.7520937919616699, "logits/rejected": -0.6366826295852661, "logps/chosen": -72.18943786621094, "logps/rejected": -68.44010162353516, "loss": 14.4002, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.10646836459636688, "rewards/margins": -0.042999267578125, "rewards/rejected": -0.06346909701824188, "step": 777 }, { "epoch": 0.4518789568449788, "grad_norm": 319.6017761230469, "learning_rate": 3.869843114468333e-06, "logits/chosen": -0.6259938478469849, "logits/rejected": -0.6519014835357666, "logps/chosen": -68.73445129394531, "logps/rejected": -68.32264709472656, "loss": 13.8294, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.008890246972441673, "rewards/margins": 0.014760665595531464, "rewards/rejected": -0.005870418157428503, "step": 778 }, { "epoch": 0.45245977812627053, "grad_norm": 299.91241455078125, "learning_rate": 3.868390470656596e-06, "logits/chosen": -0.6745251417160034, "logits/rejected": -0.7223367691040039, "logps/chosen": -77.0238037109375, "logps/rejected": -68.16448211669922, "loss": 12.6462, "rewards/accuracies": 0.75, "rewards/chosen": 0.06470943987369537, "rewards/margins": 0.1429821252822876, "rewards/rejected": -0.07827268540859222, "step": 779 }, { "epoch": 0.4530405994075623, "grad_norm": 299.5854797363281, "learning_rate": 3.866937826844859e-06, "logits/chosen": -0.7268974781036377, "logits/rejected": -0.7254965901374817, "logps/chosen": -72.63690185546875, "logps/rejected": -73.64038848876953, "loss": 13.6416, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.021657172590494156, "rewards/margins": 0.045961178839206696, "rewards/rejected": -0.06761835515499115, "step": 780 }, { "epoch": 0.453621420688854, "grad_norm": 315.7631530761719, "learning_rate": 3.8654851830331205e-06, "logits/chosen": -0.7705460786819458, "logits/rejected": -0.7526998519897461, "logps/chosen": -71.8909912109375, "logps/rejected": -75.11150360107422, "loss": 14.061, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.007896973751485348, "rewards/margins": -0.006453951355069876, "rewards/rejected": -0.0014430228620767593, "step": 781 }, { "epoch": 0.4542022419701458, "grad_norm": 306.499267578125, "learning_rate": 3.864032539221383e-06, "logits/chosen": -0.7486709356307983, "logits/rejected": -0.7750841379165649, "logps/chosen": -73.98805236816406, "logps/rejected": -71.86274719238281, "loss": 13.4212, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.008844633586704731, "rewards/margins": 0.058701179921627045, "rewards/rejected": -0.04985655099153519, "step": 782 }, { "epoch": 0.4547830632514375, "grad_norm": 289.93267822265625, "learning_rate": 3.862579895409646e-06, "logits/chosen": -0.6648403406143188, "logits/rejected": -0.5542594790458679, "logps/chosen": -72.4288558959961, "logps/rejected": -62.92414474487305, "loss": 13.8115, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.025623226538300514, "rewards/margins": 0.012874387204647064, "rewards/rejected": 0.012748842127621174, "step": 783 }, { "epoch": 0.4553638845327293, "grad_norm": 320.468505859375, "learning_rate": 3.861127251597909e-06, "logits/chosen": -0.6467171907424927, "logits/rejected": -0.6780973672866821, "logps/chosen": -70.74858856201172, "logps/rejected": -75.70240020751953, "loss": 13.4407, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.019734477624297142, "rewards/margins": 0.056104935705661774, "rewards/rejected": -0.07583941519260406, "step": 784 }, { "epoch": 0.455944705814021, "grad_norm": 314.7945556640625, "learning_rate": 3.8596746077861715e-06, "logits/chosen": -0.7252448797225952, "logits/rejected": -0.6654443144798279, "logps/chosen": -69.35020446777344, "logps/rejected": -74.48258209228516, "loss": 13.722, "rewards/accuracies": 0.5, "rewards/chosen": 0.004625787492841482, "rewards/margins": 0.02148066833615303, "rewards/rejected": -0.01685487851500511, "step": 785 }, { "epoch": 0.4565255270953128, "grad_norm": 336.58001708984375, "learning_rate": 3.858221963974434e-06, "logits/chosen": -0.9182072877883911, "logits/rejected": -0.8336294889450073, "logps/chosen": -70.61786651611328, "logps/rejected": -96.56700134277344, "loss": 13.3177, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.005032673943787813, "rewards/margins": 0.06978379189968109, "rewards/rejected": -0.07481645792722702, "step": 786 }, { "epoch": 0.4571063483766045, "grad_norm": 340.71490478515625, "learning_rate": 3.856769320162697e-06, "logits/chosen": -0.8411046266555786, "logits/rejected": -0.8996657133102417, "logps/chosen": -74.63349914550781, "logps/rejected": -84.13905334472656, "loss": 13.212, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.04600357636809349, "rewards/margins": 0.07600485533475876, "rewards/rejected": -0.030001282691955566, "step": 787 }, { "epoch": 0.45768716965789624, "grad_norm": 333.482666015625, "learning_rate": 3.855316676350959e-06, "logits/chosen": -0.6594542264938354, "logits/rejected": -0.6667267084121704, "logps/chosen": -70.84710693359375, "logps/rejected": -72.32903289794922, "loss": 13.3626, "rewards/accuracies": 0.5, "rewards/chosen": 0.029414311051368713, "rewards/margins": 0.060748837888240814, "rewards/rejected": -0.0313345268368721, "step": 788 }, { "epoch": 0.458267990939188, "grad_norm": 312.21856689453125, "learning_rate": 3.853864032539222e-06, "logits/chosen": -0.8260235786437988, "logits/rejected": -0.853685200214386, "logps/chosen": -62.875640869140625, "logps/rejected": -75.08868408203125, "loss": 13.8724, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.05585920810699463, "rewards/margins": 0.006377121899276972, "rewards/rejected": -0.062236327677965164, "step": 789 }, { "epoch": 0.45884881222047974, "grad_norm": 312.63446044921875, "learning_rate": 3.852411388727484e-06, "logits/chosen": -0.651515781879425, "logits/rejected": -0.6716042160987854, "logps/chosen": -76.17277526855469, "logps/rejected": -69.4102554321289, "loss": 13.3182, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.02180982567369938, "rewards/margins": 0.06877848505973816, "rewards/rejected": -0.04696866124868393, "step": 790 }, { "epoch": 0.4594296335017715, "grad_norm": 294.8563537597656, "learning_rate": 3.850958744915747e-06, "logits/chosen": -0.9102421998977661, "logits/rejected": -0.8507450819015503, "logps/chosen": -70.69921112060547, "logps/rejected": -72.28413391113281, "loss": 14.0738, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.016772408038377762, "rewards/margins": -0.005055961664766073, "rewards/rejected": -0.011716444976627827, "step": 791 }, { "epoch": 0.46001045478306324, "grad_norm": 303.2449035644531, "learning_rate": 3.84950610110401e-06, "logits/chosen": -0.8211100697517395, "logits/rejected": -0.7845950126647949, "logps/chosen": -67.99166870117188, "logps/rejected": -68.65469360351562, "loss": 12.9882, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.008265440352261066, "rewards/margins": 0.10116876661777496, "rewards/rejected": -0.09290332347154617, "step": 792 }, { "epoch": 0.460591276064355, "grad_norm": 313.7750549316406, "learning_rate": 3.848053457292273e-06, "logits/chosen": -0.6798004508018494, "logits/rejected": -0.6693185567855835, "logps/chosen": -82.59542083740234, "logps/rejected": -71.71035766601562, "loss": 13.5624, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.00769240316003561, "rewards/margins": 0.05136420577764511, "rewards/rejected": -0.0590566024184227, "step": 793 }, { "epoch": 0.46117209734564674, "grad_norm": 287.33343505859375, "learning_rate": 3.8466008134805345e-06, "logits/chosen": -0.8326593637466431, "logits/rejected": -0.8718840479850769, "logps/chosen": -67.961669921875, "logps/rejected": -73.32498168945312, "loss": 13.2243, "rewards/accuracies": 0.75, "rewards/chosen": 0.022751104086637497, "rewards/margins": 0.08069998770952225, "rewards/rejected": -0.05794887617230415, "step": 794 }, { "epoch": 0.4617529186269385, "grad_norm": 330.590087890625, "learning_rate": 3.845148169668797e-06, "logits/chosen": -0.8093253374099731, "logits/rejected": -0.8372796177864075, "logps/chosen": -73.56087493896484, "logps/rejected": -84.00132751464844, "loss": 14.7452, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.07513497769832611, "rewards/margins": -0.07671995460987091, "rewards/rejected": 0.0015849672490730882, "step": 795 }, { "epoch": 0.46233373990823023, "grad_norm": 295.850830078125, "learning_rate": 3.84369552585706e-06, "logits/chosen": -0.8418253064155579, "logits/rejected": -0.8993352055549622, "logps/chosen": -69.29906463623047, "logps/rejected": -72.6314697265625, "loss": 13.7799, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.00324619235470891, "rewards/margins": 0.02390274405479431, "rewards/rejected": -0.027148932218551636, "step": 796 }, { "epoch": 0.462914561189522, "grad_norm": 335.1182556152344, "learning_rate": 3.842242882045323e-06, "logits/chosen": -0.7779780626296997, "logits/rejected": -0.6619454026222229, "logps/chosen": -64.12891387939453, "logps/rejected": -74.85227966308594, "loss": 14.454, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.031012335792183876, "rewards/margins": -0.047410137951374054, "rewards/rejected": 0.01639780029654503, "step": 797 }, { "epoch": 0.46349538247081373, "grad_norm": 310.0404052734375, "learning_rate": 3.8407902382335855e-06, "logits/chosen": -0.8367874026298523, "logits/rejected": -0.7925940155982971, "logps/chosen": -75.65746307373047, "logps/rejected": -71.82279205322266, "loss": 13.9021, "rewards/accuracies": 0.5, "rewards/chosen": 0.008593537844717503, "rewards/margins": 0.010417133569717407, "rewards/rejected": -0.001823595492169261, "step": 798 }, { "epoch": 0.46407620375210545, "grad_norm": 305.9939270019531, "learning_rate": 3.839337594421848e-06, "logits/chosen": -0.7069296836853027, "logits/rejected": -0.7354210615158081, "logps/chosen": -81.55580139160156, "logps/rejected": -76.04948425292969, "loss": 13.6306, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.01436761487275362, "rewards/margins": 0.03523886203765869, "rewards/rejected": -0.020871248096227646, "step": 799 }, { "epoch": 0.46465702503339723, "grad_norm": 319.43377685546875, "learning_rate": 3.837884950610111e-06, "logits/chosen": -0.775700032711029, "logits/rejected": -0.6986560225486755, "logps/chosen": -74.52439880371094, "logps/rejected": -77.12367248535156, "loss": 13.7436, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.015211820602416992, "rewards/margins": 0.019106799736618996, "rewards/rejected": -0.0038949784357100725, "step": 800 }, { "epoch": 0.46523784631468895, "grad_norm": 306.27374267578125, "learning_rate": 3.836432306798373e-06, "logits/chosen": -0.8053959012031555, "logits/rejected": -0.8362959623336792, "logps/chosen": -70.24461364746094, "logps/rejected": -73.2827377319336, "loss": 13.0576, "rewards/accuracies": 0.75, "rewards/chosen": 0.0017929266905412078, "rewards/margins": 0.09252931922674179, "rewards/rejected": -0.09073638916015625, "step": 801 }, { "epoch": 0.46581866759598073, "grad_norm": 310.7746887207031, "learning_rate": 3.834979662986636e-06, "logits/chosen": -1.0116592645645142, "logits/rejected": -0.8014705777168274, "logps/chosen": -71.8570785522461, "logps/rejected": -82.65800476074219, "loss": 13.0408, "rewards/accuracies": 0.75, "rewards/chosen": 0.03449561074376106, "rewards/margins": 0.09185705333948135, "rewards/rejected": -0.05736144259572029, "step": 802 }, { "epoch": 0.46639948887727245, "grad_norm": 341.84814453125, "learning_rate": 3.833527019174898e-06, "logits/chosen": -0.7799655199050903, "logits/rejected": -0.7931917309761047, "logps/chosen": -76.88513946533203, "logps/rejected": -71.80252075195312, "loss": 14.5748, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.08196653425693512, "rewards/margins": -0.050925351679325104, "rewards/rejected": -0.031041180714964867, "step": 803 }, { "epoch": 0.4669803101585642, "grad_norm": 317.75946044921875, "learning_rate": 3.832074375363161e-06, "logits/chosen": -0.830287754535675, "logits/rejected": -0.8624798655509949, "logps/chosen": -71.75201416015625, "logps/rejected": -73.34031677246094, "loss": 13.3196, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0004104435502085835, "rewards/margins": 0.0665319561958313, "rewards/rejected": -0.06694237887859344, "step": 804 }, { "epoch": 0.46756113143985595, "grad_norm": 321.6256103515625, "learning_rate": 3.830621731551424e-06, "logits/chosen": -0.7428877353668213, "logits/rejected": -0.7949348092079163, "logps/chosen": -72.76325988769531, "logps/rejected": -71.17098236083984, "loss": 14.5415, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.015930796042084694, "rewards/margins": -0.05372260883450508, "rewards/rejected": 0.06965340673923492, "step": 805 }, { "epoch": 0.4681419527211477, "grad_norm": 323.96905517578125, "learning_rate": 3.829169087739687e-06, "logits/chosen": -0.8501350283622742, "logits/rejected": -0.8383558392524719, "logps/chosen": -71.15087127685547, "logps/rejected": -70.85906982421875, "loss": 14.7471, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.09234993904829025, "rewards/margins": -0.0763244479894638, "rewards/rejected": -0.016025494784116745, "step": 806 }, { "epoch": 0.46872277400243945, "grad_norm": 336.2712097167969, "learning_rate": 3.827716443927949e-06, "logits/chosen": -0.8244204521179199, "logits/rejected": -0.8803207278251648, "logps/chosen": -86.31137084960938, "logps/rejected": -71.70460510253906, "loss": 14.0914, "rewards/accuracies": 0.5, "rewards/chosen": -0.029995271936058998, "rewards/margins": -0.010159234516322613, "rewards/rejected": -0.01983603462576866, "step": 807 }, { "epoch": 0.4693035952837312, "grad_norm": 286.52447509765625, "learning_rate": 3.826263800116211e-06, "logits/chosen": -0.8492335081100464, "logits/rejected": -0.9490715265274048, "logps/chosen": -67.36164855957031, "logps/rejected": -68.58895111083984, "loss": 13.456, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0037143989466130733, "rewards/margins": 0.05153452232480049, "rewards/rejected": -0.05524892359972, "step": 808 }, { "epoch": 0.46988441656502294, "grad_norm": 335.1893005371094, "learning_rate": 3.824811156304474e-06, "logits/chosen": -0.8309356570243835, "logits/rejected": -0.8245996236801147, "logps/chosen": -72.35955047607422, "logps/rejected": -76.44219970703125, "loss": 14.0092, "rewards/accuracies": 0.5, "rewards/chosen": -0.006397458724677563, "rewards/margins": -0.003385643707588315, "rewards/rejected": -0.003011818276718259, "step": 809 }, { "epoch": 0.47046523784631467, "grad_norm": 326.7082824707031, "learning_rate": 3.823358512492737e-06, "logits/chosen": -0.6885795593261719, "logits/rejected": -0.7032235860824585, "logps/chosen": -74.98950958251953, "logps/rejected": -68.4511947631836, "loss": 13.7991, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.007976588793098927, "rewards/margins": 0.016375992447137833, "rewards/rejected": -0.024352580308914185, "step": 810 }, { "epoch": 0.47104605912760644, "grad_norm": 336.2662048339844, "learning_rate": 3.8219058686809996e-06, "logits/chosen": -0.7334426641464233, "logits/rejected": -0.8731748461723328, "logps/chosen": -78.27384948730469, "logps/rejected": -82.20344543457031, "loss": 14.1666, "rewards/accuracies": 0.5, "rewards/chosen": -0.01889634132385254, "rewards/margins": -0.0168316587805748, "rewards/rejected": -0.0020646885968744755, "step": 811 }, { "epoch": 0.47162688040889816, "grad_norm": 317.0302734375, "learning_rate": 3.820453224869262e-06, "logits/chosen": -0.7193567752838135, "logits/rejected": -0.7776497006416321, "logps/chosen": -81.74263000488281, "logps/rejected": -81.36003875732422, "loss": 13.3265, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.033215902745723724, "rewards/margins": 0.10018181800842285, "rewards/rejected": -0.06696590781211853, "step": 812 }, { "epoch": 0.47220770169018994, "grad_norm": 308.6961975097656, "learning_rate": 3.819000581057525e-06, "logits/chosen": -0.7122281193733215, "logits/rejected": -0.7115441560745239, "logps/chosen": -75.91162872314453, "logps/rejected": -70.84449768066406, "loss": 13.6834, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.016342181712388992, "rewards/margins": 0.0316503569483757, "rewards/rejected": -0.015308169648051262, "step": 813 }, { "epoch": 0.47278852297148166, "grad_norm": 307.2214050292969, "learning_rate": 3.817547937245788e-06, "logits/chosen": -0.8540679812431335, "logits/rejected": -0.9650880694389343, "logps/chosen": -69.38043975830078, "logps/rejected": -89.74822235107422, "loss": 13.2044, "rewards/accuracies": 0.75, "rewards/chosen": 0.05181754380464554, "rewards/margins": 0.07774057984352112, "rewards/rejected": -0.02592303231358528, "step": 814 }, { "epoch": 0.47336934425277344, "grad_norm": 310.0686340332031, "learning_rate": 3.81609529343405e-06, "logits/chosen": -0.8766164779663086, "logits/rejected": -0.7675495147705078, "logps/chosen": -75.18499755859375, "logps/rejected": -70.85122680664062, "loss": 13.5494, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.037073832005262375, "rewards/margins": 0.04066385701298714, "rewards/rejected": -0.0035900219809263945, "step": 815 }, { "epoch": 0.47395016553406516, "grad_norm": 319.04913330078125, "learning_rate": 3.814642649622313e-06, "logits/chosen": -0.773638904094696, "logits/rejected": -0.688806414604187, "logps/chosen": -69.4584732055664, "logps/rejected": -78.58277130126953, "loss": 13.5887, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.04278705641627312, "rewards/margins": 0.03746388480067253, "rewards/rejected": 0.005323170684278011, "step": 816 }, { "epoch": 0.47453098681535694, "grad_norm": 314.92266845703125, "learning_rate": 3.813190005810575e-06, "logits/chosen": -0.8834367990493774, "logits/rejected": -0.892521858215332, "logps/chosen": -77.84300231933594, "logps/rejected": -79.00919342041016, "loss": 13.2775, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.016293682157993317, "rewards/margins": 0.06721033900976181, "rewards/rejected": -0.050916653126478195, "step": 817 }, { "epoch": 0.47511180809664866, "grad_norm": 293.46875, "learning_rate": 3.811737361998838e-06, "logits/chosen": -0.9297618865966797, "logits/rejected": -0.944604218006134, "logps/chosen": -79.2077407836914, "logps/rejected": -73.26544952392578, "loss": 13.6234, "rewards/accuracies": 0.5, "rewards/chosen": -0.0014585237950086594, "rewards/margins": 0.03561336174607277, "rewards/rejected": -0.037071891129016876, "step": 818 }, { "epoch": 0.47569262937794043, "grad_norm": 377.02056884765625, "learning_rate": 3.810284718187101e-06, "logits/chosen": -0.7812774777412415, "logits/rejected": -0.8613381385803223, "logps/chosen": -72.1882553100586, "logps/rejected": -74.88095092773438, "loss": 13.8795, "rewards/accuracies": 0.5, "rewards/chosen": -0.0032719247974455357, "rewards/margins": 0.010365369729697704, "rewards/rejected": -0.013637298718094826, "step": 819 }, { "epoch": 0.47627345065923216, "grad_norm": 316.15924072265625, "learning_rate": 3.808832074375364e-06, "logits/chosen": -0.8011975288391113, "logits/rejected": -0.9280617833137512, "logps/chosen": -73.37976837158203, "logps/rejected": -81.17606353759766, "loss": 13.2769, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.005650301463901997, "rewards/margins": 0.08938013017177582, "rewards/rejected": -0.0837298333644867, "step": 820 }, { "epoch": 0.4768542719405239, "grad_norm": 337.64898681640625, "learning_rate": 3.807379430563626e-06, "logits/chosen": -0.6541992425918579, "logits/rejected": -0.6755486130714417, "logps/chosen": -68.98448181152344, "logps/rejected": -71.53800201416016, "loss": 14.094, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.021977489814162254, "rewards/margins": -0.01157850306481123, "rewards/rejected": -0.010398988611996174, "step": 821 }, { "epoch": 0.47743509322181565, "grad_norm": 395.7552795410156, "learning_rate": 3.805926786751889e-06, "logits/chosen": -0.8055717349052429, "logits/rejected": -0.7719739079475403, "logps/chosen": -78.54650115966797, "logps/rejected": -69.30370330810547, "loss": 14.2552, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.04533190652728081, "rewards/margins": -0.03052378073334694, "rewards/rejected": -0.014808130450546741, "step": 822 }, { "epoch": 0.4780159145031074, "grad_norm": 329.39019775390625, "learning_rate": 3.8044741429401517e-06, "logits/chosen": -0.9106775522232056, "logits/rejected": -0.9728819131851196, "logps/chosen": -75.94145202636719, "logps/rejected": -72.80001831054688, "loss": 14.326, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.04924512654542923, "rewards/margins": -0.034947142004966736, "rewards/rejected": -0.014297977089881897, "step": 823 }, { "epoch": 0.47859673578439915, "grad_norm": 311.20428466796875, "learning_rate": 3.8030214991284144e-06, "logits/chosen": -0.7255929708480835, "logits/rejected": -0.8986749649047852, "logps/chosen": -72.01505279541016, "logps/rejected": -78.78022766113281, "loss": 13.9084, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.033934831619262695, "rewards/margins": 0.015138444490730762, "rewards/rejected": -0.04907327890396118, "step": 824 }, { "epoch": 0.4791775570656909, "grad_norm": 350.76275634765625, "learning_rate": 3.8015688553166768e-06, "logits/chosen": -0.8962694406509399, "logits/rejected": -0.9466017484664917, "logps/chosen": -77.05352783203125, "logps/rejected": -73.25821685791016, "loss": 14.4856, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.09311369061470032, "rewards/margins": -0.03986470773816109, "rewards/rejected": -0.05324899032711983, "step": 825 }, { "epoch": 0.47975837834698265, "grad_norm": 331.6654968261719, "learning_rate": 3.8001162115049395e-06, "logits/chosen": -0.8453457951545715, "logits/rejected": -0.8091884851455688, "logps/chosen": -75.66307067871094, "logps/rejected": -78.55387115478516, "loss": 13.6351, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.022285010665655136, "rewards/margins": 0.04095644876360893, "rewards/rejected": -0.018671434372663498, "step": 826 }, { "epoch": 0.48033919962827437, "grad_norm": 319.9637451171875, "learning_rate": 3.7986635676932023e-06, "logits/chosen": -0.7850121259689331, "logits/rejected": -0.887397289276123, "logps/chosen": -63.03386688232422, "logps/rejected": -72.07405090332031, "loss": 13.5181, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0016908079851418734, "rewards/margins": 0.0456506684422493, "rewards/rejected": -0.043959856033325195, "step": 827 }, { "epoch": 0.48092002090956615, "grad_norm": 335.4803771972656, "learning_rate": 3.7972109238814646e-06, "logits/chosen": -0.8599593043327332, "logits/rejected": -0.8137737512588501, "logps/chosen": -83.40064239501953, "logps/rejected": -74.57695007324219, "loss": 14.3082, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.014900142326951027, "rewards/margins": -0.03146820515394211, "rewards/rejected": 0.04636834189295769, "step": 828 }, { "epoch": 0.48150084219085787, "grad_norm": 341.3481750488281, "learning_rate": 3.7957582800697273e-06, "logits/chosen": -0.8422654867172241, "logits/rejected": -0.9839199185371399, "logps/chosen": -82.99832916259766, "logps/rejected": -72.62747192382812, "loss": 13.2486, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0003313060733489692, "rewards/margins": 0.07307750731706619, "rewards/rejected": -0.07274620234966278, "step": 829 }, { "epoch": 0.48208166347214965, "grad_norm": 324.2658386230469, "learning_rate": 3.79430563625799e-06, "logits/chosen": -0.9340323209762573, "logits/rejected": -0.8132963180541992, "logps/chosen": -76.088623046875, "logps/rejected": -78.1231689453125, "loss": 13.8514, "rewards/accuracies": 0.5, "rewards/chosen": -0.044568758457899094, "rewards/margins": 0.016439538449048996, "rewards/rejected": -0.061008304357528687, "step": 830 }, { "epoch": 0.48266248475344137, "grad_norm": 599.396240234375, "learning_rate": 3.7928529924462524e-06, "logits/chosen": -0.9430292248725891, "logits/rejected": -1.02345871925354, "logps/chosen": -72.04682922363281, "logps/rejected": -71.67384338378906, "loss": 13.8161, "rewards/accuracies": 0.5, "rewards/chosen": 0.021543148905038834, "rewards/margins": 0.017365697771310806, "rewards/rejected": 0.004177444148808718, "step": 831 }, { "epoch": 0.4832433060347331, "grad_norm": 385.9155578613281, "learning_rate": 3.791400348634515e-06, "logits/chosen": -0.8752473592758179, "logits/rejected": -0.8917462229728699, "logps/chosen": -74.92131042480469, "logps/rejected": -85.57743835449219, "loss": 13.2866, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.053486812859773636, "rewards/margins": 0.06886574625968933, "rewards/rejected": -0.01537893433123827, "step": 832 }, { "epoch": 0.48382412731602487, "grad_norm": 292.564697265625, "learning_rate": 3.789947704822778e-06, "logits/chosen": -0.8002208471298218, "logits/rejected": -0.7895336151123047, "logps/chosen": -71.14689636230469, "logps/rejected": -73.53022766113281, "loss": 13.5167, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.02734098769724369, "rewards/margins": 0.043625812977552414, "rewards/rejected": -0.016284827142953873, "step": 833 }, { "epoch": 0.4844049485973166, "grad_norm": 319.7361755371094, "learning_rate": 3.7884950610110407e-06, "logits/chosen": -0.7712616920471191, "logits/rejected": -0.727273166179657, "logps/chosen": -88.34223937988281, "logps/rejected": -80.57120513916016, "loss": 13.1972, "rewards/accuracies": 0.75, "rewards/chosen": 0.04582667723298073, "rewards/margins": 0.07808013260364532, "rewards/rejected": -0.03225346654653549, "step": 834 }, { "epoch": 0.48498576987860836, "grad_norm": 325.38385009765625, "learning_rate": 3.787042417199303e-06, "logits/chosen": -0.9470648765563965, "logits/rejected": -0.9024521708488464, "logps/chosen": -72.9203872680664, "logps/rejected": -80.79722595214844, "loss": 13.8307, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.028173645958304405, "rewards/margins": 0.016614312306046486, "rewards/rejected": -0.04478795826435089, "step": 835 }, { "epoch": 0.4855665911599001, "grad_norm": 346.70098876953125, "learning_rate": 3.7855897733875657e-06, "logits/chosen": -0.9459084272384644, "logits/rejected": -0.9673193693161011, "logps/chosen": -70.69756317138672, "logps/rejected": -76.44866943359375, "loss": 13.1924, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.01108443271368742, "rewards/margins": 0.0785098746418953, "rewards/rejected": -0.0674254447221756, "step": 836 }, { "epoch": 0.48614741244119186, "grad_norm": 346.8296813964844, "learning_rate": 3.7841371295758285e-06, "logits/chosen": -0.9128061532974243, "logits/rejected": -0.8887417912483215, "logps/chosen": -87.18028259277344, "logps/rejected": -85.31761169433594, "loss": 14.1875, "rewards/accuracies": 0.5, "rewards/chosen": -0.070538230240345, "rewards/margins": -0.020030764862895012, "rewards/rejected": -0.050507474690675735, "step": 837 }, { "epoch": 0.4867282337224836, "grad_norm": 308.1683349609375, "learning_rate": 3.782684485764091e-06, "logits/chosen": -0.8262773752212524, "logits/rejected": -0.7639673948287964, "logps/chosen": -76.27080535888672, "logps/rejected": -80.69708251953125, "loss": 13.3472, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.01220276765525341, "rewards/margins": 0.06231879070401192, "rewards/rejected": -0.050116024911403656, "step": 838 }, { "epoch": 0.48730905500377536, "grad_norm": 327.6427001953125, "learning_rate": 3.7812318419523535e-06, "logits/chosen": -0.7329220175743103, "logits/rejected": -0.8371773958206177, "logps/chosen": -80.13806915283203, "logps/rejected": -85.14008331298828, "loss": 12.5923, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.040527790784835815, "rewards/margins": 0.15911123156547546, "rewards/rejected": -0.11858340352773666, "step": 839 }, { "epoch": 0.4878898762850671, "grad_norm": 347.03125, "learning_rate": 3.7797791981406163e-06, "logits/chosen": -0.842019259929657, "logits/rejected": -0.8081402778625488, "logps/chosen": -85.04810333251953, "logps/rejected": -70.10143280029297, "loss": 13.535, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.05649472028017044, "rewards/margins": 0.04177948087453842, "rewards/rejected": 0.014715233817696571, "step": 840 }, { "epoch": 0.48847069756635886, "grad_norm": 343.4723205566406, "learning_rate": 3.778326554328879e-06, "logits/chosen": -0.9070305824279785, "logits/rejected": -0.8659006953239441, "logps/chosen": -75.98943328857422, "logps/rejected": -72.68668365478516, "loss": 14.1313, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.046057023108005524, "rewards/margins": -0.016837935894727707, "rewards/rejected": -0.029219087213277817, "step": 841 }, { "epoch": 0.4890515188476506, "grad_norm": 297.91790771484375, "learning_rate": 3.7768739105171414e-06, "logits/chosen": -0.8112656474113464, "logits/rejected": -0.7686861753463745, "logps/chosen": -71.14762115478516, "logps/rejected": -66.70329284667969, "loss": 13.8801, "rewards/accuracies": 0.5, "rewards/chosen": 0.013187101110816002, "rewards/margins": 0.013149671256542206, "rewards/rejected": 3.743171691894531e-05, "step": 842 }, { "epoch": 0.4896323401289423, "grad_norm": 324.6368408203125, "learning_rate": 3.775421266705404e-06, "logits/chosen": -0.7056946754455566, "logits/rejected": -0.7951919436454773, "logps/chosen": -78.37447357177734, "logps/rejected": -74.64752960205078, "loss": 13.5777, "rewards/accuracies": 0.5, "rewards/chosen": -0.004278040025383234, "rewards/margins": 0.03796951100230217, "rewards/rejected": -0.042247556149959564, "step": 843 }, { "epoch": 0.4902131614102341, "grad_norm": 300.9444885253906, "learning_rate": 3.773968622893667e-06, "logits/chosen": -1.035330891609192, "logits/rejected": -1.0094799995422363, "logps/chosen": -72.4145278930664, "logps/rejected": -69.17391204833984, "loss": 13.6103, "rewards/accuracies": 0.5, "rewards/chosen": -0.0112453643232584, "rewards/margins": 0.031555455178022385, "rewards/rejected": -0.042800821363925934, "step": 844 }, { "epoch": 0.4907939826915258, "grad_norm": 314.42364501953125, "learning_rate": 3.772515979081929e-06, "logits/chosen": -0.8987109065055847, "logits/rejected": -0.771704912185669, "logps/chosen": -76.92586517333984, "logps/rejected": -70.66104888916016, "loss": 14.2906, "rewards/accuracies": 0.5, "rewards/chosen": -0.0432976633310318, "rewards/margins": -0.030374759808182716, "rewards/rejected": -0.012922905385494232, "step": 845 }, { "epoch": 0.4913748039728176, "grad_norm": 293.44970703125, "learning_rate": 3.771063335270192e-06, "logits/chosen": -0.8815056085586548, "logits/rejected": -0.8597631454467773, "logps/chosen": -69.40690612792969, "logps/rejected": -73.88740539550781, "loss": 13.5889, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.031432799994945526, "rewards/margins": 0.03864104300737381, "rewards/rejected": -0.007208243012428284, "step": 846 }, { "epoch": 0.4919556252541093, "grad_norm": 293.9603576660156, "learning_rate": 3.7696106914584547e-06, "logits/chosen": -0.7362099885940552, "logits/rejected": -0.7803904414176941, "logps/chosen": -74.73085021972656, "logps/rejected": -70.69336700439453, "loss": 13.1999, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.019457781687378883, "rewards/margins": 0.08030074834823608, "rewards/rejected": -0.060842953622341156, "step": 847 }, { "epoch": 0.4925364465354011, "grad_norm": 317.520751953125, "learning_rate": 3.768158047646717e-06, "logits/chosen": -0.8254894018173218, "logits/rejected": -0.8281943202018738, "logps/chosen": -71.23345184326172, "logps/rejected": -71.3423080444336, "loss": 14.302, "rewards/accuracies": 0.25, "rewards/chosen": -0.028336822986602783, "rewards/margins": -0.034830138087272644, "rewards/rejected": 0.006493322551250458, "step": 848 }, { "epoch": 0.4931172678166928, "grad_norm": 339.6095275878906, "learning_rate": 3.7667054038349798e-06, "logits/chosen": -0.786754846572876, "logits/rejected": -0.8182505369186401, "logps/chosen": -75.18721771240234, "logps/rejected": -77.47499084472656, "loss": 14.7391, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.06475405395030975, "rewards/margins": -0.06797143071889877, "rewards/rejected": 0.0032173804938793182, "step": 849 }, { "epoch": 0.49369808909798457, "grad_norm": 318.309326171875, "learning_rate": 3.7652527600232425e-06, "logits/chosen": -0.8893791437149048, "logits/rejected": -0.8710716366767883, "logps/chosen": -74.41831970214844, "logps/rejected": -73.89128875732422, "loss": 13.2277, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.0054091643542051315, "rewards/margins": 0.07398828119039536, "rewards/rejected": -0.06857912242412567, "step": 850 }, { "epoch": 0.4942789103792763, "grad_norm": 315.2265930175781, "learning_rate": 3.7638001162115053e-06, "logits/chosen": -0.7621113657951355, "logits/rejected": -0.81293123960495, "logps/chosen": -78.86296081542969, "logps/rejected": -81.82852172851562, "loss": 13.244, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.03520115092396736, "rewards/margins": 0.07237912714481354, "rewards/rejected": -0.03717798367142677, "step": 851 }, { "epoch": 0.49485973166056807, "grad_norm": 296.0227355957031, "learning_rate": 3.7623474723997676e-06, "logits/chosen": -0.862707257270813, "logits/rejected": -0.7818862795829773, "logps/chosen": -72.54582977294922, "logps/rejected": -71.32649230957031, "loss": 12.8667, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.05158460885286331, "rewards/margins": 0.11155477911233902, "rewards/rejected": -0.05997015908360481, "step": 852 }, { "epoch": 0.4954405529418598, "grad_norm": 299.976806640625, "learning_rate": 3.7608948285880303e-06, "logits/chosen": -0.803802490234375, "logits/rejected": -0.9642475843429565, "logps/chosen": -71.30755615234375, "logps/rejected": -69.78184509277344, "loss": 13.1189, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.03701017424464226, "rewards/margins": 0.11028815805912018, "rewards/rejected": -0.07327798753976822, "step": 853 }, { "epoch": 0.4960213742231515, "grad_norm": 306.97320556640625, "learning_rate": 3.759442184776293e-06, "logits/chosen": -0.7374328374862671, "logits/rejected": -0.7047148942947388, "logps/chosen": -79.32594299316406, "logps/rejected": -76.9816665649414, "loss": 13.0718, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.031373415142297745, "rewards/margins": 0.10228639841079712, "rewards/rejected": -0.07091299444437027, "step": 854 }, { "epoch": 0.4966021955044433, "grad_norm": 298.43389892578125, "learning_rate": 3.7579895409645554e-06, "logits/chosen": -0.8767411112785339, "logits/rejected": -0.7835893630981445, "logps/chosen": -71.76991271972656, "logps/rejected": -82.6242904663086, "loss": 13.2089, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.024014713242650032, "rewards/margins": 0.07497559487819672, "rewards/rejected": -0.05096089839935303, "step": 855 }, { "epoch": 0.497183016785735, "grad_norm": 297.3446960449219, "learning_rate": 3.756536897152818e-06, "logits/chosen": -0.7374379634857178, "logits/rejected": -0.705323338508606, "logps/chosen": -72.24920654296875, "logps/rejected": -71.93132019042969, "loss": 13.2626, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.018467990681529045, "rewards/margins": 0.070315420627594, "rewards/rejected": -0.0518474280834198, "step": 856 }, { "epoch": 0.4977638380670268, "grad_norm": 330.3368225097656, "learning_rate": 3.755084253341081e-06, "logits/chosen": -0.7391015291213989, "logits/rejected": -0.8123418688774109, "logps/chosen": -73.87885284423828, "logps/rejected": -79.49480438232422, "loss": 13.2954, "rewards/accuracies": 0.5, "rewards/chosen": 0.021732434630393982, "rewards/margins": 0.0644746795296669, "rewards/rejected": -0.04274224489927292, "step": 857 }, { "epoch": 0.4983446593483185, "grad_norm": 311.4170227050781, "learning_rate": 3.7536316095293436e-06, "logits/chosen": -0.7112471461296082, "logits/rejected": -0.743118166923523, "logps/chosen": -73.13099670410156, "logps/rejected": -73.69615173339844, "loss": 13.587, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.03787677735090256, "rewards/margins": 0.04251720756292343, "rewards/rejected": -0.004640430212020874, "step": 858 }, { "epoch": 0.4989254806296103, "grad_norm": 285.54547119140625, "learning_rate": 3.752178965717606e-06, "logits/chosen": -0.805162250995636, "logits/rejected": -0.8250443339347839, "logps/chosen": -70.72074127197266, "logps/rejected": -70.56238555908203, "loss": 12.6761, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.048637259751558304, "rewards/margins": 0.1373429000377655, "rewards/rejected": -0.0887056440114975, "step": 859 }, { "epoch": 0.499506301910902, "grad_norm": 294.9131774902344, "learning_rate": 3.7507263219058687e-06, "logits/chosen": -0.909680962562561, "logits/rejected": -0.8498638272285461, "logps/chosen": -66.12528991699219, "logps/rejected": -68.66703796386719, "loss": 13.6062, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.03878556936979294, "rewards/margins": 0.03481757268309593, "rewards/rejected": 0.003967995289713144, "step": 860 }, { "epoch": 0.5000871231921937, "grad_norm": 347.707275390625, "learning_rate": 3.749273678094132e-06, "logits/chosen": -0.8227556943893433, "logits/rejected": -0.799028754234314, "logps/chosen": -76.69181823730469, "logps/rejected": -73.48719787597656, "loss": 13.869, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.02884422242641449, "rewards/margins": 0.014967566356062889, "rewards/rejected": -0.04381179064512253, "step": 861 }, { "epoch": 0.5006679444734855, "grad_norm": 310.5048828125, "learning_rate": 3.7478210342823946e-06, "logits/chosen": -0.6707647442817688, "logits/rejected": -0.6352334022521973, "logps/chosen": -67.49751281738281, "logps/rejected": -72.48457336425781, "loss": 13.9948, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.03386518359184265, "rewards/margins": 0.0032462819945067167, "rewards/rejected": -0.03711146116256714, "step": 862 }, { "epoch": 0.5012487657547773, "grad_norm": 361.6993713378906, "learning_rate": 3.746368390470657e-06, "logits/chosen": -0.8487448692321777, "logits/rejected": -0.7983860373497009, "logps/chosen": -70.15946960449219, "logps/rejected": -77.11515808105469, "loss": 13.3064, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.029017318040132523, "rewards/margins": 0.07312078028917313, "rewards/rejected": -0.0441034696996212, "step": 863 }, { "epoch": 0.501829587036069, "grad_norm": 317.2196960449219, "learning_rate": 3.7449157466589197e-06, "logits/chosen": -0.693240761756897, "logits/rejected": -0.6469998359680176, "logps/chosen": -71.18406677246094, "logps/rejected": -75.88301086425781, "loss": 13.6113, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.03340126946568489, "rewards/margins": 0.03463831916451454, "rewards/rejected": -0.06803958117961884, "step": 864 }, { "epoch": 0.5024104083173607, "grad_norm": 314.50555419921875, "learning_rate": 3.7434631028471825e-06, "logits/chosen": -0.8128454089164734, "logits/rejected": -0.7394827604293823, "logps/chosen": -70.54247283935547, "logps/rejected": -70.06904602050781, "loss": 15.048, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.027236783877015114, "rewards/margins": -0.09788013994693756, "rewards/rejected": 0.070643350481987, "step": 865 }, { "epoch": 0.5029912295986525, "grad_norm": 312.5657653808594, "learning_rate": 3.742010459035445e-06, "logits/chosen": -0.9018377065658569, "logits/rejected": -0.9190446138381958, "logps/chosen": -69.02962493896484, "logps/rejected": -70.04609680175781, "loss": 13.2226, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.012089548632502556, "rewards/margins": 0.07575584948062897, "rewards/rejected": -0.08784539997577667, "step": 866 }, { "epoch": 0.5035720508799443, "grad_norm": 330.9731140136719, "learning_rate": 3.7405578152237075e-06, "logits/chosen": -0.8004295229911804, "logits/rejected": -0.8180350065231323, "logps/chosen": -73.49920654296875, "logps/rejected": -85.6948013305664, "loss": 13.8988, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.05013388395309448, "rewards/margins": 0.010029042139649391, "rewards/rejected": -0.060162924230098724, "step": 867 }, { "epoch": 0.5041528721612359, "grad_norm": 280.4370422363281, "learning_rate": 3.7391051714119703e-06, "logits/chosen": -0.9341899752616882, "logits/rejected": -0.8376950025558472, "logps/chosen": -71.9174575805664, "logps/rejected": -73.74215698242188, "loss": 13.1955, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.036871034651994705, "rewards/margins": 0.07967637479305267, "rewards/rejected": -0.04280534014105797, "step": 868 }, { "epoch": 0.5047336934425277, "grad_norm": 287.02764892578125, "learning_rate": 3.737652527600233e-06, "logits/chosen": -0.817496657371521, "logits/rejected": -0.7701882123947144, "logps/chosen": -76.1856460571289, "logps/rejected": -78.75908660888672, "loss": 13.4908, "rewards/accuracies": 0.75, "rewards/chosen": 0.014233636669814587, "rewards/margins": 0.05255434662103653, "rewards/rejected": -0.038320716470479965, "step": 869 }, { "epoch": 0.5053145147238195, "grad_norm": 321.85247802734375, "learning_rate": 3.7361998837884954e-06, "logits/chosen": -0.8572362661361694, "logits/rejected": -0.8262847661972046, "logps/chosen": -72.18296813964844, "logps/rejected": -73.68527221679688, "loss": 13.3539, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.023406734690070152, "rewards/margins": 0.07013611495494843, "rewards/rejected": -0.04672938585281372, "step": 870 }, { "epoch": 0.5058953360051113, "grad_norm": 318.94415283203125, "learning_rate": 3.734747239976758e-06, "logits/chosen": -0.7708402872085571, "logits/rejected": -0.8278223276138306, "logps/chosen": -75.5167236328125, "logps/rejected": -82.86116027832031, "loss": 13.5957, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.018369678407907486, "rewards/margins": 0.034253206104040146, "rewards/rejected": -0.01588352955877781, "step": 871 }, { "epoch": 0.5064761572864029, "grad_norm": 315.12384033203125, "learning_rate": 3.733294596165021e-06, "logits/chosen": -0.8158319592475891, "logits/rejected": -0.7617601156234741, "logps/chosen": -65.91107940673828, "logps/rejected": -74.64680480957031, "loss": 13.6166, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.021057626232504845, "rewards/margins": 0.0367097444832325, "rewards/rejected": -0.057767368853092194, "step": 872 }, { "epoch": 0.5070569785676947, "grad_norm": 289.8279113769531, "learning_rate": 3.7318419523532836e-06, "logits/chosen": -1.12114679813385, "logits/rejected": -0.9899064898490906, "logps/chosen": -67.73985290527344, "logps/rejected": -74.02053833007812, "loss": 13.2951, "rewards/accuracies": 0.75, "rewards/chosen": 0.018153967335820198, "rewards/margins": 0.06531564891338348, "rewards/rejected": -0.04716167598962784, "step": 873 }, { "epoch": 0.5076377998489865, "grad_norm": 291.0061340332031, "learning_rate": 3.730389308541546e-06, "logits/chosen": -0.8384881019592285, "logits/rejected": -0.8821040987968445, "logps/chosen": -70.64179992675781, "logps/rejected": -74.30955505371094, "loss": 13.453, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.009448887780308723, "rewards/margins": 0.05563428997993469, "rewards/rejected": -0.04618540033698082, "step": 874 }, { "epoch": 0.5082186211302782, "grad_norm": 309.33905029296875, "learning_rate": 3.7289366647298087e-06, "logits/chosen": -0.7903847098350525, "logits/rejected": -0.8384620547294617, "logps/chosen": -78.0987548828125, "logps/rejected": -80.98674011230469, "loss": 12.9163, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.004122171550989151, "rewards/margins": 0.10959751904010773, "rewards/rejected": -0.10547534376382828, "step": 875 }, { "epoch": 0.5087994424115699, "grad_norm": 297.92266845703125, "learning_rate": 3.7274840209180714e-06, "logits/chosen": -0.9015482068061829, "logits/rejected": -0.8714168667793274, "logps/chosen": -73.52940368652344, "logps/rejected": -77.55482482910156, "loss": 12.4707, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.08103947341442108, "rewards/margins": 0.15564516186714172, "rewards/rejected": -0.07460571825504303, "step": 876 }, { "epoch": 0.5093802636928617, "grad_norm": 269.5519714355469, "learning_rate": 3.7260313771063337e-06, "logits/chosen": -0.7991993427276611, "logits/rejected": -0.7887776494026184, "logps/chosen": -70.71697998046875, "logps/rejected": -70.16976928710938, "loss": 13.1393, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.06928080320358276, "rewards/margins": 0.08986689150333405, "rewards/rejected": -0.02058609202504158, "step": 877 }, { "epoch": 0.5099610849741535, "grad_norm": 323.81719970703125, "learning_rate": 3.7245787332945965e-06, "logits/chosen": -0.6992955803871155, "logits/rejected": -0.8329144716262817, "logps/chosen": -76.07032012939453, "logps/rejected": -72.76537322998047, "loss": 13.1797, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.039288222789764404, "rewards/margins": 0.08043770492076874, "rewards/rejected": -0.11972592771053314, "step": 878 }, { "epoch": 0.5105419062554452, "grad_norm": 282.5196228027344, "learning_rate": 3.7231260894828592e-06, "logits/chosen": -0.8015006184577942, "logits/rejected": -0.8797491788864136, "logps/chosen": -68.26971435546875, "logps/rejected": -71.91568756103516, "loss": 12.5433, "rewards/accuracies": 0.75, "rewards/chosen": 0.09241674095392227, "rewards/margins": 0.15844053030014038, "rewards/rejected": -0.0660238116979599, "step": 879 }, { "epoch": 0.5111227275367369, "grad_norm": 313.0185852050781, "learning_rate": 3.7216734456711216e-06, "logits/chosen": -0.7841866612434387, "logits/rejected": -0.7923185229301453, "logps/chosen": -76.68905639648438, "logps/rejected": -71.82386779785156, "loss": 14.0049, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.03983437269926071, "rewards/margins": -0.002283641602844, "rewards/rejected": -0.03755073621869087, "step": 880 }, { "epoch": 0.5117035488180287, "grad_norm": 304.0037536621094, "learning_rate": 3.7202208018593843e-06, "logits/chosen": -0.8578149676322937, "logits/rejected": -0.8352439999580383, "logps/chosen": -71.64418029785156, "logps/rejected": -68.3890151977539, "loss": 13.8305, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.005443647503852844, "rewards/margins": 0.013478374108672142, "rewards/rejected": -0.018922025337815285, "step": 881 }, { "epoch": 0.5122843700993205, "grad_norm": 310.70489501953125, "learning_rate": 3.718768158047647e-06, "logits/chosen": -0.8352943658828735, "logits/rejected": -0.8491352200508118, "logps/chosen": -75.1880111694336, "logps/rejected": -82.05909729003906, "loss": 12.9929, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.028002599254250526, "rewards/margins": 0.104197658598423, "rewards/rejected": -0.13220027089118958, "step": 882 }, { "epoch": 0.5128651913806122, "grad_norm": 312.11712646484375, "learning_rate": 3.71731551423591e-06, "logits/chosen": -0.7563623785972595, "logits/rejected": -0.8587571978569031, "logps/chosen": -74.63130187988281, "logps/rejected": -69.7817153930664, "loss": 14.3503, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.003069313708692789, "rewards/margins": -0.03878450393676758, "rewards/rejected": 0.04185382276773453, "step": 883 }, { "epoch": 0.5134460126619039, "grad_norm": 317.8841247558594, "learning_rate": 3.715862870424172e-06, "logits/chosen": -0.8039946556091309, "logits/rejected": -0.9972225427627563, "logps/chosen": -85.01696014404297, "logps/rejected": -66.03048706054688, "loss": 14.4587, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.07630066573619843, "rewards/margins": -0.04431800916790962, "rewards/rejected": -0.0319826602935791, "step": 884 }, { "epoch": 0.5140268339431957, "grad_norm": 341.647705078125, "learning_rate": 3.714410226612435e-06, "logits/chosen": -0.8729516863822937, "logits/rejected": -0.8806362152099609, "logps/chosen": -74.71580505371094, "logps/rejected": -74.4959487915039, "loss": 14.1933, "rewards/accuracies": 0.5, "rewards/chosen": -0.062128596007823944, "rewards/margins": -0.019561443477869034, "rewards/rejected": -0.04256715252995491, "step": 885 }, { "epoch": 0.5146076552244874, "grad_norm": 324.4678955078125, "learning_rate": 3.7129575828006976e-06, "logits/chosen": -0.9463183283805847, "logits/rejected": -0.9414758682250977, "logps/chosen": -76.0389633178711, "logps/rejected": -74.2925033569336, "loss": 13.6209, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0420696847140789, "rewards/margins": 0.036721937358379364, "rewards/rejected": -0.07879161834716797, "step": 886 }, { "epoch": 0.5151884765057791, "grad_norm": 350.20831298828125, "learning_rate": 3.71150493898896e-06, "logits/chosen": -0.7539618015289307, "logits/rejected": -0.6907280683517456, "logps/chosen": -77.5100326538086, "logps/rejected": -68.97132873535156, "loss": 14.1455, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0666489452123642, "rewards/margins": -0.008004192262887955, "rewards/rejected": -0.058644749224185944, "step": 887 }, { "epoch": 0.5157692977870709, "grad_norm": 2863.912353515625, "learning_rate": 3.7100522951772227e-06, "logits/chosen": -0.9735726118087769, "logits/rejected": -0.8810579180717468, "logps/chosen": -75.67332458496094, "logps/rejected": -68.54508209228516, "loss": 14.7704, "rewards/accuracies": 0.25, "rewards/chosen": -0.038121938705444336, "rewards/margins": -0.07279206067323685, "rewards/rejected": 0.034670114517211914, "step": 888 }, { "epoch": 0.5163501190683627, "grad_norm": 294.0743713378906, "learning_rate": 3.7085996513654855e-06, "logits/chosen": -0.9168221354484558, "logits/rejected": -0.7986600399017334, "logps/chosen": -76.08323669433594, "logps/rejected": -67.5433120727539, "loss": 13.0438, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.014614105224609375, "rewards/margins": 0.10053624212741852, "rewards/rejected": -0.08592212200164795, "step": 889 }, { "epoch": 0.5169309403496544, "grad_norm": 322.785400390625, "learning_rate": 3.707147007553748e-06, "logits/chosen": -0.814892590045929, "logits/rejected": -0.8575268983840942, "logps/chosen": -71.9342269897461, "logps/rejected": -86.5703125, "loss": 13.6283, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.013843230903148651, "rewards/margins": 0.03725925087928772, "rewards/rejected": -0.05110248178243637, "step": 890 }, { "epoch": 0.5175117616309461, "grad_norm": 292.9025573730469, "learning_rate": 3.7056943637420105e-06, "logits/chosen": -0.7913263440132141, "logits/rejected": -0.8443421125411987, "logps/chosen": -60.87749481201172, "logps/rejected": -78.35881042480469, "loss": 13.2476, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.04118134453892708, "rewards/margins": 0.07499329000711441, "rewards/rejected": -0.1161746233701706, "step": 891 }, { "epoch": 0.5180925829122379, "grad_norm": 309.80169677734375, "learning_rate": 3.7042417199302733e-06, "logits/chosen": -1.0392483472824097, "logits/rejected": -0.970413088798523, "logps/chosen": -74.8720474243164, "logps/rejected": -75.41184997558594, "loss": 13.5448, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.024524737149477005, "rewards/margins": 0.050737909972667694, "rewards/rejected": -0.075262650847435, "step": 892 }, { "epoch": 0.5186734041935297, "grad_norm": 324.7698059082031, "learning_rate": 3.702789076118536e-06, "logits/chosen": -0.7926565408706665, "logits/rejected": -0.8359763026237488, "logps/chosen": -78.66641235351562, "logps/rejected": -77.9851303100586, "loss": 13.4493, "rewards/accuracies": 0.5, "rewards/chosen": 0.013259848579764366, "rewards/margins": 0.06008158251643181, "rewards/rejected": -0.04682173952460289, "step": 893 }, { "epoch": 0.5192542254748214, "grad_norm": 311.0304260253906, "learning_rate": 3.7013364323067984e-06, "logits/chosen": -0.8284494280815125, "logits/rejected": -0.8559365272521973, "logps/chosen": -76.58866119384766, "logps/rejected": -79.98295593261719, "loss": 13.8651, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.035661615431308746, "rewards/margins": 0.0057185059413313866, "rewards/rejected": -0.04138011857867241, "step": 894 }, { "epoch": 0.5198350467561131, "grad_norm": 309.0633239746094, "learning_rate": 3.699883788495061e-06, "logits/chosen": -0.8562793731689453, "logits/rejected": -0.7222028970718384, "logps/chosen": -72.55845642089844, "logps/rejected": -67.55829620361328, "loss": 13.8677, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.02681596204638481, "rewards/margins": 0.007083225063979626, "rewards/rejected": -0.03389918431639671, "step": 895 }, { "epoch": 0.5204158680374049, "grad_norm": 281.7559509277344, "learning_rate": 3.698431144683324e-06, "logits/chosen": -0.7869559526443481, "logits/rejected": -0.7652812004089355, "logps/chosen": -63.31262969970703, "logps/rejected": -70.62659454345703, "loss": 13.6084, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.033197641372680664, "rewards/margins": 0.037535399198532104, "rewards/rejected": -0.07073303312063217, "step": 896 }, { "epoch": 0.5209966893186966, "grad_norm": 305.7250061035156, "learning_rate": 3.696978500871586e-06, "logits/chosen": -1.0239300727844238, "logits/rejected": -0.8789726495742798, "logps/chosen": -70.31292724609375, "logps/rejected": -72.78204345703125, "loss": 13.9797, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.02144424058496952, "rewards/margins": -0.005210808012634516, "rewards/rejected": 0.0266550425440073, "step": 897 }, { "epoch": 0.5215775105999884, "grad_norm": 343.10711669921875, "learning_rate": 3.695525857059849e-06, "logits/chosen": -0.8156943321228027, "logits/rejected": -0.8041397333145142, "logps/chosen": -81.42640686035156, "logps/rejected": -72.1771469116211, "loss": 14.2155, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.02531343698501587, "rewards/margins": -0.019732346758246422, "rewards/rejected": -0.00558108277618885, "step": 898 }, { "epoch": 0.5221583318812801, "grad_norm": 316.3569641113281, "learning_rate": 3.6940732132481117e-06, "logits/chosen": -0.701055645942688, "logits/rejected": -0.6751523613929749, "logps/chosen": -72.49714660644531, "logps/rejected": -70.53977966308594, "loss": 13.9347, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.007911900989711285, "rewards/margins": 0.0022199698723852634, "rewards/rejected": -0.010131875053048134, "step": 899 }, { "epoch": 0.5227391531625719, "grad_norm": 352.64306640625, "learning_rate": 3.6926205694363744e-06, "logits/chosen": -0.7641804814338684, "logits/rejected": -0.82763671875, "logps/chosen": -77.14866638183594, "logps/rejected": -79.8504867553711, "loss": 14.8528, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.06686311960220337, "rewards/margins": -0.07604999095201492, "rewards/rejected": 0.009186875075101852, "step": 900 }, { "epoch": 0.5233199744438636, "grad_norm": 315.5428466796875, "learning_rate": 3.6911679256246367e-06, "logits/chosen": -0.8268402814865112, "logits/rejected": -0.7936877608299255, "logps/chosen": -75.72247314453125, "logps/rejected": -74.36206817626953, "loss": 14.1353, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.0019198231166228652, "rewards/margins": -0.01289793848991394, "rewards/rejected": 0.010978116653859615, "step": 901 }, { "epoch": 0.5239007957251554, "grad_norm": 303.87799072265625, "learning_rate": 3.6897152818128995e-06, "logits/chosen": -0.9414458274841309, "logits/rejected": -0.8633650541305542, "logps/chosen": -73.24238586425781, "logps/rejected": -81.83255767822266, "loss": 13.4568, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.027762720361351967, "rewards/margins": 0.05169368535280228, "rewards/rejected": -0.02393096126616001, "step": 902 }, { "epoch": 0.5244816170064471, "grad_norm": 336.22467041015625, "learning_rate": 3.6882626380011627e-06, "logits/chosen": -0.7279499173164368, "logits/rejected": -0.9009740948677063, "logps/chosen": -81.50945281982422, "logps/rejected": -75.34597778320312, "loss": 13.8267, "rewards/accuracies": 0.5, "rewards/chosen": -0.023870524019002914, "rewards/margins": 0.016882585361599922, "rewards/rejected": -0.04075310379266739, "step": 903 }, { "epoch": 0.5250624382877389, "grad_norm": 308.542724609375, "learning_rate": 3.6868099941894254e-06, "logits/chosen": -0.8215476274490356, "logits/rejected": -0.8264580965042114, "logps/chosen": -73.51042175292969, "logps/rejected": -70.37979125976562, "loss": 14.331, "rewards/accuracies": 0.5, "rewards/chosen": 0.010108297690749168, "rewards/margins": -0.03372166305780411, "rewards/rejected": 0.043829962611198425, "step": 904 }, { "epoch": 0.5256432595690306, "grad_norm": 374.73370361328125, "learning_rate": 3.6853573503776877e-06, "logits/chosen": -0.7862215638160706, "logits/rejected": -0.6950373649597168, "logps/chosen": -76.63065338134766, "logps/rejected": -72.22526550292969, "loss": 13.453, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.02635444700717926, "rewards/margins": 0.0567445270717144, "rewards/rejected": -0.08309897035360336, "step": 905 }, { "epoch": 0.5262240808503224, "grad_norm": 320.5014343261719, "learning_rate": 3.6839047065659505e-06, "logits/chosen": -0.6966902017593384, "logits/rejected": -0.767865002155304, "logps/chosen": -73.40306854248047, "logps/rejected": -71.08512115478516, "loss": 14.5145, "rewards/accuracies": 0.5, "rewards/chosen": -0.05028662830591202, "rewards/margins": -0.047402046620845795, "rewards/rejected": -0.002884581685066223, "step": 906 }, { "epoch": 0.5268049021316141, "grad_norm": 329.3699951171875, "learning_rate": 3.6824520627542132e-06, "logits/chosen": -0.7144160270690918, "logits/rejected": -0.7741755843162537, "logps/chosen": -75.09828186035156, "logps/rejected": -78.51341247558594, "loss": 14.6293, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.06854270398616791, "rewards/margins": -0.057167064398527145, "rewards/rejected": -0.01137564517557621, "step": 907 }, { "epoch": 0.5273857234129058, "grad_norm": 310.02880859375, "learning_rate": 3.680999418942476e-06, "logits/chosen": -0.8827294111251831, "logits/rejected": -0.969725489616394, "logps/chosen": -72.49862670898438, "logps/rejected": -78.67535400390625, "loss": 13.1686, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.014622239395976067, "rewards/margins": 0.07751540094614029, "rewards/rejected": -0.06289316713809967, "step": 908 }, { "epoch": 0.5279665446941976, "grad_norm": 301.2107849121094, "learning_rate": 3.6795467751307383e-06, "logits/chosen": -0.8012347221374512, "logits/rejected": -0.8275817632675171, "logps/chosen": -72.11550903320312, "logps/rejected": -66.7422103881836, "loss": 13.0052, "rewards/accuracies": 0.75, "rewards/chosen": 0.0879693329334259, "rewards/margins": 0.09242668747901917, "rewards/rejected": -0.004457362927496433, "step": 909 }, { "epoch": 0.5285473659754893, "grad_norm": 327.29351806640625, "learning_rate": 3.678094131319001e-06, "logits/chosen": -0.8897010684013367, "logits/rejected": -0.9195152521133423, "logps/chosen": -75.18798828125, "logps/rejected": -78.10919952392578, "loss": 13.6292, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.00252416729927063, "rewards/margins": 0.035282202064991, "rewards/rejected": -0.03275803476572037, "step": 910 }, { "epoch": 0.5291281872567811, "grad_norm": 572.74853515625, "learning_rate": 3.676641487507264e-06, "logits/chosen": -0.8815191984176636, "logits/rejected": -0.9849497675895691, "logps/chosen": -71.17215728759766, "logps/rejected": -74.537353515625, "loss": 13.3058, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.038278210908174515, "rewards/margins": 0.06560848653316498, "rewards/rejected": -0.02733028493821621, "step": 911 }, { "epoch": 0.5297090085380728, "grad_norm": 365.2557373046875, "learning_rate": 3.675188843695526e-06, "logits/chosen": -0.9522512555122375, "logits/rejected": -1.010914921760559, "logps/chosen": -70.09083557128906, "logps/rejected": -68.78644561767578, "loss": 13.7416, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.05300981551408768, "rewards/margins": 0.02500464953482151, "rewards/rejected": -0.07801447063684464, "step": 912 }, { "epoch": 0.5302898298193646, "grad_norm": 328.8595275878906, "learning_rate": 3.673736199883789e-06, "logits/chosen": -0.9032946825027466, "logits/rejected": -0.8389007449150085, "logps/chosen": -77.11427307128906, "logps/rejected": -67.35809326171875, "loss": 13.7302, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.00886593945324421, "rewards/margins": 0.030330365523695946, "rewards/rejected": -0.03919629380106926, "step": 913 }, { "epoch": 0.5308706511006563, "grad_norm": 298.7828369140625, "learning_rate": 3.6722835560720516e-06, "logits/chosen": -0.6680513620376587, "logits/rejected": -0.7470614910125732, "logps/chosen": -72.6822280883789, "logps/rejected": -72.9788818359375, "loss": 13.5136, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.010395077057182789, "rewards/margins": 0.0404205322265625, "rewards/rejected": -0.030025456100702286, "step": 914 }, { "epoch": 0.5314514723819481, "grad_norm": 293.72021484375, "learning_rate": 3.6708309122603144e-06, "logits/chosen": -0.9234855771064758, "logits/rejected": -0.9707880020141602, "logps/chosen": -73.39376068115234, "logps/rejected": -71.17304992675781, "loss": 12.8332, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.052253782749176025, "rewards/margins": 0.12076272815465927, "rewards/rejected": -0.06850893795490265, "step": 915 }, { "epoch": 0.5320322936632398, "grad_norm": 309.4018249511719, "learning_rate": 3.6693782684485767e-06, "logits/chosen": -0.8161689639091492, "logits/rejected": -0.9288781881332397, "logps/chosen": -82.39122009277344, "logps/rejected": -70.08438873291016, "loss": 13.7639, "rewards/accuracies": 0.5, "rewards/chosen": -0.04291496425867081, "rewards/margins": 0.02595677599310875, "rewards/rejected": -0.06887174397706985, "step": 916 }, { "epoch": 0.5326131149445316, "grad_norm": 300.3961181640625, "learning_rate": 3.6679256246368394e-06, "logits/chosen": -0.877852737903595, "logits/rejected": -0.8263446092605591, "logps/chosen": -69.59159851074219, "logps/rejected": -69.27183532714844, "loss": 13.7373, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.004416265990585089, "rewards/margins": 0.022350173443555832, "rewards/rejected": -0.026766439899802208, "step": 917 }, { "epoch": 0.5331939362258233, "grad_norm": 326.1161804199219, "learning_rate": 3.666472980825102e-06, "logits/chosen": -0.7212264537811279, "logits/rejected": -0.8631353378295898, "logps/chosen": -78.27708435058594, "logps/rejected": -80.06709289550781, "loss": 13.7463, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0017011367017403245, "rewards/margins": 0.020040009170770645, "rewards/rejected": -0.018338870257139206, "step": 918 }, { "epoch": 0.533774757507115, "grad_norm": 324.24853515625, "learning_rate": 3.6650203370133645e-06, "logits/chosen": -0.8396110534667969, "logits/rejected": -0.9127988815307617, "logps/chosen": -79.79960632324219, "logps/rejected": -77.63664245605469, "loss": 13.6282, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.004808626137673855, "rewards/margins": 0.03346925228834152, "rewards/rejected": -0.028660621494054794, "step": 919 }, { "epoch": 0.5343555787884068, "grad_norm": 302.04913330078125, "learning_rate": 3.6635676932016273e-06, "logits/chosen": -1.0392416715621948, "logits/rejected": -1.1793787479400635, "logps/chosen": -72.44638061523438, "logps/rejected": -72.2593994140625, "loss": 12.7915, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.028838008642196655, "rewards/margins": 0.13549332320690155, "rewards/rejected": -0.1066553145647049, "step": 920 }, { "epoch": 0.5349364000696986, "grad_norm": 314.06964111328125, "learning_rate": 3.66211504938989e-06, "logits/chosen": -0.8888261914253235, "logits/rejected": -0.761336624622345, "logps/chosen": -72.33747863769531, "logps/rejected": -73.57784271240234, "loss": 13.9558, "rewards/accuracies": 0.5, "rewards/chosen": -0.0025641447864472866, "rewards/margins": 0.002194226486608386, "rewards/rejected": -0.004758368246257305, "step": 921 }, { "epoch": 0.5355172213509903, "grad_norm": 296.27716064453125, "learning_rate": 3.6606624055781528e-06, "logits/chosen": -0.7798510789871216, "logits/rejected": -0.7086332440376282, "logps/chosen": -73.7872543334961, "logps/rejected": -67.88629150390625, "loss": 13.5935, "rewards/accuracies": 0.5, "rewards/chosen": 0.004376659635454416, "rewards/margins": 0.04153291508555412, "rewards/rejected": -0.03715625777840614, "step": 922 }, { "epoch": 0.536098042632282, "grad_norm": 312.75262451171875, "learning_rate": 3.659209761766415e-06, "logits/chosen": -0.8951870799064636, "logits/rejected": -0.9884964823722839, "logps/chosen": -81.04949951171875, "logps/rejected": -72.61628723144531, "loss": 12.8728, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.04099228233098984, "rewards/margins": 0.10998284816741943, "rewards/rejected": -0.06899057328701019, "step": 923 }, { "epoch": 0.5366788639135738, "grad_norm": 335.9742126464844, "learning_rate": 3.657757117954678e-06, "logits/chosen": -0.9431589841842651, "logits/rejected": -1.0438756942749023, "logps/chosen": -78.76590728759766, "logps/rejected": -71.42301177978516, "loss": 13.8252, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0073237232863903046, "rewards/margins": 0.013142207637429237, "rewards/rejected": -0.020465927198529243, "step": 924 }, { "epoch": 0.5372596851948656, "grad_norm": 348.2239685058594, "learning_rate": 3.6563044741429406e-06, "logits/chosen": -0.6811865568161011, "logits/rejected": -0.7321907877922058, "logps/chosen": -73.98486328125, "logps/rejected": -75.19612121582031, "loss": 13.6646, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.0041376203298568726, "rewards/margins": 0.028655271977186203, "rewards/rejected": -0.02451765164732933, "step": 925 }, { "epoch": 0.5378405064761573, "grad_norm": 293.9329528808594, "learning_rate": 3.654851830331203e-06, "logits/chosen": -0.8226318359375, "logits/rejected": -0.8354133367538452, "logps/chosen": -70.20458984375, "logps/rejected": -67.4134521484375, "loss": 13.3345, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.002299271058291197, "rewards/margins": 0.060788147151470184, "rewards/rejected": -0.0584888756275177, "step": 926 }, { "epoch": 0.538421327757449, "grad_norm": 310.4035949707031, "learning_rate": 3.6533991865194657e-06, "logits/chosen": -0.8177189826965332, "logits/rejected": -0.9231401681900024, "logps/chosen": -69.95651245117188, "logps/rejected": -76.14701080322266, "loss": 13.749, "rewards/accuracies": 0.5, "rewards/chosen": -0.0049784379079937935, "rewards/margins": 0.020388662815093994, "rewards/rejected": -0.025367099791765213, "step": 927 }, { "epoch": 0.5390021490387408, "grad_norm": 316.76336669921875, "learning_rate": 3.6519465427077284e-06, "logits/chosen": -0.8412486910820007, "logits/rejected": -0.7791180610656738, "logps/chosen": -76.65928649902344, "logps/rejected": -79.48634338378906, "loss": 13.4202, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.019436553120613098, "rewards/margins": 0.060405828058719635, "rewards/rejected": -0.04096927493810654, "step": 928 }, { "epoch": 0.5395829703200326, "grad_norm": 321.49761962890625, "learning_rate": 3.6504938988959907e-06, "logits/chosen": -0.8120080828666687, "logits/rejected": -0.7900758981704712, "logps/chosen": -66.97466278076172, "logps/rejected": -77.61859893798828, "loss": 14.3962, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.02041034772992134, "rewards/margins": -0.04176231473684311, "rewards/rejected": 0.021351967006921768, "step": 929 }, { "epoch": 0.5401637916013243, "grad_norm": 325.2042541503906, "learning_rate": 3.6490412550842535e-06, "logits/chosen": -0.8616862297058105, "logits/rejected": -0.8327882885932922, "logps/chosen": -68.14276123046875, "logps/rejected": -71.96087646484375, "loss": 14.0464, "rewards/accuracies": 0.5, "rewards/chosen": -0.02421240136027336, "rewards/margins": -0.0003652706800494343, "rewards/rejected": -0.023847129195928574, "step": 930 }, { "epoch": 0.540744612882616, "grad_norm": 323.3883361816406, "learning_rate": 3.6475886112725162e-06, "logits/chosen": -0.9148539304733276, "logits/rejected": -0.8845453262329102, "logps/chosen": -75.4246826171875, "logps/rejected": -67.24982452392578, "loss": 14.2044, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.03846925124526024, "rewards/margins": -0.011761395260691643, "rewards/rejected": -0.026707852259278297, "step": 931 }, { "epoch": 0.5413254341639078, "grad_norm": 301.5912780761719, "learning_rate": 3.646135967460779e-06, "logits/chosen": -0.8207147717475891, "logits/rejected": -0.8844156265258789, "logps/chosen": -73.19061279296875, "logps/rejected": -80.32855987548828, "loss": 13.0744, "rewards/accuracies": 0.75, "rewards/chosen": 0.008339379914104939, "rewards/margins": 0.09396232664585114, "rewards/rejected": -0.08562295138835907, "step": 932 }, { "epoch": 0.5419062554451995, "grad_norm": 313.2552795410156, "learning_rate": 3.6446833236490413e-06, "logits/chosen": -0.8866413235664368, "logits/rejected": -0.8442390561103821, "logps/chosen": -71.72423553466797, "logps/rejected": -71.9534912109375, "loss": 13.7789, "rewards/accuracies": 0.5, "rewards/chosen": 0.0012622743379324675, "rewards/margins": 0.024342460557818413, "rewards/rejected": -0.023080188781023026, "step": 933 }, { "epoch": 0.5424870767264912, "grad_norm": 381.43170166015625, "learning_rate": 3.643230679837304e-06, "logits/chosen": -0.8554956316947937, "logits/rejected": -0.8630772829055786, "logps/chosen": -76.0161361694336, "logps/rejected": -74.90440368652344, "loss": 13.7078, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.004416673444211483, "rewards/margins": 0.028584444895386696, "rewards/rejected": -0.024167772382497787, "step": 934 }, { "epoch": 0.543067898007783, "grad_norm": 319.60302734375, "learning_rate": 3.641778036025567e-06, "logits/chosen": -0.7402527928352356, "logits/rejected": -0.7208027839660645, "logps/chosen": -74.16792297363281, "logps/rejected": -76.62369537353516, "loss": 13.9184, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.029772579669952393, "rewards/margins": 0.016960347071290016, "rewards/rejected": -0.04673292487859726, "step": 935 }, { "epoch": 0.5436487192890748, "grad_norm": 329.7962951660156, "learning_rate": 3.640325392213829e-06, "logits/chosen": -0.7540744543075562, "logits/rejected": -0.6933923959732056, "logps/chosen": -73.45732116699219, "logps/rejected": -73.99406433105469, "loss": 13.4842, "rewards/accuracies": 0.5, "rewards/chosen": 0.011500137858092785, "rewards/margins": 0.04718298465013504, "rewards/rejected": -0.035682838410139084, "step": 936 }, { "epoch": 0.5442295405703665, "grad_norm": 293.1716003417969, "learning_rate": 3.638872748402092e-06, "logits/chosen": -0.7672283053398132, "logits/rejected": -0.8011919856071472, "logps/chosen": -70.21018981933594, "logps/rejected": -73.24788665771484, "loss": 13.4906, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.01028083823621273, "rewards/margins": 0.05888774245977402, "rewards/rejected": -0.04860690236091614, "step": 937 }, { "epoch": 0.5448103618516582, "grad_norm": 322.726318359375, "learning_rate": 3.6374201045903546e-06, "logits/chosen": -0.7294805645942688, "logits/rejected": -0.6923807859420776, "logps/chosen": -73.052734375, "logps/rejected": -68.35235595703125, "loss": 13.6451, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.03901199623942375, "rewards/margins": 0.034640949219465256, "rewards/rejected": 0.004371042363345623, "step": 938 }, { "epoch": 0.54539118313295, "grad_norm": 294.18133544921875, "learning_rate": 3.6359674607786174e-06, "logits/chosen": -0.7671042680740356, "logits/rejected": -0.7237785458564758, "logps/chosen": -66.60963439941406, "logps/rejected": -75.8608169555664, "loss": 13.6033, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.007068338803946972, "rewards/margins": 0.04283539205789566, "rewards/rejected": -0.04990372806787491, "step": 939 }, { "epoch": 0.5459720044142418, "grad_norm": 323.2447204589844, "learning_rate": 3.6345148169668797e-06, "logits/chosen": -0.7898720502853394, "logits/rejected": -0.8044264912605286, "logps/chosen": -73.30155944824219, "logps/rejected": -74.01371002197266, "loss": 15.0194, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.06720665097236633, "rewards/margins": -0.09914363920688629, "rewards/rejected": 0.031936999410390854, "step": 940 }, { "epoch": 0.5465528256955335, "grad_norm": 290.3876953125, "learning_rate": 3.6330621731551424e-06, "logits/chosen": -0.9731463193893433, "logits/rejected": -0.8236031532287598, "logps/chosen": -70.91753387451172, "logps/rejected": -73.23826599121094, "loss": 13.6824, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.024254988878965378, "rewards/margins": 0.034400004893541336, "rewards/rejected": -0.05865498632192612, "step": 941 }, { "epoch": 0.5471336469768252, "grad_norm": 309.3848571777344, "learning_rate": 3.631609529343405e-06, "logits/chosen": -0.8001748323440552, "logits/rejected": -0.8197957873344421, "logps/chosen": -71.79235076904297, "logps/rejected": -81.0816650390625, "loss": 13.3633, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.008796481415629387, "rewards/margins": 0.08566837012767792, "rewards/rejected": -0.07687188684940338, "step": 942 }, { "epoch": 0.547714468258117, "grad_norm": 298.1664123535156, "learning_rate": 3.6301568855316675e-06, "logits/chosen": -0.8537214398384094, "logits/rejected": -0.7996708750724792, "logps/chosen": -71.4506607055664, "logps/rejected": -71.5552978515625, "loss": 13.3384, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.048867061734199524, "rewards/margins": 0.07090874016284943, "rewards/rejected": -0.022041672840714455, "step": 943 }, { "epoch": 0.5482952895394088, "grad_norm": 325.9350891113281, "learning_rate": 3.6287042417199303e-06, "logits/chosen": -0.7929685115814209, "logits/rejected": -0.7167041301727295, "logps/chosen": -81.67355346679688, "logps/rejected": -79.35840606689453, "loss": 13.0682, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.04618043079972267, "rewards/margins": 0.0885276198387146, "rewards/rejected": -0.04234719276428223, "step": 944 }, { "epoch": 0.5488761108207004, "grad_norm": 321.2001037597656, "learning_rate": 3.6272515979081934e-06, "logits/chosen": -0.8102318644523621, "logits/rejected": -0.8700863122940063, "logps/chosen": -78.3038330078125, "logps/rejected": -70.6084213256836, "loss": 14.7495, "rewards/accuracies": 0.25, "rewards/chosen": -0.053139828145504, "rewards/margins": -0.0726703479886055, "rewards/rejected": 0.01953052543103695, "step": 945 }, { "epoch": 0.5494569321019922, "grad_norm": 317.0191650390625, "learning_rate": 3.625798954096456e-06, "logits/chosen": -1.0644843578338623, "logits/rejected": -1.0164694786071777, "logps/chosen": -78.31604766845703, "logps/rejected": -71.96430969238281, "loss": 14.0692, "rewards/accuracies": 0.5, "rewards/chosen": 0.012915946543216705, "rewards/margins": -0.0076404451392591, "rewards/rejected": 0.020556394010782242, "step": 946 }, { "epoch": 0.550037753383284, "grad_norm": 296.7738342285156, "learning_rate": 3.624346310284719e-06, "logits/chosen": -0.8858498334884644, "logits/rejected": -0.7842674255371094, "logps/chosen": -72.47500610351562, "logps/rejected": -66.57456970214844, "loss": 13.1119, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.06248743459582329, "rewards/margins": 0.08361674845218658, "rewards/rejected": -0.02112930826842785, "step": 947 }, { "epoch": 0.5506185746645758, "grad_norm": 503.9831237792969, "learning_rate": 3.6228936664729813e-06, "logits/chosen": -0.7140674591064453, "logits/rejected": -0.8015111088752747, "logps/chosen": -75.12227630615234, "logps/rejected": -75.85064697265625, "loss": 13.9841, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.01585085317492485, "rewards/margins": -0.0022057548630982637, "rewards/rejected": -0.013645097613334656, "step": 948 }, { "epoch": 0.5511993959458674, "grad_norm": 345.8603210449219, "learning_rate": 3.621441022661244e-06, "logits/chosen": -0.7985360026359558, "logits/rejected": -0.9361134767532349, "logps/chosen": -81.85668182373047, "logps/rejected": -76.9217300415039, "loss": 14.4827, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.05369080975651741, "rewards/margins": -0.04964115470647812, "rewards/rejected": -0.004049652721732855, "step": 949 }, { "epoch": 0.5517802172271592, "grad_norm": 360.7843017578125, "learning_rate": 3.6199883788495068e-06, "logits/chosen": -0.9570215344429016, "logits/rejected": -0.978718638420105, "logps/chosen": -77.3498764038086, "logps/rejected": -78.39435577392578, "loss": 15.2509, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.0370364785194397, "rewards/margins": -0.12158574908971786, "rewards/rejected": 0.08454927057027817, "step": 950 }, { "epoch": 0.552361038508451, "grad_norm": 323.8126525878906, "learning_rate": 3.618535735037769e-06, "logits/chosen": -0.8473807573318481, "logits/rejected": -0.8004460334777832, "logps/chosen": -72.76786804199219, "logps/rejected": -80.35932922363281, "loss": 13.0489, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.03187683969736099, "rewards/margins": 0.0921851173043251, "rewards/rejected": -0.060308270156383514, "step": 951 }, { "epoch": 0.5529418597897428, "grad_norm": 352.3288269042969, "learning_rate": 3.617083091226032e-06, "logits/chosen": -0.6603168845176697, "logits/rejected": -0.7307640910148621, "logps/chosen": -75.92056274414062, "logps/rejected": -76.89772033691406, "loss": 14.0047, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.00199211691506207, "rewards/margins": -0.0017544865841045976, "rewards/rejected": -0.0002376347838435322, "step": 952 }, { "epoch": 0.5535226810710344, "grad_norm": 304.5995788574219, "learning_rate": 3.6156304474142946e-06, "logits/chosen": -0.9129480123519897, "logits/rejected": -0.8230624198913574, "logps/chosen": -77.09077453613281, "logps/rejected": -80.87089538574219, "loss": 12.8239, "rewards/accuracies": 0.75, "rewards/chosen": 0.030163967981934547, "rewards/margins": 0.12263661623001099, "rewards/rejected": -0.09247267246246338, "step": 953 }, { "epoch": 0.5541035023523262, "grad_norm": 342.901123046875, "learning_rate": 3.614177803602557e-06, "logits/chosen": -0.8802892565727234, "logits/rejected": -0.9021528959274292, "logps/chosen": -73.71620178222656, "logps/rejected": -70.15324401855469, "loss": 14.2906, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.009347392246127129, "rewards/margins": -0.028505731374025345, "rewards/rejected": 0.019158339127898216, "step": 954 }, { "epoch": 0.554684323633618, "grad_norm": 319.2974853515625, "learning_rate": 3.6127251597908197e-06, "logits/chosen": -0.8257215619087219, "logits/rejected": -0.8133133053779602, "logps/chosen": -67.73748016357422, "logps/rejected": -70.88557434082031, "loss": 13.7505, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.01775963604450226, "rewards/margins": 0.016628600656986237, "rewards/rejected": -0.034388236701488495, "step": 955 }, { "epoch": 0.5552651449149096, "grad_norm": 304.05657958984375, "learning_rate": 3.6112725159790824e-06, "logits/chosen": -0.8870047330856323, "logits/rejected": -0.9050837755203247, "logps/chosen": -71.76180267333984, "logps/rejected": -70.63961791992188, "loss": 14.454, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.04455003887414932, "rewards/margins": -0.05116810277104378, "rewards/rejected": 0.00661806296557188, "step": 956 }, { "epoch": 0.5558459661962014, "grad_norm": 303.3849182128906, "learning_rate": 3.609819872167345e-06, "logits/chosen": -0.6691077351570129, "logits/rejected": -0.8779782056808472, "logps/chosen": -85.62307739257812, "logps/rejected": -74.10694885253906, "loss": 13.2578, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.017601244151592255, "rewards/margins": 0.11997624486684799, "rewards/rejected": -0.10237500816583633, "step": 957 }, { "epoch": 0.5564267874774932, "grad_norm": 298.76275634765625, "learning_rate": 3.6083672283556075e-06, "logits/chosen": -0.8502671122550964, "logits/rejected": -0.8758736848831177, "logps/chosen": -76.85331726074219, "logps/rejected": -74.97396087646484, "loss": 12.5147, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.07757486402988434, "rewards/margins": 0.15835627913475037, "rewards/rejected": -0.08078142255544662, "step": 958 }, { "epoch": 0.557007608758785, "grad_norm": 287.95330810546875, "learning_rate": 3.6069145845438702e-06, "logits/chosen": -0.8888352513313293, "logits/rejected": -1.0603057146072388, "logps/chosen": -67.73649597167969, "logps/rejected": -75.5111083984375, "loss": 12.9569, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.026487339287996292, "rewards/margins": 0.1329462230205536, "rewards/rejected": -0.15943357348442078, "step": 959 }, { "epoch": 0.5575884300400766, "grad_norm": 306.65069580078125, "learning_rate": 3.605461940732133e-06, "logits/chosen": -0.7654293775558472, "logits/rejected": -0.6960537433624268, "logps/chosen": -71.85382080078125, "logps/rejected": -71.59233856201172, "loss": 13.6787, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.02338285744190216, "rewards/margins": 0.026502352207899094, "rewards/rejected": -0.003119495464488864, "step": 960 }, { "epoch": 0.5581692513213684, "grad_norm": 322.8121643066406, "learning_rate": 3.6040092969203953e-06, "logits/chosen": -0.7408978343009949, "logits/rejected": -0.8127990961074829, "logps/chosen": -80.58521270751953, "logps/rejected": -77.42636108398438, "loss": 13.0516, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0336628220975399, "rewards/margins": 0.09334491193294525, "rewards/rejected": -0.059682078659534454, "step": 961 }, { "epoch": 0.5587500726026602, "grad_norm": 312.6292724609375, "learning_rate": 3.602556653108658e-06, "logits/chosen": -0.9119815826416016, "logits/rejected": -0.929103672504425, "logps/chosen": -85.22322082519531, "logps/rejected": -73.23065948486328, "loss": 13.4033, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.007932068780064583, "rewards/margins": 0.05446599796414375, "rewards/rejected": -0.04653392732143402, "step": 962 }, { "epoch": 0.559330893883952, "grad_norm": 344.50537109375, "learning_rate": 3.601104009296921e-06, "logits/chosen": -0.7761534452438354, "logits/rejected": -0.73918616771698, "logps/chosen": -76.0566177368164, "logps/rejected": -76.8918228149414, "loss": 13.6649, "rewards/accuracies": 0.5, "rewards/chosen": 0.00900066178292036, "rewards/margins": 0.031557004898786545, "rewards/rejected": -0.02255634218454361, "step": 963 }, { "epoch": 0.5599117151652436, "grad_norm": 297.2245178222656, "learning_rate": 3.5996513654851835e-06, "logits/chosen": -0.7133857011795044, "logits/rejected": -0.7351251840591431, "logps/chosen": -71.87410736083984, "logps/rejected": -75.77886199951172, "loss": 13.3632, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.03325660899281502, "rewards/margins": 0.06159430742263794, "rewards/rejected": -0.09485092014074326, "step": 964 }, { "epoch": 0.5604925364465354, "grad_norm": 327.8712463378906, "learning_rate": 3.598198721673446e-06, "logits/chosen": -0.7299011945724487, "logits/rejected": -0.6946900486946106, "logps/chosen": -82.22821807861328, "logps/rejected": -71.68350982666016, "loss": 13.0658, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.010126391425728798, "rewards/margins": 0.0904078334569931, "rewards/rejected": -0.08028144389390945, "step": 965 }, { "epoch": 0.5610733577278272, "grad_norm": 314.0105285644531, "learning_rate": 3.5967460778617086e-06, "logits/chosen": -0.7300828695297241, "logits/rejected": -0.7897091507911682, "logps/chosen": -72.77555847167969, "logps/rejected": -68.70625305175781, "loss": 14.111, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.02419973351061344, "rewards/margins": -0.012854543514549732, "rewards/rejected": -0.01134518813341856, "step": 966 }, { "epoch": 0.5616541790091188, "grad_norm": 325.6894836425781, "learning_rate": 3.5952934340499714e-06, "logits/chosen": -0.7582443952560425, "logits/rejected": -0.7317585349082947, "logps/chosen": -75.26892852783203, "logps/rejected": -72.90524291992188, "loss": 14.4269, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.02938658557832241, "rewards/margins": -0.04390256479382515, "rewards/rejected": 0.01451596338301897, "step": 967 }, { "epoch": 0.5622350002904106, "grad_norm": 282.8306884765625, "learning_rate": 3.5938407902382337e-06, "logits/chosen": -0.8044571876525879, "logits/rejected": -0.8136259317398071, "logps/chosen": -63.56464385986328, "logps/rejected": -73.00951385498047, "loss": 13.2138, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.04722351208329201, "rewards/margins": 0.08173610270023346, "rewards/rejected": -0.034512586891651154, "step": 968 }, { "epoch": 0.5628158215717024, "grad_norm": 296.7197570800781, "learning_rate": 3.5923881464264964e-06, "logits/chosen": -0.8755059242248535, "logits/rejected": -0.9272353053092957, "logps/chosen": -69.00138092041016, "logps/rejected": -73.67031860351562, "loss": 13.3606, "rewards/accuracies": 0.5, "rewards/chosen": 0.04788496717810631, "rewards/margins": 0.06125533580780029, "rewards/rejected": -0.013370366767048836, "step": 969 }, { "epoch": 0.5633966428529942, "grad_norm": 315.6614074707031, "learning_rate": 3.590935502614759e-06, "logits/chosen": -0.7783223390579224, "logits/rejected": -0.8105076551437378, "logps/chosen": -70.99641418457031, "logps/rejected": -81.84101104736328, "loss": 13.0554, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.048261307179927826, "rewards/margins": 0.09377080202102661, "rewards/rejected": -0.045509494841098785, "step": 970 }, { "epoch": 0.5639774641342858, "grad_norm": 324.253662109375, "learning_rate": 3.5894828588030215e-06, "logits/chosen": -0.714714765548706, "logits/rejected": -0.7137161493301392, "logps/chosen": -78.5097885131836, "logps/rejected": -75.41368103027344, "loss": 14.0157, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.07694325596094131, "rewards/margins": -0.0059618866071105, "rewards/rejected": 0.0829051285982132, "step": 971 }, { "epoch": 0.5645582854155776, "grad_norm": 431.720947265625, "learning_rate": 3.5880302149912843e-06, "logits/chosen": -0.7840239405632019, "logits/rejected": -0.8164595365524292, "logps/chosen": -80.40445709228516, "logps/rejected": -75.40513610839844, "loss": 14.0158, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0007454866426996887, "rewards/margins": -0.0007643516873940825, "rewards/rejected": 1.8867478502215818e-05, "step": 972 }, { "epoch": 0.5651391066968694, "grad_norm": 356.0662536621094, "learning_rate": 3.586577571179547e-06, "logits/chosen": -0.9265009760856628, "logits/rejected": -0.7497826814651489, "logps/chosen": -72.06420135498047, "logps/rejected": -75.36701965332031, "loss": 13.7262, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.05553603172302246, "rewards/margins": 0.020493101328611374, "rewards/rejected": 0.035042934119701385, "step": 973 }, { "epoch": 0.5657199279781612, "grad_norm": 319.8960266113281, "learning_rate": 3.5851249273678098e-06, "logits/chosen": -0.8483496904373169, "logits/rejected": -0.9086085557937622, "logps/chosen": -72.57324981689453, "logps/rejected": -82.24463653564453, "loss": 13.7945, "rewards/accuracies": 0.5, "rewards/chosen": 0.00978184211999178, "rewards/margins": 0.019335050135850906, "rewards/rejected": -0.009553213603794575, "step": 974 }, { "epoch": 0.5663007492594528, "grad_norm": 331.1341857910156, "learning_rate": 3.583672283556072e-06, "logits/chosen": -0.9468619227409363, "logits/rejected": -0.9842132329940796, "logps/chosen": -73.61444854736328, "logps/rejected": -69.44871520996094, "loss": 14.548, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.002568404423072934, "rewards/margins": -0.05364561080932617, "rewards/rejected": 0.05107720568776131, "step": 975 }, { "epoch": 0.5668815705407446, "grad_norm": 311.6826171875, "learning_rate": 3.582219639744335e-06, "logits/chosen": -0.8095108866691589, "logits/rejected": -0.7686668634414673, "logps/chosen": -79.30653381347656, "logps/rejected": -73.29191589355469, "loss": 13.7811, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.006576309911906719, "rewards/margins": 0.019946260377764702, "rewards/rejected": -0.026522571220993996, "step": 976 }, { "epoch": 0.5674623918220364, "grad_norm": 309.6329345703125, "learning_rate": 3.5807669959325976e-06, "logits/chosen": -0.7286791205406189, "logits/rejected": -0.6476708054542542, "logps/chosen": -73.79474639892578, "logps/rejected": -64.9814453125, "loss": 13.4035, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.009394103661179543, "rewards/margins": 0.05736853927373886, "rewards/rejected": -0.04797443002462387, "step": 977 }, { "epoch": 0.5680432131033281, "grad_norm": 312.59039306640625, "learning_rate": 3.57931435212086e-06, "logits/chosen": -0.791883111000061, "logits/rejected": -0.6957409381866455, "logps/chosen": -70.85968017578125, "logps/rejected": -69.31065368652344, "loss": 13.7411, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.017146646976470947, "rewards/margins": 0.02252192422747612, "rewards/rejected": -0.039668574929237366, "step": 978 }, { "epoch": 0.5686240343846198, "grad_norm": 316.483154296875, "learning_rate": 3.5778617083091226e-06, "logits/chosen": -0.8095332980155945, "logits/rejected": -0.7965534925460815, "logps/chosen": -72.15980529785156, "logps/rejected": -71.30020904541016, "loss": 13.2213, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.04437297210097313, "rewards/margins": 0.0787133201956749, "rewards/rejected": -0.034340351819992065, "step": 979 }, { "epoch": 0.5692048556659116, "grad_norm": 283.54541015625, "learning_rate": 3.5764090644973854e-06, "logits/chosen": -0.8537616729736328, "logits/rejected": -0.9587091207504272, "logps/chosen": -69.26819610595703, "logps/rejected": -72.88671112060547, "loss": 13.3589, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.000316973018925637, "rewards/margins": 0.06976380944252014, "rewards/rejected": -0.07008077204227448, "step": 980 }, { "epoch": 0.5697856769472034, "grad_norm": 279.60089111328125, "learning_rate": 3.574956420685648e-06, "logits/chosen": -0.9589298963546753, "logits/rejected": -0.9463101625442505, "logps/chosen": -67.99465942382812, "logps/rejected": -65.81513977050781, "loss": 13.0732, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.013214176520705223, "rewards/margins": 0.09724701941013336, "rewards/rejected": -0.08403284102678299, "step": 981 }, { "epoch": 0.5703664982284951, "grad_norm": 299.99664306640625, "learning_rate": 3.5735037768739105e-06, "logits/chosen": -0.8253474235534668, "logits/rejected": -0.7606749534606934, "logps/chosen": -64.20758819580078, "logps/rejected": -78.39196014404297, "loss": 13.5467, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.02975400909781456, "rewards/margins": 0.04441283270716667, "rewards/rejected": -0.014658820815384388, "step": 982 }, { "epoch": 0.5709473195097868, "grad_norm": 316.7391052246094, "learning_rate": 3.5720511330621732e-06, "logits/chosen": -0.781932532787323, "logits/rejected": -0.7325922250747681, "logps/chosen": -71.48280334472656, "logps/rejected": -72.66695404052734, "loss": 13.52, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.06907404214143753, "rewards/margins": 0.04966479912400246, "rewards/rejected": 0.01940922625362873, "step": 983 }, { "epoch": 0.5715281407910786, "grad_norm": 305.2508544921875, "learning_rate": 3.570598489250436e-06, "logits/chosen": -0.7936916947364807, "logits/rejected": -0.7673407793045044, "logps/chosen": -81.65838623046875, "logps/rejected": -74.53465270996094, "loss": 13.3595, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.015117776580154896, "rewards/margins": 0.06472204625606537, "rewards/rejected": -0.04960425943136215, "step": 984 }, { "epoch": 0.5721089620723704, "grad_norm": 302.62506103515625, "learning_rate": 3.5691458454386983e-06, "logits/chosen": -0.765379786491394, "logits/rejected": -0.8423225283622742, "logps/chosen": -65.9737319946289, "logps/rejected": -71.31224060058594, "loss": 13.5828, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.04746689647436142, "rewards/margins": 0.039320915937423706, "rewards/rejected": 0.008145982399582863, "step": 985 }, { "epoch": 0.572689783353662, "grad_norm": 318.6903991699219, "learning_rate": 3.567693201626961e-06, "logits/chosen": -0.8056508302688599, "logits/rejected": -0.7189480662345886, "logps/chosen": -79.517822265625, "logps/rejected": -71.33512878417969, "loss": 13.5867, "rewards/accuracies": 0.75, "rewards/chosen": 0.021028585731983185, "rewards/margins": 0.03351649269461632, "rewards/rejected": -0.012487906031310558, "step": 986 }, { "epoch": 0.5732706046349538, "grad_norm": 285.9707336425781, "learning_rate": 3.5662405578152242e-06, "logits/chosen": -0.7602620720863342, "logits/rejected": -0.6153804063796997, "logps/chosen": -70.01246643066406, "logps/rejected": -69.94181823730469, "loss": 13.9504, "rewards/accuracies": 0.5, "rewards/chosen": 0.007558539509773254, "rewards/margins": 0.003087778342887759, "rewards/rejected": 0.004470758140087128, "step": 987 }, { "epoch": 0.5738514259162456, "grad_norm": 322.0240478515625, "learning_rate": 3.564787914003487e-06, "logits/chosen": -0.7658705115318298, "logits/rejected": -0.7216984033584595, "logps/chosen": -71.78937530517578, "logps/rejected": -70.35456085205078, "loss": 14.4603, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.00903630256652832, "rewards/margins": -0.05042557790875435, "rewards/rejected": 0.05946188047528267, "step": 988 }, { "epoch": 0.5744322471975373, "grad_norm": 319.0548400878906, "learning_rate": 3.5633352701917497e-06, "logits/chosen": -0.8453356027603149, "logits/rejected": -0.8643606901168823, "logps/chosen": -77.3846435546875, "logps/rejected": -81.15995788574219, "loss": 12.8317, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.031046276912093163, "rewards/margins": 0.11689448356628418, "rewards/rejected": -0.08584820479154587, "step": 989 }, { "epoch": 0.575013068478829, "grad_norm": 347.3050537109375, "learning_rate": 3.561882626380012e-06, "logits/chosen": -0.647269070148468, "logits/rejected": -0.7291135787963867, "logps/chosen": -73.54329681396484, "logps/rejected": -68.59510040283203, "loss": 13.2637, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.07152654230594635, "rewards/margins": 0.07307065278291702, "rewards/rejected": -0.00154411350376904, "step": 990 }, { "epoch": 0.5755938897601208, "grad_norm": 350.6729736328125, "learning_rate": 3.5604299825682748e-06, "logits/chosen": -0.6177459955215454, "logits/rejected": -0.6841322779655457, "logps/chosen": -83.19955444335938, "logps/rejected": -93.72516632080078, "loss": 13.6459, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.006069374270737171, "rewards/margins": 0.028904488310217857, "rewards/rejected": -0.02283511497080326, "step": 991 }, { "epoch": 0.5761747110414126, "grad_norm": 321.6015319824219, "learning_rate": 3.5589773387565375e-06, "logits/chosen": -0.7162039875984192, "logits/rejected": -0.6496952772140503, "logps/chosen": -70.39527893066406, "logps/rejected": -69.74990844726562, "loss": 13.3159, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.10396634042263031, "rewards/margins": 0.06428544223308563, "rewards/rejected": 0.039680901914834976, "step": 992 }, { "epoch": 0.5767555323227043, "grad_norm": 293.7535095214844, "learning_rate": 3.5575246949448e-06, "logits/chosen": -0.8607357144355774, "logits/rejected": -0.8218991160392761, "logps/chosen": -66.10880279541016, "logps/rejected": -73.78075408935547, "loss": 13.5722, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.008624534122645855, "rewards/margins": 0.03976871818304062, "rewards/rejected": -0.03114417754113674, "step": 993 }, { "epoch": 0.577336353603996, "grad_norm": 399.10052490234375, "learning_rate": 3.5560720511330626e-06, "logits/chosen": -0.7579528093338013, "logits/rejected": -0.759405791759491, "logps/chosen": -73.0143051147461, "logps/rejected": -76.09436798095703, "loss": 13.904, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0034693810157477856, "rewards/margins": 0.00545927882194519, "rewards/rejected": -0.001989898504689336, "step": 994 }, { "epoch": 0.5779171748852878, "grad_norm": 304.847412109375, "learning_rate": 3.5546194073213254e-06, "logits/chosen": -0.8246349096298218, "logits/rejected": -0.8366926312446594, "logps/chosen": -68.1496810913086, "logps/rejected": -73.1882095336914, "loss": 13.0704, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.10885617882013321, "rewards/margins": 0.09571322053670883, "rewards/rejected": 0.013142948038876057, "step": 995 }, { "epoch": 0.5784979961665796, "grad_norm": 333.4711608886719, "learning_rate": 3.553166763509588e-06, "logits/chosen": -0.7208290100097656, "logits/rejected": -0.6368588209152222, "logps/chosen": -72.82388305664062, "logps/rejected": -70.48295593261719, "loss": 14.5623, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.06020081788301468, "rewards/margins": -0.06112781912088394, "rewards/rejected": 0.0009269967558793724, "step": 996 }, { "epoch": 0.5790788174478713, "grad_norm": 333.8407897949219, "learning_rate": 3.5517141196978504e-06, "logits/chosen": -0.7055472135543823, "logits/rejected": -0.8043805956840515, "logps/chosen": -73.97242736816406, "logps/rejected": -75.32130432128906, "loss": 13.9701, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0029317201115190983, "rewards/margins": -0.003249152097851038, "rewards/rejected": 0.0003174312296323478, "step": 997 }, { "epoch": 0.579659638729163, "grad_norm": 283.136962890625, "learning_rate": 3.550261475886113e-06, "logits/chosen": -0.808709979057312, "logits/rejected": -0.8320805430412292, "logps/chosen": -70.24005889892578, "logps/rejected": -74.91935729980469, "loss": 13.5613, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.025156404823064804, "rewards/margins": 0.04108768701553345, "rewards/rejected": -0.06624408811330795, "step": 998 }, { "epoch": 0.5802404600104548, "grad_norm": 301.6640319824219, "learning_rate": 3.548808832074376e-06, "logits/chosen": -0.8165045976638794, "logits/rejected": -0.8338570594787598, "logps/chosen": -72.01176452636719, "logps/rejected": -72.26456451416016, "loss": 13.5339, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.041943442076444626, "rewards/margins": 0.04307948797941208, "rewards/rejected": -0.0011360436910763383, "step": 999 }, { "epoch": 0.5808212812917465, "grad_norm": 609.0323486328125, "learning_rate": 3.5473561882626382e-06, "logits/chosen": -0.7630084753036499, "logits/rejected": -0.7541495561599731, "logps/chosen": -70.21868896484375, "logps/rejected": -77.51250457763672, "loss": 14.8816, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.0775180235505104, "rewards/margins": -0.09175113588571548, "rewards/rejected": 0.014233121648430824, "step": 1000 } ], "logging_steps": 1, "max_steps": 3442, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }