diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,15033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.6220839813374806, + "eval_steps": 500, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0006220839813374805, + "grad_norm": 14.179065704345703, + "learning_rate": 5.0000000000000004e-08, + "logits/chosen": -0.09663959592580795, + "logits/rejected": -0.29295116662979126, + "logps/chosen": -306.8134765625, + "logps/rejected": -502.7719421386719, + "loss": 0.4995, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.255997657775879, + "rewards/margins": 1.6211460828781128, + "rewards/rejected": -2.8771438598632812, + "step": 1 + }, + { + "epoch": 0.001244167962674961, + "grad_norm": 11.77914810180664, + "learning_rate": 1.0000000000000001e-07, + "logits/chosen": -0.2633693814277649, + "logits/rejected": -0.3102447986602783, + "logps/chosen": -295.5501708984375, + "logps/rejected": -419.68878173828125, + "loss": 0.471, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6002616882324219, + "rewards/margins": 0.962213397026062, + "rewards/rejected": -1.5624749660491943, + "step": 2 + }, + { + "epoch": 0.0018662519440124418, + "grad_norm": 19.002405166625977, + "learning_rate": 1.5000000000000002e-07, + "logits/chosen": -0.2428935170173645, + "logits/rejected": -0.254658579826355, + "logps/chosen": -371.24652099609375, + "logps/rejected": -479.6505432128906, + "loss": 0.8115, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6839216351509094, + "rewards/margins": 0.6187509298324585, + "rewards/rejected": -1.3026723861694336, + "step": 3 + }, + { + "epoch": 0.002488335925349922, + "grad_norm": 12.522491455078125, + "learning_rate": 2.0000000000000002e-07, + "logits/chosen": -0.10803362727165222, + "logits/rejected": -0.17355093359947205, + "logps/chosen": -206.69967651367188, + "logps/rejected": -391.93511962890625, + "loss": 0.5817, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2418205738067627, + "rewards/margins": 0.9081786870956421, + "rewards/rejected": -2.1499993801116943, + "step": 4 + }, + { + "epoch": 0.003110419906687403, + "grad_norm": 13.049732208251953, + "learning_rate": 2.5000000000000004e-07, + "logits/chosen": -0.3344213366508484, + "logits/rejected": -0.27875983715057373, + "logps/chosen": -511.88751220703125, + "logps/rejected": -408.63616943359375, + "loss": 0.4205, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6255606412887573, + "rewards/margins": 1.032475471496582, + "rewards/rejected": -1.658036231994629, + "step": 5 + }, + { + "epoch": 0.0037325038880248835, + "grad_norm": 16.811622619628906, + "learning_rate": 3.0000000000000004e-07, + "logits/chosen": -0.12298917770385742, + "logits/rejected": -0.2680993378162384, + "logps/chosen": -303.9392395019531, + "logps/rejected": -482.5489196777344, + "loss": 0.722, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3000998497009277, + "rewards/margins": 0.6062828302383423, + "rewards/rejected": -1.9063827991485596, + "step": 6 + }, + { + "epoch": 0.004354587869362364, + "grad_norm": 9.753623008728027, + "learning_rate": 3.5000000000000004e-07, + "logits/chosen": -0.22940252721309662, + "logits/rejected": -0.3106110095977783, + "logps/chosen": -389.7403869628906, + "logps/rejected": -532.2648315429688, + "loss": 0.4724, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5673116445541382, + "rewards/margins": 1.6019636392593384, + "rewards/rejected": -2.1692755222320557, + "step": 7 + }, + { + "epoch": 0.004976671850699844, + "grad_norm": 18.520763397216797, + "learning_rate": 4.0000000000000003e-07, + "logits/chosen": -0.2085423469543457, + "logits/rejected": -0.18259884417057037, + "logps/chosen": -385.1637268066406, + "logps/rejected": -356.86328125, + "loss": 1.1976, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7579731941223145, + "rewards/margins": -0.4501006603240967, + "rewards/rejected": -1.3078724145889282, + "step": 8 + }, + { + "epoch": 0.005598755832037325, + "grad_norm": 10.160990715026855, + "learning_rate": 4.5000000000000003e-07, + "logits/chosen": -0.16166917979717255, + "logits/rejected": -0.27789413928985596, + "logps/chosen": -266.14459228515625, + "logps/rejected": -384.45904541015625, + "loss": 0.4304, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16488167643547058, + "rewards/margins": 0.9874380230903625, + "rewards/rejected": -1.1523196697235107, + "step": 9 + }, + { + "epoch": 0.006220839813374806, + "grad_norm": 13.668052673339844, + "learning_rate": 5.000000000000001e-07, + "logits/chosen": -0.1738671511411667, + "logits/rejected": -0.19918832182884216, + "logps/chosen": -358.0230712890625, + "logps/rejected": -457.3133544921875, + "loss": 0.502, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4125533699989319, + "rewards/margins": 0.812768280506134, + "rewards/rejected": -1.2253215312957764, + "step": 10 + }, + { + "epoch": 0.006842923794712286, + "grad_norm": 14.790095329284668, + "learning_rate": 5.5e-07, + "logits/chosen": -0.12035153806209564, + "logits/rejected": -0.258575439453125, + "logps/chosen": -360.260009765625, + "logps/rejected": -486.62451171875, + "loss": 0.5221, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.661494255065918, + "rewards/margins": 1.0127524137496948, + "rewards/rejected": -2.6742467880249023, + "step": 11 + }, + { + "epoch": 0.007465007776049767, + "grad_norm": 6.891914367675781, + "learning_rate": 6.000000000000001e-07, + "logits/chosen": -0.24263660609722137, + "logits/rejected": -0.2932659983634949, + "logps/chosen": -284.80224609375, + "logps/rejected": -390.12884521484375, + "loss": 0.3038, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5474095344543457, + "rewards/margins": 1.4920446872711182, + "rewards/rejected": -2.039454221725464, + "step": 12 + }, + { + "epoch": 0.008087091757387248, + "grad_norm": 16.581096649169922, + "learning_rate": 6.5e-07, + "logits/chosen": -0.1820787787437439, + "logits/rejected": -0.2822909653186798, + "logps/chosen": -254.50169372558594, + "logps/rejected": -390.7183837890625, + "loss": 0.772, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.393714189529419, + "rewards/margins": 0.1790848672389984, + "rewards/rejected": -1.5727989673614502, + "step": 13 + }, + { + "epoch": 0.008709175738724729, + "grad_norm": 11.865903854370117, + "learning_rate": 7.000000000000001e-07, + "logits/chosen": -0.13393919169902802, + "logits/rejected": -0.20724566280841827, + "logps/chosen": -249.39512634277344, + "logps/rejected": -467.53509521484375, + "loss": 0.5062, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8844162225723267, + "rewards/margins": 0.9113656282424927, + "rewards/rejected": -1.7957818508148193, + "step": 14 + }, + { + "epoch": 0.00933125972006221, + "grad_norm": 11.295125961303711, + "learning_rate": 7.5e-07, + "logits/chosen": -0.15880821645259857, + "logits/rejected": -0.29340383410453796, + "logps/chosen": -239.0933837890625, + "logps/rejected": -368.84857177734375, + "loss": 0.5177, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7486093640327454, + "rewards/margins": 0.802417516708374, + "rewards/rejected": -1.5510269403457642, + "step": 15 + }, + { + "epoch": 0.009953343701399688, + "grad_norm": 15.220681190490723, + "learning_rate": 8.000000000000001e-07, + "logits/chosen": -0.258389949798584, + "logits/rejected": -0.3167637586593628, + "logps/chosen": -322.184326171875, + "logps/rejected": -408.5572814941406, + "loss": 0.7328, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.13355258107185364, + "rewards/margins": 0.3222728967666626, + "rewards/rejected": -0.45582544803619385, + "step": 16 + }, + { + "epoch": 0.010575427682737169, + "grad_norm": 11.698396682739258, + "learning_rate": 8.500000000000001e-07, + "logits/chosen": -0.13664107024669647, + "logits/rejected": -0.24286630749702454, + "logps/chosen": -262.89996337890625, + "logps/rejected": -374.6935729980469, + "loss": 0.5305, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8824573755264282, + "rewards/margins": 1.0661792755126953, + "rewards/rejected": -1.948636770248413, + "step": 17 + }, + { + "epoch": 0.01119751166407465, + "grad_norm": 10.8157958984375, + "learning_rate": 9.000000000000001e-07, + "logits/chosen": -0.17083550989627838, + "logits/rejected": -0.21770785748958588, + "logps/chosen": -287.0339050292969, + "logps/rejected": -434.8841552734375, + "loss": 0.4935, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9494354724884033, + "rewards/margins": 0.9396749138832092, + "rewards/rejected": -1.8891104459762573, + "step": 18 + }, + { + "epoch": 0.01181959564541213, + "grad_norm": 17.668495178222656, + "learning_rate": 9.500000000000001e-07, + "logits/chosen": -0.10040243715047836, + "logits/rejected": -0.27325940132141113, + "logps/chosen": -352.14263916015625, + "logps/rejected": -455.9179382324219, + "loss": 0.8235, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3606045246124268, + "rewards/margins": 1.2895393371582031, + "rewards/rejected": -2.65014386177063, + "step": 19 + }, + { + "epoch": 0.012441679626749611, + "grad_norm": 13.79849910736084, + "learning_rate": 1.0000000000000002e-06, + "logits/chosen": -0.13011546432971954, + "logits/rejected": -0.24261751770973206, + "logps/chosen": -392.6343994140625, + "logps/rejected": -554.8592529296875, + "loss": 0.5062, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0570666790008545, + "rewards/margins": 1.2219442129135132, + "rewards/rejected": -2.279010772705078, + "step": 20 + }, + { + "epoch": 0.013063763608087092, + "grad_norm": 25.17713737487793, + "learning_rate": 1.0500000000000001e-06, + "logits/chosen": -0.21922960877418518, + "logits/rejected": -0.2387387603521347, + "logps/chosen": -398.945068359375, + "logps/rejected": -516.2724609375, + "loss": 0.9162, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3859711289405823, + "rewards/margins": -0.00037989020347595215, + "rewards/rejected": -0.3855912387371063, + "step": 21 + }, + { + "epoch": 0.013685847589424573, + "grad_norm": 17.10285186767578, + "learning_rate": 1.1e-06, + "logits/chosen": -0.21595719456672668, + "logits/rejected": -0.2929472327232361, + "logps/chosen": -246.27789306640625, + "logps/rejected": -379.24200439453125, + "loss": 0.9767, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4987471103668213, + "rewards/margins": 0.007953926920890808, + "rewards/rejected": -1.506700873374939, + "step": 22 + }, + { + "epoch": 0.014307931570762053, + "grad_norm": 17.92582130432129, + "learning_rate": 1.1500000000000002e-06, + "logits/chosen": -0.22393420338630676, + "logits/rejected": -0.24265322089195251, + "logps/chosen": -410.4319763183594, + "logps/rejected": -557.3082885742188, + "loss": 0.8574, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18853901326656342, + "rewards/margins": 0.324137806892395, + "rewards/rejected": -0.5126769542694092, + "step": 23 + }, + { + "epoch": 0.014930015552099534, + "grad_norm": 15.830718994140625, + "learning_rate": 1.2000000000000002e-06, + "logits/chosen": -0.26339858770370483, + "logits/rejected": -0.3155684173107147, + "logps/chosen": -277.84246826171875, + "logps/rejected": -405.0123291015625, + "loss": 0.5695, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9685299396514893, + "rewards/margins": 1.253878116607666, + "rewards/rejected": -2.2224080562591553, + "step": 24 + }, + { + "epoch": 0.015552099533437015, + "grad_norm": 7.998365879058838, + "learning_rate": 1.25e-06, + "logits/chosen": -0.10873141884803772, + "logits/rejected": -0.21873541176319122, + "logps/chosen": -155.264404296875, + "logps/rejected": -276.8229064941406, + "loss": 0.536, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.046856045722961426, + "rewards/margins": 1.5767072439193726, + "rewards/rejected": -1.623563289642334, + "step": 25 + }, + { + "epoch": 0.016174183514774496, + "grad_norm": 14.385464668273926, + "learning_rate": 1.3e-06, + "logits/chosen": -0.3095937967300415, + "logits/rejected": -0.3485206961631775, + "logps/chosen": -220.8693084716797, + "logps/rejected": -299.0721435546875, + "loss": 0.6129, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5018860101699829, + "rewards/margins": 0.775991678237915, + "rewards/rejected": -1.2778778076171875, + "step": 26 + }, + { + "epoch": 0.016796267496111975, + "grad_norm": 20.059619903564453, + "learning_rate": 1.3500000000000002e-06, + "logits/chosen": -0.1587831825017929, + "logits/rejected": -0.17471978068351746, + "logps/chosen": -391.550537109375, + "logps/rejected": -395.39453125, + "loss": 1.1242, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2111552953720093, + "rewards/margins": -0.24754559993743896, + "rewards/rejected": -0.9636096954345703, + "step": 27 + }, + { + "epoch": 0.017418351477449457, + "grad_norm": 22.013578414916992, + "learning_rate": 1.4000000000000001e-06, + "logits/chosen": -0.17577999830245972, + "logits/rejected": -0.2767271399497986, + "logps/chosen": -356.16436767578125, + "logps/rejected": -513.57177734375, + "loss": 0.9105, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2383060455322266, + "rewards/margins": 0.14490197598934174, + "rewards/rejected": -1.3832080364227295, + "step": 28 + }, + { + "epoch": 0.018040435458786936, + "grad_norm": 13.351226806640625, + "learning_rate": 1.45e-06, + "logits/chosen": -0.21776744723320007, + "logits/rejected": -0.3049355745315552, + "logps/chosen": -500.8392639160156, + "logps/rejected": -437.8874206542969, + "loss": 0.342, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7385349273681641, + "rewards/margins": 1.8749885559082031, + "rewards/rejected": -2.613523483276367, + "step": 29 + }, + { + "epoch": 0.01866251944012442, + "grad_norm": 9.927164077758789, + "learning_rate": 1.5e-06, + "logits/chosen": -0.17475713789463043, + "logits/rejected": -0.226411372423172, + "logps/chosen": -351.47210693359375, + "logps/rejected": -440.9518127441406, + "loss": 0.3561, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2620359659194946, + "rewards/margins": 1.1363576650619507, + "rewards/rejected": -2.3983936309814453, + "step": 30 + }, + { + "epoch": 0.019284603421461897, + "grad_norm": 10.73594856262207, + "learning_rate": 1.5500000000000002e-06, + "logits/chosen": -0.2690945267677307, + "logits/rejected": -0.33507466316223145, + "logps/chosen": -309.91668701171875, + "logps/rejected": -364.99462890625, + "loss": 0.5151, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7403770685195923, + "rewards/margins": 0.7023105621337891, + "rewards/rejected": -1.4426876306533813, + "step": 31 + }, + { + "epoch": 0.019906687402799376, + "grad_norm": 13.486499786376953, + "learning_rate": 1.6000000000000001e-06, + "logits/chosen": -0.20531223714351654, + "logits/rejected": -0.2851923406124115, + "logps/chosen": -193.35791015625, + "logps/rejected": -380.63482666015625, + "loss": 0.3554, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6723178029060364, + "rewards/margins": 1.519931435585022, + "rewards/rejected": -2.192249059677124, + "step": 32 + }, + { + "epoch": 0.02052877138413686, + "grad_norm": 10.148118019104004, + "learning_rate": 1.6500000000000003e-06, + "logits/chosen": -0.12586656212806702, + "logits/rejected": -0.2555898427963257, + "logps/chosen": -278.3027648925781, + "logps/rejected": -433.91961669921875, + "loss": 0.334, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8123304843902588, + "rewards/margins": 2.020124673843384, + "rewards/rejected": -3.8324551582336426, + "step": 33 + }, + { + "epoch": 0.021150855365474338, + "grad_norm": 11.05343246459961, + "learning_rate": 1.7000000000000002e-06, + "logits/chosen": -0.235120490193367, + "logits/rejected": -0.28808438777923584, + "logps/chosen": -390.21319580078125, + "logps/rejected": -564.244384765625, + "loss": 0.4385, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2345523834228516, + "rewards/margins": 1.8450043201446533, + "rewards/rejected": -3.079556703567505, + "step": 34 + }, + { + "epoch": 0.02177293934681182, + "grad_norm": 18.567405700683594, + "learning_rate": 1.75e-06, + "logits/chosen": -0.24076960980892181, + "logits/rejected": -0.3182660639286041, + "logps/chosen": -405.08929443359375, + "logps/rejected": -414.69580078125, + "loss": 0.7105, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0119682550430298, + "rewards/margins": 0.7925464510917664, + "rewards/rejected": -1.804514765739441, + "step": 35 + }, + { + "epoch": 0.0223950233281493, + "grad_norm": 13.088397979736328, + "learning_rate": 1.8000000000000001e-06, + "logits/chosen": -0.20046073198318481, + "logits/rejected": -0.3034290373325348, + "logps/chosen": -211.29237365722656, + "logps/rejected": -378.7991943359375, + "loss": 0.4626, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7679038643836975, + "rewards/margins": 0.7711363434791565, + "rewards/rejected": -1.5390403270721436, + "step": 36 + }, + { + "epoch": 0.023017107309486782, + "grad_norm": 2.089670181274414, + "learning_rate": 1.85e-06, + "logits/chosen": -0.18681660294532776, + "logits/rejected": -0.23626521229743958, + "logps/chosen": -403.9193115234375, + "logps/rejected": -532.716064453125, + "loss": 0.0599, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3829203248023987, + "rewards/margins": 3.036440849304199, + "rewards/rejected": -2.6535205841064453, + "step": 37 + }, + { + "epoch": 0.02363919129082426, + "grad_norm": 23.806045532226562, + "learning_rate": 1.9000000000000002e-06, + "logits/chosen": -0.23479099571704865, + "logits/rejected": -0.20861080288887024, + "logps/chosen": -408.87890625, + "logps/rejected": -411.87738037109375, + "loss": 1.6667, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.641960859298706, + "rewards/margins": -0.08910135924816132, + "rewards/rejected": -1.5528594255447388, + "step": 38 + }, + { + "epoch": 0.024261275272161743, + "grad_norm": 14.741189956665039, + "learning_rate": 1.9500000000000004e-06, + "logits/chosen": -0.178049236536026, + "logits/rejected": -0.2930225133895874, + "logps/chosen": -268.2509460449219, + "logps/rejected": -338.8417663574219, + "loss": 0.6374, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4974963665008545, + "rewards/margins": 0.6124590635299683, + "rewards/rejected": -2.109955310821533, + "step": 39 + }, + { + "epoch": 0.024883359253499222, + "grad_norm": 10.034878730773926, + "learning_rate": 2.0000000000000003e-06, + "logits/chosen": -0.17119136452674866, + "logits/rejected": -0.23427794873714447, + "logps/chosen": -295.2297058105469, + "logps/rejected": -482.6625671386719, + "loss": 0.38, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4640917778015137, + "rewards/margins": 1.3562004566192627, + "rewards/rejected": -2.8202919960021973, + "step": 40 + }, + { + "epoch": 0.0255054432348367, + "grad_norm": 8.486538887023926, + "learning_rate": 2.05e-06, + "logits/chosen": -0.13372741639614105, + "logits/rejected": -0.3056889772415161, + "logps/chosen": -172.62648010253906, + "logps/rejected": -391.78021240234375, + "loss": 0.3687, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7312526702880859, + "rewards/margins": 1.964490532875061, + "rewards/rejected": -2.6957430839538574, + "step": 41 + }, + { + "epoch": 0.026127527216174184, + "grad_norm": 20.43706703186035, + "learning_rate": 2.1000000000000002e-06, + "logits/chosen": -0.13358715176582336, + "logits/rejected": -0.18170273303985596, + "logps/chosen": -521.5180053710938, + "logps/rejected": -489.009521484375, + "loss": 0.6501, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6865028142929077, + "rewards/margins": 0.7075815200805664, + "rewards/rejected": -1.3940844535827637, + "step": 42 + }, + { + "epoch": 0.026749611197511663, + "grad_norm": 23.01974868774414, + "learning_rate": 2.15e-06, + "logits/chosen": -0.2493523806333542, + "logits/rejected": -0.23068276047706604, + "logps/chosen": -428.3470458984375, + "logps/rejected": -412.4192810058594, + "loss": 1.1218, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8403460383415222, + "rewards/margins": -0.26085755228996277, + "rewards/rejected": -0.5794885754585266, + "step": 43 + }, + { + "epoch": 0.027371695178849145, + "grad_norm": 7.1443657875061035, + "learning_rate": 2.2e-06, + "logits/chosen": -0.19250428676605225, + "logits/rejected": -0.3385680913925171, + "logps/chosen": -287.33380126953125, + "logps/rejected": -580.844970703125, + "loss": 0.1994, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7818142175674438, + "rewards/margins": 1.8287538290023804, + "rewards/rejected": -2.610568046569824, + "step": 44 + }, + { + "epoch": 0.027993779160186624, + "grad_norm": 16.03560447692871, + "learning_rate": 2.25e-06, + "logits/chosen": -0.21126647293567657, + "logits/rejected": -0.2900475561618805, + "logps/chosen": -340.1061096191406, + "logps/rejected": -477.5196838378906, + "loss": 0.6079, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.566444993019104, + "rewards/margins": 0.8992220163345337, + "rewards/rejected": -1.4656668901443481, + "step": 45 + }, + { + "epoch": 0.028615863141524107, + "grad_norm": 9.423442840576172, + "learning_rate": 2.3000000000000004e-06, + "logits/chosen": -0.22664828598499298, + "logits/rejected": -0.27953359484672546, + "logps/chosen": -219.21914672851562, + "logps/rejected": -295.5262756347656, + "loss": 0.5133, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7769243121147156, + "rewards/margins": 0.7776564955711365, + "rewards/rejected": -1.554580807685852, + "step": 46 + }, + { + "epoch": 0.029237947122861586, + "grad_norm": 10.364957809448242, + "learning_rate": 2.35e-06, + "logits/chosen": -0.10724575817584991, + "logits/rejected": -0.2539420425891876, + "logps/chosen": -269.37646484375, + "logps/rejected": -436.5749206542969, + "loss": 0.4076, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9695836305618286, + "rewards/margins": 1.7071529626846313, + "rewards/rejected": -2.67673659324646, + "step": 47 + }, + { + "epoch": 0.029860031104199068, + "grad_norm": 7.340808868408203, + "learning_rate": 2.4000000000000003e-06, + "logits/chosen": -0.149932399392128, + "logits/rejected": -0.2582329511642456, + "logps/chosen": -227.40234375, + "logps/rejected": -362.63525390625, + "loss": 0.2668, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2572637796401978, + "rewards/margins": 1.8111741542816162, + "rewards/rejected": -3.0684380531311035, + "step": 48 + }, + { + "epoch": 0.030482115085536547, + "grad_norm": 6.69564151763916, + "learning_rate": 2.4500000000000003e-06, + "logits/chosen": -0.1832939088344574, + "logits/rejected": -0.2623113691806793, + "logps/chosen": -138.41905212402344, + "logps/rejected": -331.13690185546875, + "loss": 0.2986, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.662757158279419, + "rewards/margins": 1.217343807220459, + "rewards/rejected": -1.880100965499878, + "step": 49 + }, + { + "epoch": 0.03110419906687403, + "grad_norm": 8.444109916687012, + "learning_rate": 2.5e-06, + "logits/chosen": -0.1639014482498169, + "logits/rejected": -0.32977503538131714, + "logps/chosen": -179.28326416015625, + "logps/rejected": -469.6463623046875, + "loss": 0.2486, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36024248600006104, + "rewards/margins": 1.6571729183197021, + "rewards/rejected": -2.0174152851104736, + "step": 50 + }, + { + "epoch": 0.031726283048211505, + "grad_norm": 16.295948028564453, + "learning_rate": 2.55e-06, + "logits/chosen": -0.20464730262756348, + "logits/rejected": -0.2880536615848541, + "logps/chosen": -231.85267639160156, + "logps/rejected": -403.3456115722656, + "loss": 0.5314, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1476625204086304, + "rewards/margins": 1.25139319896698, + "rewards/rejected": -2.3990557193756104, + "step": 51 + }, + { + "epoch": 0.03234836702954899, + "grad_norm": 6.52236795425415, + "learning_rate": 2.6e-06, + "logits/chosen": -0.26059409976005554, + "logits/rejected": -0.29845130443573, + "logps/chosen": -536.4730834960938, + "logps/rejected": -434.3161926269531, + "loss": 0.2151, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9093705415725708, + "rewards/margins": 2.3885202407836914, + "rewards/rejected": -3.2978909015655518, + "step": 52 + }, + { + "epoch": 0.03297045101088647, + "grad_norm": 13.477189064025879, + "learning_rate": 2.6500000000000005e-06, + "logits/chosen": -0.28778886795043945, + "logits/rejected": -0.3191465735435486, + "logps/chosen": -320.70166015625, + "logps/rejected": -349.4329528808594, + "loss": 0.5188, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5972996950149536, + "rewards/margins": 1.1935516595840454, + "rewards/rejected": -2.790851354598999, + "step": 53 + }, + { + "epoch": 0.03359253499222395, + "grad_norm": 9.599281311035156, + "learning_rate": 2.7000000000000004e-06, + "logits/chosen": -0.17920365929603577, + "logits/rejected": -0.280004620552063, + "logps/chosen": -143.53419494628906, + "logps/rejected": -473.53424072265625, + "loss": 0.3455, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.018561437726020813, + "rewards/margins": 1.4982223510742188, + "rewards/rejected": -1.5167839527130127, + "step": 54 + }, + { + "epoch": 0.03421461897356143, + "grad_norm": 25.547439575195312, + "learning_rate": 2.7500000000000004e-06, + "logits/chosen": -0.2866336703300476, + "logits/rejected": -0.29679301381111145, + "logps/chosen": -599.33935546875, + "logps/rejected": -560.5665283203125, + "loss": 1.0235, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.202643632888794, + "rewards/margins": -0.10936909914016724, + "rewards/rejected": -1.093274474143982, + "step": 55 + }, + { + "epoch": 0.034836702954898914, + "grad_norm": 26.236724853515625, + "learning_rate": 2.8000000000000003e-06, + "logits/chosen": -0.0002057589590549469, + "logits/rejected": -0.0764215812087059, + "logps/chosen": -538.2767333984375, + "logps/rejected": -597.5361938476562, + "loss": 0.9727, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.1882060766220093, + "rewards/margins": 0.24304497241973877, + "rewards/rejected": -1.4312509298324585, + "step": 56 + }, + { + "epoch": 0.03545878693623639, + "grad_norm": 8.85472297668457, + "learning_rate": 2.85e-06, + "logits/chosen": -0.20457614958286285, + "logits/rejected": -0.2736588716506958, + "logps/chosen": -345.30242919921875, + "logps/rejected": -480.9373779296875, + "loss": 0.293, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17795437574386597, + "rewards/margins": 1.440328598022461, + "rewards/rejected": -1.6182829141616821, + "step": 57 + }, + { + "epoch": 0.03608087091757387, + "grad_norm": 11.532721519470215, + "learning_rate": 2.9e-06, + "logits/chosen": -0.27522847056388855, + "logits/rejected": -0.3393068313598633, + "logps/chosen": -183.15977478027344, + "logps/rejected": -426.8497314453125, + "loss": 0.4879, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.13434720039367676, + "rewards/margins": 0.7762142419815063, + "rewards/rejected": -0.6418670415878296, + "step": 58 + }, + { + "epoch": 0.03670295489891135, + "grad_norm": 15.900622367858887, + "learning_rate": 2.95e-06, + "logits/chosen": -0.2537090480327606, + "logits/rejected": -0.3194456100463867, + "logps/chosen": -425.156982421875, + "logps/rejected": -506.6101379394531, + "loss": 0.6889, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6701778173446655, + "rewards/margins": 0.3513681888580322, + "rewards/rejected": -2.021545886993408, + "step": 59 + }, + { + "epoch": 0.03732503888024884, + "grad_norm": 12.097509384155273, + "learning_rate": 3e-06, + "logits/chosen": -0.3678496479988098, + "logits/rejected": -0.41114020347595215, + "logps/chosen": -206.7186279296875, + "logps/rejected": -395.35211181640625, + "loss": 0.4351, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1714004278182983, + "rewards/margins": 0.9028781652450562, + "rewards/rejected": -2.0742785930633545, + "step": 60 + }, + { + "epoch": 0.037947122861586316, + "grad_norm": 15.27265453338623, + "learning_rate": 3.05e-06, + "logits/chosen": -0.1501697450876236, + "logits/rejected": -0.21398228406906128, + "logps/chosen": -206.0605010986328, + "logps/rejected": -440.1636962890625, + "loss": 0.5908, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.45971035957336426, + "rewards/margins": 2.233668327331543, + "rewards/rejected": -2.6933789253234863, + "step": 61 + }, + { + "epoch": 0.038569206842923795, + "grad_norm": 11.626727104187012, + "learning_rate": 3.1000000000000004e-06, + "logits/chosen": -0.10590653866529465, + "logits/rejected": -0.12088129669427872, + "logps/chosen": -391.425537109375, + "logps/rejected": -398.5299072265625, + "loss": 0.441, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3214738368988037, + "rewards/margins": 0.8211286067962646, + "rewards/rejected": -2.1426024436950684, + "step": 62 + }, + { + "epoch": 0.039191290824261274, + "grad_norm": 7.35396146774292, + "learning_rate": 3.1500000000000003e-06, + "logits/chosen": -0.1809176206588745, + "logits/rejected": -0.2615310549736023, + "logps/chosen": -410.23345947265625, + "logps/rejected": -574.097412109375, + "loss": 0.1761, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3829352855682373, + "rewards/margins": 2.565948963165283, + "rewards/rejected": -3.9488844871520996, + "step": 63 + }, + { + "epoch": 0.03981337480559875, + "grad_norm": 13.276795387268066, + "learning_rate": 3.2000000000000003e-06, + "logits/chosen": -0.20947271585464478, + "logits/rejected": -0.3012083172798157, + "logps/chosen": -480.8250732421875, + "logps/rejected": -600.383056640625, + "loss": 0.3154, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9986147284507751, + "rewards/margins": 3.042830228805542, + "rewards/rejected": -4.041444778442383, + "step": 64 + }, + { + "epoch": 0.04043545878693624, + "grad_norm": 17.25835609436035, + "learning_rate": 3.2500000000000002e-06, + "logits/chosen": -0.24339747428894043, + "logits/rejected": -0.27869996428489685, + "logps/chosen": -440.91943359375, + "logps/rejected": -518.6160888671875, + "loss": 0.7742, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3874032497406006, + "rewards/margins": 0.3544794023036957, + "rewards/rejected": -1.741882562637329, + "step": 65 + }, + { + "epoch": 0.04105754276827372, + "grad_norm": 6.817550182342529, + "learning_rate": 3.3000000000000006e-06, + "logits/chosen": -0.2159850150346756, + "logits/rejected": -0.2737243175506592, + "logps/chosen": -291.11114501953125, + "logps/rejected": -452.4122314453125, + "loss": 0.1987, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7368097305297852, + "rewards/margins": 2.572923183441162, + "rewards/rejected": -3.3097331523895264, + "step": 66 + }, + { + "epoch": 0.0416796267496112, + "grad_norm": 15.981302261352539, + "learning_rate": 3.3500000000000005e-06, + "logits/chosen": -0.221299946308136, + "logits/rejected": -0.33752116560935974, + "logps/chosen": -283.2926330566406, + "logps/rejected": -474.64202880859375, + "loss": 0.5376, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9600274562835693, + "rewards/margins": 2.743755340576172, + "rewards/rejected": -3.703782558441162, + "step": 67 + }, + { + "epoch": 0.042301710730948676, + "grad_norm": 18.9468936920166, + "learning_rate": 3.4000000000000005e-06, + "logits/chosen": -0.11929792910814285, + "logits/rejected": -0.24849550426006317, + "logps/chosen": -276.6759033203125, + "logps/rejected": -341.380859375, + "loss": 1.027, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.7878270149230957, + "rewards/margins": 0.3675115704536438, + "rewards/rejected": -3.155338764190674, + "step": 68 + }, + { + "epoch": 0.04292379471228616, + "grad_norm": 16.16079330444336, + "learning_rate": 3.45e-06, + "logits/chosen": -0.2587287425994873, + "logits/rejected": -0.2987760603427887, + "logps/chosen": -359.5409851074219, + "logps/rejected": -425.91351318359375, + "loss": 0.8361, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.526947021484375, + "rewards/margins": 0.6344957947731018, + "rewards/rejected": -2.161442756652832, + "step": 69 + }, + { + "epoch": 0.04354587869362364, + "grad_norm": 7.773891448974609, + "learning_rate": 3.5e-06, + "logits/chosen": -0.17838647961616516, + "logits/rejected": -0.27623099088668823, + "logps/chosen": -134.31735229492188, + "logps/rejected": -207.3904266357422, + "loss": 0.4711, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7136498689651489, + "rewards/margins": 1.2838187217712402, + "rewards/rejected": -1.9974687099456787, + "step": 70 + }, + { + "epoch": 0.04416796267496112, + "grad_norm": 12.592650413513184, + "learning_rate": 3.5500000000000003e-06, + "logits/chosen": -0.03479360044002533, + "logits/rejected": -0.09530510008335114, + "logps/chosen": -200.797607421875, + "logps/rejected": -342.7199401855469, + "loss": 0.4076, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2463843822479248, + "rewards/margins": 2.5453548431396484, + "rewards/rejected": -3.791738986968994, + "step": 71 + }, + { + "epoch": 0.0447900466562986, + "grad_norm": 11.650457382202148, + "learning_rate": 3.6000000000000003e-06, + "logits/chosen": -0.34126096963882446, + "logits/rejected": -0.3750931918621063, + "logps/chosen": -372.76300048828125, + "logps/rejected": -436.2012939453125, + "loss": 0.2268, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3615607023239136, + "rewards/margins": 2.6495361328125, + "rewards/rejected": -4.011096477508545, + "step": 72 + }, + { + "epoch": 0.04541213063763608, + "grad_norm": 12.697369575500488, + "learning_rate": 3.65e-06, + "logits/chosen": -0.28850501775741577, + "logits/rejected": -0.3038886487483978, + "logps/chosen": -325.43963623046875, + "logps/rejected": -572.3331909179688, + "loss": 0.4343, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9497873187065125, + "rewards/margins": 2.2582414150238037, + "rewards/rejected": -3.20802903175354, + "step": 73 + }, + { + "epoch": 0.046034214618973564, + "grad_norm": 13.571394920349121, + "learning_rate": 3.7e-06, + "logits/chosen": -0.21433620154857635, + "logits/rejected": -0.303901344537735, + "logps/chosen": -286.341552734375, + "logps/rejected": -303.99688720703125, + "loss": 0.5332, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1739815473556519, + "rewards/margins": 2.0686354637145996, + "rewards/rejected": -3.242616891860962, + "step": 74 + }, + { + "epoch": 0.04665629860031104, + "grad_norm": 9.394055366516113, + "learning_rate": 3.7500000000000005e-06, + "logits/chosen": -0.287506639957428, + "logits/rejected": -0.3159993290901184, + "logps/chosen": -404.67462158203125, + "logps/rejected": -362.9585876464844, + "loss": 0.4001, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.6658294200897217, + "rewards/margins": 1.2909448146820068, + "rewards/rejected": -3.9567742347717285, + "step": 75 + }, + { + "epoch": 0.04727838258164852, + "grad_norm": 8.563478469848633, + "learning_rate": 3.8000000000000005e-06, + "logits/chosen": -0.14961880445480347, + "logits/rejected": -0.2825284004211426, + "logps/chosen": -344.805908203125, + "logps/rejected": -492.7919006347656, + "loss": 0.2251, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.252165675163269, + "rewards/margins": 2.953425884246826, + "rewards/rejected": -4.205591678619385, + "step": 76 + }, + { + "epoch": 0.047900466562986, + "grad_norm": 5.910562038421631, + "learning_rate": 3.85e-06, + "logits/chosen": -0.17163342237472534, + "logits/rejected": -0.294676810503006, + "logps/chosen": -383.13525390625, + "logps/rejected": -623.71875, + "loss": 0.1594, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7410997748374939, + "rewards/margins": 3.5932066440582275, + "rewards/rejected": -4.334306716918945, + "step": 77 + }, + { + "epoch": 0.04852255054432349, + "grad_norm": 11.526333808898926, + "learning_rate": 3.900000000000001e-06, + "logits/chosen": -0.39154335856437683, + "logits/rejected": -0.44125962257385254, + "logps/chosen": -298.5296325683594, + "logps/rejected": -379.02215576171875, + "loss": 0.6187, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5695186853408813, + "rewards/margins": 0.872893214225769, + "rewards/rejected": -1.4424117803573608, + "step": 78 + }, + { + "epoch": 0.049144634525660966, + "grad_norm": 3.5099329948425293, + "learning_rate": 3.95e-06, + "logits/chosen": -0.20980992913246155, + "logits/rejected": -0.30509504675865173, + "logps/chosen": -206.57229614257812, + "logps/rejected": -375.4094543457031, + "loss": 0.1367, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7920137643814087, + "rewards/margins": 2.8831026554107666, + "rewards/rejected": -3.6751160621643066, + "step": 79 + }, + { + "epoch": 0.049766718506998445, + "grad_norm": 13.557099342346191, + "learning_rate": 4.000000000000001e-06, + "logits/chosen": -0.21356728672981262, + "logits/rejected": -0.294747918844223, + "logps/chosen": -413.43890380859375, + "logps/rejected": -546.9957885742188, + "loss": 0.2713, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5689355731010437, + "rewards/margins": 2.0055856704711914, + "rewards/rejected": -2.57452130317688, + "step": 80 + }, + { + "epoch": 0.050388802488335924, + "grad_norm": 3.7775638103485107, + "learning_rate": 4.05e-06, + "logits/chosen": -0.09411117434501648, + "logits/rejected": -0.23960420489311218, + "logps/chosen": -254.82440185546875, + "logps/rejected": -424.4232177734375, + "loss": 0.1103, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23673325777053833, + "rewards/margins": 3.46421480178833, + "rewards/rejected": -3.7009479999542236, + "step": 81 + }, + { + "epoch": 0.0510108864696734, + "grad_norm": 4.780115127563477, + "learning_rate": 4.1e-06, + "logits/chosen": -0.2380412220954895, + "logits/rejected": -0.3345107436180115, + "logps/chosen": -341.47064208984375, + "logps/rejected": -511.33941650390625, + "loss": 0.1228, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9272913932800293, + "rewards/margins": 2.6540956497192383, + "rewards/rejected": -4.581387042999268, + "step": 82 + }, + { + "epoch": 0.05163297045101089, + "grad_norm": 9.37759017944336, + "learning_rate": 4.15e-06, + "logits/chosen": -0.18571540713310242, + "logits/rejected": -0.30302560329437256, + "logps/chosen": -373.7727966308594, + "logps/rejected": -488.87017822265625, + "loss": 0.2612, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.323201060295105, + "rewards/margins": 1.6000735759735107, + "rewards/rejected": -2.923274517059326, + "step": 83 + }, + { + "epoch": 0.05225505443234837, + "grad_norm": 11.258021354675293, + "learning_rate": 4.2000000000000004e-06, + "logits/chosen": -0.21024304628372192, + "logits/rejected": -0.2846536636352539, + "logps/chosen": -212.07369995117188, + "logps/rejected": -434.2860107421875, + "loss": 0.5468, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1689891815185547, + "rewards/margins": 2.4463162422180176, + "rewards/rejected": -3.6153054237365723, + "step": 84 + }, + { + "epoch": 0.05287713841368585, + "grad_norm": 4.400533676147461, + "learning_rate": 4.25e-06, + "logits/chosen": -0.24485322833061218, + "logits/rejected": -0.26796457171440125, + "logps/chosen": -195.15414428710938, + "logps/rejected": -409.75830078125, + "loss": 0.1552, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.28108054399490356, + "rewards/margins": 2.787832736968994, + "rewards/rejected": -3.068913459777832, + "step": 85 + }, + { + "epoch": 0.053499222395023326, + "grad_norm": 5.219532012939453, + "learning_rate": 4.3e-06, + "logits/chosen": -0.2686353623867035, + "logits/rejected": -0.31684648990631104, + "logps/chosen": -341.1234130859375, + "logps/rejected": -423.0592041015625, + "loss": 0.1603, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.44465991854667664, + "rewards/margins": 2.7123236656188965, + "rewards/rejected": -2.2676637172698975, + "step": 86 + }, + { + "epoch": 0.05412130637636081, + "grad_norm": 5.169269561767578, + "learning_rate": 4.350000000000001e-06, + "logits/chosen": -0.21402984857559204, + "logits/rejected": -0.2912907004356384, + "logps/chosen": -182.60205078125, + "logps/rejected": -304.4798583984375, + "loss": 0.1435, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4342317581176758, + "rewards/margins": 3.114931106567383, + "rewards/rejected": -3.5491628646850586, + "step": 87 + }, + { + "epoch": 0.05474339035769829, + "grad_norm": 10.944323539733887, + "learning_rate": 4.4e-06, + "logits/chosen": -0.20865029096603394, + "logits/rejected": -0.1855701506137848, + "logps/chosen": -369.087646484375, + "logps/rejected": -406.01043701171875, + "loss": 0.3274, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8161664009094238, + "rewards/margins": 1.1966300010681152, + "rewards/rejected": -3.012796401977539, + "step": 88 + }, + { + "epoch": 0.05536547433903577, + "grad_norm": 8.107218742370605, + "learning_rate": 4.450000000000001e-06, + "logits/chosen": -0.25534749031066895, + "logits/rejected": -0.3505297601222992, + "logps/chosen": -220.32781982421875, + "logps/rejected": -419.001708984375, + "loss": 0.2747, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0189403295516968, + "rewards/margins": 2.8895504474639893, + "rewards/rejected": -3.9084906578063965, + "step": 89 + }, + { + "epoch": 0.05598755832037325, + "grad_norm": 6.013584136962891, + "learning_rate": 4.5e-06, + "logits/chosen": -0.23414170742034912, + "logits/rejected": -0.3143121004104614, + "logps/chosen": -314.42413330078125, + "logps/rejected": -466.62030029296875, + "loss": 0.1221, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0058807134628296, + "rewards/margins": 3.304506301879883, + "rewards/rejected": -4.310386657714844, + "step": 90 + }, + { + "epoch": 0.05660964230171073, + "grad_norm": 4.858855724334717, + "learning_rate": 4.5500000000000005e-06, + "logits/chosen": -0.23594039678573608, + "logits/rejected": -0.2708371579647064, + "logps/chosen": -296.9461669921875, + "logps/rejected": -347.0289001464844, + "loss": 0.1416, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8973784446716309, + "rewards/margins": 2.2921180725097656, + "rewards/rejected": -3.1894965171813965, + "step": 91 + }, + { + "epoch": 0.05723172628304821, + "grad_norm": 16.984817504882812, + "learning_rate": 4.600000000000001e-06, + "logits/chosen": -0.2350495457649231, + "logits/rejected": -0.27719932794570923, + "logps/chosen": -347.6307373046875, + "logps/rejected": -572.4061279296875, + "loss": 0.4125, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.43612140417099, + "rewards/margins": 1.724344253540039, + "rewards/rejected": -2.160465717315674, + "step": 92 + }, + { + "epoch": 0.05785381026438569, + "grad_norm": 14.711394309997559, + "learning_rate": 4.65e-06, + "logits/chosen": -0.2187137007713318, + "logits/rejected": -0.2728143334388733, + "logps/chosen": -282.34136962890625, + "logps/rejected": -296.1759033203125, + "loss": 0.4731, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9558428525924683, + "rewards/margins": 1.1256113052368164, + "rewards/rejected": -2.081454038619995, + "step": 93 + }, + { + "epoch": 0.05847589424572317, + "grad_norm": 6.850039482116699, + "learning_rate": 4.7e-06, + "logits/chosen": -0.16619889438152313, + "logits/rejected": -0.27166396379470825, + "logps/chosen": -353.1280517578125, + "logps/rejected": -530.3130493164062, + "loss": 0.1538, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.534222960472107, + "rewards/margins": 3.014535665512085, + "rewards/rejected": -4.548758506774902, + "step": 94 + }, + { + "epoch": 0.05909797822706065, + "grad_norm": 6.875, + "learning_rate": 4.75e-06, + "logits/chosen": -0.14405858516693115, + "logits/rejected": -0.30219656229019165, + "logps/chosen": -316.3645324707031, + "logps/rejected": -594.8283081054688, + "loss": 0.0972, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8174395561218262, + "rewards/margins": 5.287438869476318, + "rewards/rejected": -6.104877948760986, + "step": 95 + }, + { + "epoch": 0.059720062208398136, + "grad_norm": 0.6278870701789856, + "learning_rate": 4.800000000000001e-06, + "logits/chosen": -0.17816558480262756, + "logits/rejected": -0.24042931199073792, + "logps/chosen": -447.1346435546875, + "logps/rejected": -545.14013671875, + "loss": 0.0123, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8847008347511292, + "rewards/margins": 5.524404525756836, + "rewards/rejected": -6.409104824066162, + "step": 96 + }, + { + "epoch": 0.060342146189735615, + "grad_norm": 2.6970157623291016, + "learning_rate": 4.85e-06, + "logits/chosen": -0.25803327560424805, + "logits/rejected": -0.32198384404182434, + "logps/chosen": -285.0729675292969, + "logps/rejected": -432.37042236328125, + "loss": 0.142, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6657625436782837, + "rewards/margins": 2.6728787422180176, + "rewards/rejected": -4.338641166687012, + "step": 97 + }, + { + "epoch": 0.060964230171073094, + "grad_norm": 3.8611464500427246, + "learning_rate": 4.9000000000000005e-06, + "logits/chosen": -0.1978822946548462, + "logits/rejected": -0.25351041555404663, + "logps/chosen": -217.11026000976562, + "logps/rejected": -345.26397705078125, + "loss": 0.148, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0106687545776367, + "rewards/margins": 2.3796229362487793, + "rewards/rejected": -3.390291690826416, + "step": 98 + }, + { + "epoch": 0.06158631415241057, + "grad_norm": 15.627909660339355, + "learning_rate": 4.95e-06, + "logits/chosen": -0.20255884528160095, + "logits/rejected": -0.21642224490642548, + "logps/chosen": -490.784912109375, + "logps/rejected": -420.4334411621094, + "loss": 0.672, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.8641340732574463, + "rewards/margins": 1.5182538032531738, + "rewards/rejected": -4.382388114929199, + "step": 99 + }, + { + "epoch": 0.06220839813374806, + "grad_norm": 17.873910903930664, + "learning_rate": 5e-06, + "logits/chosen": -0.2238234281539917, + "logits/rejected": -0.2846287488937378, + "logps/chosen": -475.9477844238281, + "logps/rejected": -546.1002197265625, + "loss": 0.5868, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.902486801147461, + "rewards/margins": 1.5364171266555786, + "rewards/rejected": -3.438904047012329, + "step": 100 + }, + { + "epoch": 0.06283048211508553, + "grad_norm": 4.983588218688965, + "learning_rate": 4.994444444444445e-06, + "logits/chosen": -0.1988169401884079, + "logits/rejected": -0.25054481625556946, + "logps/chosen": -334.4656982421875, + "logps/rejected": -437.20458984375, + "loss": 0.125, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.479587197303772, + "rewards/margins": 2.7294890880584717, + "rewards/rejected": -4.209076404571533, + "step": 101 + }, + { + "epoch": 0.06345256609642301, + "grad_norm": 4.785112380981445, + "learning_rate": 4.988888888888889e-06, + "logits/chosen": -0.10407230257987976, + "logits/rejected": -0.3203573524951935, + "logps/chosen": -171.38693237304688, + "logps/rejected": -524.530517578125, + "loss": 0.1056, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8518204689025879, + "rewards/margins": 4.632033824920654, + "rewards/rejected": -5.483854293823242, + "step": 102 + }, + { + "epoch": 0.0640746500777605, + "grad_norm": 6.751524925231934, + "learning_rate": 4.983333333333334e-06, + "logits/chosen": -0.1599581390619278, + "logits/rejected": -0.2534915506839752, + "logps/chosen": -299.6903381347656, + "logps/rejected": -550.2454223632812, + "loss": 0.2099, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0283139944076538, + "rewards/margins": 3.365248680114746, + "rewards/rejected": -4.3935627937316895, + "step": 103 + }, + { + "epoch": 0.06469673405909798, + "grad_norm": 11.333843231201172, + "learning_rate": 4.977777777777778e-06, + "logits/chosen": -0.27100732922554016, + "logits/rejected": -0.30968937277793884, + "logps/chosen": -342.60205078125, + "logps/rejected": -402.06591796875, + "loss": 0.4281, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.255129814147949, + "rewards/margins": 3.032567024230957, + "rewards/rejected": -5.287696838378906, + "step": 104 + }, + { + "epoch": 0.06531881804043546, + "grad_norm": 11.293391227722168, + "learning_rate": 4.9722222222222224e-06, + "logits/chosen": -0.14339832961559296, + "logits/rejected": -0.20296230912208557, + "logps/chosen": -373.05010986328125, + "logps/rejected": -415.92462158203125, + "loss": 0.5317, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.342689037322998, + "rewards/margins": 2.439150333404541, + "rewards/rejected": -4.781839370727539, + "step": 105 + }, + { + "epoch": 0.06594090202177294, + "grad_norm": 2.5984058380126953, + "learning_rate": 4.966666666666667e-06, + "logits/chosen": -0.2143072932958603, + "logits/rejected": -0.25502657890319824, + "logps/chosen": -149.13711547851562, + "logps/rejected": -309.04296875, + "loss": 0.0938, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3711795806884766, + "rewards/margins": 3.4287259578704834, + "rewards/rejected": -4.799905776977539, + "step": 106 + }, + { + "epoch": 0.06656298600311042, + "grad_norm": 6.106658458709717, + "learning_rate": 4.961111111111111e-06, + "logits/chosen": -0.10602514445781708, + "logits/rejected": -0.13551323115825653, + "logps/chosen": -525.9331665039062, + "logps/rejected": -577.5033569335938, + "loss": 0.161, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7058937549591064, + "rewards/margins": 3.909100294113159, + "rewards/rejected": -6.614993572235107, + "step": 107 + }, + { + "epoch": 0.0671850699844479, + "grad_norm": 11.973894119262695, + "learning_rate": 4.9555555555555565e-06, + "logits/chosen": -0.24562285840511322, + "logits/rejected": -0.2282891720533371, + "logps/chosen": -422.45013427734375, + "logps/rejected": -468.3897705078125, + "loss": 0.4039, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4353492259979248, + "rewards/margins": 1.3717682361602783, + "rewards/rejected": -2.807117462158203, + "step": 108 + }, + { + "epoch": 0.06780715396578538, + "grad_norm": 12.917905807495117, + "learning_rate": 4.95e-06, + "logits/chosen": -0.22428634762763977, + "logits/rejected": -0.23714569211006165, + "logps/chosen": -510.07843017578125, + "logps/rejected": -511.8265380859375, + "loss": 0.4683, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.143575668334961, + "rewards/margins": 1.4074361324310303, + "rewards/rejected": -4.55101203918457, + "step": 109 + }, + { + "epoch": 0.06842923794712286, + "grad_norm": 5.616114616394043, + "learning_rate": 4.944444444444445e-06, + "logits/chosen": -0.2211325466632843, + "logits/rejected": -0.2838842272758484, + "logps/chosen": -322.81134033203125, + "logps/rejected": -471.2586669921875, + "loss": 0.1559, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.775026798248291, + "rewards/margins": 3.691680908203125, + "rewards/rejected": -5.466707706451416, + "step": 110 + }, + { + "epoch": 0.06905132192846034, + "grad_norm": 15.268199920654297, + "learning_rate": 4.938888888888889e-06, + "logits/chosen": -0.13068610429763794, + "logits/rejected": -0.2577150762081146, + "logps/chosen": -378.6321105957031, + "logps/rejected": -538.8584594726562, + "loss": 0.6168, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.5889687538146973, + "rewards/margins": 4.1271257400512695, + "rewards/rejected": -6.716094493865967, + "step": 111 + }, + { + "epoch": 0.06967340590979783, + "grad_norm": 13.674901008605957, + "learning_rate": 4.933333333333334e-06, + "logits/chosen": -0.24035942554473877, + "logits/rejected": -0.2908879518508911, + "logps/chosen": -309.5364990234375, + "logps/rejected": -473.10308837890625, + "loss": 0.3822, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.274183511734009, + "rewards/margins": 3.3288121223449707, + "rewards/rejected": -5.602994918823242, + "step": 112 + }, + { + "epoch": 0.07029548989113531, + "grad_norm": 1.7368253469467163, + "learning_rate": 4.927777777777778e-06, + "logits/chosen": -0.05762298032641411, + "logits/rejected": -0.171758234500885, + "logps/chosen": -240.3313751220703, + "logps/rejected": -460.18438720703125, + "loss": 0.0497, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.264184594154358, + "rewards/margins": 4.703076362609863, + "rewards/rejected": -5.96726131439209, + "step": 113 + }, + { + "epoch": 0.07091757387247279, + "grad_norm": 7.406444549560547, + "learning_rate": 4.922222222222223e-06, + "logits/chosen": -0.1805172562599182, + "logits/rejected": -0.23107612133026123, + "logps/chosen": -176.66012573242188, + "logps/rejected": -250.7313232421875, + "loss": 0.3586, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4181822538375854, + "rewards/margins": 2.411172389984131, + "rewards/rejected": -3.8293545246124268, + "step": 114 + }, + { + "epoch": 0.07153965785381027, + "grad_norm": 44.6976432800293, + "learning_rate": 4.9166666666666665e-06, + "logits/chosen": -0.10437363386154175, + "logits/rejected": -0.228228360414505, + "logps/chosen": -445.7233581542969, + "logps/rejected": -582.7456665039062, + "loss": 0.3045, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.72880220413208, + "rewards/margins": 4.351573944091797, + "rewards/rejected": -6.080376625061035, + "step": 115 + }, + { + "epoch": 0.07216174183514774, + "grad_norm": 1.8268243074417114, + "learning_rate": 4.911111111111112e-06, + "logits/chosen": -0.20171529054641724, + "logits/rejected": -0.2554078698158264, + "logps/chosen": -223.3256378173828, + "logps/rejected": -354.04290771484375, + "loss": 0.0815, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9956240653991699, + "rewards/margins": 3.8957948684692383, + "rewards/rejected": -4.891418933868408, + "step": 116 + }, + { + "epoch": 0.07278382581648522, + "grad_norm": 7.928267955780029, + "learning_rate": 4.905555555555556e-06, + "logits/chosen": -0.1302953064441681, + "logits/rejected": -0.21209967136383057, + "logps/chosen": -291.5841979980469, + "logps/rejected": -440.70166015625, + "loss": 0.1919, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7437853813171387, + "rewards/margins": 2.6985244750976562, + "rewards/rejected": -4.442309856414795, + "step": 117 + }, + { + "epoch": 0.0734059097978227, + "grad_norm": 7.725869178771973, + "learning_rate": 4.9000000000000005e-06, + "logits/chosen": -0.09175892919301987, + "logits/rejected": -0.17885896563529968, + "logps/chosen": -368.66619873046875, + "logps/rejected": -568.918212890625, + "loss": 0.2291, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7985304594039917, + "rewards/margins": 4.037115097045898, + "rewards/rejected": -4.8356451988220215, + "step": 118 + }, + { + "epoch": 0.07402799377916018, + "grad_norm": 2.1717803478240967, + "learning_rate": 4.894444444444445e-06, + "logits/chosen": -0.13691681623458862, + "logits/rejected": -0.28039994835853577, + "logps/chosen": -174.56939697265625, + "logps/rejected": -457.9192810058594, + "loss": 0.1196, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1842372417449951, + "rewards/margins": 5.027883529663086, + "rewards/rejected": -6.212120532989502, + "step": 119 + }, + { + "epoch": 0.07465007776049767, + "grad_norm": 1.247185230255127, + "learning_rate": 4.888888888888889e-06, + "logits/chosen": -0.10234531760215759, + "logits/rejected": -0.20982712507247925, + "logps/chosen": -183.46261596679688, + "logps/rejected": -486.98736572265625, + "loss": 0.0468, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.57109797000885, + "rewards/margins": 6.880401611328125, + "rewards/rejected": -8.451499938964844, + "step": 120 + }, + { + "epoch": 0.07527216174183515, + "grad_norm": 11.165543556213379, + "learning_rate": 4.883333333333334e-06, + "logits/chosen": -0.18571849167346954, + "logits/rejected": -0.20858336985111237, + "logps/chosen": -414.947998046875, + "logps/rejected": -488.66241455078125, + "loss": 0.4864, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.0048108100891113, + "rewards/margins": 2.7042276859283447, + "rewards/rejected": -5.709038734436035, + "step": 121 + }, + { + "epoch": 0.07589424572317263, + "grad_norm": 3.8685173988342285, + "learning_rate": 4.877777777777778e-06, + "logits/chosen": -0.17421385645866394, + "logits/rejected": -0.2196323573589325, + "logps/chosen": -273.59783935546875, + "logps/rejected": -387.1546936035156, + "loss": 0.2151, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.501394510269165, + "rewards/margins": 3.6947269439697266, + "rewards/rejected": -5.1961212158203125, + "step": 122 + }, + { + "epoch": 0.07651632970451011, + "grad_norm": 9.008697509765625, + "learning_rate": 4.8722222222222225e-06, + "logits/chosen": -0.17155633866786957, + "logits/rejected": -0.26579058170318604, + "logps/chosen": -475.8711242675781, + "logps/rejected": -499.34820556640625, + "loss": 0.2535, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.823632001876831, + "rewards/margins": 3.0770883560180664, + "rewards/rejected": -5.900720596313477, + "step": 123 + }, + { + "epoch": 0.07713841368584759, + "grad_norm": 0.680228054523468, + "learning_rate": 4.866666666666667e-06, + "logits/chosen": -0.1389962136745453, + "logits/rejected": -0.2582349479198456, + "logps/chosen": -114.04161834716797, + "logps/rejected": -341.9712829589844, + "loss": 0.0148, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5968825221061707, + "rewards/margins": 5.732479095458984, + "rewards/rejected": -6.3293609619140625, + "step": 124 + }, + { + "epoch": 0.07776049766718507, + "grad_norm": 6.679741382598877, + "learning_rate": 4.861111111111111e-06, + "logits/chosen": -0.14296233654022217, + "logits/rejected": -0.21161624789237976, + "logps/chosen": -243.76454162597656, + "logps/rejected": -322.19720458984375, + "loss": 0.1759, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.231826066970825, + "rewards/margins": 3.7048392295837402, + "rewards/rejected": -5.936664581298828, + "step": 125 + }, + { + "epoch": 0.07838258164852255, + "grad_norm": 6.282657623291016, + "learning_rate": 4.855555555555556e-06, + "logits/chosen": -0.2014608234167099, + "logits/rejected": -0.24732531607151031, + "logps/chosen": -287.5174560546875, + "logps/rejected": -479.3836364746094, + "loss": 0.1198, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5593644380569458, + "rewards/margins": 3.4355759620666504, + "rewards/rejected": -4.994940757751465, + "step": 126 + }, + { + "epoch": 0.07900466562986003, + "grad_norm": 8.29532527923584, + "learning_rate": 4.85e-06, + "logits/chosen": -0.3388126492500305, + "logits/rejected": -0.3605138063430786, + "logps/chosen": -380.97430419921875, + "logps/rejected": -344.24200439453125, + "loss": 0.2373, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0343973636627197, + "rewards/margins": 2.3308024406433105, + "rewards/rejected": -4.365200042724609, + "step": 127 + }, + { + "epoch": 0.0796267496111975, + "grad_norm": 16.32593536376953, + "learning_rate": 4.8444444444444446e-06, + "logits/chosen": -0.17634262144565582, + "logits/rejected": -0.2938919961452484, + "logps/chosen": -382.5877990722656, + "logps/rejected": -474.0108337402344, + "loss": 0.6278, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7991302013397217, + "rewards/margins": 2.771221160888672, + "rewards/rejected": -4.570351600646973, + "step": 128 + }, + { + "epoch": 0.080248833592535, + "grad_norm": 21.102914810180664, + "learning_rate": 4.838888888888889e-06, + "logits/chosen": -0.16238312423229218, + "logits/rejected": -0.1855742186307907, + "logps/chosen": -538.90087890625, + "logps/rejected": -541.1488037109375, + "loss": 0.5282, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.8266429901123047, + "rewards/margins": 3.2426528930664062, + "rewards/rejected": -7.069295883178711, + "step": 129 + }, + { + "epoch": 0.08087091757387248, + "grad_norm": 4.7855753898620605, + "learning_rate": 4.833333333333333e-06, + "logits/chosen": -0.030430622398853302, + "logits/rejected": -0.2926279306411743, + "logps/chosen": -166.90972900390625, + "logps/rejected": -505.212646484375, + "loss": 0.0999, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5429767370224, + "rewards/margins": 5.44045352935791, + "rewards/rejected": -6.983429908752441, + "step": 130 + }, + { + "epoch": 0.08149300155520996, + "grad_norm": 18.346792221069336, + "learning_rate": 4.827777777777778e-06, + "logits/chosen": -0.2579612135887146, + "logits/rejected": -0.28094470500946045, + "logps/chosen": -462.4564208984375, + "logps/rejected": -564.3174438476562, + "loss": 0.5206, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.2043673992156982, + "rewards/margins": 2.712836980819702, + "rewards/rejected": -5.9172043800354, + "step": 131 + }, + { + "epoch": 0.08211508553654744, + "grad_norm": 4.479557037353516, + "learning_rate": 4.822222222222222e-06, + "logits/chosen": -0.19502291083335876, + "logits/rejected": -0.29779985547065735, + "logps/chosen": -251.22531127929688, + "logps/rejected": -503.818359375, + "loss": 0.1025, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.446255087852478, + "rewards/margins": 6.307229042053223, + "rewards/rejected": -7.75348424911499, + "step": 132 + }, + { + "epoch": 0.08273716951788491, + "grad_norm": 2.294543743133545, + "learning_rate": 4.816666666666667e-06, + "logits/chosen": -0.14516718685626984, + "logits/rejected": -0.2781965732574463, + "logps/chosen": -96.27398681640625, + "logps/rejected": -294.77984619140625, + "loss": 0.0897, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6634706258773804, + "rewards/margins": 4.056285858154297, + "rewards/rejected": -4.719756603240967, + "step": 133 + }, + { + "epoch": 0.0833592534992224, + "grad_norm": 6.4777703285217285, + "learning_rate": 4.811111111111111e-06, + "logits/chosen": -0.243896484375, + "logits/rejected": -0.2945478856563568, + "logps/chosen": -437.3977355957031, + "logps/rejected": -543.8512573242188, + "loss": 0.0902, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7442679405212402, + "rewards/margins": 5.306023597717285, + "rewards/rejected": -9.050291061401367, + "step": 134 + }, + { + "epoch": 0.08398133748055987, + "grad_norm": 6.514171123504639, + "learning_rate": 4.805555555555556e-06, + "logits/chosen": -0.17173555493354797, + "logits/rejected": -0.2956388592720032, + "logps/chosen": -537.8055419921875, + "logps/rejected": -529.9713134765625, + "loss": 0.1467, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.801835536956787, + "rewards/margins": 4.990110874176025, + "rewards/rejected": -7.7919464111328125, + "step": 135 + }, + { + "epoch": 0.08460342146189735, + "grad_norm": 1.3634387254714966, + "learning_rate": 4.800000000000001e-06, + "logits/chosen": -0.17122681438922882, + "logits/rejected": -0.2315588891506195, + "logps/chosen": -323.12158203125, + "logps/rejected": -460.4524230957031, + "loss": 0.1189, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.0333783626556396, + "rewards/margins": 4.3034186363220215, + "rewards/rejected": -7.336796760559082, + "step": 136 + }, + { + "epoch": 0.08522550544323483, + "grad_norm": 1.2580617666244507, + "learning_rate": 4.794444444444445e-06, + "logits/chosen": -0.26822876930236816, + "logits/rejected": -0.28947991132736206, + "logps/chosen": -373.5331115722656, + "logps/rejected": -471.1632995605469, + "loss": 0.0313, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1304267644882202, + "rewards/margins": 5.299930095672607, + "rewards/rejected": -6.430357456207275, + "step": 137 + }, + { + "epoch": 0.08584758942457232, + "grad_norm": 11.069660186767578, + "learning_rate": 4.7888888888888894e-06, + "logits/chosen": -0.1682964563369751, + "logits/rejected": -0.23169651627540588, + "logps/chosen": -562.1502685546875, + "logps/rejected": -655.3009643554688, + "loss": 0.2316, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.7436795234680176, + "rewards/margins": 4.400881290435791, + "rewards/rejected": -7.14456033706665, + "step": 138 + }, + { + "epoch": 0.0864696734059098, + "grad_norm": 5.9546122550964355, + "learning_rate": 4.783333333333334e-06, + "logits/chosen": -0.12826819717884064, + "logits/rejected": -0.2625492215156555, + "logps/chosen": -294.7453308105469, + "logps/rejected": -527.5718383789062, + "loss": 0.1228, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7684245109558105, + "rewards/margins": 5.07581901550293, + "rewards/rejected": -7.844243049621582, + "step": 139 + }, + { + "epoch": 0.08709175738724728, + "grad_norm": 5.184479236602783, + "learning_rate": 4.777777777777778e-06, + "logits/chosen": -0.17372861504554749, + "logits/rejected": -0.26511266827583313, + "logps/chosen": -280.52825927734375, + "logps/rejected": -447.27142333984375, + "loss": 0.0994, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.048677921295166, + "rewards/margins": 4.461150169372559, + "rewards/rejected": -6.509828090667725, + "step": 140 + }, + { + "epoch": 0.08771384136858476, + "grad_norm": 4.141274929046631, + "learning_rate": 4.772222222222223e-06, + "logits/chosen": -0.16845634579658508, + "logits/rejected": -0.29320311546325684, + "logps/chosen": -268.65631103515625, + "logps/rejected": -420.47967529296875, + "loss": 0.1174, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.2053756713867188, + "rewards/margins": 4.919946193695068, + "rewards/rejected": -7.125322341918945, + "step": 141 + }, + { + "epoch": 0.08833592534992224, + "grad_norm": 2.283074140548706, + "learning_rate": 4.766666666666667e-06, + "logits/chosen": -0.18765391409397125, + "logits/rejected": -0.28243622183799744, + "logps/chosen": -257.04449462890625, + "logps/rejected": -430.07550048828125, + "loss": 0.0575, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1766958236694336, + "rewards/margins": 4.0752668380737305, + "rewards/rejected": -6.2519636154174805, + "step": 142 + }, + { + "epoch": 0.08895800933125972, + "grad_norm": 11.20405101776123, + "learning_rate": 4.7611111111111115e-06, + "logits/chosen": -0.1861506700515747, + "logits/rejected": -0.2246856838464737, + "logps/chosen": -468.6060485839844, + "logps/rejected": -439.8089294433594, + "loss": 0.1305, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8717777729034424, + "rewards/margins": 4.549718856811523, + "rewards/rejected": -6.421496868133545, + "step": 143 + }, + { + "epoch": 0.0895800933125972, + "grad_norm": 22.221357345581055, + "learning_rate": 4.755555555555556e-06, + "logits/chosen": -0.18623599410057068, + "logits/rejected": -0.32057708501815796, + "logps/chosen": -326.24896240234375, + "logps/rejected": -425.0649108886719, + "loss": 0.5203, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.6106276512145996, + "rewards/margins": 3.432828903198242, + "rewards/rejected": -7.043456077575684, + "step": 144 + }, + { + "epoch": 0.09020217729393468, + "grad_norm": 1.2252581119537354, + "learning_rate": 4.75e-06, + "logits/chosen": -0.18227611482143402, + "logits/rejected": -0.27876192331314087, + "logps/chosen": -112.36376190185547, + "logps/rejected": -318.92730712890625, + "loss": 0.0571, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.167729377746582, + "rewards/margins": 4.201499938964844, + "rewards/rejected": -5.369229316711426, + "step": 145 + }, + { + "epoch": 0.09082426127527216, + "grad_norm": 2.9624807834625244, + "learning_rate": 4.744444444444445e-06, + "logits/chosen": -0.1380217969417572, + "logits/rejected": -0.2180052101612091, + "logps/chosen": -318.90777587890625, + "logps/rejected": -419.510986328125, + "loss": 0.1331, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.5582544803619385, + "rewards/margins": 5.260533332824707, + "rewards/rejected": -7.818788051605225, + "step": 146 + }, + { + "epoch": 0.09144634525660965, + "grad_norm": 3.35735821723938, + "learning_rate": 4.73888888888889e-06, + "logits/chosen": -0.10039821267127991, + "logits/rejected": -0.2074279934167862, + "logps/chosen": -195.84390258789062, + "logps/rejected": -377.0635986328125, + "loss": 0.1413, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.192751884460449, + "rewards/margins": 4.119298934936523, + "rewards/rejected": -6.312050819396973, + "step": 147 + }, + { + "epoch": 0.09206842923794713, + "grad_norm": 1.7821208238601685, + "learning_rate": 4.7333333333333335e-06, + "logits/chosen": -0.1955682635307312, + "logits/rejected": -0.35646411776542664, + "logps/chosen": -308.23382568359375, + "logps/rejected": -549.1272583007812, + "loss": 0.0588, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7741739749908447, + "rewards/margins": 5.633298873901367, + "rewards/rejected": -7.407472610473633, + "step": 148 + }, + { + "epoch": 0.0926905132192846, + "grad_norm": 0.562885582447052, + "learning_rate": 4.727777777777779e-06, + "logits/chosen": -0.06650157272815704, + "logits/rejected": -0.15252648293972015, + "logps/chosen": -499.4558410644531, + "logps/rejected": -563.2388305664062, + "loss": 0.0119, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.998863697052002, + "rewards/margins": 5.443285942077637, + "rewards/rejected": -7.4421491622924805, + "step": 149 + }, + { + "epoch": 0.09331259720062209, + "grad_norm": 2.5517399311065674, + "learning_rate": 4.722222222222222e-06, + "logits/chosen": -0.26251065731048584, + "logits/rejected": -0.2258976399898529, + "logps/chosen": -424.430908203125, + "logps/rejected": -492.6870422363281, + "loss": 0.0582, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5834293365478516, + "rewards/margins": 3.7995922565460205, + "rewards/rejected": -6.383021831512451, + "step": 150 + }, + { + "epoch": 0.09393468118195956, + "grad_norm": 13.786626815795898, + "learning_rate": 4.7166666666666675e-06, + "logits/chosen": -0.2045581042766571, + "logits/rejected": -0.17142236232757568, + "logps/chosen": -497.0528564453125, + "logps/rejected": -367.04693603515625, + "loss": 0.2414, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.208629846572876, + "rewards/margins": 3.262070894241333, + "rewards/rejected": -5.470700263977051, + "step": 151 + }, + { + "epoch": 0.09455676516329704, + "grad_norm": 0.7916446924209595, + "learning_rate": 4.711111111111111e-06, + "logits/chosen": -0.16975180804729462, + "logits/rejected": -0.28608939051628113, + "logps/chosen": -225.3519287109375, + "logps/rejected": -404.1348876953125, + "loss": 0.019, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0104199647903442, + "rewards/margins": 5.052133560180664, + "rewards/rejected": -6.062553882598877, + "step": 152 + }, + { + "epoch": 0.09517884914463452, + "grad_norm": 3.420166254043579, + "learning_rate": 4.705555555555556e-06, + "logits/chosen": -0.017493009567260742, + "logits/rejected": -0.09905597567558289, + "logps/chosen": -368.28546142578125, + "logps/rejected": -488.625, + "loss": 0.06, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0510106086730957, + "rewards/margins": 5.072464466094971, + "rewards/rejected": -8.123475074768066, + "step": 153 + }, + { + "epoch": 0.095800933125972, + "grad_norm": 2.0165693759918213, + "learning_rate": 4.7e-06, + "logits/chosen": -0.2181408703327179, + "logits/rejected": -0.28840434551239014, + "logps/chosen": -396.58148193359375, + "logps/rejected": -581.587646484375, + "loss": 0.0327, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9049289226531982, + "rewards/margins": 6.520705699920654, + "rewards/rejected": -8.425634384155273, + "step": 154 + }, + { + "epoch": 0.09642301710730948, + "grad_norm": 10.916274070739746, + "learning_rate": 4.694444444444445e-06, + "logits/chosen": -0.055173471570014954, + "logits/rejected": -0.22513294219970703, + "logps/chosen": -197.7305908203125, + "logps/rejected": -414.687255859375, + "loss": 0.3746, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.9097092151641846, + "rewards/margins": 4.763751983642578, + "rewards/rejected": -7.6734619140625, + "step": 155 + }, + { + "epoch": 0.09704510108864697, + "grad_norm": 2.832921028137207, + "learning_rate": 4.6888888888888895e-06, + "logits/chosen": -0.19660522043704987, + "logits/rejected": -0.29034486413002014, + "logps/chosen": -513.6806640625, + "logps/rejected": -665.4411010742188, + "loss": 0.0417, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.248539924621582, + "rewards/margins": 4.340322017669678, + "rewards/rejected": -7.588862419128418, + "step": 156 + }, + { + "epoch": 0.09766718506998445, + "grad_norm": 14.057051658630371, + "learning_rate": 4.683333333333334e-06, + "logits/chosen": -0.18056970834732056, + "logits/rejected": -0.19880186021327972, + "logps/chosen": -251.38540649414062, + "logps/rejected": -432.09136962890625, + "loss": 0.4259, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.90017831325531, + "rewards/margins": 5.152604103088379, + "rewards/rejected": -7.052783012390137, + "step": 157 + }, + { + "epoch": 0.09828926905132193, + "grad_norm": 6.243239402770996, + "learning_rate": 4.677777777777778e-06, + "logits/chosen": -0.09316147118806839, + "logits/rejected": -0.18470972776412964, + "logps/chosen": -291.03253173828125, + "logps/rejected": -628.9842529296875, + "loss": 0.1776, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9024248123168945, + "rewards/margins": 5.477657318115234, + "rewards/rejected": -7.380082130432129, + "step": 158 + }, + { + "epoch": 0.09891135303265941, + "grad_norm": 1.0844134092330933, + "learning_rate": 4.672222222222223e-06, + "logits/chosen": -0.25109702348709106, + "logits/rejected": -0.3132866621017456, + "logps/chosen": -164.98486328125, + "logps/rejected": -363.2279052734375, + "loss": 0.0288, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8861300945281982, + "rewards/margins": 4.472002029418945, + "rewards/rejected": -6.358132362365723, + "step": 159 + }, + { + "epoch": 0.09953343701399689, + "grad_norm": 1.1472610235214233, + "learning_rate": 4.666666666666667e-06, + "logits/chosen": -0.07884544134140015, + "logits/rejected": -0.21350839734077454, + "logps/chosen": -384.11346435546875, + "logps/rejected": -510.78106689453125, + "loss": 0.0202, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2491915225982666, + "rewards/margins": 6.51008415222168, + "rewards/rejected": -9.759275436401367, + "step": 160 + }, + { + "epoch": 0.10015552099533437, + "grad_norm": 3.7338738441467285, + "learning_rate": 4.6611111111111116e-06, + "logits/chosen": -0.14186826348304749, + "logits/rejected": -0.20403054356575012, + "logps/chosen": -539.4088134765625, + "logps/rejected": -602.560302734375, + "loss": 0.0396, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4082674980163574, + "rewards/margins": 6.096379280090332, + "rewards/rejected": -9.504646301269531, + "step": 161 + }, + { + "epoch": 0.10077760497667185, + "grad_norm": 0.4819898307323456, + "learning_rate": 4.655555555555556e-06, + "logits/chosen": -0.18916434049606323, + "logits/rejected": -0.32843708992004395, + "logps/chosen": -401.46014404296875, + "logps/rejected": -627.66845703125, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4642140865325928, + "rewards/margins": 7.193984031677246, + "rewards/rejected": -10.658198356628418, + "step": 162 + }, + { + "epoch": 0.10139968895800933, + "grad_norm": 15.747608184814453, + "learning_rate": 4.65e-06, + "logits/chosen": -0.19415457546710968, + "logits/rejected": -0.25259146094322205, + "logps/chosen": -603.2037353515625, + "logps/rejected": -695.6295776367188, + "loss": 0.4917, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.199634552001953, + "rewards/margins": 2.497157573699951, + "rewards/rejected": -7.696791648864746, + "step": 163 + }, + { + "epoch": 0.1020217729393468, + "grad_norm": 1.7518179416656494, + "learning_rate": 4.644444444444445e-06, + "logits/chosen": -0.19639956951141357, + "logits/rejected": -0.25594067573547363, + "logps/chosen": -352.10076904296875, + "logps/rejected": -423.2796630859375, + "loss": 0.0609, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.308328866958618, + "rewards/margins": 4.884799957275391, + "rewards/rejected": -8.19312858581543, + "step": 164 + }, + { + "epoch": 0.1026438569206843, + "grad_norm": 4.809370517730713, + "learning_rate": 4.638888888888889e-06, + "logits/chosen": -0.008531246334314346, + "logits/rejected": -0.071965292096138, + "logps/chosen": -316.29931640625, + "logps/rejected": -454.67376708984375, + "loss": 0.1241, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4439988136291504, + "rewards/margins": 5.598569869995117, + "rewards/rejected": -9.042569160461426, + "step": 165 + }, + { + "epoch": 0.10326594090202178, + "grad_norm": 4.295832633972168, + "learning_rate": 4.633333333333334e-06, + "logits/chosen": -0.19534152746200562, + "logits/rejected": -0.268149733543396, + "logps/chosen": -324.1209716796875, + "logps/rejected": -526.9644775390625, + "loss": 0.0718, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9153637886047363, + "rewards/margins": 5.440799713134766, + "rewards/rejected": -8.356163024902344, + "step": 166 + }, + { + "epoch": 0.10388802488335926, + "grad_norm": 15.964556694030762, + "learning_rate": 4.627777777777778e-06, + "logits/chosen": -0.12884372472763062, + "logits/rejected": -0.27335333824157715, + "logps/chosen": -426.1744079589844, + "logps/rejected": -733.6082763671875, + "loss": 0.2553, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.257850646972656, + "rewards/margins": 6.855635643005371, + "rewards/rejected": -11.113487243652344, + "step": 167 + }, + { + "epoch": 0.10451010886469674, + "grad_norm": 3.5890421867370605, + "learning_rate": 4.622222222222222e-06, + "logits/chosen": -0.14316433668136597, + "logits/rejected": -0.1956721991300583, + "logps/chosen": -382.98974609375, + "logps/rejected": -449.3462829589844, + "loss": 0.0428, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.330122470855713, + "rewards/margins": 4.704492568969727, + "rewards/rejected": -8.034614562988281, + "step": 168 + }, + { + "epoch": 0.10513219284603421, + "grad_norm": 0.6975039839744568, + "learning_rate": 4.616666666666667e-06, + "logits/chosen": -0.04458482563495636, + "logits/rejected": -0.18471689522266388, + "logps/chosen": -262.11895751953125, + "logps/rejected": -552.9098510742188, + "loss": 0.0185, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.146766185760498, + "rewards/margins": 5.831897258758545, + "rewards/rejected": -8.97866439819336, + "step": 169 + }, + { + "epoch": 0.1057542768273717, + "grad_norm": 2.0358004570007324, + "learning_rate": 4.611111111111112e-06, + "logits/chosen": -0.12608499825000763, + "logits/rejected": -0.14444072544574738, + "logps/chosen": -428.214111328125, + "logps/rejected": -482.06390380859375, + "loss": 0.0384, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.520082712173462, + "rewards/margins": 5.4471235275268555, + "rewards/rejected": -7.967206001281738, + "step": 170 + }, + { + "epoch": 0.10637636080870917, + "grad_norm": 2.419172525405884, + "learning_rate": 4.605555555555556e-06, + "logits/chosen": -0.0495135560631752, + "logits/rejected": -0.1777045875787735, + "logps/chosen": -331.12249755859375, + "logps/rejected": -542.3942260742188, + "loss": 0.0316, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.795691967010498, + "rewards/margins": 5.345503330230713, + "rewards/rejected": -8.141195297241211, + "step": 171 + }, + { + "epoch": 0.10699844479004665, + "grad_norm": 19.099863052368164, + "learning_rate": 4.600000000000001e-06, + "logits/chosen": -0.18272532522678375, + "logits/rejected": -0.22684544324874878, + "logps/chosen": -405.3909606933594, + "logps/rejected": -479.1272277832031, + "loss": 0.5975, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.808656215667725, + "rewards/margins": 5.297513961791992, + "rewards/rejected": -10.106170654296875, + "step": 172 + }, + { + "epoch": 0.10762052877138413, + "grad_norm": 3.792548656463623, + "learning_rate": 4.594444444444444e-06, + "logits/chosen": -0.1565266251564026, + "logits/rejected": -0.24974589049816132, + "logps/chosen": -447.9469299316406, + "logps/rejected": -582.8763427734375, + "loss": 0.0528, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.227704048156738, + "rewards/margins": 7.335070610046387, + "rewards/rejected": -11.562774658203125, + "step": 173 + }, + { + "epoch": 0.10824261275272162, + "grad_norm": 3.8072543144226074, + "learning_rate": 4.58888888888889e-06, + "logits/chosen": -0.15028131008148193, + "logits/rejected": -0.21350513398647308, + "logps/chosen": -241.8307342529297, + "logps/rejected": -496.02142333984375, + "loss": 0.1111, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8228719234466553, + "rewards/margins": 5.835753440856934, + "rewards/rejected": -7.658625602722168, + "step": 174 + }, + { + "epoch": 0.1088646967340591, + "grad_norm": 6.172601699829102, + "learning_rate": 4.583333333333333e-06, + "logits/chosen": -0.17400690913200378, + "logits/rejected": -0.2729160785675049, + "logps/chosen": -129.79576110839844, + "logps/rejected": -327.24365234375, + "loss": 0.1614, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8090678453445435, + "rewards/margins": 5.869539260864258, + "rewards/rejected": -7.678607940673828, + "step": 175 + }, + { + "epoch": 0.10948678071539658, + "grad_norm": 19.116458892822266, + "learning_rate": 4.5777777777777785e-06, + "logits/chosen": -0.1408090591430664, + "logits/rejected": -0.13789355754852295, + "logps/chosen": -452.5251770019531, + "logps/rejected": -511.7213134765625, + "loss": 0.5329, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.006589889526367, + "rewards/margins": 3.1998894214630127, + "rewards/rejected": -8.206480026245117, + "step": 176 + }, + { + "epoch": 0.11010886469673406, + "grad_norm": 16.205223083496094, + "learning_rate": 4.572222222222222e-06, + "logits/chosen": -0.13615313172340393, + "logits/rejected": -0.31196850538253784, + "logps/chosen": -386.6686096191406, + "logps/rejected": -549.196044921875, + "loss": 0.406, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5630232095718384, + "rewards/margins": 6.631255149841309, + "rewards/rejected": -8.194278717041016, + "step": 177 + }, + { + "epoch": 0.11073094867807154, + "grad_norm": 1.236006736755371, + "learning_rate": 4.566666666666667e-06, + "logits/chosen": -0.0907483845949173, + "logits/rejected": -0.20069406926631927, + "logps/chosen": -135.84619140625, + "logps/rejected": -373.27276611328125, + "loss": 0.0277, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.763258457183838, + "rewards/margins": 6.080168724060059, + "rewards/rejected": -7.8434271812438965, + "step": 178 + }, + { + "epoch": 0.11135303265940902, + "grad_norm": 1.5609914064407349, + "learning_rate": 4.561111111111112e-06, + "logits/chosen": -0.31839674711227417, + "logits/rejected": -0.3583574891090393, + "logps/chosen": -332.6357421875, + "logps/rejected": -497.9476013183594, + "loss": 0.051, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9489903450012207, + "rewards/margins": 6.7406744956970215, + "rewards/rejected": -9.689663887023926, + "step": 179 + }, + { + "epoch": 0.1119751166407465, + "grad_norm": 3.4912405014038086, + "learning_rate": 4.555555555555556e-06, + "logits/chosen": -0.14343750476837158, + "logits/rejected": -0.22884529829025269, + "logps/chosen": -356.65740966796875, + "logps/rejected": -522.2003784179688, + "loss": 0.0789, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.181058883666992, + "rewards/margins": 4.046661376953125, + "rewards/rejected": -6.227720737457275, + "step": 180 + }, + { + "epoch": 0.11259720062208398, + "grad_norm": 3.1803946495056152, + "learning_rate": 4.5500000000000005e-06, + "logits/chosen": -0.18044686317443848, + "logits/rejected": -0.2522306740283966, + "logps/chosen": -361.4754333496094, + "logps/rejected": -554.6898193359375, + "loss": 0.0467, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6155502796173096, + "rewards/margins": 4.796407222747803, + "rewards/rejected": -7.411957740783691, + "step": 181 + }, + { + "epoch": 0.11321928460342146, + "grad_norm": 0.745781421661377, + "learning_rate": 4.544444444444445e-06, + "logits/chosen": -0.08614101260900497, + "logits/rejected": -0.2619820833206177, + "logps/chosen": -291.77996826171875, + "logps/rejected": -574.6598510742188, + "loss": 0.0163, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3114356994628906, + "rewards/margins": 9.701647758483887, + "rewards/rejected": -13.013082504272461, + "step": 182 + }, + { + "epoch": 0.11384136858475895, + "grad_norm": 1.4026697874069214, + "learning_rate": 4.538888888888889e-06, + "logits/chosen": -0.2762816250324249, + "logits/rejected": -0.32922181487083435, + "logps/chosen": -292.77655029296875, + "logps/rejected": -462.41973876953125, + "loss": 0.0278, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.820987343788147, + "rewards/margins": 5.916528701782227, + "rewards/rejected": -7.737515449523926, + "step": 183 + }, + { + "epoch": 0.11446345256609643, + "grad_norm": 1.8350580930709839, + "learning_rate": 4.533333333333334e-06, + "logits/chosen": -0.1555342674255371, + "logits/rejected": -0.18492698669433594, + "logps/chosen": -408.6524658203125, + "logps/rejected": -487.68096923828125, + "loss": 0.0243, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8913841247558594, + "rewards/margins": 5.328997611999512, + "rewards/rejected": -9.220381736755371, + "step": 184 + }, + { + "epoch": 0.1150855365474339, + "grad_norm": 10.929362297058105, + "learning_rate": 4.527777777777778e-06, + "logits/chosen": -0.2357548624277115, + "logits/rejected": -0.2508939802646637, + "logps/chosen": -244.80572509765625, + "logps/rejected": -458.1954345703125, + "loss": 0.212, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9569942951202393, + "rewards/margins": 3.9729061126708984, + "rewards/rejected": -5.9298996925354, + "step": 185 + }, + { + "epoch": 0.11570762052877138, + "grad_norm": 0.7941931486129761, + "learning_rate": 4.5222222222222225e-06, + "logits/chosen": -0.0902315080165863, + "logits/rejected": -0.1730450689792633, + "logps/chosen": -485.11041259765625, + "logps/rejected": -656.389892578125, + "loss": 0.0121, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6595654487609863, + "rewards/margins": 6.047621250152588, + "rewards/rejected": -9.707186698913574, + "step": 186 + }, + { + "epoch": 0.11632970451010886, + "grad_norm": 0.6701790690422058, + "learning_rate": 4.516666666666667e-06, + "logits/chosen": -0.22114023566246033, + "logits/rejected": -0.2749618887901306, + "logps/chosen": -337.93408203125, + "logps/rejected": -550.8779907226562, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2866430282592773, + "rewards/margins": 6.801355361938477, + "rewards/rejected": -9.087997436523438, + "step": 187 + }, + { + "epoch": 0.11695178849144634, + "grad_norm": 0.5332739949226379, + "learning_rate": 4.511111111111111e-06, + "logits/chosen": -0.16536462306976318, + "logits/rejected": -0.22592821717262268, + "logps/chosen": -428.5924377441406, + "logps/rejected": -575.3956298828125, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.926290988922119, + "rewards/margins": 6.478388786315918, + "rewards/rejected": -9.404680252075195, + "step": 188 + }, + { + "epoch": 0.11757387247278382, + "grad_norm": 1.152905821800232, + "learning_rate": 4.505555555555556e-06, + "logits/chosen": -0.15196458995342255, + "logits/rejected": -0.22794973850250244, + "logps/chosen": -327.06756591796875, + "logps/rejected": -461.6383056640625, + "loss": 0.0286, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.244565010070801, + "rewards/margins": 7.494634628295898, + "rewards/rejected": -11.7391996383667, + "step": 189 + }, + { + "epoch": 0.1181959564541213, + "grad_norm": 3.16579532623291, + "learning_rate": 4.5e-06, + "logits/chosen": -0.26474907994270325, + "logits/rejected": -0.3169686496257782, + "logps/chosen": -500.40460205078125, + "logps/rejected": -469.98223876953125, + "loss": 0.0722, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9309418201446533, + "rewards/margins": 6.342013359069824, + "rewards/rejected": -9.272954940795898, + "step": 190 + }, + { + "epoch": 0.1188180404354588, + "grad_norm": 5.08317756652832, + "learning_rate": 4.4944444444444445e-06, + "logits/chosen": -0.1611710786819458, + "logits/rejected": -0.2670513093471527, + "logps/chosen": -133.3809814453125, + "logps/rejected": -381.4287109375, + "loss": 0.1187, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3053531646728516, + "rewards/margins": 6.696647644042969, + "rewards/rejected": -9.00200080871582, + "step": 191 + }, + { + "epoch": 0.11944012441679627, + "grad_norm": 10.827777862548828, + "learning_rate": 4.488888888888889e-06, + "logits/chosen": -0.011868398636579514, + "logits/rejected": -0.13873547315597534, + "logps/chosen": -389.3967590332031, + "logps/rejected": -654.178466796875, + "loss": 0.1604, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.0660881996154785, + "rewards/margins": 7.500181198120117, + "rewards/rejected": -12.566267967224121, + "step": 192 + }, + { + "epoch": 0.12006220839813375, + "grad_norm": 1.23869788646698, + "learning_rate": 4.483333333333333e-06, + "logits/chosen": -0.2719615399837494, + "logits/rejected": -0.32208290696144104, + "logps/chosen": -239.8672332763672, + "logps/rejected": -567.1630859375, + "loss": 0.019, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9725968837738037, + "rewards/margins": 8.392380714416504, + "rewards/rejected": -11.36497688293457, + "step": 193 + }, + { + "epoch": 0.12068429237947123, + "grad_norm": 0.941590428352356, + "learning_rate": 4.477777777777778e-06, + "logits/chosen": -0.14240425825119019, + "logits/rejected": -0.1839601695537567, + "logps/chosen": -406.7160949707031, + "logps/rejected": -588.5770263671875, + "loss": 0.0121, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.916929244995117, + "rewards/margins": 6.570363521575928, + "rewards/rejected": -11.487293243408203, + "step": 194 + }, + { + "epoch": 0.12130637636080871, + "grad_norm": 0.6569964289665222, + "learning_rate": 4.472222222222223e-06, + "logits/chosen": -0.09213235974311829, + "logits/rejected": -0.1600230634212494, + "logps/chosen": -303.3025817871094, + "logps/rejected": -495.83251953125, + "loss": 0.0137, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4699807167053223, + "rewards/margins": 6.729430198669434, + "rewards/rejected": -9.199411392211914, + "step": 195 + }, + { + "epoch": 0.12192846034214619, + "grad_norm": 20.63555908203125, + "learning_rate": 4.4666666666666665e-06, + "logits/chosen": -0.18242889642715454, + "logits/rejected": -0.2115517407655716, + "logps/chosen": -495.2789306640625, + "logps/rejected": -584.864990234375, + "loss": 0.3162, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.116421222686768, + "rewards/margins": 4.394755840301514, + "rewards/rejected": -11.511177062988281, + "step": 196 + }, + { + "epoch": 0.12255054432348367, + "grad_norm": 19.603097915649414, + "learning_rate": 4.461111111111112e-06, + "logits/chosen": -0.008498098701238632, + "logits/rejected": -0.13363373279571533, + "logps/chosen": -441.25408935546875, + "logps/rejected": -369.1492614746094, + "loss": 0.2806, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.245054244995117, + "rewards/margins": 5.767759323120117, + "rewards/rejected": -9.012813568115234, + "step": 197 + }, + { + "epoch": 0.12317262830482115, + "grad_norm": 16.505210876464844, + "learning_rate": 4.455555555555555e-06, + "logits/chosen": -0.1613885462284088, + "logits/rejected": -0.1164340004324913, + "logps/chosen": -537.34375, + "logps/rejected": -423.7762145996094, + "loss": 0.4043, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.268223285675049, + "rewards/margins": 4.763760566711426, + "rewards/rejected": -10.031984329223633, + "step": 198 + }, + { + "epoch": 0.12379471228615863, + "grad_norm": 1.9455980062484741, + "learning_rate": 4.450000000000001e-06, + "logits/chosen": -0.16660988330841064, + "logits/rejected": -0.17308039963245392, + "logps/chosen": -418.20330810546875, + "logps/rejected": -475.8372497558594, + "loss": 0.0431, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.952427387237549, + "rewards/margins": 5.502568244934082, + "rewards/rejected": -8.454996109008789, + "step": 199 + }, + { + "epoch": 0.12441679626749612, + "grad_norm": 3.4681448936462402, + "learning_rate": 4.444444444444444e-06, + "logits/chosen": -0.11532752215862274, + "logits/rejected": -0.1828579306602478, + "logps/chosen": -321.120849609375, + "logps/rejected": -441.7661437988281, + "loss": 0.0372, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.226459503173828, + "rewards/margins": 7.537663459777832, + "rewards/rejected": -11.76412296295166, + "step": 200 + }, + { + "epoch": 0.12503888024883358, + "grad_norm": 13.86237621307373, + "learning_rate": 4.438888888888889e-06, + "logits/chosen": -0.15887734293937683, + "logits/rejected": -0.3088030517101288, + "logps/chosen": -342.96551513671875, + "logps/rejected": -534.016357421875, + "loss": 0.2529, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.4156570434570312, + "rewards/margins": 8.62362289428711, + "rewards/rejected": -12.03927993774414, + "step": 201 + }, + { + "epoch": 0.12566096423017106, + "grad_norm": 0.5979342460632324, + "learning_rate": 4.433333333333334e-06, + "logits/chosen": -0.11531206965446472, + "logits/rejected": -0.27601784467697144, + "logps/chosen": -264.59954833984375, + "logps/rejected": -562.3781127929688, + "loss": 0.0081, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.322659015655518, + "rewards/margins": 9.029813766479492, + "rewards/rejected": -13.352472305297852, + "step": 202 + }, + { + "epoch": 0.12628304821150854, + "grad_norm": 10.158177375793457, + "learning_rate": 4.427777777777778e-06, + "logits/chosen": -0.20076315104961395, + "logits/rejected": -0.21195828914642334, + "logps/chosen": -556.269775390625, + "logps/rejected": -567.4002685546875, + "loss": 0.1871, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.389566421508789, + "rewards/margins": 5.999963760375977, + "rewards/rejected": -12.38952922821045, + "step": 203 + }, + { + "epoch": 0.12690513219284602, + "grad_norm": 0.22421550750732422, + "learning_rate": 4.422222222222223e-06, + "logits/chosen": -0.043894290924072266, + "logits/rejected": -0.24361872673034668, + "logps/chosen": -285.2899169921875, + "logps/rejected": -739.2362060546875, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.130539894104004, + "rewards/margins": 10.945337295532227, + "rewards/rejected": -15.075878143310547, + "step": 204 + }, + { + "epoch": 0.12752721617418353, + "grad_norm": 0.27712979912757874, + "learning_rate": 4.416666666666667e-06, + "logits/chosen": -0.1653560847043991, + "logits/rejected": -0.2496764063835144, + "logps/chosen": -267.2332763671875, + "logps/rejected": -415.05181884765625, + "loss": 0.0085, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.429808139801025, + "rewards/margins": 6.984692096710205, + "rewards/rejected": -11.41450023651123, + "step": 205 + }, + { + "epoch": 0.128149300155521, + "grad_norm": 4.2997918128967285, + "learning_rate": 4.411111111111111e-06, + "logits/chosen": -0.11084900051355362, + "logits/rejected": -0.14535486698150635, + "logps/chosen": -405.7935791015625, + "logps/rejected": -494.04302978515625, + "loss": 0.1227, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9838545322418213, + "rewards/margins": 6.347996234893799, + "rewards/rejected": -10.331850051879883, + "step": 206 + }, + { + "epoch": 0.12877138413685849, + "grad_norm": 4.468008518218994, + "learning_rate": 4.405555555555556e-06, + "logits/chosen": -0.20454280078411102, + "logits/rejected": -0.30304861068725586, + "logps/chosen": -386.2103271484375, + "logps/rejected": -584.7804565429688, + "loss": 0.0808, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.260396957397461, + "rewards/margins": 7.33172082901001, + "rewards/rejected": -13.592119216918945, + "step": 207 + }, + { + "epoch": 0.12939346811819596, + "grad_norm": 0.4366110861301422, + "learning_rate": 4.4e-06, + "logits/chosen": -0.10170028358697891, + "logits/rejected": -0.19526614248752594, + "logps/chosen": -298.06158447265625, + "logps/rejected": -408.65447998046875, + "loss": 0.0075, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.082025051116943, + "rewards/margins": 6.9650468826293945, + "rewards/rejected": -11.04707145690918, + "step": 208 + }, + { + "epoch": 0.13001555209953344, + "grad_norm": 3.2018239498138428, + "learning_rate": 4.3944444444444455e-06, + "logits/chosen": -0.15918540954589844, + "logits/rejected": -0.23415011167526245, + "logps/chosen": -381.16021728515625, + "logps/rejected": -571.43798828125, + "loss": 0.038, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.87255859375, + "rewards/margins": 9.649118423461914, + "rewards/rejected": -14.521677017211914, + "step": 209 + }, + { + "epoch": 0.13063763608087092, + "grad_norm": 3.6075499057769775, + "learning_rate": 4.388888888888889e-06, + "logits/chosen": -0.10588833689689636, + "logits/rejected": -0.14603900909423828, + "logps/chosen": -490.2928771972656, + "logps/rejected": -598.0196533203125, + "loss": 0.0595, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.224918842315674, + "rewards/margins": 7.642451763153076, + "rewards/rejected": -13.86737060546875, + "step": 210 + }, + { + "epoch": 0.1312597200622084, + "grad_norm": 1.7185460329055786, + "learning_rate": 4.383333333333334e-06, + "logits/chosen": -0.06590424478054047, + "logits/rejected": -0.18214921653270721, + "logps/chosen": -432.0932312011719, + "logps/rejected": -633.0640869140625, + "loss": 0.0136, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.0978474617004395, + "rewards/margins": 9.383085250854492, + "rewards/rejected": -14.48093318939209, + "step": 211 + }, + { + "epoch": 0.13188180404354588, + "grad_norm": 17.27645492553711, + "learning_rate": 4.377777777777778e-06, + "logits/chosen": -0.21469348669052124, + "logits/rejected": -0.16578437387943268, + "logps/chosen": -628.2538452148438, + "logps/rejected": -648.5946044921875, + "loss": 0.2823, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.826486587524414, + "rewards/margins": 5.544658660888672, + "rewards/rejected": -11.371145248413086, + "step": 212 + }, + { + "epoch": 0.13250388802488336, + "grad_norm": 23.630199432373047, + "learning_rate": 4.372222222222223e-06, + "logits/chosen": -0.1707504689693451, + "logits/rejected": -0.12941914796829224, + "logps/chosen": -462.0667724609375, + "logps/rejected": -436.2506408691406, + "loss": 1.2196, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.260027885437012, + "rewards/margins": 4.628278732299805, + "rewards/rejected": -9.8883056640625, + "step": 213 + }, + { + "epoch": 0.13312597200622084, + "grad_norm": 12.533119201660156, + "learning_rate": 4.366666666666667e-06, + "logits/chosen": -0.20734256505966187, + "logits/rejected": -0.3216004967689514, + "logps/chosen": -583.6842041015625, + "logps/rejected": -758.4173583984375, + "loss": 0.3955, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.938846588134766, + "rewards/margins": 5.942337512969971, + "rewards/rejected": -10.881184577941895, + "step": 214 + }, + { + "epoch": 0.13374805598755832, + "grad_norm": 1.4944169521331787, + "learning_rate": 4.361111111111112e-06, + "logits/chosen": -0.06910410523414612, + "logits/rejected": -0.136884406208992, + "logps/chosen": -378.0068359375, + "logps/rejected": -607.4515380859375, + "loss": 0.0178, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.588863849639893, + "rewards/margins": 9.04670524597168, + "rewards/rejected": -13.635570526123047, + "step": 215 + }, + { + "epoch": 0.1343701399688958, + "grad_norm": 9.419214248657227, + "learning_rate": 4.3555555555555555e-06, + "logits/chosen": -0.06776685267686844, + "logits/rejected": -0.11489962041378021, + "logps/chosen": -326.374755859375, + "logps/rejected": -470.3567199707031, + "loss": 0.5287, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.9228034019470215, + "rewards/margins": 5.981575012207031, + "rewards/rejected": -10.904377937316895, + "step": 216 + }, + { + "epoch": 0.13499222395023328, + "grad_norm": 17.357324600219727, + "learning_rate": 4.350000000000001e-06, + "logits/chosen": -0.1938476860523224, + "logits/rejected": -0.22026970982551575, + "logps/chosen": -313.758544921875, + "logps/rejected": -489.6036071777344, + "loss": 0.4197, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.040271759033203, + "rewards/margins": 6.4016194343566895, + "rewards/rejected": -10.441890716552734, + "step": 217 + }, + { + "epoch": 0.13561430793157075, + "grad_norm": 4.745786190032959, + "learning_rate": 4.344444444444445e-06, + "logits/chosen": -0.13248351216316223, + "logits/rejected": -0.21983805298805237, + "logps/chosen": -227.53631591796875, + "logps/rejected": -317.0226135253906, + "loss": 0.0479, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.679323673248291, + "rewards/margins": 5.52722692489624, + "rewards/rejected": -9.206550598144531, + "step": 218 + }, + { + "epoch": 0.13623639191290823, + "grad_norm": 5.268618106842041, + "learning_rate": 4.3388888888888895e-06, + "logits/chosen": -0.07349290698766708, + "logits/rejected": -0.13672694563865662, + "logps/chosen": -371.81488037109375, + "logps/rejected": -563.0823364257812, + "loss": 0.0546, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.064291477203369, + "rewards/margins": 8.103504180908203, + "rewards/rejected": -12.167794227600098, + "step": 219 + }, + { + "epoch": 0.1368584758942457, + "grad_norm": 0.2621453106403351, + "learning_rate": 4.333333333333334e-06, + "logits/chosen": -0.18309864401817322, + "logits/rejected": -0.24785983562469482, + "logps/chosen": -400.9888916015625, + "logps/rejected": -537.8822631835938, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.328176975250244, + "rewards/margins": 8.765192031860352, + "rewards/rejected": -13.093368530273438, + "step": 220 + }, + { + "epoch": 0.1374805598755832, + "grad_norm": 0.2544398307800293, + "learning_rate": 4.327777777777778e-06, + "logits/chosen": -0.14675526320934296, + "logits/rejected": -0.2419712394475937, + "logps/chosen": -423.5704345703125, + "logps/rejected": -601.5996704101562, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8980045318603516, + "rewards/margins": 7.97990608215332, + "rewards/rejected": -10.877910614013672, + "step": 221 + }, + { + "epoch": 0.13810264385692067, + "grad_norm": 0.011788148432970047, + "learning_rate": 4.322222222222223e-06, + "logits/chosen": -0.11852366477251053, + "logits/rejected": -0.25337889790534973, + "logps/chosen": -256.1315612792969, + "logps/rejected": -619.33203125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4092636108398438, + "rewards/margins": 11.004941940307617, + "rewards/rejected": -14.414204597473145, + "step": 222 + }, + { + "epoch": 0.13872472783825818, + "grad_norm": 1.7974365949630737, + "learning_rate": 4.316666666666667e-06, + "logits/chosen": -0.14163324236869812, + "logits/rejected": -0.2334975302219391, + "logps/chosen": -249.05418395996094, + "logps/rejected": -375.9051208496094, + "loss": 0.0412, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.949312210083008, + "rewards/margins": 5.896815299987793, + "rewards/rejected": -8.8461275100708, + "step": 223 + }, + { + "epoch": 0.13934681181959566, + "grad_norm": 0.09195344895124435, + "learning_rate": 4.3111111111111115e-06, + "logits/chosen": 0.06134074926376343, + "logits/rejected": -0.15917283296585083, + "logps/chosen": -155.31539916992188, + "logps/rejected": -488.4548034667969, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.338865041732788, + "rewards/margins": 9.733442306518555, + "rewards/rejected": -12.072306632995605, + "step": 224 + }, + { + "epoch": 0.13996889580093314, + "grad_norm": 5.062751293182373, + "learning_rate": 4.305555555555556e-06, + "logits/chosen": -0.11247175931930542, + "logits/rejected": -0.27850502729415894, + "logps/chosen": -205.1552734375, + "logps/rejected": -624.4266967773438, + "loss": 0.0697, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9858102798461914, + "rewards/margins": 7.83154296875, + "rewards/rejected": -9.817352294921875, + "step": 225 + }, + { + "epoch": 0.14059097978227061, + "grad_norm": 7.39639139175415, + "learning_rate": 4.3e-06, + "logits/chosen": -0.15556737780570984, + "logits/rejected": -0.23724953830242157, + "logps/chosen": -406.7066650390625, + "logps/rejected": -505.3165588378906, + "loss": 0.1641, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.420718669891357, + "rewards/margins": 4.510132789611816, + "rewards/rejected": -8.930850982666016, + "step": 226 + }, + { + "epoch": 0.1412130637636081, + "grad_norm": 0.46206241846084595, + "learning_rate": 4.294444444444445e-06, + "logits/chosen": -0.08338303118944168, + "logits/rejected": -0.16354772448539734, + "logps/chosen": -436.1719970703125, + "logps/rejected": -499.0705871582031, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.768359899520874, + "rewards/margins": 7.0917863845825195, + "rewards/rejected": -9.860145568847656, + "step": 227 + }, + { + "epoch": 0.14183514774494557, + "grad_norm": 2.918172597885132, + "learning_rate": 4.288888888888889e-06, + "logits/chosen": -0.16940590739250183, + "logits/rejected": -0.23630109429359436, + "logps/chosen": -144.7860107421875, + "logps/rejected": -300.5555114746094, + "loss": 0.0408, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3163816928863525, + "rewards/margins": 5.495935440063477, + "rewards/rejected": -7.81231689453125, + "step": 228 + }, + { + "epoch": 0.14245723172628305, + "grad_norm": 1.1104446649551392, + "learning_rate": 4.2833333333333335e-06, + "logits/chosen": -0.18083666265010834, + "logits/rejected": -0.2724631130695343, + "logps/chosen": -261.4461669921875, + "logps/rejected": -533.562255859375, + "loss": 0.0195, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8087317943573, + "rewards/margins": 6.4176836013793945, + "rewards/rejected": -9.226415634155273, + "step": 229 + }, + { + "epoch": 0.14307931570762053, + "grad_norm": 0.07633739709854126, + "learning_rate": 4.277777777777778e-06, + "logits/chosen": -0.17306989431381226, + "logits/rejected": -0.27053773403167725, + "logps/chosen": -288.59857177734375, + "logps/rejected": -499.30694580078125, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.035839080810547, + "rewards/margins": 8.96580696105957, + "rewards/rejected": -13.001646041870117, + "step": 230 + }, + { + "epoch": 0.143701399688958, + "grad_norm": 13.571040153503418, + "learning_rate": 4.272222222222222e-06, + "logits/chosen": -0.05582804977893829, + "logits/rejected": -0.13622458279132843, + "logps/chosen": -362.85357666015625, + "logps/rejected": -526.607666015625, + "loss": 0.3648, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.291714906692505, + "rewards/margins": 5.022339344024658, + "rewards/rejected": -8.314054489135742, + "step": 231 + }, + { + "epoch": 0.1443234836702955, + "grad_norm": 0.9441532492637634, + "learning_rate": 4.266666666666668e-06, + "logits/chosen": -0.03444555774331093, + "logits/rejected": -0.12729418277740479, + "logps/chosen": -305.111083984375, + "logps/rejected": -501.290771484375, + "loss": 0.0124, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.193662643432617, + "rewards/margins": 9.07753849029541, + "rewards/rejected": -12.271202087402344, + "step": 232 + }, + { + "epoch": 0.14494556765163297, + "grad_norm": 0.24280984699726105, + "learning_rate": 4.261111111111111e-06, + "logits/chosen": -0.23749622702598572, + "logits/rejected": -0.287462055683136, + "logps/chosen": -172.53701782226562, + "logps/rejected": -379.12420654296875, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.563803195953369, + "rewards/margins": 6.115121364593506, + "rewards/rejected": -8.678924560546875, + "step": 233 + }, + { + "epoch": 0.14556765163297045, + "grad_norm": 6.430953502655029, + "learning_rate": 4.255555555555556e-06, + "logits/chosen": -0.08341242372989655, + "logits/rejected": -0.14627495408058167, + "logps/chosen": -407.6485595703125, + "logps/rejected": -448.77288818359375, + "loss": 0.1049, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.316323757171631, + "rewards/margins": 5.533578872680664, + "rewards/rejected": -11.849903106689453, + "step": 234 + }, + { + "epoch": 0.14618973561430793, + "grad_norm": 2.4797134399414062, + "learning_rate": 4.25e-06, + "logits/chosen": -0.11957529187202454, + "logits/rejected": -0.1361967772245407, + "logps/chosen": -505.4205322265625, + "logps/rejected": -488.9862060546875, + "loss": 0.0467, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.080632209777832, + "rewards/margins": 5.105950355529785, + "rewards/rejected": -10.186582565307617, + "step": 235 + }, + { + "epoch": 0.1468118195956454, + "grad_norm": 2.2615177631378174, + "learning_rate": 4.244444444444445e-06, + "logits/chosen": -0.08932839334011078, + "logits/rejected": -0.15244892239570618, + "logps/chosen": -266.90863037109375, + "logps/rejected": -390.48162841796875, + "loss": 0.1321, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.712393045425415, + "rewards/margins": 5.196257591247559, + "rewards/rejected": -7.908651351928711, + "step": 236 + }, + { + "epoch": 0.14743390357698288, + "grad_norm": 3.334500551223755, + "learning_rate": 4.238888888888889e-06, + "logits/chosen": -0.18235519528388977, + "logits/rejected": -0.16703030467033386, + "logps/chosen": -330.26239013671875, + "logps/rejected": -664.0196533203125, + "loss": 0.0555, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.689000129699707, + "rewards/margins": 7.319486618041992, + "rewards/rejected": -11.0084867477417, + "step": 237 + }, + { + "epoch": 0.14805598755832036, + "grad_norm": 17.431671142578125, + "learning_rate": 4.233333333333334e-06, + "logits/chosen": -0.12822696566581726, + "logits/rejected": -0.21281002461910248, + "logps/chosen": -306.01788330078125, + "logps/rejected": -494.70306396484375, + "loss": 0.4511, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.080674171447754, + "rewards/margins": 4.5465312004089355, + "rewards/rejected": -9.627204895019531, + "step": 238 + }, + { + "epoch": 0.14867807153965784, + "grad_norm": 0.25756600499153137, + "learning_rate": 4.227777777777778e-06, + "logits/chosen": -0.07383677363395691, + "logits/rejected": -0.21189387142658234, + "logps/chosen": -536.4447021484375, + "logps/rejected": -764.3046875, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.077761650085449, + "rewards/margins": 10.190600395202637, + "rewards/rejected": -15.268362045288086, + "step": 239 + }, + { + "epoch": 0.14930015552099535, + "grad_norm": 0.5319288969039917, + "learning_rate": 4.222222222222223e-06, + "logits/chosen": -0.0626106932759285, + "logits/rejected": -0.03897733613848686, + "logps/chosen": -307.3756103515625, + "logps/rejected": -492.5594482421875, + "loss": 0.0074, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1444954872131348, + "rewards/margins": 9.620294570922852, + "rewards/rejected": -12.764789581298828, + "step": 240 + }, + { + "epoch": 0.14992223950233283, + "grad_norm": 0.3779299557209015, + "learning_rate": 4.216666666666667e-06, + "logits/chosen": -0.0897144079208374, + "logits/rejected": -0.11330302059650421, + "logps/chosen": -281.55780029296875, + "logps/rejected": -427.8779602050781, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7874464988708496, + "rewards/margins": 7.7265305519104, + "rewards/rejected": -11.51397705078125, + "step": 241 + }, + { + "epoch": 0.1505443234836703, + "grad_norm": 9.642871856689453, + "learning_rate": 4.211111111111112e-06, + "logits/chosen": -0.16774220764636993, + "logits/rejected": -0.18776768445968628, + "logps/chosen": -418.89398193359375, + "logps/rejected": -454.43023681640625, + "loss": 0.2664, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.804168701171875, + "rewards/margins": 5.928190231323242, + "rewards/rejected": -11.732358932495117, + "step": 242 + }, + { + "epoch": 0.15116640746500778, + "grad_norm": 0.22446109354496002, + "learning_rate": 4.205555555555556e-06, + "logits/chosen": -0.019907476380467415, + "logits/rejected": -0.11574709415435791, + "logps/chosen": -419.2196044921875, + "logps/rejected": -622.3250122070312, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.408943176269531, + "rewards/margins": 9.901863098144531, + "rewards/rejected": -14.310806274414062, + "step": 243 + }, + { + "epoch": 0.15178849144634526, + "grad_norm": 3.6235687732696533, + "learning_rate": 4.2000000000000004e-06, + "logits/chosen": -0.14729242026805878, + "logits/rejected": -0.17805354297161102, + "logps/chosen": -505.3883361816406, + "logps/rejected": -541.7753295898438, + "loss": 0.0425, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.5113396644592285, + "rewards/margins": 6.403304100036621, + "rewards/rejected": -11.914644241333008, + "step": 244 + }, + { + "epoch": 0.15241057542768274, + "grad_norm": 0.8029597401618958, + "learning_rate": 4.194444444444445e-06, + "logits/chosen": -0.152954563498497, + "logits/rejected": -0.21796384453773499, + "logps/chosen": -487.0811767578125, + "logps/rejected": -655.6520385742188, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.250409126281738, + "rewards/margins": 8.872339248657227, + "rewards/rejected": -13.122748374938965, + "step": 245 + }, + { + "epoch": 0.15303265940902022, + "grad_norm": 9.96660041809082, + "learning_rate": 4.188888888888889e-06, + "logits/chosen": -0.1279718279838562, + "logits/rejected": -0.14475785195827484, + "logps/chosen": -293.0921325683594, + "logps/rejected": -399.8092956542969, + "loss": 0.2014, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.630045175552368, + "rewards/margins": 6.511990070343018, + "rewards/rejected": -10.142035484313965, + "step": 246 + }, + { + "epoch": 0.1536547433903577, + "grad_norm": 6.769599914550781, + "learning_rate": 4.183333333333334e-06, + "logits/chosen": -0.19370505213737488, + "logits/rejected": -0.29524803161621094, + "logps/chosen": -555.8383178710938, + "logps/rejected": -691.7930908203125, + "loss": 0.082, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.009703636169434, + "rewards/margins": 7.4122467041015625, + "rewards/rejected": -12.421951293945312, + "step": 247 + }, + { + "epoch": 0.15427682737169518, + "grad_norm": 3.107793092727661, + "learning_rate": 4.177777777777778e-06, + "logits/chosen": -0.14492307603359222, + "logits/rejected": -0.19511684775352478, + "logps/chosen": -267.9463806152344, + "logps/rejected": -425.929443359375, + "loss": 0.0363, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.50352668762207, + "rewards/margins": 5.826737403869629, + "rewards/rejected": -10.330265045166016, + "step": 248 + }, + { + "epoch": 0.15489891135303266, + "grad_norm": 0.06454760581254959, + "learning_rate": 4.1722222222222225e-06, + "logits/chosen": -0.1815810650587082, + "logits/rejected": -0.2278299331665039, + "logps/chosen": -332.06060791015625, + "logps/rejected": -562.3685302734375, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5593018531799316, + "rewards/margins": 9.538451194763184, + "rewards/rejected": -12.097752571105957, + "step": 249 + }, + { + "epoch": 0.15552099533437014, + "grad_norm": 1.1465234756469727, + "learning_rate": 4.166666666666667e-06, + "logits/chosen": -0.2536413371562958, + "logits/rejected": -0.29239875078201294, + "logps/chosen": -306.61029052734375, + "logps/rejected": -431.6826171875, + "loss": 0.0137, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.367475509643555, + "rewards/margins": 6.349707126617432, + "rewards/rejected": -10.717182159423828, + "step": 250 + }, + { + "epoch": 0.15614307931570762, + "grad_norm": 3.0188095569610596, + "learning_rate": 4.161111111111111e-06, + "logits/chosen": -0.06983910501003265, + "logits/rejected": -0.16658973693847656, + "logps/chosen": -354.5345764160156, + "logps/rejected": -477.9388427734375, + "loss": 0.0416, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.956805229187012, + "rewards/margins": 7.074861526489258, + "rewards/rejected": -12.03166675567627, + "step": 251 + }, + { + "epoch": 0.1567651632970451, + "grad_norm": 2.6784274578094482, + "learning_rate": 4.155555555555556e-06, + "logits/chosen": -0.11070965230464935, + "logits/rejected": -0.1866130828857422, + "logps/chosen": -421.6877746582031, + "logps/rejected": -506.14483642578125, + "loss": 0.023, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9549331665039062, + "rewards/margins": 7.99067497253418, + "rewards/rejected": -11.945609092712402, + "step": 252 + }, + { + "epoch": 0.15738724727838257, + "grad_norm": 0.18249952793121338, + "learning_rate": 4.15e-06, + "logits/chosen": -0.22737552225589752, + "logits/rejected": -0.25040513277053833, + "logps/chosen": -401.2856750488281, + "logps/rejected": -599.4119873046875, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1789166927337646, + "rewards/margins": 7.088460922241211, + "rewards/rejected": -10.267376899719238, + "step": 253 + }, + { + "epoch": 0.15800933125972005, + "grad_norm": 0.8384153842926025, + "learning_rate": 4.1444444444444445e-06, + "logits/chosen": -0.10687308013439178, + "logits/rejected": -0.19004058837890625, + "logps/chosen": -296.13189697265625, + "logps/rejected": -538.2275390625, + "loss": 0.0087, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5475757122039795, + "rewards/margins": 8.444770812988281, + "rewards/rejected": -11.992345809936523, + "step": 254 + }, + { + "epoch": 0.15863141524105753, + "grad_norm": 0.09486782550811768, + "learning_rate": 4.138888888888889e-06, + "logits/chosen": -0.11108750104904175, + "logits/rejected": -0.20321372151374817, + "logps/chosen": -154.17352294921875, + "logps/rejected": -358.58880615234375, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.683732271194458, + "rewards/margins": 8.109987258911133, + "rewards/rejected": -10.793719291687012, + "step": 255 + }, + { + "epoch": 0.159253499222395, + "grad_norm": 1.9149295091629028, + "learning_rate": 4.133333333333333e-06, + "logits/chosen": -0.1632416844367981, + "logits/rejected": -0.2547285258769989, + "logps/chosen": -448.1278991699219, + "logps/rejected": -564.8543701171875, + "loss": 0.0222, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9748075008392334, + "rewards/margins": 6.881025791168213, + "rewards/rejected": -9.855834007263184, + "step": 256 + }, + { + "epoch": 0.1598755832037325, + "grad_norm": 7.694660186767578, + "learning_rate": 4.1277777777777785e-06, + "logits/chosen": -0.04392428323626518, + "logits/rejected": -0.22827476263046265, + "logps/chosen": -426.02294921875, + "logps/rejected": -679.2426147460938, + "loss": 0.149, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.313868522644043, + "rewards/margins": 9.447851181030273, + "rewards/rejected": -14.76171875, + "step": 257 + }, + { + "epoch": 0.16049766718507, + "grad_norm": 3.177924156188965, + "learning_rate": 4.122222222222222e-06, + "logits/chosen": -0.13181807100772858, + "logits/rejected": -0.1860564649105072, + "logps/chosen": -363.2125549316406, + "logps/rejected": -469.60186767578125, + "loss": 0.0457, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8252716064453125, + "rewards/margins": 5.726541519165039, + "rewards/rejected": -9.551813125610352, + "step": 258 + }, + { + "epoch": 0.16111975116640748, + "grad_norm": 0.1548858880996704, + "learning_rate": 4.116666666666667e-06, + "logits/chosen": -0.1819688230752945, + "logits/rejected": -0.2746058702468872, + "logps/chosen": -249.11001586914062, + "logps/rejected": -457.58782958984375, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1980443000793457, + "rewards/margins": 7.433623313903809, + "rewards/rejected": -10.631668090820312, + "step": 259 + }, + { + "epoch": 0.16174183514774496, + "grad_norm": 10.871868133544922, + "learning_rate": 4.111111111111111e-06, + "logits/chosen": -0.05994994193315506, + "logits/rejected": -0.09650249034166336, + "logps/chosen": -435.11273193359375, + "logps/rejected": -474.5282287597656, + "loss": 0.1433, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.72778844833374, + "rewards/margins": 5.070606231689453, + "rewards/rejected": -9.798395156860352, + "step": 260 + }, + { + "epoch": 0.16236391912908243, + "grad_norm": 2.8026349544525146, + "learning_rate": 4.105555555555556e-06, + "logits/chosen": -0.13780786097049713, + "logits/rejected": -0.22941848635673523, + "logps/chosen": -307.8487243652344, + "logps/rejected": -455.23193359375, + "loss": 0.04, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.847287178039551, + "rewards/margins": 8.286890029907227, + "rewards/rejected": -13.134177207946777, + "step": 261 + }, + { + "epoch": 0.1629860031104199, + "grad_norm": 0.5517165660858154, + "learning_rate": 4.1e-06, + "logits/chosen": -0.07088734209537506, + "logits/rejected": -0.16676515340805054, + "logps/chosen": -325.2935485839844, + "logps/rejected": -652.1651000976562, + "loss": 0.0091, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.023070335388184, + "rewards/margins": 8.473140716552734, + "rewards/rejected": -12.496212005615234, + "step": 262 + }, + { + "epoch": 0.1636080870917574, + "grad_norm": 1.954545021057129, + "learning_rate": 4.094444444444445e-06, + "logits/chosen": -0.14273375272750854, + "logits/rejected": -0.17627984285354614, + "logps/chosen": -451.7691955566406, + "logps/rejected": -469.8240051269531, + "loss": 0.0302, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3768599033355713, + "rewards/margins": 8.622417449951172, + "rewards/rejected": -11.999277114868164, + "step": 263 + }, + { + "epoch": 0.16423017107309487, + "grad_norm": 7.518491268157959, + "learning_rate": 4.088888888888889e-06, + "logits/chosen": -0.14029516279697418, + "logits/rejected": -0.16743351519107819, + "logps/chosen": -318.8870849609375, + "logps/rejected": -336.1113586425781, + "loss": 0.2156, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.50374174118042, + "rewards/margins": 4.521431922912598, + "rewards/rejected": -8.02517318725586, + "step": 264 + }, + { + "epoch": 0.16485225505443235, + "grad_norm": 0.46220335364341736, + "learning_rate": 4.083333333333334e-06, + "logits/chosen": -0.08248893171548843, + "logits/rejected": -0.20688259601593018, + "logps/chosen": -187.35861206054688, + "logps/rejected": -485.96710205078125, + "loss": 0.0091, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7989754676818848, + "rewards/margins": 7.487030029296875, + "rewards/rejected": -11.286005020141602, + "step": 265 + }, + { + "epoch": 0.16547433903576983, + "grad_norm": 10.237696647644043, + "learning_rate": 4.077777777777778e-06, + "logits/chosen": -0.01933600753545761, + "logits/rejected": -0.1270199865102768, + "logps/chosen": -280.6358947753906, + "logps/rejected": -652.968505859375, + "loss": 0.1799, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.9148459434509277, + "rewards/margins": 8.451780319213867, + "rewards/rejected": -11.366626739501953, + "step": 266 + }, + { + "epoch": 0.1660964230171073, + "grad_norm": 12.523069381713867, + "learning_rate": 4.0722222222222226e-06, + "logits/chosen": -0.11510778963565826, + "logits/rejected": -0.2414967566728592, + "logps/chosen": -399.7659912109375, + "logps/rejected": -519.09521484375, + "loss": 0.1569, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.621103286743164, + "rewards/margins": 6.574969291687012, + "rewards/rejected": -10.196073532104492, + "step": 267 + }, + { + "epoch": 0.1667185069984448, + "grad_norm": 16.624155044555664, + "learning_rate": 4.066666666666667e-06, + "logits/chosen": -0.09144223481416702, + "logits/rejected": -0.22611790895462036, + "logps/chosen": -491.80328369140625, + "logps/rejected": -705.1207885742188, + "loss": 0.2883, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.44871711730957, + "rewards/margins": 7.192960739135742, + "rewards/rejected": -12.641677856445312, + "step": 268 + }, + { + "epoch": 0.16734059097978227, + "grad_norm": 2.0742456912994385, + "learning_rate": 4.061111111111111e-06, + "logits/chosen": -0.15835008025169373, + "logits/rejected": -0.2130199670791626, + "logps/chosen": -455.2060546875, + "logps/rejected": -618.0324096679688, + "loss": 0.0306, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0951638221740723, + "rewards/margins": 7.245157241821289, + "rewards/rejected": -10.340319633483887, + "step": 269 + }, + { + "epoch": 0.16796267496111975, + "grad_norm": 12.889222145080566, + "learning_rate": 4.055555555555556e-06, + "logits/chosen": -0.26214489340782166, + "logits/rejected": -0.2487567961215973, + "logps/chosen": -343.06170654296875, + "logps/rejected": -463.5102844238281, + "loss": 0.0823, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6654186248779297, + "rewards/margins": 6.428375244140625, + "rewards/rejected": -10.093793869018555, + "step": 270 + }, + { + "epoch": 0.16858475894245722, + "grad_norm": 7.523189544677734, + "learning_rate": 4.05e-06, + "logits/chosen": -0.18480314314365387, + "logits/rejected": -0.24796177446842194, + "logps/chosen": -195.75807189941406, + "logps/rejected": -338.8868408203125, + "loss": 0.1548, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.6672298908233643, + "rewards/margins": 3.876713275909424, + "rewards/rejected": -7.543943405151367, + "step": 271 + }, + { + "epoch": 0.1692068429237947, + "grad_norm": 6.409623622894287, + "learning_rate": 4.044444444444445e-06, + "logits/chosen": -0.2590438425540924, + "logits/rejected": -0.30794912576675415, + "logps/chosen": -403.4942932128906, + "logps/rejected": -579.6800537109375, + "loss": 0.0709, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6587371826171875, + "rewards/margins": 7.888822555541992, + "rewards/rejected": -10.54755973815918, + "step": 272 + }, + { + "epoch": 0.16982892690513218, + "grad_norm": 2.4975357055664062, + "learning_rate": 4.038888888888889e-06, + "logits/chosen": -0.2463613599538803, + "logits/rejected": -0.32670921087265015, + "logps/chosen": -315.53076171875, + "logps/rejected": -462.01177978515625, + "loss": 0.083, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9087226390838623, + "rewards/margins": 7.140023708343506, + "rewards/rejected": -10.048746109008789, + "step": 273 + }, + { + "epoch": 0.17045101088646966, + "grad_norm": 2.386859893798828, + "learning_rate": 4.033333333333333e-06, + "logits/chosen": -0.0945328027009964, + "logits/rejected": -0.22304189205169678, + "logps/chosen": -238.69268798828125, + "logps/rejected": -424.4090576171875, + "loss": 0.0258, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7995095252990723, + "rewards/margins": 7.386193752288818, + "rewards/rejected": -11.18570327758789, + "step": 274 + }, + { + "epoch": 0.17107309486780714, + "grad_norm": 5.244316577911377, + "learning_rate": 4.027777777777779e-06, + "logits/chosen": -0.2192511260509491, + "logits/rejected": -0.30824869871139526, + "logps/chosen": -335.2362060546875, + "logps/rejected": -557.1527709960938, + "loss": 0.0771, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1676559448242188, + "rewards/margins": 8.024835586547852, + "rewards/rejected": -11.19249153137207, + "step": 275 + }, + { + "epoch": 0.17169517884914465, + "grad_norm": 12.626290321350098, + "learning_rate": 4.022222222222222e-06, + "logits/chosen": -0.01322026178240776, + "logits/rejected": -0.13288024067878723, + "logps/chosen": -428.9491271972656, + "logps/rejected": -688.73388671875, + "loss": 0.2838, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.1303911209106445, + "rewards/margins": 8.809720039367676, + "rewards/rejected": -12.94011116027832, + "step": 276 + }, + { + "epoch": 0.17231726283048213, + "grad_norm": 0.3722705841064453, + "learning_rate": 4.0166666666666675e-06, + "logits/chosen": -0.22373977303504944, + "logits/rejected": -0.27378931641578674, + "logps/chosen": -580.2608642578125, + "logps/rejected": -742.3297729492188, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8537003993988037, + "rewards/margins": 7.573878288269043, + "rewards/rejected": -11.42757797241211, + "step": 277 + }, + { + "epoch": 0.1729393468118196, + "grad_norm": 0.04521845653653145, + "learning_rate": 4.011111111111111e-06, + "logits/chosen": -0.14506910741329193, + "logits/rejected": -0.19569942355155945, + "logps/chosen": -293.034912109375, + "logps/rejected": -516.384033203125, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.325558662414551, + "rewards/margins": 9.505696296691895, + "rewards/rejected": -12.831254005432129, + "step": 278 + }, + { + "epoch": 0.17356143079315708, + "grad_norm": 0.049368202686309814, + "learning_rate": 4.005555555555556e-06, + "logits/chosen": -0.06917404383420944, + "logits/rejected": -0.1670764833688736, + "logps/chosen": -180.20330810546875, + "logps/rejected": -357.2475891113281, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7713007926940918, + "rewards/margins": 9.181476593017578, + "rewards/rejected": -10.952775955200195, + "step": 279 + }, + { + "epoch": 0.17418351477449456, + "grad_norm": 16.95563316345215, + "learning_rate": 4.000000000000001e-06, + "logits/chosen": -0.12866783142089844, + "logits/rejected": -0.21131478250026703, + "logps/chosen": -345.57916259765625, + "logps/rejected": -484.8846435546875, + "loss": 0.6505, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.722349643707275, + "rewards/margins": 5.72164249420166, + "rewards/rejected": -10.443992614746094, + "step": 280 + }, + { + "epoch": 0.17480559875583204, + "grad_norm": 7.983461380004883, + "learning_rate": 3.994444444444445e-06, + "logits/chosen": -0.11579206585884094, + "logits/rejected": -0.19854412972927094, + "logps/chosen": -501.802978515625, + "logps/rejected": -590.4097900390625, + "loss": 0.0638, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.395183086395264, + "rewards/margins": 7.290475368499756, + "rewards/rejected": -11.68565845489502, + "step": 281 + }, + { + "epoch": 0.17542768273716952, + "grad_norm": 3.1446802616119385, + "learning_rate": 3.9888888888888895e-06, + "logits/chosen": -0.1903941035270691, + "logits/rejected": -0.1602165699005127, + "logps/chosen": -625.3588256835938, + "logps/rejected": -570.8912353515625, + "loss": 0.0411, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.458654880523682, + "rewards/margins": 5.5734758377075195, + "rewards/rejected": -10.03213119506836, + "step": 282 + }, + { + "epoch": 0.176049766718507, + "grad_norm": 1.0923452377319336, + "learning_rate": 3.983333333333334e-06, + "logits/chosen": -0.1089370995759964, + "logits/rejected": -0.18907023966312408, + "logps/chosen": -159.83824157714844, + "logps/rejected": -520.6217651367188, + "loss": 0.0164, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0605506896972656, + "rewards/margins": 8.372678756713867, + "rewards/rejected": -10.433229446411133, + "step": 283 + }, + { + "epoch": 0.17667185069984448, + "grad_norm": 12.23865795135498, + "learning_rate": 3.977777777777778e-06, + "logits/chosen": -0.12051115185022354, + "logits/rejected": -0.10806388407945633, + "logps/chosen": -425.319580078125, + "logps/rejected": -481.9435729980469, + "loss": 0.4368, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.2657275199890137, + "rewards/margins": 6.354762077331543, + "rewards/rejected": -9.620489120483398, + "step": 284 + }, + { + "epoch": 0.17729393468118196, + "grad_norm": 0.0783194974064827, + "learning_rate": 3.972222222222223e-06, + "logits/chosen": -0.09323124587535858, + "logits/rejected": -0.17756161093711853, + "logps/chosen": -390.5177917480469, + "logps/rejected": -702.4782104492188, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.93119478225708, + "rewards/margins": 9.8391752243042, + "rewards/rejected": -11.770369529724121, + "step": 285 + }, + { + "epoch": 0.17791601866251944, + "grad_norm": 0.7968442440032959, + "learning_rate": 3.966666666666667e-06, + "logits/chosen": -0.039085689932107925, + "logits/rejected": -0.17475435137748718, + "logps/chosen": -230.214599609375, + "logps/rejected": -344.4129333496094, + "loss": 0.0205, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1485557556152344, + "rewards/margins": 6.702208518981934, + "rewards/rejected": -9.850764274597168, + "step": 286 + }, + { + "epoch": 0.17853810264385692, + "grad_norm": 0.586760938167572, + "learning_rate": 3.9611111111111115e-06, + "logits/chosen": -0.1612362265586853, + "logits/rejected": -0.19051185250282288, + "logps/chosen": -175.5810546875, + "logps/rejected": -408.9715270996094, + "loss": 0.0113, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7933056354522705, + "rewards/margins": 7.287623882293701, + "rewards/rejected": -10.080928802490234, + "step": 287 + }, + { + "epoch": 0.1791601866251944, + "grad_norm": 6.341541290283203, + "learning_rate": 3.955555555555556e-06, + "logits/chosen": -0.14910605549812317, + "logits/rejected": -0.17914369702339172, + "logps/chosen": -509.611328125, + "logps/rejected": -523.6310424804688, + "loss": 0.1326, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7654309272766113, + "rewards/margins": 7.2451171875, + "rewards/rejected": -11.01054859161377, + "step": 288 + }, + { + "epoch": 0.17978227060653187, + "grad_norm": 2.8114283084869385, + "learning_rate": 3.95e-06, + "logits/chosen": -0.15028893947601318, + "logits/rejected": -0.22882410883903503, + "logps/chosen": -325.9549255371094, + "logps/rejected": -462.719970703125, + "loss": 0.0552, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8883886337280273, + "rewards/margins": 6.230745315551758, + "rewards/rejected": -10.119133949279785, + "step": 289 + }, + { + "epoch": 0.18040435458786935, + "grad_norm": 1.5083872079849243, + "learning_rate": 3.944444444444445e-06, + "logits/chosen": -0.04348205029964447, + "logits/rejected": -0.24326691031455994, + "logps/chosen": -129.21871948242188, + "logps/rejected": -503.5161437988281, + "loss": 0.021, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3853745460510254, + "rewards/margins": 9.303360939025879, + "rewards/rejected": -11.688735961914062, + "step": 290 + }, + { + "epoch": 0.18102643856920683, + "grad_norm": 0.07313703745603561, + "learning_rate": 3.938888888888889e-06, + "logits/chosen": -0.10640101879835129, + "logits/rejected": -0.2336365282535553, + "logps/chosen": -193.503662109375, + "logps/rejected": -406.8467712402344, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.81892991065979, + "rewards/margins": 8.932716369628906, + "rewards/rejected": -11.751646041870117, + "step": 291 + }, + { + "epoch": 0.1816485225505443, + "grad_norm": 14.411521911621094, + "learning_rate": 3.9333333333333335e-06, + "logits/chosen": -0.14502233266830444, + "logits/rejected": -0.21227525174617767, + "logps/chosen": -295.11181640625, + "logps/rejected": -512.0829467773438, + "loss": 0.6538, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.865895986557007, + "rewards/margins": 6.568856716156006, + "rewards/rejected": -10.43475341796875, + "step": 292 + }, + { + "epoch": 0.1822706065318818, + "grad_norm": 1.4493701457977295, + "learning_rate": 3.927777777777778e-06, + "logits/chosen": -0.12391432374715805, + "logits/rejected": -0.2255886346101761, + "logps/chosen": -322.1226806640625, + "logps/rejected": -552.146484375, + "loss": 0.0244, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.003267765045166, + "rewards/margins": 9.465911865234375, + "rewards/rejected": -13.469179153442383, + "step": 293 + }, + { + "epoch": 0.1828926905132193, + "grad_norm": 0.02607862278819084, + "learning_rate": 3.922222222222223e-06, + "logits/chosen": -0.031725164502859116, + "logits/rejected": -0.19095200300216675, + "logps/chosen": -306.68695068359375, + "logps/rejected": -612.4155883789062, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.728273868560791, + "rewards/margins": 10.43445873260498, + "rewards/rejected": -13.16273307800293, + "step": 294 + }, + { + "epoch": 0.18351477449455678, + "grad_norm": 1.169218897819519, + "learning_rate": 3.916666666666667e-06, + "logits/chosen": -0.2128201425075531, + "logits/rejected": -0.28027665615081787, + "logps/chosen": -511.82232666015625, + "logps/rejected": -569.7046508789062, + "loss": 0.0334, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.139200687408447, + "rewards/margins": 6.32867431640625, + "rewards/rejected": -11.467874526977539, + "step": 295 + }, + { + "epoch": 0.18413685847589426, + "grad_norm": 0.07359272241592407, + "learning_rate": 3.911111111111112e-06, + "logits/chosen": -0.15398135781288147, + "logits/rejected": -0.25013861060142517, + "logps/chosen": -210.33457946777344, + "logps/rejected": -464.3741760253906, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.807844400405884, + "rewards/margins": 7.678891181945801, + "rewards/rejected": -10.486736297607422, + "step": 296 + }, + { + "epoch": 0.18475894245723173, + "grad_norm": 11.846293449401855, + "learning_rate": 3.9055555555555555e-06, + "logits/chosen": -0.12352219223976135, + "logits/rejected": -0.10686800628900528, + "logps/chosen": -424.6002502441406, + "logps/rejected": -496.3177795410156, + "loss": 0.336, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.053566932678223, + "rewards/margins": 4.290121078491211, + "rewards/rejected": -9.34368896484375, + "step": 297 + }, + { + "epoch": 0.1853810264385692, + "grad_norm": 13.389044761657715, + "learning_rate": 3.900000000000001e-06, + "logits/chosen": -0.2742255628108978, + "logits/rejected": -0.30539533495903015, + "logps/chosen": -423.1432800292969, + "logps/rejected": -455.3718566894531, + "loss": 0.3376, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.9206411838531494, + "rewards/margins": 5.543953895568848, + "rewards/rejected": -9.464594841003418, + "step": 298 + }, + { + "epoch": 0.1860031104199067, + "grad_norm": 0.8111556172370911, + "learning_rate": 3.894444444444444e-06, + "logits/chosen": 0.004206974059343338, + "logits/rejected": -0.05495908856391907, + "logps/chosen": -491.88250732421875, + "logps/rejected": -573.7401733398438, + "loss": 0.024, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.661932468414307, + "rewards/margins": 7.0668158531188965, + "rewards/rejected": -12.72874927520752, + "step": 299 + }, + { + "epoch": 0.18662519440124417, + "grad_norm": 2.397130250930786, + "learning_rate": 3.88888888888889e-06, + "logits/chosen": -0.11500917375087738, + "logits/rejected": -0.16460350155830383, + "logps/chosen": -279.34515380859375, + "logps/rejected": -417.7414855957031, + "loss": 0.0602, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1349029541015625, + "rewards/margins": 9.306596755981445, + "rewards/rejected": -11.441499710083008, + "step": 300 + }, + { + "epoch": 0.18724727838258165, + "grad_norm": 1.8491296768188477, + "learning_rate": 3.883333333333333e-06, + "logits/chosen": -0.11282320320606232, + "logits/rejected": -0.20594099164009094, + "logps/chosen": -182.48696899414062, + "logps/rejected": -406.8105773925781, + "loss": 0.0404, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7779607772827148, + "rewards/margins": 8.57518196105957, + "rewards/rejected": -10.353141784667969, + "step": 301 + }, + { + "epoch": 0.18786936236391913, + "grad_norm": 5.8792572021484375, + "learning_rate": 3.877777777777778e-06, + "logits/chosen": -0.20287154614925385, + "logits/rejected": -0.23890922963619232, + "logps/chosen": -343.49005126953125, + "logps/rejected": -414.43182373046875, + "loss": 0.1326, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3168575763702393, + "rewards/margins": 4.725082874298096, + "rewards/rejected": -7.041940212249756, + "step": 302 + }, + { + "epoch": 0.1884914463452566, + "grad_norm": 0.12104767560958862, + "learning_rate": 3.872222222222223e-06, + "logits/chosen": -0.07803389430046082, + "logits/rejected": -0.07850227504968643, + "logps/chosen": -458.8078918457031, + "logps/rejected": -551.0790405273438, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.012890338897705, + "rewards/margins": 8.999306678771973, + "rewards/rejected": -14.012197494506836, + "step": 303 + }, + { + "epoch": 0.1891135303265941, + "grad_norm": 0.16798792779445648, + "learning_rate": 3.866666666666667e-06, + "logits/chosen": -0.1706990897655487, + "logits/rejected": -0.27005675435066223, + "logps/chosen": -239.8355712890625, + "logps/rejected": -487.7419738769531, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3611295223236084, + "rewards/margins": 9.664706230163574, + "rewards/rejected": -12.025835037231445, + "step": 304 + }, + { + "epoch": 0.18973561430793157, + "grad_norm": 0.10146372020244598, + "learning_rate": 3.861111111111112e-06, + "logits/chosen": -0.12687526643276215, + "logits/rejected": -0.23298120498657227, + "logps/chosen": -152.6292724609375, + "logps/rejected": -428.32342529296875, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9358538389205933, + "rewards/margins": 10.375428199768066, + "rewards/rejected": -12.311281204223633, + "step": 305 + }, + { + "epoch": 0.19035769828926905, + "grad_norm": 0.9340623617172241, + "learning_rate": 3.855555555555556e-06, + "logits/chosen": -0.08790981024503708, + "logits/rejected": -0.1519991159439087, + "logps/chosen": -479.52557373046875, + "logps/rejected": -477.01177978515625, + "loss": 0.0117, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.896181583404541, + "rewards/margins": 6.955130577087402, + "rewards/rejected": -10.851311683654785, + "step": 306 + }, + { + "epoch": 0.19097978227060652, + "grad_norm": 0.6949451565742493, + "learning_rate": 3.85e-06, + "logits/chosen": -0.20067211985588074, + "logits/rejected": -0.20100940763950348, + "logps/chosen": -363.6421813964844, + "logps/rejected": -456.7184753417969, + "loss": 0.0161, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.067502021789551, + "rewards/margins": 6.717195987701416, + "rewards/rejected": -10.784697532653809, + "step": 307 + }, + { + "epoch": 0.191601866251944, + "grad_norm": 0.11013447493314743, + "learning_rate": 3.844444444444445e-06, + "logits/chosen": 0.008546624332666397, + "logits/rejected": -0.11019997298717499, + "logps/chosen": -333.4945068359375, + "logps/rejected": -559.72021484375, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.004482746124268, + "rewards/margins": 11.27066707611084, + "rewards/rejected": -15.275150299072266, + "step": 308 + }, + { + "epoch": 0.19222395023328148, + "grad_norm": 9.093013763427734, + "learning_rate": 3.838888888888889e-06, + "logits/chosen": -0.1154816597700119, + "logits/rejected": -0.16686215996742249, + "logps/chosen": -335.5590515136719, + "logps/rejected": -413.1333312988281, + "loss": 0.1375, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.8123650550842285, + "rewards/margins": 6.073273658752441, + "rewards/rejected": -10.885639190673828, + "step": 309 + }, + { + "epoch": 0.19284603421461896, + "grad_norm": 10.179825782775879, + "learning_rate": 3.833333333333334e-06, + "logits/chosen": -0.054509781301021576, + "logits/rejected": -0.1947406530380249, + "logps/chosen": -332.4117126464844, + "logps/rejected": -528.9217529296875, + "loss": 0.2022, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.2988693714141846, + "rewards/margins": 10.362524032592773, + "rewards/rejected": -13.661393165588379, + "step": 310 + }, + { + "epoch": 0.19346811819595647, + "grad_norm": 0.18078327178955078, + "learning_rate": 3.827777777777778e-06, + "logits/chosen": -0.0006765536963939667, + "logits/rejected": -0.14219436049461365, + "logps/chosen": -287.09246826171875, + "logps/rejected": -562.771728515625, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1900763511657715, + "rewards/margins": 10.440336227416992, + "rewards/rejected": -13.630412101745605, + "step": 311 + }, + { + "epoch": 0.19409020217729395, + "grad_norm": 2.032473564147949, + "learning_rate": 3.8222222222222224e-06, + "logits/chosen": -0.16562078893184662, + "logits/rejected": -0.22813017666339874, + "logps/chosen": -321.112060546875, + "logps/rejected": -413.3865966796875, + "loss": 0.0448, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.832618236541748, + "rewards/margins": 5.070561408996582, + "rewards/rejected": -9.903180122375488, + "step": 312 + }, + { + "epoch": 0.19471228615863143, + "grad_norm": 11.149839401245117, + "learning_rate": 3.816666666666667e-06, + "logits/chosen": -0.20573855936527252, + "logits/rejected": -0.2992290258407593, + "logps/chosen": -380.82977294921875, + "logps/rejected": -529.879638671875, + "loss": 0.1418, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.583426475524902, + "rewards/margins": 5.64689826965332, + "rewards/rejected": -12.230324745178223, + "step": 313 + }, + { + "epoch": 0.1953343701399689, + "grad_norm": 25.30531883239746, + "learning_rate": 3.8111111111111117e-06, + "logits/chosen": -0.16766656935214996, + "logits/rejected": -0.1943473517894745, + "logps/chosen": -528.7338256835938, + "logps/rejected": -496.34149169921875, + "loss": 0.7063, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.979410409927368, + "rewards/margins": 4.692899227142334, + "rewards/rejected": -8.672309875488281, + "step": 314 + }, + { + "epoch": 0.19595645412130638, + "grad_norm": 0.37734857201576233, + "learning_rate": 3.8055555555555556e-06, + "logits/chosen": -0.18479357659816742, + "logits/rejected": -0.25910285115242004, + "logps/chosen": -392.4141540527344, + "logps/rejected": -533.338134765625, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.306123733520508, + "rewards/margins": 7.038917064666748, + "rewards/rejected": -11.345041275024414, + "step": 315 + }, + { + "epoch": 0.19657853810264386, + "grad_norm": 18.20452880859375, + "learning_rate": 3.8000000000000005e-06, + "logits/chosen": -0.19079279899597168, + "logits/rejected": -0.24241212010383606, + "logps/chosen": -276.8984680175781, + "logps/rejected": -384.6717834472656, + "loss": 0.8321, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.405402183532715, + "rewards/margins": 6.267552852630615, + "rewards/rejected": -11.672955513000488, + "step": 316 + }, + { + "epoch": 0.19720062208398134, + "grad_norm": 2.6795992851257324, + "learning_rate": 3.7944444444444444e-06, + "logits/chosen": -0.13392147421836853, + "logits/rejected": -0.1844259798526764, + "logps/chosen": -357.87762451171875, + "logps/rejected": -497.5061950683594, + "loss": 0.0227, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.003419876098633, + "rewards/margins": 8.011981010437012, + "rewards/rejected": -13.015401840209961, + "step": 317 + }, + { + "epoch": 0.19782270606531882, + "grad_norm": 1.4615247249603271, + "learning_rate": 3.7888888888888893e-06, + "logits/chosen": -0.171232208609581, + "logits/rejected": -0.2835785150527954, + "logps/chosen": -377.25799560546875, + "logps/rejected": -609.0706787109375, + "loss": 0.0147, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1616735458374023, + "rewards/margins": 9.900686264038086, + "rewards/rejected": -12.062360763549805, + "step": 318 + }, + { + "epoch": 0.1984447900466563, + "grad_norm": 48.20172882080078, + "learning_rate": 3.7833333333333337e-06, + "logits/chosen": -0.08509072661399841, + "logits/rejected": -0.18594932556152344, + "logps/chosen": -317.8309631347656, + "logps/rejected": -386.01824951171875, + "loss": 1.0146, + "rewards/accuracies": 0.625, + "rewards/chosen": -6.453725337982178, + "rewards/margins": 3.463418483734131, + "rewards/rejected": -9.917142868041992, + "step": 319 + }, + { + "epoch": 0.19906687402799378, + "grad_norm": 0.8027237057685852, + "learning_rate": 3.777777777777778e-06, + "logits/chosen": -0.09653080254793167, + "logits/rejected": -0.22551584243774414, + "logps/chosen": -302.9489440917969, + "logps/rejected": -557.2357177734375, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.186956882476807, + "rewards/margins": 9.540058135986328, + "rewards/rejected": -13.727015495300293, + "step": 320 + }, + { + "epoch": 0.19968895800933126, + "grad_norm": 0.18862368166446686, + "learning_rate": 3.7722222222222225e-06, + "logits/chosen": -0.10923755913972855, + "logits/rejected": -0.1638440638780594, + "logps/chosen": -301.8031311035156, + "logps/rejected": -545.3920288085938, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.759068489074707, + "rewards/margins": 10.169463157653809, + "rewards/rejected": -13.928532600402832, + "step": 321 + }, + { + "epoch": 0.20031104199066874, + "grad_norm": 0.4338468313217163, + "learning_rate": 3.766666666666667e-06, + "logits/chosen": -0.09893743693828583, + "logits/rejected": -0.2181713879108429, + "logps/chosen": -224.11026000976562, + "logps/rejected": -462.6011047363281, + "loss": 0.0064, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.371335744857788, + "rewards/margins": 9.610607147216797, + "rewards/rejected": -12.98194408416748, + "step": 322 + }, + { + "epoch": 0.20093312597200622, + "grad_norm": 2.3008389472961426, + "learning_rate": 3.7611111111111113e-06, + "logits/chosen": -0.12894758582115173, + "logits/rejected": -0.18000845611095428, + "logps/chosen": -357.9541015625, + "logps/rejected": -467.2584228515625, + "loss": 0.0411, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.995943307876587, + "rewards/margins": 6.40446662902832, + "rewards/rejected": -10.400408744812012, + "step": 323 + }, + { + "epoch": 0.2015552099533437, + "grad_norm": 6.87632417678833, + "learning_rate": 3.7555555555555557e-06, + "logits/chosen": -0.2005600780248642, + "logits/rejected": -0.2722019553184509, + "logps/chosen": -379.22802734375, + "logps/rejected": -530.455322265625, + "loss": 0.0903, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2425832748413086, + "rewards/margins": 7.511867523193359, + "rewards/rejected": -10.754450798034668, + "step": 324 + }, + { + "epoch": 0.20217729393468117, + "grad_norm": 0.5817131400108337, + "learning_rate": 3.7500000000000005e-06, + "logits/chosen": -0.1757623553276062, + "logits/rejected": -0.19050246477127075, + "logps/chosen": -215.6142120361328, + "logps/rejected": -393.9692687988281, + "loss": 0.0077, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7086167335510254, + "rewards/margins": 7.628800392150879, + "rewards/rejected": -10.337417602539062, + "step": 325 + }, + { + "epoch": 0.20279937791601865, + "grad_norm": 20.24099349975586, + "learning_rate": 3.744444444444445e-06, + "logits/chosen": -0.2428133487701416, + "logits/rejected": -0.2315109223127365, + "logps/chosen": -362.8164367675781, + "logps/rejected": -520.068603515625, + "loss": 0.056, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6495227813720703, + "rewards/margins": 7.493235111236572, + "rewards/rejected": -11.142757415771484, + "step": 326 + }, + { + "epoch": 0.20342146189735613, + "grad_norm": 0.5783504247665405, + "learning_rate": 3.7388888888888893e-06, + "logits/chosen": -0.15442150831222534, + "logits/rejected": -0.22754240036010742, + "logps/chosen": -410.40045166015625, + "logps/rejected": -489.0352478027344, + "loss": 0.0081, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5244674682617188, + "rewards/margins": 6.6846723556518555, + "rewards/rejected": -10.209139823913574, + "step": 327 + }, + { + "epoch": 0.2040435458786936, + "grad_norm": 4.367368221282959, + "learning_rate": 3.7333333333333337e-06, + "logits/chosen": 0.005438759922981262, + "logits/rejected": -0.1364053338766098, + "logps/chosen": -181.89207458496094, + "logps/rejected": -427.423095703125, + "loss": 0.2116, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.5139732360839844, + "rewards/margins": 10.898889541625977, + "rewards/rejected": -14.412863731384277, + "step": 328 + }, + { + "epoch": 0.20466562986003112, + "grad_norm": 1.0591031312942505, + "learning_rate": 3.727777777777778e-06, + "logits/chosen": -0.11284130811691284, + "logits/rejected": -0.2150920331478119, + "logps/chosen": -637.9334716796875, + "logps/rejected": -721.427490234375, + "loss": 0.0091, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.276348114013672, + "rewards/margins": 7.676934242248535, + "rewards/rejected": -13.953282356262207, + "step": 329 + }, + { + "epoch": 0.2052877138413686, + "grad_norm": 0.8814737796783447, + "learning_rate": 3.7222222222222225e-06, + "logits/chosen": -0.21122510731220245, + "logits/rejected": -0.22258779406547546, + "logps/chosen": -286.2511291503906, + "logps/rejected": -403.92620849609375, + "loss": 0.014, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3176865577697754, + "rewards/margins": 6.915071487426758, + "rewards/rejected": -9.232757568359375, + "step": 330 + }, + { + "epoch": 0.20590979782270608, + "grad_norm": 8.058083534240723, + "learning_rate": 3.716666666666667e-06, + "logits/chosen": -0.09542589634656906, + "logits/rejected": -0.21601343154907227, + "logps/chosen": -463.3930358886719, + "logps/rejected": -528.101318359375, + "loss": 0.1097, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.084930419921875, + "rewards/margins": 8.0634183883667, + "rewards/rejected": -12.148348808288574, + "step": 331 + }, + { + "epoch": 0.20653188180404355, + "grad_norm": 0.5373348593711853, + "learning_rate": 3.7111111111111113e-06, + "logits/chosen": -0.0671771690249443, + "logits/rejected": -0.1541689932346344, + "logps/chosen": -280.47412109375, + "logps/rejected": -451.6821594238281, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3421378135681152, + "rewards/margins": 8.082401275634766, + "rewards/rejected": -11.424538612365723, + "step": 332 + }, + { + "epoch": 0.20715396578538103, + "grad_norm": 14.336930274963379, + "learning_rate": 3.705555555555556e-06, + "logits/chosen": -0.1966056376695633, + "logits/rejected": -0.23180031776428223, + "logps/chosen": -438.124267578125, + "logps/rejected": -607.8470458984375, + "loss": 0.446, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.6886088848114014, + "rewards/margins": 5.465229511260986, + "rewards/rejected": -9.153838157653809, + "step": 333 + }, + { + "epoch": 0.2077760497667185, + "grad_norm": 2.8990976810455322, + "learning_rate": 3.7e-06, + "logits/chosen": -0.17263835668563843, + "logits/rejected": -0.23792961239814758, + "logps/chosen": -411.1330261230469, + "logps/rejected": -567.937744140625, + "loss": 0.0628, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.87552809715271, + "rewards/margins": 8.45396900177002, + "rewards/rejected": -12.329497337341309, + "step": 334 + }, + { + "epoch": 0.208398133748056, + "grad_norm": 0.0484350211918354, + "learning_rate": 3.694444444444445e-06, + "logits/chosen": -0.12815432250499725, + "logits/rejected": -0.17835107445716858, + "logps/chosen": -414.9893798828125, + "logps/rejected": -490.8114013671875, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9496493339538574, + "rewards/margins": 9.752138137817383, + "rewards/rejected": -12.701786994934082, + "step": 335 + }, + { + "epoch": 0.20902021772939347, + "grad_norm": 6.507220268249512, + "learning_rate": 3.688888888888889e-06, + "logits/chosen": -0.23672106862068176, + "logits/rejected": -0.2982875406742096, + "logps/chosen": -476.8284912109375, + "logps/rejected": -574.911865234375, + "loss": 0.1496, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.1567940711975098, + "rewards/margins": 7.165985107421875, + "rewards/rejected": -10.322778701782227, + "step": 336 + }, + { + "epoch": 0.20964230171073095, + "grad_norm": 0.8922080397605896, + "learning_rate": 3.6833333333333338e-06, + "logits/chosen": -0.06133444607257843, + "logits/rejected": -0.1913379430770874, + "logps/chosen": -192.1638946533203, + "logps/rejected": -388.81634521484375, + "loss": 0.0126, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5963149070739746, + "rewards/margins": 8.28589153289795, + "rewards/rejected": -11.882207870483398, + "step": 337 + }, + { + "epoch": 0.21026438569206843, + "grad_norm": 17.994171142578125, + "learning_rate": 3.6777777777777778e-06, + "logits/chosen": -0.17548298835754395, + "logits/rejected": -0.18905183672904968, + "logps/chosen": -465.01495361328125, + "logps/rejected": -562.777587890625, + "loss": 0.6875, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.018253803253174, + "rewards/margins": 6.166885852813721, + "rewards/rejected": -12.185140609741211, + "step": 338 + }, + { + "epoch": 0.2108864696734059, + "grad_norm": 0.5738991498947144, + "learning_rate": 3.6722222222222226e-06, + "logits/chosen": -0.20180368423461914, + "logits/rejected": -0.22199206054210663, + "logps/chosen": -247.31788635253906, + "logps/rejected": -371.727294921875, + "loss": 0.0088, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.620168685913086, + "rewards/margins": 8.354777336120605, + "rewards/rejected": -9.974946022033691, + "step": 339 + }, + { + "epoch": 0.2115085536547434, + "grad_norm": 13.470012664794922, + "learning_rate": 3.6666666666666666e-06, + "logits/chosen": -0.11512891203165054, + "logits/rejected": -0.1769413948059082, + "logps/chosen": -345.9811096191406, + "logps/rejected": -493.4841613769531, + "loss": 0.2397, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.953691244125366, + "rewards/margins": 9.004188537597656, + "rewards/rejected": -12.957880020141602, + "step": 340 + }, + { + "epoch": 0.21213063763608087, + "grad_norm": 0.7773336172103882, + "learning_rate": 3.6611111111111114e-06, + "logits/chosen": -0.20654502511024475, + "logits/rejected": -0.23511651158332825, + "logps/chosen": -522.2822265625, + "logps/rejected": -666.371337890625, + "loss": 0.0101, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.919970989227295, + "rewards/margins": 8.817320823669434, + "rewards/rejected": -12.737292289733887, + "step": 341 + }, + { + "epoch": 0.21275272161741834, + "grad_norm": 0.18059542775154114, + "learning_rate": 3.6555555555555562e-06, + "logits/chosen": -0.09957277029752731, + "logits/rejected": -0.2179642915725708, + "logps/chosen": -234.1021270751953, + "logps/rejected": -472.538818359375, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5047701597213745, + "rewards/margins": 10.285346984863281, + "rewards/rejected": -11.790117263793945, + "step": 342 + }, + { + "epoch": 0.21337480559875582, + "grad_norm": 1.0630011558532715, + "learning_rate": 3.65e-06, + "logits/chosen": -0.11683446168899536, + "logits/rejected": -0.23474550247192383, + "logps/chosen": -320.64031982421875, + "logps/rejected": -561.516357421875, + "loss": 0.0138, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2701375484466553, + "rewards/margins": 9.812124252319336, + "rewards/rejected": -13.08226203918457, + "step": 343 + }, + { + "epoch": 0.2139968895800933, + "grad_norm": 3.113795757293701, + "learning_rate": 3.644444444444445e-06, + "logits/chosen": -0.14496782422065735, + "logits/rejected": -0.21670858561992645, + "logps/chosen": -310.59942626953125, + "logps/rejected": -505.5811767578125, + "loss": 0.0265, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.780181884765625, + "rewards/margins": 8.750856399536133, + "rewards/rejected": -13.531037330627441, + "step": 344 + }, + { + "epoch": 0.21461897356143078, + "grad_norm": 0.22519256174564362, + "learning_rate": 3.638888888888889e-06, + "logits/chosen": -0.21577675640583038, + "logits/rejected": -0.24033984541893005, + "logps/chosen": -429.7940368652344, + "logps/rejected": -566.9864501953125, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8621768951416016, + "rewards/margins": 8.13748550415039, + "rewards/rejected": -10.999661445617676, + "step": 345 + }, + { + "epoch": 0.21524105754276826, + "grad_norm": 6.833222389221191, + "learning_rate": 3.633333333333334e-06, + "logits/chosen": -0.08164580166339874, + "logits/rejected": -0.17403572797775269, + "logps/chosen": -450.984375, + "logps/rejected": -596.1024169921875, + "loss": 0.0871, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.2477617263793945, + "rewards/margins": 7.5104570388793945, + "rewards/rejected": -11.758218765258789, + "step": 346 + }, + { + "epoch": 0.21586314152410577, + "grad_norm": 0.3775492012500763, + "learning_rate": 3.627777777777778e-06, + "logits/chosen": -0.12578441202640533, + "logits/rejected": -0.23718059062957764, + "logps/chosen": -439.7431640625, + "logps/rejected": -677.3192749023438, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.5854716300964355, + "rewards/margins": 12.087300300598145, + "rewards/rejected": -17.672771453857422, + "step": 347 + }, + { + "epoch": 0.21648522550544325, + "grad_norm": 9.997420310974121, + "learning_rate": 3.6222222222222226e-06, + "logits/chosen": -0.06224264204502106, + "logits/rejected": -0.08258426189422607, + "logps/chosen": -387.1405944824219, + "logps/rejected": -465.9384765625, + "loss": 0.179, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.364864826202393, + "rewards/margins": 6.43337869644165, + "rewards/rejected": -10.798242568969727, + "step": 348 + }, + { + "epoch": 0.21710730948678073, + "grad_norm": 20.763446807861328, + "learning_rate": 3.616666666666667e-06, + "logits/chosen": -0.1708325296640396, + "logits/rejected": -0.22189892828464508, + "logps/chosen": -420.20599365234375, + "logps/rejected": -538.9539794921875, + "loss": 1.1131, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.224533557891846, + "rewards/margins": 8.561653137207031, + "rewards/rejected": -14.786188125610352, + "step": 349 + }, + { + "epoch": 0.2177293934681182, + "grad_norm": 0.026409490033984184, + "learning_rate": 3.6111111111111115e-06, + "logits/chosen": -0.11329104006290436, + "logits/rejected": -0.17425042390823364, + "logps/chosen": -175.23570251464844, + "logps/rejected": -461.90130615234375, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0881595611572266, + "rewards/margins": 9.47543716430664, + "rewards/rejected": -12.563596725463867, + "step": 350 + }, + { + "epoch": 0.21835147744945568, + "grad_norm": 11.509342193603516, + "learning_rate": 3.605555555555556e-06, + "logits/chosen": 0.030307969078421593, + "logits/rejected": -0.13414856791496277, + "logps/chosen": -399.7902526855469, + "logps/rejected": -538.86376953125, + "loss": 0.2369, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.3192219734191895, + "rewards/margins": 6.249074935913086, + "rewards/rejected": -10.568296432495117, + "step": 351 + }, + { + "epoch": 0.21897356143079316, + "grad_norm": 0.06278355419635773, + "learning_rate": 3.6000000000000003e-06, + "logits/chosen": -0.016615409404039383, + "logits/rejected": -0.1305798441171646, + "logps/chosen": -206.9671630859375, + "logps/rejected": -425.337890625, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.842270851135254, + "rewards/margins": 9.94467544555664, + "rewards/rejected": -12.786947250366211, + "step": 352 + }, + { + "epoch": 0.21959564541213064, + "grad_norm": 15.22623348236084, + "learning_rate": 3.5944444444444447e-06, + "logits/chosen": -0.1217794194817543, + "logits/rejected": -0.205989271402359, + "logps/chosen": -389.064697265625, + "logps/rejected": -463.67645263671875, + "loss": 0.3422, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.209240913391113, + "rewards/margins": 7.778649806976318, + "rewards/rejected": -11.987890243530273, + "step": 353 + }, + { + "epoch": 0.22021772939346812, + "grad_norm": 0.06755994260311127, + "learning_rate": 3.588888888888889e-06, + "logits/chosen": -0.10339230298995972, + "logits/rejected": -0.1725088506937027, + "logps/chosen": -245.9210205078125, + "logps/rejected": -500.5525207519531, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.09561014175415, + "rewards/margins": 9.418173789978027, + "rewards/rejected": -13.513784408569336, + "step": 354 + }, + { + "epoch": 0.2208398133748056, + "grad_norm": 0.3437889814376831, + "learning_rate": 3.5833333333333335e-06, + "logits/chosen": -0.11296197026968002, + "logits/rejected": -0.1622902899980545, + "logps/chosen": -299.91375732421875, + "logps/rejected": -416.4541931152344, + "loss": 0.0069, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3866829872131348, + "rewards/margins": 8.081198692321777, + "rewards/rejected": -11.46788215637207, + "step": 355 + }, + { + "epoch": 0.22146189735614308, + "grad_norm": 0.18022121489048004, + "learning_rate": 3.577777777777778e-06, + "logits/chosen": -0.20010042190551758, + "logits/rejected": -0.26866522431373596, + "logps/chosen": -325.42041015625, + "logps/rejected": -488.3052673339844, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3931467533111572, + "rewards/margins": 6.337854385375977, + "rewards/rejected": -9.731000900268555, + "step": 356 + }, + { + "epoch": 0.22208398133748056, + "grad_norm": 0.35737305879592896, + "learning_rate": 3.5722222222222223e-06, + "logits/chosen": -0.1810259222984314, + "logits/rejected": -0.2528773248195648, + "logps/chosen": -425.72998046875, + "logps/rejected": -540.394287109375, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.2725830078125, + "rewards/margins": 8.15455436706543, + "rewards/rejected": -14.42713737487793, + "step": 357 + }, + { + "epoch": 0.22270606531881804, + "grad_norm": 0.6098016500473022, + "learning_rate": 3.566666666666667e-06, + "logits/chosen": -0.1760822832584381, + "logits/rejected": -0.27641376852989197, + "logps/chosen": -259.88360595703125, + "logps/rejected": -429.99285888671875, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.19488525390625, + "rewards/margins": 7.168217658996582, + "rewards/rejected": -11.363101959228516, + "step": 358 + }, + { + "epoch": 0.22332814930015552, + "grad_norm": 4.290386199951172, + "learning_rate": 3.561111111111111e-06, + "logits/chosen": -0.15001511573791504, + "logits/rejected": -0.21489892899990082, + "logps/chosen": -506.4664306640625, + "logps/rejected": -613.994140625, + "loss": 0.0409, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.779850482940674, + "rewards/margins": 9.983102798461914, + "rewards/rejected": -12.76295280456543, + "step": 359 + }, + { + "epoch": 0.223950233281493, + "grad_norm": 2.9640703201293945, + "learning_rate": 3.555555555555556e-06, + "logits/chosen": -0.18051818013191223, + "logits/rejected": -0.2511710524559021, + "logps/chosen": -485.75384521484375, + "logps/rejected": -709.4149780273438, + "loss": 0.0301, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3066911697387695, + "rewards/margins": 10.648893356323242, + "rewards/rejected": -13.955585479736328, + "step": 360 + }, + { + "epoch": 0.22457231726283047, + "grad_norm": 16.984848022460938, + "learning_rate": 3.5500000000000003e-06, + "logits/chosen": -0.20869368314743042, + "logits/rejected": -0.22295644879341125, + "logps/chosen": -446.1551208496094, + "logps/rejected": -402.0832824707031, + "loss": 0.4751, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.934479236602783, + "rewards/margins": 4.949220657348633, + "rewards/rejected": -9.883699417114258, + "step": 361 + }, + { + "epoch": 0.22519440124416795, + "grad_norm": 0.38781794905662537, + "learning_rate": 3.5444444444444447e-06, + "logits/chosen": -0.18780189752578735, + "logits/rejected": -0.2573065161705017, + "logps/chosen": -336.21856689453125, + "logps/rejected": -592.5240478515625, + "loss": 0.0077, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2945284843444824, + "rewards/margins": 8.605562210083008, + "rewards/rejected": -11.900091171264648, + "step": 362 + }, + { + "epoch": 0.22581648522550543, + "grad_norm": 0.2553725838661194, + "learning_rate": 3.538888888888889e-06, + "logits/chosen": -0.12298416346311569, + "logits/rejected": -0.14757713675498962, + "logps/chosen": -325.606201171875, + "logps/rejected": -511.6773986816406, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.114407539367676, + "rewards/margins": 8.453333854675293, + "rewards/rejected": -12.567741394042969, + "step": 363 + }, + { + "epoch": 0.2264385692068429, + "grad_norm": 0.04131851717829704, + "learning_rate": 3.5333333333333335e-06, + "logits/chosen": -0.14848542213439941, + "logits/rejected": -0.2643916606903076, + "logps/chosen": -467.01422119140625, + "logps/rejected": -662.3174438476562, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.8769450187683105, + "rewards/margins": 10.566459655761719, + "rewards/rejected": -16.443405151367188, + "step": 364 + }, + { + "epoch": 0.22706065318818042, + "grad_norm": 0.014407293871045113, + "learning_rate": 3.5277777777777784e-06, + "logits/chosen": -0.15832027792930603, + "logits/rejected": -0.2699451148509979, + "logps/chosen": -238.2161865234375, + "logps/rejected": -508.69921875, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3034543991088867, + "rewards/margins": 10.606285095214844, + "rewards/rejected": -12.90973949432373, + "step": 365 + }, + { + "epoch": 0.2276827371695179, + "grad_norm": 1.61811363697052, + "learning_rate": 3.5222222222222223e-06, + "logits/chosen": -0.14667481184005737, + "logits/rejected": -0.301981121301651, + "logps/chosen": -264.9617614746094, + "logps/rejected": -491.2540588378906, + "loss": 0.0155, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.448193550109863, + "rewards/margins": 8.293634414672852, + "rewards/rejected": -12.741828918457031, + "step": 366 + }, + { + "epoch": 0.22830482115085537, + "grad_norm": 0.20216894149780273, + "learning_rate": 3.516666666666667e-06, + "logits/chosen": -0.14804862439632416, + "logits/rejected": -0.28644809126853943, + "logps/chosen": -280.64251708984375, + "logps/rejected": -574.1988525390625, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.146522521972656, + "rewards/margins": 10.903705596923828, + "rewards/rejected": -15.0502290725708, + "step": 367 + }, + { + "epoch": 0.22892690513219285, + "grad_norm": 0.38105887174606323, + "learning_rate": 3.511111111111111e-06, + "logits/chosen": -0.06191030517220497, + "logits/rejected": -0.16082048416137695, + "logps/chosen": -297.89471435546875, + "logps/rejected": -592.9227905273438, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.380183696746826, + "rewards/margins": 9.996044158935547, + "rewards/rejected": -13.376228332519531, + "step": 368 + }, + { + "epoch": 0.22954898911353033, + "grad_norm": 10.316984176635742, + "learning_rate": 3.505555555555556e-06, + "logits/chosen": -0.06523730605840683, + "logits/rejected": -0.1675911545753479, + "logps/chosen": -292.000244140625, + "logps/rejected": -598.212890625, + "loss": 0.0755, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.109089374542236, + "rewards/margins": 8.413310050964355, + "rewards/rejected": -12.522398948669434, + "step": 369 + }, + { + "epoch": 0.2301710730948678, + "grad_norm": 1.8596535921096802, + "learning_rate": 3.5e-06, + "logits/chosen": -0.16046646237373352, + "logits/rejected": -0.24536313116550446, + "logps/chosen": -268.98883056640625, + "logps/rejected": -521.8341064453125, + "loss": 0.0161, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.097864627838135, + "rewards/margins": 8.194323539733887, + "rewards/rejected": -13.29218864440918, + "step": 370 + }, + { + "epoch": 0.2307931570762053, + "grad_norm": 0.48437583446502686, + "learning_rate": 3.4944444444444448e-06, + "logits/chosen": -0.14299863576889038, + "logits/rejected": -0.23104514181613922, + "logps/chosen": -302.6101379394531, + "logps/rejected": -503.5257263183594, + "loss": 0.0069, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.083168983459473, + "rewards/margins": 8.383268356323242, + "rewards/rejected": -12.466438293457031, + "step": 371 + }, + { + "epoch": 0.23141524105754277, + "grad_norm": 0.06720536947250366, + "learning_rate": 3.4888888888888896e-06, + "logits/chosen": -0.15616759657859802, + "logits/rejected": -0.3077367842197418, + "logps/chosen": -222.61346435546875, + "logps/rejected": -573.4774780273438, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.303955078125, + "rewards/margins": 13.112669944763184, + "rewards/rejected": -17.4166259765625, + "step": 372 + }, + { + "epoch": 0.23203732503888025, + "grad_norm": 9.391034126281738, + "learning_rate": 3.4833333333333336e-06, + "logits/chosen": -0.13026970624923706, + "logits/rejected": -0.18529203534126282, + "logps/chosen": -224.83937072753906, + "logps/rejected": -374.99945068359375, + "loss": 0.2428, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.7359795570373535, + "rewards/margins": 7.217138290405273, + "rewards/rejected": -10.953117370605469, + "step": 373 + }, + { + "epoch": 0.23265940902021773, + "grad_norm": 1.1025294065475464, + "learning_rate": 3.4777777777777784e-06, + "logits/chosen": -0.1357441395521164, + "logits/rejected": -0.16787812113761902, + "logps/chosen": -309.89630126953125, + "logps/rejected": -451.30023193359375, + "loss": 0.0168, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.619588851928711, + "rewards/margins": 8.439698219299316, + "rewards/rejected": -12.059286117553711, + "step": 374 + }, + { + "epoch": 0.2332814930015552, + "grad_norm": 13.255566596984863, + "learning_rate": 3.4722222222222224e-06, + "logits/chosen": -0.11587365716695786, + "logits/rejected": -0.22992996871471405, + "logps/chosen": -418.07659912109375, + "logps/rejected": -707.4680786132812, + "loss": 0.3606, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.922843933105469, + "rewards/margins": 10.565007209777832, + "rewards/rejected": -15.487850189208984, + "step": 375 + }, + { + "epoch": 0.23390357698289269, + "grad_norm": 0.1321662813425064, + "learning_rate": 3.4666666666666672e-06, + "logits/chosen": -0.12376575917005539, + "logits/rejected": -0.23545652627944946, + "logps/chosen": -398.9034423828125, + "logps/rejected": -625.6297607421875, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5136966705322266, + "rewards/margins": 10.097312927246094, + "rewards/rejected": -13.611008644104004, + "step": 376 + }, + { + "epoch": 0.23452566096423016, + "grad_norm": 1.365851879119873, + "learning_rate": 3.461111111111111e-06, + "logits/chosen": -0.10350950807332993, + "logits/rejected": -0.21221886575222015, + "logps/chosen": -267.8864440917969, + "logps/rejected": -655.3182373046875, + "loss": 0.0215, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4596803188323975, + "rewards/margins": 10.464975357055664, + "rewards/rejected": -12.92465591430664, + "step": 377 + }, + { + "epoch": 0.23514774494556764, + "grad_norm": 0.08298648148775101, + "learning_rate": 3.455555555555556e-06, + "logits/chosen": -0.08038352429866791, + "logits/rejected": -0.15960431098937988, + "logps/chosen": -299.8287353515625, + "logps/rejected": -541.7484130859375, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4188730716705322, + "rewards/margins": 11.397758483886719, + "rewards/rejected": -14.816630363464355, + "step": 378 + }, + { + "epoch": 0.23576982892690512, + "grad_norm": 0.17237895727157593, + "learning_rate": 3.45e-06, + "logits/chosen": -0.11609259247779846, + "logits/rejected": -0.1665896326303482, + "logps/chosen": -235.63449096679688, + "logps/rejected": -390.3968505859375, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.258384943008423, + "rewards/margins": 7.8450727462768555, + "rewards/rejected": -11.103458404541016, + "step": 379 + }, + { + "epoch": 0.2363919129082426, + "grad_norm": 2.9488673210144043, + "learning_rate": 3.444444444444445e-06, + "logits/chosen": -0.09931506216526031, + "logits/rejected": -0.1477060765028, + "logps/chosen": -360.4647521972656, + "logps/rejected": -541.56640625, + "loss": 0.0385, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.368687152862549, + "rewards/margins": 8.384700775146484, + "rewards/rejected": -12.753388404846191, + "step": 380 + }, + { + "epoch": 0.23701399688958008, + "grad_norm": 4.5132832527160645, + "learning_rate": 3.4388888888888892e-06, + "logits/chosen": -0.13386908173561096, + "logits/rejected": -0.22458872199058533, + "logps/chosen": -244.76141357421875, + "logps/rejected": -523.00927734375, + "loss": 0.0921, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.369078636169434, + "rewards/margins": 9.182757377624512, + "rewards/rejected": -13.551836013793945, + "step": 381 + }, + { + "epoch": 0.2376360808709176, + "grad_norm": 11.065957069396973, + "learning_rate": 3.4333333333333336e-06, + "logits/chosen": -0.17291612923145294, + "logits/rejected": -0.19855031371116638, + "logps/chosen": -334.8529052734375, + "logps/rejected": -391.1678771972656, + "loss": 0.2345, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.5479581356048584, + "rewards/margins": 7.747190952301025, + "rewards/rejected": -10.295148849487305, + "step": 382 + }, + { + "epoch": 0.23825816485225507, + "grad_norm": 0.8340943455696106, + "learning_rate": 3.427777777777778e-06, + "logits/chosen": -0.1793213039636612, + "logits/rejected": -0.21241940557956696, + "logps/chosen": -461.47332763671875, + "logps/rejected": -377.36016845703125, + "loss": 0.0133, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.708181619644165, + "rewards/margins": 6.348997116088867, + "rewards/rejected": -10.057178497314453, + "step": 383 + }, + { + "epoch": 0.23888024883359255, + "grad_norm": 0.49028706550598145, + "learning_rate": 3.4222222222222224e-06, + "logits/chosen": -0.07210139185190201, + "logits/rejected": -0.15341822803020477, + "logps/chosen": -130.67788696289062, + "logps/rejected": -419.41741943359375, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6946709156036377, + "rewards/margins": 8.433332443237305, + "rewards/rejected": -11.128003120422363, + "step": 384 + }, + { + "epoch": 0.23950233281493002, + "grad_norm": 4.076060771942139, + "learning_rate": 3.416666666666667e-06, + "logits/chosen": -0.257233202457428, + "logits/rejected": -0.2595617473125458, + "logps/chosen": -357.646728515625, + "logps/rejected": -461.977783203125, + "loss": 0.0689, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.900479793548584, + "rewards/margins": 6.897343635559082, + "rewards/rejected": -9.797823905944824, + "step": 385 + }, + { + "epoch": 0.2401244167962675, + "grad_norm": 0.14082859456539154, + "learning_rate": 3.4111111111111113e-06, + "logits/chosen": -0.09942559897899628, + "logits/rejected": -0.25447195768356323, + "logps/chosen": -259.97052001953125, + "logps/rejected": -566.1975708007812, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6355254650115967, + "rewards/margins": 12.030058860778809, + "rewards/rejected": -14.665582656860352, + "step": 386 + }, + { + "epoch": 0.24074650077760498, + "grad_norm": 0.7881249189376831, + "learning_rate": 3.4055555555555557e-06, + "logits/chosen": -0.17156000435352325, + "logits/rejected": -0.21836459636688232, + "logps/chosen": -568.3043212890625, + "logps/rejected": -592.5196533203125, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.905832290649414, + "rewards/margins": 9.074519157409668, + "rewards/rejected": -13.980350494384766, + "step": 387 + }, + { + "epoch": 0.24136858475894246, + "grad_norm": 3.5424513816833496, + "learning_rate": 3.4000000000000005e-06, + "logits/chosen": -0.09448711574077606, + "logits/rejected": -0.22784167528152466, + "logps/chosen": -249.9168243408203, + "logps/rejected": -431.3128356933594, + "loss": 0.0625, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9340550899505615, + "rewards/margins": 8.774353981018066, + "rewards/rejected": -12.708409309387207, + "step": 388 + }, + { + "epoch": 0.24199066874027994, + "grad_norm": 2.0734033584594727, + "learning_rate": 3.3944444444444445e-06, + "logits/chosen": -0.09376323968172073, + "logits/rejected": -0.1777053028345108, + "logps/chosen": -302.1076354980469, + "logps/rejected": -554.873779296875, + "loss": 0.019, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.146059513092041, + "rewards/margins": 10.517495155334473, + "rewards/rejected": -15.663555145263672, + "step": 389 + }, + { + "epoch": 0.24261275272161742, + "grad_norm": 5.5834503173828125, + "learning_rate": 3.3888888888888893e-06, + "logits/chosen": -0.2597891688346863, + "logits/rejected": -0.29964298009872437, + "logps/chosen": -506.893310546875, + "logps/rejected": -623.1090698242188, + "loss": 0.0748, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.747359275817871, + "rewards/margins": 8.048364639282227, + "rewards/rejected": -12.795723915100098, + "step": 390 + }, + { + "epoch": 0.2432348367029549, + "grad_norm": 4.963590621948242, + "learning_rate": 3.3833333333333333e-06, + "logits/chosen": -0.28298333287239075, + "logits/rejected": -0.3114388585090637, + "logps/chosen": -367.48699951171875, + "logps/rejected": -432.4874572753906, + "loss": 0.1391, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.447333812713623, + "rewards/margins": 4.259817600250244, + "rewards/rejected": -7.707151412963867, + "step": 391 + }, + { + "epoch": 0.24385692068429238, + "grad_norm": 0.2512289583683014, + "learning_rate": 3.377777777777778e-06, + "logits/chosen": -0.10509233176708221, + "logits/rejected": -0.2769317328929901, + "logps/chosen": -266.9497375488281, + "logps/rejected": -634.595458984375, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2096481323242188, + "rewards/margins": 11.281206130981445, + "rewards/rejected": -14.490854263305664, + "step": 392 + }, + { + "epoch": 0.24447900466562986, + "grad_norm": 11.823335647583008, + "learning_rate": 3.372222222222222e-06, + "logits/chosen": -0.12267563492059708, + "logits/rejected": -0.19850589334964752, + "logps/chosen": -433.88037109375, + "logps/rejected": -517.8289794921875, + "loss": 0.1823, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.944214344024658, + "rewards/margins": 6.920692443847656, + "rewards/rejected": -11.864906311035156, + "step": 393 + }, + { + "epoch": 0.24510108864696734, + "grad_norm": 0.5390133857727051, + "learning_rate": 3.366666666666667e-06, + "logits/chosen": -0.16388043761253357, + "logits/rejected": -0.23693051934242249, + "logps/chosen": -346.424072265625, + "logps/rejected": -477.0155334472656, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0074269771575928, + "rewards/margins": 10.624791145324707, + "rewards/rejected": -13.632218360900879, + "step": 394 + }, + { + "epoch": 0.24572317262830481, + "grad_norm": 16.252132415771484, + "learning_rate": 3.3611111111111117e-06, + "logits/chosen": -0.23383793234825134, + "logits/rejected": -0.32776641845703125, + "logps/chosen": -391.85968017578125, + "logps/rejected": -602.112060546875, + "loss": 0.6981, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.830857038497925, + "rewards/margins": 6.380799293518066, + "rewards/rejected": -9.211657524108887, + "step": 395 + }, + { + "epoch": 0.2463452566096423, + "grad_norm": 1.0770540237426758, + "learning_rate": 3.3555555555555557e-06, + "logits/chosen": -0.1502060890197754, + "logits/rejected": -0.2934280037879944, + "logps/chosen": -275.7140197753906, + "logps/rejected": -635.568115234375, + "loss": 0.0145, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.545418739318848, + "rewards/margins": 10.11474895477295, + "rewards/rejected": -14.660167694091797, + "step": 396 + }, + { + "epoch": 0.24696734059097977, + "grad_norm": 0.30673420429229736, + "learning_rate": 3.3500000000000005e-06, + "logits/chosen": -0.1258198320865631, + "logits/rejected": -0.23043933510780334, + "logps/chosen": -401.4937744140625, + "logps/rejected": -695.3685302734375, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.1192731857299805, + "rewards/margins": 9.460572242736816, + "rewards/rejected": -14.579845428466797, + "step": 397 + }, + { + "epoch": 0.24758942457231725, + "grad_norm": 1.11237633228302, + "learning_rate": 3.3444444444444445e-06, + "logits/chosen": -0.10767021775245667, + "logits/rejected": -0.1763923466205597, + "logps/chosen": -307.5670166015625, + "logps/rejected": -513.20556640625, + "loss": 0.0103, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.01777458190918, + "rewards/margins": 8.246371269226074, + "rewards/rejected": -13.264144897460938, + "step": 398 + }, + { + "epoch": 0.24821150855365473, + "grad_norm": 0.1365923136472702, + "learning_rate": 3.3388888888888893e-06, + "logits/chosen": -0.13080349564552307, + "logits/rejected": -0.24716614186763763, + "logps/chosen": -265.8899841308594, + "logps/rejected": -618.0941772460938, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.353989601135254, + "rewards/margins": 11.652874946594238, + "rewards/rejected": -16.006866455078125, + "step": 399 + }, + { + "epoch": 0.24883359253499224, + "grad_norm": 0.47425249218940735, + "learning_rate": 3.3333333333333333e-06, + "logits/chosen": -0.06483950465917587, + "logits/rejected": -0.21856454014778137, + "logps/chosen": -206.90472412109375, + "logps/rejected": -606.2335205078125, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1266093254089355, + "rewards/margins": 10.165671348571777, + "rewards/rejected": -13.292282104492188, + "step": 400 + }, + { + "epoch": 0.24945567651632972, + "grad_norm": 1.6052523851394653, + "learning_rate": 3.327777777777778e-06, + "logits/chosen": -0.19490833580493927, + "logits/rejected": -0.25579309463500977, + "logps/chosen": -266.3677062988281, + "logps/rejected": -514.99609375, + "loss": 0.0226, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.6406569480896, + "rewards/margins": 8.683393478393555, + "rewards/rejected": -13.324050903320312, + "step": 401 + }, + { + "epoch": 0.25007776049766717, + "grad_norm": 0.017141887918114662, + "learning_rate": 3.322222222222222e-06, + "logits/chosen": -0.08911450952291489, + "logits/rejected": -0.19953905045986176, + "logps/chosen": -320.8982849121094, + "logps/rejected": -536.388916015625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8826522827148438, + "rewards/margins": 11.73995590209961, + "rewards/rejected": -15.62260913848877, + "step": 402 + }, + { + "epoch": 0.2506998444790047, + "grad_norm": 5.217726707458496, + "learning_rate": 3.316666666666667e-06, + "logits/chosen": -0.07051900029182434, + "logits/rejected": -0.14339309930801392, + "logps/chosen": -222.8654022216797, + "logps/rejected": -364.3012390136719, + "loss": 0.2015, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.709677219390869, + "rewards/margins": 5.599246025085449, + "rewards/rejected": -9.308923721313477, + "step": 403 + }, + { + "epoch": 0.2513219284603421, + "grad_norm": 1.3904308080673218, + "learning_rate": 3.3111111111111118e-06, + "logits/chosen": -0.16124990582466125, + "logits/rejected": -0.1902160346508026, + "logps/chosen": -339.2606201171875, + "logps/rejected": -394.0766296386719, + "loss": 0.0186, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7638211250305176, + "rewards/margins": 5.592198848724365, + "rewards/rejected": -9.3560209274292, + "step": 404 + }, + { + "epoch": 0.25194401244167963, + "grad_norm": 0.5309785604476929, + "learning_rate": 3.3055555555555558e-06, + "logits/chosen": -0.25481978058815, + "logits/rejected": -0.29373425245285034, + "logps/chosen": -423.6588134765625, + "logps/rejected": -592.4200439453125, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.643208026885986, + "rewards/margins": 9.63177490234375, + "rewards/rejected": -14.274983406066895, + "step": 405 + }, + { + "epoch": 0.2525660964230171, + "grad_norm": 2.935518264770508, + "learning_rate": 3.3000000000000006e-06, + "logits/chosen": -0.04285181686282158, + "logits/rejected": -0.1949092298746109, + "logps/chosen": -474.88873291015625, + "logps/rejected": -733.3424682617188, + "loss": 0.0188, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.816509246826172, + "rewards/margins": 10.754720687866211, + "rewards/rejected": -15.571229934692383, + "step": 406 + }, + { + "epoch": 0.2531881804043546, + "grad_norm": 9.831649780273438, + "learning_rate": 3.2944444444444446e-06, + "logits/chosen": -0.13811376690864563, + "logits/rejected": -0.1592123657464981, + "logps/chosen": -565.2511596679688, + "logps/rejected": -522.478271484375, + "loss": 0.4324, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.7402327060699463, + "rewards/margins": 5.137905120849609, + "rewards/rejected": -8.878138542175293, + "step": 407 + }, + { + "epoch": 0.25381026438569204, + "grad_norm": 0.34724703431129456, + "learning_rate": 3.2888888888888894e-06, + "logits/chosen": -0.14397567510604858, + "logits/rejected": -0.22130745649337769, + "logps/chosen": -334.9671630859375, + "logps/rejected": -590.8428955078125, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.631126880645752, + "rewards/margins": 12.065752029418945, + "rewards/rejected": -17.69687843322754, + "step": 408 + }, + { + "epoch": 0.25443234836702955, + "grad_norm": 0.10994351655244827, + "learning_rate": 3.2833333333333334e-06, + "logits/chosen": -0.08612749725580215, + "logits/rejected": -0.14842836558818817, + "logps/chosen": -265.1435546875, + "logps/rejected": -587.9443359375, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4744009971618652, + "rewards/margins": 13.773189544677734, + "rewards/rejected": -17.247591018676758, + "step": 409 + }, + { + "epoch": 0.25505443234836706, + "grad_norm": 0.09335647523403168, + "learning_rate": 3.277777777777778e-06, + "logits/chosen": -0.14135196805000305, + "logits/rejected": -0.21991834044456482, + "logps/chosen": -262.1990966796875, + "logps/rejected": -499.29583740234375, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.22908091545105, + "rewards/margins": 10.918335914611816, + "rewards/rejected": -14.147416114807129, + "step": 410 + }, + { + "epoch": 0.2556765163297045, + "grad_norm": 2.4290878772735596, + "learning_rate": 3.2722222222222226e-06, + "logits/chosen": -0.17418035864830017, + "logits/rejected": -0.1484486311674118, + "logps/chosen": -441.0341491699219, + "logps/rejected": -595.4109497070312, + "loss": 0.0226, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.063255310058594, + "rewards/margins": 9.075023651123047, + "rewards/rejected": -14.138278007507324, + "step": 411 + }, + { + "epoch": 0.256298600311042, + "grad_norm": 6.965293884277344, + "learning_rate": 3.266666666666667e-06, + "logits/chosen": -0.27927184104919434, + "logits/rejected": -0.24891149997711182, + "logps/chosen": -486.62310791015625, + "logps/rejected": -384.1436462402344, + "loss": 0.0591, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8775219917297363, + "rewards/margins": 5.209172248840332, + "rewards/rejected": -8.086694717407227, + "step": 412 + }, + { + "epoch": 0.25692068429237946, + "grad_norm": 1.7399600744247437, + "learning_rate": 3.2611111111111114e-06, + "logits/chosen": -0.014087110757827759, + "logits/rejected": -0.10838279128074646, + "logps/chosen": -385.8360900878906, + "logps/rejected": -566.1333618164062, + "loss": 0.0199, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.670892715454102, + "rewards/margins": 10.994524955749512, + "rewards/rejected": -15.66541862487793, + "step": 413 + }, + { + "epoch": 0.25754276827371697, + "grad_norm": 1.1527899503707886, + "learning_rate": 3.255555555555556e-06, + "logits/chosen": -0.12933212518692017, + "logits/rejected": -0.19883868098258972, + "logps/chosen": -383.79327392578125, + "logps/rejected": -530.3013916015625, + "loss": 0.0116, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7022385597229004, + "rewards/margins": 10.624407768249512, + "rewards/rejected": -14.326645851135254, + "step": 414 + }, + { + "epoch": 0.2581648522550544, + "grad_norm": 21.185585021972656, + "learning_rate": 3.2500000000000002e-06, + "logits/chosen": -0.2171405702829361, + "logits/rejected": -0.1985168308019638, + "logps/chosen": -627.809814453125, + "logps/rejected": -642.7532348632812, + "loss": 0.4862, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.26376485824585, + "rewards/margins": 8.458075523376465, + "rewards/rejected": -13.721839904785156, + "step": 415 + }, + { + "epoch": 0.25878693623639193, + "grad_norm": 2.3768348693847656, + "learning_rate": 3.2444444444444446e-06, + "logits/chosen": -0.08986632525920868, + "logits/rejected": -0.05570496618747711, + "logps/chosen": -328.1534423828125, + "logps/rejected": -530.815185546875, + "loss": 0.0212, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7541749477386475, + "rewards/margins": 11.186346054077148, + "rewards/rejected": -14.940520286560059, + "step": 416 + }, + { + "epoch": 0.2594090202177294, + "grad_norm": 0.025697452947497368, + "learning_rate": 3.238888888888889e-06, + "logits/chosen": 0.01871795579791069, + "logits/rejected": -0.009135035797953606, + "logps/chosen": -222.60556030273438, + "logps/rejected": -610.1383056640625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6523218154907227, + "rewards/margins": 12.146879196166992, + "rewards/rejected": -15.799200057983398, + "step": 417 + }, + { + "epoch": 0.2600311041990669, + "grad_norm": 24.172733306884766, + "learning_rate": 3.2333333333333334e-06, + "logits/chosen": -0.19742831587791443, + "logits/rejected": -0.2124430388212204, + "logps/chosen": -487.728271484375, + "logps/rejected": -568.8438720703125, + "loss": 0.7825, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.253726959228516, + "rewards/margins": 5.15956974029541, + "rewards/rejected": -10.413296699523926, + "step": 418 + }, + { + "epoch": 0.26065318818040434, + "grad_norm": 0.10159821808338165, + "learning_rate": 3.227777777777778e-06, + "logits/chosen": -0.015331236645579338, + "logits/rejected": -0.14765499532222748, + "logps/chosen": -212.76318359375, + "logps/rejected": -465.6112365722656, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.197005271911621, + "rewards/margins": 10.656272888183594, + "rewards/rejected": -13.853277206420898, + "step": 419 + }, + { + "epoch": 0.26127527216174184, + "grad_norm": 0.3961324095726013, + "learning_rate": 3.2222222222222227e-06, + "logits/chosen": -0.04628559947013855, + "logits/rejected": -0.1412150263786316, + "logps/chosen": -231.1971435546875, + "logps/rejected": -437.6485900878906, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.658825397491455, + "rewards/margins": 10.538159370422363, + "rewards/rejected": -14.196985244750977, + "step": 420 + }, + { + "epoch": 0.2618973561430793, + "grad_norm": 3.546431303024292, + "learning_rate": 3.2166666666666666e-06, + "logits/chosen": -0.228714257478714, + "logits/rejected": -0.28735944628715515, + "logps/chosen": -397.3589782714844, + "logps/rejected": -629.9720458984375, + "loss": 0.0537, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6910910606384277, + "rewards/margins": 6.85723876953125, + "rewards/rejected": -10.548330307006836, + "step": 421 + }, + { + "epoch": 0.2625194401244168, + "grad_norm": 0.7852609157562256, + "learning_rate": 3.2111111111111115e-06, + "logits/chosen": -0.210056334733963, + "logits/rejected": -0.2595698833465576, + "logps/chosen": -360.6358947753906, + "logps/rejected": -509.41217041015625, + "loss": 0.0142, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.0073676109313965, + "rewards/margins": 8.6608304977417, + "rewards/rejected": -12.668197631835938, + "step": 422 + }, + { + "epoch": 0.26314152410575425, + "grad_norm": 1.547529935836792, + "learning_rate": 3.2055555555555555e-06, + "logits/chosen": -0.11100226640701294, + "logits/rejected": -0.1665286123752594, + "logps/chosen": -288.73297119140625, + "logps/rejected": -447.40216064453125, + "loss": 0.0237, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.208193302154541, + "rewards/margins": 7.889273643493652, + "rewards/rejected": -13.097467422485352, + "step": 423 + }, + { + "epoch": 0.26376360808709176, + "grad_norm": 0.006123764906078577, + "learning_rate": 3.2000000000000003e-06, + "logits/chosen": -0.1654462069272995, + "logits/rejected": -0.2371828854084015, + "logps/chosen": -254.17465209960938, + "logps/rejected": -522.56982421875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.858889102935791, + "rewards/margins": 12.125524520874023, + "rewards/rejected": -15.984413146972656, + "step": 424 + }, + { + "epoch": 0.2643856920684292, + "grad_norm": 0.18082460761070251, + "learning_rate": 3.1944444444444443e-06, + "logits/chosen": -0.127852663397789, + "logits/rejected": -0.18703630566596985, + "logps/chosen": -204.4736328125, + "logps/rejected": -429.6404113769531, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9342873096466064, + "rewards/margins": 8.970078468322754, + "rewards/rejected": -11.904366493225098, + "step": 425 + }, + { + "epoch": 0.2650077760497667, + "grad_norm": 17.208742141723633, + "learning_rate": 3.188888888888889e-06, + "logits/chosen": -0.22985780239105225, + "logits/rejected": -0.2967871427536011, + "logps/chosen": -254.20907592773438, + "logps/rejected": -390.1443176269531, + "loss": 0.2784, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0134148597717285, + "rewards/margins": 7.339380264282227, + "rewards/rejected": -9.352794647216797, + "step": 426 + }, + { + "epoch": 0.2656298600311042, + "grad_norm": 0.6221696734428406, + "learning_rate": 3.183333333333334e-06, + "logits/chosen": -0.13043132424354553, + "logits/rejected": -0.19379711151123047, + "logps/chosen": -306.456787109375, + "logps/rejected": -436.52685546875, + "loss": 0.0074, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.267523765563965, + "rewards/margins": 8.428032875061035, + "rewards/rejected": -12.695556640625, + "step": 427 + }, + { + "epoch": 0.2662519440124417, + "grad_norm": 3.3538691997528076, + "learning_rate": 3.177777777777778e-06, + "logits/chosen": -0.22048842906951904, + "logits/rejected": -0.2755993604660034, + "logps/chosen": -267.06658935546875, + "logps/rejected": -445.41717529296875, + "loss": 0.0498, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7199771404266357, + "rewards/margins": 7.382447242736816, + "rewards/rejected": -11.102424621582031, + "step": 428 + }, + { + "epoch": 0.2668740279937792, + "grad_norm": 7.190969467163086, + "learning_rate": 3.1722222222222227e-06, + "logits/chosen": -0.10630179941654205, + "logits/rejected": -0.1633741408586502, + "logps/chosen": -391.66571044921875, + "logps/rejected": -532.0159912109375, + "loss": 0.1454, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.005875587463379, + "rewards/margins": 8.383438110351562, + "rewards/rejected": -13.389312744140625, + "step": 429 + }, + { + "epoch": 0.26749611197511663, + "grad_norm": 0.4742722809314728, + "learning_rate": 3.1666666666666667e-06, + "logits/chosen": -0.1636980175971985, + "logits/rejected": -0.2200210839509964, + "logps/chosen": -415.80279541015625, + "logps/rejected": -611.024658203125, + "loss": 0.009, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1435604095458984, + "rewards/margins": 9.075042724609375, + "rewards/rejected": -12.218603134155273, + "step": 430 + }, + { + "epoch": 0.26811819595645414, + "grad_norm": 5.320164203643799, + "learning_rate": 3.1611111111111115e-06, + "logits/chosen": -0.18936291337013245, + "logits/rejected": -0.26258033514022827, + "logps/chosen": -234.565185546875, + "logps/rejected": -403.62957763671875, + "loss": 0.156, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.519001007080078, + "rewards/margins": 6.851634979248047, + "rewards/rejected": -10.370635986328125, + "step": 431 + }, + { + "epoch": 0.2687402799377916, + "grad_norm": 0.2873154878616333, + "learning_rate": 3.1555555555555555e-06, + "logits/chosen": -0.03613107651472092, + "logits/rejected": -0.22106721997261047, + "logps/chosen": -197.57789611816406, + "logps/rejected": -558.1168212890625, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4183218479156494, + "rewards/margins": 10.949080467224121, + "rewards/rejected": -14.367403030395508, + "step": 432 + }, + { + "epoch": 0.2693623639191291, + "grad_norm": 0.08774023503065109, + "learning_rate": 3.1500000000000003e-06, + "logits/chosen": -0.18863654136657715, + "logits/rejected": -0.22940775752067566, + "logps/chosen": -169.11776733398438, + "logps/rejected": -563.265869140625, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.655273914337158, + "rewards/margins": 9.210572242736816, + "rewards/rejected": -11.865846633911133, + "step": 433 + }, + { + "epoch": 0.26998444790046655, + "grad_norm": 6.171657085418701, + "learning_rate": 3.144444444444445e-06, + "logits/chosen": -0.10809853672981262, + "logits/rejected": -0.1689632534980774, + "logps/chosen": -316.3516845703125, + "logps/rejected": -477.5682678222656, + "loss": 0.0848, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.7478742599487305, + "rewards/margins": 8.410304069519043, + "rewards/rejected": -13.15817928314209, + "step": 434 + }, + { + "epoch": 0.27060653188180406, + "grad_norm": 0.014070524834096432, + "learning_rate": 3.138888888888889e-06, + "logits/chosen": -0.06338763236999512, + "logits/rejected": -0.15138934552669525, + "logps/chosen": -235.95709228515625, + "logps/rejected": -472.2982482910156, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.907130718231201, + "rewards/margins": 10.996705055236816, + "rewards/rejected": -13.903836250305176, + "step": 435 + }, + { + "epoch": 0.2712286158631415, + "grad_norm": 4.218111515045166, + "learning_rate": 3.133333333333334e-06, + "logits/chosen": -0.19820624589920044, + "logits/rejected": -0.24786141514778137, + "logps/chosen": -585.7201538085938, + "logps/rejected": -706.052734375, + "loss": 0.0579, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.644243240356445, + "rewards/margins": 7.719517230987549, + "rewards/rejected": -12.363759994506836, + "step": 436 + }, + { + "epoch": 0.271850699844479, + "grad_norm": 0.6446009278297424, + "learning_rate": 3.127777777777778e-06, + "logits/chosen": -0.2661688029766083, + "logits/rejected": -0.36284852027893066, + "logps/chosen": -411.08172607421875, + "logps/rejected": -476.1949157714844, + "loss": 0.014, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8511571884155273, + "rewards/margins": 10.356355667114258, + "rewards/rejected": -13.207513809204102, + "step": 437 + }, + { + "epoch": 0.27247278382581647, + "grad_norm": 0.20934517681598663, + "learning_rate": 3.1222222222222228e-06, + "logits/chosen": -0.09609181433916092, + "logits/rejected": -0.169026717543602, + "logps/chosen": -445.92919921875, + "logps/rejected": -591.621337890625, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.5081787109375, + "rewards/margins": 9.722309112548828, + "rewards/rejected": -15.230488777160645, + "step": 438 + }, + { + "epoch": 0.273094867807154, + "grad_norm": 0.2600097060203552, + "learning_rate": 3.1166666666666668e-06, + "logits/chosen": -0.11132031679153442, + "logits/rejected": -0.1963416486978531, + "logps/chosen": -279.1035461425781, + "logps/rejected": -449.8685302734375, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.76704740524292, + "rewards/margins": 8.63277816772461, + "rewards/rejected": -11.399826049804688, + "step": 439 + }, + { + "epoch": 0.2737169517884914, + "grad_norm": 0.32751429080963135, + "learning_rate": 3.1111111111111116e-06, + "logits/chosen": -0.17483103275299072, + "logits/rejected": -0.23492002487182617, + "logps/chosen": -320.6435546875, + "logps/rejected": -547.513671875, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7100415229797363, + "rewards/margins": 10.205655097961426, + "rewards/rejected": -12.915696144104004, + "step": 440 + }, + { + "epoch": 0.27433903576982893, + "grad_norm": 0.078452467918396, + "learning_rate": 3.1055555555555556e-06, + "logits/chosen": -0.14991967380046844, + "logits/rejected": -0.17456801235675812, + "logps/chosen": -317.349853515625, + "logps/rejected": -490.4644775390625, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8672916889190674, + "rewards/margins": 9.528175354003906, + "rewards/rejected": -13.395465850830078, + "step": 441 + }, + { + "epoch": 0.2749611197511664, + "grad_norm": 1.842769742012024, + "learning_rate": 3.1000000000000004e-06, + "logits/chosen": -0.07130047678947449, + "logits/rejected": -0.2084885686635971, + "logps/chosen": -137.96734619140625, + "logps/rejected": -411.6278076171875, + "loss": 0.0217, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.426034927368164, + "rewards/margins": 8.351211547851562, + "rewards/rejected": -10.777246475219727, + "step": 442 + }, + { + "epoch": 0.2755832037325039, + "grad_norm": 0.2418750524520874, + "learning_rate": 3.094444444444445e-06, + "logits/chosen": -0.17951354384422302, + "logits/rejected": -0.24305729568004608, + "logps/chosen": -303.5381774902344, + "logps/rejected": -539.304443359375, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3903617858886719, + "rewards/margins": 9.636672973632812, + "rewards/rejected": -11.027034759521484, + "step": 443 + }, + { + "epoch": 0.27620528771384134, + "grad_norm": 10.529664993286133, + "learning_rate": 3.088888888888889e-06, + "logits/chosen": -0.19838128983974457, + "logits/rejected": -0.23999622464179993, + "logps/chosen": -360.39215087890625, + "logps/rejected": -424.9872131347656, + "loss": 0.1023, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.484769344329834, + "rewards/margins": 6.367135524749756, + "rewards/rejected": -10.851905822753906, + "step": 444 + }, + { + "epoch": 0.27682737169517885, + "grad_norm": 3.9258522987365723, + "learning_rate": 3.0833333333333336e-06, + "logits/chosen": -0.15146413445472717, + "logits/rejected": -0.2016676962375641, + "logps/chosen": -171.07119750976562, + "logps/rejected": -374.54339599609375, + "loss": 0.1134, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.205024003982544, + "rewards/margins": 8.932584762573242, + "rewards/rejected": -11.137609481811523, + "step": 445 + }, + { + "epoch": 0.27744945567651635, + "grad_norm": 0.5979270339012146, + "learning_rate": 3.077777777777778e-06, + "logits/chosen": -0.08534788340330124, + "logits/rejected": -0.1629612296819687, + "logps/chosen": -239.32870483398438, + "logps/rejected": -682.2098388671875, + "loss": 0.007, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5374128818511963, + "rewards/margins": 10.683612823486328, + "rewards/rejected": -13.221025466918945, + "step": 446 + }, + { + "epoch": 0.2780715396578538, + "grad_norm": 0.2475152611732483, + "learning_rate": 3.0722222222222224e-06, + "logits/chosen": -0.20038631558418274, + "logits/rejected": -0.26836103200912476, + "logps/chosen": -349.15557861328125, + "logps/rejected": -455.46533203125, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8546204566955566, + "rewards/margins": 7.790243625640869, + "rewards/rejected": -10.644864082336426, + "step": 447 + }, + { + "epoch": 0.2786936236391913, + "grad_norm": 14.47797966003418, + "learning_rate": 3.066666666666667e-06, + "logits/chosen": -0.12366615235805511, + "logits/rejected": -0.15462961792945862, + "logps/chosen": -383.641357421875, + "logps/rejected": -534.090576171875, + "loss": 0.1796, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.442785263061523, + "rewards/margins": 9.704907417297363, + "rewards/rejected": -14.147692680358887, + "step": 448 + }, + { + "epoch": 0.27931570762052876, + "grad_norm": 4.079801559448242, + "learning_rate": 3.0611111111111112e-06, + "logits/chosen": -0.03475372865796089, + "logits/rejected": -0.14843584597110748, + "logps/chosen": -370.3608703613281, + "logps/rejected": -571.1976318359375, + "loss": 0.0399, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3332693576812744, + "rewards/margins": 9.012179374694824, + "rewards/rejected": -12.345449447631836, + "step": 449 + }, + { + "epoch": 0.27993779160186627, + "grad_norm": 0.033708278089761734, + "learning_rate": 3.055555555555556e-06, + "logits/chosen": -0.13901346921920776, + "logits/rejected": -0.17485594749450684, + "logps/chosen": -241.4612579345703, + "logps/rejected": -457.0430908203125, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6086089611053467, + "rewards/margins": 9.409764289855957, + "rewards/rejected": -12.018373489379883, + "step": 450 + }, + { + "epoch": 0.2805598755832037, + "grad_norm": 0.034546978771686554, + "learning_rate": 3.05e-06, + "logits/chosen": -0.10936588793992996, + "logits/rejected": -0.18450546264648438, + "logps/chosen": -280.313720703125, + "logps/rejected": -508.0997619628906, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8889126777648926, + "rewards/margins": 9.823020935058594, + "rewards/rejected": -11.711933135986328, + "step": 451 + }, + { + "epoch": 0.28118195956454123, + "grad_norm": 5.002330780029297, + "learning_rate": 3.044444444444445e-06, + "logits/chosen": -0.11415782570838928, + "logits/rejected": -0.16042780876159668, + "logps/chosen": -409.26171875, + "logps/rejected": -641.8538818359375, + "loss": 0.0548, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8045883178710938, + "rewards/margins": 11.239530563354492, + "rewards/rejected": -15.044118881225586, + "step": 452 + }, + { + "epoch": 0.2818040435458787, + "grad_norm": 0.6678714156150818, + "learning_rate": 3.038888888888889e-06, + "logits/chosen": -0.14888234436511993, + "logits/rejected": -0.2246774435043335, + "logps/chosen": -306.7550048828125, + "logps/rejected": -569.8973388671875, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.064187526702881, + "rewards/margins": 9.202631950378418, + "rewards/rejected": -13.266819953918457, + "step": 453 + }, + { + "epoch": 0.2824261275272162, + "grad_norm": 1.2459475994110107, + "learning_rate": 3.0333333333333337e-06, + "logits/chosen": -0.08470825850963593, + "logits/rejected": -0.22862324118614197, + "logps/chosen": -230.17926025390625, + "logps/rejected": -505.46734619140625, + "loss": 0.0142, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.315659761428833, + "rewards/margins": 10.64539623260498, + "rewards/rejected": -13.961055755615234, + "step": 454 + }, + { + "epoch": 0.28304821150855364, + "grad_norm": 0.014705093577504158, + "learning_rate": 3.0277777777777776e-06, + "logits/chosen": -0.13397379219532013, + "logits/rejected": -0.23941989243030548, + "logps/chosen": -274.475830078125, + "logps/rejected": -551.6904296875, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8396246433258057, + "rewards/margins": 10.74125862121582, + "rewards/rejected": -14.580883026123047, + "step": 455 + }, + { + "epoch": 0.28367029548989114, + "grad_norm": 2.9209794998168945, + "learning_rate": 3.0222222222222225e-06, + "logits/chosen": -0.22297683358192444, + "logits/rejected": -0.2814556956291199, + "logps/chosen": -286.53448486328125, + "logps/rejected": -481.3670654296875, + "loss": 0.061, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.000555515289307, + "rewards/margins": 6.754306316375732, + "rewards/rejected": -10.754861831665039, + "step": 456 + }, + { + "epoch": 0.2842923794712286, + "grad_norm": 1.2532163858413696, + "learning_rate": 3.0166666666666673e-06, + "logits/chosen": -0.23746058344841003, + "logits/rejected": -0.3169229030609131, + "logps/chosen": -303.4602355957031, + "logps/rejected": -482.18865966796875, + "loss": 0.0166, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2682998180389404, + "rewards/margins": 6.572655200958252, + "rewards/rejected": -8.840954780578613, + "step": 457 + }, + { + "epoch": 0.2849144634525661, + "grad_norm": 0.1634194403886795, + "learning_rate": 3.0111111111111113e-06, + "logits/chosen": -0.2515576481819153, + "logits/rejected": -0.22868913412094116, + "logps/chosen": -435.6349182128906, + "logps/rejected": -479.6177062988281, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7771477699279785, + "rewards/margins": 9.235347747802734, + "rewards/rejected": -13.012495040893555, + "step": 458 + }, + { + "epoch": 0.28553654743390355, + "grad_norm": 6.2760162353515625, + "learning_rate": 3.005555555555556e-06, + "logits/chosen": -0.08520615100860596, + "logits/rejected": -0.20514705777168274, + "logps/chosen": -182.19345092773438, + "logps/rejected": -495.69012451171875, + "loss": 0.1191, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.1156532764434814, + "rewards/margins": 10.959134101867676, + "rewards/rejected": -14.074787139892578, + "step": 459 + }, + { + "epoch": 0.28615863141524106, + "grad_norm": 4.827237606048584, + "learning_rate": 3e-06, + "logits/chosen": -0.044309698045253754, + "logits/rejected": -0.11431644856929779, + "logps/chosen": -422.2201843261719, + "logps/rejected": -581.1383056640625, + "loss": 0.0486, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.99733829498291, + "rewards/margins": 10.676424026489258, + "rewards/rejected": -15.673762321472168, + "step": 460 + }, + { + "epoch": 0.2867807153965785, + "grad_norm": 0.5865340828895569, + "learning_rate": 2.994444444444445e-06, + "logits/chosen": -0.18500766158103943, + "logits/rejected": -0.28454697132110596, + "logps/chosen": -404.22625732421875, + "logps/rejected": -605.3551635742188, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.713637351989746, + "rewards/margins": 10.71653938293457, + "rewards/rejected": -13.430177688598633, + "step": 461 + }, + { + "epoch": 0.287402799377916, + "grad_norm": 5.743447780609131, + "learning_rate": 2.988888888888889e-06, + "logits/chosen": -0.22422988712787628, + "logits/rejected": -0.2912850081920624, + "logps/chosen": -289.9635314941406, + "logps/rejected": -447.4309387207031, + "loss": 0.095, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.575939655303955, + "rewards/margins": 6.262747287750244, + "rewards/rejected": -9.8386869430542, + "step": 462 + }, + { + "epoch": 0.2880248833592535, + "grad_norm": 13.296899795532227, + "learning_rate": 2.9833333333333337e-06, + "logits/chosen": -0.1339966505765915, + "logits/rejected": -0.16284282505512238, + "logps/chosen": -368.0105285644531, + "logps/rejected": -503.84869384765625, + "loss": 0.2225, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.536924362182617, + "rewards/margins": 8.039335250854492, + "rewards/rejected": -13.57625961303711, + "step": 463 + }, + { + "epoch": 0.288646967340591, + "grad_norm": 0.10987541824579239, + "learning_rate": 2.9777777777777777e-06, + "logits/chosen": -0.09278355538845062, + "logits/rejected": -0.19680409133434296, + "logps/chosen": -352.1773681640625, + "logps/rejected": -598.5928955078125, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8187882900238037, + "rewards/margins": 11.902942657470703, + "rewards/rejected": -15.721731185913086, + "step": 464 + }, + { + "epoch": 0.2892690513219285, + "grad_norm": 5.878370761871338, + "learning_rate": 2.9722222222222225e-06, + "logits/chosen": -0.2925691604614258, + "logits/rejected": -0.29276496171951294, + "logps/chosen": -312.41961669921875, + "logps/rejected": -432.54541015625, + "loss": 0.1419, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.6918745040893555, + "rewards/margins": 6.311898231506348, + "rewards/rejected": -9.003772735595703, + "step": 465 + }, + { + "epoch": 0.28989113530326593, + "grad_norm": 4.4972429275512695, + "learning_rate": 2.9666666666666673e-06, + "logits/chosen": -0.09185846149921417, + "logits/rejected": -0.20331373810768127, + "logps/chosen": -313.5936584472656, + "logps/rejected": -557.7618408203125, + "loss": 0.0416, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6469810009002686, + "rewards/margins": 10.231560707092285, + "rewards/rejected": -12.878541946411133, + "step": 466 + }, + { + "epoch": 0.29051321928460344, + "grad_norm": 1.2410050630569458, + "learning_rate": 2.9611111111111113e-06, + "logits/chosen": -0.1738632321357727, + "logits/rejected": -0.21794861555099487, + "logps/chosen": -518.0777587890625, + "logps/rejected": -770.6430053710938, + "loss": 0.0135, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.349998950958252, + "rewards/margins": 6.564795017242432, + "rewards/rejected": -10.914793968200684, + "step": 467 + }, + { + "epoch": 0.2911353032659409, + "grad_norm": 0.6484741568565369, + "learning_rate": 2.955555555555556e-06, + "logits/chosen": -0.1391589194536209, + "logits/rejected": -0.2381209135055542, + "logps/chosen": -321.8725280761719, + "logps/rejected": -597.044189453125, + "loss": 0.0086, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.949552536010742, + "rewards/margins": 11.608738899230957, + "rewards/rejected": -15.558290481567383, + "step": 468 + }, + { + "epoch": 0.2917573872472784, + "grad_norm": 0.09717661142349243, + "learning_rate": 2.95e-06, + "logits/chosen": -0.07776668667793274, + "logits/rejected": -0.11933182179927826, + "logps/chosen": -410.92913818359375, + "logps/rejected": -535.0477905273438, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.096076965332031, + "rewards/margins": 10.75398063659668, + "rewards/rejected": -14.850057601928711, + "step": 469 + }, + { + "epoch": 0.29237947122861585, + "grad_norm": 0.03885764256119728, + "learning_rate": 2.944444444444445e-06, + "logits/chosen": -0.09707760065793991, + "logits/rejected": -0.2541026175022125, + "logps/chosen": -188.27169799804688, + "logps/rejected": -674.1669311523438, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9830923080444336, + "rewards/margins": 13.251914978027344, + "rewards/rejected": -15.235006332397461, + "step": 470 + }, + { + "epoch": 0.29300155520995336, + "grad_norm": 0.11630082875490189, + "learning_rate": 2.938888888888889e-06, + "logits/chosen": -0.13602793216705322, + "logits/rejected": -0.25358375906944275, + "logps/chosen": -245.46759033203125, + "logps/rejected": -519.74853515625, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7204376459121704, + "rewards/margins": 10.355583190917969, + "rewards/rejected": -12.076020240783691, + "step": 471 + }, + { + "epoch": 0.2936236391912908, + "grad_norm": 0.5335174202919006, + "learning_rate": 2.9333333333333338e-06, + "logits/chosen": -0.17102761566638947, + "logits/rejected": -0.22442284226417542, + "logps/chosen": -374.93353271484375, + "logps/rejected": -521.7992553710938, + "loss": 0.0074, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.26758337020874, + "rewards/margins": 7.2872819900512695, + "rewards/rejected": -12.554864883422852, + "step": 472 + }, + { + "epoch": 0.2942457231726283, + "grad_norm": 0.07583454251289368, + "learning_rate": 2.927777777777778e-06, + "logits/chosen": -0.09321539849042892, + "logits/rejected": -0.20520435273647308, + "logps/chosen": -364.1098937988281, + "logps/rejected": -564.47314453125, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.555822372436523, + "rewards/margins": 10.155363082885742, + "rewards/rejected": -14.711185455322266, + "step": 473 + }, + { + "epoch": 0.29486780715396577, + "grad_norm": 1.4235531091690063, + "learning_rate": 2.9222222222222226e-06, + "logits/chosen": -0.09457525610923767, + "logits/rejected": -0.18643876910209656, + "logps/chosen": -472.65203857421875, + "logps/rejected": -613.3377685546875, + "loss": 0.0111, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.239579200744629, + "rewards/margins": 10.030845642089844, + "rewards/rejected": -14.270425796508789, + "step": 474 + }, + { + "epoch": 0.2954898911353033, + "grad_norm": 0.02795744128525257, + "learning_rate": 2.916666666666667e-06, + "logits/chosen": -0.10080792009830475, + "logits/rejected": -0.1970379501581192, + "logps/chosen": -271.2310485839844, + "logps/rejected": -479.6068115234375, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.894009590148926, + "rewards/margins": 12.846738815307617, + "rewards/rejected": -15.740747451782227, + "step": 475 + }, + { + "epoch": 0.2961119751166407, + "grad_norm": 0.38417741656303406, + "learning_rate": 2.9111111111111114e-06, + "logits/chosen": -0.17229916155338287, + "logits/rejected": -0.22360265254974365, + "logps/chosen": -259.00445556640625, + "logps/rejected": -427.48248291015625, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.718967437744141, + "rewards/margins": 8.314814567565918, + "rewards/rejected": -13.033782005310059, + "step": 476 + }, + { + "epoch": 0.29673405909797823, + "grad_norm": 0.08735869079828262, + "learning_rate": 2.9055555555555558e-06, + "logits/chosen": -0.09720394015312195, + "logits/rejected": -0.13246062397956848, + "logps/chosen": -517.457275390625, + "logps/rejected": -731.1558227539062, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.534847259521484, + "rewards/margins": 11.28515625, + "rewards/rejected": -16.820003509521484, + "step": 477 + }, + { + "epoch": 0.2973561430793157, + "grad_norm": 0.050694700330495834, + "learning_rate": 2.9e-06, + "logits/chosen": -0.2541845142841339, + "logits/rejected": -0.2999765872955322, + "logps/chosen": -349.387451171875, + "logps/rejected": -541.8479614257812, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.783271551132202, + "rewards/margins": 9.973506927490234, + "rewards/rejected": -12.756778717041016, + "step": 478 + }, + { + "epoch": 0.2979782270606532, + "grad_norm": 1.2826037406921387, + "learning_rate": 2.8944444444444446e-06, + "logits/chosen": -0.26695525646209717, + "logits/rejected": -0.3179347813129425, + "logps/chosen": -310.7637939453125, + "logps/rejected": -584.9155883789062, + "loss": 0.0129, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.838679075241089, + "rewards/margins": 12.225186347961426, + "rewards/rejected": -16.063865661621094, + "step": 479 + }, + { + "epoch": 0.2986003110419907, + "grad_norm": 0.29759958386421204, + "learning_rate": 2.888888888888889e-06, + "logits/chosen": -0.01991649717092514, + "logits/rejected": -0.16739404201507568, + "logps/chosen": -276.87310791015625, + "logps/rejected": -524.6915283203125, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.121692657470703, + "rewards/margins": 11.055182456970215, + "rewards/rejected": -15.176874160766602, + "step": 480 + }, + { + "epoch": 0.29922239502332815, + "grad_norm": 13.639754295349121, + "learning_rate": 2.8833333333333334e-06, + "logits/chosen": -0.10233119875192642, + "logits/rejected": -0.2591826915740967, + "logps/chosen": -364.94451904296875, + "logps/rejected": -764.7388916015625, + "loss": 0.2123, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.673079967498779, + "rewards/margins": 14.614519119262695, + "rewards/rejected": -19.287599563598633, + "step": 481 + }, + { + "epoch": 0.29984447900466565, + "grad_norm": 0.1675274521112442, + "learning_rate": 2.8777777777777782e-06, + "logits/chosen": -0.20366929471492767, + "logits/rejected": -0.261199027299881, + "logps/chosen": -264.08245849609375, + "logps/rejected": -494.366943359375, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3281564712524414, + "rewards/margins": 9.212234497070312, + "rewards/rejected": -12.540390968322754, + "step": 482 + }, + { + "epoch": 0.3004665629860031, + "grad_norm": 0.5269962549209595, + "learning_rate": 2.872222222222222e-06, + "logits/chosen": -0.21765227615833282, + "logits/rejected": -0.2887171804904938, + "logps/chosen": -337.76519775390625, + "logps/rejected": -475.2828369140625, + "loss": 0.008, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4669342041015625, + "rewards/margins": 9.302711486816406, + "rewards/rejected": -12.769645690917969, + "step": 483 + }, + { + "epoch": 0.3010886469673406, + "grad_norm": 15.068477630615234, + "learning_rate": 2.866666666666667e-06, + "logits/chosen": -0.028242234140634537, + "logits/rejected": -0.13543789088726044, + "logps/chosen": -312.5484924316406, + "logps/rejected": -672.9266357421875, + "loss": 0.1302, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.433717727661133, + "rewards/margins": 10.821843147277832, + "rewards/rejected": -15.255559921264648, + "step": 484 + }, + { + "epoch": 0.30171073094867806, + "grad_norm": 10.417815208435059, + "learning_rate": 2.861111111111111e-06, + "logits/chosen": -0.1307336688041687, + "logits/rejected": -0.1832491010427475, + "logps/chosen": -404.92474365234375, + "logps/rejected": -510.657958984375, + "loss": 0.137, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.66949462890625, + "rewards/margins": 8.605761528015137, + "rewards/rejected": -14.275257110595703, + "step": 485 + }, + { + "epoch": 0.30233281493001557, + "grad_norm": 0.006506691686809063, + "learning_rate": 2.855555555555556e-06, + "logits/chosen": -0.0911194235086441, + "logits/rejected": -0.1422138214111328, + "logps/chosen": -444.1198425292969, + "logps/rejected": -701.445068359375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3745996952056885, + "rewards/margins": 13.032821655273438, + "rewards/rejected": -16.407421112060547, + "step": 486 + }, + { + "epoch": 0.302954898911353, + "grad_norm": 1.3962080478668213, + "learning_rate": 2.85e-06, + "logits/chosen": -0.14407968521118164, + "logits/rejected": -0.21020767092704773, + "logps/chosen": -238.18380737304688, + "logps/rejected": -404.4751892089844, + "loss": 0.0133, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.577207326889038, + "rewards/margins": 7.741853713989258, + "rewards/rejected": -11.319061279296875, + "step": 487 + }, + { + "epoch": 0.30357698289269053, + "grad_norm": 0.35604166984558105, + "learning_rate": 2.8444444444444446e-06, + "logits/chosen": -0.15663278102874756, + "logits/rejected": -0.19888907670974731, + "logps/chosen": -354.1199951171875, + "logps/rejected": -461.6494140625, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.133651256561279, + "rewards/margins": 10.217489242553711, + "rewards/rejected": -14.351139068603516, + "step": 488 + }, + { + "epoch": 0.304199066874028, + "grad_norm": 16.069007873535156, + "learning_rate": 2.8388888888888895e-06, + "logits/chosen": 0.0223111342638731, + "logits/rejected": -0.11388853192329407, + "logps/chosen": -308.058349609375, + "logps/rejected": -660.7628173828125, + "loss": 0.2369, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.774312496185303, + "rewards/margins": 10.54554271697998, + "rewards/rejected": -16.319854736328125, + "step": 489 + }, + { + "epoch": 0.3048211508553655, + "grad_norm": 2.0206010341644287, + "learning_rate": 2.8333333333333335e-06, + "logits/chosen": -0.06873568147420883, + "logits/rejected": -0.15407656133174896, + "logps/chosen": -188.10215759277344, + "logps/rejected": -432.97479248046875, + "loss": 0.0285, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.152907371520996, + "rewards/margins": 10.44174575805664, + "rewards/rejected": -14.594653129577637, + "step": 490 + }, + { + "epoch": 0.30544323483670294, + "grad_norm": 0.003014638787135482, + "learning_rate": 2.8277777777777783e-06, + "logits/chosen": -0.05213429778814316, + "logits/rejected": -0.21725119650363922, + "logps/chosen": -172.82505798339844, + "logps/rejected": -736.2109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.198672294616699, + "rewards/margins": 15.452678680419922, + "rewards/rejected": -18.651348114013672, + "step": 491 + }, + { + "epoch": 0.30606531881804044, + "grad_norm": 0.04554625600576401, + "learning_rate": 2.8222222222222223e-06, + "logits/chosen": -0.08008131384849548, + "logits/rejected": -0.2202964723110199, + "logps/chosen": -283.8503723144531, + "logps/rejected": -588.1981201171875, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.27953839302063, + "rewards/margins": 11.581045150756836, + "rewards/rejected": -14.86058235168457, + "step": 492 + }, + { + "epoch": 0.3066874027993779, + "grad_norm": 3.664142370223999, + "learning_rate": 2.816666666666667e-06, + "logits/chosen": -0.09558602422475815, + "logits/rejected": -0.16038452088832855, + "logps/chosen": -403.0120849609375, + "logps/rejected": -495.21942138671875, + "loss": 0.1089, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.658576726913452, + "rewards/margins": 12.382275581359863, + "rewards/rejected": -16.04085350036621, + "step": 493 + }, + { + "epoch": 0.3073094867807154, + "grad_norm": 0.1836954951286316, + "learning_rate": 2.811111111111111e-06, + "logits/chosen": -0.030152076855301857, + "logits/rejected": -0.2116466611623764, + "logps/chosen": -204.84991455078125, + "logps/rejected": -554.8101806640625, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.073650360107422, + "rewards/margins": 14.182883262634277, + "rewards/rejected": -17.256534576416016, + "step": 494 + }, + { + "epoch": 0.30793157076205285, + "grad_norm": 1.0609017610549927, + "learning_rate": 2.805555555555556e-06, + "logits/chosen": -0.1486879289150238, + "logits/rejected": -0.2631802260875702, + "logps/chosen": -400.80841064453125, + "logps/rejected": -684.8233642578125, + "loss": 0.0081, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.026468276977539, + "rewards/margins": 10.850324630737305, + "rewards/rejected": -15.876792907714844, + "step": 495 + }, + { + "epoch": 0.30855365474339036, + "grad_norm": 0.26330575346946716, + "learning_rate": 2.8000000000000003e-06, + "logits/chosen": -0.06031130254268646, + "logits/rejected": -0.15971212089061737, + "logps/chosen": -352.3968200683594, + "logps/rejected": -582.5736083984375, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5463855266571045, + "rewards/margins": 11.460671424865723, + "rewards/rejected": -15.00705623626709, + "step": 496 + }, + { + "epoch": 0.3091757387247278, + "grad_norm": 0.08592965453863144, + "learning_rate": 2.7944444444444447e-06, + "logits/chosen": -0.04548201709985733, + "logits/rejected": -0.16116252541542053, + "logps/chosen": -246.60472106933594, + "logps/rejected": -542.3217163085938, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8501086235046387, + "rewards/margins": 10.674062728881836, + "rewards/rejected": -14.524171829223633, + "step": 497 + }, + { + "epoch": 0.3097978227060653, + "grad_norm": 2.464677095413208, + "learning_rate": 2.788888888888889e-06, + "logits/chosen": -0.07756389677524567, + "logits/rejected": -0.10145483911037445, + "logps/chosen": -318.0390625, + "logps/rejected": -426.4266052246094, + "loss": 0.0316, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.002143859863281, + "rewards/margins": 8.591425895690918, + "rewards/rejected": -12.593570709228516, + "step": 498 + }, + { + "epoch": 0.3104199066874028, + "grad_norm": 1.3291840553283691, + "learning_rate": 2.7833333333333335e-06, + "logits/chosen": -0.17680403590202332, + "logits/rejected": -0.20953497290611267, + "logps/chosen": -333.91656494140625, + "logps/rejected": -452.54595947265625, + "loss": 0.0295, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.275571823120117, + "rewards/margins": 7.522537708282471, + "rewards/rejected": -12.79810905456543, + "step": 499 + }, + { + "epoch": 0.3110419906687403, + "grad_norm": 18.36431884765625, + "learning_rate": 2.7777777777777783e-06, + "logits/chosen": -0.19506023824214935, + "logits/rejected": -0.15425747632980347, + "logps/chosen": -634.697998046875, + "logps/rejected": -854.18701171875, + "loss": 0.4958, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.270371437072754, + "rewards/margins": 10.62537956237793, + "rewards/rejected": -14.895750999450684, + "step": 500 + }, + { + "epoch": 0.3116640746500778, + "grad_norm": 0.267206072807312, + "learning_rate": 2.7722222222222223e-06, + "logits/chosen": -0.2457205206155777, + "logits/rejected": -0.278809130191803, + "logps/chosen": -192.12503051757812, + "logps/rejected": -376.422607421875, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.569554328918457, + "rewards/margins": 9.916511535644531, + "rewards/rejected": -13.486065864562988, + "step": 501 + }, + { + "epoch": 0.31228615863141523, + "grad_norm": 0.5219511389732361, + "learning_rate": 2.766666666666667e-06, + "logits/chosen": -0.13277828693389893, + "logits/rejected": -0.14702974259853363, + "logps/chosen": -388.3888854980469, + "logps/rejected": -546.5831298828125, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.013309955596924, + "rewards/margins": 9.402070045471191, + "rewards/rejected": -13.415380477905273, + "step": 502 + }, + { + "epoch": 0.31290824261275274, + "grad_norm": 2.6235926151275635, + "learning_rate": 2.761111111111111e-06, + "logits/chosen": -0.14988797903060913, + "logits/rejected": -0.2914610803127289, + "logps/chosen": -267.27130126953125, + "logps/rejected": -552.9884033203125, + "loss": 0.0438, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.714329719543457, + "rewards/margins": 11.735250473022461, + "rewards/rejected": -15.449579238891602, + "step": 503 + }, + { + "epoch": 0.3135303265940902, + "grad_norm": 0.29420575499534607, + "learning_rate": 2.755555555555556e-06, + "logits/chosen": -0.2908463776111603, + "logits/rejected": -0.3380431830883026, + "logps/chosen": -374.83551025390625, + "logps/rejected": -577.1712036132812, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0616538524627686, + "rewards/margins": 10.319271087646484, + "rewards/rejected": -12.380925178527832, + "step": 504 + }, + { + "epoch": 0.3141524105754277, + "grad_norm": 3.0164594650268555, + "learning_rate": 2.7500000000000004e-06, + "logits/chosen": -0.06285682320594788, + "logits/rejected": -0.19647175073623657, + "logps/chosen": -263.1412048339844, + "logps/rejected": -575.4772338867188, + "loss": 0.0312, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.639450550079346, + "rewards/margins": 11.54019832611084, + "rewards/rejected": -16.179649353027344, + "step": 505 + }, + { + "epoch": 0.31477449455676515, + "grad_norm": 0.13978669047355652, + "learning_rate": 2.7444444444444448e-06, + "logits/chosen": -0.07612142711877823, + "logits/rejected": -0.2217516005039215, + "logps/chosen": -334.7262268066406, + "logps/rejected": -681.8115234375, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.3828444480896, + "rewards/margins": 11.813119888305664, + "rewards/rejected": -16.195964813232422, + "step": 506 + }, + { + "epoch": 0.31539657853810266, + "grad_norm": 0.23106509447097778, + "learning_rate": 2.738888888888889e-06, + "logits/chosen": -0.02157328464090824, + "logits/rejected": -0.15329419076442719, + "logps/chosen": -358.6500549316406, + "logps/rejected": -511.178466796875, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6943843364715576, + "rewards/margins": 10.371196746826172, + "rewards/rejected": -13.065580368041992, + "step": 507 + }, + { + "epoch": 0.3160186625194401, + "grad_norm": 0.5379907488822937, + "learning_rate": 2.7333333333333336e-06, + "logits/chosen": -0.16761964559555054, + "logits/rejected": -0.2871412932872772, + "logps/chosen": -233.39675903320312, + "logps/rejected": -542.978759765625, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6303582191467285, + "rewards/margins": 8.658153533935547, + "rewards/rejected": -12.288511276245117, + "step": 508 + }, + { + "epoch": 0.3166407465007776, + "grad_norm": 0.38398873805999756, + "learning_rate": 2.727777777777778e-06, + "logits/chosen": -0.1501637101173401, + "logits/rejected": -0.279177725315094, + "logps/chosen": -359.2828369140625, + "logps/rejected": -607.54052734375, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1327903270721436, + "rewards/margins": 12.589200019836426, + "rewards/rejected": -15.721989631652832, + "step": 509 + }, + { + "epoch": 0.31726283048211507, + "grad_norm": 0.17740307748317719, + "learning_rate": 2.7222222222222224e-06, + "logits/chosen": -0.17081817984580994, + "logits/rejected": -0.1822344958782196, + "logps/chosen": -195.51766967773438, + "logps/rejected": -522.216064453125, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.272092819213867, + "rewards/margins": 14.0748929977417, + "rewards/rejected": -17.34698486328125, + "step": 510 + }, + { + "epoch": 0.3178849144634526, + "grad_norm": 0.27070051431655884, + "learning_rate": 2.7166666666666668e-06, + "logits/chosen": -0.06000751629471779, + "logits/rejected": -0.17654070258140564, + "logps/chosen": -271.0890808105469, + "logps/rejected": -693.0372314453125, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.582588195800781, + "rewards/margins": 12.731411933898926, + "rewards/rejected": -17.31399917602539, + "step": 511 + }, + { + "epoch": 0.31850699844479, + "grad_norm": 0.010525842197239399, + "learning_rate": 2.7111111111111116e-06, + "logits/chosen": -0.14866989850997925, + "logits/rejected": -0.16596491634845734, + "logps/chosen": -348.90203857421875, + "logps/rejected": -530.9594116210938, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.043251037597656, + "rewards/margins": 11.44894027709961, + "rewards/rejected": -15.49219036102295, + "step": 512 + }, + { + "epoch": 0.31912908242612753, + "grad_norm": 19.624666213989258, + "learning_rate": 2.7055555555555556e-06, + "logits/chosen": -0.06951643526554108, + "logits/rejected": -0.13688591122627258, + "logps/chosen": -214.3778076171875, + "logps/rejected": -444.9913330078125, + "loss": 0.2656, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.8491039276123047, + "rewards/margins": 10.209993362426758, + "rewards/rejected": -14.059097290039062, + "step": 513 + }, + { + "epoch": 0.319751166407465, + "grad_norm": 2.375986337661743, + "learning_rate": 2.7000000000000004e-06, + "logits/chosen": -0.14318878948688507, + "logits/rejected": -0.2621482014656067, + "logps/chosen": -398.4344787597656, + "logps/rejected": -682.5390625, + "loss": 0.0307, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.115769386291504, + "rewards/margins": 10.562597274780273, + "rewards/rejected": -14.678365707397461, + "step": 514 + }, + { + "epoch": 0.3203732503888025, + "grad_norm": 0.028725923970341682, + "learning_rate": 2.6944444444444444e-06, + "logits/chosen": -0.057885073125362396, + "logits/rejected": -0.1906871795654297, + "logps/chosen": -310.1778564453125, + "logps/rejected": -570.4798583984375, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.119505882263184, + "rewards/margins": 10.378107070922852, + "rewards/rejected": -14.497611999511719, + "step": 515 + }, + { + "epoch": 0.32099533437014, + "grad_norm": 0.1459958553314209, + "learning_rate": 2.6888888888888892e-06, + "logits/chosen": -0.11278890818357468, + "logits/rejected": -0.11701535433530807, + "logps/chosen": -539.1905517578125, + "logps/rejected": -578.6796264648438, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.41032600402832, + "rewards/margins": 11.470751762390137, + "rewards/rejected": -16.88107681274414, + "step": 516 + }, + { + "epoch": 0.32161741835147745, + "grad_norm": 0.1990872621536255, + "learning_rate": 2.683333333333333e-06, + "logits/chosen": -0.05690610036253929, + "logits/rejected": -0.18284788727760315, + "logps/chosen": -364.89593505859375, + "logps/rejected": -671.5843505859375, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8929052352905273, + "rewards/margins": 13.40644645690918, + "rewards/rejected": -17.29935073852539, + "step": 517 + }, + { + "epoch": 0.32223950233281495, + "grad_norm": 6.287483215332031, + "learning_rate": 2.677777777777778e-06, + "logits/chosen": -0.13645893335342407, + "logits/rejected": -0.22568929195404053, + "logps/chosen": -310.99737548828125, + "logps/rejected": -655.4198608398438, + "loss": 0.0538, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4934117794036865, + "rewards/margins": 11.197587966918945, + "rewards/rejected": -14.690999984741211, + "step": 518 + }, + { + "epoch": 0.3228615863141524, + "grad_norm": 0.637638509273529, + "learning_rate": 2.672222222222223e-06, + "logits/chosen": -0.14834731817245483, + "logits/rejected": -0.21667702496051788, + "logps/chosen": -308.5120849609375, + "logps/rejected": -543.237060546875, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.447606086730957, + "rewards/margins": 10.295647621154785, + "rewards/rejected": -13.743253707885742, + "step": 519 + }, + { + "epoch": 0.3234836702954899, + "grad_norm": 0.3642505407333374, + "learning_rate": 2.666666666666667e-06, + "logits/chosen": -0.16101089119911194, + "logits/rejected": -0.27217862010002136, + "logps/chosen": -383.0250244140625, + "logps/rejected": -577.6366577148438, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.720205545425415, + "rewards/margins": 12.131732940673828, + "rewards/rejected": -15.851938247680664, + "step": 520 + }, + { + "epoch": 0.32410575427682736, + "grad_norm": 0.44776105880737305, + "learning_rate": 2.6611111111111117e-06, + "logits/chosen": -0.11927744001150131, + "logits/rejected": -0.2034514844417572, + "logps/chosen": -303.5253601074219, + "logps/rejected": -718.1970825195312, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7615952491760254, + "rewards/margins": 13.524471282958984, + "rewards/rejected": -17.28606605529785, + "step": 521 + }, + { + "epoch": 0.32472783825816487, + "grad_norm": 2.7374935150146484, + "learning_rate": 2.6555555555555556e-06, + "logits/chosen": -0.09399598836898804, + "logits/rejected": -0.1586131453514099, + "logps/chosen": -420.253662109375, + "logps/rejected": -713.23779296875, + "loss": 0.0309, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7953853607177734, + "rewards/margins": 11.262113571166992, + "rewards/rejected": -15.057498931884766, + "step": 522 + }, + { + "epoch": 0.3253499222395023, + "grad_norm": 0.026384079828858376, + "learning_rate": 2.6500000000000005e-06, + "logits/chosen": -0.10679648816585541, + "logits/rejected": -0.18506841361522675, + "logps/chosen": -303.77142333984375, + "logps/rejected": -453.0013427734375, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.060769081115723, + "rewards/margins": 10.503629684448242, + "rewards/rejected": -16.56439781188965, + "step": 523 + }, + { + "epoch": 0.3259720062208398, + "grad_norm": 0.011872214265167713, + "learning_rate": 2.6444444444444444e-06, + "logits/chosen": -0.04245182126760483, + "logits/rejected": -0.15492752194404602, + "logps/chosen": -345.79827880859375, + "logps/rejected": -627.0633544921875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.199069023132324, + "rewards/margins": 15.28481388092041, + "rewards/rejected": -19.483882904052734, + "step": 524 + }, + { + "epoch": 0.3265940902021773, + "grad_norm": 0.09882494062185287, + "learning_rate": 2.6388888888888893e-06, + "logits/chosen": -0.1265636831521988, + "logits/rejected": -0.13079826533794403, + "logps/chosen": -434.79974365234375, + "logps/rejected": -575.4986572265625, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.051268577575684, + "rewards/margins": 10.391288757324219, + "rewards/rejected": -17.44255828857422, + "step": 525 + }, + { + "epoch": 0.3272161741835148, + "grad_norm": 0.30695053935050964, + "learning_rate": 2.6333333333333332e-06, + "logits/chosen": -0.20760291814804077, + "logits/rejected": -0.21868647634983063, + "logps/chosen": -551.04931640625, + "logps/rejected": -719.1632080078125, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.911200761795044, + "rewards/margins": 11.033697128295898, + "rewards/rejected": -14.94489860534668, + "step": 526 + }, + { + "epoch": 0.32783825816485224, + "grad_norm": 3.2343859672546387, + "learning_rate": 2.627777777777778e-06, + "logits/chosen": -0.11499400436878204, + "logits/rejected": -0.17071960866451263, + "logps/chosen": -353.1285095214844, + "logps/rejected": -506.3915100097656, + "loss": 0.0867, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.108340740203857, + "rewards/margins": 8.70768928527832, + "rewards/rejected": -13.81602954864502, + "step": 527 + }, + { + "epoch": 0.32846034214618974, + "grad_norm": 0.26881280541419983, + "learning_rate": 2.6222222222222225e-06, + "logits/chosen": -0.11391685158014297, + "logits/rejected": -0.153436541557312, + "logps/chosen": -429.0704040527344, + "logps/rejected": -561.2757568359375, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.299421310424805, + "rewards/margins": 11.24638557434082, + "rewards/rejected": -15.545806884765625, + "step": 528 + }, + { + "epoch": 0.3290824261275272, + "grad_norm": 14.134245872497559, + "learning_rate": 2.616666666666667e-06, + "logits/chosen": -0.14967621862888336, + "logits/rejected": -0.2396841049194336, + "logps/chosen": -367.34564208984375, + "logps/rejected": -662.7601928710938, + "loss": 0.1546, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.971407175064087, + "rewards/margins": 12.410030364990234, + "rewards/rejected": -16.381437301635742, + "step": 529 + }, + { + "epoch": 0.3297045101088647, + "grad_norm": 0.019193461164832115, + "learning_rate": 2.6111111111111113e-06, + "logits/chosen": -0.2562946081161499, + "logits/rejected": -0.22343404591083527, + "logps/chosen": -444.5689392089844, + "logps/rejected": -551.1650390625, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.316329002380371, + "rewards/margins": 10.97963809967041, + "rewards/rejected": -15.295969009399414, + "step": 530 + }, + { + "epoch": 0.33032659409020215, + "grad_norm": 2.2893385887145996, + "learning_rate": 2.6055555555555557e-06, + "logits/chosen": -0.22346802055835724, + "logits/rejected": -0.2253425270318985, + "logps/chosen": -336.0953063964844, + "logps/rejected": -531.7342529296875, + "loss": 0.0259, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8145017623901367, + "rewards/margins": 10.941020011901855, + "rewards/rejected": -14.755521774291992, + "step": 531 + }, + { + "epoch": 0.33094867807153966, + "grad_norm": 1.0162596702575684, + "learning_rate": 2.6e-06, + "logits/chosen": -0.17113149166107178, + "logits/rejected": -0.28699424862861633, + "logps/chosen": -430.0765380859375, + "logps/rejected": -578.50048828125, + "loss": 0.0252, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3153884410858154, + "rewards/margins": 8.987263679504395, + "rewards/rejected": -12.302652359008789, + "step": 532 + }, + { + "epoch": 0.33157076205287717, + "grad_norm": 0.01749141700565815, + "learning_rate": 2.5944444444444445e-06, + "logits/chosen": -0.16620369255542755, + "logits/rejected": -0.16157934069633484, + "logps/chosen": -481.3836364746094, + "logps/rejected": -580.2369995117188, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.575963497161865, + "rewards/margins": 14.026981353759766, + "rewards/rejected": -18.60294532775879, + "step": 533 + }, + { + "epoch": 0.3321928460342146, + "grad_norm": 0.010279673151671886, + "learning_rate": 2.5888888888888893e-06, + "logits/chosen": -0.12141431868076324, + "logits/rejected": -0.2151569128036499, + "logps/chosen": -408.76287841796875, + "logps/rejected": -629.2903442382812, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.187341690063477, + "rewards/margins": 13.890841484069824, + "rewards/rejected": -19.07818603515625, + "step": 534 + }, + { + "epoch": 0.3328149300155521, + "grad_norm": 0.49737539887428284, + "learning_rate": 2.5833333333333337e-06, + "logits/chosen": -0.11440414190292358, + "logits/rejected": -0.15640833973884583, + "logps/chosen": -416.8833923339844, + "logps/rejected": -477.3837890625, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.551576614379883, + "rewards/margins": 7.868778228759766, + "rewards/rejected": -12.420354843139648, + "step": 535 + }, + { + "epoch": 0.3334370139968896, + "grad_norm": 6.656203746795654, + "learning_rate": 2.577777777777778e-06, + "logits/chosen": -0.0016360394656658173, + "logits/rejected": -0.10165539383888245, + "logps/chosen": -355.5697937011719, + "logps/rejected": -503.814453125, + "loss": 0.2294, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.762906789779663, + "rewards/margins": 10.357710838317871, + "rewards/rejected": -14.120617866516113, + "step": 536 + }, + { + "epoch": 0.3340590979782271, + "grad_norm": 1.2261230945587158, + "learning_rate": 2.5722222222222225e-06, + "logits/chosen": -0.22484543919563293, + "logits/rejected": -0.26634252071380615, + "logps/chosen": -253.96255493164062, + "logps/rejected": -428.4537353515625, + "loss": 0.0125, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2415595054626465, + "rewards/margins": 6.659300804138184, + "rewards/rejected": -9.900860786437988, + "step": 537 + }, + { + "epoch": 0.33468118195956453, + "grad_norm": 30.134286880493164, + "learning_rate": 2.566666666666667e-06, + "logits/chosen": -0.08561117947101593, + "logits/rejected": -0.08139695227146149, + "logps/chosen": -310.949951171875, + "logps/rejected": -434.0061950683594, + "loss": 0.6421, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.807496547698975, + "rewards/margins": 8.333781242370605, + "rewards/rejected": -14.141278266906738, + "step": 538 + }, + { + "epoch": 0.33530326594090204, + "grad_norm": 4.80837869644165, + "learning_rate": 2.5611111111111113e-06, + "logits/chosen": 0.023054659366607666, + "logits/rejected": -0.059782132506370544, + "logps/chosen": -350.822998046875, + "logps/rejected": -441.0616149902344, + "loss": 0.1386, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.02433967590332, + "rewards/margins": 8.03333854675293, + "rewards/rejected": -14.05767822265625, + "step": 539 + }, + { + "epoch": 0.3359253499222395, + "grad_norm": 20.89476776123047, + "learning_rate": 2.5555555555555557e-06, + "logits/chosen": -0.11811422556638718, + "logits/rejected": -0.15433692932128906, + "logps/chosen": -343.87677001953125, + "logps/rejected": -523.602783203125, + "loss": 0.2231, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.829220771789551, + "rewards/margins": 9.165101051330566, + "rewards/rejected": -14.994321823120117, + "step": 540 + }, + { + "epoch": 0.336547433903577, + "grad_norm": 1.9713451862335205, + "learning_rate": 2.55e-06, + "logits/chosen": 0.046044863760471344, + "logits/rejected": -0.06648656725883484, + "logps/chosen": -290.5953674316406, + "logps/rejected": -511.86297607421875, + "loss": 0.0104, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.611037731170654, + "rewards/margins": 10.995401382446289, + "rewards/rejected": -15.606439590454102, + "step": 541 + }, + { + "epoch": 0.33716951788491445, + "grad_norm": 0.21147218346595764, + "learning_rate": 2.5444444444444446e-06, + "logits/chosen": 0.02609567902982235, + "logits/rejected": -0.059266939759254456, + "logps/chosen": -318.0865478515625, + "logps/rejected": -638.4888916015625, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.869584560394287, + "rewards/margins": 13.543375015258789, + "rewards/rejected": -19.412960052490234, + "step": 542 + }, + { + "epoch": 0.33779160186625196, + "grad_norm": 0.05948146805167198, + "learning_rate": 2.538888888888889e-06, + "logits/chosen": -0.1055198460817337, + "logits/rejected": -0.20683830976486206, + "logps/chosen": -461.4420471191406, + "logps/rejected": -704.8410034179688, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.08057975769043, + "rewards/margins": 13.012429237365723, + "rewards/rejected": -19.09300994873047, + "step": 543 + }, + { + "epoch": 0.3384136858475894, + "grad_norm": 4.4028120040893555, + "learning_rate": 2.5333333333333338e-06, + "logits/chosen": -0.06388797610998154, + "logits/rejected": -0.1505063772201538, + "logps/chosen": -283.4486083984375, + "logps/rejected": -547.975341796875, + "loss": 0.0921, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.3118896484375, + "rewards/margins": 10.334837913513184, + "rewards/rejected": -15.646726608276367, + "step": 544 + }, + { + "epoch": 0.3390357698289269, + "grad_norm": 2.658051013946533, + "learning_rate": 2.5277777777777778e-06, + "logits/chosen": -0.06811270117759705, + "logits/rejected": -0.1641698181629181, + "logps/chosen": -301.865234375, + "logps/rejected": -489.8460693359375, + "loss": 0.0248, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.791343688964844, + "rewards/margins": 7.434410095214844, + "rewards/rejected": -12.225752830505371, + "step": 545 + }, + { + "epoch": 0.33965785381026437, + "grad_norm": 0.10358840227127075, + "learning_rate": 2.5222222222222226e-06, + "logits/chosen": -0.14239893853664398, + "logits/rejected": -0.19713914394378662, + "logps/chosen": -347.53125, + "logps/rejected": -668.2636108398438, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.369396209716797, + "rewards/margins": 13.843975067138672, + "rewards/rejected": -19.21337127685547, + "step": 546 + }, + { + "epoch": 0.34027993779160187, + "grad_norm": 1.121317982673645, + "learning_rate": 2.5166666666666666e-06, + "logits/chosen": -0.10243719816207886, + "logits/rejected": -0.23814579844474792, + "logps/chosen": -343.67181396484375, + "logps/rejected": -717.6865844726562, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.5978169441223145, + "rewards/margins": 13.032797813415527, + "rewards/rejected": -18.630613327026367, + "step": 547 + }, + { + "epoch": 0.3409020217729393, + "grad_norm": 0.28492313623428345, + "learning_rate": 2.5111111111111114e-06, + "logits/chosen": -0.09826498478651047, + "logits/rejected": -0.20643600821495056, + "logps/chosen": -236.00787353515625, + "logps/rejected": -629.139404296875, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8400192260742188, + "rewards/margins": 14.254295349121094, + "rewards/rejected": -18.094314575195312, + "step": 548 + }, + { + "epoch": 0.34152410575427683, + "grad_norm": 0.007504681590944529, + "learning_rate": 2.5055555555555554e-06, + "logits/chosen": -0.098854660987854, + "logits/rejected": -0.19436757266521454, + "logps/chosen": -348.6170959472656, + "logps/rejected": -608.6399536132812, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.133045673370361, + "rewards/margins": 15.790933609008789, + "rewards/rejected": -20.923980712890625, + "step": 549 + }, + { + "epoch": 0.3421461897356143, + "grad_norm": 0.03602633625268936, + "learning_rate": 2.5e-06, + "logits/chosen": -0.02631543204188347, + "logits/rejected": -0.19469352066516876, + "logps/chosen": -250.79229736328125, + "logps/rejected": -612.4371337890625, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.840367794036865, + "rewards/margins": 14.682695388793945, + "rewards/rejected": -19.523061752319336, + "step": 550 + }, + { + "epoch": 0.3427682737169518, + "grad_norm": 0.010311393067240715, + "learning_rate": 2.4944444444444446e-06, + "logits/chosen": -0.07674053311347961, + "logits/rejected": -0.18250271677970886, + "logps/chosen": -360.99005126953125, + "logps/rejected": -600.0747680664062, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.597929954528809, + "rewards/margins": 12.559804916381836, + "rewards/rejected": -18.157733917236328, + "step": 551 + }, + { + "epoch": 0.3433903576982893, + "grad_norm": 1.2740288972854614, + "learning_rate": 2.488888888888889e-06, + "logits/chosen": -0.07220663130283356, + "logits/rejected": -0.12009716033935547, + "logps/chosen": -440.0908508300781, + "logps/rejected": -560.3773193359375, + "loss": 0.0134, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.125776290893555, + "rewards/margins": 13.088672637939453, + "rewards/rejected": -18.214448928833008, + "step": 552 + }, + { + "epoch": 0.34401244167962675, + "grad_norm": 5.6736016273498535, + "learning_rate": 2.4833333333333334e-06, + "logits/chosen": -0.08707974851131439, + "logits/rejected": -0.06788124144077301, + "logps/chosen": -443.2345886230469, + "logps/rejected": -536.3984375, + "loss": 0.1476, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.285701274871826, + "rewards/margins": 13.053203582763672, + "rewards/rejected": -19.338905334472656, + "step": 553 + }, + { + "epoch": 0.34463452566096425, + "grad_norm": 20.004392623901367, + "learning_rate": 2.4777777777777782e-06, + "logits/chosen": -0.16373366117477417, + "logits/rejected": -0.16652898490428925, + "logps/chosen": -496.97735595703125, + "logps/rejected": -479.89996337890625, + "loss": 0.2014, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.0796942710876465, + "rewards/margins": 8.558378219604492, + "rewards/rejected": -14.638071060180664, + "step": 554 + }, + { + "epoch": 0.3452566096423017, + "grad_norm": 9.918206214904785, + "learning_rate": 2.4722222222222226e-06, + "logits/chosen": -0.15894871950149536, + "logits/rejected": -0.18903818726539612, + "logps/chosen": -316.1875, + "logps/rejected": -484.5152587890625, + "loss": 0.2633, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.4542083740234375, + "rewards/margins": 10.387214660644531, + "rewards/rejected": -14.841423034667969, + "step": 555 + }, + { + "epoch": 0.3458786936236392, + "grad_norm": 0.023522689938545227, + "learning_rate": 2.466666666666667e-06, + "logits/chosen": -0.002713322639465332, + "logits/rejected": -0.08359899371862411, + "logps/chosen": -156.97982788085938, + "logps/rejected": -468.681884765625, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8327776193618774, + "rewards/margins": 14.956024169921875, + "rewards/rejected": -16.788803100585938, + "step": 556 + }, + { + "epoch": 0.34650077760497666, + "grad_norm": 1.1048927307128906, + "learning_rate": 2.4611111111111115e-06, + "logits/chosen": -0.07819259166717529, + "logits/rejected": -0.1695297360420227, + "logps/chosen": -646.2476196289062, + "logps/rejected": -776.7715454101562, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.130685806274414, + "rewards/margins": 11.563163757324219, + "rewards/rejected": -20.693851470947266, + "step": 557 + }, + { + "epoch": 0.34712286158631417, + "grad_norm": 5.061789035797119, + "learning_rate": 2.455555555555556e-06, + "logits/chosen": -0.07473543286323547, + "logits/rejected": -0.1328362673521042, + "logps/chosen": -300.32073974609375, + "logps/rejected": -446.9525451660156, + "loss": 0.1304, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.479905605316162, + "rewards/margins": 9.644369125366211, + "rewards/rejected": -15.124275207519531, + "step": 558 + }, + { + "epoch": 0.3477449455676516, + "grad_norm": 6.3568572998046875, + "learning_rate": 2.4500000000000003e-06, + "logits/chosen": -0.17143657803535461, + "logits/rejected": -0.20791882276535034, + "logps/chosen": -449.3592834472656, + "logps/rejected": -564.6376953125, + "loss": 0.0488, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.6386895179748535, + "rewards/margins": 9.726805686950684, + "rewards/rejected": -15.365495681762695, + "step": 559 + }, + { + "epoch": 0.3483670295489891, + "grad_norm": 0.20754081010818481, + "learning_rate": 2.4444444444444447e-06, + "logits/chosen": -0.1394234597682953, + "logits/rejected": -0.27907595038414, + "logps/chosen": -264.77435302734375, + "logps/rejected": -611.5497436523438, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4954934120178223, + "rewards/margins": 12.72542953491211, + "rewards/rejected": -16.220924377441406, + "step": 560 + }, + { + "epoch": 0.3489891135303266, + "grad_norm": 0.08386385440826416, + "learning_rate": 2.438888888888889e-06, + "logits/chosen": -0.16581138968467712, + "logits/rejected": -0.26275357604026794, + "logps/chosen": -359.97955322265625, + "logps/rejected": -641.0101318359375, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.931512832641602, + "rewards/margins": 14.336447715759277, + "rewards/rejected": -20.267959594726562, + "step": 561 + }, + { + "epoch": 0.3496111975116641, + "grad_norm": 5.345775127410889, + "learning_rate": 2.4333333333333335e-06, + "logits/chosen": -0.052521929144859314, + "logits/rejected": -0.11848650127649307, + "logps/chosen": -445.4350891113281, + "logps/rejected": -553.4920043945312, + "loss": 0.0825, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.196770668029785, + "rewards/margins": 11.70019245147705, + "rewards/rejected": -17.896961212158203, + "step": 562 + }, + { + "epoch": 0.35023328149300154, + "grad_norm": 0.003999311942607164, + "learning_rate": 2.427777777777778e-06, + "logits/chosen": -0.1040312796831131, + "logits/rejected": -0.21020740270614624, + "logps/chosen": -158.52911376953125, + "logps/rejected": -377.2779541015625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.033074140548706, + "rewards/margins": 10.93126106262207, + "rewards/rejected": -13.964336395263672, + "step": 563 + }, + { + "epoch": 0.35085536547433904, + "grad_norm": 0.6184300780296326, + "learning_rate": 2.4222222222222223e-06, + "logits/chosen": -0.22776159644126892, + "logits/rejected": -0.27457931637763977, + "logps/chosen": -549.8146362304688, + "logps/rejected": -687.0814208984375, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.93636703491211, + "rewards/margins": 11.485404968261719, + "rewards/rejected": -22.421772003173828, + "step": 564 + }, + { + "epoch": 0.3514774494556765, + "grad_norm": 0.009151005186140537, + "learning_rate": 2.4166666666666667e-06, + "logits/chosen": -0.057230591773986816, + "logits/rejected": -0.19938889145851135, + "logps/chosen": -345.40289306640625, + "logps/rejected": -702.0396118164062, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4042351245880127, + "rewards/margins": 15.311080932617188, + "rewards/rejected": -17.715316772460938, + "step": 565 + }, + { + "epoch": 0.352099533437014, + "grad_norm": 0.233717143535614, + "learning_rate": 2.411111111111111e-06, + "logits/chosen": -0.08323769271373749, + "logits/rejected": -0.2458912432193756, + "logps/chosen": -258.6188049316406, + "logps/rejected": -644.9756469726562, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.462327003479004, + "rewards/margins": 13.214030265808105, + "rewards/rejected": -17.67635726928711, + "step": 566 + }, + { + "epoch": 0.35272161741835145, + "grad_norm": 0.26373088359832764, + "learning_rate": 2.4055555555555555e-06, + "logits/chosen": -0.11546474695205688, + "logits/rejected": -0.17926131188869476, + "logps/chosen": -360.90960693359375, + "logps/rejected": -551.8899536132812, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.464623928070068, + "rewards/margins": 11.580972671508789, + "rewards/rejected": -19.045597076416016, + "step": 567 + }, + { + "epoch": 0.35334370139968896, + "grad_norm": 3.1684494018554688, + "learning_rate": 2.4000000000000003e-06, + "logits/chosen": -0.17414948344230652, + "logits/rejected": -0.234939306974411, + "logps/chosen": -272.03839111328125, + "logps/rejected": -525.7798461914062, + "loss": 0.0323, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.634640216827393, + "rewards/margins": 14.29772663116455, + "rewards/rejected": -18.9323673248291, + "step": 568 + }, + { + "epoch": 0.35396578538102647, + "grad_norm": 3.797706127166748, + "learning_rate": 2.3944444444444447e-06, + "logits/chosen": -0.10561051964759827, + "logits/rejected": -0.18161273002624512, + "logps/chosen": -319.162841796875, + "logps/rejected": -561.4620971679688, + "loss": 0.0905, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.833627700805664, + "rewards/margins": 12.622666358947754, + "rewards/rejected": -18.456295013427734, + "step": 569 + }, + { + "epoch": 0.3545878693623639, + "grad_norm": 33.47124099731445, + "learning_rate": 2.388888888888889e-06, + "logits/chosen": -0.12097673863172531, + "logits/rejected": -0.15202441811561584, + "logps/chosen": -725.1028442382812, + "logps/rejected": -671.7620849609375, + "loss": 0.2449, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.6082258224487305, + "rewards/margins": 12.458962440490723, + "rewards/rejected": -19.067188262939453, + "step": 570 + }, + { + "epoch": 0.3552099533437014, + "grad_norm": 5.564277648925781, + "learning_rate": 2.3833333333333335e-06, + "logits/chosen": -0.1571049988269806, + "logits/rejected": -0.20630133152008057, + "logps/chosen": -411.5460510253906, + "logps/rejected": -509.9547119140625, + "loss": 0.1468, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.463071346282959, + "rewards/margins": 7.466385841369629, + "rewards/rejected": -13.929457664489746, + "step": 571 + }, + { + "epoch": 0.3558320373250389, + "grad_norm": 0.009714186191558838, + "learning_rate": 2.377777777777778e-06, + "logits/chosen": -0.001505957916378975, + "logits/rejected": -0.17831094563007355, + "logps/chosen": -238.46356201171875, + "logps/rejected": -648.39306640625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.279341459274292, + "rewards/margins": 15.020463943481445, + "rewards/rejected": -17.2998046875, + "step": 572 + }, + { + "epoch": 0.3564541213063764, + "grad_norm": 13.96557331085205, + "learning_rate": 2.3722222222222223e-06, + "logits/chosen": -0.19209472835063934, + "logits/rejected": -0.24229933321475983, + "logps/chosen": -426.6095275878906, + "logps/rejected": -611.645263671875, + "loss": 0.1847, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.601934432983398, + "rewards/margins": 11.644729614257812, + "rewards/rejected": -17.246662139892578, + "step": 573 + }, + { + "epoch": 0.35707620528771383, + "grad_norm": 9.373568534851074, + "learning_rate": 2.3666666666666667e-06, + "logits/chosen": -0.030644766986370087, + "logits/rejected": -0.06320975720882416, + "logps/chosen": -349.83148193359375, + "logps/rejected": -481.60894775390625, + "loss": 0.1584, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.82609748840332, + "rewards/margins": 8.189970016479492, + "rewards/rejected": -14.016066551208496, + "step": 574 + }, + { + "epoch": 0.35769828926905134, + "grad_norm": 0.039770230650901794, + "learning_rate": 2.361111111111111e-06, + "logits/chosen": 0.02122540958225727, + "logits/rejected": -0.16562098264694214, + "logps/chosen": -312.4407043457031, + "logps/rejected": -578.7828979492188, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9617223739624023, + "rewards/margins": 12.726539611816406, + "rewards/rejected": -16.688262939453125, + "step": 575 + }, + { + "epoch": 0.3583203732503888, + "grad_norm": 0.1497584730386734, + "learning_rate": 2.3555555555555555e-06, + "logits/chosen": -0.0718577429652214, + "logits/rejected": -0.1633210927248001, + "logps/chosen": -246.0577850341797, + "logps/rejected": -630.9411010742188, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.044619560241699, + "rewards/margins": 13.227683067321777, + "rewards/rejected": -16.27230453491211, + "step": 576 + }, + { + "epoch": 0.3589424572317263, + "grad_norm": 0.344123899936676, + "learning_rate": 2.35e-06, + "logits/chosen": -0.12604425847530365, + "logits/rejected": -0.18204103410243988, + "logps/chosen": -224.67726135253906, + "logps/rejected": -462.916015625, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.392816066741943, + "rewards/margins": 11.147856712341309, + "rewards/rejected": -15.540672302246094, + "step": 577 + }, + { + "epoch": 0.35956454121306375, + "grad_norm": 1.8229542970657349, + "learning_rate": 2.3444444444444448e-06, + "logits/chosen": -0.22441864013671875, + "logits/rejected": -0.263460636138916, + "logps/chosen": -545.3401489257812, + "logps/rejected": -732.0248413085938, + "loss": 0.018, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.721031665802002, + "rewards/margins": 9.77612590789795, + "rewards/rejected": -16.49715805053711, + "step": 578 + }, + { + "epoch": 0.36018662519440126, + "grad_norm": 5.0007429122924805, + "learning_rate": 2.338888888888889e-06, + "logits/chosen": -0.11795270442962646, + "logits/rejected": -0.14047595858573914, + "logps/chosen": -389.9419860839844, + "logps/rejected": -486.0982971191406, + "loss": 0.0461, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.190677165985107, + "rewards/margins": 8.365474700927734, + "rewards/rejected": -14.55615234375, + "step": 579 + }, + { + "epoch": 0.3608087091757387, + "grad_norm": 4.198587894439697, + "learning_rate": 2.3333333333333336e-06, + "logits/chosen": -0.1176590621471405, + "logits/rejected": -0.1862529069185257, + "logps/chosen": -224.06800842285156, + "logps/rejected": -452.8370056152344, + "loss": 0.0439, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.547196388244629, + "rewards/margins": 13.242354393005371, + "rewards/rejected": -15.78955078125, + "step": 580 + }, + { + "epoch": 0.3614307931570762, + "grad_norm": 0.22014431655406952, + "learning_rate": 2.327777777777778e-06, + "logits/chosen": -0.13275498151779175, + "logits/rejected": -0.20080845057964325, + "logps/chosen": -556.6088256835938, + "logps/rejected": -617.97119140625, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.2083539962768555, + "rewards/margins": 11.477535247802734, + "rewards/rejected": -16.685890197753906, + "step": 581 + }, + { + "epoch": 0.36205287713841366, + "grad_norm": 0.13644495606422424, + "learning_rate": 2.3222222222222224e-06, + "logits/chosen": -0.10452552884817123, + "logits/rejected": -0.20629152655601501, + "logps/chosen": -448.12872314453125, + "logps/rejected": -694.920166015625, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5910658836364746, + "rewards/margins": 12.046796798706055, + "rewards/rejected": -15.637864112854004, + "step": 582 + }, + { + "epoch": 0.36267496111975117, + "grad_norm": 11.217412948608398, + "learning_rate": 2.316666666666667e-06, + "logits/chosen": -0.07019175589084625, + "logits/rejected": -0.11467856168746948, + "logps/chosen": -490.2370300292969, + "logps/rejected": -564.4802856445312, + "loss": 0.3729, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.76168441772461, + "rewards/margins": 6.769569396972656, + "rewards/rejected": -15.531253814697266, + "step": 583 + }, + { + "epoch": 0.3632970451010886, + "grad_norm": 0.6402060389518738, + "learning_rate": 2.311111111111111e-06, + "logits/chosen": -0.13831913471221924, + "logits/rejected": -0.21836647391319275, + "logps/chosen": -339.7200012207031, + "logps/rejected": -584.3929443359375, + "loss": 0.0064, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.7049665451049805, + "rewards/margins": 13.086289405822754, + "rewards/rejected": -18.791255950927734, + "step": 584 + }, + { + "epoch": 0.36391912908242613, + "grad_norm": 14.236071586608887, + "learning_rate": 2.305555555555556e-06, + "logits/chosen": -0.11309604346752167, + "logits/rejected": -0.11962088942527771, + "logps/chosen": -569.833984375, + "logps/rejected": -764.7379150390625, + "loss": 0.1858, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.710409641265869, + "rewards/margins": 12.531475067138672, + "rewards/rejected": -20.241886138916016, + "step": 585 + }, + { + "epoch": 0.3645412130637636, + "grad_norm": 3.846438407897949, + "learning_rate": 2.3000000000000004e-06, + "logits/chosen": -0.17874372005462646, + "logits/rejected": -0.23497071862220764, + "logps/chosen": -414.9308166503906, + "logps/rejected": -533.86181640625, + "loss": 0.054, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.382350444793701, + "rewards/margins": 9.667595863342285, + "rewards/rejected": -15.049946784973145, + "step": 586 + }, + { + "epoch": 0.3651632970451011, + "grad_norm": 0.26652610301971436, + "learning_rate": 2.294444444444445e-06, + "logits/chosen": -0.0071484968066215515, + "logits/rejected": -0.11718876659870148, + "logps/chosen": -295.73968505859375, + "logps/rejected": -558.0, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.50933313369751, + "rewards/margins": 12.545092582702637, + "rewards/rejected": -18.054424285888672, + "step": 587 + }, + { + "epoch": 0.3657853810264386, + "grad_norm": 2.24540376663208, + "learning_rate": 2.2888888888888892e-06, + "logits/chosen": -0.12435005605220795, + "logits/rejected": -0.2096242904663086, + "logps/chosen": -316.5429992675781, + "logps/rejected": -539.1443481445312, + "loss": 0.0214, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.717129945755005, + "rewards/margins": 8.513898849487305, + "rewards/rejected": -12.23102855682373, + "step": 588 + }, + { + "epoch": 0.36640746500777605, + "grad_norm": 1.1552729606628418, + "learning_rate": 2.2833333333333336e-06, + "logits/chosen": -0.1275944709777832, + "logits/rejected": -0.21297526359558105, + "logps/chosen": -246.26165771484375, + "logps/rejected": -472.759033203125, + "loss": 0.032, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.610294342041016, + "rewards/margins": 9.511714935302734, + "rewards/rejected": -15.12200927734375, + "step": 589 + }, + { + "epoch": 0.36702954898911355, + "grad_norm": 1.453577995300293, + "learning_rate": 2.277777777777778e-06, + "logits/chosen": -0.12901009619235992, + "logits/rejected": -0.1924229860305786, + "logps/chosen": -475.49798583984375, + "logps/rejected": -729.259521484375, + "loss": 0.0154, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.009323596954346, + "rewards/margins": 13.071990966796875, + "rewards/rejected": -19.081314086914062, + "step": 590 + }, + { + "epoch": 0.367651632970451, + "grad_norm": 0.5628300309181213, + "learning_rate": 2.2722222222222224e-06, + "logits/chosen": -0.06428832560777664, + "logits/rejected": -0.17118701338768005, + "logps/chosen": -289.195556640625, + "logps/rejected": -591.0325927734375, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.915555953979492, + "rewards/margins": 13.27055835723877, + "rewards/rejected": -18.186115264892578, + "step": 591 + }, + { + "epoch": 0.3682737169517885, + "grad_norm": 2.8013370037078857, + "learning_rate": 2.266666666666667e-06, + "logits/chosen": -0.027454953640699387, + "logits/rejected": -0.176240935921669, + "logps/chosen": -201.6141357421875, + "logps/rejected": -475.4548034667969, + "loss": 0.0623, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.133639335632324, + "rewards/margins": 10.884796142578125, + "rewards/rejected": -16.018436431884766, + "step": 592 + }, + { + "epoch": 0.36889580093312596, + "grad_norm": 3.5443055629730225, + "learning_rate": 2.2611111111111112e-06, + "logits/chosen": -0.074163056910038, + "logits/rejected": -0.11284688115119934, + "logps/chosen": -427.6487121582031, + "logps/rejected": -556.0269165039062, + "loss": 0.0316, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.711418628692627, + "rewards/margins": 8.74967098236084, + "rewards/rejected": -16.461090087890625, + "step": 593 + }, + { + "epoch": 0.36951788491446347, + "grad_norm": 0.3400859534740448, + "learning_rate": 2.2555555555555557e-06, + "logits/chosen": -0.13126900792121887, + "logits/rejected": -0.2330160140991211, + "logps/chosen": -353.7569274902344, + "logps/rejected": -622.4049072265625, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.521755695343018, + "rewards/margins": 10.670633316040039, + "rewards/rejected": -15.192389488220215, + "step": 594 + }, + { + "epoch": 0.3701399688958009, + "grad_norm": 1.7291098833084106, + "learning_rate": 2.25e-06, + "logits/chosen": -0.10363461077213287, + "logits/rejected": -0.1477852463722229, + "logps/chosen": -365.3837890625, + "logps/rejected": -527.3635864257812, + "loss": 0.0105, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.839771270751953, + "rewards/margins": 11.698359489440918, + "rewards/rejected": -16.538131713867188, + "step": 595 + }, + { + "epoch": 0.3707620528771384, + "grad_norm": 6.389466762542725, + "learning_rate": 2.2444444444444445e-06, + "logits/chosen": -0.11490525305271149, + "logits/rejected": -0.22313544154167175, + "logps/chosen": -394.7263488769531, + "logps/rejected": -600.71630859375, + "loss": 0.062, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.477814674377441, + "rewards/margins": 13.74073600769043, + "rewards/rejected": -19.218551635742188, + "step": 596 + }, + { + "epoch": 0.3713841368584759, + "grad_norm": 5.4853057861328125, + "learning_rate": 2.238888888888889e-06, + "logits/chosen": -0.16798526048660278, + "logits/rejected": -0.18549703061580658, + "logps/chosen": -496.73651123046875, + "logps/rejected": -531.6275634765625, + "loss": 0.0493, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.083205223083496, + "rewards/margins": 10.589763641357422, + "rewards/rejected": -15.672967910766602, + "step": 597 + }, + { + "epoch": 0.3720062208398134, + "grad_norm": 0.1266779750585556, + "learning_rate": 2.2333333333333333e-06, + "logits/chosen": -0.11039476096630096, + "logits/rejected": -0.20770378410816193, + "logps/chosen": -272.56390380859375, + "logps/rejected": -471.83868408203125, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.15316104888916, + "rewards/margins": 10.969218254089355, + "rewards/rejected": -16.122379302978516, + "step": 598 + }, + { + "epoch": 0.37262830482115084, + "grad_norm": 0.00319194165058434, + "learning_rate": 2.2277777777777777e-06, + "logits/chosen": -0.11443936824798584, + "logits/rejected": -0.16398362815380096, + "logps/chosen": -332.3847961425781, + "logps/rejected": -503.30938720703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.766131401062012, + "rewards/margins": 11.360424041748047, + "rewards/rejected": -16.126554489135742, + "step": 599 + }, + { + "epoch": 0.37325038880248834, + "grad_norm": 0.05770452693104744, + "learning_rate": 2.222222222222222e-06, + "logits/chosen": -0.16463051736354828, + "logits/rejected": -0.23499250411987305, + "logps/chosen": -411.88079833984375, + "logps/rejected": -649.42333984375, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.202013969421387, + "rewards/margins": 14.236525535583496, + "rewards/rejected": -19.43853759765625, + "step": 600 + }, + { + "epoch": 0.3738724727838258, + "grad_norm": 4.070469856262207, + "learning_rate": 2.216666666666667e-06, + "logits/chosen": -0.19808316230773926, + "logits/rejected": -0.21870432794094086, + "logps/chosen": -566.7713623046875, + "logps/rejected": -588.8722534179688, + "loss": 0.0497, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.4904632568359375, + "rewards/margins": 7.679604530334473, + "rewards/rejected": -13.170068740844727, + "step": 601 + }, + { + "epoch": 0.3744945567651633, + "grad_norm": 7.90131139755249, + "learning_rate": 2.2111111111111113e-06, + "logits/chosen": -0.1187443658709526, + "logits/rejected": -0.19577933847904205, + "logps/chosen": -465.4372253417969, + "logps/rejected": -652.456298828125, + "loss": 0.1001, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.003196716308594, + "rewards/margins": 9.818995475769043, + "rewards/rejected": -14.822192192077637, + "step": 602 + }, + { + "epoch": 0.37511664074650075, + "grad_norm": 0.028354298323392868, + "learning_rate": 2.2055555555555557e-06, + "logits/chosen": -0.05550114065408707, + "logits/rejected": -0.1802537739276886, + "logps/chosen": -248.3385772705078, + "logps/rejected": -550.254638671875, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.1114115715026855, + "rewards/margins": 12.340682983398438, + "rewards/rejected": -16.45209312438965, + "step": 603 + }, + { + "epoch": 0.37573872472783826, + "grad_norm": 4.909607410430908, + "learning_rate": 2.2e-06, + "logits/chosen": -0.14214634895324707, + "logits/rejected": -0.17241595685482025, + "logps/chosen": -381.50103759765625, + "logps/rejected": -512.61669921875, + "loss": 0.0562, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.338761806488037, + "rewards/margins": 11.036218643188477, + "rewards/rejected": -15.374980926513672, + "step": 604 + }, + { + "epoch": 0.37636080870917576, + "grad_norm": 1.1660027503967285, + "learning_rate": 2.1944444444444445e-06, + "logits/chosen": -0.20690438151359558, + "logits/rejected": -0.1499423384666443, + "logps/chosen": -545.320556640625, + "logps/rejected": -464.8721923828125, + "loss": 0.0127, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.50908088684082, + "rewards/margins": 6.113326072692871, + "rewards/rejected": -10.622406959533691, + "step": 605 + }, + { + "epoch": 0.3769828926905132, + "grad_norm": 0.07888741791248322, + "learning_rate": 2.188888888888889e-06, + "logits/chosen": -0.25985538959503174, + "logits/rejected": -0.2058733105659485, + "logps/chosen": -412.0350646972656, + "logps/rejected": -568.2172241210938, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.952276229858398, + "rewards/margins": 8.0259370803833, + "rewards/rejected": -12.978212356567383, + "step": 606 + }, + { + "epoch": 0.3776049766718507, + "grad_norm": 14.530272483825684, + "learning_rate": 2.1833333333333333e-06, + "logits/chosen": -0.0551840104162693, + "logits/rejected": -0.1383301317691803, + "logps/chosen": -300.150390625, + "logps/rejected": -496.4039306640625, + "loss": 0.1896, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.571584224700928, + "rewards/margins": 8.760917663574219, + "rewards/rejected": -14.332501411437988, + "step": 607 + }, + { + "epoch": 0.3782270606531882, + "grad_norm": 4.137472629547119, + "learning_rate": 2.1777777777777777e-06, + "logits/chosen": -0.18696731328964233, + "logits/rejected": -0.27007514238357544, + "logps/chosen": -366.3069763183594, + "logps/rejected": -547.8585205078125, + "loss": 0.0438, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.512137413024902, + "rewards/margins": 8.354242324829102, + "rewards/rejected": -13.866379737854004, + "step": 608 + }, + { + "epoch": 0.3788491446345257, + "grad_norm": 0.11437290906906128, + "learning_rate": 2.1722222222222226e-06, + "logits/chosen": -0.05784451216459274, + "logits/rejected": -0.17893891036510468, + "logps/chosen": -407.8121032714844, + "logps/rejected": -739.70166015625, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.951784610748291, + "rewards/margins": 14.630231857299805, + "rewards/rejected": -18.582015991210938, + "step": 609 + }, + { + "epoch": 0.37947122861586313, + "grad_norm": 1.1886166334152222, + "learning_rate": 2.166666666666667e-06, + "logits/chosen": -0.15050490200519562, + "logits/rejected": -0.2669588327407837, + "logps/chosen": -327.3599853515625, + "logps/rejected": -658.18017578125, + "loss": 0.0117, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.552830219268799, + "rewards/margins": 14.122157096862793, + "rewards/rejected": -19.674985885620117, + "step": 610 + }, + { + "epoch": 0.38009331259720064, + "grad_norm": 0.05546851456165314, + "learning_rate": 2.1611111111111114e-06, + "logits/chosen": -0.06541711091995239, + "logits/rejected": -0.14934766292572021, + "logps/chosen": -187.0348358154297, + "logps/rejected": -378.8170166015625, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.156720161437988, + "rewards/margins": 10.687467575073242, + "rewards/rejected": -14.844186782836914, + "step": 611 + }, + { + "epoch": 0.3807153965785381, + "grad_norm": 0.026807919144630432, + "learning_rate": 2.1555555555555558e-06, + "logits/chosen": 0.0479261539876461, + "logits/rejected": -0.13238559663295746, + "logps/chosen": -335.9419250488281, + "logps/rejected": -579.7203369140625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.055352210998535, + "rewards/margins": 11.876808166503906, + "rewards/rejected": -17.932159423828125, + "step": 612 + }, + { + "epoch": 0.3813374805598756, + "grad_norm": 0.27989986538887024, + "learning_rate": 2.15e-06, + "logits/chosen": -0.08589765429496765, + "logits/rejected": -0.19928470253944397, + "logps/chosen": -303.1141052246094, + "logps/rejected": -591.6826782226562, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.660463333129883, + "rewards/margins": 14.89505386352539, + "rewards/rejected": -20.555517196655273, + "step": 613 + }, + { + "epoch": 0.38195956454121305, + "grad_norm": 0.07629609107971191, + "learning_rate": 2.1444444444444446e-06, + "logits/chosen": -0.06538677215576172, + "logits/rejected": -0.15840166807174683, + "logps/chosen": -228.55377197265625, + "logps/rejected": -560.1513061523438, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.373802661895752, + "rewards/margins": 13.550684928894043, + "rewards/rejected": -18.924488067626953, + "step": 614 + }, + { + "epoch": 0.38258164852255055, + "grad_norm": 0.10868936032056808, + "learning_rate": 2.138888888888889e-06, + "logits/chosen": -0.07268622517585754, + "logits/rejected": -0.1560012400150299, + "logps/chosen": -420.86370849609375, + "logps/rejected": -577.6571044921875, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.937615871429443, + "rewards/margins": 11.336206436157227, + "rewards/rejected": -16.273822784423828, + "step": 615 + }, + { + "epoch": 0.383203732503888, + "grad_norm": 0.1917133331298828, + "learning_rate": 2.133333333333334e-06, + "logits/chosen": -0.185995951294899, + "logits/rejected": -0.2692728638648987, + "logps/chosen": -405.998779296875, + "logps/rejected": -597.4282836914062, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.724762439727783, + "rewards/margins": 10.162138938903809, + "rewards/rejected": -15.88690185546875, + "step": 616 + }, + { + "epoch": 0.3838258164852255, + "grad_norm": 0.0896904319524765, + "learning_rate": 2.127777777777778e-06, + "logits/chosen": -0.1531233936548233, + "logits/rejected": -0.21464720368385315, + "logps/chosen": -460.220458984375, + "logps/rejected": -540.3817138671875, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.548835754394531, + "rewards/margins": 12.11173152923584, + "rewards/rejected": -16.660566329956055, + "step": 617 + }, + { + "epoch": 0.38444790046656296, + "grad_norm": 4.865177154541016, + "learning_rate": 2.1222222222222226e-06, + "logits/chosen": -0.2055264413356781, + "logits/rejected": -0.2791425585746765, + "logps/chosen": -326.22125244140625, + "logps/rejected": -578.8974609375, + "loss": 0.0355, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.485081672668457, + "rewards/margins": 10.926785469055176, + "rewards/rejected": -15.411866188049316, + "step": 618 + }, + { + "epoch": 0.38506998444790047, + "grad_norm": 0.425128310918808, + "learning_rate": 2.116666666666667e-06, + "logits/chosen": -0.2265782356262207, + "logits/rejected": -0.2858770489692688, + "logps/chosen": -324.4194641113281, + "logps/rejected": -519.598388671875, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.5589752197265625, + "rewards/margins": 9.433330535888672, + "rewards/rejected": -13.992305755615234, + "step": 619 + }, + { + "epoch": 0.3856920684292379, + "grad_norm": 20.143705368041992, + "learning_rate": 2.1111111111111114e-06, + "logits/chosen": -0.16033339500427246, + "logits/rejected": -0.20508690178394318, + "logps/chosen": -274.070556640625, + "logps/rejected": -574.3967895507812, + "loss": 0.3499, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.248621940612793, + "rewards/margins": 13.607233047485352, + "rewards/rejected": -18.855854034423828, + "step": 620 + }, + { + "epoch": 0.38631415241057543, + "grad_norm": 0.032015785574913025, + "learning_rate": 2.105555555555556e-06, + "logits/chosen": -0.09311247617006302, + "logits/rejected": -0.22847864031791687, + "logps/chosen": -204.98751831054688, + "logps/rejected": -556.1636962890625, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.249104022979736, + "rewards/margins": 10.036844253540039, + "rewards/rejected": -14.285948753356934, + "step": 621 + }, + { + "epoch": 0.38693623639191294, + "grad_norm": 0.26403936743736267, + "learning_rate": 2.1000000000000002e-06, + "logits/chosen": -0.05379270017147064, + "logits/rejected": -0.15036076307296753, + "logps/chosen": -465.2509765625, + "logps/rejected": -761.9566650390625, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.163125038146973, + "rewards/margins": 15.5380277633667, + "rewards/rejected": -21.701152801513672, + "step": 622 + }, + { + "epoch": 0.3875583203732504, + "grad_norm": 2.3223023414611816, + "learning_rate": 2.0944444444444446e-06, + "logits/chosen": -0.12875807285308838, + "logits/rejected": -0.15465134382247925, + "logps/chosen": -497.8499450683594, + "logps/rejected": -668.3350830078125, + "loss": 0.0215, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.110443115234375, + "rewards/margins": 12.902565956115723, + "rewards/rejected": -19.013010025024414, + "step": 623 + }, + { + "epoch": 0.3881804043545879, + "grad_norm": 0.38317784667015076, + "learning_rate": 2.088888888888889e-06, + "logits/chosen": -0.1266111433506012, + "logits/rejected": -0.22132231295108795, + "logps/chosen": -182.77452087402344, + "logps/rejected": -472.81829833984375, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4256887435913086, + "rewards/margins": 10.939119338989258, + "rewards/rejected": -14.364809036254883, + "step": 624 + }, + { + "epoch": 0.38880248833592534, + "grad_norm": 0.5231698155403137, + "learning_rate": 2.0833333333333334e-06, + "logits/chosen": -0.07042233645915985, + "logits/rejected": -0.1630944460630417, + "logps/chosen": -374.0731506347656, + "logps/rejected": -596.9639282226562, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.224048614501953, + "rewards/margins": 12.912734985351562, + "rewards/rejected": -19.13678550720215, + "step": 625 + }, + { + "epoch": 0.38942457231726285, + "grad_norm": 0.6219817996025085, + "learning_rate": 2.077777777777778e-06, + "logits/chosen": -0.03450584039092064, + "logits/rejected": -0.20337636768817902, + "logps/chosen": -271.948974609375, + "logps/rejected": -583.3980712890625, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.0580902099609375, + "rewards/margins": 12.030652046203613, + "rewards/rejected": -17.088743209838867, + "step": 626 + }, + { + "epoch": 0.3900466562986003, + "grad_norm": 0.004323755390942097, + "learning_rate": 2.0722222222222222e-06, + "logits/chosen": -0.207444429397583, + "logits/rejected": -0.2159947007894516, + "logps/chosen": -536.43896484375, + "logps/rejected": -685.21875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.932429313659668, + "rewards/margins": 13.263242721557617, + "rewards/rejected": -20.19567108154297, + "step": 627 + }, + { + "epoch": 0.3906687402799378, + "grad_norm": 2.432190179824829, + "learning_rate": 2.0666666666666666e-06, + "logits/chosen": -0.07729389518499374, + "logits/rejected": -0.16018494963645935, + "logps/chosen": -426.13592529296875, + "logps/rejected": -581.6529541015625, + "loss": 0.0174, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.427860260009766, + "rewards/margins": 9.814375877380371, + "rewards/rejected": -19.24223518371582, + "step": 628 + }, + { + "epoch": 0.39129082426127526, + "grad_norm": 0.8065938949584961, + "learning_rate": 2.061111111111111e-06, + "logits/chosen": -0.08945492655038834, + "logits/rejected": -0.23955777287483215, + "logps/chosen": -122.81063079833984, + "logps/rejected": -469.9738464355469, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7575199604034424, + "rewards/margins": 12.169333457946777, + "rewards/rejected": -14.92685317993164, + "step": 629 + }, + { + "epoch": 0.39191290824261277, + "grad_norm": 1.2694801092147827, + "learning_rate": 2.0555555555555555e-06, + "logits/chosen": -0.1652809977531433, + "logits/rejected": -0.24504458904266357, + "logps/chosen": -310.398681640625, + "logps/rejected": -572.1951293945312, + "loss": 0.015, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.44223165512085, + "rewards/margins": 13.307544708251953, + "rewards/rejected": -18.74977684020996, + "step": 630 + }, + { + "epoch": 0.3925349922239502, + "grad_norm": 0.7890479564666748, + "learning_rate": 2.05e-06, + "logits/chosen": 0.10143455862998962, + "logits/rejected": -0.014388229697942734, + "logps/chosen": -273.21063232421875, + "logps/rejected": -485.297119140625, + "loss": 0.0113, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.012326240539551, + "rewards/margins": 12.347881317138672, + "rewards/rejected": -16.36020851135254, + "step": 631 + }, + { + "epoch": 0.3931570762052877, + "grad_norm": 0.06735774874687195, + "learning_rate": 2.0444444444444447e-06, + "logits/chosen": -0.08643755316734314, + "logits/rejected": -0.19199813902378082, + "logps/chosen": -460.5355224609375, + "logps/rejected": -685.3828125, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.815726280212402, + "rewards/margins": 11.540682792663574, + "rewards/rejected": -16.356407165527344, + "step": 632 + }, + { + "epoch": 0.3937791601866252, + "grad_norm": 5.342881202697754, + "learning_rate": 2.038888888888889e-06, + "logits/chosen": -0.10000207275152206, + "logits/rejected": -0.17099639773368835, + "logps/chosen": -403.6191101074219, + "logps/rejected": -482.010009765625, + "loss": 0.0968, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.434308052062988, + "rewards/margins": 8.641256332397461, + "rewards/rejected": -14.07556438446045, + "step": 633 + }, + { + "epoch": 0.3944012441679627, + "grad_norm": 0.17070814967155457, + "learning_rate": 2.0333333333333335e-06, + "logits/chosen": -0.1604013442993164, + "logits/rejected": -0.18836960196495056, + "logps/chosen": -311.6618957519531, + "logps/rejected": -476.594482421875, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.793218612670898, + "rewards/margins": 11.51873779296875, + "rewards/rejected": -16.311954498291016, + "step": 634 + }, + { + "epoch": 0.39502332814930013, + "grad_norm": 0.0027436772361397743, + "learning_rate": 2.027777777777778e-06, + "logits/chosen": -0.09416146576404572, + "logits/rejected": -0.14794966578483582, + "logps/chosen": -300.7768859863281, + "logps/rejected": -522.87158203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.378449440002441, + "rewards/margins": 13.257801055908203, + "rewards/rejected": -18.63625144958496, + "step": 635 + }, + { + "epoch": 0.39564541213063764, + "grad_norm": 0.08537284284830093, + "learning_rate": 2.0222222222222223e-06, + "logits/chosen": -0.12835612893104553, + "logits/rejected": -0.2175043821334839, + "logps/chosen": -287.2108154296875, + "logps/rejected": -605.6363525390625, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.402039527893066, + "rewards/margins": 15.171186447143555, + "rewards/rejected": -21.573226928710938, + "step": 636 + }, + { + "epoch": 0.3962674961119751, + "grad_norm": 9.46985912322998, + "learning_rate": 2.0166666666666667e-06, + "logits/chosen": -0.15604367852210999, + "logits/rejected": -0.205369234085083, + "logps/chosen": -365.55108642578125, + "logps/rejected": -505.03375244140625, + "loss": 0.4266, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.6501145362854, + "rewards/margins": 10.839437484741211, + "rewards/rejected": -16.489551544189453, + "step": 637 + }, + { + "epoch": 0.3968895800933126, + "grad_norm": 0.110622338950634, + "learning_rate": 2.011111111111111e-06, + "logits/chosen": -0.14026425778865814, + "logits/rejected": -0.1904151290655136, + "logps/chosen": -298.12054443359375, + "logps/rejected": -484.876708984375, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.848516464233398, + "rewards/margins": 10.600751876831055, + "rewards/rejected": -16.449268341064453, + "step": 638 + }, + { + "epoch": 0.39751166407465005, + "grad_norm": 0.06769077479839325, + "learning_rate": 2.0055555555555555e-06, + "logits/chosen": -0.08915624022483826, + "logits/rejected": -0.21795007586479187, + "logps/chosen": -397.98016357421875, + "logps/rejected": -670.50927734375, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.211318492889404, + "rewards/margins": 14.440079689025879, + "rewards/rejected": -18.651397705078125, + "step": 639 + }, + { + "epoch": 0.39813374805598756, + "grad_norm": 20.421268463134766, + "learning_rate": 2.0000000000000003e-06, + "logits/chosen": -0.07606413215398788, + "logits/rejected": -0.1844390630722046, + "logps/chosen": -414.2615051269531, + "logps/rejected": -640.6993408203125, + "loss": 0.1846, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.177009582519531, + "rewards/margins": 11.89448070526123, + "rewards/rejected": -18.071491241455078, + "step": 640 + }, + { + "epoch": 0.39875583203732506, + "grad_norm": 0.0037117390893399715, + "learning_rate": 1.9944444444444447e-06, + "logits/chosen": -0.15957298874855042, + "logits/rejected": -0.2717669606208801, + "logps/chosen": -391.90582275390625, + "logps/rejected": -658.2315673828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.238301753997803, + "rewards/margins": 16.459196090698242, + "rewards/rejected": -22.697498321533203, + "step": 641 + }, + { + "epoch": 0.3993779160186625, + "grad_norm": 0.7360544800758362, + "learning_rate": 1.988888888888889e-06, + "logits/chosen": -0.08280476182699203, + "logits/rejected": -0.19797533750534058, + "logps/chosen": -302.4588623046875, + "logps/rejected": -581.579833984375, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.456978797912598, + "rewards/margins": 13.991204261779785, + "rewards/rejected": -18.44818115234375, + "step": 642 + }, + { + "epoch": 0.4, + "grad_norm": 0.18684785068035126, + "learning_rate": 1.9833333333333335e-06, + "logits/chosen": 0.0668894499540329, + "logits/rejected": -0.019208211451768875, + "logps/chosen": -301.3817138671875, + "logps/rejected": -588.744140625, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.071944236755371, + "rewards/margins": 11.101165771484375, + "rewards/rejected": -16.173110961914062, + "step": 643 + }, + { + "epoch": 0.4006220839813375, + "grad_norm": 0.01737477257847786, + "learning_rate": 1.977777777777778e-06, + "logits/chosen": -0.16378875076770782, + "logits/rejected": -0.26666751503944397, + "logps/chosen": -213.44223022460938, + "logps/rejected": -524.17529296875, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.308502674102783, + "rewards/margins": 15.712350845336914, + "rewards/rejected": -19.020854949951172, + "step": 644 + }, + { + "epoch": 0.401244167962675, + "grad_norm": 0.011187204159796238, + "learning_rate": 1.9722222222222224e-06, + "logits/chosen": -0.1535571813583374, + "logits/rejected": -0.24406680464744568, + "logps/chosen": -328.3287658691406, + "logps/rejected": -570.1505126953125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.826768398284912, + "rewards/margins": 13.620793342590332, + "rewards/rejected": -19.44756317138672, + "step": 645 + }, + { + "epoch": 0.40186625194401243, + "grad_norm": 0.1063438206911087, + "learning_rate": 1.9666666666666668e-06, + "logits/chosen": 1.7386802937835455e-05, + "logits/rejected": -0.10718881338834763, + "logps/chosen": -300.99658203125, + "logps/rejected": -625.9423828125, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.733684062957764, + "rewards/margins": 15.054697036743164, + "rewards/rejected": -19.788381576538086, + "step": 646 + }, + { + "epoch": 0.40248833592534994, + "grad_norm": 1.1191056966781616, + "learning_rate": 1.9611111111111116e-06, + "logits/chosen": -0.14308711886405945, + "logits/rejected": -0.22438490390777588, + "logps/chosen": -335.1119689941406, + "logps/rejected": -628.6477661132812, + "loss": 0.0115, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.323019027709961, + "rewards/margins": 11.095314025878906, + "rewards/rejected": -15.4183349609375, + "step": 647 + }, + { + "epoch": 0.4031104199066874, + "grad_norm": 0.03880783170461655, + "learning_rate": 1.955555555555556e-06, + "logits/chosen": -0.12418518960475922, + "logits/rejected": -0.2297421097755432, + "logps/chosen": -291.6612548828125, + "logps/rejected": -634.6732177734375, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.544435024261475, + "rewards/margins": 12.667137145996094, + "rewards/rejected": -17.211572647094727, + "step": 648 + }, + { + "epoch": 0.4037325038880249, + "grad_norm": 0.068694107234478, + "learning_rate": 1.9500000000000004e-06, + "logits/chosen": -0.22765761613845825, + "logits/rejected": -0.30484437942504883, + "logps/chosen": -284.57769775390625, + "logps/rejected": -598.4955444335938, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.054725170135498, + "rewards/margins": 12.800515174865723, + "rewards/rejected": -17.855239868164062, + "step": 649 + }, + { + "epoch": 0.40435458786936235, + "grad_norm": 10.782881736755371, + "learning_rate": 1.944444444444445e-06, + "logits/chosen": -0.0829843282699585, + "logits/rejected": -0.13291698694229126, + "logps/chosen": -419.85894775390625, + "logps/rejected": -491.6153869628906, + "loss": 0.4345, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.095956802368164, + "rewards/margins": 10.624947547912598, + "rewards/rejected": -16.720903396606445, + "step": 650 + }, + { + "epoch": 0.40497667185069985, + "grad_norm": 0.31501320004463196, + "learning_rate": 1.938888888888889e-06, + "logits/chosen": -0.03152599185705185, + "logits/rejected": -0.09313070774078369, + "logps/chosen": -273.9474792480469, + "logps/rejected": -543.2339477539062, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.681985855102539, + "rewards/margins": 15.335407257080078, + "rewards/rejected": -21.017393112182617, + "step": 651 + }, + { + "epoch": 0.4055987558320373, + "grad_norm": 51.570682525634766, + "learning_rate": 1.9333333333333336e-06, + "logits/chosen": -0.19380784034729004, + "logits/rejected": -0.25447648763656616, + "logps/chosen": -276.1874084472656, + "logps/rejected": -483.1874694824219, + "loss": 0.2368, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.090529441833496, + "rewards/margins": 11.945367813110352, + "rewards/rejected": -18.03589630126953, + "step": 652 + }, + { + "epoch": 0.4062208398133748, + "grad_norm": 7.368281841278076, + "learning_rate": 1.927777777777778e-06, + "logits/chosen": -0.044491082429885864, + "logits/rejected": -0.12117129564285278, + "logps/chosen": -425.54583740234375, + "logps/rejected": -594.9068603515625, + "loss": 0.0613, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.216131210327148, + "rewards/margins": 10.014167785644531, + "rewards/rejected": -15.23029899597168, + "step": 653 + }, + { + "epoch": 0.40684292379471226, + "grad_norm": 8.551002502441406, + "learning_rate": 1.9222222222222224e-06, + "logits/chosen": -0.03686397522687912, + "logits/rejected": -0.13784199953079224, + "logps/chosen": -400.63836669921875, + "logps/rejected": -602.3237915039062, + "loss": 0.1046, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.9830572605133057, + "rewards/margins": 14.174583435058594, + "rewards/rejected": -18.15764045715332, + "step": 654 + }, + { + "epoch": 0.40746500777604977, + "grad_norm": 0.6211469173431396, + "learning_rate": 1.916666666666667e-06, + "logits/chosen": -0.20156216621398926, + "logits/rejected": -0.24790304899215698, + "logps/chosen": -502.99957275390625, + "logps/rejected": -617.322021484375, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.623086929321289, + "rewards/margins": 10.706113815307617, + "rewards/rejected": -17.329200744628906, + "step": 655 + }, + { + "epoch": 0.4080870917573872, + "grad_norm": 0.06461192667484283, + "learning_rate": 1.9111111111111112e-06, + "logits/chosen": -0.08647769689559937, + "logits/rejected": -0.15352439880371094, + "logps/chosen": -201.84490966796875, + "logps/rejected": -416.85791015625, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.096901893615723, + "rewards/margins": 13.094003677368164, + "rewards/rejected": -17.190906524658203, + "step": 656 + }, + { + "epoch": 0.40870917573872473, + "grad_norm": 0.8288792371749878, + "learning_rate": 1.9055555555555558e-06, + "logits/chosen": -0.0935092568397522, + "logits/rejected": -0.04587852209806442, + "logps/chosen": -469.42584228515625, + "logps/rejected": -609.2669677734375, + "loss": 0.0069, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.462061882019043, + "rewards/margins": 10.951631546020508, + "rewards/rejected": -19.413692474365234, + "step": 657 + }, + { + "epoch": 0.40933125972006223, + "grad_norm": 0.09737135469913483, + "learning_rate": 1.9000000000000002e-06, + "logits/chosen": -0.08615106344223022, + "logits/rejected": -0.19879643619060516, + "logps/chosen": -254.21240234375, + "logps/rejected": -490.58203125, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.449499130249023, + "rewards/margins": 9.446135520935059, + "rewards/rejected": -14.895633697509766, + "step": 658 + }, + { + "epoch": 0.4099533437013997, + "grad_norm": 0.023484792560338974, + "learning_rate": 1.8944444444444446e-06, + "logits/chosen": -0.12654292583465576, + "logits/rejected": -0.2109220027923584, + "logps/chosen": -414.76531982421875, + "logps/rejected": -636.120849609375, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.316765785217285, + "rewards/margins": 11.792091369628906, + "rewards/rejected": -17.108856201171875, + "step": 659 + }, + { + "epoch": 0.4105754276827372, + "grad_norm": 1.2695841789245605, + "learning_rate": 1.888888888888889e-06, + "logits/chosen": -0.08933547884225845, + "logits/rejected": -0.16670718789100647, + "logps/chosen": -345.0384521484375, + "logps/rejected": -648.160888671875, + "loss": 0.0077, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.860443115234375, + "rewards/margins": 12.678651809692383, + "rewards/rejected": -18.539094924926758, + "step": 660 + }, + { + "epoch": 0.41119751166407464, + "grad_norm": 0.008481858298182487, + "learning_rate": 1.8833333333333334e-06, + "logits/chosen": -0.152295783162117, + "logits/rejected": -0.23516657948493958, + "logps/chosen": -205.67498779296875, + "logps/rejected": -550.2120361328125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8071155548095703, + "rewards/margins": 12.339879035949707, + "rewards/rejected": -15.146995544433594, + "step": 661 + }, + { + "epoch": 0.41181959564541215, + "grad_norm": 1.6403297185897827, + "learning_rate": 1.8777777777777778e-06, + "logits/chosen": -0.15700671076774597, + "logits/rejected": -0.2123398780822754, + "logps/chosen": -358.7858581542969, + "logps/rejected": -533.4429321289062, + "loss": 0.0182, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.386796474456787, + "rewards/margins": 10.40941047668457, + "rewards/rejected": -15.796207427978516, + "step": 662 + }, + { + "epoch": 0.4124416796267496, + "grad_norm": 5.1434431076049805, + "learning_rate": 1.8722222222222225e-06, + "logits/chosen": -0.13996818661689758, + "logits/rejected": -0.17551323771476746, + "logps/chosen": -370.56890869140625, + "logps/rejected": -432.46331787109375, + "loss": 0.0589, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.708287239074707, + "rewards/margins": 7.814229965209961, + "rewards/rejected": -14.522517204284668, + "step": 663 + }, + { + "epoch": 0.4130637636080871, + "grad_norm": 1.9147734642028809, + "learning_rate": 1.8666666666666669e-06, + "logits/chosen": -0.06994500756263733, + "logits/rejected": -0.12452316284179688, + "logps/chosen": -416.5871887207031, + "logps/rejected": -566.2616577148438, + "loss": 0.0193, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.667995929718018, + "rewards/margins": 10.740212440490723, + "rewards/rejected": -18.408206939697266, + "step": 664 + }, + { + "epoch": 0.41368584758942456, + "grad_norm": 3.4069325923919678, + "learning_rate": 1.8611111111111113e-06, + "logits/chosen": -0.09214827418327332, + "logits/rejected": -0.19123274087905884, + "logps/chosen": -187.82534790039062, + "logps/rejected": -320.75250244140625, + "loss": 0.0609, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5920560359954834, + "rewards/margins": 9.63013744354248, + "rewards/rejected": -13.222193717956543, + "step": 665 + }, + { + "epoch": 0.41430793157076207, + "grad_norm": 0.2542458772659302, + "learning_rate": 1.8555555555555557e-06, + "logits/chosen": -0.16086122393608093, + "logits/rejected": -0.2501816153526306, + "logps/chosen": -600.5001831054688, + "logps/rejected": -824.9646606445312, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.229086875915527, + "rewards/margins": 13.589860916137695, + "rewards/rejected": -17.818946838378906, + "step": 666 + }, + { + "epoch": 0.4149300155520995, + "grad_norm": 1.103056788444519, + "learning_rate": 1.85e-06, + "logits/chosen": -0.11067149043083191, + "logits/rejected": -0.13151177763938904, + "logps/chosen": -292.71453857421875, + "logps/rejected": -562.3177490234375, + "loss": 0.0099, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.3683576583862305, + "rewards/margins": 14.157787322998047, + "rewards/rejected": -19.526145935058594, + "step": 667 + }, + { + "epoch": 0.415552099533437, + "grad_norm": 0.2992672622203827, + "learning_rate": 1.8444444444444445e-06, + "logits/chosen": -0.1569594144821167, + "logits/rejected": -0.19007906317710876, + "logps/chosen": -575.823974609375, + "logps/rejected": -786.2784423828125, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.386287689208984, + "rewards/margins": 10.662607192993164, + "rewards/rejected": -20.048892974853516, + "step": 668 + }, + { + "epoch": 0.4161741835147745, + "grad_norm": 0.00017103359277825803, + "learning_rate": 1.8388888888888889e-06, + "logits/chosen": -0.08035887777805328, + "logits/rejected": -0.1913282871246338, + "logps/chosen": -306.74798583984375, + "logps/rejected": -640.42822265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.598724365234375, + "rewards/margins": 17.540102005004883, + "rewards/rejected": -22.138824462890625, + "step": 669 + }, + { + "epoch": 0.416796267496112, + "grad_norm": 0.012049006298184395, + "learning_rate": 1.8333333333333333e-06, + "logits/chosen": -0.10484198480844498, + "logits/rejected": -0.18023985624313354, + "logps/chosen": -282.49932861328125, + "logps/rejected": -583.6565551757812, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4086499214172363, + "rewards/margins": 14.642749786376953, + "rewards/rejected": -18.051401138305664, + "step": 670 + }, + { + "epoch": 0.41741835147744943, + "grad_norm": 0.337211012840271, + "learning_rate": 1.8277777777777781e-06, + "logits/chosen": -0.06835547834634781, + "logits/rejected": -0.12556421756744385, + "logps/chosen": -262.0645446777344, + "logps/rejected": -436.9096374511719, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.674449920654297, + "rewards/margins": 10.837376594543457, + "rewards/rejected": -16.51182746887207, + "step": 671 + }, + { + "epoch": 0.41804043545878694, + "grad_norm": 0.03445616737008095, + "learning_rate": 1.8222222222222225e-06, + "logits/chosen": -0.10916835069656372, + "logits/rejected": -0.18215718865394592, + "logps/chosen": -448.89276123046875, + "logps/rejected": -614.1470947265625, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.859401226043701, + "rewards/margins": 11.705766677856445, + "rewards/rejected": -15.565168380737305, + "step": 672 + }, + { + "epoch": 0.4186625194401244, + "grad_norm": 0.024650681763887405, + "learning_rate": 1.816666666666667e-06, + "logits/chosen": 0.04075014218688011, + "logits/rejected": -0.14517706632614136, + "logps/chosen": -149.82000732421875, + "logps/rejected": -548.462890625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.130614757537842, + "rewards/margins": 15.922340393066406, + "rewards/rejected": -19.052953720092773, + "step": 673 + }, + { + "epoch": 0.4192846034214619, + "grad_norm": 2.9891819953918457, + "learning_rate": 1.8111111111111113e-06, + "logits/chosen": -0.09427367895841599, + "logits/rejected": -0.07827030122280121, + "logps/chosen": -424.68218994140625, + "logps/rejected": -593.6963500976562, + "loss": 0.0249, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.796016693115234, + "rewards/margins": 10.951379776000977, + "rewards/rejected": -17.74739646911621, + "step": 674 + }, + { + "epoch": 0.4199066874027994, + "grad_norm": 0.1619732528924942, + "learning_rate": 1.8055555555555557e-06, + "logits/chosen": -0.22021238505840302, + "logits/rejected": -0.3068513870239258, + "logps/chosen": -383.26617431640625, + "logps/rejected": -633.9918212890625, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.705854892730713, + "rewards/margins": 10.765361785888672, + "rewards/rejected": -14.471217155456543, + "step": 675 + }, + { + "epoch": 0.42052877138413686, + "grad_norm": 16.23748207092285, + "learning_rate": 1.8000000000000001e-06, + "logits/chosen": -0.028598565608263016, + "logits/rejected": -0.15933629870414734, + "logps/chosen": -438.76824951171875, + "logps/rejected": -662.4112548828125, + "loss": 0.4037, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.978034973144531, + "rewards/margins": 13.586522102355957, + "rewards/rejected": -18.564556121826172, + "step": 676 + }, + { + "epoch": 0.42115085536547436, + "grad_norm": 11.46135425567627, + "learning_rate": 1.7944444444444445e-06, + "logits/chosen": -0.09502118825912476, + "logits/rejected": -0.16668325662612915, + "logps/chosen": -378.0183410644531, + "logps/rejected": -505.00775146484375, + "loss": 0.1509, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.632676124572754, + "rewards/margins": 7.720284938812256, + "rewards/rejected": -15.352960586547852, + "step": 677 + }, + { + "epoch": 0.4217729393468118, + "grad_norm": 1.674432635307312, + "learning_rate": 1.788888888888889e-06, + "logits/chosen": -0.15954998135566711, + "logits/rejected": -0.2307274341583252, + "logps/chosen": -221.89923095703125, + "logps/rejected": -462.06121826171875, + "loss": 0.0164, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.097198486328125, + "rewards/margins": 10.277615547180176, + "rewards/rejected": -15.374813079833984, + "step": 678 + }, + { + "epoch": 0.4223950233281493, + "grad_norm": 0.9491381645202637, + "learning_rate": 1.7833333333333336e-06, + "logits/chosen": -0.11849575489759445, + "logits/rejected": -0.24084538221359253, + "logps/chosen": -277.0999755859375, + "logps/rejected": -667.0069580078125, + "loss": 0.0118, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9958906173706055, + "rewards/margins": 12.097909927368164, + "rewards/rejected": -16.093801498413086, + "step": 679 + }, + { + "epoch": 0.4230171073094868, + "grad_norm": 0.03170959651470184, + "learning_rate": 1.777777777777778e-06, + "logits/chosen": -0.16851282119750977, + "logits/rejected": -0.2724132239818573, + "logps/chosen": -264.6683349609375, + "logps/rejected": -629.499267578125, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.651165008544922, + "rewards/margins": 12.204557418823242, + "rewards/rejected": -16.855722427368164, + "step": 680 + }, + { + "epoch": 0.4236391912908243, + "grad_norm": 1.4719005823135376, + "learning_rate": 1.7722222222222224e-06, + "logits/chosen": -0.16406311094760895, + "logits/rejected": -0.24618197977542877, + "logps/chosen": -458.88409423828125, + "logps/rejected": -734.8978881835938, + "loss": 0.0148, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.639718532562256, + "rewards/margins": 16.661426544189453, + "rewards/rejected": -23.301143646240234, + "step": 681 + }, + { + "epoch": 0.42426127527216173, + "grad_norm": 0.29268231987953186, + "learning_rate": 1.7666666666666668e-06, + "logits/chosen": -0.0868673324584961, + "logits/rejected": -0.11438660323619843, + "logps/chosen": -268.0291748046875, + "logps/rejected": -372.7334289550781, + "loss": 0.0075, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.334218502044678, + "rewards/margins": 10.743279457092285, + "rewards/rejected": -15.077498435974121, + "step": 682 + }, + { + "epoch": 0.42488335925349924, + "grad_norm": 0.0003953919222112745, + "learning_rate": 1.7611111111111112e-06, + "logits/chosen": -0.014019282534718513, + "logits/rejected": -0.15486721694469452, + "logps/chosen": -441.3645324707031, + "logps/rejected": -645.1834106445312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2152209281921387, + "rewards/margins": 17.028236389160156, + "rewards/rejected": -20.24345588684082, + "step": 683 + }, + { + "epoch": 0.4255054432348367, + "grad_norm": 0.01558225043118, + "learning_rate": 1.7555555555555556e-06, + "logits/chosen": 0.004964258521795273, + "logits/rejected": -0.1792491376399994, + "logps/chosen": -279.4091796875, + "logps/rejected": -619.8191528320312, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.901151180267334, + "rewards/margins": 16.380020141601562, + "rewards/rejected": -20.281173706054688, + "step": 684 + }, + { + "epoch": 0.4261275272161742, + "grad_norm": 1.237605094909668, + "learning_rate": 1.75e-06, + "logits/chosen": -0.06393839418888092, + "logits/rejected": -0.1796531230211258, + "logps/chosen": -356.6410217285156, + "logps/rejected": -528.025146484375, + "loss": 0.0077, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.140842437744141, + "rewards/margins": 10.812944412231445, + "rewards/rejected": -16.953784942626953, + "step": 685 + }, + { + "epoch": 0.42674961119751165, + "grad_norm": 0.003960651811212301, + "learning_rate": 1.7444444444444448e-06, + "logits/chosen": -0.02524508163332939, + "logits/rejected": -0.15117965638637543, + "logps/chosen": -300.7728576660156, + "logps/rejected": -595.6337890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.915596961975098, + "rewards/margins": 13.349435806274414, + "rewards/rejected": -19.265033721923828, + "step": 686 + }, + { + "epoch": 0.42737169517884915, + "grad_norm": 31.751911163330078, + "learning_rate": 1.7388888888888892e-06, + "logits/chosen": -0.07495184987783432, + "logits/rejected": -0.14515666663646698, + "logps/chosen": -358.4555969238281, + "logps/rejected": -575.625732421875, + "loss": 1.0371, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.43927526473999, + "rewards/margins": 10.69642162322998, + "rewards/rejected": -18.135696411132812, + "step": 687 + }, + { + "epoch": 0.4279937791601866, + "grad_norm": 5.31747579574585, + "learning_rate": 1.7333333333333336e-06, + "logits/chosen": -0.019868716597557068, + "logits/rejected": -0.11948978900909424, + "logps/chosen": -292.62481689453125, + "logps/rejected": -575.4559326171875, + "loss": 0.0327, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.650533199310303, + "rewards/margins": 10.92553997039795, + "rewards/rejected": -16.576074600219727, + "step": 688 + }, + { + "epoch": 0.4286158631415241, + "grad_norm": 0.06408777087926865, + "learning_rate": 1.727777777777778e-06, + "logits/chosen": -0.21184779703617096, + "logits/rejected": -0.3151513338088989, + "logps/chosen": -248.61837768554688, + "logps/rejected": -510.5364990234375, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.027789115905762, + "rewards/margins": 10.393976211547852, + "rewards/rejected": -14.42176628112793, + "step": 689 + }, + { + "epoch": 0.42923794712286156, + "grad_norm": 23.883710861206055, + "learning_rate": 1.7222222222222224e-06, + "logits/chosen": -0.05874648690223694, + "logits/rejected": -0.11361812055110931, + "logps/chosen": -259.2091369628906, + "logps/rejected": -442.018798828125, + "loss": 0.2202, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.43297004699707, + "rewards/margins": 8.377538681030273, + "rewards/rejected": -12.810510635375977, + "step": 690 + }, + { + "epoch": 0.42986003110419907, + "grad_norm": 16.968923568725586, + "learning_rate": 1.7166666666666668e-06, + "logits/chosen": -0.19153708219528198, + "logits/rejected": -0.19154079258441925, + "logps/chosen": -372.42486572265625, + "logps/rejected": -605.8370361328125, + "loss": 0.7917, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.724769592285156, + "rewards/margins": 9.562446594238281, + "rewards/rejected": -16.287216186523438, + "step": 691 + }, + { + "epoch": 0.4304821150855365, + "grad_norm": 11.717562675476074, + "learning_rate": 1.7111111111111112e-06, + "logits/chosen": -0.1389576494693756, + "logits/rejected": -0.2131836712360382, + "logps/chosen": -263.28155517578125, + "logps/rejected": -471.924072265625, + "loss": 0.3233, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.3641357421875, + "rewards/margins": 9.534891128540039, + "rewards/rejected": -14.899026870727539, + "step": 692 + }, + { + "epoch": 0.431104199066874, + "grad_norm": 0.9646769165992737, + "learning_rate": 1.7055555555555556e-06, + "logits/chosen": -0.25284844636917114, + "logits/rejected": -0.30989181995391846, + "logps/chosen": -476.11920166015625, + "logps/rejected": -673.481689453125, + "loss": 0.0086, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.875650405883789, + "rewards/margins": 8.012702941894531, + "rewards/rejected": -11.88835334777832, + "step": 693 + }, + { + "epoch": 0.43172628304821153, + "grad_norm": 1.5322151184082031, + "learning_rate": 1.7000000000000002e-06, + "logits/chosen": -0.03931673988699913, + "logits/rejected": -0.03220512345433235, + "logps/chosen": -371.24127197265625, + "logps/rejected": -477.2238464355469, + "loss": 0.009, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.191614627838135, + "rewards/margins": 11.550434112548828, + "rewards/rejected": -16.742048263549805, + "step": 694 + }, + { + "epoch": 0.432348367029549, + "grad_norm": 0.016582980751991272, + "learning_rate": 1.6944444444444446e-06, + "logits/chosen": -0.12472251057624817, + "logits/rejected": -0.21175099909305573, + "logps/chosen": -272.5353698730469, + "logps/rejected": -505.7217102050781, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.813591241836548, + "rewards/margins": 14.684088706970215, + "rewards/rejected": -18.497678756713867, + "step": 695 + }, + { + "epoch": 0.4329704510108865, + "grad_norm": 0.004617233294993639, + "learning_rate": 1.688888888888889e-06, + "logits/chosen": -0.10770875960588455, + "logits/rejected": -0.22447675466537476, + "logps/chosen": -328.5132751464844, + "logps/rejected": -731.626953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.059587001800537, + "rewards/margins": 15.51219654083252, + "rewards/rejected": -20.5717830657959, + "step": 696 + }, + { + "epoch": 0.43359253499222394, + "grad_norm": 0.9247336983680725, + "learning_rate": 1.6833333333333335e-06, + "logits/chosen": -0.21552152931690216, + "logits/rejected": -0.27845749258995056, + "logps/chosen": -403.65625, + "logps/rejected": -580.2930908203125, + "loss": 0.0114, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.468318939208984, + "rewards/margins": 10.941831588745117, + "rewards/rejected": -15.410150527954102, + "step": 697 + }, + { + "epoch": 0.43421461897356145, + "grad_norm": 0.8505792617797852, + "learning_rate": 1.6777777777777779e-06, + "logits/chosen": -0.12612029910087585, + "logits/rejected": -0.13450108468532562, + "logps/chosen": -426.95977783203125, + "logps/rejected": -484.1307373046875, + "loss": 0.009, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.076811790466309, + "rewards/margins": 10.43472671508789, + "rewards/rejected": -15.511537551879883, + "step": 698 + }, + { + "epoch": 0.4348367029548989, + "grad_norm": 1.4853614568710327, + "learning_rate": 1.6722222222222223e-06, + "logits/chosen": -0.010052400641143322, + "logits/rejected": -0.15936283767223358, + "logps/chosen": -236.47901916503906, + "logps/rejected": -472.603515625, + "loss": 0.0275, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7972493171691895, + "rewards/margins": 10.760887145996094, + "rewards/rejected": -13.558135986328125, + "step": 699 + }, + { + "epoch": 0.4354587869362364, + "grad_norm": 0.023863688111305237, + "learning_rate": 1.6666666666666667e-06, + "logits/chosen": -0.015409186482429504, + "logits/rejected": -0.17830517888069153, + "logps/chosen": -219.58987426757812, + "logps/rejected": -549.2088012695312, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1944127082824707, + "rewards/margins": 13.901273727416992, + "rewards/rejected": -17.095685958862305, + "step": 700 + }, + { + "epoch": 0.43608087091757386, + "grad_norm": 0.7308242917060852, + "learning_rate": 1.661111111111111e-06, + "logits/chosen": 0.06572002172470093, + "logits/rejected": -0.12573106586933136, + "logps/chosen": -253.5244140625, + "logps/rejected": -620.3221435546875, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.939428329467773, + "rewards/margins": 14.805521011352539, + "rewards/rejected": -21.74494743347168, + "step": 701 + }, + { + "epoch": 0.43670295489891137, + "grad_norm": 0.1737741380929947, + "learning_rate": 1.6555555555555559e-06, + "logits/chosen": -0.13788293302059174, + "logits/rejected": -0.21947264671325684, + "logps/chosen": -349.48468017578125, + "logps/rejected": -616.4808959960938, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.1379170417785645, + "rewards/margins": 12.968765258789062, + "rewards/rejected": -18.10668182373047, + "step": 702 + }, + { + "epoch": 0.4373250388802488, + "grad_norm": 0.03152701258659363, + "learning_rate": 1.6500000000000003e-06, + "logits/chosen": -0.1375151127576828, + "logits/rejected": -0.25498446822166443, + "logps/chosen": -266.0514221191406, + "logps/rejected": -521.8695068359375, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.075312852859497, + "rewards/margins": 15.80811882019043, + "rewards/rejected": -17.88343048095703, + "step": 703 + }, + { + "epoch": 0.4379471228615863, + "grad_norm": 0.6338666677474976, + "learning_rate": 1.6444444444444447e-06, + "logits/chosen": -0.13791918754577637, + "logits/rejected": -0.18672017753124237, + "logps/chosen": -563.3693237304688, + "logps/rejected": -707.0787353515625, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.59062385559082, + "rewards/margins": 10.377401351928711, + "rewards/rejected": -15.968025207519531, + "step": 704 + }, + { + "epoch": 0.4385692068429238, + "grad_norm": 0.3461260199546814, + "learning_rate": 1.638888888888889e-06, + "logits/chosen": -0.07948113977909088, + "logits/rejected": -0.19342201948165894, + "logps/chosen": -344.34112548828125, + "logps/rejected": -572.6790161132812, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.457808971405029, + "rewards/margins": 11.12295150756836, + "rewards/rejected": -18.580760955810547, + "step": 705 + }, + { + "epoch": 0.4391912908242613, + "grad_norm": 3.90309739112854, + "learning_rate": 1.6333333333333335e-06, + "logits/chosen": -0.08926865458488464, + "logits/rejected": -0.13055631518363953, + "logps/chosen": -359.5900573730469, + "logps/rejected": -460.8412170410156, + "loss": 0.0304, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.787007808685303, + "rewards/margins": 11.419514656066895, + "rewards/rejected": -17.20652198791504, + "step": 706 + }, + { + "epoch": 0.43981337480559873, + "grad_norm": 0.016811877489089966, + "learning_rate": 1.627777777777778e-06, + "logits/chosen": -0.14071565866470337, + "logits/rejected": -0.21861140429973602, + "logps/chosen": -417.37896728515625, + "logps/rejected": -655.8318481445312, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.475986480712891, + "rewards/margins": 14.201498985290527, + "rewards/rejected": -20.677486419677734, + "step": 707 + }, + { + "epoch": 0.44043545878693624, + "grad_norm": 0.026484820991754532, + "learning_rate": 1.6222222222222223e-06, + "logits/chosen": -0.08523029088973999, + "logits/rejected": -0.1710880994796753, + "logps/chosen": -189.8123779296875, + "logps/rejected": -450.3400573730469, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.080999374389648, + "rewards/margins": 13.761738777160645, + "rewards/rejected": -18.84273910522461, + "step": 708 + }, + { + "epoch": 0.4410575427682737, + "grad_norm": 0.7414613962173462, + "learning_rate": 1.6166666666666667e-06, + "logits/chosen": 0.044315554201602936, + "logits/rejected": -0.09654340893030167, + "logps/chosen": -239.44342041015625, + "logps/rejected": -616.7940673828125, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.406595706939697, + "rewards/margins": 16.82744598388672, + "rewards/rejected": -21.23404312133789, + "step": 709 + }, + { + "epoch": 0.4416796267496112, + "grad_norm": 0.44611403346061707, + "learning_rate": 1.6111111111111113e-06, + "logits/chosen": -0.04723618924617767, + "logits/rejected": -0.09528446197509766, + "logps/chosen": -549.2506103515625, + "logps/rejected": -586.6680908203125, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.617132663726807, + "rewards/margins": 12.169167518615723, + "rewards/rejected": -16.786300659179688, + "step": 710 + }, + { + "epoch": 0.4423017107309487, + "grad_norm": 0.1348663866519928, + "learning_rate": 1.6055555555555557e-06, + "logits/chosen": -0.11756815016269684, + "logits/rejected": -0.2050608992576599, + "logps/chosen": -397.72686767578125, + "logps/rejected": -634.6549072265625, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.027783393859863, + "rewards/margins": 14.490629196166992, + "rewards/rejected": -20.518413543701172, + "step": 711 + }, + { + "epoch": 0.44292379471228616, + "grad_norm": 2.00345516204834, + "learning_rate": 1.6000000000000001e-06, + "logits/chosen": -0.17369945347309113, + "logits/rejected": -0.2516752779483795, + "logps/chosen": -293.000244140625, + "logps/rejected": -502.453857421875, + "loss": 0.0326, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9230446815490723, + "rewards/margins": 12.903790473937988, + "rewards/rejected": -15.826835632324219, + "step": 712 + }, + { + "epoch": 0.44354587869362366, + "grad_norm": 0.0026191978249698877, + "learning_rate": 1.5944444444444445e-06, + "logits/chosen": 0.04210498183965683, + "logits/rejected": -0.07989753037691116, + "logps/chosen": -241.44775390625, + "logps/rejected": -557.5899047851562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.620311737060547, + "rewards/margins": 14.429510116577148, + "rewards/rejected": -19.049819946289062, + "step": 713 + }, + { + "epoch": 0.4441679626749611, + "grad_norm": 4.78471040725708, + "learning_rate": 1.588888888888889e-06, + "logits/chosen": -0.07693975418806076, + "logits/rejected": -0.23134349286556244, + "logps/chosen": -220.2307586669922, + "logps/rejected": -496.198974609375, + "loss": 0.0329, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.392569541931152, + "rewards/margins": 12.644582748413086, + "rewards/rejected": -17.037151336669922, + "step": 714 + }, + { + "epoch": 0.4447900466562986, + "grad_norm": 0.11427666991949081, + "learning_rate": 1.5833333333333333e-06, + "logits/chosen": -0.04946906864643097, + "logits/rejected": -0.11771225929260254, + "logps/chosen": -350.3536682128906, + "logps/rejected": -532.328369140625, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.364938259124756, + "rewards/margins": 11.28852653503418, + "rewards/rejected": -17.65346336364746, + "step": 715 + }, + { + "epoch": 0.4454121306376361, + "grad_norm": 0.400550901889801, + "learning_rate": 1.5777777777777778e-06, + "logits/chosen": -0.09401246905326843, + "logits/rejected": -0.21787378191947937, + "logps/chosen": -379.552001953125, + "logps/rejected": -548.9990234375, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.749712944030762, + "rewards/margins": 9.92713451385498, + "rewards/rejected": -16.676849365234375, + "step": 716 + }, + { + "epoch": 0.4460342146189736, + "grad_norm": 0.005562597885727882, + "learning_rate": 1.5722222222222226e-06, + "logits/chosen": -0.06001667305827141, + "logits/rejected": -0.1688457727432251, + "logps/chosen": -267.3535461425781, + "logps/rejected": -560.7206420898438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.820340156555176, + "rewards/margins": 15.118245124816895, + "rewards/rejected": -18.93858528137207, + "step": 717 + }, + { + "epoch": 0.44665629860031103, + "grad_norm": 6.150920867919922, + "learning_rate": 1.566666666666667e-06, + "logits/chosen": -0.187502920627594, + "logits/rejected": -0.28694209456443787, + "logps/chosen": -364.0586242675781, + "logps/rejected": -590.85791015625, + "loss": 0.0761, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.922880172729492, + "rewards/margins": 7.912856101989746, + "rewards/rejected": -12.835737228393555, + "step": 718 + }, + { + "epoch": 0.44727838258164854, + "grad_norm": 0.08995234966278076, + "learning_rate": 1.5611111111111114e-06, + "logits/chosen": -0.1185571700334549, + "logits/rejected": -0.22074072062969208, + "logps/chosen": -251.9179229736328, + "logps/rejected": -674.1341552734375, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.531166076660156, + "rewards/margins": 14.85726547241211, + "rewards/rejected": -19.388431549072266, + "step": 719 + }, + { + "epoch": 0.447900466562986, + "grad_norm": 5.418034553527832, + "learning_rate": 1.5555555555555558e-06, + "logits/chosen": -0.1919911503791809, + "logits/rejected": -0.22939085960388184, + "logps/chosen": -308.644287109375, + "logps/rejected": -500.6492919921875, + "loss": 0.095, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.500028610229492, + "rewards/margins": 6.765000820159912, + "rewards/rejected": -12.265029907226562, + "step": 720 + }, + { + "epoch": 0.4485225505443235, + "grad_norm": 13.279927253723145, + "learning_rate": 1.5500000000000002e-06, + "logits/chosen": -0.2112463414669037, + "logits/rejected": -0.2748299837112427, + "logps/chosen": -696.64501953125, + "logps/rejected": -728.1998291015625, + "loss": 0.2761, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.196961402893066, + "rewards/margins": 9.402154922485352, + "rewards/rejected": -13.599115371704102, + "step": 721 + }, + { + "epoch": 0.44914463452566095, + "grad_norm": 9.740560531616211, + "learning_rate": 1.5444444444444446e-06, + "logits/chosen": -0.1563461571931839, + "logits/rejected": -0.14186030626296997, + "logps/chosen": -423.80474853515625, + "logps/rejected": -464.29046630859375, + "loss": 0.2228, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.8481202125549316, + "rewards/margins": 10.102777481079102, + "rewards/rejected": -13.950897216796875, + "step": 722 + }, + { + "epoch": 0.44976671850699845, + "grad_norm": 0.5665701031684875, + "learning_rate": 1.538888888888889e-06, + "logits/chosen": -0.23303410410881042, + "logits/rejected": -0.28530117869377136, + "logps/chosen": -385.8143615722656, + "logps/rejected": -590.6848754882812, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.326517581939697, + "rewards/margins": 9.872583389282227, + "rewards/rejected": -16.199100494384766, + "step": 723 + }, + { + "epoch": 0.4503888024883359, + "grad_norm": 4.69204568862915, + "learning_rate": 1.5333333333333334e-06, + "logits/chosen": -0.17243418097496033, + "logits/rejected": -0.2365702986717224, + "logps/chosen": -503.00958251953125, + "logps/rejected": -710.9443359375, + "loss": 0.038, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.410350799560547, + "rewards/margins": 13.743621826171875, + "rewards/rejected": -18.153972625732422, + "step": 724 + }, + { + "epoch": 0.4510108864696734, + "grad_norm": 9.430375099182129, + "learning_rate": 1.527777777777778e-06, + "logits/chosen": -0.10459433495998383, + "logits/rejected": -0.13301539421081543, + "logps/chosen": -617.29931640625, + "logps/rejected": -744.34912109375, + "loss": 0.1591, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.9423346519470215, + "rewards/margins": 14.188889503479004, + "rewards/rejected": -20.131223678588867, + "step": 725 + }, + { + "epoch": 0.45163297045101086, + "grad_norm": 0.031052909791469574, + "learning_rate": 1.5222222222222224e-06, + "logits/chosen": -0.0109294094145298, + "logits/rejected": -0.11231635510921478, + "logps/chosen": -307.6101379394531, + "logps/rejected": -538.7658081054688, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.147153377532959, + "rewards/margins": 11.208839416503906, + "rewards/rejected": -16.355993270874023, + "step": 726 + }, + { + "epoch": 0.45225505443234837, + "grad_norm": 9.497825622558594, + "learning_rate": 1.5166666666666668e-06, + "logits/chosen": -0.23131603002548218, + "logits/rejected": -0.26031461358070374, + "logps/chosen": -487.10406494140625, + "logps/rejected": -618.082763671875, + "loss": 0.179, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.126858234405518, + "rewards/margins": 8.741113662719727, + "rewards/rejected": -14.867971420288086, + "step": 727 + }, + { + "epoch": 0.4528771384136858, + "grad_norm": 7.282534122467041, + "learning_rate": 1.5111111111111112e-06, + "logits/chosen": -0.1393272578716278, + "logits/rejected": -0.19980961084365845, + "logps/chosen": -424.136962890625, + "logps/rejected": -647.45654296875, + "loss": 0.1549, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.702028274536133, + "rewards/margins": 11.997804641723633, + "rewards/rejected": -17.699832916259766, + "step": 728 + }, + { + "epoch": 0.4534992223950233, + "grad_norm": 0.1490292251110077, + "learning_rate": 1.5055555555555556e-06, + "logits/chosen": -0.06625983119010925, + "logits/rejected": -0.1540638506412506, + "logps/chosen": -409.71929931640625, + "logps/rejected": -577.4666748046875, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.085414886474609, + "rewards/margins": 10.824784278869629, + "rewards/rejected": -14.910199165344238, + "step": 729 + }, + { + "epoch": 0.45412130637636083, + "grad_norm": 0.026190560311079025, + "learning_rate": 1.5e-06, + "logits/chosen": -0.019478559494018555, + "logits/rejected": -0.1690959483385086, + "logps/chosen": -324.0633239746094, + "logps/rejected": -659.6967163085938, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.009406089782715, + "rewards/margins": 15.281152725219727, + "rewards/rejected": -19.290559768676758, + "step": 730 + }, + { + "epoch": 0.4547433903576983, + "grad_norm": 0.26904934644699097, + "learning_rate": 1.4944444444444444e-06, + "logits/chosen": -0.11029868572950363, + "logits/rejected": -0.26389309763908386, + "logps/chosen": -288.8099060058594, + "logps/rejected": -632.843994140625, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.770277500152588, + "rewards/margins": 12.765262603759766, + "rewards/rejected": -16.535539627075195, + "step": 731 + }, + { + "epoch": 0.4553654743390358, + "grad_norm": 0.003615317866206169, + "learning_rate": 1.4888888888888888e-06, + "logits/chosen": -0.02934402972459793, + "logits/rejected": -0.1376732885837555, + "logps/chosen": -196.23678588867188, + "logps/rejected": -552.7225341796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.232180595397949, + "rewards/margins": 14.447599411010742, + "rewards/rejected": -18.679779052734375, + "step": 732 + }, + { + "epoch": 0.45598755832037324, + "grad_norm": 0.11386875063180923, + "learning_rate": 1.4833333333333337e-06, + "logits/chosen": -0.14048852026462555, + "logits/rejected": -0.19948913156986237, + "logps/chosen": -325.22650146484375, + "logps/rejected": -562.0709228515625, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.173778533935547, + "rewards/margins": 12.442522048950195, + "rewards/rejected": -15.616301536560059, + "step": 733 + }, + { + "epoch": 0.45660964230171075, + "grad_norm": 1.618320345878601, + "learning_rate": 1.477777777777778e-06, + "logits/chosen": -0.10951582342386246, + "logits/rejected": -0.2571600377559662, + "logps/chosen": -264.5402526855469, + "logps/rejected": -586.5535278320312, + "loss": 0.0251, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.42150354385376, + "rewards/margins": 11.112964630126953, + "rewards/rejected": -15.534467697143555, + "step": 734 + }, + { + "epoch": 0.4572317262830482, + "grad_norm": 0.17494763433933258, + "learning_rate": 1.4722222222222225e-06, + "logits/chosen": -0.215254008769989, + "logits/rejected": -0.1988726556301117, + "logps/chosen": -167.96499633789062, + "logps/rejected": -375.2445068359375, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.121342897415161, + "rewards/margins": 10.723258018493652, + "rewards/rejected": -13.84460163116455, + "step": 735 + }, + { + "epoch": 0.4578538102643857, + "grad_norm": 0.2510223388671875, + "learning_rate": 1.4666666666666669e-06, + "logits/chosen": -0.050418347120285034, + "logits/rejected": -0.18208985030651093, + "logps/chosen": -309.9395446777344, + "logps/rejected": -540.1434326171875, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.7002787590026855, + "rewards/margins": 11.213287353515625, + "rewards/rejected": -16.913564682006836, + "step": 736 + }, + { + "epoch": 0.45847589424572316, + "grad_norm": 0.26509401202201843, + "learning_rate": 1.4611111111111113e-06, + "logits/chosen": -0.15048304200172424, + "logits/rejected": -0.16428953409194946, + "logps/chosen": -340.8234558105469, + "logps/rejected": -550.6556396484375, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.12544059753418, + "rewards/margins": 12.557748794555664, + "rewards/rejected": -19.68318748474121, + "step": 737 + }, + { + "epoch": 0.45909797822706067, + "grad_norm": 0.09233911335468292, + "learning_rate": 1.4555555555555557e-06, + "logits/chosen": -0.07279112935066223, + "logits/rejected": -0.269211083650589, + "logps/chosen": -244.86083984375, + "logps/rejected": -649.9526977539062, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.796719789505005, + "rewards/margins": 13.512633323669434, + "rewards/rejected": -17.30935287475586, + "step": 738 + }, + { + "epoch": 0.4597200622083981, + "grad_norm": 10.029057502746582, + "learning_rate": 1.45e-06, + "logits/chosen": -0.050769902765750885, + "logits/rejected": -0.19179697334766388, + "logps/chosen": -323.5790710449219, + "logps/rejected": -463.0047912597656, + "loss": 0.1115, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.586523056030273, + "rewards/margins": 9.222599983215332, + "rewards/rejected": -13.809123992919922, + "step": 739 + }, + { + "epoch": 0.4603421461897356, + "grad_norm": 2.1069769859313965, + "learning_rate": 1.4444444444444445e-06, + "logits/chosen": -0.18043649196624756, + "logits/rejected": -0.19845055043697357, + "logps/chosen": -391.2398376464844, + "logps/rejected": -504.6783752441406, + "loss": 0.0169, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.988456726074219, + "rewards/margins": 9.624873161315918, + "rewards/rejected": -15.613329887390137, + "step": 740 + }, + { + "epoch": 0.4609642301710731, + "grad_norm": 0.01782449521124363, + "learning_rate": 1.4388888888888891e-06, + "logits/chosen": -0.0005922671407461166, + "logits/rejected": -0.22645621001720428, + "logps/chosen": -200.54518127441406, + "logps/rejected": -678.3786010742188, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5418107509613037, + "rewards/margins": 17.46556854248047, + "rewards/rejected": -21.00737953186035, + "step": 741 + }, + { + "epoch": 0.4615863141524106, + "grad_norm": 0.15225456655025482, + "learning_rate": 1.4333333333333335e-06, + "logits/chosen": -0.1659916192293167, + "logits/rejected": -0.24109870195388794, + "logps/chosen": -325.1512451171875, + "logps/rejected": -613.4889526367188, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.81900691986084, + "rewards/margins": 12.071318626403809, + "rewards/rejected": -16.890323638916016, + "step": 742 + }, + { + "epoch": 0.46220839813374803, + "grad_norm": 9.510211944580078, + "learning_rate": 1.427777777777778e-06, + "logits/chosen": -0.09520632773637772, + "logits/rejected": -0.16343779861927032, + "logps/chosen": -220.83074951171875, + "logps/rejected": -436.3971252441406, + "loss": 0.1104, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.927903175354004, + "rewards/margins": 10.406707763671875, + "rewards/rejected": -14.334610939025879, + "step": 743 + }, + { + "epoch": 0.46283048211508554, + "grad_norm": 0.001954207429662347, + "learning_rate": 1.4222222222222223e-06, + "logits/chosen": -0.11868180334568024, + "logits/rejected": -0.20967280864715576, + "logps/chosen": -326.1553649902344, + "logps/rejected": -561.0624389648438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.020052909851074, + "rewards/margins": 14.854504585266113, + "rewards/rejected": -19.87455940246582, + "step": 744 + }, + { + "epoch": 0.463452566096423, + "grad_norm": 7.722990989685059, + "learning_rate": 1.4166666666666667e-06, + "logits/chosen": -0.03333606198430061, + "logits/rejected": -0.18697881698608398, + "logps/chosen": -209.54664611816406, + "logps/rejected": -514.886474609375, + "loss": 0.1154, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.703244209289551, + "rewards/margins": 16.089412689208984, + "rewards/rejected": -19.79265594482422, + "step": 745 + }, + { + "epoch": 0.4640746500777605, + "grad_norm": 1.7064536809921265, + "learning_rate": 1.4111111111111111e-06, + "logits/chosen": -0.07485972344875336, + "logits/rejected": -0.14361241459846497, + "logps/chosen": -276.5263671875, + "logps/rejected": -565.6427612304688, + "loss": 0.0183, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.051205635070801, + "rewards/margins": 12.139596939086914, + "rewards/rejected": -17.1908016204834, + "step": 746 + }, + { + "epoch": 0.464696734059098, + "grad_norm": 0.017171675339341164, + "learning_rate": 1.4055555555555555e-06, + "logits/chosen": -0.13880085945129395, + "logits/rejected": -0.2007388174533844, + "logps/chosen": -213.27394104003906, + "logps/rejected": -469.6338806152344, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.129637718200684, + "rewards/margins": 13.070512771606445, + "rewards/rejected": -18.200149536132812, + "step": 747 + }, + { + "epoch": 0.46531881804043546, + "grad_norm": 10.123662948608398, + "learning_rate": 1.4000000000000001e-06, + "logits/chosen": 0.03683340549468994, + "logits/rejected": -0.08564390987157822, + "logps/chosen": -316.3659362792969, + "logps/rejected": -617.3487548828125, + "loss": 0.0906, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.669050931930542, + "rewards/margins": 12.815457344055176, + "rewards/rejected": -16.484508514404297, + "step": 748 + }, + { + "epoch": 0.46594090202177296, + "grad_norm": 3.5007524490356445, + "learning_rate": 1.3944444444444446e-06, + "logits/chosen": -0.0997939258813858, + "logits/rejected": -0.19426652789115906, + "logps/chosen": -392.793701171875, + "logps/rejected": -587.8896484375, + "loss": 0.0453, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.254166126251221, + "rewards/margins": 12.482680320739746, + "rewards/rejected": -18.736846923828125, + "step": 749 + }, + { + "epoch": 0.4665629860031104, + "grad_norm": 13.680143356323242, + "learning_rate": 1.3888888888888892e-06, + "logits/chosen": -0.10629221796989441, + "logits/rejected": -0.1759316772222519, + "logps/chosen": -488.7080078125, + "logps/rejected": -602.2348022460938, + "loss": 0.2011, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.634420871734619, + "rewards/margins": 8.620626449584961, + "rewards/rejected": -13.255046844482422, + "step": 750 + }, + { + "epoch": 0.4671850699844479, + "grad_norm": 1.0197296142578125, + "learning_rate": 1.3833333333333336e-06, + "logits/chosen": -0.2175053060054779, + "logits/rejected": -0.24155527353286743, + "logps/chosen": -413.7650146484375, + "logps/rejected": -479.35406494140625, + "loss": 0.009, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.766584396362305, + "rewards/margins": 11.36799430847168, + "rewards/rejected": -16.134578704833984, + "step": 751 + }, + { + "epoch": 0.46780715396578537, + "grad_norm": 0.1404423862695694, + "learning_rate": 1.377777777777778e-06, + "logits/chosen": -0.20662929117679596, + "logits/rejected": -0.2549915313720703, + "logps/chosen": -309.4803771972656, + "logps/rejected": -382.5816650390625, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.063450336456299, + "rewards/margins": 7.605124473571777, + "rewards/rejected": -12.668575286865234, + "step": 752 + }, + { + "epoch": 0.4684292379471229, + "grad_norm": 7.706900596618652, + "learning_rate": 1.3722222222222224e-06, + "logits/chosen": -0.14855460822582245, + "logits/rejected": -0.29836156964302063, + "logps/chosen": -379.81494140625, + "logps/rejected": -657.1109619140625, + "loss": 0.0846, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.31683349609375, + "rewards/margins": 9.868423461914062, + "rewards/rejected": -13.185256958007812, + "step": 753 + }, + { + "epoch": 0.46905132192846033, + "grad_norm": 6.5629072189331055, + "learning_rate": 1.3666666666666668e-06, + "logits/chosen": -0.10302937030792236, + "logits/rejected": -0.15600721538066864, + "logps/chosen": -336.8003845214844, + "logps/rejected": -503.1877136230469, + "loss": 0.0328, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.324923515319824, + "rewards/margins": 10.662532806396484, + "rewards/rejected": -14.987455368041992, + "step": 754 + }, + { + "epoch": 0.46967340590979784, + "grad_norm": 0.711733877658844, + "learning_rate": 1.3611111111111112e-06, + "logits/chosen": -0.12427747994661331, + "logits/rejected": -0.19206631183624268, + "logps/chosen": -430.4934997558594, + "logps/rejected": -626.50537109375, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.4360785484313965, + "rewards/margins": 10.157730102539062, + "rewards/rejected": -16.593807220458984, + "step": 755 + }, + { + "epoch": 0.4702954898911353, + "grad_norm": 3.854640245437622, + "learning_rate": 1.3555555555555558e-06, + "logits/chosen": -0.17316797375679016, + "logits/rejected": -0.22901447117328644, + "logps/chosen": -317.1405029296875, + "logps/rejected": -493.6657409667969, + "loss": 0.0454, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.143527030944824, + "rewards/margins": 9.394670486450195, + "rewards/rejected": -13.538196563720703, + "step": 756 + }, + { + "epoch": 0.4709175738724728, + "grad_norm": 3.0120458602905273, + "learning_rate": 1.3500000000000002e-06, + "logits/chosen": -0.2033642828464508, + "logits/rejected": -0.23598918318748474, + "logps/chosen": -493.49737548828125, + "logps/rejected": -628.5738525390625, + "loss": 0.0193, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.72686243057251, + "rewards/margins": 9.371174812316895, + "rewards/rejected": -14.098036766052246, + "step": 757 + }, + { + "epoch": 0.47153965785381025, + "grad_norm": 1.372122049331665, + "learning_rate": 1.3444444444444446e-06, + "logits/chosen": -0.22071154415607452, + "logits/rejected": -0.22147798538208008, + "logps/chosen": -453.8293151855469, + "logps/rejected": -617.3856201171875, + "loss": 0.0133, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.355320930480957, + "rewards/margins": 10.091093063354492, + "rewards/rejected": -15.44641399383545, + "step": 758 + }, + { + "epoch": 0.47216174183514775, + "grad_norm": 4.308305263519287, + "learning_rate": 1.338888888888889e-06, + "logits/chosen": -0.13364093005657196, + "logits/rejected": -0.14899829030036926, + "logps/chosen": -367.8165283203125, + "logps/rejected": -663.4675903320312, + "loss": 0.0387, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.05575704574585, + "rewards/margins": 9.636533737182617, + "rewards/rejected": -14.692291259765625, + "step": 759 + }, + { + "epoch": 0.4727838258164852, + "grad_norm": 8.815164566040039, + "learning_rate": 1.3333333333333334e-06, + "logits/chosen": -0.12688037753105164, + "logits/rejected": -0.19369713962078094, + "logps/chosen": -411.9972229003906, + "logps/rejected": -557.9375, + "loss": 0.1034, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.2347002029418945, + "rewards/margins": 10.083131790161133, + "rewards/rejected": -14.317832946777344, + "step": 760 + }, + { + "epoch": 0.4734059097978227, + "grad_norm": 19.365949630737305, + "learning_rate": 1.3277777777777778e-06, + "logits/chosen": -0.03429656848311424, + "logits/rejected": -0.09788615256547928, + "logps/chosen": -487.744140625, + "logps/rejected": -574.6835327148438, + "loss": 0.5827, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.715859889984131, + "rewards/margins": 7.6128716468811035, + "rewards/rejected": -13.328731536865234, + "step": 761 + }, + { + "epoch": 0.47402799377916016, + "grad_norm": 5.826582908630371, + "learning_rate": 1.3222222222222222e-06, + "logits/chosen": -0.1900830864906311, + "logits/rejected": -0.22969363629817963, + "logps/chosen": -301.0557556152344, + "logps/rejected": -447.8843994140625, + "loss": 0.1787, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.013862133026123, + "rewards/margins": 7.629651069641113, + "rewards/rejected": -12.643513679504395, + "step": 762 + }, + { + "epoch": 0.47465007776049767, + "grad_norm": 0.004037719685584307, + "learning_rate": 1.3166666666666666e-06, + "logits/chosen": -0.18360383808612823, + "logits/rejected": -0.17849460244178772, + "logps/chosen": -362.8299560546875, + "logps/rejected": -635.7446899414062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.390879154205322, + "rewards/margins": 16.14934539794922, + "rewards/rejected": -20.540225982666016, + "step": 763 + }, + { + "epoch": 0.4752721617418352, + "grad_norm": 10.646064758300781, + "learning_rate": 1.3111111111111112e-06, + "logits/chosen": -0.16234049201011658, + "logits/rejected": -0.16542690992355347, + "logps/chosen": -491.10882568359375, + "logps/rejected": -594.88720703125, + "loss": 0.1582, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.262134075164795, + "rewards/margins": 8.791068077087402, + "rewards/rejected": -14.053202629089355, + "step": 764 + }, + { + "epoch": 0.4758942457231726, + "grad_norm": 5.2629170417785645, + "learning_rate": 1.3055555555555556e-06, + "logits/chosen": -0.18621662259101868, + "logits/rejected": -0.21489739418029785, + "logps/chosen": -415.2783203125, + "logps/rejected": -564.4843139648438, + "loss": 0.1087, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.421874523162842, + "rewards/margins": 10.853161811828613, + "rewards/rejected": -16.275035858154297, + "step": 765 + }, + { + "epoch": 0.47651632970451013, + "grad_norm": 0.6087837219238281, + "learning_rate": 1.3e-06, + "logits/chosen": -0.061621908098459244, + "logits/rejected": -0.18054337799549103, + "logps/chosen": -572.9510498046875, + "logps/rejected": -739.3956298828125, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9321579933166504, + "rewards/margins": 12.54207992553711, + "rewards/rejected": -16.474239349365234, + "step": 766 + }, + { + "epoch": 0.4771384136858476, + "grad_norm": 5.500948905944824, + "learning_rate": 1.2944444444444447e-06, + "logits/chosen": -0.0754532516002655, + "logits/rejected": -0.1645621359348297, + "logps/chosen": -441.9910583496094, + "logps/rejected": -495.4376525878906, + "loss": 0.0735, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.62314510345459, + "rewards/margins": 7.8088250160217285, + "rewards/rejected": -13.431968688964844, + "step": 767 + }, + { + "epoch": 0.4777604976671851, + "grad_norm": 0.011328157968819141, + "learning_rate": 1.288888888888889e-06, + "logits/chosen": -0.18844503164291382, + "logits/rejected": -0.26246243715286255, + "logps/chosen": -469.8314208984375, + "logps/rejected": -629.7899169921875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.418628215789795, + "rewards/margins": 11.904468536376953, + "rewards/rejected": -15.32309627532959, + "step": 768 + }, + { + "epoch": 0.47838258164852254, + "grad_norm": 8.79350757598877, + "learning_rate": 1.2833333333333335e-06, + "logits/chosen": -0.1111457496881485, + "logits/rejected": -0.1931067854166031, + "logps/chosen": -306.70623779296875, + "logps/rejected": -357.4154052734375, + "loss": 0.0803, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.580134391784668, + "rewards/margins": 8.642915725708008, + "rewards/rejected": -12.22304916381836, + "step": 769 + }, + { + "epoch": 0.47900466562986005, + "grad_norm": 20.364259719848633, + "learning_rate": 1.2777777777777779e-06, + "logits/chosen": -0.10649178922176361, + "logits/rejected": -0.1479824185371399, + "logps/chosen": -544.820068359375, + "logps/rejected": -541.74462890625, + "loss": 0.3863, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.634666919708252, + "rewards/margins": 7.190972328186035, + "rewards/rejected": -12.825638771057129, + "step": 770 + }, + { + "epoch": 0.4796267496111975, + "grad_norm": 0.7768333554267883, + "learning_rate": 1.2722222222222223e-06, + "logits/chosen": -0.05986177921295166, + "logits/rejected": -0.1937805712223053, + "logps/chosen": -411.20916748046875, + "logps/rejected": -617.9605102539062, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.152901649475098, + "rewards/margins": 12.503531455993652, + "rewards/rejected": -16.65643310546875, + "step": 771 + }, + { + "epoch": 0.480248833592535, + "grad_norm": 3.426276206970215, + "learning_rate": 1.2666666666666669e-06, + "logits/chosen": -0.2028035819530487, + "logits/rejected": -0.27828484773635864, + "logps/chosen": -352.870849609375, + "logps/rejected": -562.242431640625, + "loss": 0.0602, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8357479572296143, + "rewards/margins": 10.26229476928711, + "rewards/rejected": -14.098043441772461, + "step": 772 + }, + { + "epoch": 0.48087091757387246, + "grad_norm": 0.009728431701660156, + "learning_rate": 1.2611111111111113e-06, + "logits/chosen": -0.12964648008346558, + "logits/rejected": -0.21467560529708862, + "logps/chosen": -403.47283935546875, + "logps/rejected": -570.8280029296875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.723182201385498, + "rewards/margins": 12.088554382324219, + "rewards/rejected": -15.811737060546875, + "step": 773 + }, + { + "epoch": 0.48149300155520997, + "grad_norm": 0.013007402420043945, + "learning_rate": 1.2555555555555557e-06, + "logits/chosen": -0.1980336606502533, + "logits/rejected": -0.23050229251384735, + "logps/chosen": -468.044677734375, + "logps/rejected": -662.9429931640625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.649184226989746, + "rewards/margins": 13.730168342590332, + "rewards/rejected": -17.379352569580078, + "step": 774 + }, + { + "epoch": 0.4821150855365474, + "grad_norm": 0.026373110711574554, + "learning_rate": 1.25e-06, + "logits/chosen": -0.19756723940372467, + "logits/rejected": -0.28129222989082336, + "logps/chosen": -305.7901611328125, + "logps/rejected": -555.2416381835938, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8107564449310303, + "rewards/margins": 11.472835540771484, + "rewards/rejected": -14.28359317779541, + "step": 775 + }, + { + "epoch": 0.4827371695178849, + "grad_norm": 14.28516960144043, + "learning_rate": 1.2444444444444445e-06, + "logits/chosen": -0.12472839653491974, + "logits/rejected": -0.1902036964893341, + "logps/chosen": -494.14044189453125, + "logps/rejected": -627.536865234375, + "loss": 0.2843, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.056323051452637, + "rewards/margins": 13.423925399780273, + "rewards/rejected": -17.480247497558594, + "step": 776 + }, + { + "epoch": 0.4833592534992224, + "grad_norm": 3.762486219406128, + "learning_rate": 1.2388888888888891e-06, + "logits/chosen": -0.12436408549547195, + "logits/rejected": -0.20738908648490906, + "logps/chosen": -412.3399963378906, + "logps/rejected": -689.9681396484375, + "loss": 0.0359, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.784951686859131, + "rewards/margins": 10.317832946777344, + "rewards/rejected": -15.102785110473633, + "step": 777 + }, + { + "epoch": 0.4839813374805599, + "grad_norm": 1.431050181388855, + "learning_rate": 1.2333333333333335e-06, + "logits/chosen": 0.038748402148485184, + "logits/rejected": -0.08617618680000305, + "logps/chosen": -484.8975830078125, + "logps/rejected": -514.212158203125, + "loss": 0.0148, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7106587886810303, + "rewards/margins": 9.949796676635742, + "rewards/rejected": -12.660454750061035, + "step": 778 + }, + { + "epoch": 0.48460342146189733, + "grad_norm": 0.004360859282314777, + "learning_rate": 1.227777777777778e-06, + "logits/chosen": -0.06922930479049683, + "logits/rejected": -0.24433544278144836, + "logps/chosen": -319.2720031738281, + "logps/rejected": -652.229248046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.619636058807373, + "rewards/margins": 15.295748710632324, + "rewards/rejected": -19.91538429260254, + "step": 779 + }, + { + "epoch": 0.48522550544323484, + "grad_norm": 0.968350350856781, + "learning_rate": 1.2222222222222223e-06, + "logits/chosen": -0.1112077534198761, + "logits/rejected": -0.24374963343143463, + "logps/chosen": -157.97760009765625, + "logps/rejected": -492.9150390625, + "loss": 0.0183, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.977019786834717, + "rewards/margins": 14.135503768920898, + "rewards/rejected": -17.112524032592773, + "step": 780 + }, + { + "epoch": 0.4858475894245723, + "grad_norm": 7.084324359893799, + "learning_rate": 1.2166666666666667e-06, + "logits/chosen": -0.016914956271648407, + "logits/rejected": -0.09667975455522537, + "logps/chosen": -339.33184814453125, + "logps/rejected": -522.1082763671875, + "loss": 0.0813, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.129027366638184, + "rewards/margins": 9.399267196655273, + "rewards/rejected": -13.528295516967773, + "step": 781 + }, + { + "epoch": 0.4864696734059098, + "grad_norm": 0.02582201175391674, + "learning_rate": 1.2111111111111111e-06, + "logits/chosen": -0.18956682085990906, + "logits/rejected": -0.22381770610809326, + "logps/chosen": -343.450439453125, + "logps/rejected": -439.785888671875, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.3069024085998535, + "rewards/margins": 9.899419784545898, + "rewards/rejected": -15.206321716308594, + "step": 782 + }, + { + "epoch": 0.4870917573872473, + "grad_norm": 0.059722837060689926, + "learning_rate": 1.2055555555555555e-06, + "logits/chosen": -0.058198168873786926, + "logits/rejected": -0.14550668001174927, + "logps/chosen": -342.9820556640625, + "logps/rejected": -612.16796875, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.353404998779297, + "rewards/margins": 13.806711196899414, + "rewards/rejected": -18.160118103027344, + "step": 783 + }, + { + "epoch": 0.48771384136858476, + "grad_norm": 7.432962894439697, + "learning_rate": 1.2000000000000002e-06, + "logits/chosen": -0.13221409916877747, + "logits/rejected": -0.17261086404323578, + "logps/chosen": -423.8681640625, + "logps/rejected": -522.3591918945312, + "loss": 0.0875, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.856818675994873, + "rewards/margins": 8.219755172729492, + "rewards/rejected": -12.076574325561523, + "step": 784 + }, + { + "epoch": 0.48833592534992226, + "grad_norm": 0.6306452751159668, + "learning_rate": 1.1944444444444446e-06, + "logits/chosen": -0.15697450935840607, + "logits/rejected": -0.22889652848243713, + "logps/chosen": -300.1998291015625, + "logps/rejected": -550.5955810546875, + "loss": 0.007, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.4288129806518555, + "rewards/margins": 10.472444534301758, + "rewards/rejected": -14.901256561279297, + "step": 785 + }, + { + "epoch": 0.4889580093312597, + "grad_norm": 0.5289738774299622, + "learning_rate": 1.188888888888889e-06, + "logits/chosen": -0.043267399072647095, + "logits/rejected": -0.16180385649204254, + "logps/chosen": -493.2218322753906, + "logps/rejected": -783.1202392578125, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.296093463897705, + "rewards/margins": 13.870599746704102, + "rewards/rejected": -19.16669464111328, + "step": 786 + }, + { + "epoch": 0.4895800933125972, + "grad_norm": 12.32890510559082, + "learning_rate": 1.1833333333333334e-06, + "logits/chosen": -0.08341973274946213, + "logits/rejected": -0.18155544996261597, + "logps/chosen": -257.9139404296875, + "logps/rejected": -525.1361083984375, + "loss": 0.1842, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.25715970993042, + "rewards/margins": 10.490758895874023, + "rewards/rejected": -16.74791717529297, + "step": 787 + }, + { + "epoch": 0.49020217729393467, + "grad_norm": 5.447890281677246, + "learning_rate": 1.1777777777777778e-06, + "logits/chosen": -0.11618351936340332, + "logits/rejected": -0.161826491355896, + "logps/chosen": -546.9429931640625, + "logps/rejected": -633.5008544921875, + "loss": 0.0545, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.852203369140625, + "rewards/margins": 9.551763534545898, + "rewards/rejected": -15.403966903686523, + "step": 788 + }, + { + "epoch": 0.4908242612752722, + "grad_norm": 0.029557283967733383, + "learning_rate": 1.1722222222222224e-06, + "logits/chosen": -0.1468360722064972, + "logits/rejected": -0.2727779746055603, + "logps/chosen": -309.54339599609375, + "logps/rejected": -636.0472412109375, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.287850856781006, + "rewards/margins": 13.895767211914062, + "rewards/rejected": -19.183618545532227, + "step": 789 + }, + { + "epoch": 0.49144634525660963, + "grad_norm": 3.124437093734741, + "learning_rate": 1.1666666666666668e-06, + "logits/chosen": -0.11858217418193817, + "logits/rejected": -0.24527664482593536, + "logps/chosen": -329.87908935546875, + "logps/rejected": -529.0873413085938, + "loss": 0.0312, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.786318302154541, + "rewards/margins": 8.984128952026367, + "rewards/rejected": -13.77044677734375, + "step": 790 + }, + { + "epoch": 0.49206842923794714, + "grad_norm": 0.949057400226593, + "learning_rate": 1.1611111111111112e-06, + "logits/chosen": -0.1073325127363205, + "logits/rejected": -0.1736810803413391, + "logps/chosen": -267.47271728515625, + "logps/rejected": -593.3236694335938, + "loss": 0.012, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.164538860321045, + "rewards/margins": 12.785499572753906, + "rewards/rejected": -16.950037002563477, + "step": 791 + }, + { + "epoch": 0.4926905132192846, + "grad_norm": 1.6363840103149414, + "learning_rate": 1.1555555555555556e-06, + "logits/chosen": -0.17099550366401672, + "logits/rejected": -0.28737446665763855, + "logps/chosen": -320.5814208984375, + "logps/rejected": -602.750732421875, + "loss": 0.0114, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2221455574035645, + "rewards/margins": 12.045496940612793, + "rewards/rejected": -15.2676420211792, + "step": 792 + }, + { + "epoch": 0.4933125972006221, + "grad_norm": 0.06922617554664612, + "learning_rate": 1.1500000000000002e-06, + "logits/chosen": 0.022076137363910675, + "logits/rejected": -0.12550291419029236, + "logps/chosen": -359.2410583496094, + "logps/rejected": -672.7980346679688, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.742738723754883, + "rewards/margins": 16.421613693237305, + "rewards/rejected": -21.164352416992188, + "step": 793 + }, + { + "epoch": 0.49393468118195955, + "grad_norm": 2.7981667518615723, + "learning_rate": 1.1444444444444446e-06, + "logits/chosen": -0.15968932211399078, + "logits/rejected": -0.18666735291481018, + "logps/chosen": -431.2681579589844, + "logps/rejected": -610.80322265625, + "loss": 0.0297, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.854342460632324, + "rewards/margins": 12.010974884033203, + "rewards/rejected": -15.865316390991211, + "step": 794 + }, + { + "epoch": 0.49455676516329705, + "grad_norm": 0.08087986707687378, + "learning_rate": 1.138888888888889e-06, + "logits/chosen": -0.08422063291072845, + "logits/rejected": -0.1889151930809021, + "logps/chosen": -505.75714111328125, + "logps/rejected": -620.28759765625, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.359622478485107, + "rewards/margins": 11.246438980102539, + "rewards/rejected": -16.606061935424805, + "step": 795 + }, + { + "epoch": 0.4951788491446345, + "grad_norm": 4.797577381134033, + "learning_rate": 1.1333333333333334e-06, + "logits/chosen": -0.2181960642337799, + "logits/rejected": -0.2858554422855377, + "logps/chosen": -375.14251708984375, + "logps/rejected": -637.0696411132812, + "loss": 0.0459, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.3120036125183105, + "rewards/margins": 9.222024917602539, + "rewards/rejected": -13.534029006958008, + "step": 796 + }, + { + "epoch": 0.495800933125972, + "grad_norm": 0.4710332751274109, + "learning_rate": 1.1277777777777778e-06, + "logits/chosen": -0.06919482350349426, + "logits/rejected": -0.18534164130687714, + "logps/chosen": -371.7652893066406, + "logps/rejected": -612.9109497070312, + "loss": 0.0071, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.964427947998047, + "rewards/margins": 11.687528610229492, + "rewards/rejected": -16.65195655822754, + "step": 797 + }, + { + "epoch": 0.49642301710730946, + "grad_norm": 0.15326674282550812, + "learning_rate": 1.1222222222222222e-06, + "logits/chosen": -0.06730147451162338, + "logits/rejected": -0.10682743787765503, + "logps/chosen": -378.02178955078125, + "logps/rejected": -658.353515625, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.9843034744262695, + "rewards/margins": 11.386041641235352, + "rewards/rejected": -16.370346069335938, + "step": 798 + }, + { + "epoch": 0.49704510108864697, + "grad_norm": 0.07365721464157104, + "learning_rate": 1.1166666666666666e-06, + "logits/chosen": -0.16448479890823364, + "logits/rejected": -0.2385314404964447, + "logps/chosen": -298.3537292480469, + "logps/rejected": -600.6778564453125, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.367663383483887, + "rewards/margins": 11.847495079040527, + "rewards/rejected": -16.215160369873047, + "step": 799 + }, + { + "epoch": 0.4976671850699845, + "grad_norm": 11.054051399230957, + "learning_rate": 1.111111111111111e-06, + "logits/chosen": -0.23298412561416626, + "logits/rejected": -0.2709173262119293, + "logps/chosen": -418.9882507324219, + "logps/rejected": -573.3237915039062, + "loss": 0.1255, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.271183967590332, + "rewards/margins": 9.043691635131836, + "rewards/rejected": -13.314874649047852, + "step": 800 + }, + { + "epoch": 0.4982892690513219, + "grad_norm": 0.314887136220932, + "learning_rate": 1.1055555555555557e-06, + "logits/chosen": -0.17169691622257233, + "logits/rejected": -0.20936615765094757, + "logps/chosen": -718.2200927734375, + "logps/rejected": -796.76123046875, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.328655242919922, + "rewards/margins": 14.240682601928711, + "rewards/rejected": -19.569339752197266, + "step": 801 + }, + { + "epoch": 0.49891135303265943, + "grad_norm": 0.3938513696193695, + "learning_rate": 1.1e-06, + "logits/chosen": -0.11984017491340637, + "logits/rejected": -0.17501826584339142, + "logps/chosen": -623.9495849609375, + "logps/rejected": -648.8294677734375, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.334211349487305, + "rewards/margins": 9.899785995483398, + "rewards/rejected": -16.233997344970703, + "step": 802 + }, + { + "epoch": 0.4995334370139969, + "grad_norm": 0.6801307797431946, + "learning_rate": 1.0944444444444445e-06, + "logits/chosen": -0.11766411364078522, + "logits/rejected": -0.22060424089431763, + "logps/chosen": -289.8787841796875, + "logps/rejected": -588.0881958007812, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9312644004821777, + "rewards/margins": 10.972482681274414, + "rewards/rejected": -14.90374755859375, + "step": 803 + }, + { + "epoch": 0.5001555209953343, + "grad_norm": 0.1781761795282364, + "learning_rate": 1.0888888888888889e-06, + "logits/chosen": -0.10266199707984924, + "logits/rejected": -0.21250051259994507, + "logps/chosen": -377.56512451171875, + "logps/rejected": -752.5086059570312, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.061295032501221, + "rewards/margins": 14.639426231384277, + "rewards/rejected": -21.700721740722656, + "step": 804 + }, + { + "epoch": 0.5007776049766719, + "grad_norm": 18.796554565429688, + "learning_rate": 1.0833333333333335e-06, + "logits/chosen": -0.10829688608646393, + "logits/rejected": -0.12565062940120697, + "logps/chosen": -419.29248046875, + "logps/rejected": -436.3228454589844, + "loss": 0.9507, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.168567180633545, + "rewards/margins": 7.141113758087158, + "rewards/rejected": -12.309680938720703, + "step": 805 + }, + { + "epoch": 0.5013996889580093, + "grad_norm": 0.8916314840316772, + "learning_rate": 1.0777777777777779e-06, + "logits/chosen": -0.081145741045475, + "logits/rejected": -0.1943606734275818, + "logps/chosen": -433.43719482421875, + "logps/rejected": -593.4254150390625, + "loss": 0.0064, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.020490646362305, + "rewards/margins": 10.029472351074219, + "rewards/rejected": -17.04996109008789, + "step": 806 + }, + { + "epoch": 0.5020217729393468, + "grad_norm": 0.4311363399028778, + "learning_rate": 1.0722222222222223e-06, + "logits/chosen": -0.13741615414619446, + "logits/rejected": -0.1985922008752823, + "logps/chosen": -537.6214599609375, + "logps/rejected": -591.674072265625, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.60624885559082, + "rewards/margins": 9.444084167480469, + "rewards/rejected": -14.050333976745605, + "step": 807 + }, + { + "epoch": 0.5026438569206843, + "grad_norm": 8.428035736083984, + "learning_rate": 1.066666666666667e-06, + "logits/chosen": -0.22717341780662537, + "logits/rejected": -0.2505257725715637, + "logps/chosen": -580.8439331054688, + "logps/rejected": -649.7686767578125, + "loss": 0.0938, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.898068904876709, + "rewards/margins": 9.795823097229004, + "rewards/rejected": -15.693891525268555, + "step": 808 + }, + { + "epoch": 0.5032659409020218, + "grad_norm": 0.6646371483802795, + "learning_rate": 1.0611111111111113e-06, + "logits/chosen": -0.06821540743112564, + "logits/rejected": -0.19478103518486023, + "logps/chosen": -268.40118408203125, + "logps/rejected": -506.8492431640625, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.478716850280762, + "rewards/margins": 11.02419662475586, + "rewards/rejected": -15.502914428710938, + "step": 809 + }, + { + "epoch": 0.5038880248833593, + "grad_norm": 8.02017593383789, + "learning_rate": 1.0555555555555557e-06, + "logits/chosen": -0.13367031514644623, + "logits/rejected": -0.18360628187656403, + "logps/chosen": -333.7110595703125, + "logps/rejected": -411.2962951660156, + "loss": 0.0901, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.410120010375977, + "rewards/margins": 10.173619270324707, + "rewards/rejected": -15.583738327026367, + "step": 810 + }, + { + "epoch": 0.5045101088646967, + "grad_norm": 0.45219916105270386, + "learning_rate": 1.0500000000000001e-06, + "logits/chosen": -0.15827444195747375, + "logits/rejected": -0.24806246161460876, + "logps/chosen": -280.5505676269531, + "logps/rejected": -697.7249755859375, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.901421546936035, + "rewards/margins": 14.807828903198242, + "rewards/rejected": -17.709251403808594, + "step": 811 + }, + { + "epoch": 0.5051321928460342, + "grad_norm": 0.0029280667658895254, + "learning_rate": 1.0444444444444445e-06, + "logits/chosen": -0.04659561812877655, + "logits/rejected": -0.19229617714881897, + "logps/chosen": -297.29071044921875, + "logps/rejected": -697.6536865234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.1074652671813965, + "rewards/margins": 14.944089889526367, + "rewards/rejected": -20.051555633544922, + "step": 812 + }, + { + "epoch": 0.5057542768273717, + "grad_norm": 0.2773353159427643, + "learning_rate": 1.038888888888889e-06, + "logits/chosen": -0.06874191761016846, + "logits/rejected": -0.12255003303289413, + "logps/chosen": -357.48095703125, + "logps/rejected": -648.473388671875, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.879541873931885, + "rewards/margins": 12.595026016235352, + "rewards/rejected": -17.474567413330078, + "step": 813 + }, + { + "epoch": 0.5063763608087092, + "grad_norm": 2.861154556274414, + "learning_rate": 1.0333333333333333e-06, + "logits/chosen": -0.15620224177837372, + "logits/rejected": -0.2467564195394516, + "logps/chosen": -277.6351013183594, + "logps/rejected": -441.5032958984375, + "loss": 0.0146, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.482789993286133, + "rewards/margins": 10.809075355529785, + "rewards/rejected": -15.291865348815918, + "step": 814 + }, + { + "epoch": 0.5069984447900466, + "grad_norm": 0.1625307947397232, + "learning_rate": 1.0277777777777777e-06, + "logits/chosen": -0.16752129793167114, + "logits/rejected": -0.264906644821167, + "logps/chosen": -390.0548095703125, + "logps/rejected": -683.6582641601562, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.59525203704834, + "rewards/margins": 11.34913444519043, + "rewards/rejected": -15.944387435913086, + "step": 815 + }, + { + "epoch": 0.5076205287713841, + "grad_norm": 2.833897113800049, + "learning_rate": 1.0222222222222223e-06, + "logits/chosen": -0.17296412587165833, + "logits/rejected": -0.2573244273662567, + "logps/chosen": -395.054931640625, + "logps/rejected": -622.4232177734375, + "loss": 0.0192, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.005512237548828, + "rewards/margins": 14.422053337097168, + "rewards/rejected": -18.427566528320312, + "step": 816 + }, + { + "epoch": 0.5082426127527216, + "grad_norm": 0.6978428363800049, + "learning_rate": 1.0166666666666667e-06, + "logits/chosen": -0.04084404557943344, + "logits/rejected": -0.18320974707603455, + "logps/chosen": -367.6859130859375, + "logps/rejected": -795.5833740234375, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.731910228729248, + "rewards/margins": 15.947837829589844, + "rewards/rejected": -19.67974853515625, + "step": 817 + }, + { + "epoch": 0.5088646967340591, + "grad_norm": 9.428451538085938, + "learning_rate": 1.0111111111111111e-06, + "logits/chosen": -0.11217397451400757, + "logits/rejected": -0.20489533245563507, + "logps/chosen": -395.34228515625, + "logps/rejected": -631.6146850585938, + "loss": 0.0918, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.1232404708862305, + "rewards/margins": 13.14024829864502, + "rewards/rejected": -18.26348876953125, + "step": 818 + }, + { + "epoch": 0.5094867807153965, + "grad_norm": 12.503117561340332, + "learning_rate": 1.0055555555555556e-06, + "logits/chosen": -0.12951543927192688, + "logits/rejected": -0.20333555340766907, + "logps/chosen": -457.14849853515625, + "logps/rejected": -575.1900634765625, + "loss": 0.1451, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.0344343185424805, + "rewards/margins": 10.895971298217773, + "rewards/rejected": -16.930404663085938, + "step": 819 + }, + { + "epoch": 0.5101088646967341, + "grad_norm": 0.0017153396038338542, + "learning_rate": 1.0000000000000002e-06, + "logits/chosen": -0.06952300667762756, + "logits/rejected": -0.2192678153514862, + "logps/chosen": -605.42236328125, + "logps/rejected": -874.6058349609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.995631694793701, + "rewards/margins": 15.455646514892578, + "rewards/rejected": -20.451278686523438, + "step": 820 + }, + { + "epoch": 0.5107309486780716, + "grad_norm": 2.432025909423828, + "learning_rate": 9.944444444444446e-07, + "logits/chosen": -0.12238409370183945, + "logits/rejected": -0.17445950210094452, + "logps/chosen": -400.46990966796875, + "logps/rejected": -565.1221923828125, + "loss": 0.0215, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.594307899475098, + "rewards/margins": 9.516624450683594, + "rewards/rejected": -17.110931396484375, + "step": 821 + }, + { + "epoch": 0.511353032659409, + "grad_norm": 0.16394193470478058, + "learning_rate": 9.88888888888889e-07, + "logits/chosen": -0.11291683465242386, + "logits/rejected": -0.23497769236564636, + "logps/chosen": -206.3529052734375, + "logps/rejected": -466.59552001953125, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.996105194091797, + "rewards/margins": 11.889469146728516, + "rewards/rejected": -16.885574340820312, + "step": 822 + }, + { + "epoch": 0.5119751166407465, + "grad_norm": 0.057342130690813065, + "learning_rate": 9.833333333333334e-07, + "logits/chosen": -0.014496766030788422, + "logits/rejected": -0.07959604263305664, + "logps/chosen": -393.99932861328125, + "logps/rejected": -578.0052490234375, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.578857898712158, + "rewards/margins": 12.45543384552002, + "rewards/rejected": -19.034292221069336, + "step": 823 + }, + { + "epoch": 0.512597200622084, + "grad_norm": 2.905329704284668, + "learning_rate": 9.77777777777778e-07, + "logits/chosen": -0.18546809256076813, + "logits/rejected": -0.21080923080444336, + "logps/chosen": -541.985107421875, + "logps/rejected": -666.6212158203125, + "loss": 0.0276, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.904903411865234, + "rewards/margins": 10.309957504272461, + "rewards/rejected": -16.214860916137695, + "step": 824 + }, + { + "epoch": 0.5132192846034215, + "grad_norm": 10.76130199432373, + "learning_rate": 9.722222222222224e-07, + "logits/chosen": -0.18283893167972565, + "logits/rejected": -0.2073207050561905, + "logps/chosen": -448.9975891113281, + "logps/rejected": -541.1183471679688, + "loss": 0.2796, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.22944712638855, + "rewards/margins": 8.52389144897461, + "rewards/rejected": -11.753338813781738, + "step": 825 + }, + { + "epoch": 0.5138413685847589, + "grad_norm": 0.025278907269239426, + "learning_rate": 9.666666666666668e-07, + "logits/chosen": -0.1806577891111374, + "logits/rejected": -0.27836233377456665, + "logps/chosen": -334.7089538574219, + "logps/rejected": -628.4230346679688, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.70955753326416, + "rewards/margins": 12.056928634643555, + "rewards/rejected": -16.7664852142334, + "step": 826 + }, + { + "epoch": 0.5144634525660964, + "grad_norm": 1.5111489295959473, + "learning_rate": 9.611111111111112e-07, + "logits/chosen": -0.10573087632656097, + "logits/rejected": -0.13830536603927612, + "logps/chosen": -342.9310607910156, + "logps/rejected": -528.2989501953125, + "loss": 0.0113, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0026915073394775, + "rewards/margins": 12.831938743591309, + "rewards/rejected": -15.834630012512207, + "step": 827 + }, + { + "epoch": 0.5150855365474339, + "grad_norm": 0.11757127940654755, + "learning_rate": 9.555555555555556e-07, + "logits/chosen": -0.10259944945573807, + "logits/rejected": -0.19391939043998718, + "logps/chosen": -303.1847839355469, + "logps/rejected": -547.028564453125, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.920499801635742, + "rewards/margins": 12.689993858337402, + "rewards/rejected": -17.610492706298828, + "step": 828 + }, + { + "epoch": 0.5157076205287714, + "grad_norm": 0.010215331800282001, + "learning_rate": 9.500000000000001e-07, + "logits/chosen": -0.0334804430603981, + "logits/rejected": -0.09284466505050659, + "logps/chosen": -303.435791015625, + "logps/rejected": -476.25244140625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.558291435241699, + "rewards/margins": 12.527627944946289, + "rewards/rejected": -18.085918426513672, + "step": 829 + }, + { + "epoch": 0.5163297045101088, + "grad_norm": 2.138187885284424, + "learning_rate": 9.444444444444445e-07, + "logits/chosen": -0.21128448843955994, + "logits/rejected": -0.273444801568985, + "logps/chosen": -387.3895568847656, + "logps/rejected": -578.1643676757812, + "loss": 0.0409, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.223419666290283, + "rewards/margins": 9.034905433654785, + "rewards/rejected": -12.258325576782227, + "step": 830 + }, + { + "epoch": 0.5169517884914463, + "grad_norm": 0.0022938617039471865, + "learning_rate": 9.388888888888889e-07, + "logits/chosen": -0.07449503988027573, + "logits/rejected": -0.1505374014377594, + "logps/chosen": -192.39096069335938, + "logps/rejected": -440.9158630371094, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8032004833221436, + "rewards/margins": 13.191038131713867, + "rewards/rejected": -15.994239807128906, + "step": 831 + }, + { + "epoch": 0.5175738724727839, + "grad_norm": 3.2103562355041504, + "learning_rate": 9.333333333333334e-07, + "logits/chosen": -0.26282697916030884, + "logits/rejected": -0.3174377977848053, + "logps/chosen": -194.12721252441406, + "logps/rejected": -494.2572326660156, + "loss": 0.067, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8954710960388184, + "rewards/margins": 11.596528053283691, + "rewards/rejected": -15.491998672485352, + "step": 832 + }, + { + "epoch": 0.5181959564541213, + "grad_norm": 0.4645395576953888, + "learning_rate": 9.277777777777778e-07, + "logits/chosen": -0.11890628188848495, + "logits/rejected": -0.2198331654071808, + "logps/chosen": -495.14251708984375, + "logps/rejected": -576.7490234375, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.775590896606445, + "rewards/margins": 12.412190437316895, + "rewards/rejected": -17.187782287597656, + "step": 833 + }, + { + "epoch": 0.5188180404354588, + "grad_norm": 0.07538089156150818, + "learning_rate": 9.222222222222222e-07, + "logits/chosen": -0.15080526471138, + "logits/rejected": -0.22703272104263306, + "logps/chosen": -282.1693115234375, + "logps/rejected": -563.7903442382812, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3750627040863037, + "rewards/margins": 11.348495483398438, + "rewards/rejected": -14.72355842590332, + "step": 834 + }, + { + "epoch": 0.5194401244167963, + "grad_norm": 0.20219729840755463, + "learning_rate": 9.166666666666666e-07, + "logits/chosen": -0.13170932233333588, + "logits/rejected": -0.2396613210439682, + "logps/chosen": -220.7593994140625, + "logps/rejected": -436.1505126953125, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.804533004760742, + "rewards/margins": 11.761199951171875, + "rewards/rejected": -17.565732955932617, + "step": 835 + }, + { + "epoch": 0.5200622083981338, + "grad_norm": 2.074481725692749, + "learning_rate": 9.111111111111113e-07, + "logits/chosen": -0.12444557994604111, + "logits/rejected": -0.16928750276565552, + "logps/chosen": -477.85333251953125, + "logps/rejected": -523.0330810546875, + "loss": 0.0173, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.63342809677124, + "rewards/margins": 10.95406723022461, + "rewards/rejected": -17.587495803833008, + "step": 836 + }, + { + "epoch": 0.5206842923794712, + "grad_norm": 0.006530633196234703, + "learning_rate": 9.055555555555557e-07, + "logits/chosen": -0.08721506595611572, + "logits/rejected": -0.14582838118076324, + "logps/chosen": -360.37542724609375, + "logps/rejected": -605.824951171875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.453657150268555, + "rewards/margins": 17.401119232177734, + "rewards/rejected": -22.854778289794922, + "step": 837 + }, + { + "epoch": 0.5213063763608087, + "grad_norm": 15.2570219039917, + "learning_rate": 9.000000000000001e-07, + "logits/chosen": -0.1389024257659912, + "logits/rejected": -0.19426587224006653, + "logps/chosen": -376.310546875, + "logps/rejected": -559.767822265625, + "loss": 0.3525, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.231505393981934, + "rewards/margins": 9.277427673339844, + "rewards/rejected": -14.508933067321777, + "step": 838 + }, + { + "epoch": 0.5219284603421462, + "grad_norm": 0.0011822642991319299, + "learning_rate": 8.944444444444445e-07, + "logits/chosen": -0.07590152323246002, + "logits/rejected": -0.2137918621301651, + "logps/chosen": -234.78384399414062, + "logps/rejected": -640.68603515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.688441276550293, + "rewards/margins": 19.418983459472656, + "rewards/rejected": -23.107425689697266, + "step": 839 + }, + { + "epoch": 0.5225505443234837, + "grad_norm": 6.155368328094482, + "learning_rate": 8.88888888888889e-07, + "logits/chosen": -0.22164008021354675, + "logits/rejected": -0.29293811321258545, + "logps/chosen": -388.80145263671875, + "logps/rejected": -693.7932739257812, + "loss": 0.0508, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.692360877990723, + "rewards/margins": 11.019664764404297, + "rewards/rejected": -17.712026596069336, + "step": 840 + }, + { + "epoch": 0.5231726283048211, + "grad_norm": 8.173077583312988, + "learning_rate": 8.833333333333334e-07, + "logits/chosen": -0.16625632345676422, + "logits/rejected": -0.21757225692272186, + "logps/chosen": -285.2996826171875, + "logps/rejected": -384.7782287597656, + "loss": 0.1286, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.043044090270996, + "rewards/margins": 9.46791934967041, + "rewards/rejected": -14.510963439941406, + "step": 841 + }, + { + "epoch": 0.5237947122861586, + "grad_norm": 0.07404880970716476, + "learning_rate": 8.777777777777778e-07, + "logits/chosen": -0.10380920767784119, + "logits/rejected": -0.16136209666728973, + "logps/chosen": -528.9005126953125, + "logps/rejected": -645.5800170898438, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.031332015991211, + "rewards/margins": 11.985795974731445, + "rewards/rejected": -17.017126083374023, + "step": 842 + }, + { + "epoch": 0.5244167962674962, + "grad_norm": 0.3246402442455292, + "learning_rate": 8.722222222222224e-07, + "logits/chosen": -0.2106352299451828, + "logits/rejected": -0.2244756817817688, + "logps/chosen": -523.5628662109375, + "logps/rejected": -685.65478515625, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.213235378265381, + "rewards/margins": 10.64880084991455, + "rewards/rejected": -15.862035751342773, + "step": 843 + }, + { + "epoch": 0.5250388802488336, + "grad_norm": 0.1538921743631363, + "learning_rate": 8.666666666666668e-07, + "logits/chosen": -0.18455460667610168, + "logits/rejected": -0.21693429350852966, + "logps/chosen": -485.05706787109375, + "logps/rejected": -595.4783935546875, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6308600902557373, + "rewards/margins": 12.152213096618652, + "rewards/rejected": -15.783073425292969, + "step": 844 + }, + { + "epoch": 0.5256609642301711, + "grad_norm": 1.5943061113357544, + "learning_rate": 8.611111111111112e-07, + "logits/chosen": -0.12988509237766266, + "logits/rejected": -0.19536495208740234, + "logps/chosen": -484.14617919921875, + "logps/rejected": -572.92626953125, + "loss": 0.0117, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.178459167480469, + "rewards/margins": 8.747992515563965, + "rewards/rejected": -14.92645263671875, + "step": 845 + }, + { + "epoch": 0.5262830482115085, + "grad_norm": 4.369041919708252, + "learning_rate": 8.555555555555556e-07, + "logits/chosen": -0.054929569363594055, + "logits/rejected": -0.16759105026721954, + "logps/chosen": -280.879150390625, + "logps/rejected": -511.91583251953125, + "loss": 0.0307, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5062103271484375, + "rewards/margins": 12.953079223632812, + "rewards/rejected": -16.45928955078125, + "step": 846 + }, + { + "epoch": 0.5269051321928461, + "grad_norm": 0.873319149017334, + "learning_rate": 8.500000000000001e-07, + "logits/chosen": -0.03607794642448425, + "logits/rejected": -0.11062926799058914, + "logps/chosen": -337.906494140625, + "logps/rejected": -567.6065673828125, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.082869052886963, + "rewards/margins": 11.893442153930664, + "rewards/rejected": -18.97631072998047, + "step": 847 + }, + { + "epoch": 0.5275272161741835, + "grad_norm": 0.0003729510644916445, + "learning_rate": 8.444444444444445e-07, + "logits/chosen": -0.10276341438293457, + "logits/rejected": -0.18702064454555511, + "logps/chosen": -324.3984375, + "logps/rejected": -735.4124755859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.690668106079102, + "rewards/margins": 20.31211280822754, + "rewards/rejected": -26.00278091430664, + "step": 848 + }, + { + "epoch": 0.528149300155521, + "grad_norm": 0.685472309589386, + "learning_rate": 8.388888888888889e-07, + "logits/chosen": -0.11547763645648956, + "logits/rejected": -0.20280900597572327, + "logps/chosen": -370.812255859375, + "logps/rejected": -585.2813720703125, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.483940124511719, + "rewards/margins": 11.791271209716797, + "rewards/rejected": -17.275211334228516, + "step": 849 + }, + { + "epoch": 0.5287713841368584, + "grad_norm": 0.0464678592979908, + "learning_rate": 8.333333333333333e-07, + "logits/chosen": -0.0008079832186922431, + "logits/rejected": -0.1541062444448471, + "logps/chosen": -204.13082885742188, + "logps/rejected": -564.323486328125, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.117552757263184, + "rewards/margins": 11.570759773254395, + "rewards/rejected": -16.688312530517578, + "step": 850 + }, + { + "epoch": 0.529393468118196, + "grad_norm": 1.271052598953247, + "learning_rate": 8.277777777777779e-07, + "logits/chosen": -0.21541725099086761, + "logits/rejected": -0.23174723982810974, + "logps/chosen": -492.3011474609375, + "logps/rejected": -600.9510498046875, + "loss": 0.0156, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.660007476806641, + "rewards/margins": 10.578567504882812, + "rewards/rejected": -16.238574981689453, + "step": 851 + }, + { + "epoch": 0.5300155520995334, + "grad_norm": 2.4191997051239014, + "learning_rate": 8.222222222222223e-07, + "logits/chosen": -0.1402202844619751, + "logits/rejected": -0.15027600526809692, + "logps/chosen": -226.8055877685547, + "logps/rejected": -371.1802978515625, + "loss": 0.0354, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.2740888595581055, + "rewards/margins": 9.526644706726074, + "rewards/rejected": -14.80073356628418, + "step": 852 + }, + { + "epoch": 0.5306376360808709, + "grad_norm": 0.24876125156879425, + "learning_rate": 8.166666666666668e-07, + "logits/chosen": -0.09285081177949905, + "logits/rejected": -0.15901288390159607, + "logps/chosen": -357.9372253417969, + "logps/rejected": -568.0193481445312, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.474281311035156, + "rewards/margins": 13.601968765258789, + "rewards/rejected": -19.076250076293945, + "step": 853 + }, + { + "epoch": 0.5312597200622085, + "grad_norm": 0.6209073066711426, + "learning_rate": 8.111111111111112e-07, + "logits/chosen": -0.018872026354074478, + "logits/rejected": -0.1206902340054512, + "logps/chosen": -222.9196319580078, + "logps/rejected": -563.94873046875, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.442939281463623, + "rewards/margins": 15.915273666381836, + "rewards/rejected": -21.358213424682617, + "step": 854 + }, + { + "epoch": 0.5318818040435459, + "grad_norm": 0.03168340027332306, + "learning_rate": 8.055555555555557e-07, + "logits/chosen": -0.2448480725288391, + "logits/rejected": -0.30475103855133057, + "logps/chosen": -404.17340087890625, + "logps/rejected": -520.5206909179688, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.753871440887451, + "rewards/margins": 10.558013916015625, + "rewards/rejected": -18.3118839263916, + "step": 855 + }, + { + "epoch": 0.5325038880248834, + "grad_norm": 0.03646927699446678, + "learning_rate": 8.000000000000001e-07, + "logits/chosen": -0.056139037013053894, + "logits/rejected": -0.1836017668247223, + "logps/chosen": -326.4194641113281, + "logps/rejected": -575.1658935546875, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.059144020080566, + "rewards/margins": 15.686582565307617, + "rewards/rejected": -19.745725631713867, + "step": 856 + }, + { + "epoch": 0.5331259720062208, + "grad_norm": 0.042390741407871246, + "learning_rate": 7.944444444444445e-07, + "logits/chosen": -0.028428319841623306, + "logits/rejected": -0.15726517140865326, + "logps/chosen": -312.4945373535156, + "logps/rejected": -590.1441650390625, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.915543556213379, + "rewards/margins": 13.295063972473145, + "rewards/rejected": -19.210607528686523, + "step": 857 + }, + { + "epoch": 0.5337480559875584, + "grad_norm": 10.014250755310059, + "learning_rate": 7.888888888888889e-07, + "logits/chosen": -0.16622062027454376, + "logits/rejected": -0.17550688982009888, + "logps/chosen": -282.2913818359375, + "logps/rejected": -559.4396362304688, + "loss": 0.2758, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.619200706481934, + "rewards/margins": 9.143976211547852, + "rewards/rejected": -14.763177871704102, + "step": 858 + }, + { + "epoch": 0.5343701399688958, + "grad_norm": 0.06868380308151245, + "learning_rate": 7.833333333333335e-07, + "logits/chosen": -0.06318702548742294, + "logits/rejected": -0.14875267446041107, + "logps/chosen": -366.27020263671875, + "logps/rejected": -549.589599609375, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.328137397766113, + "rewards/margins": 13.563217163085938, + "rewards/rejected": -18.891355514526367, + "step": 859 + }, + { + "epoch": 0.5349922239502333, + "grad_norm": 0.053340256214141846, + "learning_rate": 7.777777777777779e-07, + "logits/chosen": -0.20857927203178406, + "logits/rejected": -0.25469526648521423, + "logps/chosen": -382.9903564453125, + "logps/rejected": -528.3805541992188, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.410423278808594, + "rewards/margins": 11.412582397460938, + "rewards/rejected": -15.823005676269531, + "step": 860 + }, + { + "epoch": 0.5356143079315707, + "grad_norm": 4.2361674308776855, + "learning_rate": 7.722222222222223e-07, + "logits/chosen": -0.1153683215379715, + "logits/rejected": -0.1478542983531952, + "logps/chosen": -470.185302734375, + "logps/rejected": -549.1876220703125, + "loss": 0.0425, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.100476264953613, + "rewards/margins": 7.586841106414795, + "rewards/rejected": -13.68731689453125, + "step": 861 + }, + { + "epoch": 0.5362363919129083, + "grad_norm": 0.004544577095657587, + "learning_rate": 7.666666666666667e-07, + "logits/chosen": -0.09765191376209259, + "logits/rejected": -0.15658029913902283, + "logps/chosen": -517.2640380859375, + "logps/rejected": -666.1004638671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.103967189788818, + "rewards/margins": 14.988969802856445, + "rewards/rejected": -19.092937469482422, + "step": 862 + }, + { + "epoch": 0.5368584758942457, + "grad_norm": 0.023717544972896576, + "learning_rate": 7.611111111111112e-07, + "logits/chosen": -0.10382688045501709, + "logits/rejected": -0.1460075080394745, + "logps/chosen": -328.40032958984375, + "logps/rejected": -664.793701171875, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.337226390838623, + "rewards/margins": 12.689056396484375, + "rewards/rejected": -20.026283264160156, + "step": 863 + }, + { + "epoch": 0.5374805598755832, + "grad_norm": 2.449568748474121, + "learning_rate": 7.555555555555556e-07, + "logits/chosen": -0.14971768856048584, + "logits/rejected": -0.18246319890022278, + "logps/chosen": -344.18951416015625, + "logps/rejected": -567.51611328125, + "loss": 0.0109, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.205051422119141, + "rewards/margins": 12.014487266540527, + "rewards/rejected": -18.219539642333984, + "step": 864 + }, + { + "epoch": 0.5381026438569206, + "grad_norm": 0.06219907104969025, + "learning_rate": 7.5e-07, + "logits/chosen": -0.09020867198705673, + "logits/rejected": -0.12211479246616364, + "logps/chosen": -397.8858642578125, + "logps/rejected": -640.037109375, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.428716659545898, + "rewards/margins": 14.163869857788086, + "rewards/rejected": -19.592586517333984, + "step": 865 + }, + { + "epoch": 0.5387247278382582, + "grad_norm": 1.8258507251739502, + "learning_rate": 7.444444444444444e-07, + "logits/chosen": -0.19156305491924286, + "logits/rejected": -0.3070339560508728, + "logps/chosen": -350.8108825683594, + "logps/rejected": -580.215087890625, + "loss": 0.0118, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.436943054199219, + "rewards/margins": 12.24517822265625, + "rewards/rejected": -17.68212127685547, + "step": 866 + }, + { + "epoch": 0.5393468118195957, + "grad_norm": 0.004493940621614456, + "learning_rate": 7.38888888888889e-07, + "logits/chosen": -0.08604935556650162, + "logits/rejected": -0.21125580370426178, + "logps/chosen": -405.9581298828125, + "logps/rejected": -731.90478515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.167068958282471, + "rewards/margins": 16.007829666137695, + "rewards/rejected": -22.17490005493164, + "step": 867 + }, + { + "epoch": 0.5399688958009331, + "grad_norm": 1.3643146753311157, + "learning_rate": 7.333333333333334e-07, + "logits/chosen": -0.10822945088148117, + "logits/rejected": -0.14297321438789368, + "logps/chosen": -486.5452575683594, + "logps/rejected": -562.8757934570312, + "loss": 0.016, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.976385116577148, + "rewards/margins": 11.439517974853516, + "rewards/rejected": -17.415903091430664, + "step": 868 + }, + { + "epoch": 0.5405909797822706, + "grad_norm": 1.1700414419174194, + "learning_rate": 7.277777777777778e-07, + "logits/chosen": -0.13553275167942047, + "logits/rejected": -0.18331025540828705, + "logps/chosen": -450.7088928222656, + "logps/rejected": -597.802490234375, + "loss": 0.0134, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.0849761962890625, + "rewards/margins": 10.554206848144531, + "rewards/rejected": -15.639183044433594, + "step": 869 + }, + { + "epoch": 0.5412130637636081, + "grad_norm": 0.4021424353122711, + "learning_rate": 7.222222222222222e-07, + "logits/chosen": -0.17759475111961365, + "logits/rejected": -0.2140870839357376, + "logps/chosen": -322.3487854003906, + "logps/rejected": -554.01953125, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.326861381530762, + "rewards/margins": 11.425933837890625, + "rewards/rejected": -16.752796173095703, + "step": 870 + }, + { + "epoch": 0.5418351477449456, + "grad_norm": 19.288776397705078, + "learning_rate": 7.166666666666668e-07, + "logits/chosen": -0.09169970452785492, + "logits/rejected": -0.1635945737361908, + "logps/chosen": -475.0706787109375, + "logps/rejected": -674.3582763671875, + "loss": 0.3139, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.219222068786621, + "rewards/margins": 12.146341323852539, + "rewards/rejected": -20.365562438964844, + "step": 871 + }, + { + "epoch": 0.542457231726283, + "grad_norm": 3.2314341068267822, + "learning_rate": 7.111111111111112e-07, + "logits/chosen": -0.12291283160448074, + "logits/rejected": -0.17266924679279327, + "logps/chosen": -294.10040283203125, + "logps/rejected": -509.72576904296875, + "loss": 0.0634, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.248610496520996, + "rewards/margins": 12.903603553771973, + "rewards/rejected": -19.15221405029297, + "step": 872 + }, + { + "epoch": 0.5430793157076206, + "grad_norm": 0.14128378033638, + "learning_rate": 7.055555555555556e-07, + "logits/chosen": -0.00795955490320921, + "logits/rejected": -0.17231547832489014, + "logps/chosen": -182.74887084960938, + "logps/rejected": -503.8109130859375, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3208446502685547, + "rewards/margins": 13.800355911254883, + "rewards/rejected": -17.121200561523438, + "step": 873 + }, + { + "epoch": 0.543701399688958, + "grad_norm": 0.036097049713134766, + "learning_rate": 7.000000000000001e-07, + "logits/chosen": -0.0867750346660614, + "logits/rejected": -0.18024961650371552, + "logps/chosen": -309.51220703125, + "logps/rejected": -463.6152038574219, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.808304786682129, + "rewards/margins": 11.622719764709473, + "rewards/rejected": -15.431024551391602, + "step": 874 + }, + { + "epoch": 0.5443234836702955, + "grad_norm": 0.30656698346138, + "learning_rate": 6.944444444444446e-07, + "logits/chosen": -0.1918606460094452, + "logits/rejected": -0.24658414721488953, + "logps/chosen": -341.14306640625, + "logps/rejected": -528.3599243164062, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.104247570037842, + "rewards/margins": 8.394479751586914, + "rewards/rejected": -11.498727798461914, + "step": 875 + }, + { + "epoch": 0.5449455676516329, + "grad_norm": 2.052777051925659, + "learning_rate": 6.88888888888889e-07, + "logits/chosen": -0.1469787061214447, + "logits/rejected": -0.2411191612482071, + "logps/chosen": -340.0904541015625, + "logps/rejected": -611.8851318359375, + "loss": 0.0172, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7907726764678955, + "rewards/margins": 12.982802391052246, + "rewards/rejected": -15.773574829101562, + "step": 876 + }, + { + "epoch": 0.5455676516329705, + "grad_norm": 0.31644880771636963, + "learning_rate": 6.833333333333334e-07, + "logits/chosen": -0.11007525771856308, + "logits/rejected": -0.15615954995155334, + "logps/chosen": -412.95196533203125, + "logps/rejected": -620.7003784179688, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.329563140869141, + "rewards/margins": 13.613090515136719, + "rewards/rejected": -17.94265365600586, + "step": 877 + }, + { + "epoch": 0.546189735614308, + "grad_norm": 0.0013116950867697597, + "learning_rate": 6.777777777777779e-07, + "logits/chosen": -0.019503729417920113, + "logits/rejected": -0.12442383170127869, + "logps/chosen": -293.31402587890625, + "logps/rejected": -696.2352294921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.384456634521484, + "rewards/margins": 19.87446403503418, + "rewards/rejected": -26.258922576904297, + "step": 878 + }, + { + "epoch": 0.5468118195956454, + "grad_norm": 0.3735998868942261, + "learning_rate": 6.722222222222223e-07, + "logits/chosen": -0.18256860971450806, + "logits/rejected": -0.18508324027061462, + "logps/chosen": -485.07379150390625, + "logps/rejected": -674.661865234375, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.526670932769775, + "rewards/margins": 12.90936279296875, + "rewards/rejected": -19.43603515625, + "step": 879 + }, + { + "epoch": 0.5474339035769828, + "grad_norm": 0.4759034216403961, + "learning_rate": 6.666666666666667e-07, + "logits/chosen": -0.10696282982826233, + "logits/rejected": -0.15038657188415527, + "logps/chosen": -295.8506774902344, + "logps/rejected": -469.07470703125, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.581249237060547, + "rewards/margins": 12.212093353271484, + "rewards/rejected": -17.79334259033203, + "step": 880 + }, + { + "epoch": 0.5480559875583204, + "grad_norm": 0.0028715962544083595, + "learning_rate": 6.611111111111111e-07, + "logits/chosen": -0.12589344382286072, + "logits/rejected": -0.20949020981788635, + "logps/chosen": -306.7193298339844, + "logps/rejected": -527.232666015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.480921745300293, + "rewards/margins": 12.060077667236328, + "rewards/rejected": -16.541000366210938, + "step": 881 + }, + { + "epoch": 0.5486780715396579, + "grad_norm": 1.6646157503128052, + "learning_rate": 6.555555555555556e-07, + "logits/chosen": -0.10699842125177383, + "logits/rejected": -0.12870880961418152, + "logps/chosen": -335.8416442871094, + "logps/rejected": -462.2041015625, + "loss": 0.0347, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.440911293029785, + "rewards/margins": 7.507866859436035, + "rewards/rejected": -12.94877815246582, + "step": 882 + }, + { + "epoch": 0.5493001555209953, + "grad_norm": 0.28018447756767273, + "learning_rate": 6.5e-07, + "logits/chosen": -0.07878740131855011, + "logits/rejected": -0.12369250506162643, + "logps/chosen": -571.8325805664062, + "logps/rejected": -658.3065795898438, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.798992156982422, + "rewards/margins": 13.361906051635742, + "rewards/rejected": -18.160898208618164, + "step": 883 + }, + { + "epoch": 0.5499222395023328, + "grad_norm": 5.722230434417725, + "learning_rate": 6.444444444444445e-07, + "logits/chosen": -0.10185466706752777, + "logits/rejected": -0.25914275646209717, + "logps/chosen": -572.52978515625, + "logps/rejected": -726.36083984375, + "loss": 0.0526, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.9253387451171875, + "rewards/margins": 13.580059051513672, + "rewards/rejected": -20.50539779663086, + "step": 884 + }, + { + "epoch": 0.5505443234836703, + "grad_norm": 0.47849270701408386, + "learning_rate": 6.388888888888889e-07, + "logits/chosen": -0.19341492652893066, + "logits/rejected": -0.21045443415641785, + "logps/chosen": -375.5443115234375, + "logps/rejected": -685.9794921875, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.445119857788086, + "rewards/margins": 17.271745681762695, + "rewards/rejected": -22.71686553955078, + "step": 885 + }, + { + "epoch": 0.5511664074650078, + "grad_norm": 0.004693964961916208, + "learning_rate": 6.333333333333334e-07, + "logits/chosen": -0.11429623514413834, + "logits/rejected": -0.21233513951301575, + "logps/chosen": -266.5773620605469, + "logps/rejected": -775.644775390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.863078594207764, + "rewards/margins": 21.096294403076172, + "rewards/rejected": -25.95937156677246, + "step": 886 + }, + { + "epoch": 0.5517884914463452, + "grad_norm": 3.480909824371338, + "learning_rate": 6.277777777777778e-07, + "logits/chosen": -0.10272429883480072, + "logits/rejected": -0.15049391984939575, + "logps/chosen": -341.75189208984375, + "logps/rejected": -505.22894287109375, + "loss": 0.0282, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.483181953430176, + "rewards/margins": 11.354247093200684, + "rewards/rejected": -17.83742904663086, + "step": 887 + }, + { + "epoch": 0.5524105754276827, + "grad_norm": 0.30205652117729187, + "learning_rate": 6.222222222222223e-07, + "logits/chosen": -0.10421382635831833, + "logits/rejected": -0.16247880458831787, + "logps/chosen": -435.6741943359375, + "logps/rejected": -618.5713500976562, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.097311019897461, + "rewards/margins": 13.388763427734375, + "rewards/rejected": -20.486074447631836, + "step": 888 + }, + { + "epoch": 0.5530326594090202, + "grad_norm": 0.9897993803024292, + "learning_rate": 6.166666666666668e-07, + "logits/chosen": -0.10267721116542816, + "logits/rejected": -0.14244253933429718, + "logps/chosen": -390.5269775390625, + "logps/rejected": -744.2490844726562, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.826128005981445, + "rewards/margins": 16.62411117553711, + "rewards/rejected": -22.450237274169922, + "step": 889 + }, + { + "epoch": 0.5536547433903577, + "grad_norm": 0.005904040299355984, + "learning_rate": 6.111111111111112e-07, + "logits/chosen": -0.15186476707458496, + "logits/rejected": -0.22795245051383972, + "logps/chosen": -377.89501953125, + "logps/rejected": -699.17724609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7881152629852295, + "rewards/margins": 16.48578643798828, + "rewards/rejected": -20.273902893066406, + "step": 890 + }, + { + "epoch": 0.5542768273716951, + "grad_norm": 0.2147321254014969, + "learning_rate": 6.055555555555556e-07, + "logits/chosen": -0.16942289471626282, + "logits/rejected": -0.18956929445266724, + "logps/chosen": -425.08331298828125, + "logps/rejected": -532.7347412109375, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.357314586639404, + "rewards/margins": 10.840717315673828, + "rewards/rejected": -15.19803237915039, + "step": 891 + }, + { + "epoch": 0.5548989113530327, + "grad_norm": 0.15978899598121643, + "learning_rate": 6.000000000000001e-07, + "logits/chosen": -0.14765959978103638, + "logits/rejected": -0.16400831937789917, + "logps/chosen": -428.1240234375, + "logps/rejected": -555.2313232421875, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.642336845397949, + "rewards/margins": 12.675694465637207, + "rewards/rejected": -19.318031311035156, + "step": 892 + }, + { + "epoch": 0.5555209953343702, + "grad_norm": 20.763750076293945, + "learning_rate": 5.944444444444445e-07, + "logits/chosen": -0.1046159490942955, + "logits/rejected": -0.17557016015052795, + "logps/chosen": -525.4366455078125, + "logps/rejected": -609.904296875, + "loss": 0.7238, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.663157939910889, + "rewards/margins": 14.893532752990723, + "rewards/rejected": -20.556690216064453, + "step": 893 + }, + { + "epoch": 0.5561430793157076, + "grad_norm": 15.393893241882324, + "learning_rate": 5.888888888888889e-07, + "logits/chosen": -0.1357133984565735, + "logits/rejected": -0.11325462907552719, + "logps/chosen": -374.254638671875, + "logps/rejected": -508.286376953125, + "loss": 0.168, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.380086421966553, + "rewards/margins": 9.364904403686523, + "rewards/rejected": -16.744991302490234, + "step": 894 + }, + { + "epoch": 0.5567651632970451, + "grad_norm": 3.6201484203338623, + "learning_rate": 5.833333333333334e-07, + "logits/chosen": -0.1725485920906067, + "logits/rejected": -0.2805717885494232, + "logps/chosen": -361.28936767578125, + "logps/rejected": -522.1871337890625, + "loss": 0.0661, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.158904075622559, + "rewards/margins": 9.466593742370605, + "rewards/rejected": -17.625497817993164, + "step": 895 + }, + { + "epoch": 0.5573872472783826, + "grad_norm": 0.16825364530086517, + "learning_rate": 5.777777777777778e-07, + "logits/chosen": -0.02355928160250187, + "logits/rejected": -0.182865172624588, + "logps/chosen": -390.2503662109375, + "logps/rejected": -743.47998046875, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.198307037353516, + "rewards/margins": 17.3770809173584, + "rewards/rejected": -23.57538604736328, + "step": 896 + }, + { + "epoch": 0.5580093312597201, + "grad_norm": 0.1888658106327057, + "learning_rate": 5.722222222222223e-07, + "logits/chosen": 0.004930809140205383, + "logits/rejected": -0.03549078479409218, + "logps/chosen": -416.9251403808594, + "logps/rejected": -571.6239624023438, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.569538116455078, + "rewards/margins": 14.049834251403809, + "rewards/rejected": -19.619373321533203, + "step": 897 + }, + { + "epoch": 0.5586314152410575, + "grad_norm": 0.6532146334648132, + "learning_rate": 5.666666666666667e-07, + "logits/chosen": -0.18088920414447784, + "logits/rejected": -0.24680380523204803, + "logps/chosen": -370.8760070800781, + "logps/rejected": -445.78240966796875, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.847957611083984, + "rewards/margins": 10.341049194335938, + "rewards/rejected": -15.189004898071289, + "step": 898 + }, + { + "epoch": 0.559253499222395, + "grad_norm": 0.007513427175581455, + "learning_rate": 5.611111111111111e-07, + "logits/chosen": -0.05223657563328743, + "logits/rejected": -0.12175898253917694, + "logps/chosen": -274.02996826171875, + "logps/rejected": -585.1141967773438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.348029613494873, + "rewards/margins": 16.194965362548828, + "rewards/rejected": -21.54299545288086, + "step": 899 + }, + { + "epoch": 0.5598755832037325, + "grad_norm": 1.4539251327514648, + "learning_rate": 5.555555555555555e-07, + "logits/chosen": -0.14714716374874115, + "logits/rejected": -0.11805664002895355, + "logps/chosen": -353.10638427734375, + "logps/rejected": -556.822265625, + "loss": 0.0092, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.005470275878906, + "rewards/margins": 12.202390670776367, + "rewards/rejected": -19.207860946655273, + "step": 900 + }, + { + "epoch": 0.56049766718507, + "grad_norm": 2.6392199993133545, + "learning_rate": 5.5e-07, + "logits/chosen": -0.08751775324344635, + "logits/rejected": -0.18409952521324158, + "logps/chosen": -330.4507141113281, + "logps/rejected": -602.0423583984375, + "loss": 0.0248, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.185247421264648, + "rewards/margins": 15.733636856079102, + "rewards/rejected": -19.91888427734375, + "step": 901 + }, + { + "epoch": 0.5611197511664074, + "grad_norm": 17.840116500854492, + "learning_rate": 5.444444444444444e-07, + "logits/chosen": -0.14321552217006683, + "logits/rejected": -0.19351348280906677, + "logps/chosen": -366.5910339355469, + "logps/rejected": -599.8262329101562, + "loss": 0.2823, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.426840305328369, + "rewards/margins": 13.056909561157227, + "rewards/rejected": -18.483749389648438, + "step": 902 + }, + { + "epoch": 0.5617418351477449, + "grad_norm": 0.008685658685863018, + "learning_rate": 5.388888888888889e-07, + "logits/chosen": -0.07585480809211731, + "logits/rejected": -0.23233291506767273, + "logps/chosen": -271.5794677734375, + "logps/rejected": -708.705078125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7151174545288086, + "rewards/margins": 13.646063804626465, + "rewards/rejected": -17.361181259155273, + "step": 903 + }, + { + "epoch": 0.5623639191290825, + "grad_norm": 0.09190709888935089, + "learning_rate": 5.333333333333335e-07, + "logits/chosen": -0.03712693974375725, + "logits/rejected": -0.12349878996610641, + "logps/chosen": -220.2087860107422, + "logps/rejected": -543.0441284179688, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.945670127868652, + "rewards/margins": 13.043341636657715, + "rewards/rejected": -17.989011764526367, + "step": 904 + }, + { + "epoch": 0.5629860031104199, + "grad_norm": 0.5448461771011353, + "learning_rate": 5.277777777777779e-07, + "logits/chosen": -0.039869170635938644, + "logits/rejected": -0.15174376964569092, + "logps/chosen": -504.3404541015625, + "logps/rejected": -586.2520141601562, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.272248268127441, + "rewards/margins": 10.992162704467773, + "rewards/rejected": -16.26441192626953, + "step": 905 + }, + { + "epoch": 0.5636080870917574, + "grad_norm": 1.7924355268478394, + "learning_rate": 5.222222222222223e-07, + "logits/chosen": -0.1609061360359192, + "logits/rejected": -0.16112622618675232, + "logps/chosen": -378.368408203125, + "logps/rejected": -468.166015625, + "loss": 0.0124, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.828129768371582, + "rewards/margins": 10.233999252319336, + "rewards/rejected": -16.062129974365234, + "step": 906 + }, + { + "epoch": 0.5642301710730949, + "grad_norm": 0.012283282354474068, + "learning_rate": 5.166666666666667e-07, + "logits/chosen": 0.0005133431404829025, + "logits/rejected": -0.1820628046989441, + "logps/chosen": -390.963134765625, + "logps/rejected": -762.43408203125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.100973129272461, + "rewards/margins": 15.296104431152344, + "rewards/rejected": -19.397077560424805, + "step": 907 + }, + { + "epoch": 0.5648522550544324, + "grad_norm": 0.4727984368801117, + "learning_rate": 5.111111111111112e-07, + "logits/chosen": -0.012810694053769112, + "logits/rejected": -0.141657292842865, + "logps/chosen": -506.322021484375, + "logps/rejected": -763.9342041015625, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.831555366516113, + "rewards/margins": 12.667049407958984, + "rewards/rejected": -19.49860382080078, + "step": 908 + }, + { + "epoch": 0.5654743390357698, + "grad_norm": 0.11106092482805252, + "learning_rate": 5.055555555555556e-07, + "logits/chosen": -0.1638554483652115, + "logits/rejected": -0.2685515284538269, + "logps/chosen": -244.2712860107422, + "logps/rejected": -552.2424926757812, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.179596900939941, + "rewards/margins": 13.380350112915039, + "rewards/rejected": -19.559947967529297, + "step": 909 + }, + { + "epoch": 0.5660964230171073, + "grad_norm": 0.5243893265724182, + "learning_rate": 5.000000000000001e-07, + "logits/chosen": -0.1559494435787201, + "logits/rejected": -0.27840447425842285, + "logps/chosen": -465.1278381347656, + "logps/rejected": -742.3772583007812, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.935652256011963, + "rewards/margins": 11.955910682678223, + "rewards/rejected": -17.891563415527344, + "step": 910 + }, + { + "epoch": 0.5667185069984448, + "grad_norm": 0.16543732583522797, + "learning_rate": 4.944444444444445e-07, + "logits/chosen": -0.12534503638744354, + "logits/rejected": -0.20476460456848145, + "logps/chosen": -497.0600280761719, + "logps/rejected": -652.06591796875, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.643803119659424, + "rewards/margins": 10.384767532348633, + "rewards/rejected": -17.0285701751709, + "step": 911 + }, + { + "epoch": 0.5673405909797823, + "grad_norm": 0.08982028067111969, + "learning_rate": 4.88888888888889e-07, + "logits/chosen": -0.14199914038181305, + "logits/rejected": -0.16620983183383942, + "logps/chosen": -478.5483093261719, + "logps/rejected": -653.017333984375, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.212128162384033, + "rewards/margins": 14.567411422729492, + "rewards/rejected": -20.779539108276367, + "step": 912 + }, + { + "epoch": 0.5679626749611197, + "grad_norm": 9.143415451049805, + "learning_rate": 4.833333333333334e-07, + "logits/chosen": -0.17913131415843964, + "logits/rejected": -0.23153036832809448, + "logps/chosen": -284.7883605957031, + "logps/rejected": -492.8653564453125, + "loss": 0.1025, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.652069091796875, + "rewards/margins": 11.854927062988281, + "rewards/rejected": -16.506994247436523, + "step": 913 + }, + { + "epoch": 0.5685847589424572, + "grad_norm": 16.70577621459961, + "learning_rate": 4.777777777777778e-07, + "logits/chosen": -0.09874202311038971, + "logits/rejected": -0.16406746208667755, + "logps/chosen": -341.44427490234375, + "logps/rejected": -587.882568359375, + "loss": 0.1827, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.519284725189209, + "rewards/margins": 11.650325775146484, + "rewards/rejected": -18.16960906982422, + "step": 914 + }, + { + "epoch": 0.5692068429237948, + "grad_norm": 1.225197672843933, + "learning_rate": 4.7222222222222226e-07, + "logits/chosen": -0.14438487589359283, + "logits/rejected": -0.24372327327728271, + "logps/chosen": -490.1650390625, + "logps/rejected": -717.9415893554688, + "loss": 0.0102, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.548934936523438, + "rewards/margins": 11.42662239074707, + "rewards/rejected": -19.97555923461914, + "step": 915 + }, + { + "epoch": 0.5698289269051322, + "grad_norm": 0.08595186471939087, + "learning_rate": 4.666666666666667e-07, + "logits/chosen": -0.016423068940639496, + "logits/rejected": -0.15606629848480225, + "logps/chosen": -327.37744140625, + "logps/rejected": -653.3740844726562, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.819915294647217, + "rewards/margins": 14.082674026489258, + "rewards/rejected": -19.902589797973633, + "step": 916 + }, + { + "epoch": 0.5704510108864697, + "grad_norm": 1.2695430517196655, + "learning_rate": 4.611111111111111e-07, + "logits/chosen": -0.02233402617275715, + "logits/rejected": -0.09145306795835495, + "logps/chosen": -206.2130126953125, + "logps/rejected": -483.6703186035156, + "loss": 0.0196, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.390079498291016, + "rewards/margins": 15.489239692687988, + "rewards/rejected": -20.87932014465332, + "step": 917 + }, + { + "epoch": 0.5710730948678071, + "grad_norm": 2.2679624557495117, + "learning_rate": 4.5555555555555563e-07, + "logits/chosen": 0.04085580259561539, + "logits/rejected": -0.15909084677696228, + "logps/chosen": -266.6771240234375, + "logps/rejected": -621.5974731445312, + "loss": 0.0318, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.382062911987305, + "rewards/margins": 16.21413230895996, + "rewards/rejected": -21.596195220947266, + "step": 918 + }, + { + "epoch": 0.5716951788491447, + "grad_norm": 0.032155707478523254, + "learning_rate": 4.5000000000000003e-07, + "logits/chosen": -0.14174522459506989, + "logits/rejected": -0.1900528520345688, + "logps/chosen": -449.571044921875, + "logps/rejected": -688.2855224609375, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.439013957977295, + "rewards/margins": 11.895275115966797, + "rewards/rejected": -19.334287643432617, + "step": 919 + }, + { + "epoch": 0.5723172628304821, + "grad_norm": 3.8114731311798096, + "learning_rate": 4.444444444444445e-07, + "logits/chosen": -0.11737746000289917, + "logits/rejected": -0.10855232924222946, + "logps/chosen": -398.8075866699219, + "logps/rejected": -698.47998046875, + "loss": 0.028, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.646176338195801, + "rewards/margins": 10.34241008758545, + "rewards/rejected": -15.98858642578125, + "step": 920 + }, + { + "epoch": 0.5729393468118196, + "grad_norm": 0.005684667732566595, + "learning_rate": 4.388888888888889e-07, + "logits/chosen": -0.16741794347763062, + "logits/rejected": -0.26522406935691833, + "logps/chosen": -431.4114990234375, + "logps/rejected": -708.2075805664062, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.284224987030029, + "rewards/margins": 14.803743362426758, + "rewards/rejected": -20.087968826293945, + "step": 921 + }, + { + "epoch": 0.573561430793157, + "grad_norm": 0.4710279405117035, + "learning_rate": 4.333333333333334e-07, + "logits/chosen": -0.1566358506679535, + "logits/rejected": -0.25515982508659363, + "logps/chosen": -314.4952697753906, + "logps/rejected": -630.3221435546875, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.383520603179932, + "rewards/margins": 10.817621231079102, + "rewards/rejected": -17.201141357421875, + "step": 922 + }, + { + "epoch": 0.5741835147744946, + "grad_norm": 1.4161458015441895, + "learning_rate": 4.277777777777778e-07, + "logits/chosen": -0.1422313153743744, + "logits/rejected": -0.2073248326778412, + "logps/chosen": -571.3612060546875, + "logps/rejected": -734.2919311523438, + "loss": 0.0121, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.92413330078125, + "rewards/margins": 12.520830154418945, + "rewards/rejected": -21.444961547851562, + "step": 923 + }, + { + "epoch": 0.574805598755832, + "grad_norm": 0.03369239717721939, + "learning_rate": 4.2222222222222226e-07, + "logits/chosen": -0.07113811373710632, + "logits/rejected": -0.13425123691558838, + "logps/chosen": -448.33642578125, + "logps/rejected": -578.402587890625, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.699841499328613, + "rewards/margins": 11.654420852661133, + "rewards/rejected": -19.354263305664062, + "step": 924 + }, + { + "epoch": 0.5754276827371695, + "grad_norm": 0.029124926775693893, + "learning_rate": 4.1666666666666667e-07, + "logits/chosen": -0.06968679279088974, + "logits/rejected": -0.16744616627693176, + "logps/chosen": -199.85435485839844, + "logps/rejected": -528.104736328125, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6088180541992188, + "rewards/margins": 18.708251953125, + "rewards/rejected": -22.31707000732422, + "step": 925 + }, + { + "epoch": 0.576049766718507, + "grad_norm": 1.0575337409973145, + "learning_rate": 4.111111111111112e-07, + "logits/chosen": -0.1803208440542221, + "logits/rejected": -0.2370225191116333, + "logps/chosen": -374.892578125, + "logps/rejected": -533.2374267578125, + "loss": 0.0091, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.645905017852783, + "rewards/margins": 10.530025482177734, + "rewards/rejected": -16.17593002319336, + "step": 926 + }, + { + "epoch": 0.5766718506998445, + "grad_norm": 0.004277026280760765, + "learning_rate": 4.055555555555556e-07, + "logits/chosen": -0.1017603948712349, + "logits/rejected": -0.10668627917766571, + "logps/chosen": -448.9598083496094, + "logps/rejected": -760.168701171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.068013668060303, + "rewards/margins": 20.150733947753906, + "rewards/rejected": -27.21875, + "step": 927 + }, + { + "epoch": 0.577293934681182, + "grad_norm": 0.055350467562675476, + "learning_rate": 4.0000000000000003e-07, + "logits/chosen": -0.14868000149726868, + "logits/rejected": -0.21844668686389923, + "logps/chosen": -307.8984069824219, + "logps/rejected": -552.7166137695312, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.971147060394287, + "rewards/margins": 15.226062774658203, + "rewards/rejected": -21.19721031188965, + "step": 928 + }, + { + "epoch": 0.5779160186625194, + "grad_norm": 0.00021855716477148235, + "learning_rate": 3.9444444444444444e-07, + "logits/chosen": 0.056283533573150635, + "logits/rejected": -0.08859264105558395, + "logps/chosen": -521.605712890625, + "logps/rejected": -815.8853759765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.173193454742432, + "rewards/margins": 18.710405349731445, + "rewards/rejected": -24.88359832763672, + "step": 929 + }, + { + "epoch": 0.578538102643857, + "grad_norm": 3.0008628368377686, + "learning_rate": 3.8888888888888895e-07, + "logits/chosen": -0.16845470666885376, + "logits/rejected": -0.19171449542045593, + "logps/chosen": -405.1419677734375, + "logps/rejected": -512.3128662109375, + "loss": 0.0484, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.733224868774414, + "rewards/margins": 13.844807624816895, + "rewards/rejected": -19.578033447265625, + "step": 930 + }, + { + "epoch": 0.5791601866251944, + "grad_norm": 0.09574901312589645, + "learning_rate": 3.8333333333333335e-07, + "logits/chosen": -0.027104195207357407, + "logits/rejected": -0.11811557412147522, + "logps/chosen": -314.9927062988281, + "logps/rejected": -527.8656616210938, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.482093334197998, + "rewards/margins": 10.763703346252441, + "rewards/rejected": -14.245797157287598, + "step": 931 + }, + { + "epoch": 0.5797822706065319, + "grad_norm": 0.3817373216152191, + "learning_rate": 3.777777777777778e-07, + "logits/chosen": -0.10936226695775986, + "logits/rejected": -0.19887566566467285, + "logps/chosen": -484.7103271484375, + "logps/rejected": -612.5166015625, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7887473106384277, + "rewards/margins": 9.765079498291016, + "rewards/rejected": -13.553826332092285, + "step": 932 + }, + { + "epoch": 0.5804043545878693, + "grad_norm": 1.274432897567749, + "learning_rate": 3.722222222222222e-07, + "logits/chosen": -0.10899853706359863, + "logits/rejected": -0.16873008012771606, + "logps/chosen": -461.8088073730469, + "logps/rejected": -645.8157958984375, + "loss": 0.0138, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.818090438842773, + "rewards/margins": 12.055378913879395, + "rewards/rejected": -17.873470306396484, + "step": 933 + }, + { + "epoch": 0.5810264385692069, + "grad_norm": 3.055016040802002, + "learning_rate": 3.666666666666667e-07, + "logits/chosen": -0.056296851485967636, + "logits/rejected": -0.11693690717220306, + "logps/chosen": -485.4197998046875, + "logps/rejected": -800.8173217773438, + "loss": 0.0215, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.5564961433410645, + "rewards/margins": 14.01213264465332, + "rewards/rejected": -19.568626403808594, + "step": 934 + }, + { + "epoch": 0.5816485225505443, + "grad_norm": 22.70096206665039, + "learning_rate": 3.611111111111111e-07, + "logits/chosen": -0.15096169710159302, + "logits/rejected": -0.16625237464904785, + "logps/chosen": -410.192138671875, + "logps/rejected": -544.0167236328125, + "loss": 0.5482, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.8273024559021, + "rewards/margins": 10.687788009643555, + "rewards/rejected": -16.515090942382812, + "step": 935 + }, + { + "epoch": 0.5822706065318818, + "grad_norm": 0.04782518371939659, + "learning_rate": 3.555555555555556e-07, + "logits/chosen": -0.06998179852962494, + "logits/rejected": -0.13976743817329407, + "logps/chosen": -403.09320068359375, + "logps/rejected": -691.07421875, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.375682830810547, + "rewards/margins": 18.689632415771484, + "rewards/rejected": -24.06531524658203, + "step": 936 + }, + { + "epoch": 0.5828926905132192, + "grad_norm": 0.1057569608092308, + "learning_rate": 3.5000000000000004e-07, + "logits/chosen": -0.10755689442157745, + "logits/rejected": -0.20193353295326233, + "logps/chosen": -588.2215576171875, + "logps/rejected": -756.331298828125, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.982949256896973, + "rewards/margins": 13.871467590332031, + "rewards/rejected": -19.854415893554688, + "step": 937 + }, + { + "epoch": 0.5835147744945568, + "grad_norm": 0.11293259263038635, + "learning_rate": 3.444444444444445e-07, + "logits/chosen": -0.06790734827518463, + "logits/rejected": -0.1973171830177307, + "logps/chosen": -190.36285400390625, + "logps/rejected": -556.5388793945312, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0512185096740723, + "rewards/margins": 14.098311424255371, + "rewards/rejected": -17.14952850341797, + "step": 938 + }, + { + "epoch": 0.5841368584758942, + "grad_norm": 5.4528961181640625, + "learning_rate": 3.3888888888888895e-07, + "logits/chosen": -0.16578200459480286, + "logits/rejected": -0.17942333221435547, + "logps/chosen": -541.2430419921875, + "logps/rejected": -521.4286499023438, + "loss": 0.0373, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.198036193847656, + "rewards/margins": 10.667282104492188, + "rewards/rejected": -15.865318298339844, + "step": 939 + }, + { + "epoch": 0.5847589424572317, + "grad_norm": 2.315322160720825, + "learning_rate": 3.3333333333333335e-07, + "logits/chosen": -0.19601251184940338, + "logits/rejected": -0.22224481403827667, + "logps/chosen": -286.32171630859375, + "logps/rejected": -474.5668640136719, + "loss": 0.0965, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.360180854797363, + "rewards/margins": 10.261273384094238, + "rewards/rejected": -14.621456146240234, + "step": 940 + }, + { + "epoch": 0.5853810264385692, + "grad_norm": 6.288290500640869, + "learning_rate": 3.277777777777778e-07, + "logits/chosen": -0.08888668566942215, + "logits/rejected": -0.07497645169496536, + "logps/chosen": -547.2893676757812, + "logps/rejected": -674.3657836914062, + "loss": 0.1256, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.156732559204102, + "rewards/margins": 11.964634895324707, + "rewards/rejected": -22.121370315551758, + "step": 941 + }, + { + "epoch": 0.5860031104199067, + "grad_norm": 0.21400593221187592, + "learning_rate": 3.2222222222222227e-07, + "logits/chosen": -0.03957169130444527, + "logits/rejected": -0.1689032018184662, + "logps/chosen": -332.2846374511719, + "logps/rejected": -645.13232421875, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.372023582458496, + "rewards/margins": 14.312931060791016, + "rewards/rejected": -21.684953689575195, + "step": 942 + }, + { + "epoch": 0.5866251944012442, + "grad_norm": 7.27147102355957, + "learning_rate": 3.166666666666667e-07, + "logits/chosen": -0.07867459952831268, + "logits/rejected": -0.19862180948257446, + "logps/chosen": -397.5509033203125, + "logps/rejected": -600.814697265625, + "loss": 0.048, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.240923881530762, + "rewards/margins": 10.414289474487305, + "rewards/rejected": -18.65521240234375, + "step": 943 + }, + { + "epoch": 0.5872472783825816, + "grad_norm": 1.1487442255020142, + "learning_rate": 3.111111111111111e-07, + "logits/chosen": -0.08701802790164948, + "logits/rejected": -0.16848428547382355, + "logps/chosen": -431.9360046386719, + "logps/rejected": -589.8011474609375, + "loss": 0.0153, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.523426055908203, + "rewards/margins": 12.11369514465332, + "rewards/rejected": -16.637121200561523, + "step": 944 + }, + { + "epoch": 0.5878693623639192, + "grad_norm": 0.001029341947287321, + "learning_rate": 3.055555555555556e-07, + "logits/chosen": 0.04855077341198921, + "logits/rejected": -0.17110806703567505, + "logps/chosen": -277.65692138671875, + "logps/rejected": -658.63720703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.618890762329102, + "rewards/margins": 17.13446807861328, + "rewards/rejected": -23.753360748291016, + "step": 945 + }, + { + "epoch": 0.5884914463452566, + "grad_norm": 5.48944616317749, + "learning_rate": 3.0000000000000004e-07, + "logits/chosen": -0.07804742455482483, + "logits/rejected": -0.13293352723121643, + "logps/chosen": -282.9705810546875, + "logps/rejected": -437.17034912109375, + "loss": 0.0594, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.170144081115723, + "rewards/margins": 10.368755340576172, + "rewards/rejected": -16.53890037536621, + "step": 946 + }, + { + "epoch": 0.5891135303265941, + "grad_norm": 0.0020194146782159805, + "learning_rate": 2.9444444444444444e-07, + "logits/chosen": -0.04791930317878723, + "logits/rejected": -0.144740492105484, + "logps/chosen": -285.6473388671875, + "logps/rejected": -604.849365234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.96201229095459, + "rewards/margins": 14.040616989135742, + "rewards/rejected": -21.002628326416016, + "step": 947 + }, + { + "epoch": 0.5897356143079315, + "grad_norm": 0.17419804632663727, + "learning_rate": 2.888888888888889e-07, + "logits/chosen": -0.0936560183763504, + "logits/rejected": -0.28556597232818604, + "logps/chosen": -228.4469451904297, + "logps/rejected": -743.6260375976562, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.099587917327881, + "rewards/margins": 15.566191673278809, + "rewards/rejected": -18.66577911376953, + "step": 948 + }, + { + "epoch": 0.5903576982892691, + "grad_norm": 3.2199866771698, + "learning_rate": 2.8333333333333336e-07, + "logits/chosen": -0.11061854660511017, + "logits/rejected": -0.16628184914588928, + "logps/chosen": -419.2809143066406, + "logps/rejected": -466.8538513183594, + "loss": 0.0291, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.118745803833008, + "rewards/margins": 10.695882797241211, + "rewards/rejected": -14.814629554748535, + "step": 949 + }, + { + "epoch": 0.5909797822706065, + "grad_norm": 0.003019323805347085, + "learning_rate": 2.7777777777777776e-07, + "logits/chosen": -0.1654212772846222, + "logits/rejected": -0.23847834765911102, + "logps/chosen": -373.5917053222656, + "logps/rejected": -641.8060913085938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.203384399414062, + "rewards/margins": 16.09211540222168, + "rewards/rejected": -24.295501708984375, + "step": 950 + }, + { + "epoch": 0.591601866251944, + "grad_norm": 0.025611286982893944, + "learning_rate": 2.722222222222222e-07, + "logits/chosen": -0.019785813987255096, + "logits/rejected": -0.11858349293470383, + "logps/chosen": -285.9967041015625, + "logps/rejected": -498.4588928222656, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.526800155639648, + "rewards/margins": 13.301270484924316, + "rewards/rejected": -17.82806968688965, + "step": 951 + }, + { + "epoch": 0.5922239502332814, + "grad_norm": 1.3907132148742676, + "learning_rate": 2.666666666666667e-07, + "logits/chosen": -0.16363373398780823, + "logits/rejected": -0.20473578572273254, + "logps/chosen": -578.28515625, + "logps/rejected": -690.71630859375, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.961004257202148, + "rewards/margins": 9.755850791931152, + "rewards/rejected": -16.716854095458984, + "step": 952 + }, + { + "epoch": 0.592846034214619, + "grad_norm": 0.0023125142324715853, + "learning_rate": 2.6111111111111113e-07, + "logits/chosen": -0.1231946051120758, + "logits/rejected": -0.1937326341867447, + "logps/chosen": -600.8501586914062, + "logps/rejected": -836.072265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.908215045928955, + "rewards/margins": 17.251995086669922, + "rewards/rejected": -23.16020965576172, + "step": 953 + }, + { + "epoch": 0.5934681181959565, + "grad_norm": 14.315815925598145, + "learning_rate": 2.555555555555556e-07, + "logits/chosen": -0.19726470112800598, + "logits/rejected": -0.1924193650484085, + "logps/chosen": -422.891357421875, + "logps/rejected": -498.3763732910156, + "loss": 0.4767, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.893457412719727, + "rewards/margins": 10.094686508178711, + "rewards/rejected": -15.988143920898438, + "step": 954 + }, + { + "epoch": 0.5940902021772939, + "grad_norm": 0.28058430552482605, + "learning_rate": 2.5000000000000004e-07, + "logits/chosen": -0.09799051284790039, + "logits/rejected": -0.18414980173110962, + "logps/chosen": -296.8705139160156, + "logps/rejected": -616.1451416015625, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.579868316650391, + "rewards/margins": 13.945685386657715, + "rewards/rejected": -18.525554656982422, + "step": 955 + }, + { + "epoch": 0.5947122861586314, + "grad_norm": 0.567885160446167, + "learning_rate": 2.444444444444445e-07, + "logits/chosen": -0.17700766026973724, + "logits/rejected": -0.26745501160621643, + "logps/chosen": -323.0926513671875, + "logps/rejected": -532.8177490234375, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.055389404296875, + "rewards/margins": 9.770513534545898, + "rewards/rejected": -14.825901985168457, + "step": 956 + }, + { + "epoch": 0.5953343701399689, + "grad_norm": 0.0003429602656979114, + "learning_rate": 2.388888888888889e-07, + "logits/chosen": -0.16062578558921814, + "logits/rejected": -0.20720486342906952, + "logps/chosen": -259.6952819824219, + "logps/rejected": -708.463134765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.374472618103027, + "rewards/margins": 22.57442283630371, + "rewards/rejected": -28.948894500732422, + "step": 957 + }, + { + "epoch": 0.5959564541213064, + "grad_norm": 3.378014326095581, + "learning_rate": 2.3333333333333336e-07, + "logits/chosen": -0.05014656484127045, + "logits/rejected": -0.006142571568489075, + "logps/chosen": -371.0225830078125, + "logps/rejected": -426.4797058105469, + "loss": 0.0864, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.998368263244629, + "rewards/margins": 7.749053478240967, + "rewards/rejected": -15.747421264648438, + "step": 958 + }, + { + "epoch": 0.5965785381026438, + "grad_norm": 0.053706925362348557, + "learning_rate": 2.2777777777777781e-07, + "logits/chosen": -0.17318032681941986, + "logits/rejected": -0.2223173826932907, + "logps/chosen": -336.95654296875, + "logps/rejected": -652.204345703125, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.490747451782227, + "rewards/margins": 14.091325759887695, + "rewards/rejected": -20.582073211669922, + "step": 959 + }, + { + "epoch": 0.5972006220839814, + "grad_norm": 0.06210765242576599, + "learning_rate": 2.2222222222222224e-07, + "logits/chosen": -0.12771883606910706, + "logits/rejected": -0.1767701953649521, + "logps/chosen": -515.870361328125, + "logps/rejected": -667.3025512695312, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.69327449798584, + "rewards/margins": 12.756893157958984, + "rewards/rejected": -19.450166702270508, + "step": 960 + }, + { + "epoch": 0.5978227060653188, + "grad_norm": 0.2277175933122635, + "learning_rate": 2.166666666666667e-07, + "logits/chosen": 0.009233126416802406, + "logits/rejected": -0.12659485638141632, + "logps/chosen": -342.41802978515625, + "logps/rejected": -719.2701416015625, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.477318286895752, + "rewards/margins": 14.717327117919922, + "rewards/rejected": -20.194644927978516, + "step": 961 + }, + { + "epoch": 0.5984447900466563, + "grad_norm": 0.17974837124347687, + "learning_rate": 2.1111111111111113e-07, + "logits/chosen": -0.09429922699928284, + "logits/rejected": -0.2365911602973938, + "logps/chosen": -251.5556182861328, + "logps/rejected": -597.9002685546875, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.30790901184082, + "rewards/margins": 15.119089126586914, + "rewards/rejected": -19.426998138427734, + "step": 962 + }, + { + "epoch": 0.5990668740279937, + "grad_norm": 0.45290467143058777, + "learning_rate": 2.055555555555556e-07, + "logits/chosen": -0.15474756062030792, + "logits/rejected": -0.16858980059623718, + "logps/chosen": -392.7610168457031, + "logps/rejected": -531.8675537109375, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.710953712463379, + "rewards/margins": 10.490254402160645, + "rewards/rejected": -15.201208114624023, + "step": 963 + }, + { + "epoch": 0.5996889580093313, + "grad_norm": 0.04285069555044174, + "learning_rate": 2.0000000000000002e-07, + "logits/chosen": -0.07530153542757034, + "logits/rejected": -0.14746886491775513, + "logps/chosen": -253.17449951171875, + "logps/rejected": -512.3123779296875, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.048408508300781, + "rewards/margins": 16.373149871826172, + "rewards/rejected": -20.421558380126953, + "step": 964 + }, + { + "epoch": 0.6003110419906688, + "grad_norm": 0.022624600678682327, + "learning_rate": 1.9444444444444447e-07, + "logits/chosen": -0.18949706852436066, + "logits/rejected": -0.24511560797691345, + "logps/chosen": -376.66925048828125, + "logps/rejected": -708.326416015625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.887449264526367, + "rewards/margins": 14.771350860595703, + "rewards/rejected": -19.65880012512207, + "step": 965 + }, + { + "epoch": 0.6009331259720062, + "grad_norm": 0.31223249435424805, + "learning_rate": 1.888888888888889e-07, + "logits/chosen": -0.17197266221046448, + "logits/rejected": -0.22930482029914856, + "logps/chosen": -332.7516174316406, + "logps/rejected": -532.8316040039062, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.003329277038574, + "rewards/margins": 11.795023918151855, + "rewards/rejected": -19.798351287841797, + "step": 966 + }, + { + "epoch": 0.6015552099533437, + "grad_norm": 0.19846542179584503, + "learning_rate": 1.8333333333333336e-07, + "logits/chosen": -0.048394568264484406, + "logits/rejected": -0.1507554054260254, + "logps/chosen": -264.4502868652344, + "logps/rejected": -509.6466979980469, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.049882411956787, + "rewards/margins": 14.112605094909668, + "rewards/rejected": -18.162487030029297, + "step": 967 + }, + { + "epoch": 0.6021772939346812, + "grad_norm": 0.2126728892326355, + "learning_rate": 1.777777777777778e-07, + "logits/chosen": -0.09563367068767548, + "logits/rejected": -0.20723724365234375, + "logps/chosen": -327.195068359375, + "logps/rejected": -658.9421997070312, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.178316116333008, + "rewards/margins": 15.883522033691406, + "rewards/rejected": -20.061840057373047, + "step": 968 + }, + { + "epoch": 0.6027993779160187, + "grad_norm": 0.29353633522987366, + "learning_rate": 1.7222222222222225e-07, + "logits/chosen": -0.15504950284957886, + "logits/rejected": -0.22116543352603912, + "logps/chosen": -332.7735290527344, + "logps/rejected": -498.68817138671875, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.426469326019287, + "rewards/margins": 10.015861511230469, + "rewards/rejected": -15.442331314086914, + "step": 969 + }, + { + "epoch": 0.6034214618973561, + "grad_norm": 0.008527892641723156, + "learning_rate": 1.6666666666666668e-07, + "logits/chosen": -0.08138444274663925, + "logits/rejected": -0.16231387853622437, + "logps/chosen": -409.7948303222656, + "logps/rejected": -644.8119506835938, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.172708511352539, + "rewards/margins": 14.5297269821167, + "rewards/rejected": -21.702436447143555, + "step": 970 + }, + { + "epoch": 0.6040435458786936, + "grad_norm": 1.415183186531067, + "learning_rate": 1.6111111111111113e-07, + "logits/chosen": -0.09529612213373184, + "logits/rejected": -0.14113754034042358, + "logps/chosen": -263.0989074707031, + "logps/rejected": -467.4256286621094, + "loss": 0.0113, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.548853874206543, + "rewards/margins": 12.65938949584961, + "rewards/rejected": -17.20824432373047, + "step": 971 + }, + { + "epoch": 0.6046656298600311, + "grad_norm": 0.02875661477446556, + "learning_rate": 1.5555555555555556e-07, + "logits/chosen": -0.20879468321800232, + "logits/rejected": -0.2822430729866028, + "logps/chosen": -364.8992004394531, + "logps/rejected": -595.9520874023438, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.066924571990967, + "rewards/margins": 12.317537307739258, + "rewards/rejected": -16.384462356567383, + "step": 972 + }, + { + "epoch": 0.6052877138413686, + "grad_norm": 12.605457305908203, + "learning_rate": 1.5000000000000002e-07, + "logits/chosen": -0.08637724071741104, + "logits/rejected": -0.1715412735939026, + "logps/chosen": -291.0902099609375, + "logps/rejected": -581.4862670898438, + "loss": 0.1108, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.216606616973877, + "rewards/margins": 13.213994979858398, + "rewards/rejected": -19.430599212646484, + "step": 973 + }, + { + "epoch": 0.605909797822706, + "grad_norm": 0.17934511601924896, + "learning_rate": 1.4444444444444445e-07, + "logits/chosen": 0.030429325997829437, + "logits/rejected": -0.18844103813171387, + "logps/chosen": -334.04742431640625, + "logps/rejected": -740.1949462890625, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.061784267425537, + "rewards/margins": 14.036367416381836, + "rewards/rejected": -19.09815216064453, + "step": 974 + }, + { + "epoch": 0.6065318818040435, + "grad_norm": 0.06465157121419907, + "learning_rate": 1.3888888888888888e-07, + "logits/chosen": -0.08784466981887817, + "logits/rejected": -0.09879573434591293, + "logps/chosen": -423.0223388671875, + "logps/rejected": -741.9646606445312, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.515133857727051, + "rewards/margins": 16.499309539794922, + "rewards/rejected": -22.014442443847656, + "step": 975 + }, + { + "epoch": 0.6071539657853811, + "grad_norm": 0.12453171610832214, + "learning_rate": 1.3333333333333336e-07, + "logits/chosen": -0.19629894196987152, + "logits/rejected": -0.2520937919616699, + "logps/chosen": -305.9202880859375, + "logps/rejected": -601.223388671875, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.222259521484375, + "rewards/margins": 14.104742050170898, + "rewards/rejected": -19.32699966430664, + "step": 976 + }, + { + "epoch": 0.6077760497667185, + "grad_norm": 0.1534721702337265, + "learning_rate": 1.277777777777778e-07, + "logits/chosen": -0.12924648821353912, + "logits/rejected": -0.2147355079650879, + "logps/chosen": -336.9256591796875, + "logps/rejected": -548.0462646484375, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.697869300842285, + "rewards/margins": 10.756597518920898, + "rewards/rejected": -16.4544677734375, + "step": 977 + }, + { + "epoch": 0.608398133748056, + "grad_norm": 8.723535537719727, + "learning_rate": 1.2222222222222225e-07, + "logits/chosen": -0.11720257997512817, + "logits/rejected": -0.16808898746967316, + "logps/chosen": -415.1446533203125, + "logps/rejected": -653.688232421875, + "loss": 0.0964, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.655577659606934, + "rewards/margins": 15.170005798339844, + "rewards/rejected": -22.825580596923828, + "step": 978 + }, + { + "epoch": 0.6090202177293935, + "grad_norm": 0.08397135883569717, + "learning_rate": 1.1666666666666668e-07, + "logits/chosen": -0.10598370432853699, + "logits/rejected": -0.2061256617307663, + "logps/chosen": -218.11441040039062, + "logps/rejected": -493.979736328125, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.6420488357543945, + "rewards/margins": 13.715932846069336, + "rewards/rejected": -18.35797882080078, + "step": 979 + }, + { + "epoch": 0.609642301710731, + "grad_norm": 0.2912822961807251, + "learning_rate": 1.1111111111111112e-07, + "logits/chosen": -0.07981020212173462, + "logits/rejected": -0.17837537825107574, + "logps/chosen": -363.23504638671875, + "logps/rejected": -584.80517578125, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.916135787963867, + "rewards/margins": 15.333748817443848, + "rewards/rejected": -19.2498836517334, + "step": 980 + }, + { + "epoch": 0.6102643856920684, + "grad_norm": 1.9975814819335938, + "learning_rate": 1.0555555555555557e-07, + "logits/chosen": -0.12037665396928787, + "logits/rejected": -0.209097221493721, + "logps/chosen": -553.438232421875, + "logps/rejected": -764.375, + "loss": 0.0213, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.960785865783691, + "rewards/margins": 13.666773796081543, + "rewards/rejected": -21.627559661865234, + "step": 981 + }, + { + "epoch": 0.6108864696734059, + "grad_norm": 0.0017288518138229847, + "learning_rate": 1.0000000000000001e-07, + "logits/chosen": -0.10199277848005295, + "logits/rejected": -0.2675820589065552, + "logps/chosen": -312.9398193359375, + "logps/rejected": -769.2210083007812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.329739093780518, + "rewards/margins": 19.25160026550293, + "rewards/rejected": -24.581340789794922, + "step": 982 + }, + { + "epoch": 0.6115085536547434, + "grad_norm": 0.1516176462173462, + "learning_rate": 9.444444444444445e-08, + "logits/chosen": -0.2142464816570282, + "logits/rejected": -0.2178041636943817, + "logps/chosen": -242.07122802734375, + "logps/rejected": -430.35247802734375, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.038417816162109, + "rewards/margins": 11.36536979675293, + "rewards/rejected": -15.403787612915039, + "step": 983 + }, + { + "epoch": 0.6121306376360809, + "grad_norm": 0.4500713348388672, + "learning_rate": 8.88888888888889e-08, + "logits/chosen": -0.1276463121175766, + "logits/rejected": -0.21693138778209686, + "logps/chosen": -388.90093994140625, + "logps/rejected": -637.0849609375, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.70115852355957, + "rewards/margins": 14.504281997680664, + "rewards/rejected": -19.205440521240234, + "step": 984 + }, + { + "epoch": 0.6127527216174183, + "grad_norm": 2.287808418273926, + "learning_rate": 8.333333333333334e-08, + "logits/chosen": -0.06157265603542328, + "logits/rejected": -0.14640533924102783, + "logps/chosen": -551.8743896484375, + "logps/rejected": -782.920166015625, + "loss": 0.0156, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.059138298034668, + "rewards/margins": 14.505857467651367, + "rewards/rejected": -23.56499671936035, + "step": 985 + }, + { + "epoch": 0.6133748055987558, + "grad_norm": 0.003867674618959427, + "learning_rate": 7.777777777777778e-08, + "logits/chosen": -0.061310332268476486, + "logits/rejected": -0.1948992908000946, + "logps/chosen": -386.36541748046875, + "logps/rejected": -702.05029296875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1613779067993164, + "rewards/margins": 18.125558853149414, + "rewards/rejected": -21.28693389892578, + "step": 986 + }, + { + "epoch": 0.6139968895800934, + "grad_norm": 0.031196242198348045, + "learning_rate": 7.222222222222222e-08, + "logits/chosen": -0.10236930847167969, + "logits/rejected": -0.17077046632766724, + "logps/chosen": -337.6380920410156, + "logps/rejected": -572.47802734375, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.682240962982178, + "rewards/margins": 13.58427619934082, + "rewards/rejected": -18.266517639160156, + "step": 987 + }, + { + "epoch": 0.6146189735614308, + "grad_norm": 0.053422823548316956, + "learning_rate": 6.666666666666668e-08, + "logits/chosen": -0.07512759417295456, + "logits/rejected": -0.15700417757034302, + "logps/chosen": -579.5989990234375, + "logps/rejected": -670.5982666015625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.205841064453125, + "rewards/margins": 13.896177291870117, + "rewards/rejected": -22.10201644897461, + "step": 988 + }, + { + "epoch": 0.6152410575427683, + "grad_norm": 0.25244781374931335, + "learning_rate": 6.111111111111112e-08, + "logits/chosen": -0.2211957573890686, + "logits/rejected": -0.2574540078639984, + "logps/chosen": -246.4641571044922, + "logps/rejected": -452.1832580566406, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.450385093688965, + "rewards/margins": 11.89030933380127, + "rewards/rejected": -18.340694427490234, + "step": 989 + }, + { + "epoch": 0.6158631415241057, + "grad_norm": 8.850515365600586, + "learning_rate": 5.555555555555556e-08, + "logits/chosen": -0.21874357759952545, + "logits/rejected": -0.24778583645820618, + "logps/chosen": -435.7833557128906, + "logps/rejected": -557.0269775390625, + "loss": 0.1137, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.025851249694824, + "rewards/margins": 11.219110488891602, + "rewards/rejected": -17.24496078491211, + "step": 990 + }, + { + "epoch": 0.6164852255054433, + "grad_norm": 7.452006816864014, + "learning_rate": 5.0000000000000004e-08, + "logits/chosen": -0.16037732362747192, + "logits/rejected": -0.24099667370319366, + "logps/chosen": -459.7658996582031, + "logps/rejected": -517.275634765625, + "loss": 0.1114, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.483787536621094, + "rewards/margins": 8.557278633117676, + "rewards/rejected": -14.041067123413086, + "step": 991 + }, + { + "epoch": 0.6171073094867807, + "grad_norm": 0.04967445880174637, + "learning_rate": 4.444444444444445e-08, + "logits/chosen": -0.08167940378189087, + "logits/rejected": -0.1765783727169037, + "logps/chosen": -216.80258178710938, + "logps/rejected": -570.129638671875, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.299595832824707, + "rewards/margins": 12.523042678833008, + "rewards/rejected": -17.8226375579834, + "step": 992 + }, + { + "epoch": 0.6177293934681182, + "grad_norm": 1.1257555484771729, + "learning_rate": 3.888888888888889e-08, + "logits/chosen": -0.22494761645793915, + "logits/rejected": -0.25441375374794006, + "logps/chosen": -372.453125, + "logps/rejected": -477.4569091796875, + "loss": 0.011, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.7852935791015625, + "rewards/margins": 8.005215644836426, + "rewards/rejected": -12.790508270263672, + "step": 993 + }, + { + "epoch": 0.6183514774494556, + "grad_norm": 0.005639821756631136, + "learning_rate": 3.333333333333334e-08, + "logits/chosen": -0.035672757774591446, + "logits/rejected": -0.1991538554430008, + "logps/chosen": -370.77227783203125, + "logps/rejected": -715.0587158203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.819668292999268, + "rewards/margins": 17.804977416992188, + "rewards/rejected": -22.62464714050293, + "step": 994 + }, + { + "epoch": 0.6189735614307932, + "grad_norm": 1.280846118927002, + "learning_rate": 2.777777777777778e-08, + "logits/chosen": -0.10067661851644516, + "logits/rejected": -0.20049627125263214, + "logps/chosen": -391.2098083496094, + "logps/rejected": -739.1378173828125, + "loss": 0.0186, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.560197353363037, + "rewards/margins": 13.409440040588379, + "rewards/rejected": -17.969636917114258, + "step": 995 + }, + { + "epoch": 0.6195956454121306, + "grad_norm": 1.3170677423477173, + "learning_rate": 2.2222222222222224e-08, + "logits/chosen": -0.07506420463323593, + "logits/rejected": -0.1836668998003006, + "logps/chosen": -262.58953857421875, + "logps/rejected": -492.2082824707031, + "loss": 0.0123, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.377254009246826, + "rewards/margins": 11.456298828125, + "rewards/rejected": -15.833551406860352, + "step": 996 + }, + { + "epoch": 0.6202177293934681, + "grad_norm": 1.8218214511871338, + "learning_rate": 1.666666666666667e-08, + "logits/chosen": -0.1762431263923645, + "logits/rejected": -0.22069337964057922, + "logps/chosen": -317.37152099609375, + "logps/rejected": -527.6663818359375, + "loss": 0.0168, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.7176618576049805, + "rewards/margins": 10.859783172607422, + "rewards/rejected": -15.577445030212402, + "step": 997 + }, + { + "epoch": 0.6208398133748056, + "grad_norm": 0.13057827949523926, + "learning_rate": 1.1111111111111112e-08, + "logits/chosen": -0.09723896533250809, + "logits/rejected": -0.2526796758174896, + "logps/chosen": -313.3844299316406, + "logps/rejected": -587.695556640625, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.314016342163086, + "rewards/margins": 13.393948554992676, + "rewards/rejected": -18.707965850830078, + "step": 998 + }, + { + "epoch": 0.6214618973561431, + "grad_norm": 4.283413410186768, + "learning_rate": 5.555555555555556e-09, + "logits/chosen": -0.07240301370620728, + "logits/rejected": -0.1720668077468872, + "logps/chosen": -333.8627014160156, + "logps/rejected": -544.05517578125, + "loss": 0.0375, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.653079032897949, + "rewards/margins": 10.947370529174805, + "rewards/rejected": -17.600448608398438, + "step": 999 + }, + { + "epoch": 0.6220839813374806, + "grad_norm": 21.72480583190918, + "learning_rate": 0.0, + "logits/chosen": -0.11657991260290146, + "logits/rejected": -0.1369350254535675, + "logps/chosen": -367.5093994140625, + "logps/rejected": -458.85198974609375, + "loss": 0.5093, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.000164985656738, + "rewards/margins": 10.249628067016602, + "rewards/rejected": -17.249794006347656, + "step": 1000 + } + ], + "logging_steps": 1, + "max_steps": 1000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}