diff --git "a/checkpoint-500/trainer_state.json" "b/checkpoint-500/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-500/trainer_state.json" @@ -0,0 +1,7021 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5215803885773895, + "eval_steps": 500, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 5.000000000000001e-07, + "logits/chosen": -2.2717783451080322, + "logits/rejected": -2.2640371322631836, + "logps/chosen": -200.07493591308594, + "logps/rejected": -200.70086669921875, + "loss": 0.6789, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.023946668952703476, + "rewards/margins": 0.029492639005184174, + "rewards/rejected": -0.005545974709093571, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 1.0000000000000002e-06, + "logits/chosen": -2.2359213829040527, + "logits/rejected": -2.2241828441619873, + "logps/chosen": -188.74400329589844, + "logps/rejected": -181.30078125, + "loss": 0.7042, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.022877119481563568, + "rewards/margins": -0.02025613933801651, + "rewards/rejected": -0.0026209834031760693, + "step": 2 + }, + { + "epoch": 0.0, + "learning_rate": 1.5e-06, + "logits/chosen": -2.2504844665527344, + "logits/rejected": -2.2917656898498535, + "logps/chosen": -182.1482391357422, + "logps/rejected": -201.4050750732422, + "loss": 0.6934, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.007386707700788975, + "rewards/margins": 0.0019398471340537071, + "rewards/rejected": 0.005446866154670715, + "step": 3 + }, + { + "epoch": 0.0, + "learning_rate": 2.0000000000000003e-06, + "logits/chosen": -2.166776180267334, + "logits/rejected": -2.0744781494140625, + "logps/chosen": -173.78936767578125, + "logps/rejected": -150.8326416015625, + "loss": 0.6891, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0020218612626194954, + "rewards/margins": 0.009007596410810947, + "rewards/rejected": -0.011029457673430443, + "step": 4 + }, + { + "epoch": 0.01, + "learning_rate": 2.5e-06, + "logits/chosen": -2.1799259185791016, + "logits/rejected": -2.3425800800323486, + "logps/chosen": -137.8708953857422, + "logps/rejected": -148.37060546875, + "loss": 0.6906, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.012697315774857998, + "rewards/margins": 0.00599064864218235, + "rewards/rejected": 0.006706667132675648, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 3e-06, + "logits/chosen": -2.1913797855377197, + "logits/rejected": -2.1852920055389404, + "logps/chosen": -127.57758331298828, + "logps/rejected": -138.31591796875, + "loss": 0.7037, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.010897636413574219, + "rewards/margins": -0.02074580080807209, + "rewards/rejected": 0.009848165325820446, + "step": 6 + }, + { + "epoch": 0.01, + "learning_rate": 3.5000000000000004e-06, + "logits/chosen": -2.1357011795043945, + "logits/rejected": -2.136214256286621, + "logps/chosen": -139.83346557617188, + "logps/rejected": -145.2589111328125, + "loss": 0.6845, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0056040287017822266, + "rewards/margins": 0.01838543452322483, + "rewards/rejected": -0.01278140489012003, + "step": 7 + }, + { + "epoch": 0.01, + "learning_rate": 4.000000000000001e-06, + "logits/chosen": -2.105727195739746, + "logits/rejected": -2.0614218711853027, + "logps/chosen": -195.34869384765625, + "logps/rejected": -206.2098388671875, + "loss": 0.6963, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.030789854004979134, + "rewards/margins": -0.005677317269146442, + "rewards/rejected": 0.03646716848015785, + "step": 8 + }, + { + "epoch": 0.01, + "learning_rate": 4.5e-06, + "logits/chosen": -2.3017406463623047, + "logits/rejected": -2.3441271781921387, + "logps/chosen": -174.49053955078125, + "logps/rejected": -197.5611572265625, + "loss": 0.6875, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.006907796021550894, + "rewards/margins": 0.012369632720947266, + "rewards/rejected": -0.019277429208159447, + "step": 9 + }, + { + "epoch": 0.01, + "learning_rate": 5e-06, + "logits/chosen": -2.257244825363159, + "logits/rejected": -2.2989351749420166, + "logps/chosen": -182.47164916992188, + "logps/rejected": -170.6004180908203, + "loss": 0.6925, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.009080934338271618, + "rewards/margins": 0.0034890654496848583, + "rewards/rejected": 0.005591869354248047, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 5.500000000000001e-06, + "logits/chosen": -2.084881544113159, + "logits/rejected": -2.1664023399353027, + "logps/chosen": -147.18572998046875, + "logps/rejected": -154.4085693359375, + "loss": 0.7014, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.014512108638882637, + "rewards/margins": -0.0151824951171875, + "rewards/rejected": 0.0006703853141516447, + "step": 11 + }, + { + "epoch": 0.01, + "learning_rate": 6e-06, + "logits/chosen": -2.122969150543213, + "logits/rejected": -2.114781379699707, + "logps/chosen": -238.08493041992188, + "logps/rejected": -221.2827606201172, + "loss": 0.6798, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.003618289018049836, + "rewards/margins": 0.028017427772283554, + "rewards/rejected": -0.02439913898706436, + "step": 12 + }, + { + "epoch": 0.01, + "learning_rate": 6.5000000000000004e-06, + "logits/chosen": -2.3176565170288086, + "logits/rejected": -2.26943039894104, + "logps/chosen": -166.89556884765625, + "logps/rejected": -156.35850524902344, + "loss": 0.6901, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.013709260150790215, + "rewards/margins": 0.007927654311060905, + "rewards/rejected": 0.00578160397708416, + "step": 13 + }, + { + "epoch": 0.01, + "learning_rate": 7.000000000000001e-06, + "logits/chosen": -2.2808492183685303, + "logits/rejected": -2.295313596725464, + "logps/chosen": -158.31381225585938, + "logps/rejected": -165.48663330078125, + "loss": 0.6953, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.01445994433015585, + "rewards/margins": -0.002848696894943714, + "rewards/rejected": 0.017308639362454414, + "step": 14 + }, + { + "epoch": 0.02, + "learning_rate": 7.5e-06, + "logits/chosen": -2.428576707839966, + "logits/rejected": -2.4046826362609863, + "logps/chosen": -198.4075164794922, + "logps/rejected": -199.75180053710938, + "loss": 0.6883, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.012678648345172405, + "rewards/margins": 0.010429286397993565, + "rewards/rejected": 0.0022493600845336914, + "step": 15 + }, + { + "epoch": 0.02, + "learning_rate": 8.000000000000001e-06, + "logits/chosen": -2.102433681488037, + "logits/rejected": -2.098867893218994, + "logps/chosen": -130.66616821289062, + "logps/rejected": -129.43551635742188, + "loss": 0.6897, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.00463492888957262, + "rewards/margins": 0.007227444555610418, + "rewards/rejected": -0.0025925161316990852, + "step": 16 + }, + { + "epoch": 0.02, + "learning_rate": 8.500000000000002e-06, + "logits/chosen": -2.2135839462280273, + "logits/rejected": -2.2183382511138916, + "logps/chosen": -157.07391357421875, + "logps/rejected": -173.192138671875, + "loss": 0.7028, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.00783538818359375, + "rewards/margins": -0.018341876566410065, + "rewards/rejected": 0.010506488382816315, + "step": 17 + }, + { + "epoch": 0.02, + "learning_rate": 9e-06, + "logits/chosen": -2.2919342517852783, + "logits/rejected": -2.3105809688568115, + "logps/chosen": -212.9804229736328, + "logps/rejected": -213.6470947265625, + "loss": 0.6944, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.017333555966615677, + "rewards/margins": -0.0019711018539965153, + "rewards/rejected": 0.019304655492305756, + "step": 18 + }, + { + "epoch": 0.02, + "learning_rate": 9.5e-06, + "logits/chosen": -2.150813579559326, + "logits/rejected": -2.1184804439544678, + "logps/chosen": -164.96514892578125, + "logps/rejected": -159.76754760742188, + "loss": 0.688, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.016628170385956764, + "rewards/margins": 0.01211006660014391, + "rewards/rejected": -0.02873823791742325, + "step": 19 + }, + { + "epoch": 0.02, + "learning_rate": 1e-05, + "logits/chosen": -2.357393264770508, + "logits/rejected": -2.284986734390259, + "logps/chosen": -206.59085083007812, + "logps/rejected": -196.2120819091797, + "loss": 0.6848, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.006941366009414196, + "rewards/margins": 0.018102647736668587, + "rewards/rejected": -0.011161278933286667, + "step": 20 + }, + { + "epoch": 0.02, + "learning_rate": 1.05e-05, + "logits/chosen": -2.1450111865997314, + "logits/rejected": -2.2165582180023193, + "logps/chosen": -157.14804077148438, + "logps/rejected": -169.00897216796875, + "loss": 0.6747, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.01565234735608101, + "rewards/margins": 0.03814287483692169, + "rewards/rejected": -0.022490523755550385, + "step": 21 + }, + { + "epoch": 0.02, + "learning_rate": 1.1000000000000001e-05, + "logits/chosen": -2.194695234298706, + "logits/rejected": -2.1587462425231934, + "logps/chosen": -180.54441833496094, + "logps/rejected": -179.88087463378906, + "loss": 0.6891, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.004329085350036621, + "rewards/margins": 0.008360721170902252, + "rewards/rejected": -0.0040316348895430565, + "step": 22 + }, + { + "epoch": 0.02, + "learning_rate": 1.1500000000000002e-05, + "logits/chosen": -2.224414825439453, + "logits/rejected": -2.2185750007629395, + "logps/chosen": -172.73263549804688, + "logps/rejected": -164.4583282470703, + "loss": 0.6916, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.007931852713227272, + "rewards/margins": 0.0039233677089214325, + "rewards/rejected": -0.011855222284793854, + "step": 23 + }, + { + "epoch": 0.03, + "learning_rate": 1.2e-05, + "logits/chosen": -2.2102274894714355, + "logits/rejected": -2.2018349170684814, + "logps/chosen": -187.68124389648438, + "logps/rejected": -195.62225341796875, + "loss": 0.7161, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.016138983890414238, + "rewards/margins": -0.04391060024499893, + "rewards/rejected": 0.027771614491939545, + "step": 24 + }, + { + "epoch": 0.03, + "learning_rate": 1.25e-05, + "logits/chosen": -2.124497413635254, + "logits/rejected": -2.180361270904541, + "logps/chosen": -173.36505126953125, + "logps/rejected": -188.89918518066406, + "loss": 0.7035, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.021524429321289062, + "rewards/margins": -0.02002444490790367, + "rewards/rejected": -0.0014999869745224714, + "step": 25 + }, + { + "epoch": 0.03, + "learning_rate": 1.3000000000000001e-05, + "logits/chosen": -2.154703140258789, + "logits/rejected": -2.2054295539855957, + "logps/chosen": -161.19815063476562, + "logps/rejected": -172.0135040283203, + "loss": 0.6995, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.007346701342612505, + "rewards/margins": -0.011799763888120651, + "rewards/rejected": 0.004453063011169434, + "step": 26 + }, + { + "epoch": 0.03, + "learning_rate": 1.3500000000000001e-05, + "logits/chosen": -2.2279651165008545, + "logits/rejected": -2.360706329345703, + "logps/chosen": -134.864501953125, + "logps/rejected": -170.91477966308594, + "loss": 0.6831, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.017622999846935272, + "rewards/margins": 0.021318625658750534, + "rewards/rejected": -0.03894162178039551, + "step": 27 + }, + { + "epoch": 0.03, + "learning_rate": 1.4000000000000001e-05, + "logits/chosen": -2.119718551635742, + "logits/rejected": -2.1303789615631104, + "logps/chosen": -153.42706298828125, + "logps/rejected": -149.59426879882812, + "loss": 0.7052, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.022306587547063828, + "rewards/margins": -0.02277979999780655, + "rewards/rejected": 0.00047321245074272156, + "step": 28 + }, + { + "epoch": 0.03, + "learning_rate": 1.45e-05, + "logits/chosen": -2.1661736965179443, + "logits/rejected": -2.200699806213379, + "logps/chosen": -134.8897705078125, + "logps/rejected": -173.7844696044922, + "loss": 0.6753, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.013965796679258347, + "rewards/margins": 0.03791213408112526, + "rewards/rejected": -0.05187792703509331, + "step": 29 + }, + { + "epoch": 0.03, + "learning_rate": 1.5e-05, + "logits/chosen": -2.1092920303344727, + "logits/rejected": -2.1575889587402344, + "logps/chosen": -156.42156982421875, + "logps/rejected": -184.9061737060547, + "loss": 0.6931, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.02043609507381916, + "rewards/margins": 0.0014666561037302017, + "rewards/rejected": -0.02190275304019451, + "step": 30 + }, + { + "epoch": 0.03, + "learning_rate": 1.55e-05, + "logits/chosen": -2.1494805812835693, + "logits/rejected": -2.235766887664795, + "logps/chosen": -147.58779907226562, + "logps/rejected": -176.71292114257812, + "loss": 0.6972, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.020243335515260696, + "rewards/margins": -0.007363701239228249, + "rewards/rejected": -0.012879634276032448, + "step": 31 + }, + { + "epoch": 0.03, + "learning_rate": 1.6000000000000003e-05, + "logits/chosen": -2.0302445888519287, + "logits/rejected": -2.072943687438965, + "logps/chosen": -161.97325134277344, + "logps/rejected": -169.5047149658203, + "loss": 0.7081, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.033078648149967194, + "rewards/margins": -0.026705406606197357, + "rewards/rejected": -0.006373238749802113, + "step": 32 + }, + { + "epoch": 0.03, + "learning_rate": 1.65e-05, + "logits/chosen": -2.1231038570404053, + "logits/rejected": -2.164695978164673, + "logps/chosen": -177.8040313720703, + "logps/rejected": -184.5164794921875, + "loss": 0.7067, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.02300238609313965, + "rewards/margins": -0.026141025125980377, + "rewards/rejected": 0.003138638101518154, + "step": 33 + }, + { + "epoch": 0.04, + "learning_rate": 1.7000000000000003e-05, + "logits/chosen": -2.297323703765869, + "logits/rejected": -2.2965850830078125, + "logps/chosen": -138.19195556640625, + "logps/rejected": -146.45855712890625, + "loss": 0.6963, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.01872837543487549, + "rewards/margins": -0.003586245933547616, + "rewards/rejected": -0.01514213066548109, + "step": 34 + }, + { + "epoch": 0.04, + "learning_rate": 1.75e-05, + "logits/chosen": -1.9681299924850464, + "logits/rejected": -2.0026988983154297, + "logps/chosen": -148.55194091796875, + "logps/rejected": -154.24107360839844, + "loss": 0.6965, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.027061177417635918, + "rewards/margins": -0.003932238090783358, + "rewards/rejected": -0.023128939792513847, + "step": 35 + }, + { + "epoch": 0.04, + "learning_rate": 1.8e-05, + "logits/chosen": -2.351780414581299, + "logits/rejected": -2.4137086868286133, + "logps/chosen": -206.77256774902344, + "logps/rejected": -187.95159912109375, + "loss": 0.6941, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.015692900866270065, + "rewards/margins": -0.0006634213495999575, + "rewards/rejected": -0.015029479749500751, + "step": 36 + }, + { + "epoch": 0.04, + "learning_rate": 1.85e-05, + "logits/chosen": -2.145651340484619, + "logits/rejected": -2.1530380249023438, + "logps/chosen": -174.49249267578125, + "logps/rejected": -170.29107666015625, + "loss": 0.6981, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.05813932418823242, + "rewards/margins": -0.007088134065270424, + "rewards/rejected": -0.05105118826031685, + "step": 37 + }, + { + "epoch": 0.04, + "learning_rate": 1.9e-05, + "logits/chosen": -2.119550943374634, + "logits/rejected": -2.1285836696624756, + "logps/chosen": -158.59103393554688, + "logps/rejected": -161.4877471923828, + "loss": 0.6626, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.004540919791907072, + "rewards/margins": 0.06354211270809174, + "rewards/rejected": -0.0680830180644989, + "step": 38 + }, + { + "epoch": 0.04, + "learning_rate": 1.9500000000000003e-05, + "logits/chosen": -2.0205461978912354, + "logits/rejected": -2.0473952293395996, + "logps/chosen": -132.5583953857422, + "logps/rejected": -158.61367797851562, + "loss": 0.7152, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.05132186412811279, + "rewards/margins": -0.04252650961279869, + "rewards/rejected": -0.008795355446636677, + "step": 39 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "logits/chosen": -2.1021435260772705, + "logits/rejected": -2.056175708770752, + "logps/chosen": -164.51136779785156, + "logps/rejected": -140.69076538085938, + "loss": 0.704, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.042324475944042206, + "rewards/margins": -0.01991286501288414, + "rewards/rejected": -0.022411609068512917, + "step": 40 + }, + { + "epoch": 0.04, + "learning_rate": 2.05e-05, + "logits/chosen": -2.0000710487365723, + "logits/rejected": -2.0445573329925537, + "logps/chosen": -144.15602111816406, + "logps/rejected": -158.8379364013672, + "loss": 0.687, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.007117677479982376, + "rewards/margins": 0.014511799439787865, + "rewards/rejected": -0.007394121494144201, + "step": 41 + }, + { + "epoch": 0.04, + "learning_rate": 2.1e-05, + "logits/chosen": -2.1619393825531006, + "logits/rejected": -2.2450132369995117, + "logps/chosen": -147.3644561767578, + "logps/rejected": -152.8286895751953, + "loss": 0.6965, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.050475671887397766, + "rewards/margins": -0.003598665352910757, + "rewards/rejected": -0.04687700420618057, + "step": 42 + }, + { + "epoch": 0.04, + "learning_rate": 2.15e-05, + "logits/chosen": -2.103001356124878, + "logits/rejected": -2.1376430988311768, + "logps/chosen": -197.96510314941406, + "logps/rejected": -183.20042419433594, + "loss": 0.6886, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.02689351886510849, + "rewards/margins": 0.01161129493266344, + "rewards/rejected": -0.038504816591739655, + "step": 43 + }, + { + "epoch": 0.05, + "learning_rate": 2.2000000000000003e-05, + "logits/chosen": -2.249006748199463, + "logits/rejected": -2.2560691833496094, + "logps/chosen": -183.13180541992188, + "logps/rejected": -191.20266723632812, + "loss": 0.6994, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0708339661359787, + "rewards/margins": -0.009988496080040932, + "rewards/rejected": -0.06084546819329262, + "step": 44 + }, + { + "epoch": 0.05, + "learning_rate": 2.25e-05, + "logits/chosen": -2.076373815536499, + "logits/rejected": -2.1558804512023926, + "logps/chosen": -173.52224731445312, + "logps/rejected": -189.98086547851562, + "loss": 0.6965, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.061136387288570404, + "rewards/margins": -0.0023721233010292053, + "rewards/rejected": -0.0587642677128315, + "step": 45 + }, + { + "epoch": 0.05, + "learning_rate": 2.3000000000000003e-05, + "logits/chosen": -2.2398018836975098, + "logits/rejected": -2.1674294471740723, + "logps/chosen": -162.8035125732422, + "logps/rejected": -156.17276000976562, + "loss": 0.7007, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.06224403530359268, + "rewards/margins": -0.008054491132497787, + "rewards/rejected": -0.054189540445804596, + "step": 46 + }, + { + "epoch": 0.05, + "learning_rate": 2.35e-05, + "logits/chosen": -2.152360200881958, + "logits/rejected": -2.221158742904663, + "logps/chosen": -241.5851593017578, + "logps/rejected": -263.8956298828125, + "loss": 0.6805, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.04438929632306099, + "rewards/margins": 0.027727916836738586, + "rewards/rejected": -0.07211720943450928, + "step": 47 + }, + { + "epoch": 0.05, + "learning_rate": 2.4e-05, + "logits/chosen": -2.180600166320801, + "logits/rejected": -2.184905529022217, + "logps/chosen": -146.00100708007812, + "logps/rejected": -172.64227294921875, + "loss": 0.7003, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.04197859391570091, + "rewards/margins": -0.011492157354950905, + "rewards/rejected": -0.030486440286040306, + "step": 48 + }, + { + "epoch": 0.05, + "learning_rate": 2.45e-05, + "logits/chosen": -2.113537549972534, + "logits/rejected": -2.1188883781433105, + "logps/chosen": -145.5101318359375, + "logps/rejected": -145.72979736328125, + "loss": 0.6969, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09581132233142853, + "rewards/margins": -0.0029689306393265724, + "rewards/rejected": -0.09284238517284393, + "step": 49 + }, + { + "epoch": 0.05, + "learning_rate": 2.5e-05, + "logits/chosen": -2.2486584186553955, + "logits/rejected": -2.2213432788848877, + "logps/chosen": -165.0408172607422, + "logps/rejected": -172.3516845703125, + "loss": 0.672, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07465384155511856, + "rewards/margins": 0.04853079840540886, + "rewards/rejected": -0.12318463623523712, + "step": 50 + }, + { + "epoch": 0.05, + "learning_rate": 2.5500000000000003e-05, + "logits/chosen": -2.0694539546966553, + "logits/rejected": -2.0400142669677734, + "logps/chosen": -177.0946502685547, + "logps/rejected": -164.363037109375, + "loss": 0.734, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.11727481335401535, + "rewards/margins": -0.07744203507900238, + "rewards/rejected": -0.03983278200030327, + "step": 51 + }, + { + "epoch": 0.05, + "learning_rate": 2.6000000000000002e-05, + "logits/chosen": -2.2477283477783203, + "logits/rejected": -2.322450876235962, + "logps/chosen": -180.88040161132812, + "logps/rejected": -194.59298706054688, + "loss": 0.7052, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.05896530672907829, + "rewards/margins": -0.019788123667240143, + "rewards/rejected": -0.03917717933654785, + "step": 52 + }, + { + "epoch": 0.06, + "learning_rate": 2.6500000000000004e-05, + "logits/chosen": -2.28759765625, + "logits/rejected": -2.2965030670166016, + "logps/chosen": -166.57757568359375, + "logps/rejected": -168.3147430419922, + "loss": 0.7066, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.11757899075746536, + "rewards/margins": -0.023945949971675873, + "rewards/rejected": -0.09363303333520889, + "step": 53 + }, + { + "epoch": 0.06, + "learning_rate": 2.7000000000000002e-05, + "logits/chosen": -2.1356818675994873, + "logits/rejected": -2.077864170074463, + "logps/chosen": -163.74700927734375, + "logps/rejected": -186.17466735839844, + "loss": 0.6966, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.0876019299030304, + "rewards/margins": -0.0037088878452777863, + "rewards/rejected": -0.08389303088188171, + "step": 54 + }, + { + "epoch": 0.06, + "learning_rate": 2.7500000000000004e-05, + "logits/chosen": -2.3550398349761963, + "logits/rejected": -2.3566269874572754, + "logps/chosen": -201.12591552734375, + "logps/rejected": -196.99490356445312, + "loss": 0.6809, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.14813652634620667, + "rewards/margins": 0.03020734339952469, + "rewards/rejected": -0.17834386229515076, + "step": 55 + }, + { + "epoch": 0.06, + "learning_rate": 2.8000000000000003e-05, + "logits/chosen": -2.1153206825256348, + "logits/rejected": -2.172855854034424, + "logps/chosen": -176.15061950683594, + "logps/rejected": -198.8987579345703, + "loss": 0.7242, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.1643160581588745, + "rewards/margins": -0.05339653417468071, + "rewards/rejected": -0.1109195277094841, + "step": 56 + }, + { + "epoch": 0.06, + "learning_rate": 2.8499999999999998e-05, + "logits/chosen": -2.0671656131744385, + "logits/rejected": -2.146867036819458, + "logps/chosen": -190.34786987304688, + "logps/rejected": -229.0176544189453, + "loss": 0.6826, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.12184757739305496, + "rewards/margins": 0.025761937722563744, + "rewards/rejected": -0.14760951697826385, + "step": 57 + }, + { + "epoch": 0.06, + "learning_rate": 2.9e-05, + "logits/chosen": -2.1337296962738037, + "logits/rejected": -2.1995086669921875, + "logps/chosen": -152.1367950439453, + "logps/rejected": -201.97044372558594, + "loss": 0.6778, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1797308623790741, + "rewards/margins": 0.04084575176239014, + "rewards/rejected": -0.22057661414146423, + "step": 58 + }, + { + "epoch": 0.06, + "learning_rate": 2.95e-05, + "logits/chosen": -1.802043080329895, + "logits/rejected": -1.7019507884979248, + "logps/chosen": -136.04141235351562, + "logps/rejected": -147.78366088867188, + "loss": 0.6867, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.14073634147644043, + "rewards/margins": 0.02230207994580269, + "rewards/rejected": -0.163038432598114, + "step": 59 + }, + { + "epoch": 0.06, + "learning_rate": 3e-05, + "logits/chosen": -2.1223766803741455, + "logits/rejected": -2.229828357696533, + "logps/chosen": -163.0341033935547, + "logps/rejected": -186.51254272460938, + "loss": 0.6933, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.15651509165763855, + "rewards/margins": 0.003519295249134302, + "rewards/rejected": -0.16003437340259552, + "step": 60 + }, + { + "epoch": 0.06, + "learning_rate": 3.05e-05, + "logits/chosen": -2.0530059337615967, + "logits/rejected": -2.076582908630371, + "logps/chosen": -140.6927490234375, + "logps/rejected": -159.9715576171875, + "loss": 0.7026, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1471368670463562, + "rewards/margins": -0.013904569670557976, + "rewards/rejected": -0.13323231041431427, + "step": 61 + }, + { + "epoch": 0.06, + "learning_rate": 3.1e-05, + "logits/chosen": -2.2076821327209473, + "logits/rejected": -2.2303273677825928, + "logps/chosen": -162.5172882080078, + "logps/rejected": -172.63409423828125, + "loss": 0.7205, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.21913205087184906, + "rewards/margins": -0.046682070940732956, + "rewards/rejected": -0.1724499762058258, + "step": 62 + }, + { + "epoch": 0.07, + "learning_rate": 3.15e-05, + "logits/chosen": -2.1623778343200684, + "logits/rejected": -2.109609365463257, + "logps/chosen": -190.25259399414062, + "logps/rejected": -181.3391876220703, + "loss": 0.6934, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2230222225189209, + "rewards/margins": 0.014317656867206097, + "rewards/rejected": -0.23733988404273987, + "step": 63 + }, + { + "epoch": 0.07, + "learning_rate": 3.2000000000000005e-05, + "logits/chosen": -2.1411020755767822, + "logits/rejected": -2.087689161300659, + "logps/chosen": -149.68978881835938, + "logps/rejected": -144.92745971679688, + "loss": 0.68, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18927860260009766, + "rewards/margins": 0.03361104428768158, + "rewards/rejected": -0.22288964688777924, + "step": 64 + }, + { + "epoch": 0.07, + "learning_rate": 3.2500000000000004e-05, + "logits/chosen": -2.1835758686065674, + "logits/rejected": -2.1907880306243896, + "logps/chosen": -164.97738647460938, + "logps/rejected": -180.56930541992188, + "loss": 0.6891, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.20788520574569702, + "rewards/margins": 0.012470051646232605, + "rewards/rejected": -0.22035524249076843, + "step": 65 + }, + { + "epoch": 0.07, + "learning_rate": 3.3e-05, + "logits/chosen": -2.3262267112731934, + "logits/rejected": -2.2733724117279053, + "logps/chosen": -179.5419921875, + "logps/rejected": -180.0846710205078, + "loss": 0.7209, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.21241910755634308, + "rewards/margins": -0.04504784941673279, + "rewards/rejected": -0.16737127304077148, + "step": 66 + }, + { + "epoch": 0.07, + "learning_rate": 3.35e-05, + "logits/chosen": -2.183920383453369, + "logits/rejected": -2.155302047729492, + "logps/chosen": -165.67642211914062, + "logps/rejected": -179.09642028808594, + "loss": 0.6755, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18955731391906738, + "rewards/margins": 0.04666180908679962, + "rewards/rejected": -0.2362191379070282, + "step": 67 + }, + { + "epoch": 0.07, + "learning_rate": 3.4000000000000007e-05, + "logits/chosen": -2.099074602127075, + "logits/rejected": -2.144994020462036, + "logps/chosen": -146.17672729492188, + "logps/rejected": -151.28427124023438, + "loss": 0.737, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.26217174530029297, + "rewards/margins": -0.07887978851795197, + "rewards/rejected": -0.1832919716835022, + "step": 68 + }, + { + "epoch": 0.07, + "learning_rate": 3.45e-05, + "logits/chosen": -2.2945265769958496, + "logits/rejected": -2.2669451236724854, + "logps/chosen": -189.3528594970703, + "logps/rejected": -166.80990600585938, + "loss": 0.6892, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2909608483314514, + "rewards/margins": 0.024565648287534714, + "rewards/rejected": -0.31552648544311523, + "step": 69 + }, + { + "epoch": 0.07, + "learning_rate": 3.5e-05, + "logits/chosen": -2.0787875652313232, + "logits/rejected": -2.090841054916382, + "logps/chosen": -192.3681640625, + "logps/rejected": -224.5333251953125, + "loss": 0.7097, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.29522770643234253, + "rewards/margins": -0.02591385506093502, + "rewards/rejected": -0.26931384205818176, + "step": 70 + }, + { + "epoch": 0.07, + "learning_rate": 3.55e-05, + "logits/chosen": -2.1215460300445557, + "logits/rejected": -2.0683701038360596, + "logps/chosen": -144.15745544433594, + "logps/rejected": -141.385498046875, + "loss": 0.7234, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.34091001749038696, + "rewards/margins": -0.048917919397354126, + "rewards/rejected": -0.2919921278953552, + "step": 71 + }, + { + "epoch": 0.08, + "learning_rate": 3.6e-05, + "logits/chosen": -2.1100878715515137, + "logits/rejected": -2.112070083618164, + "logps/chosen": -154.8167724609375, + "logps/rejected": -158.28297424316406, + "loss": 0.6744, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2560737431049347, + "rewards/margins": 0.04427195340394974, + "rewards/rejected": -0.30034565925598145, + "step": 72 + }, + { + "epoch": 0.08, + "learning_rate": 3.65e-05, + "logits/chosen": -2.129936933517456, + "logits/rejected": -2.2630691528320312, + "logps/chosen": -158.00205993652344, + "logps/rejected": -176.6663055419922, + "loss": 0.6952, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.27465441823005676, + "rewards/margins": 0.013333894312381744, + "rewards/rejected": -0.2879883348941803, + "step": 73 + }, + { + "epoch": 0.08, + "learning_rate": 3.7e-05, + "logits/chosen": -2.2137179374694824, + "logits/rejected": -2.219494342803955, + "logps/chosen": -175.63868713378906, + "logps/rejected": -165.37460327148438, + "loss": 0.7134, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.30476734042167664, + "rewards/margins": -0.019104812294244766, + "rewards/rejected": -0.28566253185272217, + "step": 74 + }, + { + "epoch": 0.08, + "learning_rate": 3.7500000000000003e-05, + "logits/chosen": -2.021366596221924, + "logits/rejected": -1.953249216079712, + "logps/chosen": -148.9893341064453, + "logps/rejected": -147.3232421875, + "loss": 0.6865, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2916088402271271, + "rewards/margins": 0.026065416634082794, + "rewards/rejected": -0.31767427921295166, + "step": 75 + }, + { + "epoch": 0.08, + "learning_rate": 3.8e-05, + "logits/chosen": -2.0788702964782715, + "logits/rejected": -2.081282377243042, + "logps/chosen": -142.72647094726562, + "logps/rejected": -155.7174835205078, + "loss": 0.6944, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.33365076780319214, + "rewards/margins": 0.008806329220533371, + "rewards/rejected": -0.342457115650177, + "step": 76 + }, + { + "epoch": 0.08, + "learning_rate": 3.85e-05, + "logits/chosen": -2.141986608505249, + "logits/rejected": -2.1325931549072266, + "logps/chosen": -166.86537170410156, + "logps/rejected": -197.6178436279297, + "loss": 0.7052, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.34245288372039795, + "rewards/margins": 0.011411521583795547, + "rewards/rejected": -0.3538644015789032, + "step": 77 + }, + { + "epoch": 0.08, + "learning_rate": 3.9000000000000006e-05, + "logits/chosen": -2.215611457824707, + "logits/rejected": -2.212402582168579, + "logps/chosen": -161.9552459716797, + "logps/rejected": -160.0537109375, + "loss": 0.6606, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.28822606801986694, + "rewards/margins": 0.07802614569664001, + "rewards/rejected": -0.36625221371650696, + "step": 78 + }, + { + "epoch": 0.08, + "learning_rate": 3.9500000000000005e-05, + "logits/chosen": -2.1117148399353027, + "logits/rejected": -2.152738094329834, + "logps/chosen": -187.3416290283203, + "logps/rejected": -185.2106170654297, + "loss": 0.7461, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.38387393951416016, + "rewards/margins": -0.09052477777004242, + "rewards/rejected": -0.2933492064476013, + "step": 79 + }, + { + "epoch": 0.08, + "learning_rate": 4e-05, + "logits/chosen": -2.081353187561035, + "logits/rejected": -2.1454594135284424, + "logps/chosen": -148.2335968017578, + "logps/rejected": -169.42164611816406, + "loss": 0.7009, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.34989604353904724, + "rewards/margins": 0.00529644638299942, + "rewards/rejected": -0.35519248247146606, + "step": 80 + }, + { + "epoch": 0.08, + "learning_rate": 4.05e-05, + "logits/chosen": -2.1243531703948975, + "logits/rejected": -2.1677510738372803, + "logps/chosen": -164.04400634765625, + "logps/rejected": -185.00840759277344, + "loss": 0.6988, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.3187049627304077, + "rewards/margins": 0.011884737759828568, + "rewards/rejected": -0.3305897116661072, + "step": 81 + }, + { + "epoch": 0.09, + "learning_rate": 4.1e-05, + "logits/chosen": -2.0324885845184326, + "logits/rejected": -1.9073715209960938, + "logps/chosen": -171.0150604248047, + "logps/rejected": -178.9537811279297, + "loss": 0.6898, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.262698233127594, + "rewards/margins": 0.012961791828274727, + "rewards/rejected": -0.27566003799438477, + "step": 82 + }, + { + "epoch": 0.09, + "learning_rate": 4.15e-05, + "logits/chosen": -2.1523077487945557, + "logits/rejected": -2.1646199226379395, + "logps/chosen": -164.2784881591797, + "logps/rejected": -170.58087158203125, + "loss": 0.6565, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2823156714439392, + "rewards/margins": 0.08943505585193634, + "rewards/rejected": -0.37175074219703674, + "step": 83 + }, + { + "epoch": 0.09, + "learning_rate": 4.2e-05, + "logits/chosen": -2.132960081100464, + "logits/rejected": -2.170064926147461, + "logps/chosen": -217.71466064453125, + "logps/rejected": -224.87718200683594, + "loss": 0.6449, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.338445246219635, + "rewards/margins": 0.1073441132903099, + "rewards/rejected": -0.4457893371582031, + "step": 84 + }, + { + "epoch": 0.09, + "learning_rate": 4.25e-05, + "logits/chosen": -2.263223648071289, + "logits/rejected": -2.3083293437957764, + "logps/chosen": -156.03329467773438, + "logps/rejected": -174.6391143798828, + "loss": 0.6873, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.22993193566799164, + "rewards/margins": 0.018939830362796783, + "rewards/rejected": -0.24887175858020782, + "step": 85 + }, + { + "epoch": 0.09, + "learning_rate": 4.3e-05, + "logits/chosen": -2.265439510345459, + "logits/rejected": -2.2670176029205322, + "logps/chosen": -166.77760314941406, + "logps/rejected": -166.4705352783203, + "loss": 0.737, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3399352431297302, + "rewards/margins": -0.06634283810853958, + "rewards/rejected": -0.27359241247177124, + "step": 86 + }, + { + "epoch": 0.09, + "learning_rate": 4.35e-05, + "logits/chosen": -2.210916757583618, + "logits/rejected": -2.2296037673950195, + "logps/chosen": -142.21340942382812, + "logps/rejected": -161.62600708007812, + "loss": 0.6545, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.301052987575531, + "rewards/margins": 0.09594573080539703, + "rewards/rejected": -0.39699873328208923, + "step": 87 + }, + { + "epoch": 0.09, + "learning_rate": 4.4000000000000006e-05, + "logits/chosen": -2.220825433731079, + "logits/rejected": -2.1372034549713135, + "logps/chosen": -156.62326049804688, + "logps/rejected": -150.1629638671875, + "loss": 0.6818, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2567201852798462, + "rewards/margins": 0.031475357711315155, + "rewards/rejected": -0.28819552063941956, + "step": 88 + }, + { + "epoch": 0.09, + "learning_rate": 4.4500000000000004e-05, + "logits/chosen": -2.080606698989868, + "logits/rejected": -2.16312575340271, + "logps/chosen": -162.34506225585938, + "logps/rejected": -175.51246643066406, + "loss": 0.7598, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4455791115760803, + "rewards/margins": -0.09895279258489609, + "rewards/rejected": -0.34662631154060364, + "step": 89 + }, + { + "epoch": 0.09, + "learning_rate": 4.5e-05, + "logits/chosen": -2.22636342048645, + "logits/rejected": -2.32147479057312, + "logps/chosen": -152.72509765625, + "logps/rejected": -210.8067169189453, + "loss": 0.6536, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2957713007926941, + "rewards/margins": 0.11437603086233139, + "rewards/rejected": -0.4101472795009613, + "step": 90 + }, + { + "epoch": 0.09, + "learning_rate": 4.55e-05, + "logits/chosen": -2.264246702194214, + "logits/rejected": -2.322878122329712, + "logps/chosen": -116.30667114257812, + "logps/rejected": -136.15834045410156, + "loss": 0.6652, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.21190989017486572, + "rewards/margins": 0.09101668000221252, + "rewards/rejected": -0.30292657017707825, + "step": 91 + }, + { + "epoch": 0.1, + "learning_rate": 4.600000000000001e-05, + "logits/chosen": -2.2319533824920654, + "logits/rejected": -2.272087335586548, + "logps/chosen": -209.2500762939453, + "logps/rejected": -213.80284118652344, + "loss": 0.7243, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5155326128005981, + "rewards/margins": -0.025821635499596596, + "rewards/rejected": -0.4897109568119049, + "step": 92 + }, + { + "epoch": 0.1, + "learning_rate": 4.6500000000000005e-05, + "logits/chosen": -2.3007993698120117, + "logits/rejected": -2.233243227005005, + "logps/chosen": -163.1545867919922, + "logps/rejected": -162.89089965820312, + "loss": 0.7095, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.4529023766517639, + "rewards/margins": -0.020960787311196327, + "rewards/rejected": -0.43194156885147095, + "step": 93 + }, + { + "epoch": 0.1, + "learning_rate": 4.7e-05, + "logits/chosen": -2.0963847637176514, + "logits/rejected": -2.120976209640503, + "logps/chosen": -180.0958709716797, + "logps/rejected": -205.76585388183594, + "loss": 0.6049, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.30844229459762573, + "rewards/margins": 0.21104586124420166, + "rewards/rejected": -0.5194881558418274, + "step": 94 + }, + { + "epoch": 0.1, + "learning_rate": 4.75e-05, + "logits/chosen": -2.2864556312561035, + "logits/rejected": -2.241337299346924, + "logps/chosen": -202.34237670898438, + "logps/rejected": -200.288330078125, + "loss": 0.6701, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.35101816058158875, + "rewards/margins": 0.08223339170217514, + "rewards/rejected": -0.4332515299320221, + "step": 95 + }, + { + "epoch": 0.1, + "learning_rate": 4.8e-05, + "logits/chosen": -2.1984472274780273, + "logits/rejected": -2.230916976928711, + "logps/chosen": -176.45132446289062, + "logps/rejected": -189.4639892578125, + "loss": 0.7, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3947378993034363, + "rewards/margins": 0.01168506033718586, + "rewards/rejected": -0.4064229726791382, + "step": 96 + }, + { + "epoch": 0.1, + "learning_rate": 4.85e-05, + "logits/chosen": -2.1936614513397217, + "logits/rejected": -2.178769588470459, + "logps/chosen": -172.03759765625, + "logps/rejected": -184.67947387695312, + "loss": 0.7077, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5262473821640015, + "rewards/margins": -0.006830459460616112, + "rewards/rejected": -0.5194169282913208, + "step": 97 + }, + { + "epoch": 0.1, + "learning_rate": 4.9e-05, + "logits/chosen": -2.1688649654388428, + "logits/rejected": -2.1433870792388916, + "logps/chosen": -158.25515747070312, + "logps/rejected": -165.4871368408203, + "loss": 0.7229, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3400518298149109, + "rewards/margins": -0.027561640366911888, + "rewards/rejected": -0.31249016523361206, + "step": 98 + }, + { + "epoch": 0.1, + "learning_rate": 4.9500000000000004e-05, + "logits/chosen": -2.25595760345459, + "logits/rejected": -2.2448055744171143, + "logps/chosen": -175.17568969726562, + "logps/rejected": -179.33013916015625, + "loss": 0.7008, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4599221646785736, + "rewards/margins": 0.01116972602903843, + "rewards/rejected": -0.4710919260978699, + "step": 99 + }, + { + "epoch": 0.1, + "learning_rate": 5e-05, + "logits/chosen": -2.0946810245513916, + "logits/rejected": -2.0803956985473633, + "logps/chosen": -118.56491088867188, + "logps/rejected": -113.53369140625, + "loss": 0.6508, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2681715190410614, + "rewards/margins": 0.11040147393941879, + "rewards/rejected": -0.3785730004310608, + "step": 100 + }, + { + "epoch": 0.11, + "learning_rate": 4.9999832415172185e-05, + "logits/chosen": -2.1622209548950195, + "logits/rejected": -2.251312732696533, + "logps/chosen": -148.2783966064453, + "logps/rejected": -232.58053588867188, + "loss": 0.6485, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.36020269989967346, + "rewards/margins": 0.13017883896827698, + "rewards/rejected": -0.49038150906562805, + "step": 101 + }, + { + "epoch": 0.11, + "learning_rate": 4.9999329662935534e-05, + "logits/chosen": -2.1302261352539062, + "logits/rejected": -2.146063804626465, + "logps/chosen": -182.8227081298828, + "logps/rejected": -189.34913635253906, + "loss": 0.6736, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6098363995552063, + "rewards/margins": 0.08228112757205963, + "rewards/rejected": -0.6921175122261047, + "step": 102 + }, + { + "epoch": 0.11, + "learning_rate": 4.9998491750030315e-05, + "logits/chosen": -2.0695760250091553, + "logits/rejected": -2.151846170425415, + "logps/chosen": -157.34878540039062, + "logps/rejected": -173.14401245117188, + "loss": 0.7008, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5831299424171448, + "rewards/margins": 0.029685884714126587, + "rewards/rejected": -0.612815797328949, + "step": 103 + }, + { + "epoch": 0.11, + "learning_rate": 4.999731868769027e-05, + "logits/chosen": -2.1843104362487793, + "logits/rejected": -2.107654094696045, + "logps/chosen": -161.08914184570312, + "logps/rejected": -142.64749145507812, + "loss": 0.7995, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.6787440776824951, + "rewards/margins": -0.16208001971244812, + "rewards/rejected": -0.5166640877723694, + "step": 104 + }, + { + "epoch": 0.11, + "learning_rate": 4.999581049164237e-05, + "logits/chosen": -2.0645735263824463, + "logits/rejected": -2.147791862487793, + "logps/chosen": -149.2325897216797, + "logps/rejected": -173.02069091796875, + "loss": 0.623, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3364582061767578, + "rewards/margins": 0.1654883325099945, + "rewards/rejected": -0.5019465684890747, + "step": 105 + }, + { + "epoch": 0.11, + "learning_rate": 4.99939671821067e-05, + "logits/chosen": -2.238675832748413, + "logits/rejected": -2.308804988861084, + "logps/chosen": -187.11219787597656, + "logps/rejected": -192.90240478515625, + "loss": 0.6952, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.4883981943130493, + "rewards/margins": 0.011134681291878223, + "rewards/rejected": -0.49953290820121765, + "step": 106 + }, + { + "epoch": 0.11, + "learning_rate": 4.999178878379611e-05, + "logits/chosen": -2.1448211669921875, + "logits/rejected": -2.163353681564331, + "logps/chosen": -151.56005859375, + "logps/rejected": -149.57290649414062, + "loss": 0.7033, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5375577807426453, + "rewards/margins": 0.005782928317785263, + "rewards/rejected": -0.5433407425880432, + "step": 107 + }, + { + "epoch": 0.11, + "learning_rate": 4.998927532591592e-05, + "logits/chosen": -2.1543734073638916, + "logits/rejected": -2.1483535766601562, + "logps/chosen": -168.912841796875, + "logps/rejected": -163.83612060546875, + "loss": 0.7009, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.40299952030181885, + "rewards/margins": 0.021744927391409874, + "rewards/rejected": -0.42474448680877686, + "step": 108 + }, + { + "epoch": 0.11, + "learning_rate": 4.9986426842163515e-05, + "logits/chosen": -2.196964979171753, + "logits/rejected": -2.1282408237457275, + "logps/chosen": -148.19366455078125, + "logps/rejected": -141.83212280273438, + "loss": 0.622, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.36278650164604187, + "rewards/margins": 0.18898257613182068, + "rewards/rejected": -0.5517690777778625, + "step": 109 + }, + { + "epoch": 0.11, + "learning_rate": 4.9983243370727914e-05, + "logits/chosen": -2.136972665786743, + "logits/rejected": -2.154928684234619, + "logps/chosen": -146.0774383544922, + "logps/rejected": -132.6929473876953, + "loss": 0.6987, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5064787268638611, + "rewards/margins": 0.05323922634124756, + "rewards/rejected": -0.5597178936004639, + "step": 110 + }, + { + "epoch": 0.12, + "learning_rate": 4.9979724954289244e-05, + "logits/chosen": -2.1273446083068848, + "logits/rejected": -2.1698343753814697, + "logps/chosen": -140.70408630371094, + "logps/rejected": -164.12890625, + "loss": 0.5856, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.27825814485549927, + "rewards/margins": 0.30408480763435364, + "rewards/rejected": -0.5823429822921753, + "step": 111 + }, + { + "epoch": 0.12, + "learning_rate": 4.9975871640018154e-05, + "logits/chosen": -2.156425952911377, + "logits/rejected": -2.149127244949341, + "logps/chosen": -209.48033142089844, + "logps/rejected": -183.49618530273438, + "loss": 0.6814, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.3876263499259949, + "rewards/margins": 0.07094159722328186, + "rewards/rejected": -0.45856791734695435, + "step": 112 + }, + { + "epoch": 0.12, + "learning_rate": 4.99716834795752e-05, + "logits/chosen": -2.195077896118164, + "logits/rejected": -2.171152353286743, + "logps/chosen": -137.59030151367188, + "logps/rejected": -144.78488159179688, + "loss": 0.6117, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.201141357421875, + "rewards/margins": 0.24156084656715393, + "rewards/rejected": -0.4427022337913513, + "step": 113 + }, + { + "epoch": 0.12, + "learning_rate": 4.996716052911017e-05, + "logits/chosen": -2.0847880840301514, + "logits/rejected": -2.141540050506592, + "logps/chosen": -197.46426391601562, + "logps/rejected": -199.35997009277344, + "loss": 0.6728, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4001588821411133, + "rewards/margins": 0.0924140065908432, + "rewards/rejected": -0.4925729036331177, + "step": 114 + }, + { + "epoch": 0.12, + "learning_rate": 4.996230284926128e-05, + "logits/chosen": -1.791740894317627, + "logits/rejected": -1.7912871837615967, + "logps/chosen": -181.44020080566406, + "logps/rejected": -201.87872314453125, + "loss": 0.6437, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.40906965732574463, + "rewards/margins": 0.15568459033966064, + "rewards/rejected": -0.5647542476654053, + "step": 115 + }, + { + "epoch": 0.12, + "learning_rate": 4.99571105051544e-05, + "logits/chosen": -2.1246492862701416, + "logits/rejected": -2.122555732727051, + "logps/chosen": -175.01571655273438, + "logps/rejected": -176.2000732421875, + "loss": 0.7431, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.36241090297698975, + "rewards/margins": -0.04763410612940788, + "rewards/rejected": -0.314776748418808, + "step": 116 + }, + { + "epoch": 0.12, + "learning_rate": 4.99515835664022e-05, + "logits/chosen": -2.066060781478882, + "logits/rejected": -2.0761799812316895, + "logps/chosen": -135.93301391601562, + "logps/rejected": -162.29739379882812, + "loss": 0.8265, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.39111948013305664, + "rewards/margins": -0.14791490137577057, + "rewards/rejected": -0.2432045191526413, + "step": 117 + }, + { + "epoch": 0.12, + "learning_rate": 4.994572210710315e-05, + "logits/chosen": -2.0879368782043457, + "logits/rejected": -2.1556289196014404, + "logps/chosen": -197.7449951171875, + "logps/rejected": -203.45278930664062, + "loss": 0.6753, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2648780345916748, + "rewards/margins": 0.068690724670887, + "rewards/rejected": -0.3335687518119812, + "step": 118 + }, + { + "epoch": 0.12, + "learning_rate": 4.993952620584058e-05, + "logits/chosen": -2.2932987213134766, + "logits/rejected": -2.330873489379883, + "logps/chosen": -139.2769775390625, + "logps/rejected": -152.76504516601562, + "loss": 0.6608, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.23606640100479126, + "rewards/margins": 0.08653931319713593, + "rewards/rejected": -0.3226057291030884, + "step": 119 + }, + { + "epoch": 0.13, + "learning_rate": 4.993299594568163e-05, + "logits/chosen": -2.1717681884765625, + "logits/rejected": -2.176131010055542, + "logps/chosen": -171.66172790527344, + "logps/rejected": -188.26991271972656, + "loss": 0.6917, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.43707746267318726, + "rewards/margins": 0.05936805531382561, + "rewards/rejected": -0.49644553661346436, + "step": 120 + }, + { + "epoch": 0.13, + "learning_rate": 4.992613141417608e-05, + "logits/chosen": -2.1105880737304688, + "logits/rejected": -2.0103209018707275, + "logps/chosen": -155.08033752441406, + "logps/rejected": -141.55419921875, + "loss": 0.5905, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2041604220867157, + "rewards/margins": 0.32256096601486206, + "rewards/rejected": -0.5267213582992554, + "step": 121 + }, + { + "epoch": 0.13, + "learning_rate": 4.9918932703355256e-05, + "logits/chosen": -2.1467816829681396, + "logits/rejected": -2.115278720855713, + "logps/chosen": -152.28237915039062, + "logps/rejected": -133.01211547851562, + "loss": 0.7446, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.44335490465164185, + "rewards/margins": -0.022354908287525177, + "rewards/rejected": -0.42100000381469727, + "step": 122 + }, + { + "epoch": 0.13, + "learning_rate": 4.9911399909730714e-05, + "logits/chosen": -2.318913221359253, + "logits/rejected": -2.275251865386963, + "logps/chosen": -160.89625549316406, + "logps/rejected": -159.76943969726562, + "loss": 0.6249, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.21618026494979858, + "rewards/margins": 0.20579932630062103, + "rewards/rejected": -0.42197954654693604, + "step": 123 + }, + { + "epoch": 0.13, + "learning_rate": 4.990353313429303e-05, + "logits/chosen": -2.1120331287384033, + "logits/rejected": -2.0410468578338623, + "logps/chosen": -183.7588653564453, + "logps/rejected": -222.8588104248047, + "loss": 0.684, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.4172906279563904, + "rewards/margins": 0.05181103199720383, + "rewards/rejected": -0.4691016674041748, + "step": 124 + }, + { + "epoch": 0.13, + "learning_rate": 4.989533248251037e-05, + "logits/chosen": -2.0413904190063477, + "logits/rejected": -2.050387382507324, + "logps/chosen": -221.38436889648438, + "logps/rejected": -220.9242401123047, + "loss": 0.5889, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.18506890535354614, + "rewards/margins": 0.2643589675426483, + "rewards/rejected": -0.44942787289619446, + "step": 125 + }, + { + "epoch": 0.13, + "learning_rate": 4.988679806432712e-05, + "logits/chosen": -2.119227170944214, + "logits/rejected": -2.140362024307251, + "logps/chosen": -164.545166015625, + "logps/rejected": -171.43540954589844, + "loss": 0.727, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.41472572088241577, + "rewards/margins": -0.024414831772446632, + "rewards/rejected": -0.3903109133243561, + "step": 126 + }, + { + "epoch": 0.13, + "learning_rate": 4.98779299941624e-05, + "logits/chosen": -2.008235454559326, + "logits/rejected": -2.0146234035491943, + "logps/chosen": -182.02488708496094, + "logps/rejected": -173.46522521972656, + "loss": 0.7665, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.39273571968078613, + "rewards/margins": -0.07521196454763412, + "rewards/rejected": -0.3175237476825714, + "step": 127 + }, + { + "epoch": 0.13, + "learning_rate": 4.9868728390908526e-05, + "logits/chosen": -2.194214105606079, + "logits/rejected": -2.183523654937744, + "logps/chosen": -144.79937744140625, + "logps/rejected": -145.96029663085938, + "loss": 0.71, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.15473532676696777, + "rewards/margins": 0.0015310226008296013, + "rewards/rejected": -0.15626637637615204, + "step": 128 + }, + { + "epoch": 0.13, + "learning_rate": 4.985919337792944e-05, + "logits/chosen": -2.0170881748199463, + "logits/rejected": -2.021702527999878, + "logps/chosen": -157.9308624267578, + "logps/rejected": -187.35256958007812, + "loss": 0.682, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.23143672943115234, + "rewards/margins": 0.07466793060302734, + "rewards/rejected": -0.3061046600341797, + "step": 129 + }, + { + "epoch": 0.14, + "learning_rate": 4.9849325083059e-05, + "logits/chosen": -2.345395088195801, + "logits/rejected": -2.294443130493164, + "logps/chosen": -202.0467987060547, + "logps/rejected": -186.38247680664062, + "loss": 0.692, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.19066600501537323, + "rewards/margins": 0.07181324809789658, + "rewards/rejected": -0.2624792754650116, + "step": 130 + }, + { + "epoch": 0.14, + "learning_rate": 4.983912363859935e-05, + "logits/chosen": -2.14298152923584, + "logits/rejected": -2.1677587032318115, + "logps/chosen": -125.33621215820312, + "logps/rejected": -137.17404174804688, + "loss": 0.6617, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.23150649666786194, + "rewards/margins": 0.10384988784790039, + "rewards/rejected": -0.33535638451576233, + "step": 131 + }, + { + "epoch": 0.14, + "learning_rate": 4.982858918131906e-05, + "logits/chosen": -2.190133810043335, + "logits/rejected": -2.247300863265991, + "logps/chosen": -167.1696014404297, + "logps/rejected": -168.60281372070312, + "loss": 0.7058, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.43472814559936523, + "rewards/margins": 0.03844255581498146, + "rewards/rejected": -0.473170667886734, + "step": 132 + }, + { + "epoch": 0.14, + "learning_rate": 4.981772185245135e-05, + "logits/chosen": -2.154466152191162, + "logits/rejected": -2.1573760509490967, + "logps/chosen": -168.56600952148438, + "logps/rejected": -188.00607299804688, + "loss": 0.6659, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.3921719491481781, + "rewards/margins": 0.10123846679925919, + "rewards/rejected": -0.4934104084968567, + "step": 133 + }, + { + "epoch": 0.14, + "learning_rate": 4.980652179769218e-05, + "logits/chosen": -2.139244318008423, + "logits/rejected": -2.170041561126709, + "logps/chosen": -194.27749633789062, + "logps/rejected": -194.87745666503906, + "loss": 0.6939, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.19869127869606018, + "rewards/margins": 0.15997806191444397, + "rewards/rejected": -0.3586694002151489, + "step": 134 + }, + { + "epoch": 0.14, + "learning_rate": 4.979498916719828e-05, + "logits/chosen": -2.0148520469665527, + "logits/rejected": -2.0170676708221436, + "logps/chosen": -178.43853759765625, + "logps/rejected": -177.73255920410156, + "loss": 0.6163, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15721122920513153, + "rewards/margins": 0.2595069408416748, + "rewards/rejected": -0.41671818494796753, + "step": 135 + }, + { + "epoch": 0.14, + "learning_rate": 4.978312411558518e-05, + "logits/chosen": -2.2386107444763184, + "logits/rejected": -2.2416539192199707, + "logps/chosen": -153.1583251953125, + "logps/rejected": -165.6448516845703, + "loss": 0.679, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.48863306641578674, + "rewards/margins": 0.107185497879982, + "rewards/rejected": -0.5958185195922852, + "step": 136 + }, + { + "epoch": 0.14, + "learning_rate": 4.977092680192507e-05, + "logits/chosen": -1.9784085750579834, + "logits/rejected": -1.9997234344482422, + "logps/chosen": -148.26358032226562, + "logps/rejected": -131.2608642578125, + "loss": 0.7235, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.3870002031326294, + "rewards/margins": -0.03670747950673103, + "rewards/rejected": -0.35029271245002747, + "step": 137 + }, + { + "epoch": 0.14, + "learning_rate": 4.9758397389744734e-05, + "logits/chosen": -2.2231109142303467, + "logits/rejected": -2.1324055194854736, + "logps/chosen": -175.7447052001953, + "logps/rejected": -165.63980102539062, + "loss": 0.6117, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.3166625499725342, + "rewards/margins": 0.23041030764579773, + "rewards/rejected": -0.5470728278160095, + "step": 138 + }, + { + "epoch": 0.14, + "learning_rate": 4.9745536047023324e-05, + "logits/chosen": -2.1007936000823975, + "logits/rejected": -2.2264654636383057, + "logps/chosen": -205.53456115722656, + "logps/rejected": -180.32980346679688, + "loss": 0.6427, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.45254045724868774, + "rewards/margins": 0.15192103385925293, + "rewards/rejected": -0.6044614911079407, + "step": 139 + }, + { + "epoch": 0.15, + "learning_rate": 4.973234294619011e-05, + "logits/chosen": -1.936387062072754, + "logits/rejected": -2.0128026008605957, + "logps/chosen": -145.3013916015625, + "logps/rejected": -161.27244567871094, + "loss": 0.6728, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3800917863845825, + "rewards/margins": 0.11550942063331604, + "rewards/rejected": -0.49560117721557617, + "step": 140 + }, + { + "epoch": 0.15, + "learning_rate": 4.971881826412218e-05, + "logits/chosen": -2.154330253601074, + "logits/rejected": -2.237229585647583, + "logps/chosen": -156.77369689941406, + "logps/rejected": -176.82176208496094, + "loss": 0.6794, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.36407431960105896, + "rewards/margins": 0.1079927533864975, + "rewards/rejected": -0.47206708788871765, + "step": 141 + }, + { + "epoch": 0.15, + "learning_rate": 4.9704962182142044e-05, + "logits/chosen": -2.1118252277374268, + "logits/rejected": -2.134640693664551, + "logps/chosen": -166.844482421875, + "logps/rejected": -167.47900390625, + "loss": 0.6575, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.4557605981826782, + "rewards/margins": 0.10824623703956604, + "rewards/rejected": -0.5640068054199219, + "step": 142 + }, + { + "epoch": 0.15, + "learning_rate": 4.9690774886015244e-05, + "logits/chosen": -2.1018967628479004, + "logits/rejected": -2.1379857063293457, + "logps/chosen": -176.53829956054688, + "logps/rejected": -194.2880859375, + "loss": 0.6878, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.4790266156196594, + "rewards/margins": 0.10418644547462463, + "rewards/rejected": -0.5832130312919617, + "step": 143 + }, + { + "epoch": 0.15, + "learning_rate": 4.967625656594782e-05, + "logits/chosen": -2.0278987884521484, + "logits/rejected": -2.0434699058532715, + "logps/chosen": -145.874267578125, + "logps/rejected": -151.4019317626953, + "loss": 0.7109, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.38604238629341125, + "rewards/margins": 0.0024005472660064697, + "rewards/rejected": -0.3884429931640625, + "step": 144 + }, + { + "epoch": 0.15, + "learning_rate": 4.966140741658379e-05, + "logits/chosen": -2.128117084503174, + "logits/rejected": -2.169542074203491, + "logps/chosen": -166.9932098388672, + "logps/rejected": -166.7843475341797, + "loss": 0.699, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.42920929193496704, + "rewards/margins": 0.09982432425022125, + "rewards/rejected": -0.5290336608886719, + "step": 145 + }, + { + "epoch": 0.15, + "learning_rate": 4.9646227637002515e-05, + "logits/chosen": -2.2982516288757324, + "logits/rejected": -2.3041136264801025, + "logps/chosen": -174.27261352539062, + "logps/rejected": -181.32333374023438, + "loss": 0.7344, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5106673836708069, + "rewards/margins": 0.0017841942608356476, + "rewards/rejected": -0.5124515891075134, + "step": 146 + }, + { + "epoch": 0.15, + "learning_rate": 4.963071743071607e-05, + "logits/chosen": -2.1971347332000732, + "logits/rejected": -2.252727508544922, + "logps/chosen": -164.0286102294922, + "logps/rejected": -174.8863525390625, + "loss": 0.7943, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5613827705383301, + "rewards/margins": -0.05153023824095726, + "rewards/rejected": -0.5098525285720825, + "step": 147 + }, + { + "epoch": 0.15, + "learning_rate": 4.961487700566646e-05, + "logits/chosen": -2.056870222091675, + "logits/rejected": -2.05178165435791, + "logps/chosen": -143.26681518554688, + "logps/rejected": -183.6707000732422, + "loss": 0.744, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.3036291301250458, + "rewards/margins": 0.006056658923625946, + "rewards/rejected": -0.3096857964992523, + "step": 148 + }, + { + "epoch": 0.16, + "learning_rate": 4.9598706574222886e-05, + "logits/chosen": -2.159867286682129, + "logits/rejected": -2.2232213020324707, + "logps/chosen": -161.9599609375, + "logps/rejected": -184.70950317382812, + "loss": 0.7647, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.32318681478500366, + "rewards/margins": -0.07305724918842316, + "rewards/rejected": -0.2501295506954193, + "step": 149 + }, + { + "epoch": 0.16, + "learning_rate": 4.958220635317886e-05, + "logits/chosen": -2.0702919960021973, + "logits/rejected": -2.1789698600769043, + "logps/chosen": -159.52511596679688, + "logps/rejected": -187.4576416015625, + "loss": 0.6574, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.264212042093277, + "rewards/margins": 0.14712117612361908, + "rewards/rejected": -0.41133320331573486, + "step": 150 + }, + { + "epoch": 0.16, + "learning_rate": 4.956537656374933e-05, + "logits/chosen": -2.1290884017944336, + "logits/rejected": -2.1409378051757812, + "logps/chosen": -156.96994018554688, + "logps/rejected": -166.66525268554688, + "loss": 0.7229, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.4065425992012024, + "rewards/margins": 0.03063153848052025, + "rewards/rejected": -0.43717408180236816, + "step": 151 + }, + { + "epoch": 0.16, + "learning_rate": 4.9548217431567665e-05, + "logits/chosen": -2.1941640377044678, + "logits/rejected": -2.2345049381256104, + "logps/chosen": -141.96652221679688, + "logps/rejected": -145.7759552001953, + "loss": 0.6756, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.2183290421962738, + "rewards/margins": 0.11311781406402588, + "rewards/rejected": -0.3314468264579773, + "step": 152 + }, + { + "epoch": 0.16, + "learning_rate": 4.95307291866827e-05, + "logits/chosen": -2.1858081817626953, + "logits/rejected": -2.1450536251068115, + "logps/chosen": -155.87149047851562, + "logps/rejected": -157.30197143554688, + "loss": 0.6938, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.34080612659454346, + "rewards/margins": 0.06700116395950317, + "rewards/rejected": -0.40780726075172424, + "step": 153 + }, + { + "epoch": 0.16, + "learning_rate": 4.95129120635556e-05, + "logits/chosen": -2.240177869796753, + "logits/rejected": -2.171326160430908, + "logps/chosen": -165.3470458984375, + "logps/rejected": -152.51541137695312, + "loss": 0.7462, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.3122984766960144, + "rewards/margins": -0.0709066390991211, + "rewards/rejected": -0.24139180779457092, + "step": 154 + }, + { + "epoch": 0.16, + "learning_rate": 4.949476630105669e-05, + "logits/chosen": -2.2033369541168213, + "logits/rejected": -2.219047784805298, + "logps/chosen": -201.08102416992188, + "logps/rejected": -198.62356567382812, + "loss": 0.6226, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3462004065513611, + "rewards/margins": 0.1935741901397705, + "rewards/rejected": -0.5397745966911316, + "step": 155 + }, + { + "epoch": 0.16, + "learning_rate": 4.9476292142462374e-05, + "logits/chosen": -2.0294063091278076, + "logits/rejected": -2.0089852809906006, + "logps/chosen": -141.88330078125, + "logps/rejected": -145.25686645507812, + "loss": 0.6791, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2922942638397217, + "rewards/margins": 0.08209052681922913, + "rewards/rejected": -0.3743847906589508, + "step": 156 + }, + { + "epoch": 0.16, + "learning_rate": 4.945748983545172e-05, + "logits/chosen": -2.1608543395996094, + "logits/rejected": -2.099316358566284, + "logps/chosen": -140.04598999023438, + "logps/rejected": -135.99771118164062, + "loss": 0.5853, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.14180442690849304, + "rewards/margins": 0.3191547393798828, + "rewards/rejected": -0.46095913648605347, + "step": 157 + }, + { + "epoch": 0.16, + "learning_rate": 4.943835963210324e-05, + "logits/chosen": -2.23140811920166, + "logits/rejected": -2.160383701324463, + "logps/chosen": -176.86737060546875, + "logps/rejected": -173.28509521484375, + "loss": 0.6432, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.16037102043628693, + "rewards/margins": 0.1384631097316742, + "rewards/rejected": -0.2988341450691223, + "step": 158 + }, + { + "epoch": 0.17, + "learning_rate": 4.941890178889149e-05, + "logits/chosen": -2.226886034011841, + "logits/rejected": -2.2156994342803955, + "logps/chosen": -151.04501342773438, + "logps/rejected": -152.1208953857422, + "loss": 0.7934, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.4307902157306671, + "rewards/margins": -0.15255290269851685, + "rewards/rejected": -0.2782372236251831, + "step": 159 + }, + { + "epoch": 0.17, + "learning_rate": 4.939911656668361e-05, + "logits/chosen": -2.1915454864501953, + "logits/rejected": -2.1480343341827393, + "logps/chosen": -127.00537109375, + "logps/rejected": -118.40946960449219, + "loss": 0.6693, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.05850663036108017, + "rewards/margins": 0.07123789936304092, + "rewards/rejected": -0.1297445297241211, + "step": 160 + }, + { + "epoch": 0.17, + "learning_rate": 4.937900423073585e-05, + "logits/chosen": -2.2268741130828857, + "logits/rejected": -2.223992347717285, + "logps/chosen": -180.8136749267578, + "logps/rejected": -194.1834259033203, + "loss": 0.7348, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.2488463819026947, + "rewards/margins": 0.01856984756886959, + "rewards/rejected": -0.26741623878479004, + "step": 161 + }, + { + "epoch": 0.17, + "learning_rate": 4.9358565050689985e-05, + "logits/chosen": -2.163513660430908, + "logits/rejected": -2.1871607303619385, + "logps/chosen": -196.00209045410156, + "logps/rejected": -197.71170043945312, + "loss": 0.6894, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3192088305950165, + "rewards/margins": 0.049343544989824295, + "rewards/rejected": -0.36855238676071167, + "step": 162 + }, + { + "epoch": 0.17, + "learning_rate": 4.933779930056975e-05, + "logits/chosen": -2.1793665885925293, + "logits/rejected": -2.140435218811035, + "logps/chosen": -141.68283081054688, + "logps/rejected": -157.8527069091797, + "loss": 0.8341, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.5028910636901855, + "rewards/margins": -0.241933673620224, + "rewards/rejected": -0.26095739006996155, + "step": 163 + }, + { + "epoch": 0.17, + "learning_rate": 4.93167072587771e-05, + "logits/chosen": -2.1162264347076416, + "logits/rejected": -2.159403085708618, + "logps/chosen": -167.28477478027344, + "logps/rejected": -159.3861083984375, + "loss": 0.6934, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.32176950573921204, + "rewards/margins": 0.0923306867480278, + "rewards/rejected": -0.41410019993782043, + "step": 164 + }, + { + "epoch": 0.17, + "learning_rate": 4.929528920808854e-05, + "logits/chosen": -2.1691784858703613, + "logits/rejected": -2.151355266571045, + "logps/chosen": -210.38043212890625, + "logps/rejected": -182.91961669921875, + "loss": 0.5909, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.3064863383769989, + "rewards/margins": 0.28497347235679626, + "rewards/rejected": -0.5914597511291504, + "step": 165 + }, + { + "epoch": 0.17, + "learning_rate": 4.92735454356513e-05, + "logits/chosen": -2.105543613433838, + "logits/rejected": -2.123973846435547, + "logps/chosen": -136.9002685546875, + "logps/rejected": -134.88809204101562, + "loss": 0.7425, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.40990638732910156, + "rewards/margins": -0.044356442987918854, + "rewards/rejected": -0.3655499517917633, + "step": 166 + }, + { + "epoch": 0.17, + "learning_rate": 4.925147623297949e-05, + "logits/chosen": -2.324575901031494, + "logits/rejected": -2.318359613418579, + "logps/chosen": -196.5352020263672, + "logps/rejected": -168.95431518554688, + "loss": 0.8018, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.35998621582984924, + "rewards/margins": -0.16241417825222015, + "rewards/rejected": -0.1975720375776291, + "step": 167 + }, + { + "epoch": 0.18, + "learning_rate": 4.922908189595018e-05, + "logits/chosen": -2.0524559020996094, + "logits/rejected": -2.0096793174743652, + "logps/chosen": -154.45022583007812, + "logps/rejected": -153.45050048828125, + "loss": 0.6433, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21955004334449768, + "rewards/margins": 0.17729638516902924, + "rewards/rejected": -0.3968464732170105, + "step": 168 + }, + { + "epoch": 0.18, + "learning_rate": 4.920636272479946e-05, + "logits/chosen": -2.305999755859375, + "logits/rejected": -2.299030065536499, + "logps/chosen": -164.8284149169922, + "logps/rejected": -151.80828857421875, + "loss": 0.6648, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.31743210554122925, + "rewards/margins": 0.11833076179027557, + "rewards/rejected": -0.43576285243034363, + "step": 169 + }, + { + "epoch": 0.18, + "learning_rate": 4.9183319024118415e-05, + "logits/chosen": -2.1119515895843506, + "logits/rejected": -2.1947803497314453, + "logps/chosen": -135.42007446289062, + "logps/rejected": -142.30645751953125, + "loss": 0.7242, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.29650259017944336, + "rewards/margins": -0.007799305021762848, + "rewards/rejected": -0.2887033224105835, + "step": 170 + }, + { + "epoch": 0.18, + "learning_rate": 4.915995110284901e-05, + "logits/chosen": -2.1119866371154785, + "logits/rejected": -2.133423328399658, + "logps/chosen": -180.34866333007812, + "logps/rejected": -194.75259399414062, + "loss": 0.712, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.3628592789173126, + "rewards/margins": 0.04306137561798096, + "rewards/rejected": -0.4059206247329712, + "step": 171 + }, + { + "epoch": 0.18, + "learning_rate": 4.9136259274279955e-05, + "logits/chosen": -2.2553658485412598, + "logits/rejected": -2.261441707611084, + "logps/chosen": -146.23861694335938, + "logps/rejected": -143.55841064453125, + "loss": 0.7566, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.4861433506011963, + "rewards/margins": -0.029232196509838104, + "rewards/rejected": -0.4569111168384552, + "step": 172 + }, + { + "epoch": 0.18, + "learning_rate": 4.911224385604255e-05, + "logits/chosen": -2.3380446434020996, + "logits/rejected": -2.278407335281372, + "logps/chosen": -160.54949951171875, + "logps/rejected": -151.08018493652344, + "loss": 0.8986, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.5705781579017639, + "rewards/margins": -0.310020387172699, + "rewards/rejected": -0.26055777072906494, + "step": 173 + }, + { + "epoch": 0.18, + "learning_rate": 4.908790517010636e-05, + "logits/chosen": -2.298267364501953, + "logits/rejected": -2.317840814590454, + "logps/chosen": -148.01202392578125, + "logps/rejected": -157.5655517578125, + "loss": 0.5926, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.20174311101436615, + "rewards/margins": 0.3064672350883484, + "rewards/rejected": -0.5082104206085205, + "step": 174 + }, + { + "epoch": 0.18, + "learning_rate": 4.906324354277495e-05, + "logits/chosen": -2.31154727935791, + "logits/rejected": -2.3395073413848877, + "logps/chosen": -208.95376586914062, + "logps/rejected": -197.12318420410156, + "loss": 0.8049, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.4388049244880676, + "rewards/margins": -0.17647361755371094, + "rewards/rejected": -0.2623312771320343, + "step": 175 + }, + { + "epoch": 0.18, + "learning_rate": 4.903825930468149e-05, + "logits/chosen": -2.3014442920684814, + "logits/rejected": -2.2676706314086914, + "logps/chosen": -176.29176330566406, + "logps/rejected": -158.9712371826172, + "loss": 0.6158, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.17731614410877228, + "rewards/margins": 0.219602569937706, + "rewards/rejected": -0.3969186842441559, + "step": 176 + }, + { + "epoch": 0.18, + "learning_rate": 4.901295279078431e-05, + "logits/chosen": -2.3018527030944824, + "logits/rejected": -2.250325918197632, + "logps/chosen": -198.3526611328125, + "logps/rejected": -224.3977508544922, + "loss": 0.749, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6051126718521118, + "rewards/margins": -0.05212073400616646, + "rewards/rejected": -0.5529919862747192, + "step": 177 + }, + { + "epoch": 0.19, + "learning_rate": 4.898732434036244e-05, + "logits/chosen": -2.299297571182251, + "logits/rejected": -2.3030190467834473, + "logps/chosen": -167.0242156982422, + "logps/rejected": -163.21133422851562, + "loss": 0.818, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.4616413414478302, + "rewards/margins": -0.1739044189453125, + "rewards/rejected": -0.2877369225025177, + "step": 178 + }, + { + "epoch": 0.19, + "learning_rate": 4.896137429701102e-05, + "logits/chosen": -2.186522960662842, + "logits/rejected": -2.127988338470459, + "logps/chosen": -173.8786163330078, + "logps/rejected": -175.3472137451172, + "loss": 0.6219, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.33478206396102905, + "rewards/margins": 0.27421796321868896, + "rewards/rejected": -0.609000027179718, + "step": 179 + }, + { + "epoch": 0.19, + "learning_rate": 4.893510300863676e-05, + "logits/chosen": -2.1868398189544678, + "logits/rejected": -2.2052829265594482, + "logps/chosen": -226.92471313476562, + "logps/rejected": -219.23912048339844, + "loss": 0.7715, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5434471964836121, + "rewards/margins": -0.10749813914299011, + "rewards/rejected": -0.43594905734062195, + "step": 180 + }, + { + "epoch": 0.19, + "learning_rate": 4.890851082745319e-05, + "logits/chosen": -2.2493791580200195, + "logits/rejected": -2.300419330596924, + "logps/chosen": -181.13748168945312, + "logps/rejected": -190.27645874023438, + "loss": 0.6568, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.10172881931066513, + "rewards/margins": 0.09888561069965363, + "rewards/rejected": -0.20061442255973816, + "step": 181 + }, + { + "epoch": 0.19, + "learning_rate": 4.8881598109976004e-05, + "logits/chosen": -2.3254058361053467, + "logits/rejected": -2.3100500106811523, + "logps/chosen": -208.1369171142578, + "logps/rejected": -213.24652099609375, + "loss": 0.7061, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5665594339370728, + "rewards/margins": 0.10720066726207733, + "rewards/rejected": -0.673760175704956, + "step": 182 + }, + { + "epoch": 0.19, + "learning_rate": 4.885436521701824e-05, + "logits/chosen": -2.3967294692993164, + "logits/rejected": -2.4122745990753174, + "logps/chosen": -123.20140838623047, + "logps/rejected": -133.7183837890625, + "loss": 0.6325, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3578134775161743, + "rewards/margins": 0.17870007455348969, + "rewards/rejected": -0.5365135669708252, + "step": 183 + }, + { + "epoch": 0.19, + "learning_rate": 4.8826812513685487e-05, + "logits/chosen": -2.327406167984009, + "logits/rejected": -2.3479795455932617, + "logps/chosen": -169.1548614501953, + "logps/rejected": -181.2438201904297, + "loss": 0.6755, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.5492762327194214, + "rewards/margins": 0.12284587323665619, + "rewards/rejected": -0.672122061252594, + "step": 184 + }, + { + "epoch": 0.19, + "learning_rate": 4.8798940369370944e-05, + "logits/chosen": -2.2139463424682617, + "logits/rejected": -2.182992935180664, + "logps/chosen": -169.96737670898438, + "logps/rejected": -162.341552734375, + "loss": 0.7834, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5437475442886353, + "rewards/margins": -0.11101134121417999, + "rewards/rejected": -0.43273624777793884, + "step": 185 + }, + { + "epoch": 0.19, + "learning_rate": 4.877074915775049e-05, + "logits/chosen": -2.370654582977295, + "logits/rejected": -2.2954137325286865, + "logps/chosen": -197.25172424316406, + "logps/rejected": -176.16543579101562, + "loss": 0.7289, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.43914932012557983, + "rewards/margins": -0.006785091012716293, + "rewards/rejected": -0.43236425518989563, + "step": 186 + }, + { + "epoch": 0.2, + "learning_rate": 4.8742239256777674e-05, + "logits/chosen": -2.179440975189209, + "logits/rejected": -2.1714303493499756, + "logps/chosen": -151.60626220703125, + "logps/rejected": -179.89920043945312, + "loss": 0.7239, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5735316872596741, + "rewards/margins": 0.02325405180454254, + "rewards/rejected": -0.5967857241630554, + "step": 187 + }, + { + "epoch": 0.2, + "learning_rate": 4.8713411048678635e-05, + "logits/chosen": -2.132361650466919, + "logits/rejected": -1.9567621946334839, + "logps/chosen": -185.78890991210938, + "logps/rejected": -146.61630249023438, + "loss": 0.7341, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5060631632804871, + "rewards/margins": -0.03276711702346802, + "rewards/rejected": -0.47329598665237427, + "step": 188 + }, + { + "epoch": 0.2, + "learning_rate": 4.868426491994702e-05, + "logits/chosen": -2.0946998596191406, + "logits/rejected": -2.0924675464630127, + "logps/chosen": -182.0203857421875, + "logps/rejected": -189.61448669433594, + "loss": 0.6746, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5287783741950989, + "rewards/margins": 0.13924337923526764, + "rewards/rejected": -0.6680217981338501, + "step": 189 + }, + { + "epoch": 0.2, + "learning_rate": 4.865480126133872e-05, + "logits/chosen": -2.1674387454986572, + "logits/rejected": -2.1639039516448975, + "logps/chosen": -215.89862060546875, + "logps/rejected": -194.6399383544922, + "loss": 0.6866, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.4448810815811157, + "rewards/margins": 0.05918551981449127, + "rewards/rejected": -0.5040666460990906, + "step": 190 + }, + { + "epoch": 0.2, + "learning_rate": 4.862502046786671e-05, + "logits/chosen": -2.1711835861206055, + "logits/rejected": -2.3064002990722656, + "logps/chosen": -173.3459014892578, + "logps/rejected": -198.23008728027344, + "loss": 0.6805, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.4160701632499695, + "rewards/margins": 0.11089640110731125, + "rewards/rejected": -0.5269665122032166, + "step": 191 + }, + { + "epoch": 0.2, + "learning_rate": 4.859492293879574e-05, + "logits/chosen": -2.2058756351470947, + "logits/rejected": -2.178715944290161, + "logps/chosen": -227.00070190429688, + "logps/rejected": -238.19232177734375, + "loss": 0.6392, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3010501563549042, + "rewards/margins": 0.1993046998977661, + "rewards/rejected": -0.5003548264503479, + "step": 192 + }, + { + "epoch": 0.2, + "learning_rate": 4.856450907763693e-05, + "logits/chosen": -2.2074475288391113, + "logits/rejected": -2.1603050231933594, + "logps/chosen": -149.5859832763672, + "logps/rejected": -149.609130859375, + "loss": 0.777, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.4570000171661377, + "rewards/margins": -0.09236741065979004, + "rewards/rejected": -0.36463260650634766, + "step": 193 + }, + { + "epoch": 0.2, + "learning_rate": 4.853377929214243e-05, + "logits/chosen": -2.123965263366699, + "logits/rejected": -2.116884469985962, + "logps/chosen": -172.16937255859375, + "logps/rejected": -180.29434204101562, + "loss": 0.6691, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4884795546531677, + "rewards/margins": 0.12548628449440002, + "rewards/rejected": -0.6139658689498901, + "step": 194 + }, + { + "epoch": 0.2, + "learning_rate": 4.85027339942999e-05, + "logits/chosen": -2.302314281463623, + "logits/rejected": -2.272852659225464, + "logps/chosen": -220.17941284179688, + "logps/rejected": -226.3072509765625, + "loss": 0.6509, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.4287835657596588, + "rewards/margins": 0.1545097827911377, + "rewards/rejected": -0.5832933783531189, + "step": 195 + }, + { + "epoch": 0.2, + "learning_rate": 4.8471373600326996e-05, + "logits/chosen": -2.139336347579956, + "logits/rejected": -2.087122678756714, + "logps/chosen": -140.20321655273438, + "logps/rejected": -131.48948669433594, + "loss": 0.7291, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.3434653580188751, + "rewards/margins": -0.03645901009440422, + "rewards/rejected": -0.3070063292980194, + "step": 196 + }, + { + "epoch": 0.21, + "learning_rate": 4.843969853066584e-05, + "logits/chosen": -2.292895793914795, + "logits/rejected": -2.316250801086426, + "logps/chosen": -147.1464385986328, + "logps/rejected": -161.3351593017578, + "loss": 0.6387, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1997198909521103, + "rewards/margins": 0.2121596336364746, + "rewards/rejected": -0.4118794798851013, + "step": 197 + }, + { + "epoch": 0.21, + "learning_rate": 4.8407709209977305e-05, + "logits/chosen": -2.4767165184020996, + "logits/rejected": -2.521482467651367, + "logps/chosen": -206.472412109375, + "logps/rejected": -215.40516662597656, + "loss": 0.6462, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5403072237968445, + "rewards/margins": 0.17954078316688538, + "rewards/rejected": -0.7198480367660522, + "step": 198 + }, + { + "epoch": 0.21, + "learning_rate": 4.837540606713538e-05, + "logits/chosen": -2.2345879077911377, + "logits/rejected": -2.221876621246338, + "logps/chosen": -166.65631103515625, + "logps/rejected": -146.72183227539062, + "loss": 0.8822, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.7132791876792908, + "rewards/margins": -0.2818105220794678, + "rewards/rejected": -0.43146878480911255, + "step": 199 + }, + { + "epoch": 0.21, + "learning_rate": 4.834278953522138e-05, + "logits/chosen": -2.1550986766815186, + "logits/rejected": -2.2200095653533936, + "logps/chosen": -141.46798706054688, + "logps/rejected": -152.30160522460938, + "loss": 0.6853, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.3020961284637451, + "rewards/margins": 0.08537312597036362, + "rewards/rejected": -0.3874692916870117, + "step": 200 + }, + { + "epoch": 0.21, + "learning_rate": 4.8309860051518204e-05, + "logits/chosen": -2.1666178703308105, + "logits/rejected": -2.195492744445801, + "logps/chosen": -154.3419647216797, + "logps/rejected": -154.75958251953125, + "loss": 0.7683, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5529704093933105, + "rewards/margins": 0.002926960587501526, + "rewards/rejected": -0.5558973550796509, + "step": 201 + }, + { + "epoch": 0.21, + "learning_rate": 4.8276618057504376e-05, + "logits/chosen": -2.2236335277557373, + "logits/rejected": -2.2724032402038574, + "logps/chosen": -143.28988647460938, + "logps/rejected": -150.9318084716797, + "loss": 0.7539, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.256462037563324, + "rewards/margins": -0.02850787341594696, + "rewards/rejected": -0.2279541939496994, + "step": 202 + }, + { + "epoch": 0.21, + "learning_rate": 4.824306399884822e-05, + "logits/chosen": -2.2802443504333496, + "logits/rejected": -2.2836642265319824, + "logps/chosen": -179.90365600585938, + "logps/rejected": -169.27171325683594, + "loss": 0.7907, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.4755932092666626, + "rewards/margins": -0.1532573252916336, + "rewards/rejected": -0.3223358988761902, + "step": 203 + }, + { + "epoch": 0.21, + "learning_rate": 4.8209198325401815e-05, + "logits/chosen": -2.3384103775024414, + "logits/rejected": -2.314667224884033, + "logps/chosen": -170.7639617919922, + "logps/rejected": -162.15753173828125, + "loss": 0.6634, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.26886916160583496, + "rewards/margins": 0.1269666701555252, + "rewards/rejected": -0.39583584666252136, + "step": 204 + }, + { + "epoch": 0.21, + "learning_rate": 4.817502149119502e-05, + "logits/chosen": -2.250046491622925, + "logits/rejected": -2.225694179534912, + "logps/chosen": -165.20504760742188, + "logps/rejected": -172.9903106689453, + "loss": 0.6962, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.45896613597869873, + "rewards/margins": 0.0778706818819046, + "rewards/rejected": -0.5368368029594421, + "step": 205 + }, + { + "epoch": 0.21, + "learning_rate": 4.8140533954429327e-05, + "logits/chosen": -2.313793420791626, + "logits/rejected": -2.2939281463623047, + "logps/chosen": -143.09251403808594, + "logps/rejected": -150.27903747558594, + "loss": 0.6284, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.2762828767299652, + "rewards/margins": 0.2016773521900177, + "rewards/rejected": -0.4779602289199829, + "step": 206 + }, + { + "epoch": 0.22, + "learning_rate": 4.810573617747178e-05, + "logits/chosen": -2.332127809524536, + "logits/rejected": -2.327253818511963, + "logps/chosen": -162.88739013671875, + "logps/rejected": -167.6454315185547, + "loss": 0.6114, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.22507114708423615, + "rewards/margins": 0.24488690495491028, + "rewards/rejected": -0.4699580669403076, + "step": 207 + }, + { + "epoch": 0.22, + "learning_rate": 4.8070628626848735e-05, + "logits/chosen": -2.12371563911438, + "logits/rejected": -2.1890509128570557, + "logps/chosen": -169.19342041015625, + "logps/rejected": -191.41851806640625, + "loss": 0.7002, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5895254611968994, + "rewards/margins": 0.07979288697242737, + "rewards/rejected": -0.6693182587623596, + "step": 208 + }, + { + "epoch": 0.22, + "learning_rate": 4.803521177323962e-05, + "logits/chosen": -2.2081453800201416, + "logits/rejected": -2.1713621616363525, + "logps/chosen": -163.9095458984375, + "logps/rejected": -169.9920654296875, + "loss": 0.6973, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3412324786186218, + "rewards/margins": 0.045901067554950714, + "rewards/rejected": -0.38713356852531433, + "step": 209 + }, + { + "epoch": 0.22, + "learning_rate": 4.799948609147061e-05, + "logits/chosen": -2.161041259765625, + "logits/rejected": -2.1152327060699463, + "logps/chosen": -165.09188842773438, + "logps/rejected": -156.6759796142578, + "loss": 0.7462, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.48258787393569946, + "rewards/margins": -0.04187957942485809, + "rewards/rejected": -0.44070830941200256, + "step": 210 + }, + { + "epoch": 0.22, + "learning_rate": 4.796345206050829e-05, + "logits/chosen": -2.106369733810425, + "logits/rejected": -2.2182164192199707, + "logps/chosen": -176.6033477783203, + "logps/rejected": -213.2372589111328, + "loss": 0.7045, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.4269718527793884, + "rewards/margins": 0.00813320279121399, + "rewards/rejected": -0.4351051151752472, + "step": 211 + }, + { + "epoch": 0.22, + "learning_rate": 4.792711016345321e-05, + "logits/chosen": -2.1926729679107666, + "logits/rejected": -2.137371063232422, + "logps/chosen": -157.62571716308594, + "logps/rejected": -141.09170532226562, + "loss": 0.6535, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.3278404176235199, + "rewards/margins": 0.16385424137115479, + "rewards/rejected": -0.49169468879699707, + "step": 212 + }, + { + "epoch": 0.22, + "learning_rate": 4.7890460887533417e-05, + "logits/chosen": -2.1466121673583984, + "logits/rejected": -2.181826114654541, + "logps/chosen": -163.6356964111328, + "logps/rejected": -176.73495483398438, + "loss": 0.6503, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.25386762619018555, + "rewards/margins": 0.17454546689987183, + "rewards/rejected": -0.4284130930900574, + "step": 213 + }, + { + "epoch": 0.22, + "learning_rate": 4.785350472409792e-05, + "logits/chosen": -2.1585237979888916, + "logits/rejected": -2.2346014976501465, + "logps/chosen": -171.45030212402344, + "logps/rejected": -226.79852294921875, + "loss": 0.7963, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.43708741664886475, + "rewards/margins": -0.08538319170475006, + "rewards/rejected": -0.3517042398452759, + "step": 214 + }, + { + "epoch": 0.22, + "learning_rate": 4.7816242168610093e-05, + "logits/chosen": -2.247028350830078, + "logits/rejected": -2.2580173015594482, + "logps/chosen": -190.72055053710938, + "logps/rejected": -186.36619567871094, + "loss": 0.6804, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.447765588760376, + "rewards/margins": 0.06104414537549019, + "rewards/rejected": -0.5088096857070923, + "step": 215 + }, + { + "epoch": 0.23, + "learning_rate": 4.777867372064105e-05, + "logits/chosen": -2.222308874130249, + "logits/rejected": -2.2742927074432373, + "logps/chosen": -165.11985778808594, + "logps/rejected": -187.6106719970703, + "loss": 0.7121, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.37397119402885437, + "rewards/margins": 0.040555573999881744, + "rewards/rejected": -0.4145267605781555, + "step": 216 + }, + { + "epoch": 0.23, + "learning_rate": 4.774079988386296e-05, + "logits/chosen": -2.2380332946777344, + "logits/rejected": -2.3339126110076904, + "logps/chosen": -137.3435821533203, + "logps/rejected": -169.9375, + "loss": 0.7015, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.36366236209869385, + "rewards/margins": 0.009570196270942688, + "rewards/rejected": -0.37323257327079773, + "step": 217 + }, + { + "epoch": 0.23, + "learning_rate": 4.770262116604224e-05, + "logits/chosen": -2.2831175327301025, + "logits/rejected": -2.260009765625, + "logps/chosen": -207.33409118652344, + "logps/rejected": -217.00962829589844, + "loss": 0.6735, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.27594539523124695, + "rewards/margins": 0.09640569984912872, + "rewards/rejected": -0.3723510801792145, + "step": 218 + }, + { + "epoch": 0.23, + "learning_rate": 4.76641380790328e-05, + "logits/chosen": -2.2921030521392822, + "logits/rejected": -2.2923154830932617, + "logps/chosen": -168.2577667236328, + "logps/rejected": -166.49632263183594, + "loss": 0.6891, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2567214369773865, + "rewards/margins": 0.15365271270275116, + "rewards/rejected": -0.41037416458129883, + "step": 219 + }, + { + "epoch": 0.23, + "learning_rate": 4.762535113876917e-05, + "logits/chosen": -2.273233652114868, + "logits/rejected": -2.246183395385742, + "logps/chosen": -208.0199737548828, + "logps/rejected": -209.45909118652344, + "loss": 0.6409, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12088227272033691, + "rewards/margins": 0.15805310010910034, + "rewards/rejected": -0.27893537282943726, + "step": 220 + }, + { + "epoch": 0.23, + "learning_rate": 4.758626086525956e-05, + "logits/chosen": -2.2132441997528076, + "logits/rejected": -2.22251296043396, + "logps/chosen": -167.60195922851562, + "logps/rejected": -189.9409942626953, + "loss": 0.6678, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3752858340740204, + "rewards/margins": 0.10380817204713821, + "rewards/rejected": -0.479093998670578, + "step": 221 + }, + { + "epoch": 0.23, + "learning_rate": 4.754686778257891e-05, + "logits/chosen": -2.282052755355835, + "logits/rejected": -2.2734806537628174, + "logps/chosen": -132.63140869140625, + "logps/rejected": -136.8794708251953, + "loss": 0.6003, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.009242314845323563, + "rewards/margins": 0.26125800609588623, + "rewards/rejected": -0.25201570987701416, + "step": 222 + }, + { + "epoch": 0.23, + "learning_rate": 4.750717241886185e-05, + "logits/chosen": -2.189680337905884, + "logits/rejected": -2.1535582542419434, + "logps/chosen": -129.69590759277344, + "logps/rejected": -127.54085540771484, + "loss": 0.7529, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5190645456314087, + "rewards/margins": -0.08417561650276184, + "rewards/rejected": -0.43488895893096924, + "step": 223 + }, + { + "epoch": 0.23, + "learning_rate": 4.7467175306295655e-05, + "logits/chosen": -2.2347896099090576, + "logits/rejected": -2.2738678455352783, + "logps/chosen": -158.85928344726562, + "logps/rejected": -158.6512908935547, + "loss": 0.683, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.4156660735607147, + "rewards/margins": 0.1208164319396019, + "rewards/rejected": -0.5364825129508972, + "step": 224 + }, + { + "epoch": 0.23, + "learning_rate": 4.7426876981113044e-05, + "logits/chosen": -2.1925535202026367, + "logits/rejected": -2.140517234802246, + "logps/chosen": -165.46412658691406, + "logps/rejected": -160.30532836914062, + "loss": 0.6721, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.22652333974838257, + "rewards/margins": 0.09546832740306854, + "rewards/rejected": -0.3219916820526123, + "step": 225 + }, + { + "epoch": 0.24, + "learning_rate": 4.738627798358506e-05, + "logits/chosen": -2.3328022956848145, + "logits/rejected": -2.3571670055389404, + "logps/chosen": -214.76580810546875, + "logps/rejected": -231.52804565429688, + "loss": 0.5743, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2423395961523056, + "rewards/margins": 0.3217008709907532, + "rewards/rejected": -0.5640404224395752, + "step": 226 + }, + { + "epoch": 0.24, + "learning_rate": 4.7345378858013776e-05, + "logits/chosen": -2.250012159347534, + "logits/rejected": -2.26370906829834, + "logps/chosen": -208.80230712890625, + "logps/rejected": -210.8206329345703, + "loss": 0.8003, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5306903719902039, + "rewards/margins": -0.14622673392295837, + "rewards/rejected": -0.3844636082649231, + "step": 227 + }, + { + "epoch": 0.24, + "learning_rate": 4.730418015272503e-05, + "logits/chosen": -2.3351643085479736, + "logits/rejected": -2.33880352973938, + "logps/chosen": -209.63296508789062, + "logps/rejected": -219.62771606445312, + "loss": 0.6265, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.4614105522632599, + "rewards/margins": 0.28935885429382324, + "rewards/rejected": -0.7507694959640503, + "step": 228 + }, + { + "epoch": 0.24, + "learning_rate": 4.726268242006106e-05, + "logits/chosen": -2.051967144012451, + "logits/rejected": -2.0519886016845703, + "logps/chosen": -142.45199584960938, + "logps/rejected": -140.49044799804688, + "loss": 0.6649, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.4592810571193695, + "rewards/margins": 0.15481841564178467, + "rewards/rejected": -0.6140995025634766, + "step": 229 + }, + { + "epoch": 0.24, + "learning_rate": 4.722088621637309e-05, + "logits/chosen": -2.2426981925964355, + "logits/rejected": -2.287612199783325, + "logps/chosen": -169.47291564941406, + "logps/rejected": -179.918212890625, + "loss": 0.7675, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.6015586853027344, + "rewards/margins": -0.048168424516916275, + "rewards/rejected": -0.5533902645111084, + "step": 230 + }, + { + "epoch": 0.24, + "learning_rate": 4.717879210201389e-05, + "logits/chosen": -2.192275047302246, + "logits/rejected": -2.308380365371704, + "logps/chosen": -156.87548828125, + "logps/rejected": -180.34512329101562, + "loss": 0.714, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.42379727959632874, + "rewards/margins": 0.08124817907810211, + "rewards/rejected": -0.505045473575592, + "step": 231 + }, + { + "epoch": 0.24, + "learning_rate": 4.713640064133025e-05, + "logits/chosen": -2.057934045791626, + "logits/rejected": -1.9755558967590332, + "logps/chosen": -156.72357177734375, + "logps/rejected": -158.1801300048828, + "loss": 0.7132, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5255974531173706, + "rewards/margins": 0.04890578240156174, + "rewards/rejected": -0.5745032429695129, + "step": 232 + }, + { + "epoch": 0.24, + "learning_rate": 4.7093712402655427e-05, + "logits/chosen": -2.055185079574585, + "logits/rejected": -2.0594377517700195, + "logps/chosen": -139.1905517578125, + "logps/rejected": -141.0957489013672, + "loss": 0.7112, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.4041973948478699, + "rewards/margins": 0.0033319219946861267, + "rewards/rejected": -0.407529354095459, + "step": 233 + }, + { + "epoch": 0.24, + "learning_rate": 4.7050727958301506e-05, + "logits/chosen": -2.1757850646972656, + "logits/rejected": -2.1435933113098145, + "logps/chosen": -177.0955810546875, + "logps/rejected": -161.39651489257812, + "loss": 0.6564, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3865699768066406, + "rewards/margins": 0.11041317880153656, + "rewards/rejected": -0.4969831705093384, + "step": 234 + }, + { + "epoch": 0.25, + "learning_rate": 4.7007447884551745e-05, + "logits/chosen": -2.055013418197632, + "logits/rejected": -2.0445072650909424, + "logps/chosen": -154.93429565429688, + "logps/rejected": -166.30482482910156, + "loss": 0.7354, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.48415255546569824, + "rewards/margins": 0.04046877846121788, + "rewards/rejected": -0.5246213674545288, + "step": 235 + }, + { + "epoch": 0.25, + "learning_rate": 4.6963872761652835e-05, + "logits/chosen": -2.202390670776367, + "logits/rejected": -2.2232775688171387, + "logps/chosen": -216.69921875, + "logps/rejected": -206.72483825683594, + "loss": 0.79, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.47140538692474365, + "rewards/margins": 0.04289142042398453, + "rewards/rejected": -0.51429682970047, + "step": 236 + }, + { + "epoch": 0.25, + "learning_rate": 4.692000317380715e-05, + "logits/chosen": -2.2061116695404053, + "logits/rejected": -2.2866199016571045, + "logps/chosen": -174.46450805664062, + "logps/rejected": -185.262939453125, + "loss": 0.628, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4978125989437103, + "rewards/margins": 0.25329017639160156, + "rewards/rejected": -0.7511026859283447, + "step": 237 + }, + { + "epoch": 0.25, + "learning_rate": 4.687583970916487e-05, + "logits/chosen": -2.2171239852905273, + "logits/rejected": -2.2869226932525635, + "logps/chosen": -186.90225219726562, + "logps/rejected": -210.4280548095703, + "loss": 0.7644, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.4735070765018463, + "rewards/margins": -0.040224503725767136, + "rewards/rejected": -0.4332825839519501, + "step": 238 + }, + { + "epoch": 0.25, + "learning_rate": 4.683138295981611e-05, + "logits/chosen": -2.1964111328125, + "logits/rejected": -2.212043046951294, + "logps/chosen": -153.104248046875, + "logps/rejected": -165.96299743652344, + "loss": 0.6534, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.24953705072402954, + "rewards/margins": 0.14786407351493835, + "rewards/rejected": -0.3974011242389679, + "step": 239 + }, + { + "epoch": 0.25, + "learning_rate": 4.678663352178301e-05, + "logits/chosen": -1.9726707935333252, + "logits/rejected": -2.0120890140533447, + "logps/chosen": -148.98944091796875, + "logps/rejected": -149.28582763671875, + "loss": 0.6168, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.33494099974632263, + "rewards/margins": 0.3014180362224579, + "rewards/rejected": -0.6363590359687805, + "step": 240 + }, + { + "epoch": 0.25, + "learning_rate": 4.674159199501173e-05, + "logits/chosen": -2.143721103668213, + "logits/rejected": -2.1720468997955322, + "logps/chosen": -131.01319885253906, + "logps/rejected": -147.077392578125, + "loss": 0.7585, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.4145124554634094, + "rewards/margins": -0.08547190576791763, + "rewards/rejected": -0.3290405869483948, + "step": 241 + }, + { + "epoch": 0.25, + "learning_rate": 4.6696258983364385e-05, + "logits/chosen": -2.2236969470977783, + "logits/rejected": -2.2577428817749023, + "logps/chosen": -184.3029327392578, + "logps/rejected": -190.77215576171875, + "loss": 0.6862, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.23868240416049957, + "rewards/margins": 0.11198949813842773, + "rewards/rejected": -0.3506719172000885, + "step": 242 + }, + { + "epoch": 0.25, + "learning_rate": 4.665063509461097e-05, + "logits/chosen": -2.060711622238159, + "logits/rejected": -2.0380258560180664, + "logps/chosen": -174.40530395507812, + "logps/rejected": -171.62820434570312, + "loss": 0.6911, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4223189949989319, + "rewards/margins": 0.14973345398902893, + "rewards/rejected": -0.5720524191856384, + "step": 243 + }, + { + "epoch": 0.25, + "learning_rate": 4.660472094042121e-05, + "logits/chosen": -2.1594133377075195, + "logits/rejected": -2.1700899600982666, + "logps/chosen": -207.55848693847656, + "logps/rejected": -200.78297424316406, + "loss": 0.7768, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5937251448631287, + "rewards/margins": -0.08584102243185043, + "rewards/rejected": -0.50788414478302, + "step": 244 + }, + { + "epoch": 0.26, + "learning_rate": 4.655851713635635e-05, + "logits/chosen": -2.305196762084961, + "logits/rejected": -2.2462093830108643, + "logps/chosen": -243.9378662109375, + "logps/rejected": -215.36997985839844, + "loss": 0.724, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5542324185371399, + "rewards/margins": 0.021042201668024063, + "rewards/rejected": -0.5752745866775513, + "step": 245 + }, + { + "epoch": 0.26, + "learning_rate": 4.651202430186092e-05, + "logits/chosen": -2.025132179260254, + "logits/rejected": -1.9792908430099487, + "logps/chosen": -195.5504608154297, + "logps/rejected": -195.3519744873047, + "loss": 0.7289, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.539334237575531, + "rewards/margins": 0.23235680162906647, + "rewards/rejected": -0.771691083908081, + "step": 246 + }, + { + "epoch": 0.26, + "learning_rate": 4.6465243060254415e-05, + "logits/chosen": -2.147789239883423, + "logits/rejected": -2.1263253688812256, + "logps/chosen": -189.7777099609375, + "logps/rejected": -175.15142822265625, + "loss": 0.7963, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.40796539187431335, + "rewards/margins": -0.13925030827522278, + "rewards/rejected": -0.2687150835990906, + "step": 247 + }, + { + "epoch": 0.26, + "learning_rate": 4.641817403872293e-05, + "logits/chosen": -2.0318384170532227, + "logits/rejected": -2.0659642219543457, + "logps/chosen": -165.9607391357422, + "logps/rejected": -182.31895446777344, + "loss": 0.8084, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.3802737891674042, + "rewards/margins": -0.0964832603931427, + "rewards/rejected": -0.28379055857658386, + "step": 248 + }, + { + "epoch": 0.26, + "learning_rate": 4.637081786831079e-05, + "logits/chosen": -2.0719449520111084, + "logits/rejected": -2.026743173599243, + "logps/chosen": -191.45486450195312, + "logps/rejected": -185.71011352539062, + "loss": 0.7078, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3257860839366913, + "rewards/margins": 0.07691369950771332, + "rewards/rejected": -0.4026997983455658, + "step": 249 + }, + { + "epoch": 0.26, + "learning_rate": 4.6323175183912024e-05, + "logits/chosen": -2.0912959575653076, + "logits/rejected": -2.156938314437866, + "logps/chosen": -167.86541748046875, + "logps/rejected": -198.28921508789062, + "loss": 0.4872, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.35692939162254333, + "rewards/margins": 0.6100252866744995, + "rewards/rejected": -0.9669547080993652, + "step": 250 + }, + { + "epoch": 0.26, + "learning_rate": 4.627524662426194e-05, + "logits/chosen": -1.8769241571426392, + "logits/rejected": -1.8571780920028687, + "logps/chosen": -175.3948974609375, + "logps/rejected": -183.54019165039062, + "loss": 0.8564, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6141475439071655, + "rewards/margins": -0.18047203123569489, + "rewards/rejected": -0.43367546796798706, + "step": 251 + }, + { + "epoch": 0.26, + "learning_rate": 4.6227032831928484e-05, + "logits/chosen": -1.9304271936416626, + "logits/rejected": -1.8033334016799927, + "logps/chosen": -174.45651245117188, + "logps/rejected": -144.34657287597656, + "loss": 0.7318, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.625647246837616, + "rewards/margins": 0.09466619789600372, + "rewards/rejected": -0.7203134298324585, + "step": 252 + }, + { + "epoch": 0.26, + "learning_rate": 4.6178534453303666e-05, + "logits/chosen": -2.082902193069458, + "logits/rejected": -2.0136592388153076, + "logps/chosen": -201.52587890625, + "logps/rejected": -198.62039184570312, + "loss": 0.87, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5361589789390564, + "rewards/margins": -0.25926750898361206, + "rewards/rejected": -0.27689146995544434, + "step": 253 + }, + { + "epoch": 0.26, + "learning_rate": 4.6129752138594874e-05, + "logits/chosen": -1.957344889640808, + "logits/rejected": -1.982904314994812, + "logps/chosen": -187.89329528808594, + "logps/rejected": -186.68289184570312, + "loss": 0.7922, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5014457106590271, + "rewards/margins": -0.08516909182071686, + "rewards/rejected": -0.41627663373947144, + "step": 254 + }, + { + "epoch": 0.27, + "learning_rate": 4.608068654181617e-05, + "logits/chosen": -1.654222846031189, + "logits/rejected": -1.6798536777496338, + "logps/chosen": -184.433349609375, + "logps/rejected": -182.4044189453125, + "loss": 0.6938, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.25272616744041443, + "rewards/margins": 0.10025610774755478, + "rewards/rejected": -0.3529822528362274, + "step": 255 + }, + { + "epoch": 0.27, + "learning_rate": 4.6031338320779534e-05, + "logits/chosen": -2.0019171237945557, + "logits/rejected": -2.0632452964782715, + "logps/chosen": -162.12811279296875, + "logps/rejected": -177.98736572265625, + "loss": 0.688, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5015031099319458, + "rewards/margins": 0.15753880143165588, + "rewards/rejected": -0.6590418815612793, + "step": 256 + }, + { + "epoch": 0.27, + "learning_rate": 4.5981708137086e-05, + "logits/chosen": -2.0519323348999023, + "logits/rejected": -2.089592933654785, + "logps/chosen": -168.29608154296875, + "logps/rejected": -180.1384735107422, + "loss": 0.6519, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.46469518542289734, + "rewards/margins": 0.22538693249225616, + "rewards/rejected": -0.6900821924209595, + "step": 257 + }, + { + "epoch": 0.27, + "learning_rate": 4.5931796656116846e-05, + "logits/chosen": -1.9542289972305298, + "logits/rejected": -2.1067469120025635, + "logps/chosen": -141.18040466308594, + "logps/rejected": -154.12332153320312, + "loss": 0.7912, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.424506276845932, + "rewards/margins": -0.11340674757957458, + "rewards/rejected": -0.3110995292663574, + "step": 258 + }, + { + "epoch": 0.27, + "learning_rate": 4.588160454702462e-05, + "logits/chosen": -1.9130336046218872, + "logits/rejected": -1.8589096069335938, + "logps/chosen": -154.01983642578125, + "logps/rejected": -151.2600860595703, + "loss": 0.6448, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.17828267812728882, + "rewards/margins": 0.22233837842941284, + "rewards/rejected": -0.40062105655670166, + "step": 259 + }, + { + "epoch": 0.27, + "learning_rate": 4.5831132482724195e-05, + "logits/chosen": -1.969378113746643, + "logits/rejected": -1.9947978258132935, + "logps/chosen": -221.1347198486328, + "logps/rejected": -219.78065490722656, + "loss": 0.7675, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8122418522834778, + "rewards/margins": -0.012366384267807007, + "rewards/rejected": -0.7998754978179932, + "step": 260 + }, + { + "epoch": 0.27, + "learning_rate": 4.578038113988376e-05, + "logits/chosen": -1.9141626358032227, + "logits/rejected": -1.8990297317504883, + "logps/chosen": -181.55563354492188, + "logps/rejected": -158.1247100830078, + "loss": 0.8615, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.5867054462432861, + "rewards/margins": -0.2548179626464844, + "rewards/rejected": -0.33188754320144653, + "step": 261 + }, + { + "epoch": 0.27, + "learning_rate": 4.572935119891571e-05, + "logits/chosen": -1.9274933338165283, + "logits/rejected": -2.0607924461364746, + "logps/chosen": -200.04896545410156, + "logps/rejected": -209.02926635742188, + "loss": 0.6567, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5149486064910889, + "rewards/margins": 0.22366176545619965, + "rewards/rejected": -0.7386104464530945, + "step": 262 + }, + { + "epoch": 0.27, + "learning_rate": 4.5678043343967554e-05, + "logits/chosen": -2.108922243118286, + "logits/rejected": -2.041205406188965, + "logps/chosen": -174.11276245117188, + "logps/rejected": -146.20103454589844, + "loss": 0.9138, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.47598356008529663, + "rewards/margins": -0.3184044659137726, + "rewards/rejected": -0.15757909417152405, + "step": 263 + }, + { + "epoch": 0.28, + "learning_rate": 4.5626458262912745e-05, + "logits/chosen": -2.0428683757781982, + "logits/rejected": -1.9962735176086426, + "logps/chosen": -203.15782165527344, + "logps/rejected": -179.91824340820312, + "loss": 0.8575, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.2803889513015747, + "rewards/margins": -0.19668762385845184, + "rewards/rejected": -0.08370131254196167, + "step": 264 + }, + { + "epoch": 0.28, + "learning_rate": 4.557459664734141e-05, + "logits/chosen": -1.9452859163284302, + "logits/rejected": -2.0013086795806885, + "logps/chosen": -151.39443969726562, + "logps/rejected": -160.42532348632812, + "loss": 0.7297, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.366563618183136, + "rewards/margins": 0.06915048509836197, + "rewards/rejected": -0.43571415543556213, + "step": 265 + }, + { + "epoch": 0.28, + "learning_rate": 4.552245919255117e-05, + "logits/chosen": -1.9962891340255737, + "logits/rejected": -2.0349621772766113, + "logps/chosen": -178.134765625, + "logps/rejected": -167.83949279785156, + "loss": 0.7017, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.19582800567150116, + "rewards/margins": 0.07028350979089737, + "rewards/rejected": -0.2661115229129791, + "step": 266 + }, + { + "epoch": 0.28, + "learning_rate": 4.5470046597537735e-05, + "logits/chosen": -1.9965953826904297, + "logits/rejected": -2.0604090690612793, + "logps/chosen": -158.63546752929688, + "logps/rejected": -177.67669677734375, + "loss": 0.7415, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.3360700309276581, + "rewards/margins": 0.05659861862659454, + "rewards/rejected": -0.3926686644554138, + "step": 267 + }, + { + "epoch": 0.28, + "learning_rate": 4.541735956498554e-05, + "logits/chosen": -1.95860755443573, + "logits/rejected": -1.9922930002212524, + "logps/chosen": -130.50543212890625, + "logps/rejected": -137.30873107910156, + "loss": 0.6415, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.13774187862873077, + "rewards/margins": 0.156549870967865, + "rewards/rejected": -0.29429173469543457, + "step": 268 + }, + { + "epoch": 0.28, + "learning_rate": 4.5364398801258396e-05, + "logits/chosen": -2.0323469638824463, + "logits/rejected": -1.989902377128601, + "logps/chosen": -123.73210144042969, + "logps/rejected": -120.40567016601562, + "loss": 0.7417, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.23733878135681152, + "rewards/margins": -0.006110057234764099, + "rewards/rejected": -0.2312287539243698, + "step": 269 + }, + { + "epoch": 0.28, + "learning_rate": 4.5311165016389916e-05, + "logits/chosen": -2.2102789878845215, + "logits/rejected": -2.1945927143096924, + "logps/chosen": -178.87513732910156, + "logps/rejected": -185.6321258544922, + "loss": 0.58, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3368569016456604, + "rewards/margins": 0.3213872015476227, + "rewards/rejected": -0.6582440733909607, + "step": 270 + }, + { + "epoch": 0.28, + "learning_rate": 4.525765892407409e-05, + "logits/chosen": -2.014317750930786, + "logits/rejected": -1.9814597368240356, + "logps/chosen": -162.72450256347656, + "logps/rejected": -162.10733032226562, + "loss": 0.7063, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.30713194608688354, + "rewards/margins": 0.028126142919063568, + "rewards/rejected": -0.33525803685188293, + "step": 271 + }, + { + "epoch": 0.28, + "learning_rate": 4.5203881241655644e-05, + "logits/chosen": -2.2270286083221436, + "logits/rejected": -2.207059144973755, + "logps/chosen": -158.911376953125, + "logps/rejected": -163.7472381591797, + "loss": 0.8314, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.24247878789901733, + "rewards/margins": -0.18095803260803223, + "rewards/rejected": -0.0615207776427269, + "step": 272 + }, + { + "epoch": 0.28, + "learning_rate": 4.514983269012049e-05, + "logits/chosen": -2.163167715072632, + "logits/rejected": -2.183046817779541, + "logps/chosen": -174.82070922851562, + "logps/rejected": -164.1748504638672, + "loss": 0.7933, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.38691380620002747, + "rewards/margins": -0.16529785096645355, + "rewards/rejected": -0.22161594033241272, + "step": 273 + }, + { + "epoch": 0.29, + "learning_rate": 4.509551399408598e-05, + "logits/chosen": -2.253500461578369, + "logits/rejected": -2.260409355163574, + "logps/chosen": -190.3398895263672, + "logps/rejected": -201.19200134277344, + "loss": 0.7775, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21146364510059357, + "rewards/margins": -0.06666092574596405, + "rewards/rejected": -0.14480271935462952, + "step": 274 + }, + { + "epoch": 0.29, + "learning_rate": 4.504092588179128e-05, + "logits/chosen": -2.2398221492767334, + "logits/rejected": -2.1929337978363037, + "logps/chosen": -231.88616943359375, + "logps/rejected": -226.3920135498047, + "loss": 0.6504, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.30005282163619995, + "rewards/margins": 0.13275346159934998, + "rewards/rejected": -0.43280625343322754, + "step": 275 + }, + { + "epoch": 0.29, + "learning_rate": 4.498606908508754e-05, + "logits/chosen": -2.251856565475464, + "logits/rejected": -2.2825677394866943, + "logps/chosen": -194.575927734375, + "logps/rejected": -208.67977905273438, + "loss": 0.7303, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.1339748501777649, + "rewards/margins": -0.01863221824169159, + "rewards/rejected": -0.1153426244854927, + "step": 276 + }, + { + "epoch": 0.29, + "learning_rate": 4.4930944339428085e-05, + "logits/chosen": -1.9029638767242432, + "logits/rejected": -2.052668571472168, + "logps/chosen": -192.86978149414062, + "logps/rejected": -214.9619140625, + "loss": 0.734, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.31461602449417114, + "rewards/margins": -0.04775575175881386, + "rewards/rejected": -0.26686030626296997, + "step": 277 + }, + { + "epoch": 0.29, + "learning_rate": 4.487555238385862e-05, + "logits/chosen": -2.242000102996826, + "logits/rejected": -2.1664445400238037, + "logps/chosen": -177.2509002685547, + "logps/rejected": -168.57376098632812, + "loss": 0.7252, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.3784841001033783, + "rewards/margins": -0.008476179093122482, + "rewards/rejected": -0.3700079321861267, + "step": 278 + }, + { + "epoch": 0.29, + "learning_rate": 4.481989396100724e-05, + "logits/chosen": -2.1768834590911865, + "logits/rejected": -2.122082471847534, + "logps/chosen": -136.37344360351562, + "logps/rejected": -129.23260498046875, + "loss": 0.7226, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1186956837773323, + "rewards/margins": 0.10684624314308167, + "rewards/rejected": -0.22554191946983337, + "step": 279 + }, + { + "epoch": 0.29, + "learning_rate": 4.476396981707453e-05, + "logits/chosen": -2.2543294429779053, + "logits/rejected": -2.2154581546783447, + "logps/chosen": -170.55038452148438, + "logps/rejected": -156.1742706298828, + "loss": 0.7387, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.22529585659503937, + "rewards/margins": -0.04772930592298508, + "rewards/rejected": -0.1775665581226349, + "step": 280 + }, + { + "epoch": 0.29, + "learning_rate": 4.470778070182353e-05, + "logits/chosen": -2.2135331630706787, + "logits/rejected": -2.170997142791748, + "logps/chosen": -140.70477294921875, + "logps/rejected": -140.6665496826172, + "loss": 0.7037, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09401345998048782, + "rewards/margins": 0.028659436851739883, + "rewards/rejected": -0.122672900557518, + "step": 281 + }, + { + "epoch": 0.29, + "learning_rate": 4.465132736856969e-05, + "logits/chosen": -2.2878525257110596, + "logits/rejected": -2.2237839698791504, + "logps/chosen": -164.2831573486328, + "logps/rejected": -160.92367553710938, + "loss": 0.7044, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.34608572721481323, + "rewards/margins": 0.046185556799173355, + "rewards/rejected": -0.3922712802886963, + "step": 282 + }, + { + "epoch": 0.3, + "learning_rate": 4.459461057417078e-05, + "logits/chosen": -2.185762882232666, + "logits/rejected": -2.099052667617798, + "logps/chosen": -151.68963623046875, + "logps/rejected": -147.94314575195312, + "loss": 0.6582, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.20870129764080048, + "rewards/margins": 0.16736984252929688, + "rewards/rejected": -0.37607109546661377, + "step": 283 + }, + { + "epoch": 0.3, + "learning_rate": 4.453763107901675e-05, + "logits/chosen": -2.1122889518737793, + "logits/rejected": -2.010392189025879, + "logps/chosen": -187.94906616210938, + "logps/rejected": -185.841796875, + "loss": 0.6521, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.08321194350719452, + "rewards/margins": 0.12147242575883865, + "rewards/rejected": -0.20468439161777496, + "step": 284 + }, + { + "epoch": 0.3, + "learning_rate": 4.4480389647019505e-05, + "logits/chosen": -2.1214723587036133, + "logits/rejected": -2.033308506011963, + "logps/chosen": -152.14337158203125, + "logps/rejected": -149.34632873535156, + "loss": 0.7408, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3923906683921814, + "rewards/margins": 0.008589165285229683, + "rewards/rejected": -0.40097981691360474, + "step": 285 + }, + { + "epoch": 0.3, + "learning_rate": 4.442288704560268e-05, + "logits/chosen": -2.1580543518066406, + "logits/rejected": -2.1600093841552734, + "logps/chosen": -202.5330810546875, + "logps/rejected": -194.6158905029297, + "loss": 0.8982, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.4735994040966034, + "rewards/margins": -0.2955199182033539, + "rewards/rejected": -0.1780795007944107, + "step": 286 + }, + { + "epoch": 0.3, + "learning_rate": 4.436512404569136e-05, + "logits/chosen": -2.1974916458129883, + "logits/rejected": -2.259157657623291, + "logps/chosen": -147.84683227539062, + "logps/rejected": -164.73829650878906, + "loss": 0.6104, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2959311902523041, + "rewards/margins": 0.22050346434116364, + "rewards/rejected": -0.5164346694946289, + "step": 287 + }, + { + "epoch": 0.3, + "learning_rate": 4.430710142170176e-05, + "logits/chosen": -2.341240644454956, + "logits/rejected": -2.3171639442443848, + "logps/chosen": -151.489990234375, + "logps/rejected": -136.1929931640625, + "loss": 0.7421, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.12041179090738297, + "rewards/margins": -0.06451301276683807, + "rewards/rejected": -0.0558987595140934, + "step": 288 + }, + { + "epoch": 0.3, + "learning_rate": 4.424881995153076e-05, + "logits/chosen": -2.076103448867798, + "logits/rejected": -2.188572883605957, + "logps/chosen": -155.2179718017578, + "logps/rejected": -181.94903564453125, + "loss": 0.7223, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.1940208077430725, + "rewards/margins": 0.03361191600561142, + "rewards/rejected": -0.22763270139694214, + "step": 289 + }, + { + "epoch": 0.3, + "learning_rate": 4.419028041654559e-05, + "logits/chosen": -2.1491026878356934, + "logits/rejected": -2.1136281490325928, + "logps/chosen": -151.4420928955078, + "logps/rejected": -141.8388214111328, + "loss": 0.6722, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2384054809808731, + "rewards/margins": 0.15212693810462952, + "rewards/rejected": -0.39053237438201904, + "step": 290 + }, + { + "epoch": 0.3, + "learning_rate": 4.4131483601573285e-05, + "logits/chosen": -1.9776232242584229, + "logits/rejected": -2.004852771759033, + "logps/chosen": -170.9802703857422, + "logps/rejected": -162.25498962402344, + "loss": 0.7193, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18611471354961395, + "rewards/margins": 0.00812564603984356, + "rewards/rejected": -0.19424037635326385, + "step": 291 + }, + { + "epoch": 0.3, + "learning_rate": 4.4072430294890174e-05, + "logits/chosen": -2.0479369163513184, + "logits/rejected": -2.0386621952056885, + "logps/chosen": -178.59158325195312, + "logps/rejected": -174.89373779296875, + "loss": 0.6113, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.28108635544776917, + "rewards/margins": 0.20999087393283844, + "rewards/rejected": -0.4910773038864136, + "step": 292 + }, + { + "epoch": 0.31, + "learning_rate": 4.4013121288211307e-05, + "logits/chosen": -2.2802951335906982, + "logits/rejected": -2.192854881286621, + "logps/chosen": -153.86907958984375, + "logps/rejected": -142.56341552734375, + "loss": 0.7816, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.07042498886585236, + "rewards/margins": -0.13551297783851624, + "rewards/rejected": 0.06508798897266388, + "step": 293 + }, + { + "epoch": 0.31, + "learning_rate": 4.3953557376679856e-05, + "logits/chosen": -2.2507810592651367, + "logits/rejected": -2.2218830585479736, + "logps/chosen": -125.35671997070312, + "logps/rejected": -128.93533325195312, + "loss": 0.7147, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.10209127515554428, + "rewards/margins": 0.0391690619289875, + "rewards/rejected": -0.14126034080982208, + "step": 294 + }, + { + "epoch": 0.31, + "learning_rate": 4.389373935885646e-05, + "logits/chosen": -2.169445514678955, + "logits/rejected": -2.194335460662842, + "logps/chosen": -157.78758239746094, + "logps/rejected": -165.60736083984375, + "loss": 0.7124, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.24438923597335815, + "rewards/margins": 0.05163384974002838, + "rewards/rejected": -0.29602310061454773, + "step": 295 + }, + { + "epoch": 0.31, + "learning_rate": 4.383366803670849e-05, + "logits/chosen": -2.2508602142333984, + "logits/rejected": -2.30161714553833, + "logps/chosen": -167.88427734375, + "logps/rejected": -184.48272705078125, + "loss": 0.7121, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.15736985206604004, + "rewards/margins": 0.126334547996521, + "rewards/rejected": -0.28370437026023865, + "step": 296 + }, + { + "epoch": 0.31, + "learning_rate": 4.377334421559932e-05, + "logits/chosen": -2.314563035964966, + "logits/rejected": -2.2978732585906982, + "logps/chosen": -179.24159240722656, + "logps/rejected": -188.38165283203125, + "loss": 0.7184, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.012131119146943092, + "rewards/margins": -0.01239142008125782, + "rewards/rejected": 0.02452254109084606, + "step": 297 + }, + { + "epoch": 0.31, + "learning_rate": 4.371276870427753e-05, + "logits/chosen": -2.066857099533081, + "logits/rejected": -2.174121856689453, + "logps/chosen": -170.82960510253906, + "logps/rejected": -189.02621459960938, + "loss": 0.7574, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.28441551327705383, + "rewards/margins": -0.09379325062036514, + "rewards/rejected": -0.1906222403049469, + "step": 298 + }, + { + "epoch": 0.31, + "learning_rate": 4.365194231486604e-05, + "logits/chosen": -2.147336006164551, + "logits/rejected": -2.132305383682251, + "logps/chosen": -158.25559997558594, + "logps/rejected": -152.197509765625, + "loss": 0.6692, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.104363813996315, + "rewards/margins": 0.12111049890518188, + "rewards/rejected": -0.2254743129014969, + "step": 299 + }, + { + "epoch": 0.31, + "learning_rate": 4.359086586285127e-05, + "logits/chosen": -2.247628688812256, + "logits/rejected": -2.286552667617798, + "logps/chosen": -133.13673400878906, + "logps/rejected": -183.82647705078125, + "loss": 0.6152, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.17196473479270935, + "rewards/margins": 0.2282213419675827, + "rewards/rejected": -0.40018609166145325, + "step": 300 + }, + { + "epoch": 0.31, + "learning_rate": 4.3529540167072126e-05, + "logits/chosen": -1.8818175792694092, + "logits/rejected": -1.8767746686935425, + "logps/chosen": -134.77548217773438, + "logps/rejected": -151.11073303222656, + "loss": 0.6999, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2484281212091446, + "rewards/margins": 0.12181131541728973, + "rewards/rejected": -0.3702394366264343, + "step": 301 + }, + { + "epoch": 0.32, + "learning_rate": 4.346796604970912e-05, + "logits/chosen": -2.107909679412842, + "logits/rejected": -2.138780355453491, + "logps/chosen": -168.53460693359375, + "logps/rejected": -174.63592529296875, + "loss": 0.7954, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.30226460099220276, + "rewards/margins": -0.09562454372644424, + "rewards/rejected": -0.20664002001285553, + "step": 302 + }, + { + "epoch": 0.32, + "learning_rate": 4.340614433627328e-05, + "logits/chosen": -2.1604933738708496, + "logits/rejected": -2.2617201805114746, + "logps/chosen": -155.14198303222656, + "logps/rejected": -169.87091064453125, + "loss": 0.6444, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07352495938539505, + "rewards/margins": 0.1430576741695404, + "rewards/rejected": -0.21658262610435486, + "step": 303 + }, + { + "epoch": 0.32, + "learning_rate": 4.3344075855595104e-05, + "logits/chosen": -2.1969313621520996, + "logits/rejected": -2.2095913887023926, + "logps/chosen": -165.4632568359375, + "logps/rejected": -158.85653686523438, + "loss": 0.7844, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.2591025233268738, + "rewards/margins": -0.1096937507390976, + "rewards/rejected": -0.14940877258777618, + "step": 304 + }, + { + "epoch": 0.32, + "learning_rate": 4.328176143981343e-05, + "logits/chosen": -2.146892786026001, + "logits/rejected": -2.1590354442596436, + "logps/chosen": -165.599365234375, + "logps/rejected": -155.86920166015625, + "loss": 0.5594, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04538270831108093, + "rewards/margins": 0.3189627528190613, + "rewards/rejected": -0.27358004450798035, + "step": 305 + }, + { + "epoch": 0.32, + "learning_rate": 4.321920192436433e-05, + "logits/chosen": -2.226012706756592, + "logits/rejected": -2.2147953510284424, + "logps/chosen": -149.3193817138672, + "logps/rejected": -182.29501342773438, + "loss": 0.7058, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.34394580125808716, + "rewards/margins": 0.0967680886387825, + "rewards/rejected": -0.44071388244628906, + "step": 306 + }, + { + "epoch": 0.32, + "learning_rate": 4.315639814796983e-05, + "logits/chosen": -2.0329627990722656, + "logits/rejected": -2.1080126762390137, + "logps/chosen": -145.89712524414062, + "logps/rejected": -164.14891052246094, + "loss": 0.6894, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.22714479267597198, + "rewards/margins": 0.17113275825977325, + "rewards/rejected": -0.39827755093574524, + "step": 307 + }, + { + "epoch": 0.32, + "learning_rate": 4.309335095262676e-05, + "logits/chosen": -2.131873607635498, + "logits/rejected": -2.3015997409820557, + "logps/chosen": -146.17129516601562, + "logps/rejected": -182.18138122558594, + "loss": 0.6993, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.3666872978210449, + "rewards/margins": 0.09421360492706299, + "rewards/rejected": -0.4609009325504303, + "step": 308 + }, + { + "epoch": 0.32, + "learning_rate": 4.303006118359537e-05, + "logits/chosen": -2.2067878246307373, + "logits/rejected": -2.162324905395508, + "logps/chosen": -169.20140075683594, + "logps/rejected": -163.42198181152344, + "loss": 0.7353, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.5356483459472656, + "rewards/margins": -0.05132238194346428, + "rewards/rejected": -0.48432594537734985, + "step": 309 + }, + { + "epoch": 0.32, + "learning_rate": 4.296652968938807e-05, + "logits/chosen": -2.0966219902038574, + "logits/rejected": -2.096193313598633, + "logps/chosen": -181.05987548828125, + "logps/rejected": -199.04830932617188, + "loss": 0.8487, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.5439967513084412, + "rewards/margins": -0.17244365811347961, + "rewards/rejected": -0.3715530335903168, + "step": 310 + }, + { + "epoch": 0.32, + "learning_rate": 4.2902757321758016e-05, + "logits/chosen": -2.0997745990753174, + "logits/rejected": -2.11462140083313, + "logps/chosen": -154.70413208007812, + "logps/rejected": -166.63629150390625, + "loss": 0.6186, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.27216559648513794, + "rewards/margins": 0.23034000396728516, + "rewards/rejected": -0.5025056004524231, + "step": 311 + }, + { + "epoch": 0.33, + "learning_rate": 4.283874493568772e-05, + "logits/chosen": -2.1701714992523193, + "logits/rejected": -2.2467424869537354, + "logps/chosen": -172.70042419433594, + "logps/rejected": -210.9363250732422, + "loss": 0.7279, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.49327972531318665, + "rewards/margins": 0.008369775488972664, + "rewards/rejected": -0.5016494989395142, + "step": 312 + }, + { + "epoch": 0.33, + "learning_rate": 4.2774493389377545e-05, + "logits/chosen": -2.2590439319610596, + "logits/rejected": -2.214010000228882, + "logps/chosen": -149.47921752929688, + "logps/rejected": -162.21078491210938, + "loss": 0.7782, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.27489376068115234, + "rewards/margins": -0.09422563016414642, + "rewards/rejected": -0.18066814541816711, + "step": 313 + }, + { + "epoch": 0.33, + "learning_rate": 4.271000354423426e-05, + "logits/chosen": -2.179133892059326, + "logits/rejected": -2.3145484924316406, + "logps/chosen": -161.55218505859375, + "logps/rejected": -180.9322509765625, + "loss": 0.766, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.41766488552093506, + "rewards/margins": -0.06459204852581024, + "rewards/rejected": -0.353072851896286, + "step": 314 + }, + { + "epoch": 0.33, + "learning_rate": 4.2645276264859394e-05, + "logits/chosen": -2.14973521232605, + "logits/rejected": -2.122270107269287, + "logps/chosen": -169.07066345214844, + "logps/rejected": -148.82968139648438, + "loss": 0.7404, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.4353243112564087, + "rewards/margins": -0.06003642827272415, + "rewards/rejected": -0.3752879202365875, + "step": 315 + }, + { + "epoch": 0.33, + "learning_rate": 4.258031241903778e-05, + "logits/chosen": -2.2368862628936768, + "logits/rejected": -2.231748104095459, + "logps/chosen": -228.32550048828125, + "logps/rejected": -248.53265380859375, + "loss": 0.7322, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.48313820362091064, + "rewards/margins": -0.03280310332775116, + "rewards/rejected": -0.4503350555896759, + "step": 316 + }, + { + "epoch": 0.33, + "learning_rate": 4.251511287772579e-05, + "logits/chosen": -2.172724485397339, + "logits/rejected": -2.166696786880493, + "logps/chosen": -166.26548767089844, + "logps/rejected": -189.61898803710938, + "loss": 0.7494, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.4010167717933655, + "rewards/margins": -0.011104248464107513, + "rewards/rejected": -0.38991254568099976, + "step": 317 + }, + { + "epoch": 0.33, + "learning_rate": 4.2449678515039747e-05, + "logits/chosen": -2.168539047241211, + "logits/rejected": -2.230973243713379, + "logps/chosen": -150.7926025390625, + "logps/rejected": -141.18051147460938, + "loss": 0.801, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.29772108793258667, + "rewards/margins": -0.10154817998409271, + "rewards/rejected": -0.19617292284965515, + "step": 318 + }, + { + "epoch": 0.33, + "learning_rate": 4.238401020824416e-05, + "logits/chosen": -2.1671128273010254, + "logits/rejected": -2.1290652751922607, + "logps/chosen": -163.53701782226562, + "logps/rejected": -176.98638916015625, + "loss": 0.6244, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.26465463638305664, + "rewards/margins": 0.2812088131904602, + "rewards/rejected": -0.5458635091781616, + "step": 319 + }, + { + "epoch": 0.33, + "learning_rate": 4.231810883773999e-05, + "logits/chosen": -2.0769715309143066, + "logits/rejected": -2.2087574005126953, + "logps/chosen": -143.20196533203125, + "logps/rejected": -185.92027282714844, + "loss": 0.7082, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.41921350359916687, + "rewards/margins": 0.10322752594947815, + "rewards/rejected": -0.522441029548645, + "step": 320 + }, + { + "epoch": 0.33, + "learning_rate": 4.2251975287052804e-05, + "logits/chosen": -2.1802303791046143, + "logits/rejected": -2.2122409343719482, + "logps/chosen": -156.01007080078125, + "logps/rejected": -183.6471405029297, + "loss": 0.6816, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2279636561870575, + "rewards/margins": 0.09283652901649475, + "rewards/rejected": -0.32080018520355225, + "step": 321 + }, + { + "epoch": 0.34, + "learning_rate": 4.218561044282099e-05, + "logits/chosen": -2.113987684249878, + "logits/rejected": -2.1755659580230713, + "logps/chosen": -183.98928833007812, + "logps/rejected": -201.93418884277344, + "loss": 0.6784, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.39034149050712585, + "rewards/margins": 0.08643309772014618, + "rewards/rejected": -0.47677451372146606, + "step": 322 + }, + { + "epoch": 0.34, + "learning_rate": 4.211901519478382e-05, + "logits/chosen": -2.139608144760132, + "logits/rejected": -2.3428738117218018, + "logps/chosen": -165.4562530517578, + "logps/rejected": -215.16778564453125, + "loss": 0.631, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6147371530532837, + "rewards/margins": 0.23880484700202942, + "rewards/rejected": -0.8535419702529907, + "step": 323 + }, + { + "epoch": 0.34, + "learning_rate": 4.2052190435769554e-05, + "logits/chosen": -2.1603267192840576, + "logits/rejected": -2.0630412101745605, + "logps/chosen": -173.13119506835938, + "logps/rejected": -156.80117797851562, + "loss": 0.6334, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2138206511735916, + "rewards/margins": 0.18611598014831543, + "rewards/rejected": -0.39993664622306824, + "step": 324 + }, + { + "epoch": 0.34, + "learning_rate": 4.198513706168345e-05, + "logits/chosen": -2.132692813873291, + "logits/rejected": -2.117668390274048, + "logps/chosen": -163.94891357421875, + "logps/rejected": -177.30056762695312, + "loss": 0.6352, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3021223843097687, + "rewards/margins": 0.18045057356357574, + "rewards/rejected": -0.4825729727745056, + "step": 325 + }, + { + "epoch": 0.34, + "learning_rate": 4.191785597149577e-05, + "logits/chosen": -2.129894495010376, + "logits/rejected": -2.126570224761963, + "logps/chosen": -233.8136749267578, + "logps/rejected": -209.49566650390625, + "loss": 0.7343, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5364924073219299, + "rewards/margins": -0.013996928930282593, + "rewards/rejected": -0.5224955677986145, + "step": 326 + }, + { + "epoch": 0.34, + "learning_rate": 4.1850348067229696e-05, + "logits/chosen": -2.096973419189453, + "logits/rejected": -2.1738548278808594, + "logps/chosen": -152.7852020263672, + "logps/rejected": -173.36817932128906, + "loss": 0.7105, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07808439433574677, + "rewards/margins": 0.017765391618013382, + "rewards/rejected": -0.09584978222846985, + "step": 327 + }, + { + "epoch": 0.34, + "learning_rate": 4.178261425394926e-05, + "logits/chosen": -2.026822566986084, + "logits/rejected": -2.074733257293701, + "logps/chosen": -171.08468627929688, + "logps/rejected": -204.09425354003906, + "loss": 0.8344, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.6632390022277832, + "rewards/margins": -0.23066554963588715, + "rewards/rejected": -0.4325733780860901, + "step": 328 + }, + { + "epoch": 0.34, + "learning_rate": 4.171465543974723e-05, + "logits/chosen": -2.205124855041504, + "logits/rejected": -2.198807716369629, + "logps/chosen": -153.81307983398438, + "logps/rejected": -165.59127807617188, + "loss": 0.6848, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.3507269620895386, + "rewards/margins": 0.07101988792419434, + "rewards/rejected": -0.4217468202114105, + "step": 329 + }, + { + "epoch": 0.34, + "learning_rate": 4.1646472535732895e-05, + "logits/chosen": -2.2543835639953613, + "logits/rejected": -2.169010877609253, + "logps/chosen": -193.3108673095703, + "logps/rejected": -163.5917510986328, + "loss": 0.7477, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.4203927218914032, + "rewards/margins": -0.05665392428636551, + "rewards/rejected": -0.3637387752532959, + "step": 330 + }, + { + "epoch": 0.35, + "learning_rate": 4.157806645601988e-05, + "logits/chosen": -1.9615943431854248, + "logits/rejected": -2.026665210723877, + "logps/chosen": -188.08934020996094, + "logps/rejected": -210.8447265625, + "loss": 0.6278, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.26600322127342224, + "rewards/margins": 0.22521573305130005, + "rewards/rejected": -0.4912189245223999, + "step": 331 + }, + { + "epoch": 0.35, + "learning_rate": 4.1509438117713866e-05, + "logits/chosen": -2.1103501319885254, + "logits/rejected": -2.092162609100342, + "logps/chosen": -152.57652282714844, + "logps/rejected": -155.68255615234375, + "loss": 0.7481, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.19243402779102325, + "rewards/margins": -0.01794758439064026, + "rewards/rejected": -0.1744864583015442, + "step": 332 + }, + { + "epoch": 0.35, + "learning_rate": 4.144058844090032e-05, + "logits/chosen": -2.059112310409546, + "logits/rejected": -2.1364006996154785, + "logps/chosen": -134.24253845214844, + "logps/rejected": -140.66732788085938, + "loss": 0.7082, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.19404403865337372, + "rewards/margins": 0.034823037683963776, + "rewards/rejected": -0.2288670837879181, + "step": 333 + }, + { + "epoch": 0.35, + "learning_rate": 4.137151834863213e-05, + "logits/chosen": -2.178894519805908, + "logits/rejected": -2.218296766281128, + "logps/chosen": -167.58921813964844, + "logps/rejected": -184.11642456054688, + "loss": 0.6905, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.18130388855934143, + "rewards/margins": 0.06935537606477737, + "rewards/rejected": -0.2506592571735382, + "step": 334 + }, + { + "epoch": 0.35, + "learning_rate": 4.130222876691726e-05, + "logits/chosen": -1.9210792779922485, + "logits/rejected": -1.9056644439697266, + "logps/chosen": -248.0112762451172, + "logps/rejected": -249.83152770996094, + "loss": 0.6878, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5254245400428772, + "rewards/margins": 0.07501597702503204, + "rewards/rejected": -0.600440502166748, + "step": 335 + }, + { + "epoch": 0.35, + "learning_rate": 4.123272062470633e-05, + "logits/chosen": -2.2695250511169434, + "logits/rejected": -2.3075647354125977, + "logps/chosen": -175.56838989257812, + "logps/rejected": -185.3209686279297, + "loss": 0.6858, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5953464508056641, + "rewards/margins": 0.08486279845237732, + "rewards/rejected": -0.680209219455719, + "step": 336 + }, + { + "epoch": 0.35, + "learning_rate": 4.116299485388014e-05, + "logits/chosen": -2.1485931873321533, + "logits/rejected": -2.143951177597046, + "logps/chosen": -147.05918884277344, + "logps/rejected": -146.83811950683594, + "loss": 0.8233, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.5940690040588379, + "rewards/margins": -0.17442089319229126, + "rewards/rejected": -0.4196482002735138, + "step": 337 + }, + { + "epoch": 0.35, + "learning_rate": 4.109305238923718e-05, + "logits/chosen": -2.151376247406006, + "logits/rejected": -2.2524118423461914, + "logps/chosen": -258.1955261230469, + "logps/rejected": -254.03448486328125, + "loss": 0.6856, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5329893231391907, + "rewards/margins": 0.19465385377407074, + "rewards/rejected": -0.7276431322097778, + "step": 338 + }, + { + "epoch": 0.35, + "learning_rate": 4.102289416848114e-05, + "logits/chosen": -2.141131639480591, + "logits/rejected": -2.094794511795044, + "logps/chosen": -143.72801208496094, + "logps/rejected": -137.58067321777344, + "loss": 0.7808, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.384622186422348, + "rewards/margins": -0.13031712174415588, + "rewards/rejected": -0.25430506467819214, + "step": 339 + }, + { + "epoch": 0.35, + "learning_rate": 4.095252113220827e-05, + "logits/chosen": -2.16725492477417, + "logits/rejected": -2.1304190158843994, + "logps/chosen": -168.14285278320312, + "logps/rejected": -173.74656677246094, + "loss": 0.7767, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.37200167775154114, + "rewards/margins": -0.032151952385902405, + "rewards/rejected": -0.3398497402667999, + "step": 340 + }, + { + "epoch": 0.36, + "learning_rate": 4.088193422389484e-05, + "logits/chosen": -2.1071646213531494, + "logits/rejected": -2.1935393810272217, + "logps/chosen": -165.9573516845703, + "logps/rejected": -193.26974487304688, + "loss": 0.5765, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.30919840931892395, + "rewards/margins": 0.3557150065898895, + "rewards/rejected": -0.6649134159088135, + "step": 341 + }, + { + "epoch": 0.36, + "learning_rate": 4.0811134389884433e-05, + "logits/chosen": -1.9773459434509277, + "logits/rejected": -2.059852361679077, + "logps/chosen": -149.0285186767578, + "logps/rejected": -159.69705200195312, + "loss": 0.641, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1402963101863861, + "rewards/margins": 0.19628655910491943, + "rewards/rejected": -0.33658286929130554, + "step": 342 + }, + { + "epoch": 0.36, + "learning_rate": 4.0740122579375286e-05, + "logits/chosen": -2.0288474559783936, + "logits/rejected": -2.244412422180176, + "logps/chosen": -158.99160766601562, + "logps/rejected": -198.47886657714844, + "loss": 0.6393, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3524281680583954, + "rewards/margins": 0.20226937532424927, + "rewards/rejected": -0.5546976327896118, + "step": 343 + }, + { + "epoch": 0.36, + "learning_rate": 4.066889974440757e-05, + "logits/chosen": -1.9884339570999146, + "logits/rejected": -2.0600476264953613, + "logps/chosen": -149.9541473388672, + "logps/rejected": -168.91061401367188, + "loss": 0.7141, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.3000350594520569, + "rewards/margins": 0.055553682148456573, + "rewards/rejected": -0.35558873414993286, + "step": 344 + }, + { + "epoch": 0.36, + "learning_rate": 4.0597466839850595e-05, + "logits/chosen": -2.229095935821533, + "logits/rejected": -2.208395481109619, + "logps/chosen": -180.67138671875, + "logps/rejected": -191.8995361328125, + "loss": 0.8027, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5766149163246155, + "rewards/margins": -0.08274443447589874, + "rewards/rejected": -0.4938705563545227, + "step": 345 + }, + { + "epoch": 0.36, + "learning_rate": 4.0525824823390045e-05, + "logits/chosen": -1.9827308654785156, + "logits/rejected": -2.038292646408081, + "logps/chosen": -137.3387451171875, + "logps/rejected": -156.16510009765625, + "loss": 0.6739, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.24382978677749634, + "rewards/margins": 0.09684039652347565, + "rewards/rejected": -0.3406701982021332, + "step": 346 + }, + { + "epoch": 0.36, + "learning_rate": 4.045397465551513e-05, + "logits/chosen": -2.0480711460113525, + "logits/rejected": -2.0361733436584473, + "logps/chosen": -173.17909240722656, + "logps/rejected": -157.0221405029297, + "loss": 0.7591, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5380522012710571, + "rewards/margins": -0.046980153769254684, + "rewards/rejected": -0.4910720884799957, + "step": 347 + }, + { + "epoch": 0.36, + "learning_rate": 4.038191729950569e-05, + "logits/chosen": -2.229896068572998, + "logits/rejected": -2.211841106414795, + "logps/chosen": -167.17428588867188, + "logps/rejected": -167.23196411132812, + "loss": 0.8467, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.6671632528305054, + "rewards/margins": -0.22287489473819733, + "rewards/rejected": -0.44428837299346924, + "step": 348 + }, + { + "epoch": 0.36, + "learning_rate": 4.030965372141927e-05, + "logits/chosen": -2.0725326538085938, + "logits/rejected": -2.0685665607452393, + "logps/chosen": -151.9163360595703, + "logps/rejected": -158.74620056152344, + "loss": 0.6537, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.27225998044013977, + "rewards/margins": 0.1315668821334839, + "rewards/rejected": -0.40382686257362366, + "step": 349 + }, + { + "epoch": 0.37, + "learning_rate": 4.0237184890078245e-05, + "logits/chosen": -2.1178064346313477, + "logits/rejected": -2.1523594856262207, + "logps/chosen": -157.16265869140625, + "logps/rejected": -176.5404052734375, + "loss": 0.622, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.31149694323539734, + "rewards/margins": 0.21072791516780853, + "rewards/rejected": -0.5222248435020447, + "step": 350 + }, + { + "epoch": 0.37, + "learning_rate": 4.0164511777056725e-05, + "logits/chosen": -2.2286159992218018, + "logits/rejected": -2.2007691860198975, + "logps/chosen": -186.49661254882812, + "logps/rejected": -188.19149780273438, + "loss": 0.7013, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5486454963684082, + "rewards/margins": 0.11480455100536346, + "rewards/rejected": -0.6634500622749329, + "step": 351 + }, + { + "epoch": 0.37, + "learning_rate": 4.009163535666761e-05, + "logits/chosen": -2.182291030883789, + "logits/rejected": -2.1893458366394043, + "logps/chosen": -148.7130126953125, + "logps/rejected": -159.863037109375, + "loss": 0.6501, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.23504705727100372, + "rewards/margins": 0.17834332585334778, + "rewards/rejected": -0.4133903682231903, + "step": 352 + }, + { + "epoch": 0.37, + "learning_rate": 4.001855660594948e-05, + "logits/chosen": -2.0689799785614014, + "logits/rejected": -2.133513927459717, + "logps/chosen": -193.48846435546875, + "logps/rejected": -231.55738830566406, + "loss": 0.6689, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5353763699531555, + "rewards/margins": 0.13139232993125916, + "rewards/rejected": -0.6667687296867371, + "step": 353 + }, + { + "epoch": 0.37, + "learning_rate": 3.994527650465352e-05, + "logits/chosen": -2.2244315147399902, + "logits/rejected": -2.206336259841919, + "logps/chosen": -153.9037628173828, + "logps/rejected": -153.5791778564453, + "loss": 0.6506, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.10444588959217072, + "rewards/margins": 0.24528437852859497, + "rewards/rejected": -0.3497302234172821, + "step": 354 + }, + { + "epoch": 0.37, + "learning_rate": 3.98717960352304e-05, + "logits/chosen": -2.0277304649353027, + "logits/rejected": -1.9793894290924072, + "logps/chosen": -153.72535705566406, + "logps/rejected": -152.46961975097656, + "loss": 0.7438, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5082700252532959, + "rewards/margins": 0.030984222888946533, + "rewards/rejected": -0.5392543077468872, + "step": 355 + }, + { + "epoch": 0.37, + "learning_rate": 3.979811618281706e-05, + "logits/chosen": -2.0062384605407715, + "logits/rejected": -2.057262420654297, + "logps/chosen": -136.8114013671875, + "logps/rejected": -134.69456481933594, + "loss": 0.7313, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2975345849990845, + "rewards/margins": 0.03512765094637871, + "rewards/rejected": -0.3326622247695923, + "step": 356 + }, + { + "epoch": 0.37, + "learning_rate": 3.972423793522352e-05, + "logits/chosen": -2.0485219955444336, + "logits/rejected": -2.085298776626587, + "logps/chosen": -193.29806518554688, + "logps/rejected": -208.20973205566406, + "loss": 0.818, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.6571996212005615, + "rewards/margins": -0.1324111372232437, + "rewards/rejected": -0.524788498878479, + "step": 357 + }, + { + "epoch": 0.37, + "learning_rate": 3.9650162282919655e-05, + "logits/chosen": -1.9818403720855713, + "logits/rejected": -2.0531868934631348, + "logps/chosen": -158.05345153808594, + "logps/rejected": -157.64669799804688, + "loss": 0.7875, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.27151066064834595, + "rewards/margins": -0.05360978841781616, + "rewards/rejected": -0.21790087223052979, + "step": 358 + }, + { + "epoch": 0.37, + "learning_rate": 3.957589021902191e-05, + "logits/chosen": -2.1913740634918213, + "logits/rejected": -2.147808790206909, + "logps/chosen": -158.68458557128906, + "logps/rejected": -168.61692810058594, + "loss": 0.8811, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6251986622810364, + "rewards/margins": -0.24323511123657227, + "rewards/rejected": -0.3819635510444641, + "step": 359 + }, + { + "epoch": 0.38, + "learning_rate": 3.9501422739279956e-05, + "logits/chosen": -1.9855284690856934, + "logits/rejected": -2.0047171115875244, + "logps/chosen": -158.85598754882812, + "logps/rejected": -186.7561492919922, + "loss": 0.8237, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.4131236970424652, + "rewards/margins": -0.2267259657382965, + "rewards/rejected": -0.1863977611064911, + "step": 360 + }, + { + "epoch": 0.38, + "learning_rate": 3.942676084206338e-05, + "logits/chosen": -2.1845693588256836, + "logits/rejected": -2.2711105346679688, + "logps/chosen": -153.8560791015625, + "logps/rejected": -188.9151611328125, + "loss": 0.661, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2779008448123932, + "rewards/margins": 0.1375725418329239, + "rewards/rejected": -0.4154733717441559, + "step": 361 + }, + { + "epoch": 0.38, + "learning_rate": 3.9351905528348285e-05, + "logits/chosen": -2.063652515411377, + "logits/rejected": -2.1154680252075195, + "logps/chosen": -154.21665954589844, + "logps/rejected": -166.70904541015625, + "loss": 0.6842, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.302326500415802, + "rewards/margins": 0.07578597962856293, + "rewards/rejected": -0.3781124949455261, + "step": 362 + }, + { + "epoch": 0.38, + "learning_rate": 3.927685780170385e-05, + "logits/chosen": -2.115208625793457, + "logits/rejected": -2.042466878890991, + "logps/chosen": -133.27244567871094, + "logps/rejected": -123.8929214477539, + "loss": 0.667, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.09392361342906952, + "rewards/margins": 0.1019444689154625, + "rewards/rejected": -0.1958681046962738, + "step": 363 + }, + { + "epoch": 0.38, + "learning_rate": 3.920161866827889e-05, + "logits/chosen": -2.167541980743408, + "logits/rejected": -2.15377140045166, + "logps/chosen": -152.6026611328125, + "logps/rejected": -145.47889709472656, + "loss": 0.723, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.22231236100196838, + "rewards/margins": 0.015478478744626045, + "rewards/rejected": -0.23779082298278809, + "step": 364 + }, + { + "epoch": 0.38, + "learning_rate": 3.9126189136788416e-05, + "logits/chosen": -2.1280405521392822, + "logits/rejected": -1.9908369779586792, + "logps/chosen": -146.22325134277344, + "logps/rejected": -131.01043701171875, + "loss": 0.6667, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.32886457443237305, + "rewards/margins": 0.1325063705444336, + "rewards/rejected": -0.46137094497680664, + "step": 365 + }, + { + "epoch": 0.38, + "learning_rate": 3.90505702185e-05, + "logits/chosen": -2.0596060752868652, + "logits/rejected": -1.9969902038574219, + "logps/chosen": -164.17156982421875, + "logps/rejected": -137.92181396484375, + "loss": 0.8594, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.4894620478153229, + "rewards/margins": -0.21743880212306976, + "rewards/rejected": -0.2720232605934143, + "step": 366 + }, + { + "epoch": 0.38, + "learning_rate": 3.897476292722034e-05, + "logits/chosen": -1.9921385049819946, + "logits/rejected": -2.1082358360290527, + "logps/chosen": -140.42034912109375, + "logps/rejected": -164.16915893554688, + "loss": 0.6899, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.26265978813171387, + "rewards/margins": 0.043181706219911575, + "rewards/rejected": -0.30584150552749634, + "step": 367 + }, + { + "epoch": 0.38, + "learning_rate": 3.889876827928156e-05, + "logits/chosen": -2.0175795555114746, + "logits/rejected": -2.040492296218872, + "logps/chosen": -152.78158569335938, + "logps/rejected": -151.21533203125, + "loss": 0.595, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.16340675950050354, + "rewards/margins": 0.2702358067035675, + "rewards/rejected": -0.43364256620407104, + "step": 368 + }, + { + "epoch": 0.38, + "learning_rate": 3.882258729352768e-05, + "logits/chosen": -2.0957415103912354, + "logits/rejected": -2.0559308528900146, + "logps/chosen": -185.1763153076172, + "logps/rejected": -192.24658203125, + "loss": 0.6894, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2837347686290741, + "rewards/margins": 0.07096924632787704, + "rewards/rejected": -0.35470402240753174, + "step": 369 + }, + { + "epoch": 0.39, + "learning_rate": 3.874622099130087e-05, + "logits/chosen": -1.9157230854034424, + "logits/rejected": -1.9512850046157837, + "logps/chosen": -149.11521911621094, + "logps/rejected": -165.88624572753906, + "loss": 0.6754, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.335940957069397, + "rewards/margins": 0.11438290774822235, + "rewards/rejected": -0.45032384991645813, + "step": 370 + }, + { + "epoch": 0.39, + "learning_rate": 3.866967039642784e-05, + "logits/chosen": -2.0574257373809814, + "logits/rejected": -2.203120470046997, + "logps/chosen": -158.74758911132812, + "logps/rejected": -173.32127380371094, + "loss": 0.6842, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.40988606214523315, + "rewards/margins": 0.061044152826070786, + "rewards/rejected": -0.4709302484989166, + "step": 371 + }, + { + "epoch": 0.39, + "learning_rate": 3.859293653520604e-05, + "logits/chosen": -2.053711175918579, + "logits/rejected": -1.980366587638855, + "logps/chosen": -214.1997528076172, + "logps/rejected": -201.98345947265625, + "loss": 0.6636, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2103947103023529, + "rewards/margins": 0.11527465283870697, + "rewards/rejected": -0.3256693482398987, + "step": 372 + }, + { + "epoch": 0.39, + "learning_rate": 3.851602043638994e-05, + "logits/chosen": -2.012058973312378, + "logits/rejected": -1.9634625911712646, + "logps/chosen": -167.78753662109375, + "logps/rejected": -190.8964385986328, + "loss": 0.6111, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.26867952942848206, + "rewards/margins": 0.29532575607299805, + "rewards/rejected": -0.5640051960945129, + "step": 373 + }, + { + "epoch": 0.39, + "learning_rate": 3.843892313117724e-05, + "logits/chosen": -2.0894453525543213, + "logits/rejected": -2.1030113697052, + "logps/chosen": -155.4733123779297, + "logps/rejected": -177.70977783203125, + "loss": 0.784, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.658606767654419, + "rewards/margins": -0.06122620403766632, + "rewards/rejected": -0.597380518913269, + "step": 374 + }, + { + "epoch": 0.39, + "learning_rate": 3.8361645653195026e-05, + "logits/chosen": -2.1214489936828613, + "logits/rejected": -2.1964516639709473, + "logps/chosen": -163.82373046875, + "logps/rejected": -185.65740966796875, + "loss": 0.7088, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.20250454545021057, + "rewards/margins": 0.09702645242214203, + "rewards/rejected": -0.2995309829711914, + "step": 375 + }, + { + "epoch": 0.39, + "learning_rate": 3.8284189038485936e-05, + "logits/chosen": -2.225022554397583, + "logits/rejected": -2.2104990482330322, + "logps/chosen": -160.45584106445312, + "logps/rejected": -158.27365112304688, + "loss": 0.6533, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3125608265399933, + "rewards/margins": 0.14464329183101654, + "rewards/rejected": -0.45720410346984863, + "step": 376 + }, + { + "epoch": 0.39, + "learning_rate": 3.8206554325494225e-05, + "logits/chosen": -2.246929168701172, + "logits/rejected": -2.206899881362915, + "logps/chosen": -177.77415466308594, + "logps/rejected": -168.1491241455078, + "loss": 0.6993, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4578195810317993, + "rewards/margins": 0.06638437509536743, + "rewards/rejected": -0.5242039561271667, + "step": 377 + }, + { + "epoch": 0.39, + "learning_rate": 3.812874255505191e-05, + "logits/chosen": -2.2009379863739014, + "logits/rejected": -2.1823983192443848, + "logps/chosen": -149.4908905029297, + "logps/rejected": -165.87646484375, + "loss": 0.822, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.47621116042137146, + "rewards/margins": -0.1242537796497345, + "rewards/rejected": -0.35195738077163696, + "step": 378 + }, + { + "epoch": 0.4, + "learning_rate": 3.805075477036476e-05, + "logits/chosen": -2.1507351398468018, + "logits/rejected": -2.098275661468506, + "logps/chosen": -155.75393676757812, + "logps/rejected": -150.32437133789062, + "loss": 0.6496, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.23333942890167236, + "rewards/margins": 0.19434207677841187, + "rewards/rejected": -0.42768150568008423, + "step": 379 + }, + { + "epoch": 0.4, + "learning_rate": 3.797259201699833e-05, + "logits/chosen": -2.231349468231201, + "logits/rejected": -2.2406768798828125, + "logps/chosen": -165.03302001953125, + "logps/rejected": -160.68557739257812, + "loss": 0.6536, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2146168351173401, + "rewards/margins": 0.12036348879337311, + "rewards/rejected": -0.334980309009552, + "step": 380 + }, + { + "epoch": 0.4, + "learning_rate": 3.789425534286394e-05, + "logits/chosen": -2.3824687004089355, + "logits/rejected": -2.3478920459747314, + "logps/chosen": -267.89453125, + "logps/rejected": -268.1957702636719, + "loss": 0.8122, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.3535911440849304, + "rewards/margins": -0.1552121490240097, + "rewards/rejected": -0.19837898015975952, + "step": 381 + }, + { + "epoch": 0.4, + "learning_rate": 3.781574579820464e-05, + "logits/chosen": -2.171052932739258, + "logits/rejected": -2.204153299331665, + "logps/chosen": -226.82186889648438, + "logps/rejected": -241.44273376464844, + "loss": 0.6833, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4559452533721924, + "rewards/margins": 0.0971999317407608, + "rewards/rejected": -0.553145170211792, + "step": 382 + }, + { + "epoch": 0.4, + "learning_rate": 3.773706443558111e-05, + "logits/chosen": -2.1312382221221924, + "logits/rejected": -2.159982442855835, + "logps/chosen": -169.75729370117188, + "logps/rejected": -180.92498779296875, + "loss": 0.769, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.4790065586566925, + "rewards/margins": -0.034655213356018066, + "rewards/rejected": -0.44435134530067444, + "step": 383 + }, + { + "epoch": 0.4, + "learning_rate": 3.765821230985758e-05, + "logits/chosen": -2.1556124687194824, + "logits/rejected": -2.1452436447143555, + "logps/chosen": -205.10719299316406, + "logps/rejected": -178.97406005859375, + "loss": 0.6892, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.4622449278831482, + "rewards/margins": 0.09195668250322342, + "rewards/rejected": -0.554201602935791, + "step": 384 + }, + { + "epoch": 0.4, + "learning_rate": 3.75791904781876e-05, + "logits/chosen": -2.2256433963775635, + "logits/rejected": -2.1840739250183105, + "logps/chosen": -173.39869689941406, + "logps/rejected": -180.0209503173828, + "loss": 0.7169, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.41330060362815857, + "rewards/margins": 0.014306016266345978, + "rewards/rejected": -0.42760664224624634, + "step": 385 + }, + { + "epoch": 0.4, + "learning_rate": 3.7500000000000003e-05, + "logits/chosen": -2.2018544673919678, + "logits/rejected": -2.175457000732422, + "logps/chosen": -141.57928466796875, + "logps/rejected": -142.81686401367188, + "loss": 0.6963, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.1722419261932373, + "rewards/margins": 0.04862082004547119, + "rewards/rejected": -0.2208627462387085, + "step": 386 + }, + { + "epoch": 0.4, + "learning_rate": 3.74206419369846e-05, + "logits/chosen": -2.224078893661499, + "logits/rejected": -2.2861950397491455, + "logps/chosen": -193.62677001953125, + "logps/rejected": -196.46714782714844, + "loss": 0.8549, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.5895551443099976, + "rewards/margins": -0.2451256364583969, + "rewards/rejected": -0.34442949295043945, + "step": 387 + }, + { + "epoch": 0.4, + "learning_rate": 3.7341117353077966e-05, + "logits/chosen": -2.3696727752685547, + "logits/rejected": -2.3207218647003174, + "logps/chosen": -237.3257598876953, + "logps/rejected": -208.5120391845703, + "loss": 0.6565, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.47394251823425293, + "rewards/margins": 0.12043605744838715, + "rewards/rejected": -0.5943784713745117, + "step": 388 + }, + { + "epoch": 0.41, + "learning_rate": 3.726142731444921e-05, + "logits/chosen": -2.1822972297668457, + "logits/rejected": -2.2831835746765137, + "logps/chosen": -150.12652587890625, + "logps/rejected": -144.74551391601562, + "loss": 0.7884, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.33526331186294556, + "rewards/margins": -0.13741034269332886, + "rewards/rejected": -0.1978529691696167, + "step": 389 + }, + { + "epoch": 0.41, + "learning_rate": 3.718157288948563e-05, + "logits/chosen": -2.2395238876342773, + "logits/rejected": -2.2703099250793457, + "logps/chosen": -177.6690216064453, + "logps/rejected": -185.6038818359375, + "loss": 0.5714, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.45524442195892334, + "rewards/margins": 0.34756022691726685, + "rewards/rejected": -0.8028046488761902, + "step": 390 + }, + { + "epoch": 0.41, + "learning_rate": 3.710155514877844e-05, + "logits/chosen": -2.2453153133392334, + "logits/rejected": -2.2400312423706055, + "logps/chosen": -161.18931579589844, + "logps/rejected": -155.0644073486328, + "loss": 0.9213, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.633940577507019, + "rewards/margins": -0.3272451162338257, + "rewards/rejected": -0.30669546127319336, + "step": 391 + }, + { + "epoch": 0.41, + "learning_rate": 3.702137516510838e-05, + "logits/chosen": -2.1709861755371094, + "logits/rejected": -2.1527490615844727, + "logps/chosen": -149.6127166748047, + "logps/rejected": -135.25987243652344, + "loss": 0.6775, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.31984999775886536, + "rewards/margins": 0.08133503049612045, + "rewards/rejected": -0.4011850357055664, + "step": 392 + }, + { + "epoch": 0.41, + "learning_rate": 3.694103401343136e-05, + "logits/chosen": -2.3013548851013184, + "logits/rejected": -2.2986433506011963, + "logps/chosen": -165.6312255859375, + "logps/rejected": -174.7577362060547, + "loss": 0.7364, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5793277025222778, + "rewards/margins": -0.015010036528110504, + "rewards/rejected": -0.5643177032470703, + "step": 393 + }, + { + "epoch": 0.41, + "learning_rate": 3.686053277086401e-05, + "logits/chosen": -2.1550047397613525, + "logits/rejected": -2.246464252471924, + "logps/chosen": -147.44175720214844, + "logps/rejected": -154.8894805908203, + "loss": 0.7561, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.3496069014072418, + "rewards/margins": -0.07592416554689407, + "rewards/rejected": -0.27368271350860596, + "step": 394 + }, + { + "epoch": 0.41, + "learning_rate": 3.6779872516669295e-05, + "logits/chosen": -2.1460325717926025, + "logits/rejected": -2.154590129852295, + "logps/chosen": -151.6021728515625, + "logps/rejected": -168.52456665039062, + "loss": 0.5368, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.2864820659160614, + "rewards/margins": 0.4405989944934845, + "rewards/rejected": -0.7270810008049011, + "step": 395 + }, + { + "epoch": 0.41, + "learning_rate": 3.669905433224199e-05, + "logits/chosen": -2.315129041671753, + "logits/rejected": -2.4050261974334717, + "logps/chosen": -146.9856414794922, + "logps/rejected": -169.13397216796875, + "loss": 0.7616, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2588258385658264, + "rewards/margins": -0.059488385915756226, + "rewards/rejected": -0.199337437748909, + "step": 396 + }, + { + "epoch": 0.41, + "learning_rate": 3.6618079301094216e-05, + "logits/chosen": -2.233609199523926, + "logits/rejected": -2.2259573936462402, + "logps/chosen": -179.19204711914062, + "logps/rejected": -178.01219177246094, + "loss": 0.5825, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2573452591896057, + "rewards/margins": 0.3134271800518036, + "rewards/rejected": -0.5707724094390869, + "step": 397 + }, + { + "epoch": 0.42, + "learning_rate": 3.653694850884091e-05, + "logits/chosen": -2.2443690299987793, + "logits/rejected": -2.3436453342437744, + "logps/chosen": -141.96392822265625, + "logps/rejected": -165.8162384033203, + "loss": 0.6064, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1481434404850006, + "rewards/margins": 0.25018060207366943, + "rewards/rejected": -0.39832407236099243, + "step": 398 + }, + { + "epoch": 0.42, + "learning_rate": 3.645566304318526e-05, + "logits/chosen": -2.251343250274658, + "logits/rejected": -2.2624480724334717, + "logps/chosen": -199.05337524414062, + "logps/rejected": -199.2899627685547, + "loss": 0.6354, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.46376729011535645, + "rewards/margins": 0.15301668643951416, + "rewards/rejected": -0.6167839765548706, + "step": 399 + }, + { + "epoch": 0.42, + "learning_rate": 3.637422399390413e-05, + "logits/chosen": -2.309321165084839, + "logits/rejected": -2.249835252761841, + "logps/chosen": -187.30145263671875, + "logps/rejected": -175.0135498046875, + "loss": 0.8022, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6318432688713074, + "rewards/margins": -0.13741618394851685, + "rewards/rejected": -0.49442705512046814, + "step": 400 + }, + { + "epoch": 0.42, + "learning_rate": 3.6292632452833436e-05, + "logits/chosen": -2.149308681488037, + "logits/rejected": -2.1867353916168213, + "logps/chosen": -157.0370635986328, + "logps/rejected": -179.92384338378906, + "loss": 0.6469, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2832821309566498, + "rewards/margins": 0.17430852353572845, + "rewards/rejected": -0.45759066939353943, + "step": 401 + }, + { + "epoch": 0.42, + "learning_rate": 3.621088951385353e-05, + "logits/chosen": -2.430102825164795, + "logits/rejected": -2.4103968143463135, + "logps/chosen": -174.2300262451172, + "logps/rejected": -194.47872924804688, + "loss": 0.6782, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6608787178993225, + "rewards/margins": 0.14938460290431976, + "rewards/rejected": -0.8102633953094482, + "step": 402 + }, + { + "epoch": 0.42, + "learning_rate": 3.612899627287452e-05, + "logits/chosen": -2.381316661834717, + "logits/rejected": -2.4797677993774414, + "logps/chosen": -183.57020568847656, + "logps/rejected": -211.94908142089844, + "loss": 0.7676, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6477088332176208, + "rewards/margins": 0.04121372848749161, + "rewards/rejected": -0.6889225840568542, + "step": 403 + }, + { + "epoch": 0.42, + "learning_rate": 3.604695382782159e-05, + "logits/chosen": -2.2722840309143066, + "logits/rejected": -2.2684082984924316, + "logps/chosen": -157.826171875, + "logps/rejected": -155.97283935546875, + "loss": 0.7743, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.18509769439697266, + "rewards/margins": -0.07210510969161987, + "rewards/rejected": -0.11299259960651398, + "step": 404 + }, + { + "epoch": 0.42, + "learning_rate": 3.596476327862024e-05, + "logits/chosen": -2.128013849258423, + "logits/rejected": -2.2023983001708984, + "logps/chosen": -194.61569213867188, + "logps/rejected": -211.86927795410156, + "loss": 0.6399, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.36270129680633545, + "rewards/margins": 0.2554909288883209, + "rewards/rejected": -0.618192195892334, + "step": 405 + }, + { + "epoch": 0.42, + "learning_rate": 3.588242572718162e-05, + "logits/chosen": -2.377016305923462, + "logits/rejected": -2.2582497596740723, + "logps/chosen": -161.89572143554688, + "logps/rejected": -164.3067169189453, + "loss": 0.744, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.38204970955848694, + "rewards/margins": -0.01100611686706543, + "rewards/rejected": -0.3710435926914215, + "step": 406 + }, + { + "epoch": 0.42, + "learning_rate": 3.579994227738767e-05, + "logits/chosen": -2.208984851837158, + "logits/rejected": -2.288970708847046, + "logps/chosen": -191.62936401367188, + "logps/rejected": -223.2491455078125, + "loss": 0.6478, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.34289631247520447, + "rewards/margins": 0.17745056748390198, + "rewards/rejected": -0.5203468799591064, + "step": 407 + }, + { + "epoch": 0.43, + "learning_rate": 3.5717314035076355e-05, + "logits/chosen": -2.2903645038604736, + "logits/rejected": -2.224257469177246, + "logps/chosen": -173.94061279296875, + "logps/rejected": -183.39857482910156, + "loss": 0.9867, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.8537546396255493, + "rewards/margins": -0.33834022283554077, + "rewards/rejected": -0.5154143571853638, + "step": 408 + }, + { + "epoch": 0.43, + "learning_rate": 3.5634542108026876e-05, + "logits/chosen": -2.2586324214935303, + "logits/rejected": -2.333674192428589, + "logps/chosen": -128.76527404785156, + "logps/rejected": -138.5326690673828, + "loss": 0.6894, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3050040900707245, + "rewards/margins": 0.024737656116485596, + "rewards/rejected": -0.32974177598953247, + "step": 409 + }, + { + "epoch": 0.43, + "learning_rate": 3.5551627605944745e-05, + "logits/chosen": -2.2455837726593018, + "logits/rejected": -2.2226006984710693, + "logps/chosen": -165.0658416748047, + "logps/rejected": -158.5568084716797, + "loss": 0.7748, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.47917866706848145, + "rewards/margins": -0.09431587904691696, + "rewards/rejected": -0.3848627507686615, + "step": 410 + }, + { + "epoch": 0.43, + "learning_rate": 3.5468571640446994e-05, + "logits/chosen": -2.220954179763794, + "logits/rejected": -2.2065913677215576, + "logps/chosen": -155.57606506347656, + "logps/rejected": -196.17453002929688, + "loss": 0.681, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.43619242310523987, + "rewards/margins": 0.16942578554153442, + "rewards/rejected": -0.6056181788444519, + "step": 411 + }, + { + "epoch": 0.43, + "learning_rate": 3.5385375325047166e-05, + "logits/chosen": -2.28615665435791, + "logits/rejected": -2.3207404613494873, + "logps/chosen": -139.8089599609375, + "logps/rejected": -146.37413024902344, + "loss": 0.5752, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.2894131541252136, + "rewards/margins": 0.32500603795051575, + "rewards/rejected": -0.6144192218780518, + "step": 412 + }, + { + "epoch": 0.43, + "learning_rate": 3.5302039775140486e-05, + "logits/chosen": -2.223402500152588, + "logits/rejected": -2.2294511795043945, + "logps/chosen": -192.4325714111328, + "logps/rejected": -195.81764221191406, + "loss": 0.6225, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.422715961933136, + "rewards/margins": 0.23224471509456635, + "rewards/rejected": -0.654960572719574, + "step": 413 + }, + { + "epoch": 0.43, + "learning_rate": 3.521856610798887e-05, + "logits/chosen": -2.1355066299438477, + "logits/rejected": -2.2011489868164062, + "logps/chosen": -186.72837829589844, + "logps/rejected": -198.95672607421875, + "loss": 0.7121, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8084388971328735, + "rewards/margins": 0.06887459009885788, + "rewards/rejected": -0.8773134350776672, + "step": 414 + }, + { + "epoch": 0.43, + "learning_rate": 3.513495544270592e-05, + "logits/chosen": -2.2741241455078125, + "logits/rejected": -2.2826316356658936, + "logps/chosen": -167.8680877685547, + "logps/rejected": -167.94290161132812, + "loss": 0.7632, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5057553648948669, + "rewards/margins": -0.06321151554584503, + "rewards/rejected": -0.44254380464553833, + "step": 415 + }, + { + "epoch": 0.43, + "learning_rate": 3.505120890024195e-05, + "logits/chosen": -2.2100603580474854, + "logits/rejected": -2.1715095043182373, + "logps/chosen": -178.6398162841797, + "logps/rejected": -199.203857421875, + "loss": 0.7779, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6270021796226501, + "rewards/margins": -0.08634312450885773, + "rewards/rejected": -0.5406590700149536, + "step": 416 + }, + { + "epoch": 0.43, + "learning_rate": 3.496732760336895e-05, + "logits/chosen": -2.3388140201568604, + "logits/rejected": -2.3569979667663574, + "logps/chosen": -183.85336303710938, + "logps/rejected": -176.68597412109375, + "loss": 0.6665, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.38947370648384094, + "rewards/margins": 0.12428087741136551, + "rewards/rejected": -0.5137546062469482, + "step": 417 + }, + { + "epoch": 0.44, + "learning_rate": 3.4883312676665536e-05, + "logits/chosen": -2.0799946784973145, + "logits/rejected": -2.127676248550415, + "logps/chosen": -157.06455993652344, + "logps/rejected": -196.19081115722656, + "loss": 0.6533, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.38800424337387085, + "rewards/margins": 0.2187524437904358, + "rewards/rejected": -0.6067566275596619, + "step": 418 + }, + { + "epoch": 0.44, + "learning_rate": 3.479916524650188e-05, + "logits/chosen": -2.2445905208587646, + "logits/rejected": -2.2511062622070312, + "logps/chosen": -191.49032592773438, + "logps/rejected": -211.65402221679688, + "loss": 0.6537, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.4725024104118347, + "rewards/margins": 0.14553888142108917, + "rewards/rejected": -0.6180413365364075, + "step": 419 + }, + { + "epoch": 0.44, + "learning_rate": 3.4714886441024574e-05, + "logits/chosen": -2.3066306114196777, + "logits/rejected": -2.295111894607544, + "logps/chosen": -166.61192321777344, + "logps/rejected": -183.85430908203125, + "loss": 0.7243, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6095054149627686, + "rewards/margins": 0.045834727585315704, + "rewards/rejected": -0.6553401350975037, + "step": 420 + }, + { + "epoch": 0.44, + "learning_rate": 3.4630477390141556e-05, + "logits/chosen": -2.0795845985412598, + "logits/rejected": -2.0988335609436035, + "logps/chosen": -176.13385009765625, + "logps/rejected": -162.16116333007812, + "loss": 0.8325, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.6280168890953064, + "rewards/margins": -0.21699213981628418, + "rewards/rejected": -0.41102465987205505, + "step": 421 + }, + { + "epoch": 0.44, + "learning_rate": 3.4545939225506934e-05, + "logits/chosen": -2.2829484939575195, + "logits/rejected": -2.369950294494629, + "logps/chosen": -120.27825164794922, + "logps/rejected": -135.86590576171875, + "loss": 0.6277, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.3205239772796631, + "rewards/margins": 0.2128859907388687, + "rewards/rejected": -0.5334099531173706, + "step": 422 + }, + { + "epoch": 0.44, + "learning_rate": 3.4461273080505793e-05, + "logits/chosen": -2.227790117263794, + "logits/rejected": -2.3280563354492188, + "logps/chosen": -166.44442749023438, + "logps/rejected": -190.11199951171875, + "loss": 0.658, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6263283491134644, + "rewards/margins": 0.1760719120502472, + "rewards/rejected": -0.8024002909660339, + "step": 423 + }, + { + "epoch": 0.44, + "learning_rate": 3.437648009023905e-05, + "logits/chosen": -2.2951772212982178, + "logits/rejected": -2.2783102989196777, + "logps/chosen": -168.9396209716797, + "logps/rejected": -161.99826049804688, + "loss": 0.7575, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.4612702429294586, + "rewards/margins": -0.09408943355083466, + "rewards/rejected": -0.36718082427978516, + "step": 424 + }, + { + "epoch": 0.44, + "learning_rate": 3.4291561391508185e-05, + "logits/chosen": -2.233304023742676, + "logits/rejected": -2.1193814277648926, + "logps/chosen": -191.5747833251953, + "logps/rejected": -198.6702423095703, + "loss": 0.6742, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6324459910392761, + "rewards/margins": 0.13748225569725037, + "rewards/rejected": -0.7699282169342041, + "step": 425 + }, + { + "epoch": 0.44, + "learning_rate": 3.420651812280006e-05, + "logits/chosen": -2.0054640769958496, + "logits/rejected": -2.0682601928710938, + "logps/chosen": -179.95880126953125, + "logps/rejected": -178.59805297851562, + "loss": 0.7556, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5854605436325073, + "rewards/margins": -0.06146989390254021, + "rewards/rejected": -0.5239906311035156, + "step": 426 + }, + { + "epoch": 0.45, + "learning_rate": 3.4121351424271594e-05, + "logits/chosen": -2.220736026763916, + "logits/rejected": -2.229501247406006, + "logps/chosen": -161.79852294921875, + "logps/rejected": -150.96261596679688, + "loss": 0.634, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5760630369186401, + "rewards/margins": 0.19034941494464874, + "rewards/rejected": -0.7664124965667725, + "step": 427 + }, + { + "epoch": 0.45, + "learning_rate": 3.4036062437734484e-05, + "logits/chosen": -2.084941864013672, + "logits/rejected": -2.1283769607543945, + "logps/chosen": -138.01251220703125, + "logps/rejected": -141.75653076171875, + "loss": 0.6951, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5684069991111755, + "rewards/margins": 0.06860056519508362, + "rewards/rejected": -0.6370075345039368, + "step": 428 + }, + { + "epoch": 0.45, + "learning_rate": 3.395065230663996e-05, + "logits/chosen": -2.356782913208008, + "logits/rejected": -2.3323380947113037, + "logps/chosen": -164.42636108398438, + "logps/rejected": -157.51885986328125, + "loss": 0.8111, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.5730559825897217, + "rewards/margins": -0.158206045627594, + "rewards/rejected": -0.4148499667644501, + "step": 429 + }, + { + "epoch": 0.45, + "learning_rate": 3.386512217606339e-05, + "logits/chosen": -2.306445837020874, + "logits/rejected": -2.305457353591919, + "logps/chosen": -177.36483764648438, + "logps/rejected": -180.41497802734375, + "loss": 0.7929, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6710464358329773, + "rewards/margins": -0.10232071578502655, + "rewards/rejected": -0.5687257647514343, + "step": 430 + }, + { + "epoch": 0.45, + "learning_rate": 3.3779473192688954e-05, + "logits/chosen": -2.2322001457214355, + "logits/rejected": -2.2678284645080566, + "logps/chosen": -180.5767822265625, + "logps/rejected": -215.00439453125, + "loss": 0.6272, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7634757161140442, + "rewards/margins": 0.3175090253353119, + "rewards/rejected": -1.0809847116470337, + "step": 431 + }, + { + "epoch": 0.45, + "learning_rate": 3.369370650479425e-05, + "logits/chosen": -2.3506946563720703, + "logits/rejected": -2.272690534591675, + "logps/chosen": -191.31764221191406, + "logps/rejected": -167.90931701660156, + "loss": 0.6944, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6492268443107605, + "rewards/margins": 0.056166499853134155, + "rewards/rejected": -0.7053933143615723, + "step": 432 + }, + { + "epoch": 0.45, + "learning_rate": 3.360782326223493e-05, + "logits/chosen": -2.21726131439209, + "logits/rejected": -2.1750893592834473, + "logps/chosen": -130.21981811523438, + "logps/rejected": -122.38761138916016, + "loss": 0.7093, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6112926006317139, + "rewards/margins": 0.05839107558131218, + "rewards/rejected": -0.6696836948394775, + "step": 433 + }, + { + "epoch": 0.45, + "learning_rate": 3.3521824616429285e-05, + "logits/chosen": -2.276099681854248, + "logits/rejected": -2.3207895755767822, + "logps/chosen": -152.9349822998047, + "logps/rejected": -176.16763305664062, + "loss": 0.6335, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6175845861434937, + "rewards/margins": 0.19425898790359497, + "rewards/rejected": -0.8118435740470886, + "step": 434 + }, + { + "epoch": 0.45, + "learning_rate": 3.3435711720342764e-05, + "logits/chosen": -2.3540244102478027, + "logits/rejected": -2.4207704067230225, + "logps/chosen": -162.0496063232422, + "logps/rejected": -180.61257934570312, + "loss": 0.6201, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7302818298339844, + "rewards/margins": 0.24378572404384613, + "rewards/rejected": -0.9740675687789917, + "step": 435 + }, + { + "epoch": 0.45, + "learning_rate": 3.3349485728472535e-05, + "logits/chosen": -2.2981767654418945, + "logits/rejected": -2.403442144393921, + "logps/chosen": -169.84153747558594, + "logps/rejected": -196.2303466796875, + "loss": 0.6202, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.631123960018158, + "rewards/margins": 0.2665478587150574, + "rewards/rejected": -0.8976718187332153, + "step": 436 + }, + { + "epoch": 0.46, + "learning_rate": 3.326314779683207e-05, + "logits/chosen": -2.45729923248291, + "logits/rejected": -2.3062028884887695, + "logps/chosen": -180.6869354248047, + "logps/rejected": -158.7431640625, + "loss": 0.8641, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.7027081251144409, + "rewards/margins": -0.22353459894657135, + "rewards/rejected": -0.47917354106903076, + "step": 437 + }, + { + "epoch": 0.46, + "learning_rate": 3.3176699082935545e-05, + "logits/chosen": -2.310640811920166, + "logits/rejected": -2.3036937713623047, + "logps/chosen": -186.8025665283203, + "logps/rejected": -198.4528045654297, + "loss": 0.6925, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8213132619857788, + "rewards/margins": 0.07846779376268387, + "rewards/rejected": -0.8997809886932373, + "step": 438 + }, + { + "epoch": 0.46, + "learning_rate": 3.3090140745782396e-05, + "logits/chosen": -2.3146181106567383, + "logits/rejected": -2.318394660949707, + "logps/chosen": -207.11068725585938, + "logps/rejected": -205.5038604736328, + "loss": 0.6093, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6563000679016113, + "rewards/margins": 0.33885622024536133, + "rewards/rejected": -0.9951564073562622, + "step": 439 + }, + { + "epoch": 0.46, + "learning_rate": 3.300347394584172e-05, + "logits/chosen": -2.4188132286071777, + "logits/rejected": -2.4725587368011475, + "logps/chosen": -156.28175354003906, + "logps/rejected": -178.71530151367188, + "loss": 0.7442, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6938459277153015, + "rewards/margins": 0.030388107523322105, + "rewards/rejected": -0.7242341041564941, + "step": 440 + }, + { + "epoch": 0.46, + "learning_rate": 3.2916699845036816e-05, + "logits/chosen": -2.326338768005371, + "logits/rejected": -2.3983230590820312, + "logps/chosen": -116.61759948730469, + "logps/rejected": -127.69525909423828, + "loss": 0.6664, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6837319135665894, + "rewards/margins": 0.17259922623634338, + "rewards/rejected": -0.8563311696052551, + "step": 441 + }, + { + "epoch": 0.46, + "learning_rate": 3.282981960672948e-05, + "logits/chosen": -2.239466905593872, + "logits/rejected": -2.3111233711242676, + "logps/chosen": -159.78004455566406, + "logps/rejected": -177.1030731201172, + "loss": 0.6711, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7229784727096558, + "rewards/margins": 0.17383922636508942, + "rewards/rejected": -0.896817684173584, + "step": 442 + }, + { + "epoch": 0.46, + "learning_rate": 3.2742834395704486e-05, + "logits/chosen": -2.203927516937256, + "logits/rejected": -2.1760244369506836, + "logps/chosen": -133.5518341064453, + "logps/rejected": -157.52099609375, + "loss": 0.7027, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.71323561668396, + "rewards/margins": 0.07185419648885727, + "rewards/rejected": -0.7850897908210754, + "step": 443 + }, + { + "epoch": 0.46, + "learning_rate": 3.265574537815398e-05, + "logits/chosen": -2.1297600269317627, + "logits/rejected": -2.0856151580810547, + "logps/chosen": -149.8739776611328, + "logps/rejected": -155.45904541015625, + "loss": 0.7246, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.7274818420410156, + "rewards/margins": 0.02891545556485653, + "rewards/rejected": -0.7563972473144531, + "step": 444 + }, + { + "epoch": 0.46, + "learning_rate": 3.25685537216618e-05, + "logits/chosen": -2.446577787399292, + "logits/rejected": -2.4465479850769043, + "logps/chosen": -205.2270050048828, + "logps/rejected": -194.4155731201172, + "loss": 0.7937, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.9242798686027527, + "rewards/margins": -0.04263466224074364, + "rewards/rejected": -0.8816452622413635, + "step": 445 + }, + { + "epoch": 0.47, + "learning_rate": 3.248126059518785e-05, + "logits/chosen": -2.11894154548645, + "logits/rejected": -2.1701767444610596, + "logps/chosen": -146.13601684570312, + "logps/rejected": -163.9047088623047, + "loss": 0.6358, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5888944864273071, + "rewards/margins": 0.164439395070076, + "rewards/rejected": -0.7533338665962219, + "step": 446 + }, + { + "epoch": 0.47, + "learning_rate": 3.2393867169052385e-05, + "logits/chosen": -2.250922203063965, + "logits/rejected": -2.2642922401428223, + "logps/chosen": -222.06298828125, + "logps/rejected": -232.71881103515625, + "loss": 0.8317, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.9078823924064636, + "rewards/margins": -0.15829764306545258, + "rewards/rejected": -0.7495847344398499, + "step": 447 + }, + { + "epoch": 0.47, + "learning_rate": 3.230637461492043e-05, + "logits/chosen": -2.254838228225708, + "logits/rejected": -2.2903223037719727, + "logps/chosen": -182.53579711914062, + "logps/rejected": -193.14736938476562, + "loss": 0.6348, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.626493513584137, + "rewards/margins": 0.2684532105922699, + "rewards/rejected": -0.8949467539787292, + "step": 448 + }, + { + "epoch": 0.47, + "learning_rate": 3.221878410578593e-05, + "logits/chosen": -2.258246898651123, + "logits/rejected": -2.1956984996795654, + "logps/chosen": -210.88497924804688, + "logps/rejected": -204.88185119628906, + "loss": 0.8482, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5795395374298096, + "rewards/margins": -0.15963196754455566, + "rewards/rejected": -0.4199075698852539, + "step": 449 + }, + { + "epoch": 0.47, + "learning_rate": 3.213109681595612e-05, + "logits/chosen": -2.3528032302856445, + "logits/rejected": -2.296219825744629, + "logps/chosen": -207.69216918945312, + "logps/rejected": -225.57296752929688, + "loss": 0.7991, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.9065631031990051, + "rewards/margins": -0.03170624002814293, + "rewards/rejected": -0.8748568892478943, + "step": 450 + }, + { + "epoch": 0.47, + "learning_rate": 3.2043313921035743e-05, + "logits/chosen": -2.1952693462371826, + "logits/rejected": -2.2114603519439697, + "logps/chosen": -204.89718627929688, + "logps/rejected": -201.62734985351562, + "loss": 0.9834, + "rewards/accuracies": 0.3125, + "rewards/chosen": -1.2711020708084106, + "rewards/margins": -0.4141416847705841, + "rewards/rejected": -0.8569603562355042, + "step": 451 + }, + { + "epoch": 0.47, + "learning_rate": 3.195543659791132e-05, + "logits/chosen": -2.1842381954193115, + "logits/rejected": -2.2123162746429443, + "logps/chosen": -169.44754028320312, + "logps/rejected": -167.3434295654297, + "loss": 0.64, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6639611124992371, + "rewards/margins": 0.15555183589458466, + "rewards/rejected": -0.8195129632949829, + "step": 452 + }, + { + "epoch": 0.47, + "learning_rate": 3.186746602473533e-05, + "logits/chosen": -2.1493401527404785, + "logits/rejected": -2.1739141941070557, + "logps/chosen": -149.2794189453125, + "logps/rejected": -154.29624938964844, + "loss": 0.8507, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6466760039329529, + "rewards/margins": -0.20036213099956512, + "rewards/rejected": -0.44631391763687134, + "step": 453 + }, + { + "epoch": 0.47, + "learning_rate": 3.177940338091043e-05, + "logits/chosen": -2.2300286293029785, + "logits/rejected": -2.3122761249542236, + "logps/chosen": -191.32516479492188, + "logps/rejected": -194.84938049316406, + "loss": 0.7401, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7524707317352295, + "rewards/margins": 0.010163038969039917, + "rewards/rejected": -0.7626338005065918, + "step": 454 + }, + { + "epoch": 0.47, + "learning_rate": 3.169124984707367e-05, + "logits/chosen": -2.2368390560150146, + "logits/rejected": -2.29437255859375, + "logps/chosen": -163.08969116210938, + "logps/rejected": -171.7424774169922, + "loss": 0.8942, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.9385874271392822, + "rewards/margins": -0.3221665024757385, + "rewards/rejected": -0.6164208650588989, + "step": 455 + }, + { + "epoch": 0.48, + "learning_rate": 3.160300660508064e-05, + "logits/chosen": -2.2047293186187744, + "logits/rejected": -2.1667182445526123, + "logps/chosen": -156.24505615234375, + "logps/rejected": -150.25613403320312, + "loss": 0.7661, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5315223932266235, + "rewards/margins": -0.031079813838005066, + "rewards/rejected": -0.500442624092102, + "step": 456 + }, + { + "epoch": 0.48, + "learning_rate": 3.151467483798961e-05, + "logits/chosen": -2.2086293697357178, + "logits/rejected": -2.196566581726074, + "logps/chosen": -163.7198944091797, + "logps/rejected": -154.07669067382812, + "loss": 0.7642, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7637531757354736, + "rewards/margins": -0.03873248025774956, + "rewards/rejected": -0.7250206470489502, + "step": 457 + }, + { + "epoch": 0.48, + "learning_rate": 3.14262557300457e-05, + "logits/chosen": -2.1346004009246826, + "logits/rejected": -2.2624478340148926, + "logps/chosen": -157.80322265625, + "logps/rejected": -193.9817657470703, + "loss": 0.6025, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5126041173934937, + "rewards/margins": 0.3903144598007202, + "rewards/rejected": -0.9029185175895691, + "step": 458 + }, + { + "epoch": 0.48, + "learning_rate": 3.1337750466665e-05, + "logits/chosen": -2.120087146759033, + "logits/rejected": -2.164226770401001, + "logps/chosen": -189.44192504882812, + "logps/rejected": -220.5596466064453, + "loss": 0.7477, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.8327434659004211, + "rewards/margins": -0.02192605845630169, + "rewards/rejected": -0.8108173608779907, + "step": 459 + }, + { + "epoch": 0.48, + "learning_rate": 3.124916023441865e-05, + "logits/chosen": -2.2006072998046875, + "logits/rejected": -2.1659958362579346, + "logps/chosen": -182.32632446289062, + "logps/rejected": -194.20724487304688, + "loss": 0.8611, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.8323721885681152, + "rewards/margins": -0.26599258184432983, + "rewards/rejected": -0.5663796067237854, + "step": 460 + }, + { + "epoch": 0.48, + "learning_rate": 3.116048622101694e-05, + "logits/chosen": -2.143481969833374, + "logits/rejected": -2.1845016479492188, + "logps/chosen": -165.87046813964844, + "logps/rejected": -171.90936279296875, + "loss": 0.7098, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8616752028465271, + "rewards/margins": 0.11464538425207138, + "rewards/rejected": -0.9763206243515015, + "step": 461 + }, + { + "epoch": 0.48, + "learning_rate": 3.107172961529343e-05, + "logits/chosen": -2.1274116039276123, + "logits/rejected": -2.162541389465332, + "logps/chosen": -158.4412841796875, + "logps/rejected": -173.54653930664062, + "loss": 0.7462, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.7143791317939758, + "rewards/margins": 0.004445172846317291, + "rewards/rejected": -0.7188242673873901, + "step": 462 + }, + { + "epoch": 0.48, + "learning_rate": 3.098289160718895e-05, + "logits/chosen": -2.1465463638305664, + "logits/rejected": -2.1098814010620117, + "logps/chosen": -130.60450744628906, + "logps/rejected": -149.80252075195312, + "loss": 0.6369, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.4852756857872009, + "rewards/margins": 0.2071731984615326, + "rewards/rejected": -0.6924489140510559, + "step": 463 + }, + { + "epoch": 0.48, + "learning_rate": 3.0893973387735687e-05, + "logits/chosen": -2.323080539703369, + "logits/rejected": -2.2071361541748047, + "logps/chosen": -180.71392822265625, + "logps/rejected": -165.3758544921875, + "loss": 0.9189, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.8797387480735779, + "rewards/margins": -0.3374296724796295, + "rewards/rejected": -0.5423091053962708, + "step": 464 + }, + { + "epoch": 0.49, + "learning_rate": 3.0804976149041195e-05, + "logits/chosen": -2.3689966201782227, + "logits/rejected": -2.432495355606079, + "logps/chosen": -183.48805236816406, + "logps/rejected": -180.59786987304688, + "loss": 0.6557, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7070946097373962, + "rewards/margins": 0.12962420284748077, + "rewards/rejected": -0.836718738079071, + "step": 465 + }, + { + "epoch": 0.49, + "learning_rate": 3.071590108427244e-05, + "logits/chosen": -2.2095448970794678, + "logits/rejected": -2.22792387008667, + "logps/chosen": -194.24359130859375, + "logps/rejected": -181.46434020996094, + "loss": 0.5084, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.46445751190185547, + "rewards/margins": 0.5187379121780396, + "rewards/rejected": -0.9831954836845398, + "step": 466 + }, + { + "epoch": 0.49, + "learning_rate": 3.062674938763976e-05, + "logits/chosen": -2.224792718887329, + "logits/rejected": -2.276299476623535, + "logps/chosen": -151.77529907226562, + "logps/rejected": -171.2163543701172, + "loss": 0.589, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.49393290281295776, + "rewards/margins": 0.3881426751613617, + "rewards/rejected": -0.8820755481719971, + "step": 467 + }, + { + "epoch": 0.49, + "learning_rate": 3.0537522254380905e-05, + "logits/chosen": -2.327399730682373, + "logits/rejected": -2.2717721462249756, + "logps/chosen": -178.17420959472656, + "logps/rejected": -183.68106079101562, + "loss": 0.7317, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3572332262992859, + "rewards/margins": 0.004655532538890839, + "rewards/rejected": -0.3618887662887573, + "step": 468 + }, + { + "epoch": 0.49, + "learning_rate": 3.044822088074496e-05, + "logits/chosen": -2.150599479675293, + "logits/rejected": -2.1766562461853027, + "logps/chosen": -150.70323181152344, + "logps/rejected": -175.47964477539062, + "loss": 0.6856, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5796679258346558, + "rewards/margins": 0.14709994196891785, + "rewards/rejected": -0.726767897605896, + "step": 469 + }, + { + "epoch": 0.49, + "learning_rate": 3.0358846463976372e-05, + "logits/chosen": -2.2366018295288086, + "logits/rejected": -2.338874578475952, + "logps/chosen": -192.69740295410156, + "logps/rejected": -190.33204650878906, + "loss": 0.6836, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.48144611716270447, + "rewards/margins": 0.07051944732666016, + "rewards/rejected": -0.551965594291687, + "step": 470 + }, + { + "epoch": 0.49, + "learning_rate": 3.026940020229882e-05, + "logits/chosen": -2.133188247680664, + "logits/rejected": -2.177133798599243, + "logps/chosen": -150.59495544433594, + "logps/rejected": -149.1016845703125, + "loss": 0.818, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.7209250926971436, + "rewards/margins": -0.17671580612659454, + "rewards/rejected": -0.5442093014717102, + "step": 471 + }, + { + "epoch": 0.49, + "learning_rate": 3.017988329489923e-05, + "logits/chosen": -2.2492454051971436, + "logits/rejected": -2.2075250148773193, + "logps/chosen": -218.95291137695312, + "logps/rejected": -213.46139526367188, + "loss": 0.7839, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6977720856666565, + "rewards/margins": -0.08655133843421936, + "rewards/rejected": -0.6112207770347595, + "step": 472 + }, + { + "epoch": 0.49, + "learning_rate": 3.0090296941911633e-05, + "logits/chosen": -2.1852970123291016, + "logits/rejected": -2.1652181148529053, + "logps/chosen": -196.5089874267578, + "logps/rejected": -201.77569580078125, + "loss": 0.7244, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5932771563529968, + "rewards/margins": -0.0026064813137054443, + "rewards/rejected": -0.5906707644462585, + "step": 473 + }, + { + "epoch": 0.49, + "learning_rate": 3.0000642344401113e-05, + "logits/chosen": -2.115180015563965, + "logits/rejected": -2.0559911727905273, + "logps/chosen": -157.2303924560547, + "logps/rejected": -145.6020050048828, + "loss": 0.7223, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.48168429732322693, + "rewards/margins": 0.11891864240169525, + "rewards/rejected": -0.600602924823761, + "step": 474 + }, + { + "epoch": 0.5, + "learning_rate": 2.9910920704347696e-05, + "logits/chosen": -2.387964963912964, + "logits/rejected": -2.433955669403076, + "logps/chosen": -245.86285400390625, + "logps/rejected": -259.4566955566406, + "loss": 0.7292, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6999150514602661, + "rewards/margins": 0.08391554653644562, + "rewards/rejected": -0.7838307023048401, + "step": 475 + }, + { + "epoch": 0.5, + "learning_rate": 2.9821133224630226e-05, + "logits/chosen": -2.1827383041381836, + "logits/rejected": -2.2108314037323, + "logps/chosen": -172.43350219726562, + "logps/rejected": -167.54298400878906, + "loss": 0.6998, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.40625911951065063, + "rewards/margins": 0.13796135783195496, + "rewards/rejected": -0.5442204475402832, + "step": 476 + }, + { + "epoch": 0.5, + "learning_rate": 2.9731281109010256e-05, + "logits/chosen": -2.393608331680298, + "logits/rejected": -2.4628074169158936, + "logps/chosen": -155.9365234375, + "logps/rejected": -150.9811248779297, + "loss": 0.6863, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4619404375553131, + "rewards/margins": 0.08013444393873215, + "rewards/rejected": -0.5420749187469482, + "step": 477 + }, + { + "epoch": 0.5, + "learning_rate": 2.9641365562115887e-05, + "logits/chosen": -2.1305439472198486, + "logits/rejected": -2.158849000930786, + "logps/chosen": -157.4604034423828, + "logps/rejected": -158.77505493164062, + "loss": 0.6796, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5900415182113647, + "rewards/margins": 0.08278737962245941, + "rewards/rejected": -0.6728289127349854, + "step": 478 + }, + { + "epoch": 0.5, + "learning_rate": 2.9551387789425638e-05, + "logits/chosen": -2.111013412475586, + "logits/rejected": -2.1469898223876953, + "logps/chosen": -177.7059326171875, + "logps/rejected": -199.083251953125, + "loss": 0.6744, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6065003871917725, + "rewards/margins": 0.17838376760482788, + "rewards/rejected": -0.7848842144012451, + "step": 479 + }, + { + "epoch": 0.5, + "learning_rate": 2.9461348997252265e-05, + "logits/chosen": -2.2793450355529785, + "logits/rejected": -2.2518503665924072, + "logps/chosen": -167.08595275878906, + "logps/rejected": -162.74386596679688, + "loss": 0.6446, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5793865323066711, + "rewards/margins": 0.19307895004749298, + "rewards/rejected": -0.7724654674530029, + "step": 480 + }, + { + "epoch": 0.5, + "learning_rate": 2.9371250392726614e-05, + "logits/chosen": -2.156540632247925, + "logits/rejected": -2.1982791423797607, + "logps/chosen": -232.06939697265625, + "logps/rejected": -225.7415771484375, + "loss": 0.6585, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6940824389457703, + "rewards/margins": 0.11553283035755157, + "rewards/rejected": -0.8096152544021606, + "step": 481 + }, + { + "epoch": 0.5, + "learning_rate": 2.9281093183781403e-05, + "logits/chosen": -2.0882251262664795, + "logits/rejected": -2.2663707733154297, + "logps/chosen": -130.05990600585938, + "logps/rejected": -188.05630493164062, + "loss": 0.6944, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.4038165509700775, + "rewards/margins": 0.06018731743097305, + "rewards/rejected": -0.46400386095046997, + "step": 482 + }, + { + "epoch": 0.5, + "learning_rate": 2.919087857913508e-05, + "logits/chosen": -2.3520162105560303, + "logits/rejected": -2.321183443069458, + "logps/chosen": -182.3740997314453, + "logps/rejected": -178.23336791992188, + "loss": 0.6351, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5323800444602966, + "rewards/margins": 0.18767206370830536, + "rewards/rejected": -0.7200521230697632, + "step": 483 + }, + { + "epoch": 0.5, + "learning_rate": 2.9100607788275545e-05, + "logits/chosen": -2.1776552200317383, + "logits/rejected": -2.2282662391662598, + "logps/chosen": -163.6830596923828, + "logps/rejected": -172.34671020507812, + "loss": 0.7805, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.6728564500808716, + "rewards/margins": -0.11494327336549759, + "rewards/rejected": -0.5579131245613098, + "step": 484 + }, + { + "epoch": 0.51, + "learning_rate": 2.9010282021444008e-05, + "logits/chosen": -2.239274501800537, + "logits/rejected": -2.17651104927063, + "logps/chosen": -174.9864044189453, + "logps/rejected": -169.44493103027344, + "loss": 0.8076, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5585433840751648, + "rewards/margins": -0.09774555265903473, + "rewards/rejected": -0.4607977867126465, + "step": 485 + }, + { + "epoch": 0.51, + "learning_rate": 2.891990248961871e-05, + "logits/chosen": -2.1217386722564697, + "logits/rejected": -2.1039137840270996, + "logps/chosen": -159.67498779296875, + "logps/rejected": -174.40069580078125, + "loss": 0.6205, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.24506211280822754, + "rewards/margins": 0.40262073278427124, + "rewards/rejected": -0.647682785987854, + "step": 486 + }, + { + "epoch": 0.51, + "learning_rate": 2.8829470404498697e-05, + "logits/chosen": -2.1323282718658447, + "logits/rejected": -2.1301045417785645, + "logps/chosen": -129.35870361328125, + "logps/rejected": -170.45484924316406, + "loss": 0.6848, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.4312661588191986, + "rewards/margins": 0.12291737645864487, + "rewards/rejected": -0.5541835427284241, + "step": 487 + }, + { + "epoch": 0.51, + "learning_rate": 2.8738986978487625e-05, + "logits/chosen": -2.2189228534698486, + "logits/rejected": -2.1614956855773926, + "logps/chosen": -193.06204223632812, + "logps/rejected": -182.66293334960938, + "loss": 0.7176, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6811609864234924, + "rewards/margins": 0.08441457152366638, + "rewards/rejected": -0.7655755877494812, + "step": 488 + }, + { + "epoch": 0.51, + "learning_rate": 2.8648453424677434e-05, + "logits/chosen": -2.2789225578308105, + "logits/rejected": -2.3813822269439697, + "logps/chosen": -168.468017578125, + "logps/rejected": -183.34982299804688, + "loss": 0.655, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5851632952690125, + "rewards/margins": 0.17568376660346985, + "rewards/rejected": -0.7608469724655151, + "step": 489 + }, + { + "epoch": 0.51, + "learning_rate": 2.8557870956832132e-05, + "logits/chosen": -2.264902114868164, + "logits/rejected": -2.2514560222625732, + "logps/chosen": -174.7198486328125, + "logps/rejected": -179.86676025390625, + "loss": 0.6624, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6332632303237915, + "rewards/margins": 0.08560072630643845, + "rewards/rejected": -0.7188639640808105, + "step": 490 + }, + { + "epoch": 0.51, + "learning_rate": 2.846724078937149e-05, + "logits/chosen": -2.1317250728607178, + "logits/rejected": -2.1464059352874756, + "logps/chosen": -174.97686767578125, + "logps/rejected": -182.57919311523438, + "loss": 0.6618, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6465609073638916, + "rewards/margins": 0.17847394943237305, + "rewards/rejected": -0.8250348567962646, + "step": 491 + }, + { + "epoch": 0.51, + "learning_rate": 2.8376564137354795e-05, + "logits/chosen": -2.1236746311187744, + "logits/rejected": -2.148552894592285, + "logps/chosen": -156.05751037597656, + "logps/rejected": -151.4677734375, + "loss": 0.773, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5343315601348877, + "rewards/margins": -0.059497520327568054, + "rewards/rejected": -0.47483405470848083, + "step": 492 + }, + { + "epoch": 0.51, + "learning_rate": 2.8285842216464543e-05, + "logits/chosen": -2.2011935710906982, + "logits/rejected": -2.3106913566589355, + "logps/chosen": -183.11766052246094, + "logps/rejected": -201.7404022216797, + "loss": 0.5969, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5661877989768982, + "rewards/margins": 0.39767351746559143, + "rewards/rejected": -0.9638612866401672, + "step": 493 + }, + { + "epoch": 0.52, + "learning_rate": 2.8195076242990122e-05, + "logits/chosen": -2.245713472366333, + "logits/rejected": -2.243020534515381, + "logps/chosen": -159.26397705078125, + "logps/rejected": -174.874267578125, + "loss": 0.8165, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6563709378242493, + "rewards/margins": -0.1701582670211792, + "rewards/rejected": -0.48621270060539246, + "step": 494 + }, + { + "epoch": 0.52, + "learning_rate": 2.8104267433811533e-05, + "logits/chosen": -2.1842641830444336, + "logits/rejected": -2.1591455936431885, + "logps/chosen": -121.25286102294922, + "logps/rejected": -115.32366180419922, + "loss": 0.6663, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4449831545352936, + "rewards/margins": 0.13848333060741425, + "rewards/rejected": -0.5834664702415466, + "step": 495 + }, + { + "epoch": 0.52, + "learning_rate": 2.8013417006383076e-05, + "logits/chosen": -2.1810221672058105, + "logits/rejected": -2.2239696979522705, + "logps/chosen": -151.152099609375, + "logps/rejected": -176.65977478027344, + "loss": 0.601, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.4074276387691498, + "rewards/margins": 0.25585755705833435, + "rewards/rejected": -0.6632851958274841, + "step": 496 + }, + { + "epoch": 0.52, + "learning_rate": 2.7922526178717017e-05, + "logits/chosen": -2.1347427368164062, + "logits/rejected": -2.1655385494232178, + "logps/chosen": -159.8424072265625, + "logps/rejected": -178.92288208007812, + "loss": 0.6182, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.48965874314308167, + "rewards/margins": 0.2526865601539612, + "rewards/rejected": -0.7423452734947205, + "step": 497 + }, + { + "epoch": 0.52, + "learning_rate": 2.783159616936723e-05, + "logits/chosen": -2.141169309616089, + "logits/rejected": -2.141371726989746, + "logps/chosen": -158.35968017578125, + "logps/rejected": -176.37464904785156, + "loss": 0.642, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5580905079841614, + "rewards/margins": 0.24599358439445496, + "rewards/rejected": -0.8040841221809387, + "step": 498 + }, + { + "epoch": 0.52, + "learning_rate": 2.774062819741293e-05, + "logits/chosen": -2.250638246536255, + "logits/rejected": -2.1852548122406006, + "logps/chosen": -165.15774536132812, + "logps/rejected": -180.04124450683594, + "loss": 0.6831, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.4352782368659973, + "rewards/margins": 0.0841899961233139, + "rewards/rejected": -0.5194682478904724, + "step": 499 + }, + { + "epoch": 0.52, + "learning_rate": 2.764962348244228e-05, + "logits/chosen": -2.187967538833618, + "logits/rejected": -2.1378700733184814, + "logps/chosen": -179.86184692382812, + "logps/rejected": -174.86183166503906, + "loss": 0.7768, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8409562110900879, + "rewards/margins": -0.06990113109350204, + "rewards/rejected": -0.7710551619529724, + "step": 500 + } + ], + "logging_steps": 1, + "max_steps": 958, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}