diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,67533 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.7993779160186625, + "eval_steps": 500, + "global_step": 4500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0006220839813374805, + "grad_norm": 52.52096939086914, + "learning_rate": 1.0351966873706006e-08, + "logits/chosen": 0.012532182969152927, + "logits/rejected": 5.3814802169799805, + "logps/chosen": -373.7783203125, + "logps/rejected": -592.5435791015625, + "loss": 2.1252, + "rewards/accuracies": 0.25, + "rewards/chosen": 3.039964437484741, + "rewards/margins": -1.1029200553894043, + "rewards/rejected": 4.142884254455566, + "step": 1 + }, + { + "epoch": 0.001244167962674961, + "grad_norm": 48.9936408996582, + "learning_rate": 2.0703933747412012e-08, + "logits/chosen": 1.9091384410858154, + "logits/rejected": 5.514098644256592, + "logps/chosen": -337.4912414550781, + "logps/rejected": -521.2832641601562, + "loss": 5.0175, + "rewards/accuracies": 0.125, + "rewards/chosen": 5.3760600090026855, + "rewards/margins": -4.835138320922852, + "rewards/rejected": 10.211198806762695, + "step": 2 + }, + { + "epoch": 0.0018662519440124418, + "grad_norm": 45.977542877197266, + "learning_rate": 3.1055900621118015e-08, + "logits/chosen": 2.1885063648223877, + "logits/rejected": 4.337547302246094, + "logps/chosen": -459.13153076171875, + "logps/rejected": -506.27825927734375, + "loss": 2.7651, + "rewards/accuracies": 0.25, + "rewards/chosen": 5.090634346008301, + "rewards/margins": -1.6695140600204468, + "rewards/rejected": 6.760149002075195, + "step": 3 + }, + { + "epoch": 0.002488335925349922, + "grad_norm": 46.56956481933594, + "learning_rate": 4.1407867494824025e-08, + "logits/chosen": -0.5925924181938171, + "logits/rejected": 3.8525352478027344, + "logps/chosen": -306.25311279296875, + "logps/rejected": -507.11968994140625, + "loss": 4.915, + "rewards/accuracies": 0.125, + "rewards/chosen": 3.0717453956604004, + "rewards/margins": -4.032550811767578, + "rewards/rejected": 7.10429573059082, + "step": 4 + }, + { + "epoch": 0.003110419906687403, + "grad_norm": 28.329368591308594, + "learning_rate": 5.175983436853002e-08, + "logits/chosen": 4.241677284240723, + "logits/rejected": 4.568050384521484, + "logps/chosen": -507.6634826660156, + "logps/rejected": -522.225341796875, + "loss": 1.2785, + "rewards/accuracies": 0.5, + "rewards/chosen": 9.437142372131348, + "rewards/margins": 1.2293858528137207, + "rewards/rejected": 8.207756042480469, + "step": 5 + }, + { + "epoch": 0.0037325038880248835, + "grad_norm": 44.399723052978516, + "learning_rate": 6.211180124223603e-08, + "logits/chosen": 0.9154919385910034, + "logits/rejected": 2.570462703704834, + "logps/chosen": -378.0952453613281, + "logps/rejected": -446.9964599609375, + "loss": 2.6541, + "rewards/accuracies": 0.375, + "rewards/chosen": 4.882189750671387, + "rewards/margins": -1.601928949356079, + "rewards/rejected": 6.484118461608887, + "step": 6 + }, + { + "epoch": 0.004354587869362364, + "grad_norm": 56.556884765625, + "learning_rate": 7.246376811594204e-08, + "logits/chosen": -0.5533325672149658, + "logits/rejected": 4.104916095733643, + "logps/chosen": -329.94818115234375, + "logps/rejected": -621.577880859375, + "loss": 4.5922, + "rewards/accuracies": 0.25, + "rewards/chosen": 3.9307994842529297, + "rewards/margins": -3.158440113067627, + "rewards/rejected": 7.089239597320557, + "step": 7 + }, + { + "epoch": 0.004976671850699844, + "grad_norm": 44.0192985534668, + "learning_rate": 8.281573498964805e-08, + "logits/chosen": 4.58944034576416, + "logits/rejected": 4.53434944152832, + "logps/chosen": -509.0722961425781, + "logps/rejected": -480.3599853515625, + "loss": 3.6859, + "rewards/accuracies": 0.25, + "rewards/chosen": 4.1546311378479, + "rewards/margins": -3.303114891052246, + "rewards/rejected": 7.4577460289001465, + "step": 8 + }, + { + "epoch": 0.005598755832037325, + "grad_norm": 50.199031829833984, + "learning_rate": 9.316770186335405e-08, + "logits/chosen": -0.12182983756065369, + "logits/rejected": 5.243905067443848, + "logps/chosen": -297.5303039550781, + "logps/rejected": -551.4981079101562, + "loss": 5.1511, + "rewards/accuracies": 0.125, + "rewards/chosen": 3.3871593475341797, + "rewards/margins": -4.763964653015137, + "rewards/rejected": 8.151124000549316, + "step": 9 + }, + { + "epoch": 0.006220839813374806, + "grad_norm": 44.1707763671875, + "learning_rate": 1.0351966873706004e-07, + "logits/chosen": 1.1508839130401611, + "logits/rejected": 3.511411428451538, + "logps/chosen": -398.50701904296875, + "logps/rejected": -559.7413330078125, + "loss": 3.7766, + "rewards/accuracies": 0.375, + "rewards/chosen": 5.0946455001831055, + "rewards/margins": -2.76721453666687, + "rewards/rejected": 7.861859321594238, + "step": 10 + }, + { + "epoch": 0.006842923794712286, + "grad_norm": 56.588111877441406, + "learning_rate": 1.1387163561076605e-07, + "logits/chosen": 2.96075177192688, + "logits/rejected": 5.463801383972168, + "logps/chosen": -471.24652099609375, + "logps/rejected": -518.7703857421875, + "loss": 2.7771, + "rewards/accuracies": 0.25, + "rewards/chosen": 3.3338894844055176, + "rewards/margins": -1.9073762893676758, + "rewards/rejected": 5.241266250610352, + "step": 11 + }, + { + "epoch": 0.007465007776049767, + "grad_norm": 44.33781814575195, + "learning_rate": 1.2422360248447206e-07, + "logits/chosen": 2.9660511016845703, + "logits/rejected": 5.102863788604736, + "logps/chosen": -435.560546875, + "logps/rejected": -541.2033081054688, + "loss": 3.1998, + "rewards/accuracies": 0.25, + "rewards/chosen": 7.141000270843506, + "rewards/margins": -1.1307448148727417, + "rewards/rejected": 8.271744728088379, + "step": 12 + }, + { + "epoch": 0.008087091757387248, + "grad_norm": 40.28484344482422, + "learning_rate": 1.3457556935817807e-07, + "logits/chosen": 1.7922760248184204, + "logits/rejected": 6.533666610717773, + "logps/chosen": -422.5932312011719, + "logps/rejected": -602.603515625, + "loss": 3.6227, + "rewards/accuracies": 0.375, + "rewards/chosen": 5.170872688293457, + "rewards/margins": -2.765868663787842, + "rewards/rejected": 7.936741828918457, + "step": 13 + }, + { + "epoch": 0.008709175738724729, + "grad_norm": 49.1853141784668, + "learning_rate": 1.4492753623188408e-07, + "logits/chosen": -1.2458416223526, + "logits/rejected": 3.827310562133789, + "logps/chosen": -276.527587890625, + "logps/rejected": -613.375732421875, + "loss": 4.5431, + "rewards/accuracies": 0.25, + "rewards/chosen": 2.5047080516815186, + "rewards/margins": -3.598675012588501, + "rewards/rejected": 6.1033830642700195, + "step": 14 + }, + { + "epoch": 0.00933125972006221, + "grad_norm": 40.00694274902344, + "learning_rate": 1.5527950310559006e-07, + "logits/chosen": 0.6321654915809631, + "logits/rejected": 4.0691609382629395, + "logps/chosen": -410.6964111328125, + "logps/rejected": -517.6644287109375, + "loss": 4.6035, + "rewards/accuracies": 0.375, + "rewards/chosen": 5.570537567138672, + "rewards/margins": -2.5490994453430176, + "rewards/rejected": 8.119636535644531, + "step": 15 + }, + { + "epoch": 0.009953343701399688, + "grad_norm": 38.782012939453125, + "learning_rate": 1.656314699792961e-07, + "logits/chosen": 0.7363018989562988, + "logits/rejected": 4.3214006423950195, + "logps/chosen": -343.54168701171875, + "logps/rejected": -488.6893310546875, + "loss": 3.467, + "rewards/accuracies": 0.25, + "rewards/chosen": 7.064784049987793, + "rewards/margins": -1.6931116580963135, + "rewards/rejected": 8.757895469665527, + "step": 16 + }, + { + "epoch": 0.010575427682737169, + "grad_norm": 32.41596603393555, + "learning_rate": 1.7598343685300208e-07, + "logits/chosen": -0.19795677065849304, + "logits/rejected": 1.080691933631897, + "logps/chosen": -396.13726806640625, + "logps/rejected": -448.459228515625, + "loss": 3.3021, + "rewards/accuracies": 0.375, + "rewards/chosen": 4.552862167358398, + "rewards/margins": -1.6756930351257324, + "rewards/rejected": 6.228555202484131, + "step": 17 + }, + { + "epoch": 0.01119751166407465, + "grad_norm": 46.2690544128418, + "learning_rate": 1.863354037267081e-07, + "logits/chosen": 2.7062017917633057, + "logits/rejected": 3.694239377975464, + "logps/chosen": -446.1850280761719, + "logps/rejected": -549.3088989257812, + "loss": 4.0624, + "rewards/accuracies": 0.25, + "rewards/chosen": 6.74229621887207, + "rewards/margins": -3.0481457710266113, + "rewards/rejected": 9.79044246673584, + "step": 18 + }, + { + "epoch": 0.01181959564541213, + "grad_norm": 43.01369094848633, + "learning_rate": 1.966873706004141e-07, + "logits/chosen": 0.34469613432884216, + "logits/rejected": 4.731164932250977, + "logps/chosen": -413.55377197265625, + "logps/rejected": -558.8274536132812, + "loss": 1.8663, + "rewards/accuracies": 0.25, + "rewards/chosen": 3.901057243347168, + "rewards/margins": -1.076063632965088, + "rewards/rejected": 4.977120876312256, + "step": 19 + }, + { + "epoch": 0.012441679626749611, + "grad_norm": 43.26823043823242, + "learning_rate": 2.0703933747412008e-07, + "logits/chosen": 0.7995167970657349, + "logits/rejected": 4.243210792541504, + "logps/chosen": -403.41650390625, + "logps/rejected": -514.751953125, + "loss": 3.9075, + "rewards/accuracies": 0.375, + "rewards/chosen": 4.657955646514893, + "rewards/margins": -2.819042682647705, + "rewards/rejected": 7.476998329162598, + "step": 20 + }, + { + "epoch": 0.013063763608087092, + "grad_norm": 40.63854217529297, + "learning_rate": 2.173913043478261e-07, + "logits/chosen": 0.8253070712089539, + "logits/rejected": 4.052617073059082, + "logps/chosen": -392.7893371582031, + "logps/rejected": -536.6062622070312, + "loss": 4.2509, + "rewards/accuracies": 0.375, + "rewards/chosen": 5.524932861328125, + "rewards/margins": -2.6818959712982178, + "rewards/rejected": 8.206829071044922, + "step": 21 + }, + { + "epoch": 0.013685847589424573, + "grad_norm": 46.58070755004883, + "learning_rate": 2.277432712215321e-07, + "logits/chosen": -0.05951416492462158, + "logits/rejected": 3.8601527214050293, + "logps/chosen": -323.16619873046875, + "logps/rejected": -515.654052734375, + "loss": 4.1162, + "rewards/accuracies": 0.125, + "rewards/chosen": 3.4312305450439453, + "rewards/margins": -3.677948236465454, + "rewards/rejected": 7.10917854309082, + "step": 22 + }, + { + "epoch": 0.014307931570762053, + "grad_norm": 46.004512786865234, + "learning_rate": 2.3809523809523811e-07, + "logits/chosen": 1.3055967092514038, + "logits/rejected": 4.365591526031494, + "logps/chosen": -403.9496765136719, + "logps/rejected": -589.9659423828125, + "loss": 5.3046, + "rewards/accuracies": 0.125, + "rewards/chosen": 6.964463710784912, + "rewards/margins": -5.141401290893555, + "rewards/rejected": 12.105865478515625, + "step": 23 + }, + { + "epoch": 0.014930015552099534, + "grad_norm": 55.458885192871094, + "learning_rate": 2.484472049689441e-07, + "logits/chosen": 1.6353414058685303, + "logits/rejected": 5.698489665985107, + "logps/chosen": -337.55078125, + "logps/rejected": -551.3826904296875, + "loss": 5.5208, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.996336936950684, + "rewards/margins": -5.433783531188965, + "rewards/rejected": 11.430120468139648, + "step": 24 + }, + { + "epoch": 0.015552099533437015, + "grad_norm": 38.34725570678711, + "learning_rate": 2.5879917184265016e-07, + "logits/chosen": -0.7651683688163757, + "logits/rejected": 2.5544052124023438, + "logps/chosen": -235.06356811523438, + "logps/rejected": -407.9915771484375, + "loss": 3.0314, + "rewards/accuracies": 0.125, + "rewards/chosen": 4.645867347717285, + "rewards/margins": -2.6488218307495117, + "rewards/rejected": 7.294688701629639, + "step": 25 + }, + { + "epoch": 0.016174183514774496, + "grad_norm": 26.11163330078125, + "learning_rate": 2.6915113871635614e-07, + "logits/chosen": 0.4576059579849243, + "logits/rejected": 3.3899879455566406, + "logps/chosen": -265.80413818359375, + "logps/rejected": -426.92108154296875, + "loss": 3.0763, + "rewards/accuracies": 0.75, + "rewards/chosen": 4.355762481689453, + "rewards/margins": -1.404404640197754, + "rewards/rejected": 5.760167121887207, + "step": 26 + }, + { + "epoch": 0.016796267496111975, + "grad_norm": 50.16153335571289, + "learning_rate": 2.795031055900621e-07, + "logits/chosen": 0.9520180225372314, + "logits/rejected": 1.3030765056610107, + "logps/chosen": -424.3631591796875, + "logps/rejected": -459.70550537109375, + "loss": 3.3118, + "rewards/accuracies": 0.125, + "rewards/chosen": 3.551614999771118, + "rewards/margins": -2.787909746170044, + "rewards/rejected": 6.339524745941162, + "step": 27 + }, + { + "epoch": 0.017418351477449457, + "grad_norm": 47.12639236450195, + "learning_rate": 2.8985507246376816e-07, + "logits/chosen": -0.052807122468948364, + "logits/rejected": 4.841619968414307, + "logps/chosen": -374.9623718261719, + "logps/rejected": -599.8634643554688, + "loss": 5.1175, + "rewards/accuracies": 0.375, + "rewards/chosen": 4.661255836486816, + "rewards/margins": -4.2213335037231445, + "rewards/rejected": 8.882589340209961, + "step": 28 + }, + { + "epoch": 0.018040435458786936, + "grad_norm": 55.17001724243164, + "learning_rate": 3.0020703933747414e-07, + "logits/chosen": 2.173837661743164, + "logits/rejected": 4.256175518035889, + "logps/chosen": -514.0777587890625, + "logps/rejected": -582.631591796875, + "loss": 4.6282, + "rewards/accuracies": 0.25, + "rewards/chosen": 7.686736106872559, + "rewards/margins": -2.3255183696746826, + "rewards/rejected": 10.01225471496582, + "step": 29 + }, + { + "epoch": 0.01866251944012442, + "grad_norm": 45.248477935791016, + "learning_rate": 3.1055900621118013e-07, + "logits/chosen": 3.1304564476013184, + "logits/rejected": 5.465931415557861, + "logps/chosen": -467.85833740234375, + "logps/rejected": -526.0656127929688, + "loss": 4.2446, + "rewards/accuracies": 0.25, + "rewards/chosen": 7.557483673095703, + "rewards/margins": -3.491199016571045, + "rewards/rejected": 11.04868221282959, + "step": 30 + }, + { + "epoch": 0.019284603421461897, + "grad_norm": 22.855022430419922, + "learning_rate": 3.2091097308488616e-07, + "logits/chosen": 3.430361270904541, + "logits/rejected": 5.7235107421875, + "logps/chosen": -460.7084655761719, + "logps/rejected": -496.1153564453125, + "loss": 1.6007, + "rewards/accuracies": 0.75, + "rewards/chosen": 7.377945899963379, + "rewards/margins": 1.0376758575439453, + "rewards/rejected": 6.340270519256592, + "step": 31 + }, + { + "epoch": 0.019906687402799376, + "grad_norm": 53.90958023071289, + "learning_rate": 3.312629399585922e-07, + "logits/chosen": 0.0030750036239624023, + "logits/rejected": 4.562582015991211, + "logps/chosen": -328.0555419921875, + "logps/rejected": -523.3200073242188, + "loss": 4.207, + "rewards/accuracies": 0.125, + "rewards/chosen": 3.139477252960205, + "rewards/margins": -3.903869152069092, + "rewards/rejected": 7.043346881866455, + "step": 32 + }, + { + "epoch": 0.02052877138413686, + "grad_norm": 46.680335998535156, + "learning_rate": 3.416149068322982e-07, + "logits/chosen": 0.6541320085525513, + "logits/rejected": 4.543554306030273, + "logps/chosen": -410.05511474609375, + "logps/rejected": -555.1267700195312, + "loss": 2.788, + "rewards/accuracies": 0.25, + "rewards/chosen": 4.895612716674805, + "rewards/margins": -2.0638134479522705, + "rewards/rejected": 6.959425926208496, + "step": 33 + }, + { + "epoch": 0.021150855365474338, + "grad_norm": 45.052101135253906, + "learning_rate": 3.5196687370600417e-07, + "logits/chosen": 2.8638980388641357, + "logits/rejected": 4.418205738067627, + "logps/chosen": -467.8529357910156, + "logps/rejected": -529.4423828125, + "loss": 1.9395, + "rewards/accuracies": 0.25, + "rewards/chosen": 8.37755298614502, + "rewards/margins": -0.4131333827972412, + "rewards/rejected": 8.79068660736084, + "step": 34 + }, + { + "epoch": 0.02177293934681182, + "grad_norm": 46.419036865234375, + "learning_rate": 3.623188405797102e-07, + "logits/chosen": 1.6391205787658691, + "logits/rejected": 3.039005756378174, + "logps/chosen": -437.704833984375, + "logps/rejected": -481.65179443359375, + "loss": 3.9945, + "rewards/accuracies": 0.25, + "rewards/chosen": 8.587377548217773, + "rewards/margins": -2.558168649673462, + "rewards/rejected": 11.145545959472656, + "step": 35 + }, + { + "epoch": 0.0223950233281493, + "grad_norm": 49.24765396118164, + "learning_rate": 3.726708074534162e-07, + "logits/chosen": 0.6626954078674316, + "logits/rejected": 5.522377967834473, + "logps/chosen": -249.82749938964844, + "logps/rejected": -480.0360412597656, + "loss": 5.1576, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.22590708732605, + "rewards/margins": -5.005549430847168, + "rewards/rejected": 8.231456756591797, + "step": 36 + }, + { + "epoch": 0.023017107309486782, + "grad_norm": 40.403690338134766, + "learning_rate": 3.8302277432712217e-07, + "logits/chosen": 3.179089069366455, + "logits/rejected": 5.580604076385498, + "logps/chosen": -484.9145812988281, + "logps/rejected": -545.426513671875, + "loss": 2.1843, + "rewards/accuracies": 0.375, + "rewards/chosen": 8.788908004760742, + "rewards/margins": 1.674599528312683, + "rewards/rejected": 7.114309310913086, + "step": 37 + }, + { + "epoch": 0.02363919129082426, + "grad_norm": 45.154029846191406, + "learning_rate": 3.933747412008282e-07, + "logits/chosen": 1.6743485927581787, + "logits/rejected": 4.098839282989502, + "logps/chosen": -391.55462646484375, + "logps/rejected": -487.47802734375, + "loss": 3.8492, + "rewards/accuracies": 0.375, + "rewards/chosen": 4.612683296203613, + "rewards/margins": -3.17740535736084, + "rewards/rejected": 7.7900896072387695, + "step": 38 + }, + { + "epoch": 0.024261275272161743, + "grad_norm": 34.88768768310547, + "learning_rate": 4.037267080745342e-07, + "logits/chosen": 2.6441800594329834, + "logits/rejected": 3.981311321258545, + "logps/chosen": -326.20489501953125, + "logps/rejected": -379.9033203125, + "loss": 1.7066, + "rewards/accuracies": 0.375, + "rewards/chosen": 2.9094905853271484, + "rewards/margins": -0.5736005902290344, + "rewards/rejected": 3.483090877532959, + "step": 39 + }, + { + "epoch": 0.024883359253499222, + "grad_norm": 53.884212493896484, + "learning_rate": 4.1407867494824017e-07, + "logits/chosen": 1.5335716009140015, + "logits/rejected": 4.578222751617432, + "logps/chosen": -436.3438720703125, + "logps/rejected": -596.6395263671875, + "loss": 7.6457, + "rewards/accuracies": 0.25, + "rewards/chosen": 4.73098087310791, + "rewards/margins": -7.282605171203613, + "rewards/rejected": 12.013585090637207, + "step": 40 + }, + { + "epoch": 0.0255054432348367, + "grad_norm": 47.64874267578125, + "learning_rate": 4.244306418219462e-07, + "logits/chosen": -0.8001726865768433, + "logits/rejected": 4.6293535232543945, + "logps/chosen": -284.55389404296875, + "logps/rejected": -473.02423095703125, + "loss": 3.6999, + "rewards/accuracies": 0.25, + "rewards/chosen": 3.107661247253418, + "rewards/margins": -3.117319107055664, + "rewards/rejected": 6.224980354309082, + "step": 41 + }, + { + "epoch": 0.026127527216174184, + "grad_norm": 39.684940338134766, + "learning_rate": 4.347826086956522e-07, + "logits/chosen": 2.798779010772705, + "logits/rejected": 3.0875256061553955, + "logps/chosen": -509.02490234375, + "logps/rejected": -545.913330078125, + "loss": 1.6709, + "rewards/accuracies": 0.5, + "rewards/chosen": 10.082112312316895, + "rewards/margins": 2.398789405822754, + "rewards/rejected": 7.683322906494141, + "step": 42 + }, + { + "epoch": 0.026749611197511663, + "grad_norm": 32.33852767944336, + "learning_rate": 4.451345755693582e-07, + "logits/chosen": 1.85959792137146, + "logits/rejected": 1.575959324836731, + "logps/chosen": -412.39202880859375, + "logps/rejected": -416.99822998046875, + "loss": 2.4367, + "rewards/accuracies": 0.375, + "rewards/chosen": 7.020496368408203, + "rewards/margins": -1.3365765810012817, + "rewards/rejected": 8.357072830200195, + "step": 43 + }, + { + "epoch": 0.027371695178849145, + "grad_norm": 51.07847595214844, + "learning_rate": 4.554865424430642e-07, + "logits/chosen": -0.3712843060493469, + "logits/rejected": 5.780962944030762, + "logps/chosen": -346.7430725097656, + "logps/rejected": -668.7047729492188, + "loss": 6.9333, + "rewards/accuracies": 0.125, + "rewards/chosen": 1.6845252513885498, + "rewards/margins": -6.724910736083984, + "rewards/rejected": 8.409436225891113, + "step": 44 + }, + { + "epoch": 0.027993779160186624, + "grad_norm": 49.54875183105469, + "learning_rate": 4.658385093167702e-07, + "logits/chosen": 0.5838897228240967, + "logits/rejected": 4.9215898513793945, + "logps/chosen": -403.30804443359375, + "logps/rejected": -589.4797973632812, + "loss": 3.4521, + "rewards/accuracies": 0.0, + "rewards/chosen": 9.050199508666992, + "rewards/margins": -3.2908575534820557, + "rewards/rejected": 12.341056823730469, + "step": 45 + }, + { + "epoch": 0.028615863141524107, + "grad_norm": 34.638187408447266, + "learning_rate": 4.7619047619047623e-07, + "logits/chosen": -0.016638919711112976, + "logits/rejected": 3.867025375366211, + "logps/chosen": -291.69720458984375, + "logps/rejected": -462.72760009765625, + "loss": 3.2514, + "rewards/accuracies": 0.375, + "rewards/chosen": 3.6876978874206543, + "rewards/margins": -1.1836978197097778, + "rewards/rejected": 4.871395587921143, + "step": 46 + }, + { + "epoch": 0.029237947122861586, + "grad_norm": 43.84613800048828, + "learning_rate": 4.865424430641822e-07, + "logits/chosen": 1.2583420276641846, + "logits/rejected": 5.132736682891846, + "logps/chosen": -379.2347412109375, + "logps/rejected": -541.4534912109375, + "loss": 2.7044, + "rewards/accuracies": 0.375, + "rewards/chosen": 4.057747840881348, + "rewards/margins": -2.267834186553955, + "rewards/rejected": 6.325582027435303, + "step": 47 + }, + { + "epoch": 0.029860031104199068, + "grad_norm": 47.614315032958984, + "learning_rate": 4.968944099378882e-07, + "logits/chosen": -1.1907007694244385, + "logits/rejected": 3.4339210987091064, + "logps/chosen": -319.9333190917969, + "logps/rejected": -497.4118957519531, + "loss": 3.9817, + "rewards/accuracies": 0.25, + "rewards/chosen": 5.15675687789917, + "rewards/margins": -2.3444647789001465, + "rewards/rejected": 7.501220703125, + "step": 48 + }, + { + "epoch": 0.030482115085536547, + "grad_norm": 39.3867301940918, + "learning_rate": 5.072463768115942e-07, + "logits/chosen": -3.0998148918151855, + "logits/rejected": 4.021823406219482, + "logps/chosen": -238.55032348632812, + "logps/rejected": -542.6666870117188, + "loss": 3.4389, + "rewards/accuracies": 0.375, + "rewards/chosen": 3.568326234817505, + "rewards/margins": -2.0750913619995117, + "rewards/rejected": 5.6434173583984375, + "step": 49 + }, + { + "epoch": 0.03110419906687403, + "grad_norm": 44.98670196533203, + "learning_rate": 5.175983436853003e-07, + "logits/chosen": -1.0378568172454834, + "logits/rejected": 6.316718101501465, + "logps/chosen": -254.8461151123047, + "logps/rejected": -560.372802734375, + "loss": 5.8883, + "rewards/accuracies": 0.25, + "rewards/chosen": 3.8307912349700928, + "rewards/margins": -5.595851898193359, + "rewards/rejected": 9.426643371582031, + "step": 50 + }, + { + "epoch": 0.031726283048211505, + "grad_norm": 57.26844787597656, + "learning_rate": 5.279503105590063e-07, + "logits/chosen": 1.2163293361663818, + "logits/rejected": 4.85459566116333, + "logps/chosen": -395.74951171875, + "logps/rejected": -541.177734375, + "loss": 7.3597, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.2945306301116943, + "rewards/margins": -7.298341751098633, + "rewards/rejected": 10.59287166595459, + "step": 51 + }, + { + "epoch": 0.03234836702954899, + "grad_norm": 64.28971862792969, + "learning_rate": 5.383022774327123e-07, + "logits/chosen": 3.2448387145996094, + "logits/rejected": 5.297354221343994, + "logps/chosen": -508.2118835449219, + "logps/rejected": -576.161376953125, + "loss": 4.2396, + "rewards/accuracies": 0.25, + "rewards/chosen": 8.044355392456055, + "rewards/margins": -2.649853467941284, + "rewards/rejected": 10.694210052490234, + "step": 52 + }, + { + "epoch": 0.03297045101088647, + "grad_norm": 45.40407180786133, + "learning_rate": 5.486542443064183e-07, + "logits/chosen": 4.2648606300354, + "logits/rejected": 6.613371849060059, + "logps/chosen": -471.30206298828125, + "logps/rejected": -523.7880859375, + "loss": 2.9062, + "rewards/accuracies": 0.25, + "rewards/chosen": 4.817241191864014, + "rewards/margins": -1.9571592807769775, + "rewards/rejected": 6.77440071105957, + "step": 53 + }, + { + "epoch": 0.03359253499222395, + "grad_norm": 44.306114196777344, + "learning_rate": 5.590062111801243e-07, + "logits/chosen": -2.1772749423980713, + "logits/rejected": 4.385883808135986, + "logps/chosen": -280.7019958496094, + "logps/rejected": -626.658935546875, + "loss": 5.2982, + "rewards/accuracies": 0.125, + "rewards/chosen": 4.835158348083496, + "rewards/margins": -4.899688720703125, + "rewards/rejected": 9.734847068786621, + "step": 54 + }, + { + "epoch": 0.03421461897356143, + "grad_norm": 52.895057678222656, + "learning_rate": 5.693581780538302e-07, + "logits/chosen": 3.337453603744507, + "logits/rejected": 4.959819793701172, + "logps/chosen": -557.9818115234375, + "logps/rejected": -590.6300659179688, + "loss": 4.2118, + "rewards/accuracies": 0.25, + "rewards/chosen": 8.753548622131348, + "rewards/margins": -3.3140532970428467, + "rewards/rejected": 12.067602157592773, + "step": 55 + }, + { + "epoch": 0.034836702954898914, + "grad_norm": 55.78615951538086, + "learning_rate": 5.797101449275363e-07, + "logits/chosen": 1.4191863536834717, + "logits/rejected": 3.9965124130249023, + "logps/chosen": -516.8236083984375, + "logps/rejected": -617.3018798828125, + "loss": 4.5912, + "rewards/accuracies": 0.25, + "rewards/chosen": 6.091235160827637, + "rewards/margins": -4.235839366912842, + "rewards/rejected": 10.32707405090332, + "step": 56 + }, + { + "epoch": 0.03545878693623639, + "grad_norm": 43.840721130371094, + "learning_rate": 5.900621118012423e-07, + "logits/chosen": 0.7551851272583008, + "logits/rejected": 3.4136557579040527, + "logps/chosen": -383.36932373046875, + "logps/rejected": -520.3485107421875, + "loss": 2.3896, + "rewards/accuracies": 0.25, + "rewards/chosen": 6.259200096130371, + "rewards/margins": -1.5887733697891235, + "rewards/rejected": 7.847973346710205, + "step": 57 + }, + { + "epoch": 0.03608087091757387, + "grad_norm": 53.005619049072266, + "learning_rate": 6.004140786749483e-07, + "logits/chosen": 0.8960199356079102, + "logits/rejected": 5.212392807006836, + "logps/chosen": -324.52838134765625, + "logps/rejected": -570.8604736328125, + "loss": 5.8641, + "rewards/accuracies": 0.125, + "rewards/chosen": 4.302431583404541, + "rewards/margins": -5.693329811096191, + "rewards/rejected": 9.99576187133789, + "step": 58 + }, + { + "epoch": 0.03670295489891135, + "grad_norm": 39.89027404785156, + "learning_rate": 6.107660455486543e-07, + "logits/chosen": 0.9366174340248108, + "logits/rejected": 3.5945024490356445, + "logps/chosen": -388.9140625, + "logps/rejected": -512.8837890625, + "loss": 4.197, + "rewards/accuracies": 0.25, + "rewards/chosen": 6.309109210968018, + "rewards/margins": -3.4562954902648926, + "rewards/rejected": 9.76540470123291, + "step": 59 + }, + { + "epoch": 0.03732503888024884, + "grad_norm": 50.484832763671875, + "learning_rate": 6.211180124223603e-07, + "logits/chosen": 1.6095322370529175, + "logits/rejected": 5.206839561462402, + "logps/chosen": -287.35223388671875, + "logps/rejected": -461.3832092285156, + "loss": 4.2601, + "rewards/accuracies": 0.125, + "rewards/chosen": 3.1970438957214355, + "rewards/margins": -3.5525975227355957, + "rewards/rejected": 6.749641418457031, + "step": 60 + }, + { + "epoch": 0.037947122861586316, + "grad_norm": 49.992767333984375, + "learning_rate": 6.314699792960663e-07, + "logits/chosen": -1.0890344381332397, + "logits/rejected": 3.729297161102295, + "logps/chosen": -355.4293212890625, + "logps/rejected": -582.5045166015625, + "loss": 4.0015, + "rewards/accuracies": 0.25, + "rewards/chosen": 5.588550567626953, + "rewards/margins": -2.822916030883789, + "rewards/rejected": 8.411466598510742, + "step": 61 + }, + { + "epoch": 0.038569206842923795, + "grad_norm": 32.55772018432617, + "learning_rate": 6.418219461697723e-07, + "logits/chosen": 1.50239098072052, + "logits/rejected": 3.742825984954834, + "logps/chosen": -340.8537902832031, + "logps/rejected": -463.9370422363281, + "loss": 3.6689, + "rewards/accuracies": 0.375, + "rewards/chosen": 3.409723997116089, + "rewards/margins": -2.145049571990967, + "rewards/rejected": 5.554773807525635, + "step": 62 + }, + { + "epoch": 0.039191290824261274, + "grad_norm": 47.97010803222656, + "learning_rate": 6.521739130434783e-07, + "logits/chosen": -0.5774695873260498, + "logits/rejected": 3.452863931655884, + "logps/chosen": -466.74688720703125, + "logps/rejected": -540.4033813476562, + "loss": 3.878, + "rewards/accuracies": 0.375, + "rewards/chosen": 5.0257568359375, + "rewards/margins": -3.079911231994629, + "rewards/rejected": 8.105668067932129, + "step": 63 + }, + { + "epoch": 0.03981337480559875, + "grad_norm": 60.673484802246094, + "learning_rate": 6.625258799171844e-07, + "logits/chosen": 1.5208394527435303, + "logits/rejected": 3.985078811645508, + "logps/chosen": -520.3128662109375, + "logps/rejected": -586.0016479492188, + "loss": 4.4295, + "rewards/accuracies": 0.25, + "rewards/chosen": 4.203519344329834, + "rewards/margins": -3.1812143325805664, + "rewards/rejected": 7.3847336769104, + "step": 64 + }, + { + "epoch": 0.04043545878693624, + "grad_norm": 37.80503845214844, + "learning_rate": 6.728778467908903e-07, + "logits/chosen": 3.377659320831299, + "logits/rejected": 4.628144264221191, + "logps/chosen": -503.3712463378906, + "logps/rejected": -563.0233154296875, + "loss": 1.7255, + "rewards/accuracies": 0.5, + "rewards/chosen": 7.828451156616211, + "rewards/margins": 0.11194157600402832, + "rewards/rejected": 7.716508865356445, + "step": 65 + }, + { + "epoch": 0.04105754276827372, + "grad_norm": 49.729862213134766, + "learning_rate": 6.832298136645964e-07, + "logits/chosen": 1.4419440031051636, + "logits/rejected": 5.238779544830322, + "logps/chosen": -408.07769775390625, + "logps/rejected": -551.1817626953125, + "loss": 4.5682, + "rewards/accuracies": 0.375, + "rewards/chosen": 6.3790388107299805, + "rewards/margins": -2.325056552886963, + "rewards/rejected": 8.704095840454102, + "step": 66 + }, + { + "epoch": 0.0416796267496112, + "grad_norm": 56.857330322265625, + "learning_rate": 6.935817805383023e-07, + "logits/chosen": 1.8640213012695312, + "logits/rejected": 5.679266929626465, + "logps/chosen": -432.50390625, + "logps/rejected": -636.1224365234375, + "loss": 3.0801, + "rewards/accuracies": 0.25, + "rewards/chosen": 4.258503437042236, + "rewards/margins": -2.0767855644226074, + "rewards/rejected": 6.335289001464844, + "step": 67 + }, + { + "epoch": 0.042301710730948676, + "grad_norm": 46.245445251464844, + "learning_rate": 7.039337474120083e-07, + "logits/chosen": 0.9165676832199097, + "logits/rejected": 2.82378888130188, + "logps/chosen": -407.091552734375, + "logps/rejected": -470.09381103515625, + "loss": 2.5734, + "rewards/accuracies": 0.375, + "rewards/chosen": 2.471022605895996, + "rewards/margins": -1.7493443489074707, + "rewards/rejected": 4.220366954803467, + "step": 68 + }, + { + "epoch": 0.04292379471228616, + "grad_norm": 43.912864685058594, + "learning_rate": 7.142857142857143e-07, + "logits/chosen": 2.735879421234131, + "logits/rejected": 3.7518153190612793, + "logps/chosen": -509.6024169921875, + "logps/rejected": -566.545654296875, + "loss": 4.0801, + "rewards/accuracies": 0.375, + "rewards/chosen": 8.47749137878418, + "rewards/margins": -2.4810967445373535, + "rewards/rejected": 10.958588600158691, + "step": 69 + }, + { + "epoch": 0.04354587869362364, + "grad_norm": 36.06388854980469, + "learning_rate": 7.246376811594204e-07, + "logits/chosen": -0.6139513254165649, + "logits/rejected": 3.7651548385620117, + "logps/chosen": -205.31744384765625, + "logps/rejected": -337.3735046386719, + "loss": 2.4479, + "rewards/accuracies": 0.125, + "rewards/chosen": 1.2144393920898438, + "rewards/margins": -2.109691619873047, + "rewards/rejected": 3.3241305351257324, + "step": 70 + }, + { + "epoch": 0.04416796267496112, + "grad_norm": 48.27947998046875, + "learning_rate": 7.349896480331263e-07, + "logits/chosen": 0.38291358947753906, + "logits/rejected": 4.874111652374268, + "logps/chosen": -282.7892761230469, + "logps/rejected": -501.6195373535156, + "loss": 3.3797, + "rewards/accuracies": 0.25, + "rewards/chosen": 2.928769111633301, + "rewards/margins": -2.763104200363159, + "rewards/rejected": 5.691873550415039, + "step": 71 + }, + { + "epoch": 0.0447900466562986, + "grad_norm": 36.64083480834961, + "learning_rate": 7.453416149068324e-07, + "logits/chosen": 3.216303586959839, + "logits/rejected": 5.372792720794678, + "logps/chosen": -454.7572021484375, + "logps/rejected": -530.1778564453125, + "loss": 1.9955, + "rewards/accuracies": 0.625, + "rewards/chosen": 3.4445557594299316, + "rewards/margins": 0.10334785282611847, + "rewards/rejected": 3.341207981109619, + "step": 72 + }, + { + "epoch": 0.04541213063763608, + "grad_norm": 68.5392074584961, + "learning_rate": 7.556935817805384e-07, + "logits/chosen": 1.791405439376831, + "logits/rejected": 4.570212364196777, + "logps/chosen": -487.98828125, + "logps/rejected": -584.217529296875, + "loss": 4.2218, + "rewards/accuracies": 0.125, + "rewards/chosen": 4.641778469085693, + "rewards/margins": -3.9613802433013916, + "rewards/rejected": 8.603158950805664, + "step": 73 + }, + { + "epoch": 0.046034214618973564, + "grad_norm": 35.1021614074707, + "learning_rate": 7.660455486542443e-07, + "logits/chosen": 2.474278450012207, + "logits/rejected": 3.269683599472046, + "logps/chosen": -424.99627685546875, + "logps/rejected": -421.07513427734375, + "loss": 1.4964, + "rewards/accuracies": 0.375, + "rewards/chosen": 6.019632339477539, + "rewards/margins": 1.7979152202606201, + "rewards/rejected": 4.221717357635498, + "step": 74 + }, + { + "epoch": 0.04665629860031104, + "grad_norm": 45.31480026245117, + "learning_rate": 7.763975155279503e-07, + "logits/chosen": 3.982232093811035, + "logits/rejected": 4.152582168579102, + "logps/chosen": -462.2098083496094, + "logps/rejected": -437.01776123046875, + "loss": 1.4921, + "rewards/accuracies": 0.375, + "rewards/chosen": 3.4199776649475098, + "rewards/margins": -0.8454870581626892, + "rewards/rejected": 4.265464782714844, + "step": 75 + }, + { + "epoch": 0.04727838258164852, + "grad_norm": 45.84056091308594, + "learning_rate": 7.867494824016564e-07, + "logits/chosen": 1.7973527908325195, + "logits/rejected": 4.244307994842529, + "logps/chosen": -511.0018005371094, + "logps/rejected": -612.5986328125, + "loss": 3.7604, + "rewards/accuracies": 0.375, + "rewards/chosen": 5.5702314376831055, + "rewards/margins": -2.9728171825408936, + "rewards/rejected": 8.543049812316895, + "step": 76 + }, + { + "epoch": 0.047900466562986, + "grad_norm": 49.20193862915039, + "learning_rate": 7.971014492753623e-07, + "logits/chosen": 1.614759922027588, + "logits/rejected": 4.329085350036621, + "logps/chosen": -505.11370849609375, + "logps/rejected": -607.37451171875, + "loss": 4.4378, + "rewards/accuracies": 0.375, + "rewards/chosen": 6.242237091064453, + "rewards/margins": -2.3279433250427246, + "rewards/rejected": 8.570180892944336, + "step": 77 + }, + { + "epoch": 0.04852255054432349, + "grad_norm": 33.60881805419922, + "learning_rate": 8.074534161490684e-07, + "logits/chosen": 3.6060690879821777, + "logits/rejected": 6.220172882080078, + "logps/chosen": -486.65313720703125, + "logps/rejected": -591.926025390625, + "loss": 2.7688, + "rewards/accuracies": 0.625, + "rewards/chosen": 9.750151634216309, + "rewards/margins": -0.9578385353088379, + "rewards/rejected": 10.707990646362305, + "step": 78 + }, + { + "epoch": 0.049144634525660966, + "grad_norm": 42.57831954956055, + "learning_rate": 8.178053830227745e-07, + "logits/chosen": 0.6833969354629517, + "logits/rejected": 4.242366790771484, + "logps/chosen": -285.3023376464844, + "logps/rejected": -455.5042419433594, + "loss": 4.457, + "rewards/accuracies": 0.25, + "rewards/chosen": 3.0746004581451416, + "rewards/margins": -3.957090377807617, + "rewards/rejected": 7.031691074371338, + "step": 79 + }, + { + "epoch": 0.049766718506998445, + "grad_norm": 53.40254592895508, + "learning_rate": 8.281573498964803e-07, + "logits/chosen": 2.4628918170928955, + "logits/rejected": 4.680422306060791, + "logps/chosen": -472.9252624511719, + "logps/rejected": -564.3480834960938, + "loss": 3.7476, + "rewards/accuracies": 0.125, + "rewards/chosen": 7.595523357391357, + "rewards/margins": -3.3378608226776123, + "rewards/rejected": 10.933384895324707, + "step": 80 + }, + { + "epoch": 0.050388802488335924, + "grad_norm": 47.84553527832031, + "learning_rate": 8.385093167701864e-07, + "logits/chosen": -0.8003718852996826, + "logits/rejected": 4.039190769195557, + "logps/chosen": -320.61370849609375, + "logps/rejected": -488.978759765625, + "loss": 1.7874, + "rewards/accuracies": 0.125, + "rewards/chosen": 2.1538021564483643, + "rewards/margins": -1.0411781072616577, + "rewards/rejected": 3.1949803829193115, + "step": 81 + }, + { + "epoch": 0.0510108864696734, + "grad_norm": 51.101417541503906, + "learning_rate": 8.488612836438924e-07, + "logits/chosen": 0.9290800094604492, + "logits/rejected": 3.8441295623779297, + "logps/chosen": -422.1509704589844, + "logps/rejected": -547.141357421875, + "loss": 3.4353, + "rewards/accuracies": 0.125, + "rewards/chosen": 6.441579818725586, + "rewards/margins": -2.543184280395508, + "rewards/rejected": 8.984764099121094, + "step": 82 + }, + { + "epoch": 0.05163297045101089, + "grad_norm": 52.027313232421875, + "learning_rate": 8.592132505175985e-07, + "logits/chosen": 0.9031997919082642, + "logits/rejected": 6.393537521362305, + "logps/chosen": -402.1159362792969, + "logps/rejected": -635.5774536132812, + "loss": 4.7445, + "rewards/accuracies": 0.25, + "rewards/chosen": 5.350074291229248, + "rewards/margins": -3.6923584938049316, + "rewards/rejected": 9.04243278503418, + "step": 83 + }, + { + "epoch": 0.05225505443234837, + "grad_norm": 49.580406188964844, + "learning_rate": 8.695652173913044e-07, + "logits/chosen": -1.1733779907226562, + "logits/rejected": 3.845046281814575, + "logps/chosen": -309.610595703125, + "logps/rejected": -534.025634765625, + "loss": 5.856, + "rewards/accuracies": 0.25, + "rewards/chosen": 3.196354866027832, + "rewards/margins": -4.153650283813477, + "rewards/rejected": 7.350005626678467, + "step": 84 + }, + { + "epoch": 0.05287713841368585, + "grad_norm": 42.38566970825195, + "learning_rate": 8.799171842650105e-07, + "logits/chosen": -0.9000750780105591, + "logits/rejected": 3.985197067260742, + "logps/chosen": -332.3162841796875, + "logps/rejected": -567.3726806640625, + "loss": 4.3539, + "rewards/accuracies": 0.25, + "rewards/chosen": 6.791793346405029, + "rewards/margins": -3.3670926094055176, + "rewards/rejected": 10.158885955810547, + "step": 85 + }, + { + "epoch": 0.053499222395023326, + "grad_norm": 54.692100524902344, + "learning_rate": 8.902691511387164e-07, + "logits/chosen": 0.4778970181941986, + "logits/rejected": 4.724665641784668, + "logps/chosen": -448.77117919921875, + "logps/rejected": -606.9342041015625, + "loss": 5.7404, + "rewards/accuracies": 0.125, + "rewards/chosen": 6.051287651062012, + "rewards/margins": -4.640262126922607, + "rewards/rejected": 10.691549301147461, + "step": 86 + }, + { + "epoch": 0.05412130637636081, + "grad_norm": 56.3837890625, + "learning_rate": 9.006211180124224e-07, + "logits/chosen": -1.7641148567199707, + "logits/rejected": 4.488871097564697, + "logps/chosen": -210.34579467773438, + "logps/rejected": -489.9498291015625, + "loss": 3.2886, + "rewards/accuracies": 0.25, + "rewards/chosen": 3.801412343978882, + "rewards/margins": -2.549666404724121, + "rewards/rejected": 6.351078987121582, + "step": 87 + }, + { + "epoch": 0.05474339035769829, + "grad_norm": 37.51689147949219, + "learning_rate": 9.109730848861284e-07, + "logits/chosen": 0.7698231935501099, + "logits/rejected": 3.520718574523926, + "logps/chosen": -422.2262268066406, + "logps/rejected": -491.434814453125, + "loss": 2.6793, + "rewards/accuracies": 0.5, + "rewards/chosen": 4.767912864685059, + "rewards/margins": 0.8315317630767822, + "rewards/rejected": 3.9363808631896973, + "step": 88 + }, + { + "epoch": 0.05536547433903577, + "grad_norm": 43.7822380065918, + "learning_rate": 9.213250517598345e-07, + "logits/chosen": -0.11454379558563232, + "logits/rejected": 3.4940781593322754, + "logps/chosen": -329.4980163574219, + "logps/rejected": -485.48779296875, + "loss": 2.8667, + "rewards/accuracies": 0.25, + "rewards/chosen": 3.1187541484832764, + "rewards/margins": -1.1736748218536377, + "rewards/rejected": 4.292428970336914, + "step": 89 + }, + { + "epoch": 0.05598755832037325, + "grad_norm": 52.58222961425781, + "learning_rate": 9.316770186335404e-07, + "logits/chosen": 1.3086739778518677, + "logits/rejected": 5.073335647583008, + "logps/chosen": -393.6297912597656, + "logps/rejected": -569.0318603515625, + "loss": 3.7375, + "rewards/accuracies": 0.125, + "rewards/chosen": 6.170962810516357, + "rewards/margins": -3.5130481719970703, + "rewards/rejected": 9.684011459350586, + "step": 90 + }, + { + "epoch": 0.05660964230171073, + "grad_norm": 31.68039894104004, + "learning_rate": 9.420289855072465e-07, + "logits/chosen": 0.22252234816551208, + "logits/rejected": 2.2449254989624023, + "logps/chosen": -322.2689208984375, + "logps/rejected": -423.7203063964844, + "loss": 2.0681, + "rewards/accuracies": 0.25, + "rewards/chosen": 4.651986122131348, + "rewards/margins": -0.39517509937286377, + "rewards/rejected": 5.047161102294922, + "step": 91 + }, + { + "epoch": 0.05723172628304821, + "grad_norm": 41.90281295776367, + "learning_rate": 9.523809523809525e-07, + "logits/chosen": 1.7530864477157593, + "logits/rejected": 5.830805778503418, + "logps/chosen": -420.7454833984375, + "logps/rejected": -628.6724853515625, + "loss": 3.2832, + "rewards/accuracies": 0.5, + "rewards/chosen": 9.385915756225586, + "rewards/margins": -1.5105422735214233, + "rewards/rejected": 10.896458625793457, + "step": 92 + }, + { + "epoch": 0.05785381026438569, + "grad_norm": 24.812217712402344, + "learning_rate": 9.627329192546585e-07, + "logits/chosen": 1.9809218645095825, + "logits/rejected": 4.16044807434082, + "logps/chosen": -430.45709228515625, + "logps/rejected": -506.4284362792969, + "loss": 1.266, + "rewards/accuracies": 0.625, + "rewards/chosen": 8.652717590332031, + "rewards/margins": 2.367443561553955, + "rewards/rejected": 6.285274982452393, + "step": 93 + }, + { + "epoch": 0.05847589424572317, + "grad_norm": 52.60491180419922, + "learning_rate": 9.730848861283643e-07, + "logits/chosen": 0.5998326539993286, + "logits/rejected": 4.901703834533691, + "logps/chosen": -406.40472412109375, + "logps/rejected": -620.4981689453125, + "loss": 4.0579, + "rewards/accuracies": 0.125, + "rewards/chosen": 4.539392471313477, + "rewards/margins": -3.4850521087646484, + "rewards/rejected": 8.024444580078125, + "step": 94 + }, + { + "epoch": 0.05909797822706065, + "grad_norm": 44.85295867919922, + "learning_rate": 9.834368530020705e-07, + "logits/chosen": -2.097593307495117, + "logits/rejected": 3.1550774574279785, + "logps/chosen": -391.03338623046875, + "logps/rejected": -641.2415771484375, + "loss": 3.0355, + "rewards/accuracies": 0.375, + "rewards/chosen": 5.09652042388916, + "rewards/margins": -2.0533454418182373, + "rewards/rejected": 7.149866104125977, + "step": 95 + }, + { + "epoch": 0.059720062208398136, + "grad_norm": 34.27623748779297, + "learning_rate": 9.937888198757765e-07, + "logits/chosen": 2.9386537075042725, + "logits/rejected": 3.68379545211792, + "logps/chosen": -584.134521484375, + "logps/rejected": -609.106201171875, + "loss": 1.5705, + "rewards/accuracies": 0.75, + "rewards/chosen": 5.0029168128967285, + "rewards/margins": 0.11842000484466553, + "rewards/rejected": 4.884496688842773, + "step": 96 + }, + { + "epoch": 0.060342146189735615, + "grad_norm": 44.750606536865234, + "learning_rate": 1.0041407867494825e-06, + "logits/chosen": 2.349898099899292, + "logits/rejected": 6.011685371398926, + "logps/chosen": -319.87628173828125, + "logps/rejected": -486.68499755859375, + "loss": 2.4576, + "rewards/accuracies": 0.25, + "rewards/chosen": 4.2724761962890625, + "rewards/margins": -1.7754751443862915, + "rewards/rejected": 6.047951698303223, + "step": 97 + }, + { + "epoch": 0.060964230171073094, + "grad_norm": 48.21299362182617, + "learning_rate": 1.0144927536231885e-06, + "logits/chosen": 0.6218348145484924, + "logits/rejected": 4.305963516235352, + "logps/chosen": -326.71710205078125, + "logps/rejected": -464.59320068359375, + "loss": 4.0852, + "rewards/accuracies": 0.125, + "rewards/chosen": 4.030767440795898, + "rewards/margins": -3.5390217304229736, + "rewards/rejected": 7.569788932800293, + "step": 98 + }, + { + "epoch": 0.06158631415241057, + "grad_norm": 43.5555419921875, + "learning_rate": 1.0248447204968944e-06, + "logits/chosen": 2.1077942848205566, + "logits/rejected": 4.135837078094482, + "logps/chosen": -475.6229248046875, + "logps/rejected": -545.8417358398438, + "loss": 3.2673, + "rewards/accuracies": 0.375, + "rewards/chosen": 5.936098098754883, + "rewards/margins": -0.997451663017273, + "rewards/rejected": 6.933549880981445, + "step": 99 + }, + { + "epoch": 0.06220839813374806, + "grad_norm": 60.58588790893555, + "learning_rate": 1.0351966873706006e-06, + "logits/chosen": 2.1906070709228516, + "logits/rejected": 4.7897725105285645, + "logps/chosen": -492.7605285644531, + "logps/rejected": -564.39208984375, + "loss": 4.7329, + "rewards/accuracies": 0.125, + "rewards/chosen": 5.5719757080078125, + "rewards/margins": -4.470346450805664, + "rewards/rejected": 10.042322158813477, + "step": 100 + }, + { + "epoch": 0.06283048211508553, + "grad_norm": 48.09543228149414, + "learning_rate": 1.0455486542443064e-06, + "logits/chosen": 0.6987656354904175, + "logits/rejected": 4.166409969329834, + "logps/chosen": -367.01678466796875, + "logps/rejected": -541.5540771484375, + "loss": 4.6407, + "rewards/accuracies": 0.25, + "rewards/chosen": 4.648255348205566, + "rewards/margins": -4.1871418952941895, + "rewards/rejected": 8.835397720336914, + "step": 101 + }, + { + "epoch": 0.06345256609642301, + "grad_norm": 57.54613494873047, + "learning_rate": 1.0559006211180126e-06, + "logits/chosen": -2.2215631008148193, + "logits/rejected": 5.639945983886719, + "logps/chosen": -292.84765625, + "logps/rejected": -641.1038818359375, + "loss": 5.1023, + "rewards/accuracies": 0.25, + "rewards/chosen": 3.9426522254943848, + "rewards/margins": -4.445166110992432, + "rewards/rejected": 8.387818336486816, + "step": 102 + }, + { + "epoch": 0.0640746500777605, + "grad_norm": 46.35042190551758, + "learning_rate": 1.0662525879917186e-06, + "logits/chosen": 0.6544018387794495, + "logits/rejected": 4.883418560028076, + "logps/chosen": -323.3349914550781, + "logps/rejected": -573.3865356445312, + "loss": 3.4786, + "rewards/accuracies": 0.25, + "rewards/chosen": 6.362491130828857, + "rewards/margins": -2.575894832611084, + "rewards/rejected": 8.938385009765625, + "step": 103 + }, + { + "epoch": 0.06469673405909798, + "grad_norm": 53.28401565551758, + "learning_rate": 1.0766045548654246e-06, + "logits/chosen": 2.349595069885254, + "logits/rejected": 4.094552516937256, + "logps/chosen": -496.2256774902344, + "logps/rejected": -590.922119140625, + "loss": 3.2819, + "rewards/accuracies": 0.375, + "rewards/chosen": 2.4656195640563965, + "rewards/margins": -2.7044270038604736, + "rewards/rejected": 5.170046806335449, + "step": 104 + }, + { + "epoch": 0.06531881804043546, + "grad_norm": 41.93247985839844, + "learning_rate": 1.0869565217391306e-06, + "logits/chosen": 1.0588008165359497, + "logits/rejected": 2.653573989868164, + "logps/chosen": -380.8188781738281, + "logps/rejected": -459.8533630371094, + "loss": 1.8891, + "rewards/accuracies": 0.375, + "rewards/chosen": 6.556657791137695, + "rewards/margins": -0.8650388717651367, + "rewards/rejected": 7.421696662902832, + "step": 105 + }, + { + "epoch": 0.06594090202177294, + "grad_norm": 59.02732467651367, + "learning_rate": 1.0973084886128365e-06, + "logits/chosen": 1.3528294563293457, + "logits/rejected": 5.272040843963623, + "logps/chosen": -232.5532684326172, + "logps/rejected": -436.3426818847656, + "loss": 3.2053, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.45379501581192017, + "rewards/margins": -3.016813278198242, + "rewards/rejected": 3.4706084728240967, + "step": 106 + }, + { + "epoch": 0.06656298600311042, + "grad_norm": 56.47092819213867, + "learning_rate": 1.1076604554865425e-06, + "logits/chosen": 0.5967641472816467, + "logits/rejected": 1.849827527999878, + "logps/chosen": -502.0181884765625, + "logps/rejected": -542.710205078125, + "loss": 5.3509, + "rewards/accuracies": 0.125, + "rewards/chosen": 4.310145854949951, + "rewards/margins": -5.1838178634643555, + "rewards/rejected": 9.493963241577148, + "step": 107 + }, + { + "epoch": 0.0671850699844479, + "grad_norm": 29.14185905456543, + "learning_rate": 1.1180124223602485e-06, + "logits/chosen": 3.0689427852630615, + "logits/rejected": 3.7151615619659424, + "logps/chosen": -515.645263671875, + "logps/rejected": -523.316650390625, + "loss": 1.6792, + "rewards/accuracies": 0.5, + "rewards/chosen": 8.409460067749023, + "rewards/margins": 1.357513189315796, + "rewards/rejected": 7.051946640014648, + "step": 108 + }, + { + "epoch": 0.06780715396578538, + "grad_norm": 41.52821731567383, + "learning_rate": 1.1283643892339545e-06, + "logits/chosen": 3.8132739067077637, + "logits/rejected": 3.7674436569213867, + "logps/chosen": -589.3350219726562, + "logps/rejected": -575.3363037109375, + "loss": 2.2836, + "rewards/accuracies": 0.5, + "rewards/chosen": 8.319855690002441, + "rewards/margins": -0.695635974407196, + "rewards/rejected": 9.015491485595703, + "step": 109 + }, + { + "epoch": 0.06842923794712286, + "grad_norm": 32.027217864990234, + "learning_rate": 1.1387163561076605e-06, + "logits/chosen": 1.6463077068328857, + "logits/rejected": 4.320730686187744, + "logps/chosen": -380.8627014160156, + "logps/rejected": -503.88323974609375, + "loss": 1.8091, + "rewards/accuracies": 0.5, + "rewards/chosen": 7.096940040588379, + "rewards/margins": 0.04935610294342041, + "rewards/rejected": 7.047583103179932, + "step": 110 + }, + { + "epoch": 0.06905132192846034, + "grad_norm": 48.38652038574219, + "learning_rate": 1.1490683229813664e-06, + "logits/chosen": 0.4883633255958557, + "logits/rejected": 4.920735836029053, + "logps/chosen": -403.420166015625, + "logps/rejected": -578.6812133789062, + "loss": 1.5954, + "rewards/accuracies": 0.25, + "rewards/chosen": 3.8030200004577637, + "rewards/margins": 1.088423490524292, + "rewards/rejected": 2.714596748352051, + "step": 111 + }, + { + "epoch": 0.06967340590979783, + "grad_norm": 39.91889953613281, + "learning_rate": 1.1594202898550726e-06, + "logits/chosen": 2.2552788257598877, + "logits/rejected": 4.922609329223633, + "logps/chosen": -424.36431884765625, + "logps/rejected": -576.1085205078125, + "loss": 2.2137, + "rewards/accuracies": 0.5, + "rewards/chosen": 6.894935607910156, + "rewards/margins": 0.45326733589172363, + "rewards/rejected": 6.441668510437012, + "step": 112 + }, + { + "epoch": 0.07029548989113531, + "grad_norm": 36.67121124267578, + "learning_rate": 1.1697722567287784e-06, + "logits/chosen": -3.018321990966797, + "logits/rejected": 3.101332187652588, + "logps/chosen": -302.11090087890625, + "logps/rejected": -540.5518798828125, + "loss": 3.3978, + "rewards/accuracies": 0.375, + "rewards/chosen": 3.1350018978118896, + "rewards/margins": -2.6119015216827393, + "rewards/rejected": 5.746903419494629, + "step": 113 + }, + { + "epoch": 0.07091757387247279, + "grad_norm": 37.47822952270508, + "learning_rate": 1.1801242236024846e-06, + "logits/chosen": 0.36539775133132935, + "logits/rejected": 2.5840260982513428, + "logps/chosen": -302.42681884765625, + "logps/rejected": -385.96075439453125, + "loss": 2.6045, + "rewards/accuracies": 0.375, + "rewards/chosen": 2.869319438934326, + "rewards/margins": -1.7174735069274902, + "rewards/rejected": 4.586793422698975, + "step": 114 + }, + { + "epoch": 0.07153965785381027, + "grad_norm": 61.6951904296875, + "learning_rate": 1.1904761904761906e-06, + "logits/chosen": 0.23400786519050598, + "logits/rejected": 4.336287975311279, + "logps/chosen": -468.264892578125, + "logps/rejected": -585.5806884765625, + "loss": 5.9258, + "rewards/accuracies": 0.25, + "rewards/chosen": 2.550875425338745, + "rewards/margins": -5.330711841583252, + "rewards/rejected": 7.881587028503418, + "step": 115 + }, + { + "epoch": 0.07216174183514774, + "grad_norm": 42.67302703857422, + "learning_rate": 1.2008281573498966e-06, + "logits/chosen": -1.4592227935791016, + "logits/rejected": 1.4536776542663574, + "logps/chosen": -272.0027160644531, + "logps/rejected": -449.3828125, + "loss": 2.3933, + "rewards/accuracies": 0.375, + "rewards/chosen": 4.253082275390625, + "rewards/margins": -0.5537877082824707, + "rewards/rejected": 4.806870460510254, + "step": 116 + }, + { + "epoch": 0.07278382581648522, + "grad_norm": 37.58478927612305, + "learning_rate": 1.2111801242236026e-06, + "logits/chosen": 0.2646111845970154, + "logits/rejected": 3.028031826019287, + "logps/chosen": -422.3641662597656, + "logps/rejected": -519.9530029296875, + "loss": 3.1383, + "rewards/accuracies": 0.5, + "rewards/chosen": 5.61953592300415, + "rewards/margins": -1.9047925472259521, + "rewards/rejected": 7.524328708648682, + "step": 117 + }, + { + "epoch": 0.0734059097978227, + "grad_norm": 46.9578971862793, + "learning_rate": 1.2215320910973085e-06, + "logits/chosen": -1.7312917709350586, + "logits/rejected": 4.160816192626953, + "logps/chosen": -374.05706787109375, + "logps/rejected": -589.3055419921875, + "loss": 3.5799, + "rewards/accuracies": 0.125, + "rewards/chosen": 6.456633567810059, + "rewards/margins": -2.8331234455108643, + "rewards/rejected": 9.289756774902344, + "step": 118 + }, + { + "epoch": 0.07402799377916018, + "grad_norm": 48.71473693847656, + "learning_rate": 1.2318840579710147e-06, + "logits/chosen": -0.895279049873352, + "logits/rejected": 4.6400885581970215, + "logps/chosen": -271.0947265625, + "logps/rejected": -575.875244140625, + "loss": 3.2339, + "rewards/accuracies": 0.25, + "rewards/chosen": 1.412948489189148, + "rewards/margins": -1.9902526140213013, + "rewards/rejected": 3.403201103210449, + "step": 119 + }, + { + "epoch": 0.07465007776049767, + "grad_norm": 53.11747360229492, + "learning_rate": 1.2422360248447205e-06, + "logits/chosen": -1.3114756345748901, + "logits/rejected": 3.2517542839050293, + "logps/chosen": -284.1141357421875, + "logps/rejected": -538.9307250976562, + "loss": 2.6922, + "rewards/accuracies": 0.25, + "rewards/chosen": 2.876573085784912, + "rewards/margins": -2.385010004043579, + "rewards/rejected": 5.26158332824707, + "step": 120 + }, + { + "epoch": 0.07527216174183515, + "grad_norm": 32.17045974731445, + "learning_rate": 1.2525879917184267e-06, + "logits/chosen": 3.688138961791992, + "logits/rejected": 4.196073055267334, + "logps/chosen": -517.5601196289062, + "logps/rejected": -592.2069091796875, + "loss": 1.1033, + "rewards/accuracies": 0.75, + "rewards/chosen": 7.916095733642578, + "rewards/margins": 0.7442711591720581, + "rewards/rejected": 7.1718244552612305, + "step": 121 + }, + { + "epoch": 0.07589424572317263, + "grad_norm": 41.10165786743164, + "learning_rate": 1.2629399585921327e-06, + "logits/chosen": 1.2101017236709595, + "logits/rejected": 2.4238104820251465, + "logps/chosen": -386.5546569824219, + "logps/rejected": -454.49859619140625, + "loss": 1.9363, + "rewards/accuracies": 0.375, + "rewards/chosen": 5.121078968048096, + "rewards/margins": 0.4273524284362793, + "rewards/rejected": 4.693726539611816, + "step": 122 + }, + { + "epoch": 0.07651632970451011, + "grad_norm": 40.01031494140625, + "learning_rate": 1.2732919254658385e-06, + "logits/chosen": 2.7430291175842285, + "logits/rejected": 4.701658248901367, + "logps/chosen": -423.6866455078125, + "logps/rejected": -474.761962890625, + "loss": 2.2463, + "rewards/accuracies": 0.5, + "rewards/chosen": 4.223697185516357, + "rewards/margins": -1.0474417209625244, + "rewards/rejected": 5.271138668060303, + "step": 123 + }, + { + "epoch": 0.07713841368584759, + "grad_norm": 47.88119125366211, + "learning_rate": 1.2836438923395447e-06, + "logits/chosen": -1.60335111618042, + "logits/rejected": 3.7535877227783203, + "logps/chosen": -201.6946258544922, + "logps/rejected": -488.87969970703125, + "loss": 3.3213, + "rewards/accuracies": 0.375, + "rewards/chosen": 2.568941593170166, + "rewards/margins": -1.890028715133667, + "rewards/rejected": 4.458970069885254, + "step": 124 + }, + { + "epoch": 0.07776049766718507, + "grad_norm": 41.96632385253906, + "learning_rate": 1.2939958592132506e-06, + "logits/chosen": 0.8452179431915283, + "logits/rejected": 2.755337715148926, + "logps/chosen": -344.67462158203125, + "logps/rejected": -452.2144775390625, + "loss": 1.6188, + "rewards/accuracies": 0.375, + "rewards/chosen": 4.238457679748535, + "rewards/margins": -1.0320303440093994, + "rewards/rejected": 5.270488262176514, + "step": 125 + }, + { + "epoch": 0.07838258164852255, + "grad_norm": 45.34007263183594, + "learning_rate": 1.3043478260869566e-06, + "logits/chosen": 1.386866569519043, + "logits/rejected": 4.8691511154174805, + "logps/chosen": -366.2882995605469, + "logps/rejected": -543.7435302734375, + "loss": 2.9963, + "rewards/accuracies": 0.25, + "rewards/chosen": 3.628605842590332, + "rewards/margins": -2.35605525970459, + "rewards/rejected": 5.984661102294922, + "step": 126 + }, + { + "epoch": 0.07900466562986003, + "grad_norm": 31.151798248291016, + "learning_rate": 1.3146997929606626e-06, + "logits/chosen": 3.52664852142334, + "logits/rejected": 4.250565528869629, + "logps/chosen": -458.6521911621094, + "logps/rejected": -533.302490234375, + "loss": 2.3654, + "rewards/accuracies": 0.625, + "rewards/chosen": 7.884675025939941, + "rewards/margins": 1.5698320865631104, + "rewards/rejected": 6.314842700958252, + "step": 127 + }, + { + "epoch": 0.0796267496111975, + "grad_norm": 47.57011032104492, + "learning_rate": 1.3250517598343688e-06, + "logits/chosen": 3.230344533920288, + "logits/rejected": 6.1636643409729, + "logps/chosen": -435.8682861328125, + "logps/rejected": -526.7186889648438, + "loss": 3.2785, + "rewards/accuracies": 0.25, + "rewards/chosen": 5.82703971862793, + "rewards/margins": -2.0120530128479004, + "rewards/rejected": 7.839093208312988, + "step": 128 + }, + { + "epoch": 0.080248833592535, + "grad_norm": 41.736793518066406, + "learning_rate": 1.3354037267080746e-06, + "logits/chosen": 1.6107079982757568, + "logits/rejected": 4.2225165367126465, + "logps/chosen": -449.3865966796875, + "logps/rejected": -570.679931640625, + "loss": 3.5626, + "rewards/accuracies": 0.375, + "rewards/chosen": 6.096354007720947, + "rewards/margins": -1.7619743347167969, + "rewards/rejected": 7.858328819274902, + "step": 129 + }, + { + "epoch": 0.08087091757387248, + "grad_norm": 28.421558380126953, + "learning_rate": 1.3457556935817806e-06, + "logits/chosen": -2.727404832839966, + "logits/rejected": 4.425556659698486, + "logps/chosen": -231.41461181640625, + "logps/rejected": -580.169921875, + "loss": 2.6388, + "rewards/accuracies": 0.625, + "rewards/chosen": 4.525047779083252, + "rewards/margins": -0.4674373269081116, + "rewards/rejected": 4.992485046386719, + "step": 130 + }, + { + "epoch": 0.08149300155520996, + "grad_norm": 34.796287536621094, + "learning_rate": 1.3561076604554865e-06, + "logits/chosen": 1.1071691513061523, + "logits/rejected": 3.6799988746643066, + "logps/chosen": -383.92962646484375, + "logps/rejected": -528.1698608398438, + "loss": 3.2315, + "rewards/accuracies": 0.5, + "rewards/chosen": 6.100879669189453, + "rewards/margins": -1.634765863418579, + "rewards/rejected": 7.7356462478637695, + "step": 131 + }, + { + "epoch": 0.08211508553654744, + "grad_norm": 49.1640625, + "learning_rate": 1.3664596273291927e-06, + "logits/chosen": 0.9404339790344238, + "logits/rejected": 5.779233455657959, + "logps/chosen": -365.7359313964844, + "logps/rejected": -657.7894287109375, + "loss": 4.144, + "rewards/accuracies": 0.375, + "rewards/chosen": 3.550569534301758, + "rewards/margins": -2.898592948913574, + "rewards/rejected": 6.449162483215332, + "step": 132 + }, + { + "epoch": 0.08273716951788491, + "grad_norm": 33.71388626098633, + "learning_rate": 1.3768115942028987e-06, + "logits/chosen": -1.5326220989227295, + "logits/rejected": 4.281208038330078, + "logps/chosen": -145.20849609375, + "logps/rejected": -438.5946350097656, + "loss": 2.0325, + "rewards/accuracies": 0.5, + "rewards/chosen": 2.4517431259155273, + "rewards/margins": -0.3625298738479614, + "rewards/rejected": 2.8142733573913574, + "step": 133 + }, + { + "epoch": 0.0833592534992224, + "grad_norm": 47.125850677490234, + "learning_rate": 1.3871635610766047e-06, + "logits/chosen": 1.5233802795410156, + "logits/rejected": 4.672469615936279, + "logps/chosen": -411.7640380859375, + "logps/rejected": -596.0722045898438, + "loss": 4.3379, + "rewards/accuracies": 0.5, + "rewards/chosen": 4.727325439453125, + "rewards/margins": -1.5561026334762573, + "rewards/rejected": 6.283427715301514, + "step": 134 + }, + { + "epoch": 0.08398133748055987, + "grad_norm": 40.958187103271484, + "learning_rate": 1.3975155279503105e-06, + "logits/chosen": 1.8504540920257568, + "logits/rejected": 5.391268730163574, + "logps/chosen": -445.0400085449219, + "logps/rejected": -602.6244506835938, + "loss": 1.6739, + "rewards/accuracies": 0.625, + "rewards/chosen": 4.19703483581543, + "rewards/margins": -0.15884339809417725, + "rewards/rejected": 4.3558783531188965, + "step": 135 + }, + { + "epoch": 0.08460342146189735, + "grad_norm": 55.949459075927734, + "learning_rate": 1.4078674948240167e-06, + "logits/chosen": 1.1511344909667969, + "logits/rejected": 4.316564559936523, + "logps/chosen": -389.3515319824219, + "logps/rejected": -509.3475036621094, + "loss": 2.3386, + "rewards/accuracies": 0.25, + "rewards/chosen": 3.0395596027374268, + "rewards/margins": -0.5732651352882385, + "rewards/rejected": 3.6128249168395996, + "step": 136 + }, + { + "epoch": 0.08522550544323483, + "grad_norm": 39.164981842041016, + "learning_rate": 1.4182194616977226e-06, + "logits/chosen": 1.738755226135254, + "logits/rejected": 4.727439880371094, + "logps/chosen": -352.5355529785156, + "logps/rejected": -513.2650146484375, + "loss": 2.3329, + "rewards/accuracies": 0.625, + "rewards/chosen": 4.571519374847412, + "rewards/margins": 0.1314365565776825, + "rewards/rejected": 4.440083026885986, + "step": 137 + }, + { + "epoch": 0.08584758942457232, + "grad_norm": 42.70882034301758, + "learning_rate": 1.4285714285714286e-06, + "logits/chosen": 0.34659343957901, + "logits/rejected": 3.817269802093506, + "logps/chosen": -473.480224609375, + "logps/rejected": -662.4795532226562, + "loss": 2.6746, + "rewards/accuracies": 0.625, + "rewards/chosen": 5.974919319152832, + "rewards/margins": -0.8572205305099487, + "rewards/rejected": 6.832139492034912, + "step": 138 + }, + { + "epoch": 0.0864696734059098, + "grad_norm": 40.75260543823242, + "learning_rate": 1.4389233954451348e-06, + "logits/chosen": -0.08259952068328857, + "logits/rejected": 3.267246723175049, + "logps/chosen": -385.9140319824219, + "logps/rejected": -604.5545654296875, + "loss": 2.6701, + "rewards/accuracies": 0.5, + "rewards/chosen": 3.4592254161834717, + "rewards/margins": -0.001993894577026367, + "rewards/rejected": 3.461219072341919, + "step": 139 + }, + { + "epoch": 0.08709175738724728, + "grad_norm": 43.442569732666016, + "learning_rate": 1.4492753623188408e-06, + "logits/chosen": 0.29093819856643677, + "logits/rejected": 4.218543529510498, + "logps/chosen": -445.10443115234375, + "logps/rejected": -656.73046875, + "loss": 4.0458, + "rewards/accuracies": 0.5, + "rewards/chosen": 6.9249982833862305, + "rewards/margins": -2.191218852996826, + "rewards/rejected": 9.116217613220215, + "step": 140 + }, + { + "epoch": 0.08771384136858476, + "grad_norm": 39.78031921386719, + "learning_rate": 1.4596273291925466e-06, + "logits/chosen": 0.6515867710113525, + "logits/rejected": 4.114278316497803, + "logps/chosen": -373.6202697753906, + "logps/rejected": -598.4118041992188, + "loss": 0.8933, + "rewards/accuracies": 0.5, + "rewards/chosen": 4.184488773345947, + "rewards/margins": 1.4812554121017456, + "rewards/rejected": 2.703233242034912, + "step": 141 + }, + { + "epoch": 0.08833592534992224, + "grad_norm": 43.94890594482422, + "learning_rate": 1.4699792960662526e-06, + "logits/chosen": 1.9376635551452637, + "logits/rejected": 5.742636680603027, + "logps/chosen": -368.154541015625, + "logps/rejected": -523.9427490234375, + "loss": 3.4754, + "rewards/accuracies": 0.375, + "rewards/chosen": 2.491053581237793, + "rewards/margins": -2.008192300796509, + "rewards/rejected": 4.499245643615723, + "step": 142 + }, + { + "epoch": 0.08895800933125972, + "grad_norm": 49.16591262817383, + "learning_rate": 1.4803312629399588e-06, + "logits/chosen": 0.4437107741832733, + "logits/rejected": 2.9630513191223145, + "logps/chosen": -416.73028564453125, + "logps/rejected": -574.045654296875, + "loss": 2.5663, + "rewards/accuracies": 0.375, + "rewards/chosen": 6.391658306121826, + "rewards/margins": -1.0516711473464966, + "rewards/rejected": 7.443329334259033, + "step": 143 + }, + { + "epoch": 0.0895800933125972, + "grad_norm": 25.008758544921875, + "learning_rate": 1.4906832298136647e-06, + "logits/chosen": -0.1724388599395752, + "logits/rejected": 3.9658737182617188, + "logps/chosen": -400.5678405761719, + "logps/rejected": -579.6800537109375, + "loss": 0.6586, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.8572346568107605, + "rewards/margins": 1.3512787818908691, + "rewards/rejected": -0.49404406547546387, + "step": 144 + }, + { + "epoch": 0.09020217729393468, + "grad_norm": 47.504005432128906, + "learning_rate": 1.5010351966873707e-06, + "logits/chosen": -1.509745717048645, + "logits/rejected": 5.010717391967773, + "logps/chosen": -196.65380859375, + "logps/rejected": -495.7618408203125, + "loss": 3.8704, + "rewards/accuracies": 0.25, + "rewards/chosen": 2.165783643722534, + "rewards/margins": -2.8671915531158447, + "rewards/rejected": 5.032975196838379, + "step": 145 + }, + { + "epoch": 0.09082426127527216, + "grad_norm": 37.12325668334961, + "learning_rate": 1.5113871635610767e-06, + "logits/chosen": 0.7813265919685364, + "logits/rejected": 2.518862724304199, + "logps/chosen": -364.07196044921875, + "logps/rejected": -468.7448425292969, + "loss": 1.7235, + "rewards/accuracies": 0.5, + "rewards/chosen": 1.2074331045150757, + "rewards/margins": 0.04681295156478882, + "rewards/rejected": 1.1606202125549316, + "step": 146 + }, + { + "epoch": 0.09144634525660965, + "grad_norm": 38.50062942504883, + "learning_rate": 1.521739130434783e-06, + "logits/chosen": -0.4486757516860962, + "logits/rejected": 3.2172610759735107, + "logps/chosen": -288.96588134765625, + "logps/rejected": -525.9904174804688, + "loss": 2.485, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.029601097106933594, + "rewards/margins": -1.5874814987182617, + "rewards/rejected": 1.5578804016113281, + "step": 147 + }, + { + "epoch": 0.09206842923794713, + "grad_norm": 45.27934265136719, + "learning_rate": 1.5320910973084887e-06, + "logits/chosen": 0.4484668970108032, + "logits/rejected": 4.418857097625732, + "logps/chosen": -344.88507080078125, + "logps/rejected": -540.1593017578125, + "loss": 3.5602, + "rewards/accuracies": 0.25, + "rewards/chosen": 4.541347503662109, + "rewards/margins": -2.862128257751465, + "rewards/rejected": 7.403475761413574, + "step": 148 + }, + { + "epoch": 0.0926905132192846, + "grad_norm": 48.022586822509766, + "learning_rate": 1.5424430641821947e-06, + "logits/chosen": -0.05514061450958252, + "logits/rejected": 4.1575236320495605, + "logps/chosen": -391.945556640625, + "logps/rejected": -546.39453125, + "loss": 3.2204, + "rewards/accuracies": 0.25, + "rewards/chosen": 3.5273935794830322, + "rewards/margins": -0.9070781469345093, + "rewards/rejected": 4.43447208404541, + "step": 149 + }, + { + "epoch": 0.09331259720062209, + "grad_norm": 51.76631164550781, + "learning_rate": 1.5527950310559006e-06, + "logits/chosen": 0.8921202421188354, + "logits/rejected": 4.766198635101318, + "logps/chosen": -428.14654541015625, + "logps/rejected": -611.3369140625, + "loss": 1.9622, + "rewards/accuracies": 0.125, + "rewards/chosen": 6.118347644805908, + "rewards/margins": -0.13962364196777344, + "rewards/rejected": 6.257971286773682, + "step": 150 + }, + { + "epoch": 0.09393468118195956, + "grad_norm": 37.54130935668945, + "learning_rate": 1.5631469979296068e-06, + "logits/chosen": 1.5896421670913696, + "logits/rejected": 2.176250457763672, + "logps/chosen": -477.7599792480469, + "logps/rejected": -511.2159423828125, + "loss": 2.5037, + "rewards/accuracies": 0.625, + "rewards/chosen": 6.544580459594727, + "rewards/margins": 2.958336591720581, + "rewards/rejected": 3.5862441062927246, + "step": 151 + }, + { + "epoch": 0.09455676516329704, + "grad_norm": 36.237796783447266, + "learning_rate": 1.5734989648033128e-06, + "logits/chosen": 1.5189558267593384, + "logits/rejected": 4.844598770141602, + "logps/chosen": -324.1607360839844, + "logps/rejected": -502.5072326660156, + "loss": 1.4235, + "rewards/accuracies": 0.625, + "rewards/chosen": 5.578876495361328, + "rewards/margins": 1.3157144784927368, + "rewards/rejected": 4.263161659240723, + "step": 152 + }, + { + "epoch": 0.09517884914463452, + "grad_norm": 41.82114791870117, + "learning_rate": 1.5838509316770188e-06, + "logits/chosen": -0.6398723125457764, + "logits/rejected": 2.4939658641815186, + "logps/chosen": -355.7408752441406, + "logps/rejected": -513.5419311523438, + "loss": 1.737, + "rewards/accuracies": 0.5, + "rewards/chosen": 2.5448246002197266, + "rewards/margins": -0.07166877388954163, + "rewards/rejected": 2.6164932250976562, + "step": 153 + }, + { + "epoch": 0.095800933125972, + "grad_norm": 37.8273811340332, + "learning_rate": 1.5942028985507246e-06, + "logits/chosen": 3.02529239654541, + "logits/rejected": 4.839844226837158, + "logps/chosen": -499.8304748535156, + "logps/rejected": -633.421875, + "loss": 2.5997, + "rewards/accuracies": 0.625, + "rewards/chosen": 8.184986114501953, + "rewards/margins": 0.4161781072616577, + "rewards/rejected": 7.768807411193848, + "step": 154 + }, + { + "epoch": 0.09642301710730948, + "grad_norm": 50.805259704589844, + "learning_rate": 1.6045548654244308e-06, + "logits/chosen": -1.5919575691223145, + "logits/rejected": 2.0901906490325928, + "logps/chosen": -286.54669189453125, + "logps/rejected": -470.29364013671875, + "loss": 1.8801, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.2827115058898926, + "rewards/margins": -1.2794893980026245, + "rewards/rejected": 0.9967778921127319, + "step": 155 + }, + { + "epoch": 0.09704510108864697, + "grad_norm": 38.15058135986328, + "learning_rate": 1.6149068322981367e-06, + "logits/chosen": 1.0436800718307495, + "logits/rejected": 5.138459205627441, + "logps/chosen": -467.70269775390625, + "logps/rejected": -661.43115234375, + "loss": 2.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 7.959029674530029, + "rewards/margins": 1.6471713781356812, + "rewards/rejected": 6.311858654022217, + "step": 156 + }, + { + "epoch": 0.09766718506998445, + "grad_norm": 53.46415710449219, + "learning_rate": 1.6252587991718427e-06, + "logits/chosen": -0.7396149039268494, + "logits/rejected": 4.500385284423828, + "logps/chosen": -277.4739990234375, + "logps/rejected": -541.1393432617188, + "loss": 4.5397, + "rewards/accuracies": 0.125, + "rewards/chosen": 4.4420905113220215, + "rewards/margins": -4.118373394012451, + "rewards/rejected": 8.560463905334473, + "step": 157 + }, + { + "epoch": 0.09828926905132193, + "grad_norm": 56.35954666137695, + "learning_rate": 1.635610766045549e-06, + "logits/chosen": -0.8244016170501709, + "logits/rejected": 2.7302236557006836, + "logps/chosen": -430.4935302734375, + "logps/rejected": -605.2032470703125, + "loss": 7.1248, + "rewards/accuracies": 0.25, + "rewards/chosen": 2.587373971939087, + "rewards/margins": -6.728273391723633, + "rewards/rejected": 9.31564712524414, + "step": 158 + }, + { + "epoch": 0.09891135303265941, + "grad_norm": 35.76237106323242, + "learning_rate": 1.645962732919255e-06, + "logits/chosen": 0.3610566258430481, + "logits/rejected": 4.236985206604004, + "logps/chosen": -247.51239013671875, + "logps/rejected": -494.441650390625, + "loss": 1.5019, + "rewards/accuracies": 0.625, + "rewards/chosen": 2.1166188716888428, + "rewards/margins": 0.9880322217941284, + "rewards/rejected": 1.1285866498947144, + "step": 159 + }, + { + "epoch": 0.09953343701399689, + "grad_norm": 33.57570266723633, + "learning_rate": 1.6563146997929607e-06, + "logits/chosen": 0.2717881500720978, + "logits/rejected": 3.3716044425964355, + "logps/chosen": -386.5144348144531, + "logps/rejected": -545.7966918945312, + "loss": 0.6394, + "rewards/accuracies": 0.625, + "rewards/chosen": 3.3963658809661865, + "rewards/margins": 1.2866623401641846, + "rewards/rejected": 2.109703779220581, + "step": 160 + }, + { + "epoch": 0.10015552099533437, + "grad_norm": 41.08350372314453, + "learning_rate": 1.6666666666666667e-06, + "logits/chosen": 0.4015045762062073, + "logits/rejected": 2.754533052444458, + "logps/chosen": -514.9000244140625, + "logps/rejected": -614.388916015625, + "loss": 1.4105, + "rewards/accuracies": 0.25, + "rewards/chosen": 4.394747257232666, + "rewards/margins": -0.1185079962015152, + "rewards/rejected": 4.5132551193237305, + "step": 161 + }, + { + "epoch": 0.10077760497667185, + "grad_norm": 30.24134635925293, + "learning_rate": 1.6770186335403729e-06, + "logits/chosen": 3.017308235168457, + "logits/rejected": 5.260052680969238, + "logps/chosen": -510.5013427734375, + "logps/rejected": -667.1070556640625, + "loss": 1.3398, + "rewards/accuracies": 0.625, + "rewards/chosen": 7.266972541809082, + "rewards/margins": 4.063735485076904, + "rewards/rejected": 3.2032370567321777, + "step": 162 + }, + { + "epoch": 0.10139968895800933, + "grad_norm": 53.645294189453125, + "learning_rate": 1.6873706004140788e-06, + "logits/chosen": 1.5133259296417236, + "logits/rejected": 3.539081573486328, + "logps/chosen": -507.55078125, + "logps/rejected": -652.8729248046875, + "loss": 2.9854, + "rewards/accuracies": 0.5, + "rewards/chosen": 6.406628608703613, + "rewards/margins": -1.5444564819335938, + "rewards/rejected": 7.951085567474365, + "step": 163 + }, + { + "epoch": 0.1020217729393468, + "grad_norm": 18.941761016845703, + "learning_rate": 1.6977225672877848e-06, + "logits/chosen": 1.6392831802368164, + "logits/rejected": 3.4583234786987305, + "logps/chosen": -351.8995666503906, + "logps/rejected": -475.34564208984375, + "loss": 0.6281, + "rewards/accuracies": 0.875, + "rewards/chosen": 5.116458892822266, + "rewards/margins": 4.187784671783447, + "rewards/rejected": 0.9286739826202393, + "step": 164 + }, + { + "epoch": 0.1026438569206843, + "grad_norm": 31.994827270507812, + "learning_rate": 1.7080745341614908e-06, + "logits/chosen": -1.536903738975525, + "logits/rejected": 1.0030691623687744, + "logps/chosen": -365.28912353515625, + "logps/rejected": -476.5758361816406, + "loss": 0.6919, + "rewards/accuracies": 0.625, + "rewards/chosen": 2.6238882541656494, + "rewards/margins": 2.114072799682617, + "rewards/rejected": 0.5098155736923218, + "step": 165 + }, + { + "epoch": 0.10326594090202178, + "grad_norm": 54.416507720947266, + "learning_rate": 1.718426501035197e-06, + "logits/chosen": 0.15935613214969635, + "logits/rejected": 5.428621292114258, + "logps/chosen": -346.4788818359375, + "logps/rejected": -621.9774169921875, + "loss": 3.0793, + "rewards/accuracies": 0.25, + "rewards/chosen": 5.3431501388549805, + "rewards/margins": -1.2562695741653442, + "rewards/rejected": 6.599419593811035, + "step": 166 + }, + { + "epoch": 0.10388802488335926, + "grad_norm": 56.326568603515625, + "learning_rate": 1.7287784679089028e-06, + "logits/chosen": 0.3369348645210266, + "logits/rejected": 4.614073276519775, + "logps/chosen": -442.0284423828125, + "logps/rejected": -650.9850463867188, + "loss": 5.0816, + "rewards/accuracies": 0.25, + "rewards/chosen": 3.0569558143615723, + "rewards/margins": -3.6081981658935547, + "rewards/rejected": 6.665153503417969, + "step": 167 + }, + { + "epoch": 0.10451010886469674, + "grad_norm": 43.598472595214844, + "learning_rate": 1.7391304347826088e-06, + "logits/chosen": 1.4294483661651611, + "logits/rejected": 3.848559617996216, + "logps/chosen": -428.77825927734375, + "logps/rejected": -571.7686157226562, + "loss": 1.8859, + "rewards/accuracies": 0.5, + "rewards/chosen": 2.6971874237060547, + "rewards/margins": 0.7028264403343201, + "rewards/rejected": 1.9943610429763794, + "step": 168 + }, + { + "epoch": 0.10513219284603421, + "grad_norm": 33.3597412109375, + "learning_rate": 1.7494824016563147e-06, + "logits/chosen": 0.0816267803311348, + "logits/rejected": 3.4725728034973145, + "logps/chosen": -398.22998046875, + "logps/rejected": -560.8500366210938, + "loss": 2.7753, + "rewards/accuracies": 0.625, + "rewards/chosen": 2.8912577629089355, + "rewards/margins": 0.11649256944656372, + "rewards/rejected": 2.7747652530670166, + "step": 169 + }, + { + "epoch": 0.1057542768273717, + "grad_norm": 22.159027099609375, + "learning_rate": 1.759834368530021e-06, + "logits/chosen": 1.665263056755066, + "logits/rejected": 1.8744968175888062, + "logps/chosen": -511.35595703125, + "logps/rejected": -522.4755249023438, + "loss": 0.3666, + "rewards/accuracies": 0.875, + "rewards/chosen": 5.932845115661621, + "rewards/margins": 3.839456081390381, + "rewards/rejected": 2.093388557434082, + "step": 170 + }, + { + "epoch": 0.10637636080870917, + "grad_norm": 32.081825256347656, + "learning_rate": 1.770186335403727e-06, + "logits/chosen": -2.3348941802978516, + "logits/rejected": 3.1992275714874268, + "logps/chosen": -373.0357666015625, + "logps/rejected": -653.9948120117188, + "loss": 1.5876, + "rewards/accuracies": 0.75, + "rewards/chosen": 3.7468457221984863, + "rewards/margins": 1.3742190599441528, + "rewards/rejected": 2.372626781463623, + "step": 171 + }, + { + "epoch": 0.10699844479004665, + "grad_norm": 34.0040397644043, + "learning_rate": 1.780538302277433e-06, + "logits/chosen": 1.7993035316467285, + "logits/rejected": 4.404562950134277, + "logps/chosen": -417.0467529296875, + "logps/rejected": -575.447021484375, + "loss": 2.1066, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.4859135150909424, + "rewards/margins": 2.4089438915252686, + "rewards/rejected": 0.07696938514709473, + "step": 172 + }, + { + "epoch": 0.10762052877138413, + "grad_norm": 24.10268783569336, + "learning_rate": 1.7908902691511387e-06, + "logits/chosen": 2.498624801635742, + "logits/rejected": 3.5662894248962402, + "logps/chosen": -599.74169921875, + "logps/rejected": -655.7711181640625, + "loss": 0.2948, + "rewards/accuracies": 0.75, + "rewards/chosen": 4.151212692260742, + "rewards/margins": 4.251218795776367, + "rewards/rejected": -0.10000598430633545, + "step": 173 + }, + { + "epoch": 0.10824261275272162, + "grad_norm": 51.85417556762695, + "learning_rate": 1.8012422360248449e-06, + "logits/chosen": -1.7430806159973145, + "logits/rejected": 3.780117988586426, + "logps/chosen": -350.8195495605469, + "logps/rejected": -647.4478149414062, + "loss": 3.9011, + "rewards/accuracies": 0.625, + "rewards/chosen": 2.512604236602783, + "rewards/margins": -1.8627538681030273, + "rewards/rejected": 4.375357627868652, + "step": 174 + }, + { + "epoch": 0.1088646967340591, + "grad_norm": 41.917449951171875, + "learning_rate": 1.8115942028985508e-06, + "logits/chosen": -1.1673665046691895, + "logits/rejected": 3.871638298034668, + "logps/chosen": -189.32254028320312, + "logps/rejected": -495.197998046875, + "loss": 4.4235, + "rewards/accuracies": 0.5, + "rewards/chosen": 1.9024066925048828, + "rewards/margins": -1.6625474691390991, + "rewards/rejected": 3.5649542808532715, + "step": 175 + }, + { + "epoch": 0.10948678071539658, + "grad_norm": 50.83918762207031, + "learning_rate": 1.8219461697722568e-06, + "logits/chosen": 0.5478426814079285, + "logits/rejected": 1.6672499179840088, + "logps/chosen": -499.48370361328125, + "logps/rejected": -573.6226806640625, + "loss": 3.8286, + "rewards/accuracies": 0.375, + "rewards/chosen": 3.3958325386047363, + "rewards/margins": -2.7034497261047363, + "rewards/rejected": 6.0992817878723145, + "step": 176 + }, + { + "epoch": 0.11010886469673406, + "grad_norm": 34.604827880859375, + "learning_rate": 1.832298136645963e-06, + "logits/chosen": 0.9112688302993774, + "logits/rejected": 4.5739922523498535, + "logps/chosen": -432.5416259765625, + "logps/rejected": -530.7010498046875, + "loss": 1.156, + "rewards/accuracies": 0.625, + "rewards/chosen": 3.8393948078155518, + "rewards/margins": 1.3490967750549316, + "rewards/rejected": 2.490297794342041, + "step": 177 + }, + { + "epoch": 0.11073094867807154, + "grad_norm": 39.49983215332031, + "learning_rate": 1.842650103519669e-06, + "logits/chosen": -0.8858106136322021, + "logits/rejected": 5.029402732849121, + "logps/chosen": -205.45779418945312, + "logps/rejected": -511.8585205078125, + "loss": 2.0445, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.839600682258606, + "rewards/margins": 1.14154851436615, + "rewards/rejected": -0.30194801092147827, + "step": 178 + }, + { + "epoch": 0.11135303265940902, + "grad_norm": 43.34393310546875, + "learning_rate": 1.8530020703933748e-06, + "logits/chosen": 2.5536022186279297, + "logits/rejected": 5.413421630859375, + "logps/chosen": -391.5511474609375, + "logps/rejected": -573.38818359375, + "loss": 1.7285, + "rewards/accuracies": 0.5, + "rewards/chosen": 4.927746295928955, + "rewards/margins": 0.048822566866874695, + "rewards/rejected": 4.878923416137695, + "step": 179 + }, + { + "epoch": 0.1119751166407465, + "grad_norm": 51.95783615112305, + "learning_rate": 1.8633540372670808e-06, + "logits/chosen": -1.940110206604004, + "logits/rejected": 2.500943183898926, + "logps/chosen": -327.7534484863281, + "logps/rejected": -551.9661865234375, + "loss": 5.6805, + "rewards/accuracies": 0.375, + "rewards/chosen": 5.075728416442871, + "rewards/margins": -2.9619293212890625, + "rewards/rejected": 8.037657737731934, + "step": 180 + }, + { + "epoch": 0.11259720062208398, + "grad_norm": 28.379240036010742, + "learning_rate": 1.873706004140787e-06, + "logits/chosen": 1.9032138586044312, + "logits/rejected": 4.713849067687988, + "logps/chosen": -477.2460632324219, + "logps/rejected": -636.798095703125, + "loss": 1.2699, + "rewards/accuracies": 0.75, + "rewards/chosen": 5.134786605834961, + "rewards/margins": 1.3110426664352417, + "rewards/rejected": 3.8237438201904297, + "step": 181 + }, + { + "epoch": 0.11321928460342146, + "grad_norm": 54.019630432128906, + "learning_rate": 1.884057971014493e-06, + "logits/chosen": -0.569671094417572, + "logits/rejected": 3.2655553817749023, + "logps/chosen": -367.45550537109375, + "logps/rejected": -637.9801635742188, + "loss": 1.8045, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.29579782485961914, + "rewards/margins": -0.03955581784248352, + "rewards/rejected": -0.25624197721481323, + "step": 182 + }, + { + "epoch": 0.11384136858475895, + "grad_norm": 50.58377456665039, + "learning_rate": 1.894409937888199e-06, + "logits/chosen": 0.5268037915229797, + "logits/rejected": 3.4074790477752686, + "logps/chosen": -357.352783203125, + "logps/rejected": -576.5067749023438, + "loss": 1.7159, + "rewards/accuracies": 0.625, + "rewards/chosen": 4.699305534362793, + "rewards/margins": 0.25702714920043945, + "rewards/rejected": 4.4422783851623535, + "step": 183 + }, + { + "epoch": 0.11446345256609643, + "grad_norm": 43.4469108581543, + "learning_rate": 1.904761904761905e-06, + "logits/chosen": 2.0469088554382324, + "logits/rejected": 4.830729007720947, + "logps/chosen": -409.3948669433594, + "logps/rejected": -563.2650146484375, + "loss": 1.6257, + "rewards/accuracies": 0.5, + "rewards/chosen": 3.6548261642456055, + "rewards/margins": 1.6007968187332153, + "rewards/rejected": 2.0540289878845215, + "step": 184 + }, + { + "epoch": 0.1150855365474339, + "grad_norm": 49.39980697631836, + "learning_rate": 1.915113871635611e-06, + "logits/chosen": 1.1279759407043457, + "logits/rejected": 3.980091094970703, + "logps/chosen": -389.1132507324219, + "logps/rejected": -566.9024658203125, + "loss": 2.6029, + "rewards/accuracies": 0.25, + "rewards/chosen": 4.677840232849121, + "rewards/margins": -1.6894056797027588, + "rewards/rejected": 6.367246150970459, + "step": 185 + }, + { + "epoch": 0.11570762052877138, + "grad_norm": 30.314416885375977, + "learning_rate": 1.925465838509317e-06, + "logits/chosen": 1.9638819694519043, + "logits/rejected": 4.244125843048096, + "logps/chosen": -455.41754150390625, + "logps/rejected": -591.694580078125, + "loss": 1.458, + "rewards/accuracies": 0.75, + "rewards/chosen": 3.3089189529418945, + "rewards/margins": 1.6026175022125244, + "rewards/rejected": 1.7063013315200806, + "step": 186 + }, + { + "epoch": 0.11632970451010886, + "grad_norm": 36.3062858581543, + "learning_rate": 1.935817805383023e-06, + "logits/chosen": -0.3030480146408081, + "logits/rejected": 3.8223390579223633, + "logps/chosen": -343.2451171875, + "logps/rejected": -557.1092529296875, + "loss": 1.0118, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.1318135261535645, + "rewards/margins": 2.583085536956787, + "rewards/rejected": -0.45127207040786743, + "step": 187 + }, + { + "epoch": 0.11695178849144634, + "grad_norm": 34.68586349487305, + "learning_rate": 1.9461697722567286e-06, + "logits/chosen": -0.018244266510009766, + "logits/rejected": 3.894641876220703, + "logps/chosen": -376.0664978027344, + "logps/rejected": -571.3930053710938, + "loss": 1.505, + "rewards/accuracies": 0.5, + "rewards/chosen": 5.088592052459717, + "rewards/margins": 0.9989587664604187, + "rewards/rejected": 4.089632987976074, + "step": 188 + }, + { + "epoch": 0.11757387247278382, + "grad_norm": 41.42873764038086, + "learning_rate": 1.956521739130435e-06, + "logits/chosen": 3.0823469161987305, + "logits/rejected": 4.817437171936035, + "logps/chosen": -481.16162109375, + "logps/rejected": -586.5604858398438, + "loss": 0.8623, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.9342479705810547, + "rewards/margins": 0.728072464466095, + "rewards/rejected": -2.662320375442505, + "step": 189 + }, + { + "epoch": 0.1181959564541213, + "grad_norm": 70.27243041992188, + "learning_rate": 1.966873706004141e-06, + "logits/chosen": 2.949324369430542, + "logits/rejected": 3.5138602256774902, + "logps/chosen": -627.6104125976562, + "logps/rejected": -615.1279907226562, + "loss": 2.0358, + "rewards/accuracies": 0.625, + "rewards/chosen": 2.9750401973724365, + "rewards/margins": 3.873018264770508, + "rewards/rejected": -0.8979783654212952, + "step": 190 + }, + { + "epoch": 0.1188180404354588, + "grad_norm": 49.11233901977539, + "learning_rate": 1.977225672877847e-06, + "logits/chosen": -0.8565497994422913, + "logits/rejected": 4.336215972900391, + "logps/chosen": -288.052978515625, + "logps/rejected": -581.88330078125, + "loss": 4.2047, + "rewards/accuracies": 0.5, + "rewards/chosen": 3.6924588680267334, + "rewards/margins": -1.4131442308425903, + "rewards/rejected": 5.105603218078613, + "step": 191 + }, + { + "epoch": 0.11944012441679627, + "grad_norm": 54.367671966552734, + "learning_rate": 1.987577639751553e-06, + "logits/chosen": -1.6013593673706055, + "logits/rejected": 3.676778554916382, + "logps/chosen": -427.30792236328125, + "logps/rejected": -734.0960083007812, + "loss": 1.9657, + "rewards/accuracies": 0.25, + "rewards/chosen": 3.4305613040924072, + "rewards/margins": -1.0722615718841553, + "rewards/rejected": 4.5028228759765625, + "step": 192 + }, + { + "epoch": 0.12006220839813375, + "grad_norm": 47.42571258544922, + "learning_rate": 1.997929606625259e-06, + "logits/chosen": 0.7500506639480591, + "logits/rejected": 4.990383625030518, + "logps/chosen": -360.7723388671875, + "logps/rejected": -608.65625, + "loss": 3.3685, + "rewards/accuracies": 0.5, + "rewards/chosen": 2.2373976707458496, + "rewards/margins": -0.7982468605041504, + "rewards/rejected": 3.035644769668579, + "step": 193 + }, + { + "epoch": 0.12068429237947123, + "grad_norm": 33.1822395324707, + "learning_rate": 2.008281573498965e-06, + "logits/chosen": 0.6219146847724915, + "logits/rejected": 2.929879665374756, + "logps/chosen": -460.13238525390625, + "logps/rejected": -706.8247680664062, + "loss": 2.046, + "rewards/accuracies": 0.75, + "rewards/chosen": 3.59297513961792, + "rewards/margins": 1.675492286682129, + "rewards/rejected": 1.9174827337265015, + "step": 194 + }, + { + "epoch": 0.12130637636080871, + "grad_norm": 51.21638488769531, + "learning_rate": 2.018633540372671e-06, + "logits/chosen": -1.0155105590820312, + "logits/rejected": 1.4087202548980713, + "logps/chosen": -385.1896667480469, + "logps/rejected": -559.8251953125, + "loss": 2.8099, + "rewards/accuracies": 0.375, + "rewards/chosen": 3.893887996673584, + "rewards/margins": -0.482464075088501, + "rewards/rejected": 4.376351833343506, + "step": 195 + }, + { + "epoch": 0.12192846034214619, + "grad_norm": 46.816688537597656, + "learning_rate": 2.028985507246377e-06, + "logits/chosen": 1.821613073348999, + "logits/rejected": 3.8777379989624023, + "logps/chosen": -552.5363159179688, + "logps/rejected": -630.4986572265625, + "loss": 1.2842, + "rewards/accuracies": 0.5, + "rewards/chosen": 1.8579941987991333, + "rewards/margins": 0.636164128780365, + "rewards/rejected": 1.221830129623413, + "step": 196 + }, + { + "epoch": 0.12255054432348367, + "grad_norm": 3.2378439903259277, + "learning_rate": 2.039337474120083e-06, + "logits/chosen": 1.5373815298080444, + "logits/rejected": 5.254292964935303, + "logps/chosen": -366.4618835449219, + "logps/rejected": -528.86865234375, + "loss": 0.1328, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.522764205932617, + "rewards/margins": 3.405061721801758, + "rewards/rejected": 0.11770275235176086, + "step": 197 + }, + { + "epoch": 0.12317262830482115, + "grad_norm": 34.82770538330078, + "learning_rate": 2.049689440993789e-06, + "logits/chosen": 3.3856725692749023, + "logits/rejected": 1.8890368938446045, + "logps/chosen": -570.4473876953125, + "logps/rejected": -554.4201049804688, + "loss": 0.7851, + "rewards/accuracies": 0.625, + "rewards/chosen": 3.8009650707244873, + "rewards/margins": 3.108883857727051, + "rewards/rejected": 0.692081093788147, + "step": 198 + }, + { + "epoch": 0.12379471228615863, + "grad_norm": 50.189727783203125, + "learning_rate": 2.060041407867495e-06, + "logits/chosen": 1.0152267217636108, + "logits/rejected": 3.6783008575439453, + "logps/chosen": -494.78271484375, + "logps/rejected": -646.5352783203125, + "loss": 4.7003, + "rewards/accuracies": 0.5, + "rewards/chosen": 5.196945667266846, + "rewards/margins": -2.769010543823242, + "rewards/rejected": 7.965957164764404, + "step": 199 + }, + { + "epoch": 0.12441679626749612, + "grad_norm": 30.09671974182129, + "learning_rate": 2.0703933747412013e-06, + "logits/chosen": 0.6837999820709229, + "logits/rejected": 3.201636552810669, + "logps/chosen": -420.2381591796875, + "logps/rejected": -556.3150634765625, + "loss": 0.4249, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.953427791595459, + "rewards/margins": 3.2480907440185547, + "rewards/rejected": -0.2946627736091614, + "step": 200 + }, + { + "epoch": 0.12503888024883358, + "grad_norm": 45.81303405761719, + "learning_rate": 2.0807453416149073e-06, + "logits/chosen": 1.7944955825805664, + "logits/rejected": 5.481308460235596, + "logps/chosen": -430.08514404296875, + "logps/rejected": -581.4287109375, + "loss": 2.692, + "rewards/accuracies": 0.5, + "rewards/chosen": 3.2093420028686523, + "rewards/margins": 0.14378416538238525, + "rewards/rejected": 3.0655577182769775, + "step": 201 + }, + { + "epoch": 0.12566096423017106, + "grad_norm": 43.14478302001953, + "learning_rate": 2.091097308488613e-06, + "logits/chosen": -1.7328505516052246, + "logits/rejected": 2.6943445205688477, + "logps/chosen": -396.7474670410156, + "logps/rejected": -666.2418212890625, + "loss": 1.991, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.7151708602905273, + "rewards/margins": 0.4704517722129822, + "rewards/rejected": 0.2447190284729004, + "step": 202 + }, + { + "epoch": 0.12628304821150854, + "grad_norm": 31.574993133544922, + "learning_rate": 2.101449275362319e-06, + "logits/chosen": 1.9355977773666382, + "logits/rejected": 3.36230206489563, + "logps/chosen": -495.40789794921875, + "logps/rejected": -570.6536865234375, + "loss": 0.9764, + "rewards/accuracies": 0.75, + "rewards/chosen": 3.350783586502075, + "rewards/margins": 3.221501111984253, + "rewards/rejected": 0.12928247451782227, + "step": 203 + }, + { + "epoch": 0.12690513219284602, + "grad_norm": 63.032718658447266, + "learning_rate": 2.111801242236025e-06, + "logits/chosen": -1.5227994918823242, + "logits/rejected": 4.453179359436035, + "logps/chosen": -373.2676086425781, + "logps/rejected": -745.5072021484375, + "loss": 4.4336, + "rewards/accuracies": 0.375, + "rewards/chosen": 4.488953590393066, + "rewards/margins": -2.0308337211608887, + "rewards/rejected": 6.519787788391113, + "step": 204 + }, + { + "epoch": 0.12752721617418353, + "grad_norm": 22.824756622314453, + "learning_rate": 2.122153209109731e-06, + "logits/chosen": 0.824635922908783, + "logits/rejected": 2.8972008228302, + "logps/chosen": -390.86981201171875, + "logps/rejected": -515.5609130859375, + "loss": 0.2428, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06970125436782837, + "rewards/margins": 2.518465518951416, + "rewards/rejected": -2.4487640857696533, + "step": 205 + }, + { + "epoch": 0.128149300155521, + "grad_norm": 24.64143180847168, + "learning_rate": 2.132505175983437e-06, + "logits/chosen": 0.700970470905304, + "logits/rejected": 2.4702115058898926, + "logps/chosen": -401.08013916015625, + "logps/rejected": -505.01715087890625, + "loss": 0.3995, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.6803672313690186, + "rewards/margins": 3.9768378734588623, + "rewards/rejected": -1.2964705228805542, + "step": 206 + }, + { + "epoch": 0.12877138413685849, + "grad_norm": 52.84967803955078, + "learning_rate": 2.1428571428571427e-06, + "logits/chosen": 2.6100564002990723, + "logits/rejected": 5.6528754234313965, + "logps/chosen": -531.5084228515625, + "logps/rejected": -727.681640625, + "loss": 1.454, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.112928032875061, + "rewards/margins": 1.359449863433838, + "rewards/rejected": -2.4723777770996094, + "step": 207 + }, + { + "epoch": 0.12939346811819596, + "grad_norm": 39.599334716796875, + "learning_rate": 2.153209109730849e-06, + "logits/chosen": 1.1986247301101685, + "logits/rejected": 5.218608856201172, + "logps/chosen": -325.86175537109375, + "logps/rejected": -480.5209655761719, + "loss": 2.2942, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.030095696449279785, + "rewards/margins": 0.45811617374420166, + "rewards/rejected": -0.42802050709724426, + "step": 208 + }, + { + "epoch": 0.13001555209953344, + "grad_norm": 37.675941467285156, + "learning_rate": 2.163561076604555e-06, + "logits/chosen": -0.7307783365249634, + "logits/rejected": 2.09501576423645, + "logps/chosen": -386.4396667480469, + "logps/rejected": -580.3024291992188, + "loss": 1.6707, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.41714441776275635, + "rewards/margins": 2.4524335861206055, + "rewards/rejected": -2.0352892875671387, + "step": 209 + }, + { + "epoch": 0.13063763608087092, + "grad_norm": 20.327880859375, + "learning_rate": 2.173913043478261e-06, + "logits/chosen": 0.5417582988739014, + "logits/rejected": 2.157532215118408, + "logps/chosen": -552.3806762695312, + "logps/rejected": -637.8341064453125, + "loss": 0.2936, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.288459300994873, + "rewards/margins": 6.135288715362549, + "rewards/rejected": -2.846829652786255, + "step": 210 + }, + { + "epoch": 0.1312597200622084, + "grad_norm": 32.202301025390625, + "learning_rate": 2.184265010351967e-06, + "logits/chosen": 1.5024635791778564, + "logits/rejected": 4.253437519073486, + "logps/chosen": -503.56805419921875, + "logps/rejected": -695.8643798828125, + "loss": 1.1783, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.677678644657135, + "rewards/margins": 3.9930052757263184, + "rewards/rejected": -3.315326690673828, + "step": 211 + }, + { + "epoch": 0.13188180404354588, + "grad_norm": 31.536731719970703, + "learning_rate": 2.194616977225673e-06, + "logits/chosen": 1.5302878618240356, + "logits/rejected": 2.419198513031006, + "logps/chosen": -523.5919799804688, + "logps/rejected": -623.6431884765625, + "loss": 0.4977, + "rewards/accuracies": 0.75, + "rewards/chosen": 5.382325649261475, + "rewards/margins": 2.3979134559631348, + "rewards/rejected": 2.984412670135498, + "step": 212 + }, + { + "epoch": 0.13250388802488336, + "grad_norm": 47.228065490722656, + "learning_rate": 2.204968944099379e-06, + "logits/chosen": 1.0171109437942505, + "logits/rejected": 1.333390712738037, + "logps/chosen": -481.23822021484375, + "logps/rejected": -489.55426025390625, + "loss": 2.4562, + "rewards/accuracies": 0.5, + "rewards/chosen": 2.261629581451416, + "rewards/margins": 2.093576669692993, + "rewards/rejected": 0.16805295646190643, + "step": 213 + }, + { + "epoch": 0.13312597200622084, + "grad_norm": 47.21223831176758, + "learning_rate": 2.215320910973085e-06, + "logits/chosen": 1.9000352621078491, + "logits/rejected": 4.789727687835693, + "logps/chosen": -478.86102294921875, + "logps/rejected": -640.8873901367188, + "loss": 1.6637, + "rewards/accuracies": 0.375, + "rewards/chosen": 2.010389804840088, + "rewards/margins": 0.16780626773834229, + "rewards/rejected": 1.842583417892456, + "step": 214 + }, + { + "epoch": 0.13374805598755832, + "grad_norm": 41.62903594970703, + "learning_rate": 2.225672877846791e-06, + "logits/chosen": -0.524249792098999, + "logits/rejected": 3.3108348846435547, + "logps/chosen": -439.0601501464844, + "logps/rejected": -683.864013671875, + "loss": 2.1058, + "rewards/accuracies": 0.625, + "rewards/chosen": 2.8111445903778076, + "rewards/margins": 2.1104533672332764, + "rewards/rejected": 0.7006913423538208, + "step": 215 + }, + { + "epoch": 0.1343701399688958, + "grad_norm": 37.27708435058594, + "learning_rate": 2.236024844720497e-06, + "logits/chosen": -1.3696495294570923, + "logits/rejected": 0.9015498161315918, + "logps/chosen": -392.2905578613281, + "logps/rejected": -514.0640258789062, + "loss": 1.8766, + "rewards/accuracies": 0.5, + "rewards/chosen": 1.9762678146362305, + "rewards/margins": 3.288834810256958, + "rewards/rejected": -1.3125672340393066, + "step": 216 + }, + { + "epoch": 0.13499222395023328, + "grad_norm": 46.841278076171875, + "learning_rate": 2.246376811594203e-06, + "logits/chosen": -1.1116505861282349, + "logits/rejected": 3.4078965187072754, + "logps/chosen": -318.01605224609375, + "logps/rejected": -635.120361328125, + "loss": 2.2848, + "rewards/accuracies": 0.5, + "rewards/chosen": 2.1768715381622314, + "rewards/margins": -0.572177529335022, + "rewards/rejected": 2.749049186706543, + "step": 217 + }, + { + "epoch": 0.13561430793157075, + "grad_norm": 39.24924087524414, + "learning_rate": 2.256728778467909e-06, + "logits/chosen": 1.5476105213165283, + "logits/rejected": 3.4906458854675293, + "logps/chosen": -322.1094055175781, + "logps/rejected": -462.31683349609375, + "loss": 0.973, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4564497470855713, + "rewards/margins": 3.5915491580963135, + "rewards/rejected": -4.047998905181885, + "step": 218 + }, + { + "epoch": 0.13623639191290823, + "grad_norm": 60.18535614013672, + "learning_rate": 2.2670807453416154e-06, + "logits/chosen": 0.6108388900756836, + "logits/rejected": 2.967494249343872, + "logps/chosen": -531.0944213867188, + "logps/rejected": -751.517822265625, + "loss": 2.4948, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.0347089767456055, + "rewards/margins": 1.0422743558883667, + "rewards/rejected": -2.0769832134246826, + "step": 219 + }, + { + "epoch": 0.1368584758942457, + "grad_norm": 24.164648056030273, + "learning_rate": 2.277432712215321e-06, + "logits/chosen": -0.36401891708374023, + "logits/rejected": 1.6827431917190552, + "logps/chosen": -440.4677429199219, + "logps/rejected": -560.8984985351562, + "loss": 0.5672, + "rewards/accuracies": 0.875, + "rewards/chosen": 3.0978987216949463, + "rewards/margins": 3.5617518424987793, + "rewards/rejected": -0.463853120803833, + "step": 220 + }, + { + "epoch": 0.1374805598755832, + "grad_norm": 49.610164642333984, + "learning_rate": 2.287784679089027e-06, + "logits/chosen": 0.010624885559082031, + "logits/rejected": 3.5907387733459473, + "logps/chosen": -405.98388671875, + "logps/rejected": -642.1790771484375, + "loss": 2.0901, + "rewards/accuracies": 0.5, + "rewards/chosen": 3.899686336517334, + "rewards/margins": 0.45490002632141113, + "rewards/rejected": 3.4447860717773438, + "step": 221 + }, + { + "epoch": 0.13810264385692067, + "grad_norm": 17.924177169799805, + "learning_rate": 2.298136645962733e-06, + "logits/chosen": -0.9403358697891235, + "logits/rejected": 3.576866388320923, + "logps/chosen": -347.7789001464844, + "logps/rejected": -721.7254028320312, + "loss": 0.3152, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.649275779724121, + "rewards/margins": 7.130861282348633, + "rewards/rejected": -4.4815850257873535, + "step": 222 + }, + { + "epoch": 0.13872472783825818, + "grad_norm": 22.09856414794922, + "learning_rate": 2.3084886128364393e-06, + "logits/chosen": 0.05965060740709305, + "logits/rejected": 3.7791969776153564, + "logps/chosen": -300.701171875, + "logps/rejected": -541.962890625, + "loss": 0.4578, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.1633074283599854, + "rewards/margins": 2.8631277084350586, + "rewards/rejected": -0.699820339679718, + "step": 223 + }, + { + "epoch": 0.13934681181959566, + "grad_norm": 31.209260940551758, + "learning_rate": 2.3188405797101453e-06, + "logits/chosen": -5.668010711669922, + "logits/rejected": 1.2728283405303955, + "logps/chosen": -199.50497436523438, + "logps/rejected": -599.9317016601562, + "loss": 0.5324, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.009107232093811, + "rewards/margins": 5.485748291015625, + "rewards/rejected": -4.476640701293945, + "step": 224 + }, + { + "epoch": 0.13996889580093314, + "grad_norm": 40.985450744628906, + "learning_rate": 2.3291925465838513e-06, + "logits/chosen": -1.2980353832244873, + "logits/rejected": 5.021801471710205, + "logps/chosen": -314.845703125, + "logps/rejected": -743.640869140625, + "loss": 1.9684, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.9610698223114014, + "rewards/margins": 2.9930801391601562, + "rewards/rejected": -2.032010316848755, + "step": 225 + }, + { + "epoch": 0.14059097978227061, + "grad_norm": 40.860530853271484, + "learning_rate": 2.339544513457557e-06, + "logits/chosen": 1.379894495010376, + "logits/rejected": 2.6520724296569824, + "logps/chosen": -518.1561279296875, + "logps/rejected": -594.7689208984375, + "loss": 1.633, + "rewards/accuracies": 0.5, + "rewards/chosen": 1.6457796096801758, + "rewards/margins": 1.1583093404769897, + "rewards/rejected": 0.4874701499938965, + "step": 226 + }, + { + "epoch": 0.1412130637636081, + "grad_norm": 30.166248321533203, + "learning_rate": 2.3498964803312632e-06, + "logits/chosen": -0.5353786945343018, + "logits/rejected": 3.058781862258911, + "logps/chosen": -381.04534912109375, + "logps/rejected": -579.377685546875, + "loss": 0.4811, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.24283432960510254, + "rewards/margins": 3.317432165145874, + "rewards/rejected": -3.5602664947509766, + "step": 227 + }, + { + "epoch": 0.14183514774494557, + "grad_norm": 14.29391098022461, + "learning_rate": 2.3602484472049692e-06, + "logits/chosen": -0.5758196115493774, + "logits/rejected": 1.405975341796875, + "logps/chosen": -263.87591552734375, + "logps/rejected": -447.11566162109375, + "loss": 0.1026, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6491126418113708, + "rewards/margins": 6.495497226715088, + "rewards/rejected": -5.8463850021362305, + "step": 228 + }, + { + "epoch": 0.14245723172628305, + "grad_norm": 37.63568115234375, + "learning_rate": 2.370600414078675e-06, + "logits/chosen": -0.9808781147003174, + "logits/rejected": 3.0308525562286377, + "logps/chosen": -369.4408264160156, + "logps/rejected": -660.7260131835938, + "loss": 1.4419, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.0158448219299316, + "rewards/margins": 5.2338457107543945, + "rewards/rejected": -3.218001365661621, + "step": 229 + }, + { + "epoch": 0.14307931570762053, + "grad_norm": 10.795422554016113, + "learning_rate": 2.380952380952381e-06, + "logits/chosen": 1.0303987264633179, + "logits/rejected": 3.6234517097473145, + "logps/chosen": -420.68048095703125, + "logps/rejected": -619.9874877929688, + "loss": 0.16, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.7086896896362305, + "rewards/margins": 7.707023620605469, + "rewards/rejected": -5.998333930969238, + "step": 230 + }, + { + "epoch": 0.143701399688958, + "grad_norm": 37.661048889160156, + "learning_rate": 2.391304347826087e-06, + "logits/chosen": -1.1538281440734863, + "logits/rejected": 2.5236012935638428, + "logps/chosen": -396.6817932128906, + "logps/rejected": -621.38330078125, + "loss": 1.031, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.994320034980774, + "rewards/margins": 2.331658363342285, + "rewards/rejected": -0.3373383581638336, + "step": 231 + }, + { + "epoch": 0.1443234836702955, + "grad_norm": 35.17131423950195, + "learning_rate": 2.401656314699793e-06, + "logits/chosen": -1.1166588068008423, + "logits/rejected": 2.102498769760132, + "logps/chosen": -419.2410583496094, + "logps/rejected": -635.1260986328125, + "loss": 0.8231, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.8397369384765625, + "rewards/margins": 2.944187641143799, + "rewards/rejected": -2.1044509410858154, + "step": 232 + }, + { + "epoch": 0.14494556765163297, + "grad_norm": 25.272857666015625, + "learning_rate": 2.412008281573499e-06, + "logits/chosen": -0.6808191537857056, + "logits/rejected": 3.588482141494751, + "logps/chosen": -241.42205810546875, + "logps/rejected": -557.1651611328125, + "loss": 0.6014, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2021692991256714, + "rewards/margins": 5.207537651062012, + "rewards/rejected": -5.409707069396973, + "step": 233 + }, + { + "epoch": 0.14556765163297045, + "grad_norm": 43.97793960571289, + "learning_rate": 2.422360248447205e-06, + "logits/chosen": 1.0645995140075684, + "logits/rejected": 3.1753838062286377, + "logps/chosen": -477.3868713378906, + "logps/rejected": -616.861083984375, + "loss": 0.8906, + "rewards/accuracies": 0.625, + "rewards/chosen": 1.234439730644226, + "rewards/margins": 3.5022284984588623, + "rewards/rejected": -2.267788887023926, + "step": 234 + }, + { + "epoch": 0.14618973561430793, + "grad_norm": 14.174098014831543, + "learning_rate": 2.432712215320911e-06, + "logits/chosen": 2.2343997955322266, + "logits/rejected": 2.843075752258301, + "logps/chosen": -527.3372192382812, + "logps/rejected": -604.364990234375, + "loss": 0.1381, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.552929639816284, + "rewards/margins": 5.782426357269287, + "rewards/rejected": -3.229496479034424, + "step": 235 + }, + { + "epoch": 0.1468118195956454, + "grad_norm": 40.284629821777344, + "learning_rate": 2.443064182194617e-06, + "logits/chosen": -0.22108198702335358, + "logits/rejected": 2.564833641052246, + "logps/chosen": -368.523681640625, + "logps/rejected": -532.35400390625, + "loss": 0.6829, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9975924491882324, + "rewards/margins": 1.6895599365234375, + "rewards/rejected": 1.3080326318740845, + "step": 236 + }, + { + "epoch": 0.14743390357698288, + "grad_norm": 38.307769775390625, + "learning_rate": 2.453416149068323e-06, + "logits/chosen": -1.2599825859069824, + "logits/rejected": 2.4900829792022705, + "logps/chosen": -441.6986083984375, + "logps/rejected": -676.992431640625, + "loss": 1.8282, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.310348242521286, + "rewards/margins": 0.09642618894577026, + "rewards/rejected": -0.40677428245544434, + "step": 237 + }, + { + "epoch": 0.14805598755832036, + "grad_norm": 41.170799255371094, + "learning_rate": 2.4637681159420295e-06, + "logits/chosen": -0.8981081247329712, + "logits/rejected": 3.240385055541992, + "logps/chosen": -429.32147216796875, + "logps/rejected": -688.107666015625, + "loss": 0.8547, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.36102116107940674, + "rewards/margins": 6.074836730957031, + "rewards/rejected": -5.713814735412598, + "step": 238 + }, + { + "epoch": 0.14867807153965784, + "grad_norm": 38.23157501220703, + "learning_rate": 2.474120082815735e-06, + "logits/chosen": 1.0275741815567017, + "logits/rejected": 3.7468931674957275, + "logps/chosen": -617.2286987304688, + "logps/rejected": -790.6701049804688, + "loss": 0.5688, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11678469181060791, + "rewards/margins": 4.816810131072998, + "rewards/rejected": -4.70002555847168, + "step": 239 + }, + { + "epoch": 0.14930015552099535, + "grad_norm": 50.341251373291016, + "learning_rate": 2.484472049689441e-06, + "logits/chosen": -2.779496192932129, + "logits/rejected": 0.41580402851104736, + "logps/chosen": -301.6960144042969, + "logps/rejected": -558.6378173828125, + "loss": 1.0108, + "rewards/accuracies": 0.5, + "rewards/chosen": 1.6704089641571045, + "rewards/margins": 4.456121921539307, + "rewards/rejected": -2.7857131958007812, + "step": 240 + }, + { + "epoch": 0.14992223950233283, + "grad_norm": 34.48246765136719, + "learning_rate": 2.494824016563147e-06, + "logits/chosen": -0.310516357421875, + "logits/rejected": 1.8644804954528809, + "logps/chosen": -419.97955322265625, + "logps/rejected": -559.4046630859375, + "loss": 0.9705, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8145637512207031, + "rewards/margins": 1.7449901103973389, + "rewards/rejected": -3.559554100036621, + "step": 241 + }, + { + "epoch": 0.1505443234836703, + "grad_norm": 44.7969856262207, + "learning_rate": 2.5051759834368534e-06, + "logits/chosen": 2.063325881958008, + "logits/rejected": 1.8975759744644165, + "logps/chosen": -570.3998413085938, + "logps/rejected": -557.6295166015625, + "loss": 1.1162, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.317770481109619, + "rewards/margins": 1.6723326444625854, + "rewards/rejected": -5.990103244781494, + "step": 242 + }, + { + "epoch": 0.15116640746500778, + "grad_norm": 18.721473693847656, + "learning_rate": 2.515527950310559e-06, + "logits/chosen": -2.293026924133301, + "logits/rejected": 2.767634868621826, + "logps/chosen": -441.6069641113281, + "logps/rejected": -787.40771484375, + "loss": 0.174, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4177449345588684, + "rewards/margins": 5.593556880950928, + "rewards/rejected": -5.175811767578125, + "step": 243 + }, + { + "epoch": 0.15178849144634526, + "grad_norm": 19.432035446166992, + "learning_rate": 2.5258799171842654e-06, + "logits/chosen": 2.422300338745117, + "logits/rejected": 3.6136603355407715, + "logps/chosen": -622.0272827148438, + "logps/rejected": -721.8018798828125, + "loss": 0.1876, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7000467777252197, + "rewards/margins": 4.6907572746276855, + "rewards/rejected": -6.390804290771484, + "step": 244 + }, + { + "epoch": 0.15241057542768274, + "grad_norm": 38.153358459472656, + "learning_rate": 2.5362318840579714e-06, + "logits/chosen": 1.7388522624969482, + "logits/rejected": 4.493720531463623, + "logps/chosen": -551.026123046875, + "logps/rejected": -751.4261474609375, + "loss": 0.7534, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.3347375392913818, + "rewards/margins": 4.540463447570801, + "rewards/rejected": -3.205725908279419, + "step": 245 + }, + { + "epoch": 0.15303265940902022, + "grad_norm": 43.89891052246094, + "learning_rate": 2.546583850931677e-06, + "logits/chosen": -0.849528431892395, + "logits/rejected": 1.535990834236145, + "logps/chosen": -399.88470458984375, + "logps/rejected": -579.0242919921875, + "loss": 0.8152, + "rewards/accuracies": 0.5, + "rewards/chosen": -3.5337576866149902, + "rewards/margins": 4.215351104736328, + "rewards/rejected": -7.749109268188477, + "step": 246 + }, + { + "epoch": 0.1536547433903577, + "grad_norm": 42.65324783325195, + "learning_rate": 2.5569358178053833e-06, + "logits/chosen": 1.0358763933181763, + "logits/rejected": 3.2417514324188232, + "logps/chosen": -489.2657470703125, + "logps/rejected": -644.57080078125, + "loss": 0.8755, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1160585880279541, + "rewards/margins": 3.223320960998535, + "rewards/rejected": -3.3393797874450684, + "step": 247 + }, + { + "epoch": 0.15427682737169518, + "grad_norm": 51.01871871948242, + "learning_rate": 2.5672877846790893e-06, + "logits/chosen": 0.9451044797897339, + "logits/rejected": 2.965320110321045, + "logps/chosen": -430.13543701171875, + "logps/rejected": -588.0543823242188, + "loss": 0.9527, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4361051321029663, + "rewards/margins": 5.314112663269043, + "rewards/rejected": -4.878007888793945, + "step": 248 + }, + { + "epoch": 0.15489891135303266, + "grad_norm": 46.12874221801758, + "learning_rate": 2.5776397515527953e-06, + "logits/chosen": -1.0910335779190063, + "logits/rejected": 3.0898303985595703, + "logps/chosen": -381.70159912109375, + "logps/rejected": -687.109619140625, + "loss": 1.0711, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.315556764602661, + "rewards/margins": 7.379352569580078, + "rewards/rejected": -5.063795566558838, + "step": 249 + }, + { + "epoch": 0.15552099533437014, + "grad_norm": 2.26055908203125, + "learning_rate": 2.5879917184265013e-06, + "logits/chosen": 0.7546824216842651, + "logits/rejected": 3.479255199432373, + "logps/chosen": -389.37835693359375, + "logps/rejected": -597.3038330078125, + "loss": 0.017, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3359346389770508, + "rewards/margins": 9.287576675415039, + "rewards/rejected": -7.951642036437988, + "step": 250 + }, + { + "epoch": 0.15614307931570762, + "grad_norm": 5.929549694061279, + "learning_rate": 2.598343685300207e-06, + "logits/chosen": 1.4317560195922852, + "logits/rejected": 3.4852263927459717, + "logps/chosen": -604.067138671875, + "logps/rejected": -810.948974609375, + "loss": 0.027, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.16350793838501, + "rewards/margins": 11.667287826538086, + "rewards/rejected": -15.830794334411621, + "step": 251 + }, + { + "epoch": 0.1567651632970451, + "grad_norm": 5.079573631286621, + "learning_rate": 2.6086956521739132e-06, + "logits/chosen": -0.28528928756713867, + "logits/rejected": 2.207214593887329, + "logps/chosen": -442.1741027832031, + "logps/rejected": -624.3016357421875, + "loss": 0.0309, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1780473291873932, + "rewards/margins": 9.567262649536133, + "rewards/rejected": -9.745308876037598, + "step": 252 + }, + { + "epoch": 0.15738724727838257, + "grad_norm": 1.077399730682373, + "learning_rate": 2.6190476190476192e-06, + "logits/chosen": 1.8529900312423706, + "logits/rejected": 3.7783148288726807, + "logps/chosen": -544.4069213867188, + "logps/rejected": -773.4401245117188, + "loss": 0.007, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3226267099380493, + "rewards/margins": 9.717878341674805, + "rewards/rejected": -9.395252227783203, + "step": 253 + }, + { + "epoch": 0.15800933125972005, + "grad_norm": 26.679351806640625, + "learning_rate": 2.629399585921325e-06, + "logits/chosen": 0.5402032136917114, + "logits/rejected": 3.1739237308502197, + "logps/chosen": -467.8306579589844, + "logps/rejected": -729.2216796875, + "loss": 0.2176, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.075988531112671, + "rewards/margins": 7.494318962097168, + "rewards/rejected": -10.570308685302734, + "step": 254 + }, + { + "epoch": 0.15863141524105753, + "grad_norm": 6.276568412780762, + "learning_rate": 2.639751552795031e-06, + "logits/chosen": -3.0313994884490967, + "logits/rejected": 1.6596317291259766, + "logps/chosen": -239.6669464111328, + "logps/rejected": -597.2471313476562, + "loss": 0.0365, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20734518766403198, + "rewards/margins": 10.459396362304688, + "rewards/rejected": -10.252050399780273, + "step": 255 + }, + { + "epoch": 0.159253499222395, + "grad_norm": 36.85567855834961, + "learning_rate": 2.6501035196687376e-06, + "logits/chosen": 0.3948754072189331, + "logits/rejected": 2.9974865913391113, + "logps/chosen": -479.9315490722656, + "logps/rejected": -652.0233154296875, + "loss": 0.5583, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.309106469154358, + "rewards/margins": 8.474274635314941, + "rewards/rejected": -7.165168285369873, + "step": 256 + }, + { + "epoch": 0.1598755832037325, + "grad_norm": 59.51966094970703, + "learning_rate": 2.660455486542443e-06, + "logits/chosen": 2.1631646156311035, + "logits/rejected": 3.3487162590026855, + "logps/chosen": -596.6227416992188, + "logps/rejected": -722.599365234375, + "loss": 2.4024, + "rewards/accuracies": 0.5, + "rewards/chosen": -5.933492660522461, + "rewards/margins": 0.4551329016685486, + "rewards/rejected": -6.388625144958496, + "step": 257 + }, + { + "epoch": 0.16049766718507, + "grad_norm": 39.466552734375, + "learning_rate": 2.670807453416149e-06, + "logits/chosen": -1.7184714078903198, + "logits/rejected": 2.2928924560546875, + "logps/chosen": -400.03253173828125, + "logps/rejected": -628.02587890625, + "loss": 0.9934, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7665202617645264, + "rewards/margins": 2.3955516815185547, + "rewards/rejected": -3.162071704864502, + "step": 258 + }, + { + "epoch": 0.16111975116640748, + "grad_norm": 5.647301197052002, + "learning_rate": 2.6811594202898555e-06, + "logits/chosen": -1.1122512817382812, + "logits/rejected": 2.666635274887085, + "logps/chosen": -352.390380859375, + "logps/rejected": -645.776611328125, + "loss": 0.0418, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1382899284362793, + "rewards/margins": 8.056615829467773, + "rewards/rejected": -9.194905281066895, + "step": 259 + }, + { + "epoch": 0.16174183514774496, + "grad_norm": 22.67473602294922, + "learning_rate": 2.691511387163561e-06, + "logits/chosen": -0.1789799928665161, + "logits/rejected": 2.045513153076172, + "logps/chosen": -465.56683349609375, + "logps/rejected": -668.13330078125, + "loss": 0.3468, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.058413505554199, + "rewards/margins": 9.895721435546875, + "rewards/rejected": -12.95413589477539, + "step": 260 + }, + { + "epoch": 0.16236391912908243, + "grad_norm": 0.010524489916861057, + "learning_rate": 2.7018633540372675e-06, + "logits/chosen": 0.737421452999115, + "logits/rejected": 3.0371365547180176, + "logps/chosen": -520.680908203125, + "logps/rejected": -735.24560546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17118918895721436, + "rewards/margins": 14.252211570739746, + "rewards/rejected": -14.081022262573242, + "step": 261 + }, + { + "epoch": 0.1629860031104199, + "grad_norm": 51.21318054199219, + "learning_rate": 2.712215320910973e-06, + "logits/chosen": -0.729854166507721, + "logits/rejected": 3.941812038421631, + "logps/chosen": -443.6608581542969, + "logps/rejected": -738.9736938476562, + "loss": 1.436, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.1441855430603027, + "rewards/margins": 5.385406494140625, + "rewards/rejected": -8.52959156036377, + "step": 262 + }, + { + "epoch": 0.1636080870917574, + "grad_norm": 41.15818786621094, + "learning_rate": 2.7225672877846795e-06, + "logits/chosen": 1.1326603889465332, + "logits/rejected": 3.168422222137451, + "logps/chosen": -544.2522583007812, + "logps/rejected": -729.4052734375, + "loss": 0.7476, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.015408515930176, + "rewards/margins": 8.209056854248047, + "rewards/rejected": -13.224465370178223, + "step": 263 + }, + { + "epoch": 0.16423017107309487, + "grad_norm": 40.650020599365234, + "learning_rate": 2.7329192546583855e-06, + "logits/chosen": -0.14138327538967133, + "logits/rejected": -1.1530615091323853, + "logps/chosen": -486.3907775878906, + "logps/rejected": -504.61163330078125, + "loss": 1.7413, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.723165512084961, + "rewards/margins": 3.4709882736206055, + "rewards/rejected": -7.194153785705566, + "step": 264 + }, + { + "epoch": 0.16485225505443235, + "grad_norm": 8.631354331970215, + "learning_rate": 2.743271221532091e-06, + "logits/chosen": -1.6873260736465454, + "logits/rejected": 3.191648006439209, + "logps/chosen": -309.7070617675781, + "logps/rejected": -606.5875244140625, + "loss": 0.0498, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8784593343734741, + "rewards/margins": 10.098764419555664, + "rewards/rejected": -11.977224349975586, + "step": 265 + }, + { + "epoch": 0.16547433903576983, + "grad_norm": 35.53109359741211, + "learning_rate": 2.7536231884057974e-06, + "logits/chosen": -2.07094669342041, + "logits/rejected": 2.508565902709961, + "logps/chosen": -418.36077880859375, + "logps/rejected": -844.75146484375, + "loss": 0.5531, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.0940752029418945, + "rewards/margins": 11.74495792388916, + "rewards/rejected": -15.839033126831055, + "step": 266 + }, + { + "epoch": 0.1660964230171073, + "grad_norm": 17.228307723999023, + "learning_rate": 2.7639751552795034e-06, + "logits/chosen": 0.33517616987228394, + "logits/rejected": 3.5891106128692627, + "logps/chosen": -434.30438232421875, + "logps/rejected": -723.2798461914062, + "loss": 0.163, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.3649284839630127, + "rewards/margins": 8.922518730163574, + "rewards/rejected": -6.557590484619141, + "step": 267 + }, + { + "epoch": 0.1667185069984448, + "grad_norm": 43.660179138183594, + "learning_rate": 2.7743271221532094e-06, + "logits/chosen": 0.725469708442688, + "logits/rejected": 3.846705198287964, + "logps/chosen": -609.2630615234375, + "logps/rejected": -882.2078857421875, + "loss": 0.8438, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.523108959197998, + "rewards/margins": 6.391105651855469, + "rewards/rejected": -9.914214134216309, + "step": 268 + }, + { + "epoch": 0.16734059097978227, + "grad_norm": 11.92179012298584, + "learning_rate": 2.7846790890269154e-06, + "logits/chosen": 0.1969280242919922, + "logits/rejected": 4.658049583435059, + "logps/chosen": -520.454345703125, + "logps/rejected": -818.27294921875, + "loss": 0.1198, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.863990306854248, + "rewards/margins": 4.931917667388916, + "rewards/rejected": -5.795907974243164, + "step": 269 + }, + { + "epoch": 0.16796267496111975, + "grad_norm": 44.41823196411133, + "learning_rate": 2.795031055900621e-06, + "logits/chosen": 0.10651260614395142, + "logits/rejected": 3.5432283878326416, + "logps/chosen": -487.7990417480469, + "logps/rejected": -717.2167358398438, + "loss": 1.3901, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1005728244781494, + "rewards/margins": 10.617042541503906, + "rewards/rejected": -12.717616081237793, + "step": 270 + }, + { + "epoch": 0.16858475894245722, + "grad_norm": 33.92555618286133, + "learning_rate": 2.8053830227743273e-06, + "logits/chosen": 0.07751777768135071, + "logits/rejected": 4.2295098304748535, + "logps/chosen": -369.48114013671875, + "logps/rejected": -573.50732421875, + "loss": 0.823, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.756532192230225, + "rewards/margins": 1.9096379280090332, + "rewards/rejected": -6.666170120239258, + "step": 271 + }, + { + "epoch": 0.1692068429237947, + "grad_norm": 27.594270706176758, + "learning_rate": 2.8157349896480333e-06, + "logits/chosen": 0.10852780938148499, + "logits/rejected": 4.689783096313477, + "logps/chosen": -429.4493103027344, + "logps/rejected": -787.86865234375, + "loss": 0.2497, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5277105569839478, + "rewards/margins": 7.960251808166504, + "rewards/rejected": -9.48796272277832, + "step": 272 + }, + { + "epoch": 0.16982892690513218, + "grad_norm": 1.3322811126708984, + "learning_rate": 2.8260869565217393e-06, + "logits/chosen": -1.8472188711166382, + "logits/rejected": 2.6371984481811523, + "logps/chosen": -345.05682373046875, + "logps/rejected": -702.9637451171875, + "loss": 0.0172, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7659844160079956, + "rewards/margins": 12.975811004638672, + "rewards/rejected": -12.209827423095703, + "step": 273 + }, + { + "epoch": 0.17045101088646966, + "grad_norm": 34.784027099609375, + "learning_rate": 2.8364389233954453e-06, + "logits/chosen": 0.351703405380249, + "logits/rejected": 2.1608669757843018, + "logps/chosen": -389.4510803222656, + "logps/rejected": -599.7803344726562, + "loss": 1.4058, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.442836284637451, + "rewards/margins": 4.768312454223633, + "rewards/rejected": -9.21114730834961, + "step": 274 + }, + { + "epoch": 0.17107309486780714, + "grad_norm": 44.48931884765625, + "learning_rate": 2.8467908902691517e-06, + "logits/chosen": 1.1296827793121338, + "logits/rejected": 4.558327674865723, + "logps/chosen": -466.94305419921875, + "logps/rejected": -747.0626220703125, + "loss": 0.4661, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.572750091552734, + "rewards/margins": 4.540435314178467, + "rewards/rejected": -9.113184928894043, + "step": 275 + }, + { + "epoch": 0.17169517884914465, + "grad_norm": 43.360530853271484, + "learning_rate": 2.8571428571428573e-06, + "logits/chosen": -1.1608879566192627, + "logits/rejected": 1.6421074867248535, + "logps/chosen": -595.3029174804688, + "logps/rejected": -797.7877807617188, + "loss": 0.9953, + "rewards/accuracies": 0.625, + "rewards/chosen": -7.334722518920898, + "rewards/margins": 6.524963855743408, + "rewards/rejected": -13.859687805175781, + "step": 276 + }, + { + "epoch": 0.17231726283048213, + "grad_norm": 34.76185989379883, + "learning_rate": 2.8674948240165632e-06, + "logits/chosen": 3.2887511253356934, + "logits/rejected": 4.776767730712891, + "logps/chosen": -702.6865844726562, + "logps/rejected": -816.4313354492188, + "loss": 0.4011, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.9340373277664185, + "rewards/margins": 9.299430847167969, + "rewards/rejected": -8.36539363861084, + "step": 277 + }, + { + "epoch": 0.1729393468118196, + "grad_norm": 0.728277862071991, + "learning_rate": 2.8778467908902696e-06, + "logits/chosen": 0.3774760961532593, + "logits/rejected": 3.0168018341064453, + "logps/chosen": -395.4736633300781, + "logps/rejected": -690.8614501953125, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5477930903434753, + "rewards/margins": 12.72874641418457, + "rewards/rejected": -13.276540756225586, + "step": 278 + }, + { + "epoch": 0.17356143079315708, + "grad_norm": 30.32001495361328, + "learning_rate": 2.888198757763975e-06, + "logits/chosen": -1.6389963626861572, + "logits/rejected": 1.60201096534729, + "logps/chosen": -320.2080078125, + "logps/rejected": -618.7227783203125, + "loss": 0.1274, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.867861032485962, + "rewards/margins": 10.369712829589844, + "rewards/rejected": -14.23757553100586, + "step": 279 + }, + { + "epoch": 0.17418351477449456, + "grad_norm": 50.79122543334961, + "learning_rate": 2.8985507246376816e-06, + "logits/chosen": 1.6694326400756836, + "logits/rejected": 2.5693209171295166, + "logps/chosen": -643.999755859375, + "logps/rejected": -713.3614501953125, + "loss": 3.1784, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.281852722167969, + "rewards/margins": 4.56790828704834, + "rewards/rejected": -12.849761962890625, + "step": 280 + }, + { + "epoch": 0.17480559875583204, + "grad_norm": 36.4666862487793, + "learning_rate": 2.908902691511387e-06, + "logits/chosen": -0.6406252384185791, + "logits/rejected": 1.632117748260498, + "logps/chosen": -515.8046264648438, + "logps/rejected": -732.3682861328125, + "loss": 1.2141, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.834707021713257, + "rewards/margins": 10.90757942199707, + "rewards/rejected": -14.742287635803223, + "step": 281 + }, + { + "epoch": 0.17542768273716952, + "grad_norm": 44.52206802368164, + "learning_rate": 2.919254658385093e-06, + "logits/chosen": 1.7754943370819092, + "logits/rejected": 2.074369192123413, + "logps/chosen": -688.3287353515625, + "logps/rejected": -695.4146728515625, + "loss": 1.7472, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.223416328430176, + "rewards/margins": 4.887999057769775, + "rewards/rejected": -11.111414909362793, + "step": 282 + }, + { + "epoch": 0.176049766718507, + "grad_norm": 46.707618713378906, + "learning_rate": 2.9296066252587996e-06, + "logits/chosen": -2.913400888442993, + "logits/rejected": 3.6963987350463867, + "logps/chosen": -259.6846923828125, + "logps/rejected": -758.641845703125, + "loss": 0.7202, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.6863110065460205, + "rewards/margins": 7.087304592132568, + "rewards/rejected": -10.773615837097168, + "step": 283 + }, + { + "epoch": 0.17667185069984448, + "grad_norm": 59.8579216003418, + "learning_rate": 2.939958592132505e-06, + "logits/chosen": 0.8096885085105896, + "logits/rejected": 1.4599106311798096, + "logps/chosen": -640.6162719726562, + "logps/rejected": -699.5067138671875, + "loss": 1.0139, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.759545803070068, + "rewards/margins": 6.045644760131836, + "rewards/rejected": -11.80518913269043, + "step": 284 + }, + { + "epoch": 0.17729393468118196, + "grad_norm": 2.9321112632751465, + "learning_rate": 2.9503105590062115e-06, + "logits/chosen": -2.3982667922973633, + "logits/rejected": 3.0039591789245605, + "logps/chosen": -393.3422546386719, + "logps/rejected": -827.1685791015625, + "loss": 0.0101, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9348781108856201, + "rewards/margins": 11.114221572875977, + "rewards/rejected": -10.179344177246094, + "step": 285 + }, + { + "epoch": 0.17791601866251944, + "grad_norm": 38.78324890136719, + "learning_rate": 2.9606625258799175e-06, + "logits/chosen": 0.25066280364990234, + "logits/rejected": 1.5802853107452393, + "logps/chosen": -459.66357421875, + "logps/rejected": -581.1425170898438, + "loss": 0.3467, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.499963760375977, + "rewards/margins": 8.779077529907227, + "rewards/rejected": -16.279041290283203, + "step": 286 + }, + { + "epoch": 0.17853810264385692, + "grad_norm": 3.962609052658081, + "learning_rate": 2.9710144927536235e-06, + "logits/chosen": -2.1418416500091553, + "logits/rejected": 1.1488546133041382, + "logps/chosen": -345.88714599609375, + "logps/rejected": -620.9361572265625, + "loss": 0.0311, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7706562280654907, + "rewards/margins": 8.32846450805664, + "rewards/rejected": -10.099120140075684, + "step": 287 + }, + { + "epoch": 0.1791601866251944, + "grad_norm": 26.762493133544922, + "learning_rate": 2.9813664596273295e-06, + "logits/chosen": -1.101001501083374, + "logits/rejected": 2.8201372623443604, + "logps/chosen": -486.9984130859375, + "logps/rejected": -790.114013671875, + "loss": 0.6456, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.161718368530273, + "rewards/margins": 6.815770626068115, + "rewards/rejected": -10.977489471435547, + "step": 288 + }, + { + "epoch": 0.17978227060653187, + "grad_norm": 24.064451217651367, + "learning_rate": 2.991718426501035e-06, + "logits/chosen": 1.4957832098007202, + "logits/rejected": 3.3200576305389404, + "logps/chosen": -516.3970947265625, + "logps/rejected": -710.7282104492188, + "loss": 0.8452, + "rewards/accuracies": 0.75, + "rewards/chosen": -9.919002532958984, + "rewards/margins": 6.532848358154297, + "rewards/rejected": -16.45184898376465, + "step": 289 + }, + { + "epoch": 0.18040435458786935, + "grad_norm": 19.973859786987305, + "learning_rate": 3.0020703933747414e-06, + "logits/chosen": -4.241866588592529, + "logits/rejected": 2.728012800216675, + "logps/chosen": -220.65185546875, + "logps/rejected": -713.72119140625, + "loss": 0.4069, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.7319488525390625, + "rewards/margins": 12.879581451416016, + "rewards/rejected": -16.611530303955078, + "step": 290 + }, + { + "epoch": 0.18102643856920683, + "grad_norm": 9.816418647766113, + "learning_rate": 3.0124223602484474e-06, + "logits/chosen": -2.1271183490753174, + "logits/rejected": 1.7093493938446045, + "logps/chosen": -368.6051025390625, + "logps/rejected": -721.2058715820312, + "loss": 0.0359, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.852741241455078, + "rewards/margins": 10.840240478515625, + "rewards/rejected": -16.692981719970703, + "step": 291 + }, + { + "epoch": 0.1816485225505443, + "grad_norm": 58.5022087097168, + "learning_rate": 3.0227743271221534e-06, + "logits/chosen": -1.7429341077804565, + "logits/rejected": 3.815767288208008, + "logps/chosen": -329.79193115234375, + "logps/rejected": -742.4627685546875, + "loss": 0.8852, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.896777629852295, + "rewards/margins": 8.65748119354248, + "rewards/rejected": -12.554258346557617, + "step": 292 + }, + { + "epoch": 0.1822706065318818, + "grad_norm": 35.150367736816406, + "learning_rate": 3.0331262939958594e-06, + "logits/chosen": -0.9945878386497498, + "logits/rejected": 3.7409143447875977, + "logps/chosen": -498.15850830078125, + "logps/rejected": -846.7988891601562, + "loss": 0.4365, + "rewards/accuracies": 0.75, + "rewards/chosen": -7.441921710968018, + "rewards/margins": 8.19404411315918, + "rewards/rejected": -15.635965347290039, + "step": 293 + }, + { + "epoch": 0.1828926905132193, + "grad_norm": 53.36930465698242, + "learning_rate": 3.043478260869566e-06, + "logits/chosen": -0.6886473894119263, + "logits/rejected": 4.724715232849121, + "logps/chosen": -432.71478271484375, + "logps/rejected": -881.2340698242188, + "loss": 2.3085, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.074771881103516, + "rewards/margins": 13.83968734741211, + "rewards/rejected": -18.914457321166992, + "step": 294 + }, + { + "epoch": 0.18351477449455678, + "grad_norm": 22.593576431274414, + "learning_rate": 3.0538302277432714e-06, + "logits/chosen": 0.6494942903518677, + "logits/rejected": 2.726839542388916, + "logps/chosen": -589.6900634765625, + "logps/rejected": -807.8902587890625, + "loss": 0.1957, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.541738033294678, + "rewards/margins": 10.383159637451172, + "rewards/rejected": -15.924898147583008, + "step": 295 + }, + { + "epoch": 0.18413685847589426, + "grad_norm": 0.09417515993118286, + "learning_rate": 3.0641821946169773e-06, + "logits/chosen": -1.711951494216919, + "logits/rejected": 2.7394614219665527, + "logps/chosen": -368.3467102050781, + "logps/rejected": -731.626708984375, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5702080726623535, + "rewards/margins": 13.196901321411133, + "rewards/rejected": -15.767109870910645, + "step": 296 + }, + { + "epoch": 0.18475894245723173, + "grad_norm": 53.6297607421875, + "learning_rate": 3.0745341614906837e-06, + "logits/chosen": 2.0798213481903076, + "logits/rejected": 1.6787725687026978, + "logps/chosen": -639.6285400390625, + "logps/rejected": -665.6937255859375, + "loss": 1.9573, + "rewards/accuracies": 0.625, + "rewards/chosen": -8.730389595031738, + "rewards/margins": 5.162319660186768, + "rewards/rejected": -13.892708778381348, + "step": 297 + }, + { + "epoch": 0.1853810264385692, + "grad_norm": 48.8674430847168, + "learning_rate": 3.0848861283643893e-06, + "logits/chosen": 2.25374436378479, + "logits/rejected": 3.050382137298584, + "logps/chosen": -669.9483642578125, + "logps/rejected": -737.494873046875, + "loss": 1.4464, + "rewards/accuracies": 0.625, + "rewards/chosen": -5.033759117126465, + "rewards/margins": 8.142561912536621, + "rewards/rejected": -13.176321029663086, + "step": 298 + }, + { + "epoch": 0.1860031104199067, + "grad_norm": 13.007925033569336, + "learning_rate": 3.0952380952380957e-06, + "logits/chosen": -0.3085886836051941, + "logits/rejected": 1.4879200458526611, + "logps/chosen": -568.7110595703125, + "logps/rejected": -805.2724609375, + "loss": 0.0654, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.329258918762207, + "rewards/margins": 10.055083274841309, + "rewards/rejected": -17.384342193603516, + "step": 299 + }, + { + "epoch": 0.18662519440124417, + "grad_norm": 1.1980512142181396, + "learning_rate": 3.1055900621118013e-06, + "logits/chosen": -1.660165548324585, + "logits/rejected": 2.2211251258850098, + "logps/chosen": -307.91375732421875, + "logps/rejected": -707.158447265625, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.748342275619507, + "rewards/margins": 14.093570709228516, + "rewards/rejected": -16.8419132232666, + "step": 300 + }, + { + "epoch": 0.18724727838258165, + "grad_norm": 2.2520453929901123, + "learning_rate": 3.1159420289855073e-06, + "logits/chosen": -1.8573826551437378, + "logits/rejected": 2.397719621658325, + "logps/chosen": -297.97613525390625, + "logps/rejected": -675.8267211914062, + "loss": 0.0187, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4887399673461914, + "rewards/margins": 15.379812240600586, + "rewards/rejected": -16.868553161621094, + "step": 301 + }, + { + "epoch": 0.18786936236391913, + "grad_norm": 1.385947346687317, + "learning_rate": 3.1262939958592137e-06, + "logits/chosen": 2.1132750511169434, + "logits/rejected": 2.269409656524658, + "logps/chosen": -581.947509765625, + "logps/rejected": -664.1751708984375, + "loss": 0.1036, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.37770363688468933, + "rewards/margins": 7.922628402709961, + "rewards/rejected": -8.300332069396973, + "step": 302 + }, + { + "epoch": 0.1884914463452566, + "grad_norm": 3.8977954387664795, + "learning_rate": 3.1366459627329192e-06, + "logits/chosen": 0.9858641624450684, + "logits/rejected": 1.9707233905792236, + "logps/chosen": -619.97998046875, + "logps/rejected": -831.2584228515625, + "loss": 0.0488, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.3522443771362305, + "rewards/margins": 12.102344512939453, + "rewards/rejected": -17.454587936401367, + "step": 303 + }, + { + "epoch": 0.1891135303265941, + "grad_norm": 33.76449203491211, + "learning_rate": 3.1469979296066256e-06, + "logits/chosen": -1.6749011278152466, + "logits/rejected": 2.192087173461914, + "logps/chosen": -387.16436767578125, + "logps/rejected": -690.3063354492188, + "loss": 0.1106, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.873738765716553, + "rewards/margins": 8.083623886108398, + "rewards/rejected": -15.957362174987793, + "step": 304 + }, + { + "epoch": 0.18973561430793157, + "grad_norm": 0.1594896912574768, + "learning_rate": 3.1573498964803316e-06, + "logits/chosen": -3.2994441986083984, + "logits/rejected": 1.944976806640625, + "logps/chosen": -242.07859802246094, + "logps/rejected": -739.9111938476562, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6777923107147217, + "rewards/margins": 15.069310188293457, + "rewards/rejected": -16.747102737426758, + "step": 305 + }, + { + "epoch": 0.19035769828926905, + "grad_norm": 63.30927658081055, + "learning_rate": 3.1677018633540376e-06, + "logits/chosen": 1.2816505432128906, + "logits/rejected": 2.624063491821289, + "logps/chosen": -605.6007080078125, + "logps/rejected": -725.8055419921875, + "loss": 2.7351, + "rewards/accuracies": 0.625, + "rewards/chosen": -8.404818534851074, + "rewards/margins": 3.914830207824707, + "rewards/rejected": -12.319649696350098, + "step": 306 + }, + { + "epoch": 0.19097978227060652, + "grad_norm": 26.36448097229004, + "learning_rate": 3.1780538302277436e-06, + "logits/chosen": 0.1868879795074463, + "logits/rejected": 2.8717458248138428, + "logps/chosen": -550.631591796875, + "logps/rejected": -762.9170532226562, + "loss": 0.5359, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.5749921798706055, + "rewards/margins": 10.07697582244873, + "rewards/rejected": -14.651968955993652, + "step": 307 + }, + { + "epoch": 0.191601866251944, + "grad_norm": 2.184065580368042, + "learning_rate": 3.188405797101449e-06, + "logits/chosen": -0.3023766279220581, + "logits/rejected": 2.1106514930725098, + "logps/chosen": -538.0408935546875, + "logps/rejected": -788.0562744140625, + "loss": 0.0097, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.051552772521973, + "rewards/margins": 13.31997299194336, + "rewards/rejected": -21.37152671813965, + "step": 308 + }, + { + "epoch": 0.19222395023328148, + "grad_norm": 31.663427352905273, + "learning_rate": 3.1987577639751555e-06, + "logits/chosen": 0.46885818243026733, + "logits/rejected": 3.4298930168151855, + "logps/chosen": -457.65106201171875, + "logps/rejected": -654.154052734375, + "loss": 0.254, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.863368034362793, + "rewards/margins": 4.747684955596924, + "rewards/rejected": -14.611052513122559, + "step": 309 + }, + { + "epoch": 0.19284603421461896, + "grad_norm": 9.104625701904297, + "learning_rate": 3.2091097308488615e-06, + "logits/chosen": 1.0741961002349854, + "logits/rejected": 4.69411563873291, + "logps/chosen": -489.1529541015625, + "logps/rejected": -798.117919921875, + "loss": 0.075, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.415085315704346, + "rewards/margins": 13.385915756225586, + "rewards/rejected": -17.801002502441406, + "step": 310 + }, + { + "epoch": 0.19346811819595647, + "grad_norm": 2.730404853820801, + "learning_rate": 3.2194616977225675e-06, + "logits/chosen": -3.9466922283172607, + "logits/rejected": 1.7940433025360107, + "logps/chosen": -369.9723205566406, + "logps/rejected": -805.43701171875, + "loss": 0.0102, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8560994863510132, + "rewards/margins": 16.080703735351562, + "rewards/rejected": -17.936803817749023, + "step": 311 + }, + { + "epoch": 0.19409020217729395, + "grad_norm": 31.281147003173828, + "learning_rate": 3.2298136645962735e-06, + "logits/chosen": 0.35380566120147705, + "logits/rejected": 1.810831904411316, + "logps/chosen": -577.0803833007812, + "logps/rejected": -711.97119140625, + "loss": 0.3663, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.113171577453613, + "rewards/margins": 9.861759185791016, + "rewards/rejected": -17.974929809570312, + "step": 312 + }, + { + "epoch": 0.19471228615863143, + "grad_norm": 40.597530364990234, + "learning_rate": 3.24016563146998e-06, + "logits/chosen": 1.288434386253357, + "logits/rejected": 3.427957057952881, + "logps/chosen": -649.0224609375, + "logps/rejected": -792.2032470703125, + "loss": 0.3979, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.13973617553711, + "rewards/margins": 9.166152954101562, + "rewards/rejected": -17.305889129638672, + "step": 313 + }, + { + "epoch": 0.1953343701399689, + "grad_norm": 38.50737380981445, + "learning_rate": 3.2505175983436855e-06, + "logits/chosen": 0.9875385761260986, + "logits/rejected": 1.3537368774414062, + "logps/chosen": -590.6889038085938, + "logps/rejected": -713.373046875, + "loss": 1.1722, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.0353498458862305, + "rewards/margins": 7.720550537109375, + "rewards/rejected": -14.755900382995605, + "step": 314 + }, + { + "epoch": 0.19595645412130638, + "grad_norm": 58.41802215576172, + "learning_rate": 3.2608695652173914e-06, + "logits/chosen": 3.5066542625427246, + "logits/rejected": 3.6684353351593018, + "logps/chosen": -731.1271362304688, + "logps/rejected": -816.0885009765625, + "loss": 0.6979, + "rewards/accuracies": 0.75, + "rewards/chosen": -11.413936614990234, + "rewards/margins": 4.682851791381836, + "rewards/rejected": -16.096786499023438, + "step": 315 + }, + { + "epoch": 0.19657853810264386, + "grad_norm": 13.743263244628906, + "learning_rate": 3.271221532091098e-06, + "logits/chosen": -1.6713857650756836, + "logits/rejected": 0.9872905015945435, + "logps/chosen": -455.806640625, + "logps/rejected": -674.9764404296875, + "loss": 0.0902, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.974114418029785, + "rewards/margins": 8.05095100402832, + "rewards/rejected": -13.025066375732422, + "step": 316 + }, + { + "epoch": 0.19720062208398134, + "grad_norm": 10.271381378173828, + "learning_rate": 3.2815734989648034e-06, + "logits/chosen": -1.2763960361480713, + "logits/rejected": 1.2967215776443481, + "logps/chosen": -479.1017761230469, + "logps/rejected": -778.9246215820312, + "loss": 0.0466, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.021428108215332, + "rewards/margins": 10.688220977783203, + "rewards/rejected": -14.709648132324219, + "step": 317 + }, + { + "epoch": 0.19782270606531882, + "grad_norm": 10.258651733398438, + "learning_rate": 3.29192546583851e-06, + "logits/chosen": -0.9771242737770081, + "logits/rejected": 3.320232629776001, + "logps/chosen": -419.28009033203125, + "logps/rejected": -837.815185546875, + "loss": 0.0501, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8659563064575195, + "rewards/margins": 10.27839469909668, + "rewards/rejected": -13.144350051879883, + "step": 318 + }, + { + "epoch": 0.1984447900466563, + "grad_norm": 51.61482620239258, + "learning_rate": 3.3022774327122154e-06, + "logits/chosen": 0.948689341545105, + "logits/rejected": 1.2561674118041992, + "logps/chosen": -505.46624755859375, + "logps/rejected": -577.086669921875, + "loss": 2.3032, + "rewards/accuracies": 0.75, + "rewards/chosen": -7.470050811767578, + "rewards/margins": 4.860204696655273, + "rewards/rejected": -12.330255508422852, + "step": 319 + }, + { + "epoch": 0.19906687402799378, + "grad_norm": 22.424713134765625, + "learning_rate": 3.3126293995859214e-06, + "logits/chosen": -0.02822953462600708, + "logits/rejected": 4.412856578826904, + "logps/chosen": -482.056396484375, + "logps/rejected": -848.180419921875, + "loss": 0.079, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.365329742431641, + "rewards/margins": 8.485767364501953, + "rewards/rejected": -15.851097106933594, + "step": 320 + }, + { + "epoch": 0.19968895800933126, + "grad_norm": 30.33492660522461, + "learning_rate": 3.3229813664596278e-06, + "logits/chosen": 1.4249606132507324, + "logits/rejected": 3.7021050453186035, + "logps/chosen": -519.3263549804688, + "logps/rejected": -800.2149658203125, + "loss": 0.7746, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.840842247009277, + "rewards/margins": 8.635676383972168, + "rewards/rejected": -16.476520538330078, + "step": 321 + }, + { + "epoch": 0.20031104199066874, + "grad_norm": 14.67454719543457, + "learning_rate": 3.3333333333333333e-06, + "logits/chosen": -1.4113171100616455, + "logits/rejected": 3.002666473388672, + "logps/chosen": -444.43084716796875, + "logps/rejected": -809.7966918945312, + "loss": 0.084, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.718070030212402, + "rewards/margins": 11.31346607208252, + "rewards/rejected": -19.031536102294922, + "step": 322 + }, + { + "epoch": 0.20093312597200622, + "grad_norm": 11.428664207458496, + "learning_rate": 3.3436853002070397e-06, + "logits/chosen": 0.5857963562011719, + "logits/rejected": 2.122978925704956, + "logps/chosen": -484.9825134277344, + "logps/rejected": -699.5858154296875, + "loss": 0.0829, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3593549728393555, + "rewards/margins": 8.682098388671875, + "rewards/rejected": -11.041452407836914, + "step": 323 + }, + { + "epoch": 0.2015552099533437, + "grad_norm": 46.258018493652344, + "learning_rate": 3.3540372670807457e-06, + "logits/chosen": -0.023642655462026596, + "logits/rejected": 2.68320369720459, + "logps/chosen": -434.5428466796875, + "logps/rejected": -652.7188720703125, + "loss": 1.1915, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.017848491668701, + "rewards/margins": 7.239933490753174, + "rewards/rejected": -12.257781982421875, + "step": 324 + }, + { + "epoch": 0.20217729393468117, + "grad_norm": 27.407543182373047, + "learning_rate": 3.3643892339544517e-06, + "logits/chosen": -0.966314971446991, + "logits/rejected": 1.4754480123519897, + "logps/chosen": -432.0926208496094, + "logps/rejected": -727.2296142578125, + "loss": 0.6391, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.21859073638916, + "rewards/margins": 8.61583423614502, + "rewards/rejected": -13.83442497253418, + "step": 325 + }, + { + "epoch": 0.20279937791601865, + "grad_norm": 1.9784207344055176, + "learning_rate": 3.3747412008281577e-06, + "logits/chosen": 0.732000470161438, + "logits/rejected": 3.4103875160217285, + "logps/chosen": -518.6851806640625, + "logps/rejected": -762.8380126953125, + "loss": 0.007, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6236228942871094, + "rewards/margins": 12.843097686767578, + "rewards/rejected": -16.466720581054688, + "step": 326 + }, + { + "epoch": 0.20342146189735613, + "grad_norm": 17.703134536743164, + "learning_rate": 3.3850931677018632e-06, + "logits/chosen": -0.15337622165679932, + "logits/rejected": 3.2251529693603516, + "logps/chosen": -447.5892028808594, + "logps/rejected": -686.328125, + "loss": 0.1563, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.26287829875946045, + "rewards/margins": 8.936200141906738, + "rewards/rejected": -9.199078559875488, + "step": 327 + }, + { + "epoch": 0.2040435458786936, + "grad_norm": 3.6893739700317383, + "learning_rate": 3.3954451345755696e-06, + "logits/chosen": -0.10928022861480713, + "logits/rejected": 3.0014255046844482, + "logps/chosen": -275.645751953125, + "logps/rejected": -618.063232421875, + "loss": 0.0614, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6410908699035645, + "rewards/margins": 13.339859008789062, + "rewards/rejected": -14.980949401855469, + "step": 328 + }, + { + "epoch": 0.20466562986003112, + "grad_norm": 32.0711784362793, + "learning_rate": 3.4057971014492756e-06, + "logits/chosen": 2.949876308441162, + "logits/rejected": 3.1409921646118164, + "logps/chosen": -831.2333984375, + "logps/rejected": -917.00146484375, + "loss": 0.7636, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.08757209777832, + "rewards/margins": 10.290388107299805, + "rewards/rejected": -22.377960205078125, + "step": 329 + }, + { + "epoch": 0.2052877138413686, + "grad_norm": 17.057174682617188, + "learning_rate": 3.4161490683229816e-06, + "logits/chosen": -1.017820954322815, + "logits/rejected": 4.270072937011719, + "logps/chosen": -252.80633544921875, + "logps/rejected": -551.4617309570312, + "loss": 0.0937, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9883391857147217, + "rewards/margins": 7.211501121520996, + "rewards/rejected": -5.223161697387695, + "step": 330 + }, + { + "epoch": 0.20590979782270608, + "grad_norm": 42.068294525146484, + "learning_rate": 3.4265010351966876e-06, + "logits/chosen": 0.805486798286438, + "logits/rejected": 2.702273368835449, + "logps/chosen": -640.76953125, + "logps/rejected": -748.0137939453125, + "loss": 0.9465, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.5970258712768555, + "rewards/margins": 9.613271713256836, + "rewards/rejected": -14.210297584533691, + "step": 331 + }, + { + "epoch": 0.20653188180404355, + "grad_norm": 28.886274337768555, + "learning_rate": 3.436853002070394e-06, + "logits/chosen": -1.282555341720581, + "logits/rejected": 1.9352182149887085, + "logps/chosen": -425.39117431640625, + "logps/rejected": -682.3564453125, + "loss": 0.6583, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.6346869468688965, + "rewards/margins": 7.455491065979004, + "rewards/rejected": -12.090177536010742, + "step": 332 + }, + { + "epoch": 0.20715396578538103, + "grad_norm": 37.93339920043945, + "learning_rate": 3.4472049689440996e-06, + "logits/chosen": 2.5240001678466797, + "logits/rejected": 4.233339309692383, + "logps/chosen": -619.6500854492188, + "logps/rejected": -794.2083740234375, + "loss": 0.5793, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.68540620803833, + "rewards/margins": 5.579806327819824, + "rewards/rejected": -9.265213012695312, + "step": 333 + }, + { + "epoch": 0.2077760497667185, + "grad_norm": 18.976966857910156, + "learning_rate": 3.4575569358178055e-06, + "logits/chosen": 3.476201057434082, + "logits/rejected": 4.37414026260376, + "logps/chosen": -620.353271484375, + "logps/rejected": -740.6907958984375, + "loss": 0.2442, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.355771064758301, + "rewards/margins": 8.380756378173828, + "rewards/rejected": -12.736527442932129, + "step": 334 + }, + { + "epoch": 0.208398133748056, + "grad_norm": 34.64626693725586, + "learning_rate": 3.467908902691512e-06, + "logits/chosen": 0.05781608819961548, + "logits/rejected": 2.8691015243530273, + "logps/chosen": -449.12384033203125, + "logps/rejected": -717.6018676757812, + "loss": 0.5111, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.46394681930542, + "rewards/margins": 11.127350807189941, + "rewards/rejected": -13.591299057006836, + "step": 335 + }, + { + "epoch": 0.20902021772939347, + "grad_norm": 53.95747375488281, + "learning_rate": 3.4782608695652175e-06, + "logits/chosen": -0.6395105123519897, + "logits/rejected": 3.263615608215332, + "logps/chosen": -465.0777587890625, + "logps/rejected": -737.4808349609375, + "loss": 2.0079, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.715614318847656, + "rewards/margins": 6.04669189453125, + "rewards/rejected": -10.762306213378906, + "step": 336 + }, + { + "epoch": 0.20964230171073095, + "grad_norm": 55.4720458984375, + "learning_rate": 3.488612836438924e-06, + "logits/chosen": -1.6190953254699707, + "logits/rejected": 2.6271119117736816, + "logps/chosen": -385.53424072265625, + "logps/rejected": -694.2275390625, + "loss": 0.7186, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.31008768081665, + "rewards/margins": 9.709564208984375, + "rewards/rejected": -16.019651412963867, + "step": 337 + }, + { + "epoch": 0.21026438569206843, + "grad_norm": 28.41864776611328, + "learning_rate": 3.4989648033126295e-06, + "logits/chosen": 0.5265299081802368, + "logits/rejected": 1.7877110242843628, + "logps/chosen": -576.763671875, + "logps/rejected": -724.7598876953125, + "loss": 0.3338, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.072409152984619, + "rewards/margins": 8.15505313873291, + "rewards/rejected": -12.227461814880371, + "step": 338 + }, + { + "epoch": 0.2108864696734059, + "grad_norm": 1.3567543029785156, + "learning_rate": 3.5093167701863355e-06, + "logits/chosen": 1.584625005722046, + "logits/rejected": 3.8020477294921875, + "logps/chosen": -429.4696960449219, + "logps/rejected": -659.51708984375, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6522419452667236, + "rewards/margins": 10.80935287475586, + "rewards/rejected": -14.461594581604004, + "step": 339 + }, + { + "epoch": 0.2115085536547434, + "grad_norm": 22.75896453857422, + "learning_rate": 3.519668737060042e-06, + "logits/chosen": -0.11166572570800781, + "logits/rejected": 3.290238857269287, + "logps/chosen": -433.4940185546875, + "logps/rejected": -669.661376953125, + "loss": 0.2866, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.664963245391846, + "rewards/margins": 8.477402687072754, + "rewards/rejected": -13.142365455627441, + "step": 340 + }, + { + "epoch": 0.21213063763608087, + "grad_norm": 59.272911071777344, + "learning_rate": 3.5300207039337474e-06, + "logits/chosen": 2.0690321922302246, + "logits/rejected": 3.6204605102539062, + "logps/chosen": -710.8519287109375, + "logps/rejected": -870.4520263671875, + "loss": 3.7634, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.790801048278809, + "rewards/margins": 8.290600776672363, + "rewards/rejected": -16.081401824951172, + "step": 341 + }, + { + "epoch": 0.21275272161741834, + "grad_norm": 18.303058624267578, + "learning_rate": 3.540372670807454e-06, + "logits/chosen": -2.2392303943634033, + "logits/rejected": 3.1548311710357666, + "logps/chosen": -287.3793029785156, + "logps/rejected": -696.4027709960938, + "loss": 0.0772, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4984145164489746, + "rewards/margins": 13.312691688537598, + "rewards/rejected": -14.81110668182373, + "step": 342 + }, + { + "epoch": 0.21337480559875582, + "grad_norm": 32.04864501953125, + "learning_rate": 3.55072463768116e-06, + "logits/chosen": -0.019650399684906006, + "logits/rejected": 5.416806221008301, + "logps/chosen": -432.47296142578125, + "logps/rejected": -780.5631103515625, + "loss": 0.7646, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.224555015563965, + "rewards/margins": 5.373963356018066, + "rewards/rejected": -9.598518371582031, + "step": 343 + }, + { + "epoch": 0.2139968895800933, + "grad_norm": 0.13826704025268555, + "learning_rate": 3.561076604554866e-06, + "logits/chosen": 0.030586957931518555, + "logits/rejected": 4.062129974365234, + "logps/chosen": -449.2490539550781, + "logps/rejected": -781.0255126953125, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.162491798400879, + "rewards/margins": 10.261180877685547, + "rewards/rejected": -17.42367172241211, + "step": 344 + }, + { + "epoch": 0.21461897356143078, + "grad_norm": 33.03892135620117, + "learning_rate": 3.5714285714285718e-06, + "logits/chosen": 0.037287890911102295, + "logits/rejected": 4.113863945007324, + "logps/chosen": -381.3580017089844, + "logps/rejected": -648.242919921875, + "loss": 0.3812, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9465177059173584, + "rewards/margins": 5.442397117614746, + "rewards/rejected": -6.388915061950684, + "step": 345 + }, + { + "epoch": 0.21524105754276826, + "grad_norm": 37.0477180480957, + "learning_rate": 3.5817805383022773e-06, + "logits/chosen": 0.227480947971344, + "logits/rejected": 2.9201414585113525, + "logps/chosen": -538.601806640625, + "logps/rejected": -760.7200927734375, + "loss": 0.3476, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8433430194854736, + "rewards/margins": 10.570931434631348, + "rewards/rejected": -12.414274215698242, + "step": 346 + }, + { + "epoch": 0.21586314152410577, + "grad_norm": 1.1853687763214111, + "learning_rate": 3.5921325051759837e-06, + "logits/chosen": 0.9866267442703247, + "logits/rejected": 3.310483932495117, + "logps/chosen": -620.07666015625, + "logps/rejected": -880.2100219726562, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.350822925567627, + "rewards/margins": 12.167201042175293, + "rewards/rejected": -18.518024444580078, + "step": 347 + }, + { + "epoch": 0.21648522550544325, + "grad_norm": 10.583118438720703, + "learning_rate": 3.6024844720496897e-06, + "logits/chosen": -2.126750946044922, + "logits/rejected": 0.0941476821899414, + "logps/chosen": -432.0557861328125, + "logps/rejected": -669.322509765625, + "loss": 0.0705, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.652919769287109, + "rewards/margins": 11.304596900939941, + "rewards/rejected": -16.957515716552734, + "step": 348 + }, + { + "epoch": 0.21710730948678073, + "grad_norm": 27.52307891845703, + "learning_rate": 3.6128364389233957e-06, + "logits/chosen": -0.28781723976135254, + "logits/rejected": 4.803361892700195, + "logps/chosen": -432.9546203613281, + "logps/rejected": -847.7452392578125, + "loss": 1.3175, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.87117338180542, + "rewards/margins": 12.279956817626953, + "rewards/rejected": -18.15113067626953, + "step": 349 + }, + { + "epoch": 0.2177293934681182, + "grad_norm": 0.44653281569480896, + "learning_rate": 3.6231884057971017e-06, + "logits/chosen": -2.8117458820343018, + "logits/rejected": 1.8438504934310913, + "logps/chosen": -285.66485595703125, + "logps/rejected": -688.595947265625, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.911851644515991, + "rewards/margins": 11.518555641174316, + "rewards/rejected": -15.430407524108887, + "step": 350 + }, + { + "epoch": 0.21835147744945568, + "grad_norm": 49.33088302612305, + "learning_rate": 3.633540372670808e-06, + "logits/chosen": 0.18182387948036194, + "logits/rejected": 1.4403018951416016, + "logps/chosen": -533.3489990234375, + "logps/rejected": -716.069091796875, + "loss": 1.6784, + "rewards/accuracies": 0.625, + "rewards/chosen": -6.557300090789795, + "rewards/margins": 5.727059364318848, + "rewards/rejected": -12.284358978271484, + "step": 351 + }, + { + "epoch": 0.21897356143079316, + "grad_norm": 11.414206504821777, + "learning_rate": 3.6438923395445137e-06, + "logits/chosen": -2.3577916622161865, + "logits/rejected": 2.345189332962036, + "logps/chosen": -273.41754150390625, + "logps/rejected": -637.039794921875, + "loss": 0.05, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.158540725708008, + "rewards/margins": 8.63189697265625, + "rewards/rejected": -12.790438652038574, + "step": 352 + }, + { + "epoch": 0.21959564541213064, + "grad_norm": 15.936989784240723, + "learning_rate": 3.6542443064182196e-06, + "logits/chosen": -0.885023832321167, + "logits/rejected": 3.278656244277954, + "logps/chosen": -498.98699951171875, + "logps/rejected": -804.25634765625, + "loss": 0.1398, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9475908279418945, + "rewards/margins": 11.372220993041992, + "rewards/rejected": -13.319811820983887, + "step": 353 + }, + { + "epoch": 0.22021772939346812, + "grad_norm": 5.129945278167725, + "learning_rate": 3.664596273291926e-06, + "logits/chosen": -0.23889708518981934, + "logits/rejected": 2.783459424972534, + "logps/chosen": -460.1069641113281, + "logps/rejected": -735.85693359375, + "loss": 0.0193, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.499173641204834, + "rewards/margins": 10.676677703857422, + "rewards/rejected": -18.175851821899414, + "step": 354 + }, + { + "epoch": 0.2208398133748056, + "grad_norm": 12.920183181762695, + "learning_rate": 3.6749482401656316e-06, + "logits/chosen": -0.26782411336898804, + "logits/rejected": 2.080547332763672, + "logps/chosen": -357.44122314453125, + "logps/rejected": -601.374267578125, + "loss": 0.1093, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.0575757026672363, + "rewards/margins": 11.874951362609863, + "rewards/rejected": -14.932526588439941, + "step": 355 + }, + { + "epoch": 0.22146189735614308, + "grad_norm": 3.1438405513763428, + "learning_rate": 3.685300207039338e-06, + "logits/chosen": 0.6319728493690491, + "logits/rejected": 3.0250282287597656, + "logps/chosen": -542.49169921875, + "logps/rejected": -804.1852416992188, + "loss": 0.0156, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6361920833587646, + "rewards/margins": 12.459537506103516, + "rewards/rejected": -14.09572982788086, + "step": 356 + }, + { + "epoch": 0.22208398133748056, + "grad_norm": 36.63279342651367, + "learning_rate": 3.6956521739130436e-06, + "logits/chosen": 3.3123087882995605, + "logits/rejected": 3.9450058937072754, + "logps/chosen": -695.519287109375, + "logps/rejected": -728.1754150390625, + "loss": 0.3193, + "rewards/accuracies": 0.75, + "rewards/chosen": -6.348609447479248, + "rewards/margins": 5.168727397918701, + "rewards/rejected": -11.51733684539795, + "step": 357 + }, + { + "epoch": 0.22270606531881804, + "grad_norm": 29.851964950561523, + "learning_rate": 3.7060041407867496e-06, + "logits/chosen": 0.2993144989013672, + "logits/rejected": 3.471100091934204, + "logps/chosen": -428.8981628417969, + "logps/rejected": -622.437744140625, + "loss": 0.398, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.051437854766846, + "rewards/margins": 8.075346946716309, + "rewards/rejected": -13.126784324645996, + "step": 358 + }, + { + "epoch": 0.22332814930015552, + "grad_norm": 1.975639820098877, + "learning_rate": 3.716356107660456e-06, + "logits/chosen": 1.1808077096939087, + "logits/rejected": 3.8902134895324707, + "logps/chosen": -600.333251953125, + "logps/rejected": -811.3698120117188, + "loss": 0.0099, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7640974521636963, + "rewards/margins": 9.818041801452637, + "rewards/rejected": -12.58213996887207, + "step": 359 + }, + { + "epoch": 0.223950233281493, + "grad_norm": 26.269790649414062, + "learning_rate": 3.7267080745341615e-06, + "logits/chosen": 0.20025908946990967, + "logits/rejected": 3.0269370079040527, + "logps/chosen": -549.3627319335938, + "logps/rejected": -816.055908203125, + "loss": 0.4161, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.0061068534851074, + "rewards/margins": 13.149438858032227, + "rewards/rejected": -16.155548095703125, + "step": 360 + }, + { + "epoch": 0.22457231726283047, + "grad_norm": 6.104818820953369, + "learning_rate": 3.737060041407868e-06, + "logits/chosen": 0.3199158310890198, + "logits/rejected": 1.2874040603637695, + "logps/chosen": -559.9285888671875, + "logps/rejected": -674.5938720703125, + "loss": 0.0485, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.582068920135498, + "rewards/margins": 7.962156295776367, + "rewards/rejected": -14.544224739074707, + "step": 361 + }, + { + "epoch": 0.22519440124416795, + "grad_norm": 0.13571402430534363, + "learning_rate": 3.747412008281574e-06, + "logits/chosen": 0.565754771232605, + "logits/rejected": 4.5421552658081055, + "logps/chosen": -477.07855224609375, + "logps/rejected": -831.6639404296875, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.70570182800293, + "rewards/margins": 14.201147079467773, + "rewards/rejected": -21.906848907470703, + "step": 362 + }, + { + "epoch": 0.22581648522550543, + "grad_norm": 29.88707160949707, + "learning_rate": 3.7577639751552795e-06, + "logits/chosen": -0.7453135251998901, + "logits/rejected": 1.5671675205230713, + "logps/chosen": -481.1373291015625, + "logps/rejected": -689.8429565429688, + "loss": 0.5111, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.882606506347656, + "rewards/margins": 6.476565361022949, + "rewards/rejected": -15.359171867370605, + "step": 363 + }, + { + "epoch": 0.2264385692068429, + "grad_norm": 28.465078353881836, + "learning_rate": 3.768115942028986e-06, + "logits/chosen": 1.0373425483703613, + "logits/rejected": 3.4107184410095215, + "logps/chosen": -617.771728515625, + "logps/rejected": -864.3381958007812, + "loss": 0.3374, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.847381591796875, + "rewards/margins": 14.079671859741211, + "rewards/rejected": -22.927051544189453, + "step": 364 + }, + { + "epoch": 0.22706065318818042, + "grad_norm": 0.7878022789955139, + "learning_rate": 3.7784679089026914e-06, + "logits/chosen": -1.1057369709014893, + "logits/rejected": 4.344394207000732, + "logps/chosen": -436.24176025390625, + "logps/rejected": -857.5606689453125, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.23887825012207, + "rewards/margins": 11.883063316345215, + "rewards/rejected": -16.12194061279297, + "step": 365 + }, + { + "epoch": 0.2276827371695179, + "grad_norm": 26.853925704956055, + "learning_rate": 3.788819875776398e-06, + "logits/chosen": -0.7507523894309998, + "logits/rejected": 3.8645191192626953, + "logps/chosen": -458.9233093261719, + "logps/rejected": -851.0308837890625, + "loss": 0.1905, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.54487419128418, + "rewards/margins": 13.677597045898438, + "rewards/rejected": -21.222469329833984, + "step": 366 + }, + { + "epoch": 0.22830482115085537, + "grad_norm": 0.009762109257280827, + "learning_rate": 3.799171842650104e-06, + "logits/chosen": 0.7624329924583435, + "logits/rejected": 4.902022361755371, + "logps/chosen": -479.6599426269531, + "logps/rejected": -928.024169921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.773331642150879, + "rewards/margins": 18.00758171081543, + "rewards/rejected": -23.780914306640625, + "step": 367 + }, + { + "epoch": 0.22892690513219285, + "grad_norm": 21.023170471191406, + "learning_rate": 3.80952380952381e-06, + "logits/chosen": -0.7568533420562744, + "logits/rejected": 2.8591063022613525, + "logps/chosen": -502.4686279296875, + "logps/rejected": -829.576171875, + "loss": 0.2384, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.506834030151367, + "rewards/margins": 11.209808349609375, + "rewards/rejected": -16.716642379760742, + "step": 368 + }, + { + "epoch": 0.22954898911353033, + "grad_norm": 3.5660157203674316, + "learning_rate": 3.819875776397516e-06, + "logits/chosen": -0.4312325716018677, + "logits/rejected": 2.562499523162842, + "logps/chosen": -480.4518127441406, + "logps/rejected": -758.7933959960938, + "loss": 0.0231, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.095095634460449, + "rewards/margins": 9.17817211151123, + "rewards/rejected": -15.273269653320312, + "step": 369 + }, + { + "epoch": 0.2301710730948678, + "grad_norm": 39.562400817871094, + "learning_rate": 3.830227743271222e-06, + "logits/chosen": -2.5081186294555664, + "logits/rejected": 0.7461903095245361, + "logps/chosen": -378.31488037109375, + "logps/rejected": -717.0318603515625, + "loss": 0.7567, + "rewards/accuracies": 0.75, + "rewards/chosen": -6.153487205505371, + "rewards/margins": 7.799801826477051, + "rewards/rejected": -13.953289031982422, + "step": 370 + }, + { + "epoch": 0.2307931570762053, + "grad_norm": 3.9070324897766113, + "learning_rate": 3.840579710144928e-06, + "logits/chosen": -1.1572139263153076, + "logits/rejected": 4.676782608032227, + "logps/chosen": -444.24261474609375, + "logps/rejected": -850.5587768554688, + "loss": 0.0143, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.589866638183594, + "rewards/margins": 13.885885238647461, + "rewards/rejected": -19.475751876831055, + "step": 371 + }, + { + "epoch": 0.23141524105754277, + "grad_norm": 1.867079496383667, + "learning_rate": 3.850931677018634e-06, + "logits/chosen": -1.5081374645233154, + "logits/rejected": 3.0880353450775146, + "logps/chosen": -357.534423828125, + "logps/rejected": -728.313232421875, + "loss": 0.0125, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.523712158203125, + "rewards/margins": 10.212594985961914, + "rewards/rejected": -14.736308097839355, + "step": 372 + }, + { + "epoch": 0.23203732503888025, + "grad_norm": 27.546833038330078, + "learning_rate": 3.86128364389234e-06, + "logits/chosen": 1.3264672756195068, + "logits/rejected": 3.7906978130340576, + "logps/chosen": -420.4259033203125, + "logps/rejected": -580.6475830078125, + "loss": 0.2752, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.540620803833008, + "rewards/margins": 7.766623497009277, + "rewards/rejected": -14.307243347167969, + "step": 373 + }, + { + "epoch": 0.23265940902021773, + "grad_norm": 31.622692108154297, + "learning_rate": 3.871635610766046e-06, + "logits/chosen": -1.0142524242401123, + "logits/rejected": 1.874340295791626, + "logps/chosen": -449.20782470703125, + "logps/rejected": -648.7708740234375, + "loss": 0.8318, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.764393329620361, + "rewards/margins": 8.824634552001953, + "rewards/rejected": -15.589027404785156, + "step": 374 + }, + { + "epoch": 0.2332814930015552, + "grad_norm": 4.413476467132568, + "learning_rate": 3.881987577639752e-06, + "logits/chosen": -2.1498818397521973, + "logits/rejected": 3.9574713706970215, + "logps/chosen": -471.00579833984375, + "logps/rejected": -936.8933715820312, + "loss": 0.0262, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.591935634613037, + "rewards/margins": 15.384891510009766, + "rewards/rejected": -21.976825714111328, + "step": 375 + }, + { + "epoch": 0.23390357698289269, + "grad_norm": 44.951560974121094, + "learning_rate": 3.892339544513457e-06, + "logits/chosen": 0.2634568214416504, + "logits/rejected": 4.468165874481201, + "logps/chosen": -532.2499389648438, + "logps/rejected": -898.7428588867188, + "loss": 1.4199, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.81072473526001, + "rewards/margins": 9.554339408874512, + "rewards/rejected": -17.365062713623047, + "step": 376 + }, + { + "epoch": 0.23452566096423016, + "grad_norm": 0.7279941439628601, + "learning_rate": 3.902691511387164e-06, + "logits/chosen": -2.8469130992889404, + "logits/rejected": 2.8125102519989014, + "logps/chosen": -294.74200439453125, + "logps/rejected": -710.29052734375, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.099022150039673, + "rewards/margins": 10.450815200805664, + "rewards/rejected": -12.549837112426758, + "step": 377 + }, + { + "epoch": 0.23514774494556764, + "grad_norm": 0.7543030977249146, + "learning_rate": 3.91304347826087e-06, + "logits/chosen": 0.9640889763832092, + "logits/rejected": 3.2024917602539062, + "logps/chosen": -529.7698364257812, + "logps/rejected": -801.398193359375, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.66275405883789, + "rewards/margins": 12.464832305908203, + "rewards/rejected": -21.127586364746094, + "step": 378 + }, + { + "epoch": 0.23576982892690512, + "grad_norm": 3.1311933994293213, + "learning_rate": 3.923395445134576e-06, + "logits/chosen": 0.4315589368343353, + "logits/rejected": 3.47430682182312, + "logps/chosen": -391.7742919921875, + "logps/rejected": -686.934814453125, + "loss": 0.0197, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.57724380493164, + "rewards/margins": 9.273082733154297, + "rewards/rejected": -18.85032844543457, + "step": 379 + }, + { + "epoch": 0.2363919129082426, + "grad_norm": 51.43965148925781, + "learning_rate": 3.933747412008282e-06, + "logits/chosen": -0.633091151714325, + "logits/rejected": 2.4071877002716064, + "logps/chosen": -599.2264404296875, + "logps/rejected": -844.1055908203125, + "loss": 0.3701, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.826896667480469, + "rewards/margins": 10.886575698852539, + "rewards/rejected": -22.713472366333008, + "step": 380 + }, + { + "epoch": 0.23701399688958008, + "grad_norm": 13.136116027832031, + "learning_rate": 3.9440993788819884e-06, + "logits/chosen": 0.8338421583175659, + "logits/rejected": 3.6499695777893066, + "logps/chosen": -475.3841552734375, + "logps/rejected": -757.0548095703125, + "loss": 0.246, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.005119323730469, + "rewards/margins": 10.59940242767334, + "rewards/rejected": -18.604522705078125, + "step": 381 + }, + { + "epoch": 0.2376360808709176, + "grad_norm": 49.22789001464844, + "learning_rate": 3.954451345755694e-06, + "logits/chosen": -2.2940762042999268, + "logits/rejected": 2.278968334197998, + "logps/chosen": -396.1622619628906, + "logps/rejected": -734.0848388671875, + "loss": 1.6745, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.4951090812683105, + "rewards/margins": 12.099257469177246, + "rewards/rejected": -18.5943660736084, + "step": 382 + }, + { + "epoch": 0.23825816485225507, + "grad_norm": 38.47724533081055, + "learning_rate": 3.9648033126294e-06, + "logits/chosen": 1.8390161991119385, + "logits/rejected": 2.8606789112091064, + "logps/chosen": -547.0473022460938, + "logps/rejected": -666.1442260742188, + "loss": 0.8119, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.53658676147461, + "rewards/margins": 7.52390193939209, + "rewards/rejected": -16.060489654541016, + "step": 383 + }, + { + "epoch": 0.23888024883359255, + "grad_norm": 0.290153443813324, + "learning_rate": 3.975155279503106e-06, + "logits/chosen": -2.8529579639434814, + "logits/rejected": 3.3242416381835938, + "logps/chosen": -212.5696563720703, + "logps/rejected": -679.15869140625, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8149008750915527, + "rewards/margins": 10.911154747009277, + "rewards/rejected": -14.726055145263672, + "step": 384 + }, + { + "epoch": 0.23950233281493002, + "grad_norm": 13.57795238494873, + "learning_rate": 3.9855072463768115e-06, + "logits/chosen": 0.6920334696769714, + "logits/rejected": 3.7827398777008057, + "logps/chosen": -409.10968017578125, + "logps/rejected": -730.4736938476562, + "loss": 0.0954, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.51535701751709, + "rewards/margins": 10.565073013305664, + "rewards/rejected": -16.080429077148438, + "step": 385 + }, + { + "epoch": 0.2401244167962675, + "grad_norm": 20.393552780151367, + "learning_rate": 3.995859213250518e-06, + "logits/chosen": -2.67911434173584, + "logits/rejected": 3.7035298347473145, + "logps/chosen": -335.30224609375, + "logps/rejected": -857.2125854492188, + "loss": 0.0903, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.92775821685791, + "rewards/margins": 16.138141632080078, + "rewards/rejected": -22.065900802612305, + "step": 386 + }, + { + "epoch": 0.24074650077760498, + "grad_norm": 44.758060455322266, + "learning_rate": 4.0062111801242235e-06, + "logits/chosen": 1.2232826948165894, + "logits/rejected": 3.325286388397217, + "logps/chosen": -695.9320068359375, + "logps/rejected": -895.4841918945312, + "loss": 1.7857, + "rewards/accuracies": 0.75, + "rewards/chosen": -12.098251342773438, + "rewards/margins": 7.316153049468994, + "rewards/rejected": -19.414403915405273, + "step": 387 + }, + { + "epoch": 0.24136858475894246, + "grad_norm": 4.955570220947266, + "learning_rate": 4.01656314699793e-06, + "logits/chosen": 0.6446991562843323, + "logits/rejected": 3.0992045402526855, + "logps/chosen": -446.56939697265625, + "logps/rejected": -768.3590087890625, + "loss": 0.054, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.716497421264648, + "rewards/margins": 15.004471778869629, + "rewards/rejected": -23.720970153808594, + "step": 388 + }, + { + "epoch": 0.24199066874027994, + "grad_norm": 0.3073206841945648, + "learning_rate": 4.026915113871636e-06, + "logits/chosen": -1.1649131774902344, + "logits/rejected": 2.8903706073760986, + "logps/chosen": -549.2460327148438, + "logps/rejected": -947.3636474609375, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.724716186523438, + "rewards/margins": 17.002840042114258, + "rewards/rejected": -25.727556228637695, + "step": 389 + }, + { + "epoch": 0.24261275272161742, + "grad_norm": 46.902503967285156, + "learning_rate": 4.037267080745342e-06, + "logits/chosen": 4.440867900848389, + "logits/rejected": 5.046853065490723, + "logps/chosen": -755.260498046875, + "logps/rejected": -915.3885498046875, + "loss": 0.9632, + "rewards/accuracies": 0.75, + "rewards/chosen": -9.23651123046875, + "rewards/margins": 8.646513938903809, + "rewards/rejected": -17.883026123046875, + "step": 390 + }, + { + "epoch": 0.2432348367029549, + "grad_norm": 62.403770446777344, + "learning_rate": 4.047619047619048e-06, + "logits/chosen": 1.4265656471252441, + "logits/rejected": 1.5992298126220703, + "logps/chosen": -569.1845703125, + "logps/rejected": -579.0994262695312, + "loss": 1.5848, + "rewards/accuracies": 0.375, + "rewards/chosen": -7.588815689086914, + "rewards/margins": 3.867744207382202, + "rewards/rejected": -11.456559181213379, + "step": 391 + }, + { + "epoch": 0.24385692068429238, + "grad_norm": 0.5350190997123718, + "learning_rate": 4.057971014492754e-06, + "logits/chosen": -0.43517547845840454, + "logits/rejected": 4.035444736480713, + "logps/chosen": -518.9000854492188, + "logps/rejected": -957.9375, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.960100173950195, + "rewards/margins": 20.601770401000977, + "rewards/rejected": -29.561870574951172, + "step": 392 + }, + { + "epoch": 0.24447900466562986, + "grad_norm": 17.18977928161621, + "learning_rate": 4.06832298136646e-06, + "logits/chosen": 2.52517032623291, + "logits/rejected": 3.6979427337646484, + "logps/chosen": -600.1431274414062, + "logps/rejected": -744.0851440429688, + "loss": 0.1593, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.176643371582031, + "rewards/margins": 10.406632423400879, + "rewards/rejected": -21.583276748657227, + "step": 393 + }, + { + "epoch": 0.24510108864696734, + "grad_norm": 26.286714553833008, + "learning_rate": 4.078674948240166e-06, + "logits/chosen": 0.6746093034744263, + "logits/rejected": 3.103036880493164, + "logps/chosen": -551.1771850585938, + "logps/rejected": -789.2107543945312, + "loss": 0.1156, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.185583114624023, + "rewards/margins": 12.239805221557617, + "rewards/rejected": -21.42538833618164, + "step": 394 + }, + { + "epoch": 0.24572317262830481, + "grad_norm": 48.16623306274414, + "learning_rate": 4.089026915113871e-06, + "logits/chosen": 0.9857127666473389, + "logits/rejected": 2.9497580528259277, + "logps/chosen": -589.0301513671875, + "logps/rejected": -858.3607177734375, + "loss": 0.8546, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.686701774597168, + "rewards/margins": 8.606294631958008, + "rewards/rejected": -17.292997360229492, + "step": 395 + }, + { + "epoch": 0.2463452566096423, + "grad_norm": 16.663297653198242, + "learning_rate": 4.099378881987578e-06, + "logits/chosen": -0.5236176252365112, + "logits/rejected": 4.395070552825928, + "logps/chosen": -501.79010009765625, + "logps/rejected": -919.9902954101562, + "loss": 0.1062, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.657619953155518, + "rewards/margins": 13.845070838928223, + "rewards/rejected": -21.502689361572266, + "step": 396 + }, + { + "epoch": 0.24696734059097977, + "grad_norm": 10.62903881072998, + "learning_rate": 4.109730848861284e-06, + "logits/chosen": 1.6638069152832031, + "logits/rejected": 4.234158992767334, + "logps/chosen": -656.40966796875, + "logps/rejected": -937.57373046875, + "loss": 0.0413, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.261098861694336, + "rewards/margins": 14.2492094039917, + "rewards/rejected": -26.51030921936035, + "step": 397 + }, + { + "epoch": 0.24758942457231725, + "grad_norm": 52.60649490356445, + "learning_rate": 4.12008281573499e-06, + "logits/chosen": -2.485949993133545, + "logits/rejected": 1.5430916547775269, + "logps/chosen": -493.4781494140625, + "logps/rejected": -862.8096923828125, + "loss": 1.3036, + "rewards/accuracies": 0.75, + "rewards/chosen": -9.662334442138672, + "rewards/margins": 12.271377563476562, + "rewards/rejected": -21.933712005615234, + "step": 398 + }, + { + "epoch": 0.24821150855365473, + "grad_norm": 45.10889434814453, + "learning_rate": 4.130434782608696e-06, + "logits/chosen": -0.12205278873443604, + "logits/rejected": 3.722781181335449, + "logps/chosen": -527.0693359375, + "logps/rejected": -907.486083984375, + "loss": 0.9624, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.117431640625, + "rewards/margins": 13.028144836425781, + "rewards/rejected": -23.14557647705078, + "step": 399 + }, + { + "epoch": 0.24883359253499224, + "grad_norm": 30.479806900024414, + "learning_rate": 4.1407867494824025e-06, + "logits/chosen": -1.7838436365127563, + "logits/rejected": 3.219532012939453, + "logps/chosen": -398.19696044921875, + "logps/rejected": -772.0665283203125, + "loss": 0.2774, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.097878456115723, + "rewards/margins": 10.300621032714844, + "rewards/rejected": -14.39849853515625, + "step": 400 + }, + { + "epoch": 0.24945567651632972, + "grad_norm": 8.507942199707031, + "learning_rate": 4.151138716356108e-06, + "logits/chosen": -2.097954034805298, + "logits/rejected": 0.6987195611000061, + "logps/chosen": -465.9945373535156, + "logps/rejected": -792.6306762695312, + "loss": 0.0872, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.525957107543945, + "rewards/margins": 9.363893508911133, + "rewards/rejected": -16.889850616455078, + "step": 401 + }, + { + "epoch": 0.25007776049766717, + "grad_norm": 0.2137855887413025, + "learning_rate": 4.1614906832298145e-06, + "logits/chosen": -0.9140585660934448, + "logits/rejected": 2.3151116371154785, + "logps/chosen": -567.2479858398438, + "logps/rejected": -872.9825439453125, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.398047924041748, + "rewards/margins": 16.891010284423828, + "rewards/rejected": -24.289058685302734, + "step": 402 + }, + { + "epoch": 0.2506998444790047, + "grad_norm": 8.883277893066406, + "learning_rate": 4.17184265010352e-06, + "logits/chosen": 1.1128238439559937, + "logits/rejected": 3.5758769512176514, + "logps/chosen": -319.7587585449219, + "logps/rejected": -550.59130859375, + "loss": 0.1725, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.242685317993164, + "rewards/margins": 8.442047119140625, + "rewards/rejected": -13.684733390808105, + "step": 403 + }, + { + "epoch": 0.2513219284603421, + "grad_norm": 2.909620761871338, + "learning_rate": 4.182194616977226e-06, + "logits/chosen": -2.1192445755004883, + "logits/rejected": 1.4824601411819458, + "logps/chosen": -362.9059143066406, + "logps/rejected": -629.9395751953125, + "loss": 0.0203, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.2401885986328125, + "rewards/margins": 11.025968551635742, + "rewards/rejected": -17.266155242919922, + "step": 404 + }, + { + "epoch": 0.25194401244167963, + "grad_norm": 40.915035247802734, + "learning_rate": 4.192546583850932e-06, + "logits/chosen": 0.9748575687408447, + "logits/rejected": 3.5667386054992676, + "logps/chosen": -563.0350341796875, + "logps/rejected": -791.6390380859375, + "loss": 1.3857, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.867755889892578, + "rewards/margins": 11.296546936035156, + "rewards/rejected": -20.164302825927734, + "step": 405 + }, + { + "epoch": 0.2525660964230171, + "grad_norm": 17.680635452270508, + "learning_rate": 4.202898550724638e-06, + "logits/chosen": -1.3480279445648193, + "logits/rejected": 4.3109130859375, + "logps/chosen": -554.6787109375, + "logps/rejected": -936.0064086914062, + "loss": 0.126, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.705822944641113, + "rewards/margins": 10.080413818359375, + "rewards/rejected": -19.786235809326172, + "step": 406 + }, + { + "epoch": 0.2531881804043546, + "grad_norm": 35.00507354736328, + "learning_rate": 4.213250517598344e-06, + "logits/chosen": 2.5178279876708984, + "logits/rejected": 4.39531135559082, + "logps/chosen": -507.35345458984375, + "logps/rejected": -729.5151977539062, + "loss": 0.4349, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.729861259460449, + "rewards/margins": 7.778502941131592, + "rewards/rejected": -11.508363723754883, + "step": 407 + }, + { + "epoch": 0.25381026438569204, + "grad_norm": 0.04022481292486191, + "learning_rate": 4.22360248447205e-06, + "logits/chosen": -0.8258675336837769, + "logits/rejected": 3.800693988800049, + "logps/chosen": -420.3175048828125, + "logps/rejected": -882.1793212890625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.828684329986572, + "rewards/margins": 18.809568405151367, + "rewards/rejected": -24.63825225830078, + "step": 408 + }, + { + "epoch": 0.25443234836702955, + "grad_norm": 0.001061922637745738, + "learning_rate": 4.233954451345756e-06, + "logits/chosen": -3.2353157997131348, + "logits/rejected": 1.6717270612716675, + "logps/chosen": -393.54449462890625, + "logps/rejected": -819.972412109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6159932017326355, + "rewards/margins": 16.839954376220703, + "rewards/rejected": -17.455947875976562, + "step": 409 + }, + { + "epoch": 0.25505443234836706, + "grad_norm": 8.824790954589844, + "learning_rate": 4.244306418219462e-06, + "logits/chosen": 0.3574584126472473, + "logits/rejected": 4.6828107833862305, + "logps/chosen": -431.82720947265625, + "logps/rejected": -833.9597778320312, + "loss": 0.0481, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.883938789367676, + "rewards/margins": 15.454058647155762, + "rewards/rejected": -21.337997436523438, + "step": 410 + }, + { + "epoch": 0.2556765163297045, + "grad_norm": 28.11722183227539, + "learning_rate": 4.254658385093168e-06, + "logits/chosen": 1.0341689586639404, + "logits/rejected": 1.9157094955444336, + "logps/chosen": -704.972412109375, + "logps/rejected": -916.5335083007812, + "loss": 1.1396, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.860650062561035, + "rewards/margins": 14.662042617797852, + "rewards/rejected": -21.522693634033203, + "step": 411 + }, + { + "epoch": 0.256298600311042, + "grad_norm": 12.06362533569336, + "learning_rate": 4.265010351966874e-06, + "logits/chosen": 2.358436107635498, + "logits/rejected": 1.828049659729004, + "logps/chosen": -598.4984741210938, + "logps/rejected": -622.1403198242188, + "loss": 0.0603, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.350015163421631, + "rewards/margins": 6.945611953735352, + "rewards/rejected": -12.29562759399414, + "step": 412 + }, + { + "epoch": 0.25692068429237946, + "grad_norm": 32.59035873413086, + "learning_rate": 4.27536231884058e-06, + "logits/chosen": -2.3559393882751465, + "logits/rejected": -0.047882288694381714, + "logps/chosen": -506.2479553222656, + "logps/rejected": -774.8721923828125, + "loss": 1.0308, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.341673374176025, + "rewards/margins": 11.014400482177734, + "rewards/rejected": -17.3560733795166, + "step": 413 + }, + { + "epoch": 0.25754276827371697, + "grad_norm": 0.0008131487993523479, + "learning_rate": 4.2857142857142855e-06, + "logits/chosen": -0.24382908642292023, + "logits/rejected": 3.1727211475372314, + "logps/chosen": -495.8878173828125, + "logps/rejected": -906.5289916992188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.006720066070557, + "rewards/margins": 21.804332733154297, + "rewards/rejected": -25.811054229736328, + "step": 414 + }, + { + "epoch": 0.2581648522550544, + "grad_norm": 53.79934310913086, + "learning_rate": 4.296066252587992e-06, + "logits/chosen": 2.0413005352020264, + "logits/rejected": 2.457620143890381, + "logps/chosen": -714.4756469726562, + "logps/rejected": -817.9464721679688, + "loss": 1.9911, + "rewards/accuracies": 0.625, + "rewards/chosen": -7.469036102294922, + "rewards/margins": 7.2690205574035645, + "rewards/rejected": -14.738057136535645, + "step": 415 + }, + { + "epoch": 0.25878693623639193, + "grad_norm": 46.28091812133789, + "learning_rate": 4.306418219461698e-06, + "logits/chosen": -1.0962793827056885, + "logits/rejected": 2.4222702980041504, + "logps/chosen": -452.4810791015625, + "logps/rejected": -901.61328125, + "loss": 0.4518, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.062511444091797, + "rewards/margins": 14.822870254516602, + "rewards/rejected": -18.8853816986084, + "step": 416 + }, + { + "epoch": 0.2594090202177294, + "grad_norm": 5.891534805297852, + "learning_rate": 4.316770186335404e-06, + "logits/chosen": -2.3341612815856934, + "logits/rejected": 2.0078935623168945, + "logps/chosen": -371.0771789550781, + "logps/rejected": -782.2593383789062, + "loss": 0.045, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.06859016418457, + "rewards/margins": 15.229127883911133, + "rewards/rejected": -20.297718048095703, + "step": 417 + }, + { + "epoch": 0.2600311041990669, + "grad_norm": 61.13966369628906, + "learning_rate": 4.32712215320911e-06, + "logits/chosen": -0.5787434577941895, + "logits/rejected": 2.6994335651397705, + "logps/chosen": -533.9091796875, + "logps/rejected": -798.9489135742188, + "loss": 2.1237, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.8390722274780273, + "rewards/margins": 6.464807987213135, + "rewards/rejected": -10.30388069152832, + "step": 418 + }, + { + "epoch": 0.26065318818040434, + "grad_norm": 38.46092987060547, + "learning_rate": 4.337474120082817e-06, + "logits/chosen": -2.4339849948883057, + "logits/rejected": 2.17366886138916, + "logps/chosen": -351.37957763671875, + "logps/rejected": -786.957275390625, + "loss": 1.4374, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.966383934020996, + "rewards/margins": 12.617194175720215, + "rewards/rejected": -18.583576202392578, + "step": 419 + }, + { + "epoch": 0.26127527216174184, + "grad_norm": 3.1034646034240723, + "learning_rate": 4.347826086956522e-06, + "logits/chosen": -1.9733619689941406, + "logits/rejected": 1.6848191022872925, + "logps/chosen": -379.7969970703125, + "logps/rejected": -691.1990966796875, + "loss": 0.0235, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.863301753997803, + "rewards/margins": 12.697863578796387, + "rewards/rejected": -18.56116485595703, + "step": 420 + }, + { + "epoch": 0.2618973561430793, + "grad_norm": 32.67521286010742, + "learning_rate": 4.358178053830228e-06, + "logits/chosen": 1.1796698570251465, + "logits/rejected": 4.6011834144592285, + "logps/chosen": -574.990234375, + "logps/rejected": -920.2255859375, + "loss": 0.7592, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.079226970672607, + "rewards/margins": 8.750815391540527, + "rewards/rejected": -13.830042839050293, + "step": 421 + }, + { + "epoch": 0.2625194401244168, + "grad_norm": 0.34506696462631226, + "learning_rate": 4.368530020703934e-06, + "logits/chosen": 0.18304240703582764, + "logits/rejected": 2.8602895736694336, + "logps/chosen": -505.0839538574219, + "logps/rejected": -769.8147583007812, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4538447856903076, + "rewards/margins": 13.170971870422363, + "rewards/rejected": -16.62481689453125, + "step": 422 + }, + { + "epoch": 0.26314152410575425, + "grad_norm": 21.526031494140625, + "learning_rate": 4.37888198757764e-06, + "logits/chosen": 0.37090492248535156, + "logits/rejected": 3.465035915374756, + "logps/chosen": -380.53021240234375, + "logps/rejected": -645.5463256835938, + "loss": 0.3492, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.203587532043457, + "rewards/margins": 10.76041030883789, + "rewards/rejected": -15.963998794555664, + "step": 423 + }, + { + "epoch": 0.26376360808709176, + "grad_norm": 0.6480556130409241, + "learning_rate": 4.389233954451346e-06, + "logits/chosen": 0.4218369722366333, + "logits/rejected": 3.8173675537109375, + "logps/chosen": -460.1600036621094, + "logps/rejected": -816.0669555664062, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8497533798217773, + "rewards/margins": 16.56940460205078, + "rewards/rejected": -20.419158935546875, + "step": 424 + }, + { + "epoch": 0.2643856920684292, + "grad_norm": 0.35805878043174744, + "learning_rate": 4.399585921325052e-06, + "logits/chosen": -0.5259240865707397, + "logits/rejected": 2.797184467315674, + "logps/chosen": -321.9033203125, + "logps/rejected": -709.5282592773438, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.251986503601074, + "rewards/margins": 13.363567352294922, + "rewards/rejected": -17.615554809570312, + "step": 425 + }, + { + "epoch": 0.2650077760497667, + "grad_norm": 30.364904403686523, + "learning_rate": 4.409937888198758e-06, + "logits/chosen": 0.010838508605957031, + "logits/rejected": 3.3058197498321533, + "logps/chosen": -322.04998779296875, + "logps/rejected": -627.85107421875, + "loss": 0.3733, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6551088690757751, + "rewards/margins": 10.415206909179688, + "rewards/rejected": -11.070316314697266, + "step": 426 + }, + { + "epoch": 0.2656298600311042, + "grad_norm": 0.6932123303413391, + "learning_rate": 4.4202898550724645e-06, + "logits/chosen": -3.0709590911865234, + "logits/rejected": 1.8338202238082886, + "logps/chosen": -395.63201904296875, + "logps/rejected": -706.2110595703125, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.838165283203125, + "rewards/margins": 10.090193748474121, + "rewards/rejected": -12.928359985351562, + "step": 427 + }, + { + "epoch": 0.2662519440124417, + "grad_norm": 20.828861236572266, + "learning_rate": 4.43064182194617e-06, + "logits/chosen": 1.9217793941497803, + "logits/rejected": 3.788158893585205, + "logps/chosen": -446.7420654296875, + "logps/rejected": -647.2725219726562, + "loss": 0.1785, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.4110867977142334, + "rewards/margins": 10.278014183044434, + "rewards/rejected": -13.689101219177246, + "step": 428 + }, + { + "epoch": 0.2668740279937792, + "grad_norm": 72.134033203125, + "learning_rate": 4.4409937888198765e-06, + "logits/chosen": -1.149119257926941, + "logits/rejected": 1.603652000427246, + "logps/chosen": -605.6443481445312, + "logps/rejected": -874.0235595703125, + "loss": 2.4631, + "rewards/accuracies": 0.625, + "rewards/chosen": -6.702610969543457, + "rewards/margins": 8.99447250366211, + "rewards/rejected": -15.69708251953125, + "step": 429 + }, + { + "epoch": 0.26749611197511663, + "grad_norm": 14.545768737792969, + "learning_rate": 4.451345755693582e-06, + "logits/chosen": 0.7905763387680054, + "logits/rejected": 2.0667171478271484, + "logps/chosen": -578.0338745117188, + "logps/rejected": -808.4791259765625, + "loss": 0.0718, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.076621055603027, + "rewards/margins": 11.813894271850586, + "rewards/rejected": -16.890514373779297, + "step": 430 + }, + { + "epoch": 0.26811819595645414, + "grad_norm": 2.8750247955322266, + "learning_rate": 4.4616977225672884e-06, + "logits/chosen": 0.553899884223938, + "logits/rejected": 1.9621305465698242, + "logps/chosen": -432.06707763671875, + "logps/rejected": -614.139404296875, + "loss": 0.0182, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.893393039703369, + "rewards/margins": 7.513227462768555, + "rewards/rejected": -14.406620025634766, + "step": 431 + }, + { + "epoch": 0.2687402799377916, + "grad_norm": 0.06428180634975433, + "learning_rate": 4.472049689440994e-06, + "logits/chosen": -3.2774603366851807, + "logits/rejected": 3.1442956924438477, + "logps/chosen": -306.5682678222656, + "logps/rejected": -784.934814453125, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5532071590423584, + "rewards/margins": 16.325817108154297, + "rewards/rejected": -19.879024505615234, + "step": 432 + }, + { + "epoch": 0.2693623639191291, + "grad_norm": 0.01828506775200367, + "learning_rate": 4.4824016563146996e-06, + "logits/chosen": -2.483086347579956, + "logits/rejected": 2.2985618114471436, + "logps/chosen": -339.38409423828125, + "logps/rejected": -746.3882446289062, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.604024887084961, + "rewards/margins": 13.318676948547363, + "rewards/rejected": -16.92270278930664, + "step": 433 + }, + { + "epoch": 0.26998444790046655, + "grad_norm": 31.069183349609375, + "learning_rate": 4.492753623188406e-06, + "logits/chosen": 0.32477349042892456, + "logits/rejected": 3.5227673053741455, + "logps/chosen": -445.853515625, + "logps/rejected": -733.571044921875, + "loss": 0.383, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.151865482330322, + "rewards/margins": 11.84760856628418, + "rewards/rejected": -17.999475479125977, + "step": 434 + }, + { + "epoch": 0.27060653188180406, + "grad_norm": 0.9645976424217224, + "learning_rate": 4.503105590062112e-06, + "logits/chosen": -0.11669465899467468, + "logits/rejected": 4.126821041107178, + "logps/chosen": -368.92138671875, + "logps/rejected": -812.9979248046875, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.660503625869751, + "rewards/margins": 15.838494300842285, + "rewards/rejected": -18.498998641967773, + "step": 435 + }, + { + "epoch": 0.2712286158631415, + "grad_norm": 39.218509674072266, + "learning_rate": 4.513457556935818e-06, + "logits/chosen": 0.9734901785850525, + "logits/rejected": 2.7167110443115234, + "logps/chosen": -637.0340576171875, + "logps/rejected": -781.2739868164062, + "loss": 0.4874, + "rewards/accuracies": 0.75, + "rewards/chosen": -7.061949729919434, + "rewards/margins": 6.840909004211426, + "rewards/rejected": -13.90285873413086, + "step": 436 + }, + { + "epoch": 0.271850699844479, + "grad_norm": 0.06801696121692657, + "learning_rate": 4.523809523809524e-06, + "logits/chosen": -0.26763665676116943, + "logits/rejected": 2.6673028469085693, + "logps/chosen": -445.45440673828125, + "logps/rejected": -784.701904296875, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.659411907196045, + "rewards/margins": 14.931681632995605, + "rewards/rejected": -21.591094970703125, + "step": 437 + }, + { + "epoch": 0.27247278382581647, + "grad_norm": 36.035675048828125, + "learning_rate": 4.534161490683231e-06, + "logits/chosen": -0.16339880228042603, + "logits/rejected": 2.9149556159973145, + "logps/chosen": -583.873779296875, + "logps/rejected": -795.2122192382812, + "loss": 0.3557, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.639849662780762, + "rewards/margins": 12.784384727478027, + "rewards/rejected": -17.42423439025879, + "step": 438 + }, + { + "epoch": 0.273094867807154, + "grad_norm": 0.8856536746025085, + "learning_rate": 4.544513457556936e-06, + "logits/chosen": -1.4041645526885986, + "logits/rejected": 2.2672030925750732, + "logps/chosen": -365.6314697265625, + "logps/rejected": -686.5092163085938, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.813375949859619, + "rewards/margins": 12.718483924865723, + "rewards/rejected": -19.5318603515625, + "step": 439 + }, + { + "epoch": 0.2737169517884914, + "grad_norm": 3.0047109127044678, + "learning_rate": 4.554865424430642e-06, + "logits/chosen": 0.23409417271614075, + "logits/rejected": 3.16221284866333, + "logps/chosen": -432.616455078125, + "logps/rejected": -746.5999145507812, + "loss": 0.0287, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.304468154907227, + "rewards/margins": 14.120923042297363, + "rewards/rejected": -18.425392150878906, + "step": 440 + }, + { + "epoch": 0.27433903576982893, + "grad_norm": 17.851999282836914, + "learning_rate": 4.565217391304348e-06, + "logits/chosen": 0.34363794326782227, + "logits/rejected": 1.5882227420806885, + "logps/chosen": -498.787353515625, + "logps/rejected": -747.3587646484375, + "loss": 0.1145, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.135153770446777, + "rewards/margins": 14.027706146240234, + "rewards/rejected": -18.162860870361328, + "step": 441 + }, + { + "epoch": 0.2749611197511664, + "grad_norm": 0.005955725442618132, + "learning_rate": 4.575569358178054e-06, + "logits/chosen": -1.9308936595916748, + "logits/rejected": 4.959326267242432, + "logps/chosen": -230.3758544921875, + "logps/rejected": -687.7498779296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3154499530792236, + "rewards/margins": 15.841753959655762, + "rewards/rejected": -18.157203674316406, + "step": 442 + }, + { + "epoch": 0.2755832037325039, + "grad_norm": 6.812429904937744, + "learning_rate": 4.58592132505176e-06, + "logits/chosen": -1.1785022020339966, + "logits/rejected": 4.522409915924072, + "logps/chosen": -390.4237976074219, + "logps/rejected": -832.566650390625, + "loss": 0.029, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7003462314605713, + "rewards/margins": 13.203170776367188, + "rewards/rejected": -14.90351676940918, + "step": 443 + }, + { + "epoch": 0.27620528771384134, + "grad_norm": 0.40864428877830505, + "learning_rate": 4.596273291925466e-06, + "logits/chosen": 0.9464837312698364, + "logits/rejected": 3.3178625106811523, + "logps/chosen": -486.07080078125, + "logps/rejected": -789.6358642578125, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.384681224822998, + "rewards/margins": 13.80879020690918, + "rewards/rejected": -19.19347381591797, + "step": 444 + }, + { + "epoch": 0.27682737169517885, + "grad_norm": 9.587814331054688, + "learning_rate": 4.606625258799172e-06, + "logits/chosen": -0.3521985411643982, + "logits/rejected": 1.9633857011795044, + "logps/chosen": -317.2825622558594, + "logps/rejected": -631.1461791992188, + "loss": 0.1236, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.156929016113281, + "rewards/margins": 11.896200180053711, + "rewards/rejected": -17.053129196166992, + "step": 445 + }, + { + "epoch": 0.27744945567651635, + "grad_norm": 18.159738540649414, + "learning_rate": 4.616977225672879e-06, + "logits/chosen": -2.9285995960235596, + "logits/rejected": 2.2748818397521973, + "logps/chosen": -380.7985534667969, + "logps/rejected": -957.358154296875, + "loss": 0.1393, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.3776912689208984, + "rewards/margins": 15.898237228393555, + "rewards/rejected": -19.275928497314453, + "step": 446 + }, + { + "epoch": 0.2780715396578538, + "grad_norm": 14.753242492675781, + "learning_rate": 4.627329192546584e-06, + "logits/chosen": 0.5322920680046082, + "logits/rejected": 2.511157274246216, + "logps/chosen": -571.1580200195312, + "logps/rejected": -745.7861328125, + "loss": 0.1431, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.477001667022705, + "rewards/margins": 10.424528121948242, + "rewards/rejected": -16.901531219482422, + "step": 447 + }, + { + "epoch": 0.2786936236391913, + "grad_norm": 41.926910400390625, + "learning_rate": 4.637681159420291e-06, + "logits/chosen": -0.680780291557312, + "logits/rejected": 2.3562331199645996, + "logps/chosen": -612.2960205078125, + "logps/rejected": -949.9226684570312, + "loss": 0.6981, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.722881317138672, + "rewards/margins": 13.012408256530762, + "rewards/rejected": -25.73529052734375, + "step": 448 + }, + { + "epoch": 0.27931570762052876, + "grad_norm": 64.53588104248047, + "learning_rate": 4.648033126293996e-06, + "logits/chosen": 0.5436501502990723, + "logits/rejected": 1.3081331253051758, + "logps/chosen": -639.3142700195312, + "logps/rejected": -768.12109375, + "loss": 2.0345, + "rewards/accuracies": 0.75, + "rewards/chosen": -10.196063995361328, + "rewards/margins": 7.856705665588379, + "rewards/rejected": -18.052770614624023, + "step": 449 + }, + { + "epoch": 0.27993779160186627, + "grad_norm": 0.13056936860084534, + "learning_rate": 4.6583850931677025e-06, + "logits/chosen": -1.6725151538848877, + "logits/rejected": 1.1285243034362793, + "logps/chosen": -322.6612548828125, + "logps/rejected": -694.7567749023438, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.016662836074829, + "rewards/margins": 12.84094524383545, + "rewards/rejected": -15.857608795166016, + "step": 450 + }, + { + "epoch": 0.2805598755832037, + "grad_norm": 19.141525268554688, + "learning_rate": 4.668737060041408e-06, + "logits/chosen": -1.0159063339233398, + "logits/rejected": 2.010833501815796, + "logps/chosen": -506.7704772949219, + "logps/rejected": -794.066162109375, + "loss": 0.1208, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.9857184886932373, + "rewards/margins": 14.262092590332031, + "rewards/rejected": -17.24781036376953, + "step": 451 + }, + { + "epoch": 0.28118195956454123, + "grad_norm": 0.5353635549545288, + "learning_rate": 4.679089026915114e-06, + "logits/chosen": 1.4490152597427368, + "logits/rejected": 3.0750107765197754, + "logps/chosen": -593.1810913085938, + "logps/rejected": -867.7335815429688, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.067837238311768, + "rewards/margins": 19.971832275390625, + "rewards/rejected": -25.039670944213867, + "step": 452 + }, + { + "epoch": 0.2818040435458787, + "grad_norm": 33.27368927001953, + "learning_rate": 4.68944099378882e-06, + "logits/chosen": -0.8176746964454651, + "logits/rejected": 4.199805736541748, + "logps/chosen": -414.0248107910156, + "logps/rejected": -875.6469116210938, + "loss": 0.4048, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.070443630218506, + "rewards/margins": 17.442882537841797, + "rewards/rejected": -24.513324737548828, + "step": 453 + }, + { + "epoch": 0.2824261275272162, + "grad_norm": 0.5116642117500305, + "learning_rate": 4.6997929606625265e-06, + "logits/chosen": -0.9462630152702332, + "logits/rejected": 4.389346122741699, + "logps/chosen": -450.0084533691406, + "logps/rejected": -879.887451171875, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.727695941925049, + "rewards/margins": 14.461294174194336, + "rewards/rejected": -20.188989639282227, + "step": 454 + }, + { + "epoch": 0.28304821150855364, + "grad_norm": 0.004335819277912378, + "learning_rate": 4.710144927536232e-06, + "logits/chosen": -0.34563398361206055, + "logits/rejected": 2.4481041431427, + "logps/chosen": -492.546630859375, + "logps/rejected": -868.8828735351562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.161014556884766, + "rewards/margins": 17.945571899414062, + "rewards/rejected": -25.106586456298828, + "step": 455 + }, + { + "epoch": 0.28367029548989114, + "grad_norm": 4.237789630889893, + "learning_rate": 4.7204968944099384e-06, + "logits/chosen": 1.1531126499176025, + "logits/rejected": 3.893054485321045, + "logps/chosen": -517.807373046875, + "logps/rejected": -700.8167114257812, + "loss": 0.0226, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.180765151977539, + "rewards/margins": 10.975384712219238, + "rewards/rejected": -19.156150817871094, + "step": 456 + }, + { + "epoch": 0.2842923794712286, + "grad_norm": 29.369626998901367, + "learning_rate": 4.730848861283645e-06, + "logits/chosen": -0.09173685312271118, + "logits/rejected": 4.074847221374512, + "logps/chosen": -400.05731201171875, + "logps/rejected": -793.289794921875, + "loss": 0.3132, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.655954599380493, + "rewards/margins": 14.949386596679688, + "rewards/rejected": -17.6053409576416, + "step": 457 + }, + { + "epoch": 0.2849144634525661, + "grad_norm": 0.08755633234977722, + "learning_rate": 4.74120082815735e-06, + "logits/chosen": 2.690290927886963, + "logits/rejected": 4.742177486419678, + "logps/chosen": -631.9013671875, + "logps/rejected": -792.9278564453125, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.875566005706787, + "rewards/margins": 11.673030853271484, + "rewards/rejected": -17.548595428466797, + "step": 458 + }, + { + "epoch": 0.28553654743390355, + "grad_norm": 19.94220733642578, + "learning_rate": 4.751552795031056e-06, + "logits/chosen": -1.3704369068145752, + "logits/rejected": 3.496886968612671, + "logps/chosen": -335.9270324707031, + "logps/rejected": -774.93896484375, + "loss": 0.2088, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.90877628326416, + "rewards/margins": 16.912059783935547, + "rewards/rejected": -20.82083511352539, + "step": 459 + }, + { + "epoch": 0.28615863141524106, + "grad_norm": 19.56954574584961, + "learning_rate": 4.761904761904762e-06, + "logits/chosen": -1.0929023027420044, + "logits/rejected": 2.552992582321167, + "logps/chosen": -454.52703857421875, + "logps/rejected": -862.881103515625, + "loss": 0.0966, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.9837646484375, + "rewards/margins": 12.657257080078125, + "rewards/rejected": -20.641021728515625, + "step": 460 + }, + { + "epoch": 0.2867807153965785, + "grad_norm": 49.00531005859375, + "learning_rate": 4.772256728778468e-06, + "logits/chosen": 1.0357595682144165, + "logits/rejected": 3.4586753845214844, + "logps/chosen": -576.972900390625, + "logps/rejected": -813.3095703125, + "loss": 2.7897, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.867692947387695, + "rewards/margins": 8.180194854736328, + "rewards/rejected": -19.04788589477539, + "step": 461 + }, + { + "epoch": 0.287402799377916, + "grad_norm": 35.561561584472656, + "learning_rate": 4.782608695652174e-06, + "logits/chosen": -0.6484382748603821, + "logits/rejected": 1.8681774139404297, + "logps/chosen": -480.96697998046875, + "logps/rejected": -675.93505859375, + "loss": 0.7702, + "rewards/accuracies": 0.75, + "rewards/chosen": -7.842622756958008, + "rewards/margins": 7.973590850830078, + "rewards/rejected": -15.816213607788086, + "step": 462 + }, + { + "epoch": 0.2880248833592535, + "grad_norm": 13.397370338439941, + "learning_rate": 4.79296066252588e-06, + "logits/chosen": -0.4268653988838196, + "logits/rejected": 1.851413607597351, + "logps/chosen": -558.5866088867188, + "logps/rejected": -824.4746704101562, + "loss": 0.1014, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.017292976379395, + "rewards/margins": 15.577205657958984, + "rewards/rejected": -26.594499588012695, + "step": 463 + }, + { + "epoch": 0.288646967340591, + "grad_norm": 9.706470489501953, + "learning_rate": 4.803312629399586e-06, + "logits/chosen": 0.26902830600738525, + "logits/rejected": 2.9063727855682373, + "logps/chosen": -592.06103515625, + "logps/rejected": -944.8291015625, + "loss": 0.0521, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.126492500305176, + "rewards/margins": 15.503677368164062, + "rewards/rejected": -24.630168914794922, + "step": 464 + }, + { + "epoch": 0.2892690513219285, + "grad_norm": 10.245887756347656, + "learning_rate": 4.813664596273293e-06, + "logits/chosen": 1.1928220987319946, + "logits/rejected": 2.9932894706726074, + "logps/chosen": -420.79364013671875, + "logps/rejected": -637.9657592773438, + "loss": 0.0487, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.203188896179199, + "rewards/margins": 10.591708183288574, + "rewards/rejected": -15.794897079467773, + "step": 465 + }, + { + "epoch": 0.28989113530326593, + "grad_norm": 2.713153600692749, + "learning_rate": 4.824016563146998e-06, + "logits/chosen": -2.3040497303009033, + "logits/rejected": 2.769646406173706, + "logps/chosen": -466.1157531738281, + "logps/rejected": -865.4462890625, + "loss": 0.0142, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.633267879486084, + "rewards/margins": 12.267324447631836, + "rewards/rejected": -18.900592803955078, + "step": 466 + }, + { + "epoch": 0.29051321928460344, + "grad_norm": 10.755620002746582, + "learning_rate": 4.834368530020705e-06, + "logits/chosen": 0.9057645797729492, + "logits/rejected": 2.70548415184021, + "logps/chosen": -634.2490234375, + "logps/rejected": -898.1949462890625, + "loss": 0.0686, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.538129806518555, + "rewards/margins": 10.6325101852417, + "rewards/rejected": -17.17064094543457, + "step": 467 + }, + { + "epoch": 0.2911353032659409, + "grad_norm": 0.32542532682418823, + "learning_rate": 4.84472049689441e-06, + "logits/chosen": 0.36011022329330444, + "logits/rejected": 3.592407703399658, + "logps/chosen": -515.8699340820312, + "logps/rejected": -852.7752075195312, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.930538177490234, + "rewards/margins": 12.950520515441895, + "rewards/rejected": -21.881057739257812, + "step": 468 + }, + { + "epoch": 0.2917573872472784, + "grad_norm": 18.29994010925293, + "learning_rate": 4.855072463768117e-06, + "logits/chosen": -2.088484764099121, + "logits/rejected": 1.2744338512420654, + "logps/chosen": -515.333984375, + "logps/rejected": -878.5330810546875, + "loss": 0.0913, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.84554386138916, + "rewards/margins": 16.93134880065918, + "rewards/rejected": -25.77689552307129, + "step": 469 + }, + { + "epoch": 0.29237947122861585, + "grad_norm": 0.0330018624663353, + "learning_rate": 4.865424430641822e-06, + "logits/chosen": -1.7830653190612793, + "logits/rejected": 4.1540117263793945, + "logps/chosen": -318.3174133300781, + "logps/rejected": -939.5980224609375, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.052013397216797, + "rewards/margins": 23.681976318359375, + "rewards/rejected": -29.733993530273438, + "step": 470 + }, + { + "epoch": 0.29300155520995336, + "grad_norm": 1.8088337182998657, + "learning_rate": 4.875776397515528e-06, + "logits/chosen": -1.733601450920105, + "logits/rejected": 3.224742889404297, + "logps/chosen": -384.46563720703125, + "logps/rejected": -849.8120727539062, + "loss": 0.0121, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0474658012390137, + "rewards/margins": 15.848743438720703, + "rewards/rejected": -18.896209716796875, + "step": 471 + }, + { + "epoch": 0.2936236391912908, + "grad_norm": 36.6681022644043, + "learning_rate": 4.886128364389234e-06, + "logits/chosen": 4.0972490310668945, + "logits/rejected": 3.991466760635376, + "logps/chosen": -659.598876953125, + "logps/rejected": -745.9718017578125, + "loss": 0.9151, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.811805725097656, + "rewards/margins": 10.334466934204102, + "rewards/rejected": -21.146270751953125, + "step": 472 + }, + { + "epoch": 0.2942457231726283, + "grad_norm": 25.90192222595215, + "learning_rate": 4.896480331262941e-06, + "logits/chosen": 0.19086448848247528, + "logits/rejected": 4.012401103973389, + "logps/chosen": -527.1025390625, + "logps/rejected": -888.753173828125, + "loss": 0.2166, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.20269775390625, + "rewards/margins": 16.669485092163086, + "rewards/rejected": -26.872182846069336, + "step": 473 + }, + { + "epoch": 0.29486780715396577, + "grad_norm": 33.641075134277344, + "learning_rate": 4.906832298136646e-06, + "logits/chosen": -0.06766408681869507, + "logits/rejected": 1.6043598651885986, + "logps/chosen": -641.4823608398438, + "logps/rejected": -823.9783935546875, + "loss": 0.8575, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.947844505310059, + "rewards/margins": 13.2529878616333, + "rewards/rejected": -20.20083236694336, + "step": 474 + }, + { + "epoch": 0.2954898911353033, + "grad_norm": 28.769502639770508, + "learning_rate": 4.9171842650103525e-06, + "logits/chosen": -2.258347749710083, + "logits/rejected": 2.3731532096862793, + "logps/chosen": -292.4906311035156, + "logps/rejected": -720.4879760742188, + "loss": 0.7488, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.584536075592041, + "rewards/margins": 18.020307540893555, + "rewards/rejected": -23.60484504699707, + "step": 475 + }, + { + "epoch": 0.2961119751166407, + "grad_norm": 11.994034767150879, + "learning_rate": 4.927536231884059e-06, + "logits/chosen": -0.4436854124069214, + "logits/rejected": 3.012737989425659, + "logps/chosen": -534.578369140625, + "logps/rejected": -852.6429443359375, + "loss": 0.0835, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.41529655456543, + "rewards/margins": 11.558574676513672, + "rewards/rejected": -21.9738712310791, + "step": 476 + }, + { + "epoch": 0.29673405909797823, + "grad_norm": 43.14473342895508, + "learning_rate": 4.9378881987577645e-06, + "logits/chosen": -1.4974907636642456, + "logits/rejected": 1.5613664388656616, + "logps/chosen": -588.545654296875, + "logps/rejected": -959.09521484375, + "loss": 0.628, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.135448455810547, + "rewards/margins": 14.607860565185547, + "rewards/rejected": -23.743309020996094, + "step": 477 + }, + { + "epoch": 0.2973561430793157, + "grad_norm": 0.07286535203456879, + "learning_rate": 4.94824016563147e-06, + "logits/chosen": 2.7268476486206055, + "logits/rejected": 5.009843349456787, + "logps/chosen": -562.8731079101562, + "logps/rejected": -846.2417602539062, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.0041303634643555, + "rewards/margins": 14.70496940612793, + "rewards/rejected": -18.7091007232666, + "step": 478 + }, + { + "epoch": 0.2979782270606532, + "grad_norm": 21.970552444458008, + "learning_rate": 4.9585921325051765e-06, + "logits/chosen": -0.09169107675552368, + "logits/rejected": 3.7837038040161133, + "logps/chosen": -594.7907104492188, + "logps/rejected": -975.8360595703125, + "loss": 0.1053, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.725290298461914, + "rewards/margins": 16.457599639892578, + "rewards/rejected": -25.182891845703125, + "step": 479 + }, + { + "epoch": 0.2986003110419907, + "grad_norm": 0.8716603517532349, + "learning_rate": 4.968944099378882e-06, + "logits/chosen": -1.1426682472229004, + "logits/rejected": 3.6567468643188477, + "logps/chosen": -467.8244323730469, + "logps/rejected": -828.1360473632812, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.560232162475586, + "rewards/margins": 15.44735050201416, + "rewards/rejected": -25.007583618164062, + "step": 480 + }, + { + "epoch": 0.29922239502332815, + "grad_norm": 33.56404113769531, + "learning_rate": 4.9792960662525884e-06, + "logits/chosen": 0.7220965623855591, + "logits/rejected": 3.8607287406921387, + "logps/chosen": -571.3853149414062, + "logps/rejected": -981.12353515625, + "loss": 0.7797, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.732620239257812, + "rewards/margins": 18.23125648498535, + "rewards/rejected": -27.963876724243164, + "step": 481 + }, + { + "epoch": 0.29984447900466565, + "grad_norm": 0.08978770673274994, + "learning_rate": 4.989648033126294e-06, + "logits/chosen": -1.091248869895935, + "logits/rejected": 3.5144314765930176, + "logps/chosen": -335.8171691894531, + "logps/rejected": -747.2073364257812, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7346973419189453, + "rewards/margins": 15.793294906616211, + "rewards/rejected": -19.527992248535156, + "step": 482 + }, + { + "epoch": 0.3004665629860031, + "grad_norm": 3.013458490371704, + "learning_rate": 5e-06, + "logits/chosen": 0.6055781245231628, + "logits/rejected": 2.4230387210845947, + "logps/chosen": -551.6712036132812, + "logps/rejected": -721.6043701171875, + "loss": 0.0172, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.386186361312866, + "rewards/margins": 14.0813627243042, + "rewards/rejected": -17.46755027770996, + "step": 483 + }, + { + "epoch": 0.3010886469673406, + "grad_norm": 30.440462112426758, + "learning_rate": 4.998847395112956e-06, + "logits/chosen": -1.0507476329803467, + "logits/rejected": 2.1822104454040527, + "logps/chosen": -519.81787109375, + "logps/rejected": -819.217041015625, + "loss": 0.6089, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.95994234085083, + "rewards/margins": 10.954721450805664, + "rewards/rejected": -16.91466522216797, + "step": 484 + }, + { + "epoch": 0.30171073094867806, + "grad_norm": 0.5171369910240173, + "learning_rate": 4.997694790225911e-06, + "logits/chosen": 1.39463210105896, + "logits/rejected": 3.4383838176727295, + "logps/chosen": -607.8924560546875, + "logps/rejected": -851.253173828125, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.285219192504883, + "rewards/margins": 17.140562057495117, + "rewards/rejected": -26.42578125, + "step": 485 + }, + { + "epoch": 0.30233281493001557, + "grad_norm": 0.0036457055248320103, + "learning_rate": 4.996542185338866e-06, + "logits/chosen": 0.37686654925346375, + "logits/rejected": 3.2330291271209717, + "logps/chosen": -513.7345581054688, + "logps/rejected": -845.1243896484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9581902027130127, + "rewards/margins": 16.60371208190918, + "rewards/rejected": -19.561901092529297, + "step": 486 + }, + { + "epoch": 0.302954898911353, + "grad_norm": 19.37331199645996, + "learning_rate": 4.995389580451821e-06, + "logits/chosen": 0.7491446733474731, + "logits/rejected": 4.478757858276367, + "logps/chosen": -346.427734375, + "logps/rejected": -620.2800903320312, + "loss": 0.1591, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.718315601348877, + "rewards/margins": 9.981128692626953, + "rewards/rejected": -13.699443817138672, + "step": 487 + }, + { + "epoch": 0.30357698289269053, + "grad_norm": 30.425840377807617, + "learning_rate": 4.9942369755647765e-06, + "logits/chosen": 0.05541801452636719, + "logits/rejected": 2.531036376953125, + "logps/chosen": -521.7699584960938, + "logps/rejected": -792.4808349609375, + "loss": 0.3181, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.5134687423706055, + "rewards/margins": 13.561178207397461, + "rewards/rejected": -18.07464599609375, + "step": 488 + }, + { + "epoch": 0.304199066874028, + "grad_norm": 0.3623151481151581, + "learning_rate": 4.993084370677732e-06, + "logits/chosen": -0.7831491231918335, + "logits/rejected": 2.9344162940979004, + "logps/chosen": -465.19976806640625, + "logps/rejected": -898.609130859375, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.581158399581909, + "rewards/margins": 16.225189208984375, + "rewards/rejected": -19.806346893310547, + "step": 489 + }, + { + "epoch": 0.3048211508553655, + "grad_norm": 0.0868559256196022, + "learning_rate": 4.991931765790687e-06, + "logits/chosen": -0.7188588380813599, + "logits/rejected": 3.194019317626953, + "logps/chosen": -353.9021911621094, + "logps/rejected": -728.07373046875, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.075817823410034, + "rewards/margins": 15.577851295471191, + "rewards/rejected": -18.653669357299805, + "step": 490 + }, + { + "epoch": 0.30544323483670294, + "grad_norm": 3.0214992875698954e-05, + "learning_rate": 4.990779160903643e-06, + "logits/chosen": -4.055235862731934, + "logits/rejected": 2.879169464111328, + "logps/chosen": -296.3211975097656, + "logps/rejected": -1010.753173828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0184645652770996, + "rewards/margins": 24.33975601196289, + "rewards/rejected": -27.35822105407715, + "step": 491 + }, + { + "epoch": 0.30606531881804044, + "grad_norm": 0.01288218330591917, + "learning_rate": 4.989626556016598e-06, + "logits/chosen": -1.2633652687072754, + "logits/rejected": 3.8267054557800293, + "logps/chosen": -355.5899658203125, + "logps/rejected": -829.5449829101562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8909149169921875, + "rewards/margins": 20.595117568969727, + "rewards/rejected": -23.48603057861328, + "step": 492 + }, + { + "epoch": 0.3066874027993779, + "grad_norm": 7.490941524505615, + "learning_rate": 4.9884739511295535e-06, + "logits/chosen": 0.9076950550079346, + "logits/rejected": 3.3427844047546387, + "logps/chosen": -536.124755859375, + "logps/rejected": -888.4501953125, + "loss": 0.2884, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.489822864532471, + "rewards/margins": 21.485950469970703, + "rewards/rejected": -27.97577476501465, + "step": 493 + }, + { + "epoch": 0.3073094867807154, + "grad_norm": 11.447746276855469, + "learning_rate": 4.987321346242509e-06, + "logits/chosen": -1.4060472249984741, + "logits/rejected": 3.44411563873291, + "logps/chosen": -309.3318786621094, + "logps/rejected": -801.0556640625, + "loss": 0.0456, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.114956378936768, + "rewards/margins": 15.835704803466797, + "rewards/rejected": -20.950660705566406, + "step": 494 + }, + { + "epoch": 0.30793157076205285, + "grad_norm": 6.083962917327881, + "learning_rate": 4.986168741355464e-06, + "logits/chosen": -0.1779065728187561, + "logits/rejected": 3.5842981338500977, + "logps/chosen": -526.7737426757812, + "logps/rejected": -901.7969970703125, + "loss": 0.0261, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.71384859085083, + "rewards/margins": 17.37543487548828, + "rewards/rejected": -24.089284896850586, + "step": 495 + }, + { + "epoch": 0.30855365474339036, + "grad_norm": 0.15468864142894745, + "learning_rate": 4.985016136468419e-06, + "logits/chosen": 1.059091329574585, + "logits/rejected": 3.2236602306365967, + "logps/chosen": -624.5120849609375, + "logps/rejected": -944.41748046875, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.029439926147461, + "rewards/margins": 17.166460037231445, + "rewards/rejected": -26.19590187072754, + "step": 496 + }, + { + "epoch": 0.3091757387247278, + "grad_norm": 3.3190361136803403e-05, + "learning_rate": 4.983863531581374e-06, + "logits/chosen": -0.9245209097862244, + "logits/rejected": 3.136207103729248, + "logps/chosen": -439.8807373046875, + "logps/rejected": -856.2329711914062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.595765590667725, + "rewards/margins": 19.405052185058594, + "rewards/rejected": -26.000818252563477, + "step": 497 + }, + { + "epoch": 0.3097978227060653, + "grad_norm": 19.35166358947754, + "learning_rate": 4.98271092669433e-06, + "logits/chosen": -0.9028311967849731, + "logits/rejected": 0.7717417478561401, + "logps/chosen": -487.8577880859375, + "logps/rejected": -661.5962524414062, + "loss": 0.3424, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.07199239730835, + "rewards/margins": 11.381423950195312, + "rewards/rejected": -18.453414916992188, + "step": 498 + }, + { + "epoch": 0.3104199066874028, + "grad_norm": 5.1509809494018555, + "learning_rate": 4.981558321807285e-06, + "logits/chosen": 2.9335570335388184, + "logits/rejected": 4.180946350097656, + "logps/chosen": -523.1350708007812, + "logps/rejected": -706.580078125, + "loss": 0.0564, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.346258163452148, + "rewards/margins": 10.622045516967773, + "rewards/rejected": -20.968303680419922, + "step": 499 + }, + { + "epoch": 0.3110419906687403, + "grad_norm": 46.103485107421875, + "learning_rate": 4.98040571692024e-06, + "logits/chosen": 1.3831346035003662, + "logits/rejected": 2.8288354873657227, + "logps/chosen": -670.0927734375, + "logps/rejected": -866.8636474609375, + "loss": 2.0674, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.911185264587402, + "rewards/margins": 7.389866828918457, + "rewards/rejected": -13.30105209350586, + "step": 500 + }, + { + "epoch": 0.3116640746500778, + "grad_norm": 2.393094539642334, + "learning_rate": 4.979253112033195e-06, + "logits/chosen": -1.0027861595153809, + "logits/rejected": 4.648131847381592, + "logps/chosen": -314.61761474609375, + "logps/rejected": -713.0402221679688, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.013879776000977, + "rewards/margins": 11.831021308898926, + "rewards/rejected": -15.844901084899902, + "step": 501 + }, + { + "epoch": 0.31228615863141523, + "grad_norm": 0.044700928032398224, + "learning_rate": 4.9781005071461505e-06, + "logits/chosen": -0.232437402009964, + "logits/rejected": 1.7130168676376343, + "logps/chosen": -703.8182373046875, + "logps/rejected": -967.1251831054688, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.608842372894287, + "rewards/margins": 13.548971176147461, + "rewards/rejected": -21.157814025878906, + "step": 502 + }, + { + "epoch": 0.31290824261275274, + "grad_norm": 28.902780532836914, + "learning_rate": 4.976947902259106e-06, + "logits/chosen": 0.051264986395835876, + "logits/rejected": 3.9489521980285645, + "logps/chosen": -476.07147216796875, + "logps/rejected": -838.5808715820312, + "loss": 0.1531, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.013816833496094, + "rewards/margins": 14.595891952514648, + "rewards/rejected": -18.609710693359375, + "step": 503 + }, + { + "epoch": 0.3135303265940902, + "grad_norm": 2.9529836177825928, + "learning_rate": 4.975795297372061e-06, + "logits/chosen": 0.06787616014480591, + "logits/rejected": 3.1880979537963867, + "logps/chosen": -502.477783203125, + "logps/rejected": -835.5942993164062, + "loss": 0.0326, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.26987361907959, + "rewards/margins": 13.25092887878418, + "rewards/rejected": -16.520803451538086, + "step": 504 + }, + { + "epoch": 0.3141524105754277, + "grad_norm": 47.98479461669922, + "learning_rate": 4.974642692485017e-06, + "logits/chosen": -2.50323486328125, + "logits/rejected": 2.1609268188476562, + "logps/chosen": -439.9488830566406, + "logps/rejected": -871.5830078125, + "loss": 1.1082, + "rewards/accuracies": 0.75, + "rewards/chosen": -7.005231857299805, + "rewards/margins": 15.006608963012695, + "rewards/rejected": -22.0118408203125, + "step": 505 + }, + { + "epoch": 0.31477449455676515, + "grad_norm": 30.343387603759766, + "learning_rate": 4.973490087597972e-06, + "logits/chosen": -0.5188862085342407, + "logits/rejected": 3.0589683055877686, + "logps/chosen": -588.429443359375, + "logps/rejected": -955.3839111328125, + "loss": 0.3109, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.351261138916016, + "rewards/margins": 16.22182846069336, + "rewards/rejected": -25.573089599609375, + "step": 506 + }, + { + "epoch": 0.31539657853810266, + "grad_norm": 0.008166669867932796, + "learning_rate": 4.9723374827109275e-06, + "logits/chosen": -2.4529879093170166, + "logits/rejected": 1.081737995147705, + "logps/chosen": -450.3961181640625, + "logps/rejected": -771.6092529296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.570064067840576, + "rewards/margins": 15.121671676635742, + "rewards/rejected": -20.69173812866211, + "step": 507 + }, + { + "epoch": 0.3160186625194401, + "grad_norm": 20.37687873840332, + "learning_rate": 4.971184877823883e-06, + "logits/chosen": -1.355845332145691, + "logits/rejected": 3.8365607261657715, + "logps/chosen": -369.71142578125, + "logps/rejected": -886.731201171875, + "loss": 0.0948, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.974061965942383, + "rewards/margins": 13.553607940673828, + "rewards/rejected": -20.527667999267578, + "step": 508 + }, + { + "epoch": 0.3166407465007776, + "grad_norm": 11.929078102111816, + "learning_rate": 4.970032272936838e-06, + "logits/chosen": -0.82236647605896, + "logits/rejected": 2.8248941898345947, + "logps/chosen": -480.3670654296875, + "logps/rejected": -850.4513549804688, + "loss": 0.0676, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.097372055053711, + "rewards/margins": 17.191640853881836, + "rewards/rejected": -25.289012908935547, + "step": 509 + }, + { + "epoch": 0.31726283048211507, + "grad_norm": 0.16133515536785126, + "learning_rate": 4.968879668049793e-06, + "logits/chosen": -0.7161822319030762, + "logits/rejected": 2.622292995452881, + "logps/chosen": -376.44049072265625, + "logps/rejected": -857.2184448242188, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.603147983551025, + "rewards/margins": 22.76580810546875, + "rewards/rejected": -28.368953704833984, + "step": 510 + }, + { + "epoch": 0.3178849144634526, + "grad_norm": 26.133943557739258, + "learning_rate": 4.967727063162748e-06, + "logits/chosen": -0.21386033296585083, + "logits/rejected": 3.1011533737182617, + "logps/chosen": -486.1957092285156, + "logps/rejected": -911.2633666992188, + "loss": 0.3089, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.180964469909668, + "rewards/margins": 12.32599925994873, + "rewards/rejected": -21.50696563720703, + "step": 511 + }, + { + "epoch": 0.31850699844479, + "grad_norm": 0.002745468867942691, + "learning_rate": 4.966574458275704e-06, + "logits/chosen": -1.3007858991622925, + "logits/rejected": 2.079380989074707, + "logps/chosen": -366.24652099609375, + "logps/rejected": -766.4879150390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.880084037780762, + "rewards/margins": 16.512948989868164, + "rewards/rejected": -22.393033981323242, + "step": 512 + }, + { + "epoch": 0.31912908242612753, + "grad_norm": 0.6302774548530579, + "learning_rate": 4.965421853388659e-06, + "logits/chosen": -2.278191089630127, + "logits/rejected": 2.4888484477996826, + "logps/chosen": -343.87042236328125, + "logps/rejected": -755.074462890625, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.464818477630615, + "rewards/margins": 15.757192611694336, + "rewards/rejected": -22.222009658813477, + "step": 513 + }, + { + "epoch": 0.319751166407465, + "grad_norm": 11.727120399475098, + "learning_rate": 4.964269248501614e-06, + "logits/chosen": 1.27705717086792, + "logits/rejected": 3.1875715255737305, + "logps/chosen": -657.3870849609375, + "logps/rejected": -968.127685546875, + "loss": 0.0597, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.210622787475586, + "rewards/margins": 17.608915328979492, + "rewards/rejected": -24.819538116455078, + "step": 514 + }, + { + "epoch": 0.3203732503888025, + "grad_norm": 0.0041059167124331, + "learning_rate": 4.963116643614569e-06, + "logits/chosen": -0.5016254782676697, + "logits/rejected": 2.372769832611084, + "logps/chosen": -536.0447387695312, + "logps/rejected": -931.0604858398438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.73209285736084, + "rewards/margins": 19.664264678955078, + "rewards/rejected": -27.396358489990234, + "step": 515 + }, + { + "epoch": 0.32099533437014, + "grad_norm": 0.01308477483689785, + "learning_rate": 4.9619640387275245e-06, + "logits/chosen": 0.8085079789161682, + "logits/rejected": 1.2257745265960693, + "logps/chosen": -672.284912109375, + "logps/rejected": -892.2509765625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.008455276489258, + "rewards/margins": 18.847637176513672, + "rewards/rejected": -30.85609245300293, + "step": 516 + }, + { + "epoch": 0.32161741835147745, + "grad_norm": 0.041552409529685974, + "learning_rate": 4.96081143384048e-06, + "logits/chosen": 2.4713146686553955, + "logits/rejected": 4.937995910644531, + "logps/chosen": -600.8577880859375, + "logps/rejected": -997.2860107421875, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.211318969726562, + "rewards/margins": 22.59845542907715, + "rewards/rejected": -32.809776306152344, + "step": 517 + }, + { + "epoch": 0.32223950233281495, + "grad_norm": 0.008153838105499744, + "learning_rate": 4.959658828953435e-06, + "logits/chosen": 0.0751684308052063, + "logits/rejected": 3.57389760017395, + "logps/chosen": -547.2952880859375, + "logps/rejected": -994.4802856445312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.388373851776123, + "rewards/margins": 21.344240188598633, + "rewards/rejected": -28.73261260986328, + "step": 518 + }, + { + "epoch": 0.3228615863141524, + "grad_norm": 30.417959213256836, + "learning_rate": 4.95850622406639e-06, + "logits/chosen": 0.2802954316139221, + "logits/rejected": 2.9028139114379883, + "logps/chosen": -483.03717041015625, + "logps/rejected": -740.7561645507812, + "loss": 0.8714, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.424942016601562, + "rewards/margins": 13.649709701538086, + "rewards/rejected": -23.07465171813965, + "step": 519 + }, + { + "epoch": 0.3234836702954899, + "grad_norm": 13.999482154846191, + "learning_rate": 4.957353619179346e-06, + "logits/chosen": 1.0957525968551636, + "logits/rejected": 4.062682151794434, + "logps/chosen": -607.4361572265625, + "logps/rejected": -969.5930786132812, + "loss": 0.0773, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.513922691345215, + "rewards/margins": 19.56466293334961, + "rewards/rejected": -31.07858657836914, + "step": 520 + }, + { + "epoch": 0.32410575427682736, + "grad_norm": 0.07647275179624557, + "learning_rate": 4.9562010142923015e-06, + "logits/chosen": -0.7335138320922852, + "logits/rejected": 3.0744705200195312, + "logps/chosen": -568.8802490234375, + "logps/rejected": -1019.5764770507812, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.956428527832031, + "rewards/margins": 16.48337173461914, + "rewards/rejected": -26.43979835510254, + "step": 521 + }, + { + "epoch": 0.32472783825816487, + "grad_norm": 1.4716237783432007, + "learning_rate": 4.955048409405257e-06, + "logits/chosen": -1.4096927642822266, + "logits/rejected": 1.4332243204116821, + "logps/chosen": -559.1268920898438, + "logps/rejected": -950.1163330078125, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.92173957824707, + "rewards/margins": 16.359594345092773, + "rewards/rejected": -25.281333923339844, + "step": 522 + }, + { + "epoch": 0.3253499222395023, + "grad_norm": 0.0002848071453627199, + "learning_rate": 4.953895804518212e-06, + "logits/chosen": 1.6974499225616455, + "logits/rejected": 4.660102367401123, + "logps/chosen": -540.9886474609375, + "logps/rejected": -895.8781127929688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.400766372680664, + "rewards/margins": 20.902734756469727, + "rewards/rejected": -31.30350112915039, + "step": 523 + }, + { + "epoch": 0.3259720062208398, + "grad_norm": 1.21076500415802, + "learning_rate": 4.952743199631167e-06, + "logits/chosen": -1.0652494430541992, + "logits/rejected": 2.6607961654663086, + "logps/chosen": -571.4052124023438, + "logps/rejected": -920.467041015625, + "loss": 0.0109, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.149827003479004, + "rewards/margins": 22.16531753540039, + "rewards/rejected": -32.315147399902344, + "step": 524 + }, + { + "epoch": 0.3265940902021773, + "grad_norm": 5.951406002044678, + "learning_rate": 4.951590594744122e-06, + "logits/chosen": -0.5077402591705322, + "logits/rejected": 0.5942150950431824, + "logps/chosen": -651.2573852539062, + "logps/rejected": -814.20556640625, + "loss": 0.019, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.089807510375977, + "rewards/margins": 14.319589614868164, + "rewards/rejected": -27.40939712524414, + "step": 525 + }, + { + "epoch": 0.3272161741835148, + "grad_norm": 26.456708908081055, + "learning_rate": 4.950437989857078e-06, + "logits/chosen": 2.108860969543457, + "logits/rejected": 3.822720527648926, + "logps/chosen": -722.9005126953125, + "logps/rejected": -971.195556640625, + "loss": 0.2687, + "rewards/accuracies": 0.75, + "rewards/chosen": -11.09481430053711, + "rewards/margins": 11.886865615844727, + "rewards/rejected": -22.981679916381836, + "step": 526 + }, + { + "epoch": 0.32783825816485224, + "grad_norm": 34.449256896972656, + "learning_rate": 4.949285384970033e-06, + "logits/chosen": 0.3040791153907776, + "logits/rejected": 3.248950481414795, + "logps/chosen": -527.2380981445312, + "logps/rejected": -877.9301147460938, + "loss": 0.2376, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.19322681427002, + "rewards/margins": 14.885248184204102, + "rewards/rejected": -26.078475952148438, + "step": 527 + }, + { + "epoch": 0.32846034214618974, + "grad_norm": 0.7171943783760071, + "learning_rate": 4.948132780082988e-06, + "logits/chosen": 0.1580321490764618, + "logits/rejected": 1.8069722652435303, + "logps/chosen": -633.6151123046875, + "logps/rejected": -865.1552734375, + "loss": 0.0135, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.61785888671875, + "rewards/margins": 16.59424591064453, + "rewards/rejected": -29.21210479736328, + "step": 528 + }, + { + "epoch": 0.3290824261275272, + "grad_norm": 32.90729904174805, + "learning_rate": 4.946980175195943e-06, + "logits/chosen": -0.4327443242073059, + "logits/rejected": 3.3330399990081787, + "logps/chosen": -536.70166015625, + "logps/rejected": -927.87255859375, + "loss": 0.2303, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.0555419921875, + "rewards/margins": 17.280601501464844, + "rewards/rejected": -27.33614158630371, + "step": 529 + }, + { + "epoch": 0.3297045101088647, + "grad_norm": 0.0027257169131189585, + "learning_rate": 4.9458275703088985e-06, + "logits/chosen": 2.5637218952178955, + "logits/rejected": 3.3361709117889404, + "logps/chosen": -708.5892333984375, + "logps/rejected": -922.7438354492188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.294085502624512, + "rewards/margins": 21.062084197998047, + "rewards/rejected": -32.356170654296875, + "step": 530 + }, + { + "epoch": 0.33032659409020215, + "grad_norm": 0.9086652994155884, + "learning_rate": 4.944674965421854e-06, + "logits/chosen": -0.039085566997528076, + "logits/rejected": 3.6807267665863037, + "logps/chosen": -591.6583862304688, + "logps/rejected": -1018.6286010742188, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.807241439819336, + "rewards/margins": 18.556652069091797, + "rewards/rejected": -30.3638916015625, + "step": 531 + }, + { + "epoch": 0.33094867807153966, + "grad_norm": 0.13630260527133942, + "learning_rate": 4.943522360534809e-06, + "logits/chosen": 0.1960965096950531, + "logits/rejected": 2.4245996475219727, + "logps/chosen": -623.849609375, + "logps/rejected": -828.4188842773438, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.50074577331543, + "rewards/margins": 13.00462818145752, + "rewards/rejected": -27.505373001098633, + "step": 532 + }, + { + "epoch": 0.33157076205287717, + "grad_norm": 0.0008395586046390235, + "learning_rate": 4.942369755647764e-06, + "logits/chosen": 0.5557337403297424, + "logits/rejected": 2.7182116508483887, + "logps/chosen": -707.032958984375, + "logps/rejected": -1065.6031494140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.236682891845703, + "rewards/margins": 24.51875114440918, + "rewards/rejected": -34.755435943603516, + "step": 533 + }, + { + "epoch": 0.3321928460342146, + "grad_norm": 47.25418472290039, + "learning_rate": 4.94121715076072e-06, + "logits/chosen": -0.8992422819137573, + "logits/rejected": 2.3601064682006836, + "logps/chosen": -526.9442138671875, + "logps/rejected": -885.7796020507812, + "loss": 0.2361, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.016450881958008, + "rewards/margins": 22.274335861206055, + "rewards/rejected": -34.29078674316406, + "step": 534 + }, + { + "epoch": 0.3328149300155521, + "grad_norm": 28.037433624267578, + "learning_rate": 4.9400645458736755e-06, + "logits/chosen": -0.71573805809021, + "logits/rejected": 3.0357680320739746, + "logps/chosen": -526.3771362304688, + "logps/rejected": -831.0687255859375, + "loss": 0.8585, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.930638313293457, + "rewards/margins": 12.199329376220703, + "rewards/rejected": -21.12996482849121, + "step": 535 + }, + { + "epoch": 0.3334370139968896, + "grad_norm": 15.79422664642334, + "learning_rate": 4.938911940986631e-06, + "logits/chosen": -2.4950482845306396, + "logits/rejected": 0.818975567817688, + "logps/chosen": -389.66387939453125, + "logps/rejected": -746.3770751953125, + "loss": 0.3818, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.482288360595703, + "rewards/margins": 17.772830963134766, + "rewards/rejected": -26.25511932373047, + "step": 536 + }, + { + "epoch": 0.3340590979782271, + "grad_norm": 21.64934539794922, + "learning_rate": 4.937759336099586e-06, + "logits/chosen": -1.2507398128509521, + "logits/rejected": 0.7657705545425415, + "logps/chosen": -407.2629699707031, + "logps/rejected": -652.3349609375, + "loss": 0.118, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.758617877960205, + "rewards/margins": 11.0945405960083, + "rewards/rejected": -18.853158950805664, + "step": 537 + }, + { + "epoch": 0.33468118195956453, + "grad_norm": 11.210826873779297, + "learning_rate": 4.936606731212541e-06, + "logits/chosen": -1.8799057006835938, + "logits/rejected": -0.9870352149009705, + "logps/chosen": -464.33892822265625, + "logps/rejected": -667.3587646484375, + "loss": 0.0587, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.691595077514648, + "rewards/margins": 13.272488594055176, + "rewards/rejected": -23.96408462524414, + "step": 538 + }, + { + "epoch": 0.33530326594090204, + "grad_norm": 0.03116637095808983, + "learning_rate": 4.935454126325496e-06, + "logits/chosen": -0.41563552618026733, + "logits/rejected": 1.657536506652832, + "logps/chosen": -451.22296142578125, + "logps/rejected": -700.4354248046875, + "loss": 0.0867, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.579830169677734, + "rewards/margins": 17.299863815307617, + "rewards/rejected": -29.87969398498535, + "step": 539 + }, + { + "epoch": 0.3359253499222395, + "grad_norm": 48.413124084472656, + "learning_rate": 4.934301521438452e-06, + "logits/chosen": -1.1306934356689453, + "logits/rejected": 1.482694387435913, + "logps/chosen": -490.3797912597656, + "logps/rejected": -851.402099609375, + "loss": 0.7104, + "rewards/accuracies": 0.75, + "rewards/chosen": -10.867384910583496, + "rewards/margins": 19.34979248046875, + "rewards/rejected": -30.21717643737793, + "step": 540 + }, + { + "epoch": 0.336547433903577, + "grad_norm": 0.10275397449731827, + "learning_rate": 4.933148916551407e-06, + "logits/chosen": -2.942965269088745, + "logits/rejected": 1.474172830581665, + "logps/chosen": -426.2244567871094, + "logps/rejected": -927.9111328125, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.147480964660645, + "rewards/margins": 25.321353912353516, + "rewards/rejected": -35.468833923339844, + "step": 541 + }, + { + "epoch": 0.33716951788491445, + "grad_norm": 41.10462951660156, + "learning_rate": 4.931996311664362e-06, + "logits/chosen": -1.3480467796325684, + "logits/rejected": 2.5357439517974854, + "logps/chosen": -555.9721069335938, + "logps/rejected": -968.4746704101562, + "loss": 0.4043, + "rewards/accuracies": 0.875, + "rewards/chosen": -14.855534553527832, + "rewards/margins": 16.756187438964844, + "rewards/rejected": -31.61172103881836, + "step": 542 + }, + { + "epoch": 0.33779160186625196, + "grad_norm": 0.6125503182411194, + "learning_rate": 4.930843706777317e-06, + "logits/chosen": -0.027516961097717285, + "logits/rejected": 3.6979832649230957, + "logps/chosen": -636.796875, + "logps/rejected": -1036.647705078125, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.932252883911133, + "rewards/margins": 19.813295364379883, + "rewards/rejected": -35.745548248291016, + "step": 543 + }, + { + "epoch": 0.3384136858475894, + "grad_norm": 2.54610538482666, + "learning_rate": 4.9296911018902725e-06, + "logits/chosen": -3.5853943824768066, + "logits/rejected": 1.622105360031128, + "logps/chosen": -367.0932312011719, + "logps/rejected": -856.6544189453125, + "loss": 0.0742, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.336380004882812, + "rewards/margins": 14.396159172058105, + "rewards/rejected": -23.732540130615234, + "step": 544 + }, + { + "epoch": 0.3390357698289269, + "grad_norm": 22.747459411621094, + "learning_rate": 4.928538497003228e-06, + "logits/chosen": -1.3813132047653198, + "logits/rejected": 3.102327585220337, + "logps/chosen": -441.0496826171875, + "logps/rejected": -810.7593994140625, + "loss": 0.2061, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.9570817947387695, + "rewards/margins": 14.54705810546875, + "rewards/rejected": -22.504138946533203, + "step": 545 + }, + { + "epoch": 0.33965785381026437, + "grad_norm": 0.5766253471374512, + "learning_rate": 4.927385892116183e-06, + "logits/chosen": -1.2130874395370483, + "logits/rejected": 3.091094970703125, + "logps/chosen": -418.65997314453125, + "logps/rejected": -918.8798828125, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.981672286987305, + "rewards/margins": 25.764217376708984, + "rewards/rejected": -33.745887756347656, + "step": 546 + }, + { + "epoch": 0.34027993779160187, + "grad_norm": 0.008549829944968224, + "learning_rate": 4.926233287229138e-06, + "logits/chosen": -1.421816110610962, + "logits/rejected": 2.4775519371032715, + "logps/chosen": -563.928466796875, + "logps/rejected": -1086.707275390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.792487144470215, + "rewards/margins": 27.195476531982422, + "rewards/rejected": -37.98796463012695, + "step": 547 + }, + { + "epoch": 0.3409020217729393, + "grad_norm": 0.0007273833034560084, + "learning_rate": 4.925080682342093e-06, + "logits/chosen": -1.8788166046142578, + "logits/rejected": 3.0953235626220703, + "logps/chosen": -372.55548095703125, + "logps/rejected": -888.577392578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.427706718444824, + "rewards/margins": 20.497804641723633, + "rewards/rejected": -27.92551040649414, + "step": 548 + }, + { + "epoch": 0.34152410575427683, + "grad_norm": 0.002993043977767229, + "learning_rate": 4.9239280774550495e-06, + "logits/chosen": 1.5593526363372803, + "logits/rejected": 4.229238510131836, + "logps/chosen": -595.6698608398438, + "logps/rejected": -1023.314697265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.270602226257324, + "rewards/margins": 23.1187801361084, + "rewards/rejected": -35.38938522338867, + "step": 549 + }, + { + "epoch": 0.3421461897356143, + "grad_norm": 0.0029460815712809563, + "learning_rate": 4.922775472568005e-06, + "logits/chosen": -2.3387439250946045, + "logits/rejected": 1.7299787998199463, + "logps/chosen": -413.709228515625, + "logps/rejected": -912.1287841796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.157748222351074, + "rewards/margins": 24.370506286621094, + "rewards/rejected": -32.528255462646484, + "step": 550 + }, + { + "epoch": 0.3427682737169518, + "grad_norm": 0.0008374156313948333, + "learning_rate": 4.921622867680959e-06, + "logits/chosen": -0.32705816626548767, + "logits/rejected": 2.5051586627960205, + "logps/chosen": -589.31884765625, + "logps/rejected": -961.8628540039062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.971508026123047, + "rewards/margins": 22.370750427246094, + "rewards/rejected": -37.34225845336914, + "step": 551 + }, + { + "epoch": 0.3433903576982893, + "grad_norm": 33.720123291015625, + "learning_rate": 4.920470262793914e-06, + "logits/chosen": -2.241405487060547, + "logits/rejected": 3.023292064666748, + "logps/chosen": -447.4491882324219, + "logps/rejected": -1021.66015625, + "loss": 0.4903, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.648796081542969, + "rewards/margins": 23.02674674987793, + "rewards/rejected": -31.675540924072266, + "step": 552 + }, + { + "epoch": 0.34401244167962675, + "grad_norm": 39.974979400634766, + "learning_rate": 4.9193176579068695e-06, + "logits/chosen": 1.1962766647338867, + "logits/rejected": 3.599783182144165, + "logps/chosen": -549.6170654296875, + "logps/rejected": -786.75341796875, + "loss": 1.0005, + "rewards/accuracies": 0.75, + "rewards/chosen": -12.80546760559082, + "rewards/margins": 13.706472396850586, + "rewards/rejected": -26.511940002441406, + "step": 553 + }, + { + "epoch": 0.34463452566096425, + "grad_norm": 17.803882598876953, + "learning_rate": 4.918165053019825e-06, + "logits/chosen": 1.2538249492645264, + "logits/rejected": 1.7552330493927002, + "logps/chosen": -711.7666625976562, + "logps/rejected": -828.937255859375, + "loss": 0.097, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.915160179138184, + "rewards/margins": 10.584980964660645, + "rewards/rejected": -25.500139236450195, + "step": 554 + }, + { + "epoch": 0.3452566096423017, + "grad_norm": 3.103040933609009, + "learning_rate": 4.91701244813278e-06, + "logits/chosen": 1.5487773418426514, + "logits/rejected": 3.0372424125671387, + "logps/chosen": -642.92529296875, + "logps/rejected": -902.142333984375, + "loss": 0.013, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.582810401916504, + "rewards/margins": 19.856678009033203, + "rewards/rejected": -33.43948745727539, + "step": 555 + }, + { + "epoch": 0.3458786936236392, + "grad_norm": 0.014518902637064457, + "learning_rate": 4.915859843245735e-06, + "logits/chosen": -0.48636725544929504, + "logits/rejected": 3.007913589477539, + "logps/chosen": -254.45782470703125, + "logps/rejected": -726.4654541015625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.3389573097229, + "rewards/margins": 20.198001861572266, + "rewards/rejected": -25.53696060180664, + "step": 556 + }, + { + "epoch": 0.34650077760497666, + "grad_norm": 0.005257305223494768, + "learning_rate": 4.9147072383586904e-06, + "logits/chosen": 0.3584858775138855, + "logits/rejected": 2.2523038387298584, + "logps/chosen": -760.4193115234375, + "logps/rejected": -1081.203369140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.717350006103516, + "rewards/margins": 21.49951934814453, + "rewards/rejected": -39.21686935424805, + "step": 557 + }, + { + "epoch": 0.34712286158631417, + "grad_norm": 1.6701537370681763, + "learning_rate": 4.913554633471646e-06, + "logits/chosen": -2.2826242446899414, + "logits/rejected": 0.3551350235939026, + "logps/chosen": -442.69097900390625, + "logps/rejected": -697.0272216796875, + "loss": 0.0179, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.382549285888672, + "rewards/margins": 13.87314510345459, + "rewards/rejected": -23.255693435668945, + "step": 558 + }, + { + "epoch": 0.3477449455676516, + "grad_norm": 21.336896896362305, + "learning_rate": 4.912402028584602e-06, + "logits/chosen": -0.32555997371673584, + "logits/rejected": 0.8113161325454712, + "logps/chosen": -569.210693359375, + "logps/rejected": -764.329345703125, + "loss": 0.1424, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.628195762634277, + "rewards/margins": 14.781917572021484, + "rewards/rejected": -25.410112380981445, + "step": 559 + }, + { + "epoch": 0.3483670295489891, + "grad_norm": 0.01587485708296299, + "learning_rate": 4.911249423697557e-06, + "logits/chosen": -0.7397094964981079, + "logits/rejected": 3.212367057800293, + "logps/chosen": -520.3375244140625, + "logps/rejected": -964.132080078125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.017093658447266, + "rewards/margins": 24.597700119018555, + "rewards/rejected": -34.61479187011719, + "step": 560 + }, + { + "epoch": 0.3489891135303266, + "grad_norm": 0.06691177934408188, + "learning_rate": 4.910096818810512e-06, + "logits/chosen": 0.5737409591674805, + "logits/rejected": 3.6291799545288086, + "logps/chosen": -487.9759521484375, + "logps/rejected": -921.3861083984375, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.400189399719238, + "rewards/margins": 21.333316802978516, + "rewards/rejected": -30.733505249023438, + "step": 561 + }, + { + "epoch": 0.3496111975116641, + "grad_norm": 37.446746826171875, + "learning_rate": 4.908944213923467e-06, + "logits/chosen": 1.1964095830917358, + "logits/rejected": 1.6312025785446167, + "logps/chosen": -680.3416137695312, + "logps/rejected": -908.3619384765625, + "loss": 0.4519, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.171266555786133, + "rewards/margins": 16.99323272705078, + "rewards/rejected": -30.164499282836914, + "step": 562 + }, + { + "epoch": 0.35023328149300154, + "grad_norm": 0.03469838947057724, + "learning_rate": 4.907791609036423e-06, + "logits/chosen": -1.0637686252593994, + "logits/rejected": 2.611809492111206, + "logps/chosen": -328.0810546875, + "logps/rejected": -687.5595092773438, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.062766075134277, + "rewards/margins": 14.250360488891602, + "rewards/rejected": -21.313125610351562, + "step": 563 + }, + { + "epoch": 0.35085536547433904, + "grad_norm": 3.72678804397583, + "learning_rate": 4.906639004149378e-06, + "logits/chosen": 2.042522430419922, + "logits/rejected": 3.486320734024048, + "logps/chosen": -773.8923950195312, + "logps/rejected": -1025.49560546875, + "loss": 0.0135, + "rewards/accuracies": 1.0, + "rewards/chosen": -18.240882873535156, + "rewards/margins": 17.987770080566406, + "rewards/rejected": -36.22865295410156, + "step": 564 + }, + { + "epoch": 0.3514774494556765, + "grad_norm": 1.5056075426400639e-05, + "learning_rate": 4.905486399262333e-06, + "logits/chosen": -2.78934383392334, + "logits/rejected": 3.307797431945801, + "logps/chosen": -389.1837158203125, + "logps/rejected": -981.9088745117188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.682865142822266, + "rewards/margins": 26.758037567138672, + "rewards/rejected": -33.44090270996094, + "step": 565 + }, + { + "epoch": 0.352099533437014, + "grad_norm": 0.0001173518830910325, + "learning_rate": 4.904333794375288e-06, + "logits/chosen": -3.6703920364379883, + "logits/rejected": 2.331336498260498, + "logps/chosen": -475.85528564453125, + "logps/rejected": -1081.368408203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.64018726348877, + "rewards/margins": 30.549610137939453, + "rewards/rejected": -39.18980026245117, + "step": 566 + }, + { + "epoch": 0.35272161741835145, + "grad_norm": 2.92313551902771, + "learning_rate": 4.9031811894882435e-06, + "logits/chosen": 1.1690174341201782, + "logits/rejected": 3.2298545837402344, + "logps/chosen": -529.1205444335938, + "logps/rejected": -833.876708984375, + "loss": 0.0113, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.099763870239258, + "rewards/margins": 16.231340408325195, + "rewards/rejected": -30.331104278564453, + "step": 567 + }, + { + "epoch": 0.35334370139968896, + "grad_norm": 0.03307843208312988, + "learning_rate": 4.902028584601199e-06, + "logits/chosen": -0.4311780333518982, + "logits/rejected": 2.940704345703125, + "logps/chosen": -413.35516357421875, + "logps/rejected": -888.8939208984375, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.529521942138672, + "rewards/margins": 24.85467529296875, + "rewards/rejected": -33.38420104980469, + "step": 568 + }, + { + "epoch": 0.35396578538102647, + "grad_norm": 26.350021362304688, + "learning_rate": 4.900875979714154e-06, + "logits/chosen": 0.2001965045928955, + "logits/rejected": 2.413053512573242, + "logps/chosen": -546.7880859375, + "logps/rejected": -866.169677734375, + "loss": 0.2851, + "rewards/accuracies": 0.875, + "rewards/chosen": -14.412593841552734, + "rewards/margins": 13.435179710388184, + "rewards/rejected": -27.847774505615234, + "step": 569 + }, + { + "epoch": 0.3545878693623639, + "grad_norm": 37.97916793823242, + "learning_rate": 4.899723374827109e-06, + "logits/chosen": 1.0915520191192627, + "logits/rejected": 3.6510167121887207, + "logps/chosen": -701.7742919921875, + "logps/rejected": -1025.33984375, + "loss": 0.8505, + "rewards/accuracies": 0.875, + "rewards/chosen": -15.174086570739746, + "rewards/margins": 18.551523208618164, + "rewards/rejected": -33.725608825683594, + "step": 570 + }, + { + "epoch": 0.3552099533437014, + "grad_norm": 12.117103576660156, + "learning_rate": 4.898570769940064e-06, + "logits/chosen": -0.11754333972930908, + "logits/rejected": 1.6310240030288696, + "logps/chosen": -532.1516723632812, + "logps/rejected": -736.618896484375, + "loss": 0.1422, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.331872940063477, + "rewards/margins": 12.47451114654541, + "rewards/rejected": -23.806385040283203, + "step": 571 + }, + { + "epoch": 0.3558320373250389, + "grad_norm": 0.04478954151272774, + "learning_rate": 4.89741816505302e-06, + "logits/chosen": -2.8400495052337646, + "logits/rejected": 1.9938653707504272, + "logps/chosen": -462.40533447265625, + "logps/rejected": -920.4817504882812, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.620899200439453, + "rewards/margins": 25.308086395263672, + "rewards/rejected": -32.928985595703125, + "step": 572 + }, + { + "epoch": 0.3564541213063764, + "grad_norm": 4.206796169281006, + "learning_rate": 4.896265560165976e-06, + "logits/chosen": 1.6942188739776611, + "logits/rejected": 3.390136241912842, + "logps/chosen": -653.3523559570312, + "logps/rejected": -950.9197387695312, + "loss": 0.0189, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.680431365966797, + "rewards/margins": 14.628912925720215, + "rewards/rejected": -25.309345245361328, + "step": 573 + }, + { + "epoch": 0.35707620528771383, + "grad_norm": 37.8600959777832, + "learning_rate": 4.895112955278931e-06, + "logits/chosen": 0.7583746910095215, + "logits/rejected": 1.5877070426940918, + "logps/chosen": -595.6297607421875, + "logps/rejected": -756.054931640625, + "loss": 1.2698, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.754125595092773, + "rewards/margins": 11.235215187072754, + "rewards/rejected": -22.989341735839844, + "step": 574 + }, + { + "epoch": 0.35769828926905134, + "grad_norm": 0.011185353621840477, + "learning_rate": 4.893960350391886e-06, + "logits/chosen": -2.070542097091675, + "logits/rejected": 3.7503252029418945, + "logps/chosen": -415.4565124511719, + "logps/rejected": -957.781494140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.346330165863037, + "rewards/margins": 24.029987335205078, + "rewards/rejected": -31.376317977905273, + "step": 575 + }, + { + "epoch": 0.3583203732503888, + "grad_norm": 0.3578238785266876, + "learning_rate": 4.892807745504841e-06, + "logits/chosen": -1.425378680229187, + "logits/rejected": 2.926992177963257, + "logps/chosen": -356.78533935546875, + "logps/rejected": -807.888671875, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.3499860763549805, + "rewards/margins": 21.162736892700195, + "rewards/rejected": -26.51272201538086, + "step": 576 + }, + { + "epoch": 0.3589424572317263, + "grad_norm": 0.49534156918525696, + "learning_rate": 4.891655140617797e-06, + "logits/chosen": -1.4474272727966309, + "logits/rejected": 2.1497583389282227, + "logps/chosen": -323.8979187011719, + "logps/rejected": -678.140625, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.529425144195557, + "rewards/margins": 16.806941986083984, + "rewards/rejected": -22.336368560791016, + "step": 577 + }, + { + "epoch": 0.35956454121306375, + "grad_norm": 0.014279196970164776, + "learning_rate": 4.890502535730752e-06, + "logits/chosen": 1.0501610040664673, + "logits/rejected": 3.710789203643799, + "logps/chosen": -628.9027099609375, + "logps/rejected": -967.5325927734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.289298057556152, + "rewards/margins": 17.977649688720703, + "rewards/rejected": -26.266950607299805, + "step": 578 + }, + { + "epoch": 0.36018662519440126, + "grad_norm": 67.1640625, + "learning_rate": 4.889349930843707e-06, + "logits/chosen": -0.08438020944595337, + "logits/rejected": 0.6998257637023926, + "logps/chosen": -620.3289794921875, + "logps/rejected": -855.9243774414062, + "loss": 0.8311, + "rewards/accuracies": 0.75, + "rewards/chosen": -12.778589248657227, + "rewards/margins": 15.465974807739258, + "rewards/rejected": -28.244564056396484, + "step": 579 + }, + { + "epoch": 0.3608087091757387, + "grad_norm": 35.90826416015625, + "learning_rate": 4.888197325956662e-06, + "logits/chosen": -3.391622543334961, + "logits/rejected": -0.005591452121734619, + "logps/chosen": -414.13525390625, + "logps/rejected": -752.8704223632812, + "loss": 0.2604, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.338963031768799, + "rewards/margins": 19.059038162231445, + "rewards/rejected": -25.39800262451172, + "step": 580 + }, + { + "epoch": 0.3614307931570762, + "grad_norm": 0.0016980888321995735, + "learning_rate": 4.8870447210696175e-06, + "logits/chosen": 1.6687289476394653, + "logits/rejected": 2.0782861709594727, + "logps/chosen": -697.9501342773438, + "logps/rejected": -933.7528686523438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.147013664245605, + "rewards/margins": 22.531450271606445, + "rewards/rejected": -30.678462982177734, + "step": 581 + }, + { + "epoch": 0.36205287713841366, + "grad_norm": 30.825054168701172, + "learning_rate": 4.885892116182573e-06, + "logits/chosen": -0.3926219344139099, + "logits/rejected": 1.7349659204483032, + "logps/chosen": -594.2177734375, + "logps/rejected": -873.021240234375, + "loss": 0.2269, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.434806823730469, + "rewards/margins": 14.307239532470703, + "rewards/rejected": -24.742048263549805, + "step": 582 + }, + { + "epoch": 0.36267496111975117, + "grad_norm": 13.518712043762207, + "learning_rate": 4.884739511295528e-06, + "logits/chosen": -0.2923990786075592, + "logits/rejected": 0.8742029070854187, + "logps/chosen": -729.5030517578125, + "logps/rejected": -917.1492309570312, + "loss": 0.0831, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.968738555908203, + "rewards/margins": 17.587398529052734, + "rewards/rejected": -31.556137084960938, + "step": 583 + }, + { + "epoch": 0.3632970451010886, + "grad_norm": 3.0330076217651367, + "learning_rate": 4.883586906408483e-06, + "logits/chosen": 0.3311905860900879, + "logits/rejected": 3.680189847946167, + "logps/chosen": -561.0177001953125, + "logps/rejected": -898.9573364257812, + "loss": 0.0146, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.7169189453125, + "rewards/margins": 16.990234375, + "rewards/rejected": -27.707151412963867, + "step": 584 + }, + { + "epoch": 0.36391912908242613, + "grad_norm": 38.674537658691406, + "learning_rate": 4.882434301521438e-06, + "logits/chosen": 1.2607179880142212, + "logits/rejected": 4.454874038696289, + "logps/chosen": -765.2697143554688, + "logps/rejected": -1076.0303955078125, + "loss": 0.5958, + "rewards/accuracies": 0.75, + "rewards/chosen": -15.136398315429688, + "rewards/margins": 15.660676956176758, + "rewards/rejected": -30.797075271606445, + "step": 585 + }, + { + "epoch": 0.3645412130637636, + "grad_norm": 0.13426032662391663, + "learning_rate": 4.881281696634394e-06, + "logits/chosen": 1.5848565101623535, + "logits/rejected": 3.3656342029571533, + "logps/chosen": -545.01220703125, + "logps/rejected": -805.3402099609375, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.408105850219727, + "rewards/margins": 17.92910385131836, + "rewards/rejected": -25.337207794189453, + "step": 586 + }, + { + "epoch": 0.3651632970451011, + "grad_norm": 0.10479693114757538, + "learning_rate": 4.880129091747349e-06, + "logits/chosen": -1.8742375373840332, + "logits/rejected": 1.2531564235687256, + "logps/chosen": -501.27825927734375, + "logps/rejected": -815.1589965820312, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.048238754272461, + "rewards/margins": 16.175546646118164, + "rewards/rejected": -26.223785400390625, + "step": 587 + }, + { + "epoch": 0.3657853810264386, + "grad_norm": 33.76664352416992, + "learning_rate": 4.878976486860305e-06, + "logits/chosen": 0.6626871228218079, + "logits/rejected": 3.236910820007324, + "logps/chosen": -571.5628662109375, + "logps/rejected": -790.7460327148438, + "loss": 0.4093, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.41337776184082, + "rewards/margins": 8.020959854125977, + "rewards/rejected": -16.434335708618164, + "step": 588 + }, + { + "epoch": 0.36640746500777605, + "grad_norm": 4.419313430786133, + "learning_rate": 4.87782388197326e-06, + "logits/chosen": -1.5819462537765503, + "logits/rejected": 0.5037208199501038, + "logps/chosen": -438.037353515625, + "logps/rejected": -680.818603515625, + "loss": 0.0597, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.652944087982178, + "rewards/margins": 13.655805587768555, + "rewards/rejected": -19.30875015258789, + "step": 589 + }, + { + "epoch": 0.36702954898911355, + "grad_norm": 0.024569852277636528, + "learning_rate": 4.876671277086215e-06, + "logits/chosen": 0.6127163767814636, + "logits/rejected": 3.8395800590515137, + "logps/chosen": -658.8626098632812, + "logps/rejected": -1016.54541015625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.372544765472412, + "rewards/margins": 16.042373657226562, + "rewards/rejected": -21.414918899536133, + "step": 590 + }, + { + "epoch": 0.367651632970451, + "grad_norm": 15.055514335632324, + "learning_rate": 4.875518672199171e-06, + "logits/chosen": 0.09293143451213837, + "logits/rejected": 3.7359554767608643, + "logps/chosen": -455.48675537109375, + "logps/rejected": -904.8528442382812, + "loss": 0.0721, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.17232608795166, + "rewards/margins": 21.1533203125, + "rewards/rejected": -30.325645446777344, + "step": 591 + }, + { + "epoch": 0.3682737169517885, + "grad_norm": 29.496828079223633, + "learning_rate": 4.874366067312126e-06, + "logits/chosen": -2.956026554107666, + "logits/rejected": 2.513519048690796, + "logps/chosen": -357.78668212890625, + "logps/rejected": -780.1038818359375, + "loss": 0.8831, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.444273471832275, + "rewards/margins": 13.725360870361328, + "rewards/rejected": -21.169635772705078, + "step": 592 + }, + { + "epoch": 0.36889580093312596, + "grad_norm": 11.6830415725708, + "learning_rate": 4.873213462425081e-06, + "logits/chosen": 1.3610471487045288, + "logits/rejected": 2.2409071922302246, + "logps/chosen": -676.43603515625, + "logps/rejected": -838.9456176757812, + "loss": 0.0514, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.695094108581543, + "rewards/margins": 13.126800537109375, + "rewards/rejected": -26.821895599365234, + "step": 593 + }, + { + "epoch": 0.36951788491446347, + "grad_norm": 2.102048397064209, + "learning_rate": 4.872060857538036e-06, + "logits/chosen": 1.9273995161056519, + "logits/rejected": 4.002025604248047, + "logps/chosen": -528.9932861328125, + "logps/rejected": -871.9857788085938, + "loss": 0.0082, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.188224792480469, + "rewards/margins": 17.76958656311035, + "rewards/rejected": -23.957813262939453, + "step": 594 + }, + { + "epoch": 0.3701399688958009, + "grad_norm": 8.947134017944336, + "learning_rate": 4.8709082526509915e-06, + "logits/chosen": 1.2496016025543213, + "logits/rejected": 1.224539041519165, + "logps/chosen": -612.486083984375, + "logps/rejected": -726.193359375, + "loss": 0.0364, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.15511417388916, + "rewards/margins": 12.905986785888672, + "rewards/rejected": -20.061100006103516, + "step": 595 + }, + { + "epoch": 0.3707620528771384, + "grad_norm": 0.10295522212982178, + "learning_rate": 4.869755647763947e-06, + "logits/chosen": -0.8488799333572388, + "logits/rejected": 4.074465751647949, + "logps/chosen": -527.9417724609375, + "logps/rejected": -927.850830078125, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.303407192230225, + "rewards/margins": 18.423900604248047, + "rewards/rejected": -24.727310180664062, + "step": 596 + }, + { + "epoch": 0.3713841368584759, + "grad_norm": 40.78211212158203, + "learning_rate": 4.868603042876902e-06, + "logits/chosen": 1.4812382459640503, + "logits/rejected": 3.599062204360962, + "logps/chosen": -663.3964233398438, + "logps/rejected": -983.2401123046875, + "loss": 0.8761, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.089118957519531, + "rewards/margins": 19.937496185302734, + "rewards/rejected": -31.0266170501709, + "step": 597 + }, + { + "epoch": 0.3720062208398134, + "grad_norm": 32.93647384643555, + "learning_rate": 4.867450437989857e-06, + "logits/chosen": -0.36488810181617737, + "logits/rejected": 1.5354863405227661, + "logps/chosen": -460.73455810546875, + "logps/rejected": -689.384521484375, + "loss": 0.4965, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.197094917297363, + "rewards/margins": 14.525459289550781, + "rewards/rejected": -20.72255516052246, + "step": 598 + }, + { + "epoch": 0.37262830482115084, + "grad_norm": 0.004770494066178799, + "learning_rate": 4.866297833102812e-06, + "logits/chosen": -1.5025954246520996, + "logits/rejected": 3.4834418296813965, + "logps/chosen": -318.74017333984375, + "logps/rejected": -800.4383544921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3950350284576416, + "rewards/margins": 19.421722412109375, + "rewards/rejected": -22.816753387451172, + "step": 599 + }, + { + "epoch": 0.37325038880248834, + "grad_norm": 19.019315719604492, + "learning_rate": 4.865145228215768e-06, + "logits/chosen": 2.659332275390625, + "logits/rejected": 3.8738040924072266, + "logps/chosen": -588.7614135742188, + "logps/rejected": -936.4422607421875, + "loss": 0.1276, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.607353210449219, + "rewards/margins": 20.830799102783203, + "rewards/rejected": -28.438154220581055, + "step": 600 + }, + { + "epoch": 0.3738724727838258, + "grad_norm": 9.484130859375, + "learning_rate": 4.863992623328723e-06, + "logits/chosen": 2.2311387062072754, + "logits/rejected": 4.177638530731201, + "logps/chosen": -634.6103515625, + "logps/rejected": -801.521484375, + "loss": 0.0547, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.441396236419678, + "rewards/margins": 12.886884689331055, + "rewards/rejected": -19.328279495239258, + "step": 601 + }, + { + "epoch": 0.3744945567651633, + "grad_norm": 0.7348048090934753, + "learning_rate": 4.862840018441679e-06, + "logits/chosen": 0.9550198316574097, + "logits/rejected": 3.199352741241455, + "logps/chosen": -641.8240356445312, + "logps/rejected": -881.9445190429688, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.406742095947266, + "rewards/margins": 14.080041885375977, + "rewards/rejected": -21.48678207397461, + "step": 602 + }, + { + "epoch": 0.37511664074650075, + "grad_norm": 0.01319398358464241, + "learning_rate": 4.861687413554634e-06, + "logits/chosen": -2.5636777877807617, + "logits/rejected": 1.7623281478881836, + "logps/chosen": -420.635986328125, + "logps/rejected": -824.3484497070312, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.411018371582031, + "rewards/margins": 15.403884887695312, + "rewards/rejected": -19.814903259277344, + "step": 603 + }, + { + "epoch": 0.37573872472783826, + "grad_norm": 30.160045623779297, + "learning_rate": 4.860534808667589e-06, + "logits/chosen": 0.446747750043869, + "logits/rejected": 3.6913681030273438, + "logps/chosen": -425.6489562988281, + "logps/rejected": -764.9004516601562, + "loss": 0.5233, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.018855571746826, + "rewards/margins": 14.458169937133789, + "rewards/rejected": -18.47702407836914, + "step": 604 + }, + { + "epoch": 0.37636080870917576, + "grad_norm": 37.270713806152344, + "learning_rate": 4.859382203780545e-06, + "logits/chosen": 2.456481695175171, + "logits/rejected": 1.7729933261871338, + "logps/chosen": -640.4718017578125, + "logps/rejected": -661.660400390625, + "loss": 0.6326, + "rewards/accuracies": 0.75, + "rewards/chosen": -6.538122177124023, + "rewards/margins": 7.628580093383789, + "rewards/rejected": -14.166702270507812, + "step": 605 + }, + { + "epoch": 0.3769828926905132, + "grad_norm": 0.01940000243484974, + "learning_rate": 4.8582295988935e-06, + "logits/chosen": 0.7876328229904175, + "logits/rejected": 1.8315197229385376, + "logps/chosen": -532.6771240234375, + "logps/rejected": -785.2579956054688, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.908534049987793, + "rewards/margins": 16.05277442932129, + "rewards/rejected": -20.9613094329834, + "step": 606 + }, + { + "epoch": 0.3776049766718507, + "grad_norm": 7.5594706535339355, + "learning_rate": 4.857076994006455e-06, + "logits/chosen": -0.28083693981170654, + "logits/rejected": 2.34736967086792, + "logps/chosen": -486.8939208984375, + "logps/rejected": -747.44287109375, + "loss": 0.0419, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.2080583572387695, + "rewards/margins": 13.350509643554688, + "rewards/rejected": -19.558568954467773, + "step": 607 + }, + { + "epoch": 0.3782270606531882, + "grad_norm": 18.768293380737305, + "learning_rate": 4.85592438911941e-06, + "logits/chosen": 0.1136578619480133, + "logits/rejected": 2.9444642066955566, + "logps/chosen": -499.3504333496094, + "logps/rejected": -764.1222534179688, + "loss": 0.1255, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.96809720993042, + "rewards/margins": 12.719585418701172, + "rewards/rejected": -16.68768310546875, + "step": 608 + }, + { + "epoch": 0.3788491446345257, + "grad_norm": 0.0016397468280047178, + "learning_rate": 4.8547717842323655e-06, + "logits/chosen": -2.1330926418304443, + "logits/rejected": 2.6644673347473145, + "logps/chosen": -389.0440368652344, + "logps/rejected": -849.74365234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.208529949188232, + "rewards/margins": 18.280054092407227, + "rewards/rejected": -22.488584518432617, + "step": 609 + }, + { + "epoch": 0.37947122861586313, + "grad_norm": 0.18090400099754333, + "learning_rate": 4.853619179345321e-06, + "logits/chosen": 0.31500858068466187, + "logits/rejected": 2.8184757232666016, + "logps/chosen": -556.4556884765625, + "logps/rejected": -885.5828857421875, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6242709159851074, + "rewards/margins": 16.77484703063965, + "rewards/rejected": -20.399118423461914, + "step": 610 + }, + { + "epoch": 0.38009331259720064, + "grad_norm": 0.5810987949371338, + "learning_rate": 4.852466574458276e-06, + "logits/chosen": -1.924774169921875, + "logits/rejected": 1.5502896308898926, + "logps/chosen": -318.47222900390625, + "logps/rejected": -653.2730102539062, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3044466972351074, + "rewards/margins": 14.565733909606934, + "rewards/rejected": -17.870182037353516, + "step": 611 + }, + { + "epoch": 0.3807153965785381, + "grad_norm": 1.5486432313919067, + "learning_rate": 4.851313969571231e-06, + "logits/chosen": -0.7176028490066528, + "logits/rejected": 1.7789404392242432, + "logps/chosen": -466.0256652832031, + "logps/rejected": -780.8444213867188, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.195468902587891, + "rewards/margins": 16.52389907836914, + "rewards/rejected": -22.71936798095703, + "step": 612 + }, + { + "epoch": 0.3813374805598756, + "grad_norm": 0.22635774314403534, + "learning_rate": 4.850161364684186e-06, + "logits/chosen": -0.2533850073814392, + "logits/rejected": 1.6178226470947266, + "logps/chosen": -524.1812744140625, + "logps/rejected": -790.1473388671875, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.199096918106079, + "rewards/margins": 18.512271881103516, + "rewards/rejected": -21.71137046813965, + "step": 613 + }, + { + "epoch": 0.38195956454121305, + "grad_norm": 0.00020886211132165045, + "learning_rate": 4.849008759797142e-06, + "logits/chosen": -2.068157911300659, + "logits/rejected": 2.3664326667785645, + "logps/chosen": -345.16943359375, + "logps/rejected": -826.10693359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.358859539031982, + "rewards/margins": 19.83062171936035, + "rewards/rejected": -24.189481735229492, + "step": 614 + }, + { + "epoch": 0.38258164852255055, + "grad_norm": 14.616460800170898, + "learning_rate": 4.847856154910097e-06, + "logits/chosen": 2.5612854957580566, + "logits/rejected": 3.3244102001190186, + "logps/chosen": -575.50048828125, + "logps/rejected": -779.091796875, + "loss": 0.0662, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.839805603027344, + "rewards/margins": 13.69375991821289, + "rewards/rejected": -22.533565521240234, + "step": 615 + }, + { + "epoch": 0.383203732503888, + "grad_norm": 0.1210361123085022, + "learning_rate": 4.846703550023052e-06, + "logits/chosen": 1.096333384513855, + "logits/rejected": 3.7907910346984863, + "logps/chosen": -487.0243225097656, + "logps/rejected": -856.36181640625, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8041367530822754, + "rewards/margins": 18.125682830810547, + "rewards/rejected": -21.929819107055664, + "step": 616 + }, + { + "epoch": 0.3838258164852255, + "grad_norm": 0.007629493251442909, + "learning_rate": 4.845550945136008e-06, + "logits/chosen": 2.442495346069336, + "logits/rejected": 4.257726669311523, + "logps/chosen": -573.216796875, + "logps/rejected": -859.510009765625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.294878959655762, + "rewards/margins": 20.90713119506836, + "rewards/rejected": -26.202011108398438, + "step": 617 + }, + { + "epoch": 0.38444790046656296, + "grad_norm": 8.720742225646973, + "learning_rate": 4.844398340248963e-06, + "logits/chosen": 2.267151355743408, + "logits/rejected": 4.637188911437988, + "logps/chosen": -508.08251953125, + "logps/rejected": -838.2218017578125, + "loss": 0.0478, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.086606025695801, + "rewards/margins": 18.29007339477539, + "rewards/rejected": -21.376678466796875, + "step": 618 + }, + { + "epoch": 0.38506998444790047, + "grad_norm": 0.0023380150087177753, + "learning_rate": 4.843245735361919e-06, + "logits/chosen": 0.5009723901748657, + "logits/rejected": 3.4536290168762207, + "logps/chosen": -502.7635498046875, + "logps/rejected": -818.5247192382812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08828657865524292, + "rewards/margins": 18.531295776367188, + "rewards/rejected": -18.443008422851562, + "step": 619 + }, + { + "epoch": 0.3856920684292379, + "grad_norm": 50.71761703491211, + "learning_rate": 4.842093130474874e-06, + "logits/chosen": -0.8099173307418823, + "logits/rejected": 1.7252922058105469, + "logps/chosen": -438.8321533203125, + "logps/rejected": -750.5208740234375, + "loss": 0.8195, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.430469036102295, + "rewards/margins": 15.224184036254883, + "rewards/rejected": -19.654653549194336, + "step": 620 + }, + { + "epoch": 0.38631415241057543, + "grad_norm": 0.14640004932880402, + "learning_rate": 4.840940525587829e-06, + "logits/chosen": -1.0835647583007812, + "logits/rejected": 3.823948383331299, + "logps/chosen": -309.2799377441406, + "logps/rejected": -755.61669921875, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2851157188415527, + "rewards/margins": 14.198723793029785, + "rewards/rejected": -17.48383903503418, + "step": 621 + }, + { + "epoch": 0.38693623639191294, + "grad_norm": 24.4279842376709, + "learning_rate": 4.839787920700784e-06, + "logits/chosen": 2.0698561668395996, + "logits/rejected": 3.8490822315216064, + "logps/chosen": -626.984375, + "logps/rejected": -923.0991821289062, + "loss": 0.2581, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.559420585632324, + "rewards/margins": 15.944664001464844, + "rewards/rejected": -22.50408363342285, + "step": 622 + }, + { + "epoch": 0.3875583203732504, + "grad_norm": 1.6033275127410889, + "learning_rate": 4.8386353158137395e-06, + "logits/chosen": 1.5655349493026733, + "logits/rejected": 4.216394424438477, + "logps/chosen": -497.54638671875, + "logps/rejected": -865.5770874023438, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9526607990264893, + "rewards/margins": 17.63835906982422, + "rewards/rejected": -19.591020584106445, + "step": 623 + }, + { + "epoch": 0.3881804043545879, + "grad_norm": 3.7565343379974365, + "learning_rate": 4.837482710926695e-06, + "logits/chosen": -3.539860963821411, + "logits/rejected": 3.1450586318969727, + "logps/chosen": -289.2846984863281, + "logps/rejected": -818.124755859375, + "loss": 0.017, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.083850622177124, + "rewards/margins": 15.55625057220459, + "rewards/rejected": -17.64010238647461, + "step": 624 + }, + { + "epoch": 0.38880248833592534, + "grad_norm": 9.447480201721191, + "learning_rate": 4.83633010603965e-06, + "logits/chosen": -1.6923489570617676, + "logits/rejected": 2.869992256164551, + "logps/chosen": -456.4915771484375, + "logps/rejected": -925.1444702148438, + "loss": 0.0621, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.213108539581299, + "rewards/margins": 16.34868812561035, + "rewards/rejected": -21.561796188354492, + "step": 625 + }, + { + "epoch": 0.38942457231726285, + "grad_norm": 1.3966708183288574, + "learning_rate": 4.835177501152605e-06, + "logits/chosen": 1.9913567304611206, + "logits/rejected": 3.4345850944519043, + "logps/chosen": -446.4867858886719, + "logps/rejected": -746.5498657226562, + "loss": 0.01, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.273656845092773, + "rewards/margins": 16.530517578125, + "rewards/rejected": -21.804176330566406, + "step": 626 + }, + { + "epoch": 0.3900466562986003, + "grad_norm": 0.013844764791429043, + "learning_rate": 4.83402489626556e-06, + "logits/chosen": 2.8672995567321777, + "logits/rejected": 3.437636375427246, + "logps/chosen": -738.3363647460938, + "logps/rejected": -923.5630493164062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.327809810638428, + "rewards/margins": 15.714049339294434, + "rewards/rejected": -21.041858673095703, + "step": 627 + }, + { + "epoch": 0.3906687402799378, + "grad_norm": 0.23032204806804657, + "learning_rate": 4.832872291378516e-06, + "logits/chosen": 0.48103535175323486, + "logits/rejected": 3.2791948318481445, + "logps/chosen": -511.300048828125, + "logps/rejected": -837.8681030273438, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.440017700195312, + "rewards/margins": 12.671974182128906, + "rewards/rejected": -22.111989974975586, + "step": 628 + }, + { + "epoch": 0.39129082426127526, + "grad_norm": 1.4107637405395508, + "learning_rate": 4.831719686491471e-06, + "logits/chosen": -2.6341326236724854, + "logits/rejected": 3.5024142265319824, + "logps/chosen": -183.09799194335938, + "logps/rejected": -754.7428588867188, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7689191102981567, + "rewards/margins": 23.79766273498535, + "rewards/rejected": -24.56658172607422, + "step": 629 + }, + { + "epoch": 0.39191290824261277, + "grad_norm": 0.15013962984085083, + "learning_rate": 4.830567081604426e-06, + "logits/chosen": -0.7445148229598999, + "logits/rejected": 3.8409135341644287, + "logps/chosen": -460.27984619140625, + "logps/rejected": -913.905029296875, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9821481704711914, + "rewards/margins": 21.528270721435547, + "rewards/rejected": -24.510419845581055, + "step": 630 + }, + { + "epoch": 0.3925349922239502, + "grad_norm": 5.702386379241943, + "learning_rate": 4.829414476717382e-06, + "logits/chosen": 0.4213750958442688, + "logits/rejected": 2.7475595474243164, + "logps/chosen": -468.94549560546875, + "logps/rejected": -773.5858154296875, + "loss": 0.0383, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.939387798309326, + "rewards/margins": 16.688373565673828, + "rewards/rejected": -23.627761840820312, + "step": 631 + }, + { + "epoch": 0.3931570762052877, + "grad_norm": 28.688859939575195, + "learning_rate": 4.828261871830337e-06, + "logits/chosen": -0.6199629306793213, + "logits/rejected": 3.405015468597412, + "logps/chosen": -511.64501953125, + "logps/rejected": -868.1781005859375, + "loss": 0.325, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.980612277984619, + "rewards/margins": 13.439093589782715, + "rewards/rejected": -19.419706344604492, + "step": 632 + }, + { + "epoch": 0.3937791601866252, + "grad_norm": 0.5256664752960205, + "learning_rate": 4.827109266943293e-06, + "logits/chosen": 1.968687891960144, + "logits/rejected": 1.5026566982269287, + "logps/chosen": -554.6557006835938, + "logps/rejected": -652.0545654296875, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.485213279724121, + "rewards/margins": 10.570073127746582, + "rewards/rejected": -15.055286407470703, + "step": 633 + }, + { + "epoch": 0.3944012441679627, + "grad_norm": 0.0065084053203463554, + "learning_rate": 4.825956662056248e-06, + "logits/chosen": 1.2704832553863525, + "logits/rejected": 2.851560115814209, + "logps/chosen": -437.1866149902344, + "logps/rejected": -672.5458984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.370952606201172, + "rewards/margins": 16.11528778076172, + "rewards/rejected": -21.48624038696289, + "step": 634 + }, + { + "epoch": 0.39502332814930013, + "grad_norm": 2.8544149245135486e-05, + "learning_rate": 4.824804057169203e-06, + "logits/chosen": 0.17647302150726318, + "logits/rejected": 3.5326387882232666, + "logps/chosen": -426.699462890625, + "logps/rejected": -840.47021484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.158924579620361, + "rewards/margins": 19.477825164794922, + "rewards/rejected": -23.636749267578125, + "step": 635 + }, + { + "epoch": 0.39564541213063764, + "grad_norm": 0.12314128130674362, + "learning_rate": 4.823651452282158e-06, + "logits/chosen": 0.10702091455459595, + "logits/rejected": 3.7575435638427734, + "logps/chosen": -440.03643798828125, + "logps/rejected": -920.6306762695312, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.680037498474121, + "rewards/margins": 22.33530616760254, + "rewards/rejected": -29.015344619750977, + "step": 636 + }, + { + "epoch": 0.3962674961119751, + "grad_norm": 26.167678833007812, + "learning_rate": 4.8224988473951135e-06, + "logits/chosen": 1.3711156845092773, + "logits/rejected": 3.597280502319336, + "logps/chosen": -536.123291015625, + "logps/rejected": -809.23291015625, + "loss": 0.4267, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.699240684509277, + "rewards/margins": 11.513164520263672, + "rewards/rejected": -20.212406158447266, + "step": 637 + }, + { + "epoch": 0.3968895800933126, + "grad_norm": 3.827866554260254, + "learning_rate": 4.821346242508069e-06, + "logits/chosen": 1.7881110906600952, + "logits/rejected": 3.7016730308532715, + "logps/chosen": -512.7743530273438, + "logps/rejected": -737.0347900390625, + "loss": 0.0109, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.946093559265137, + "rewards/margins": 16.378984451293945, + "rewards/rejected": -24.325077056884766, + "step": 638 + }, + { + "epoch": 0.39751166407465005, + "grad_norm": 0.02579200640320778, + "learning_rate": 4.820193637621024e-06, + "logits/chosen": -1.0454192161560059, + "logits/rejected": 2.754476547241211, + "logps/chosen": -391.75030517578125, + "logps/rejected": -796.76953125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.3952555656433105, + "rewards/margins": 19.92845916748047, + "rewards/rejected": -25.323715209960938, + "step": 639 + }, + { + "epoch": 0.39813374805598756, + "grad_norm": 13.338902473449707, + "learning_rate": 4.819041032733979e-06, + "logits/chosen": 0.01971861720085144, + "logits/rejected": 4.193253993988037, + "logps/chosen": -554.3214721679688, + "logps/rejected": -962.2105712890625, + "loss": 0.0639, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.8727617263793945, + "rewards/margins": 22.870092391967773, + "rewards/rejected": -29.742855072021484, + "step": 640 + }, + { + "epoch": 0.39875583203732506, + "grad_norm": 5.3078994824318215e-05, + "learning_rate": 4.817888427846934e-06, + "logits/chosen": 2.95037841796875, + "logits/rejected": 4.468027114868164, + "logps/chosen": -615.4202880859375, + "logps/rejected": -942.3226318359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.942698955535889, + "rewards/margins": 22.296367645263672, + "rewards/rejected": -30.239065170288086, + "step": 641 + }, + { + "epoch": 0.3993779160186625, + "grad_norm": 1.463631510734558, + "learning_rate": 4.81673582295989e-06, + "logits/chosen": -1.417764663696289, + "logits/rejected": 3.904311180114746, + "logps/chosen": -437.336669921875, + "logps/rejected": -847.3101196289062, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.907837390899658, + "rewards/margins": 16.962812423706055, + "rewards/rejected": -24.870649337768555, + "step": 642 + }, + { + "epoch": 0.4, + "grad_norm": 4.746206283569336, + "learning_rate": 4.815583218072845e-06, + "logits/chosen": -0.7753801345825195, + "logits/rejected": 3.87953519821167, + "logps/chosen": -416.31231689453125, + "logps/rejected": -864.30126953125, + "loss": 0.0556, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.581168174743652, + "rewards/margins": 22.60930061340332, + "rewards/rejected": -27.19046974182129, + "step": 643 + }, + { + "epoch": 0.4006220839813375, + "grad_norm": 4.815265128854662e-05, + "learning_rate": 4.8144306131858e-06, + "logits/chosen": -0.5812610387802124, + "logits/rejected": 3.251255989074707, + "logps/chosen": -372.25054931640625, + "logps/rejected": -811.37939453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.438142776489258, + "rewards/margins": 21.47218132019043, + "rewards/rejected": -24.910324096679688, + "step": 644 + }, + { + "epoch": 0.401244167962675, + "grad_norm": 0.024434104561805725, + "learning_rate": 4.813278008298755e-06, + "logits/chosen": 0.36400488018989563, + "logits/rejected": 3.0627307891845703, + "logps/chosen": -459.53582763671875, + "logps/rejected": -897.6513671875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.094304084777832, + "rewards/margins": 28.00246810913086, + "rewards/rejected": -31.096773147583008, + "step": 645 + }, + { + "epoch": 0.40186625194401243, + "grad_norm": 0.13461847603321075, + "learning_rate": 4.812125403411711e-06, + "logits/chosen": -2.2824478149414062, + "logits/rejected": 3.998173713684082, + "logps/chosen": -335.736083984375, + "logps/rejected": -883.3201293945312, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.016701698303223, + "rewards/margins": 17.895519256591797, + "rewards/rejected": -23.912221908569336, + "step": 646 + }, + { + "epoch": 0.40248833592534994, + "grad_norm": 23.656198501586914, + "learning_rate": 4.810972798524667e-06, + "logits/chosen": 0.18910843133926392, + "logits/rejected": 2.8961360454559326, + "logps/chosen": -523.1488647460938, + "logps/rejected": -843.4249267578125, + "loss": 0.2861, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.169979095458984, + "rewards/margins": 11.714703559875488, + "rewards/rejected": -17.884681701660156, + "step": 647 + }, + { + "epoch": 0.4031104199066874, + "grad_norm": 1.5852130651474, + "learning_rate": 4.809820193637622e-06, + "logits/chosen": 0.43711161613464355, + "logits/rejected": 2.8995206356048584, + "logps/chosen": -539.7998046875, + "logps/rejected": -927.2687377929688, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.630446434020996, + "rewards/margins": 20.007854461669922, + "rewards/rejected": -29.6382999420166, + "step": 648 + }, + { + "epoch": 0.4037325038880249, + "grad_norm": 0.008699237369000912, + "learning_rate": 4.808667588750577e-06, + "logits/chosen": 0.30366188287734985, + "logits/rejected": 3.941417694091797, + "logps/chosen": -508.7660217285156, + "logps/rejected": -924.7904663085938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7266173362731934, + "rewards/margins": 23.782203674316406, + "rewards/rejected": -27.508821487426758, + "step": 649 + }, + { + "epoch": 0.40435458786936235, + "grad_norm": 19.735321044921875, + "learning_rate": 4.807514983863532e-06, + "logits/chosen": 1.1154712438583374, + "logits/rejected": 2.21747088432312, + "logps/chosen": -499.0096740722656, + "logps/rejected": -780.8884887695312, + "loss": 0.7222, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.748971939086914, + "rewards/margins": 19.208717346191406, + "rewards/rejected": -27.95768928527832, + "step": 650 + }, + { + "epoch": 0.40497667185069985, + "grad_norm": 4.993641376495361, + "learning_rate": 4.8063623789764875e-06, + "logits/chosen": -1.0320407152175903, + "logits/rejected": 2.897843837738037, + "logps/chosen": -491.4745178222656, + "logps/rejected": -866.7677001953125, + "loss": 0.0177, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.669189453125, + "rewards/margins": 17.265472412109375, + "rewards/rejected": -26.934659957885742, + "step": 651 + }, + { + "epoch": 0.4055987558320373, + "grad_norm": 33.00937271118164, + "learning_rate": 4.805209774089443e-06, + "logits/chosen": 1.5638294219970703, + "logits/rejected": 4.383650779724121, + "logps/chosen": -449.9453125, + "logps/rejected": -779.3433837890625, + "loss": 0.4431, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.803736686706543, + "rewards/margins": 20.59209632873535, + "rewards/rejected": -26.395832061767578, + "step": 652 + }, + { + "epoch": 0.4062208398133748, + "grad_norm": 19.781658172607422, + "learning_rate": 4.804057169202398e-06, + "logits/chosen": -0.6717186570167542, + "logits/rejected": 3.4153504371643066, + "logps/chosen": -452.99822998046875, + "logps/rejected": -792.9339599609375, + "loss": 0.1421, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.400663375854492, + "rewards/margins": 15.584333419799805, + "rewards/rejected": -23.984996795654297, + "step": 653 + }, + { + "epoch": 0.40684292379471226, + "grad_norm": 28.147157669067383, + "learning_rate": 4.802904564315353e-06, + "logits/chosen": -3.40217661857605, + "logits/rejected": 3.5524790287017822, + "logps/chosen": -438.203857421875, + "logps/rejected": -1074.790283203125, + "loss": 0.9619, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.154910087585449, + "rewards/margins": 29.582317352294922, + "rewards/rejected": -33.73722839355469, + "step": 654 + }, + { + "epoch": 0.40746500777604977, + "grad_norm": 0.09156882762908936, + "learning_rate": 4.801751959428308e-06, + "logits/chosen": 3.1281723976135254, + "logits/rejected": 3.121691942214966, + "logps/chosen": -741.47216796875, + "logps/rejected": -1028.3216552734375, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.673593521118164, + "rewards/margins": 22.124595642089844, + "rewards/rejected": -30.798187255859375, + "step": 655 + }, + { + "epoch": 0.4080870917573872, + "grad_norm": 0.6502810716629028, + "learning_rate": 4.800599354541264e-06, + "logits/chosen": -1.4345520734786987, + "logits/rejected": 1.081398367881775, + "logps/chosen": -333.1896667480469, + "logps/rejected": -653.4251708984375, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.2726359367370605, + "rewards/margins": 16.819339752197266, + "rewards/rejected": -22.091976165771484, + "step": 656 + }, + { + "epoch": 0.40870917573872473, + "grad_norm": 0.025361565873026848, + "learning_rate": 4.799446749654219e-06, + "logits/chosen": 1.792804479598999, + "logits/rejected": 3.220573663711548, + "logps/chosen": -632.1589965820312, + "logps/rejected": -931.2838134765625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.018635749816895, + "rewards/margins": 15.393332481384277, + "rewards/rejected": -27.411968231201172, + "step": 657 + }, + { + "epoch": 0.40933125972006223, + "grad_norm": 0.01778257079422474, + "learning_rate": 4.798294144767174e-06, + "logits/chosen": -2.2758796215057373, + "logits/rejected": 2.9297425746917725, + "logps/chosen": -398.4133605957031, + "logps/rejected": -808.4490356445312, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.194453716278076, + "rewards/margins": 16.604900360107422, + "rewards/rejected": -22.799354553222656, + "step": 658 + }, + { + "epoch": 0.4099533437013997, + "grad_norm": 0.39328357577323914, + "learning_rate": 4.797141539880129e-06, + "logits/chosen": 0.019907251000404358, + "logits/rejected": 4.10305118560791, + "logps/chosen": -539.0341796875, + "logps/rejected": -1011.156005859375, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.144923210144043, + "rewards/margins": 19.252063751220703, + "rewards/rejected": -29.39698600769043, + "step": 659 + }, + { + "epoch": 0.4105754276827372, + "grad_norm": 40.315162658691406, + "learning_rate": 4.795988934993085e-06, + "logits/chosen": -0.9315068125724792, + "logits/rejected": 2.8982458114624023, + "logps/chosen": -423.90631103515625, + "logps/rejected": -840.4696044921875, + "loss": 0.5096, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.175039291381836, + "rewards/margins": 19.72167205810547, + "rewards/rejected": -28.896713256835938, + "step": 660 + }, + { + "epoch": 0.41119751166407464, + "grad_norm": 0.00019077463366556913, + "learning_rate": 4.794836330106041e-06, + "logits/chosen": -0.42708921432495117, + "logits/rejected": 4.313132286071777, + "logps/chosen": -349.7354736328125, + "logps/rejected": -814.03759765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.468191146850586, + "rewards/margins": 19.493267059326172, + "rewards/rejected": -24.961458206176758, + "step": 661 + }, + { + "epoch": 0.41181959564541215, + "grad_norm": 9.062701225280762, + "learning_rate": 4.793683725218996e-06, + "logits/chosen": 1.5624027252197266, + "logits/rejected": 3.176633834838867, + "logps/chosen": -594.6624755859375, + "logps/rejected": -916.4132080078125, + "loss": 0.0664, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.484450340270996, + "rewards/margins": 17.643253326416016, + "rewards/rejected": -26.127704620361328, + "step": 662 + }, + { + "epoch": 0.4124416796267496, + "grad_norm": 0.16003720462322235, + "learning_rate": 4.792531120331951e-06, + "logits/chosen": 2.4539906978607178, + "logits/rejected": 3.480743408203125, + "logps/chosen": -633.8411865234375, + "logps/rejected": -825.93310546875, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.565387725830078, + "rewards/margins": 13.266152381896973, + "rewards/rejected": -23.831541061401367, + "step": 663 + }, + { + "epoch": 0.4130637636080871, + "grad_norm": 32.037132263183594, + "learning_rate": 4.791378515444906e-06, + "logits/chosen": 1.967376708984375, + "logits/rejected": 3.912313222885132, + "logps/chosen": -587.0730590820312, + "logps/rejected": -835.215576171875, + "loss": 1.5524, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.711736679077148, + "rewards/margins": 14.280363082885742, + "rewards/rejected": -25.992103576660156, + "step": 664 + }, + { + "epoch": 0.41368584758942456, + "grad_norm": 22.63080406188965, + "learning_rate": 4.7902259105578615e-06, + "logits/chosen": -1.1042022705078125, + "logits/rejected": 1.5109401941299438, + "logps/chosen": -308.7568359375, + "logps/rejected": -628.593994140625, + "loss": 0.4471, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.961351156234741, + "rewards/margins": 20.284442901611328, + "rewards/rejected": -24.245792388916016, + "step": 665 + }, + { + "epoch": 0.41430793157076207, + "grad_norm": 4.558940887451172, + "learning_rate": 4.789073305670817e-06, + "logits/chosen": 1.9392263889312744, + "logits/rejected": 3.432485580444336, + "logps/chosen": -719.56640625, + "logps/rejected": -961.9409790039062, + "loss": 0.0185, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.849826812744141, + "rewards/margins": 17.749860763549805, + "rewards/rejected": -25.599689483642578, + "step": 666 + }, + { + "epoch": 0.4149300155520995, + "grad_norm": 26.374540328979492, + "learning_rate": 4.787920700783772e-06, + "logits/chosen": 1.5769593715667725, + "logits/rejected": 2.828300714492798, + "logps/chosen": -494.17388916015625, + "logps/rejected": -807.5885009765625, + "loss": 0.932, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.869695663452148, + "rewards/margins": 17.6748046875, + "rewards/rejected": -26.54450225830078, + "step": 667 + }, + { + "epoch": 0.415552099533437, + "grad_norm": 10.404101371765137, + "learning_rate": 4.786768095896727e-06, + "logits/chosen": 2.2112016677856445, + "logits/rejected": 2.179577112197876, + "logps/chosen": -776.2391357421875, + "logps/rejected": -941.023193359375, + "loss": 0.0436, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.392984390258789, + "rewards/margins": 13.242691993713379, + "rewards/rejected": -26.635677337646484, + "step": 668 + }, + { + "epoch": 0.4161741835147745, + "grad_norm": 0.00012222891382407397, + "learning_rate": 4.785615491009682e-06, + "logits/chosen": -0.8005756139755249, + "logits/rejected": 2.6633152961730957, + "logps/chosen": -580.3494262695312, + "logps/rejected": -1048.3486328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.532279968261719, + "rewards/margins": 26.374229431152344, + "rewards/rejected": -36.90650939941406, + "step": 669 + }, + { + "epoch": 0.416796267496112, + "grad_norm": 0.016480503603816032, + "learning_rate": 4.784462886122638e-06, + "logits/chosen": -1.1997987031936646, + "logits/rejected": 3.5286808013916016, + "logps/chosen": -381.94769287109375, + "logps/rejected": -869.9482421875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.5639848709106445, + "rewards/margins": 21.98867416381836, + "rewards/rejected": -27.552656173706055, + "step": 670 + }, + { + "epoch": 0.41741835147744943, + "grad_norm": 5.529460430145264, + "learning_rate": 4.783310281235593e-06, + "logits/chosen": -0.6281594634056091, + "logits/rejected": 2.345285654067993, + "logps/chosen": -411.87286376953125, + "logps/rejected": -790.1439819335938, + "loss": 0.0778, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.664583206176758, + "rewards/margins": 19.735605239868164, + "rewards/rejected": -26.400184631347656, + "step": 671 + }, + { + "epoch": 0.41804043545878694, + "grad_norm": 29.453466415405273, + "learning_rate": 4.782157676348548e-06, + "logits/chosen": -1.222573161125183, + "logits/rejected": 3.766413688659668, + "logps/chosen": -480.82989501953125, + "logps/rejected": -1006.6981201171875, + "loss": 0.2167, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.6935529708862305, + "rewards/margins": 23.371456146240234, + "rewards/rejected": -31.065006256103516, + "step": 672 + }, + { + "epoch": 0.4186625194401244, + "grad_norm": 0.17522646486759186, + "learning_rate": 4.781005071461503e-06, + "logits/chosen": -5.1208977699279785, + "logits/rejected": 3.1196346282958984, + "logps/chosen": -232.45718383789062, + "logps/rejected": -869.94091796875, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.2358012199401855, + "rewards/margins": 20.760984420776367, + "rewards/rejected": -24.996788024902344, + "step": 673 + }, + { + "epoch": 0.4192846034214619, + "grad_norm": 27.000770568847656, + "learning_rate": 4.7798524665744585e-06, + "logits/chosen": 1.8730990886688232, + "logits/rejected": 3.122191905975342, + "logps/chosen": -651.515625, + "logps/rejected": -853.1071166992188, + "loss": 0.9936, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.197526931762695, + "rewards/margins": 10.82865047454834, + "rewards/rejected": -22.02617645263672, + "step": 674 + }, + { + "epoch": 0.4199066874027994, + "grad_norm": 30.578311920166016, + "learning_rate": 4.7786998616874146e-06, + "logits/chosen": -0.01745295524597168, + "logits/rejected": 2.4413537979125977, + "logps/chosen": -419.0821228027344, + "logps/rejected": -830.2938232421875, + "loss": 1.3451, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.137384414672852, + "rewards/margins": 18.40494155883789, + "rewards/rejected": -22.542327880859375, + "step": 675 + }, + { + "epoch": 0.42052877138413686, + "grad_norm": 53.24766159057617, + "learning_rate": 4.77754725680037e-06, + "logits/chosen": -1.0631290674209595, + "logits/rejected": 2.835108518600464, + "logps/chosen": -550.2883911132812, + "logps/rejected": -937.1199340820312, + "loss": 1.3762, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.001826286315918, + "rewards/margins": 19.6325740814209, + "rewards/rejected": -27.634401321411133, + "step": 676 + }, + { + "epoch": 0.42115085536547436, + "grad_norm": 24.685049057006836, + "learning_rate": 4.776394651913325e-06, + "logits/chosen": -0.9792740345001221, + "logits/rejected": 2.392324686050415, + "logps/chosen": -512.7687377929688, + "logps/rejected": -835.5687255859375, + "loss": 0.2189, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.09033203125, + "rewards/margins": 14.524964332580566, + "rewards/rejected": -23.61529541015625, + "step": 677 + }, + { + "epoch": 0.4217729393468118, + "grad_norm": 0.8354139924049377, + "learning_rate": 4.77524204702628e-06, + "logits/chosen": -0.6444275975227356, + "logits/rejected": 3.580641508102417, + "logps/chosen": -416.6503601074219, + "logps/rejected": -801.9967651367188, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.518429279327393, + "rewards/margins": 17.77960205078125, + "rewards/rejected": -23.298032760620117, + "step": 678 + }, + { + "epoch": 0.4223950233281493, + "grad_norm": 29.565584182739258, + "learning_rate": 4.7740894421392355e-06, + "logits/chosen": 0.06383585929870605, + "logits/rejected": 3.639883518218994, + "logps/chosen": -476.05352783203125, + "logps/rejected": -927.9241943359375, + "loss": 0.5134, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.642790794372559, + "rewards/margins": 17.618484497070312, + "rewards/rejected": -25.261276245117188, + "step": 679 + }, + { + "epoch": 0.4230171073094868, + "grad_norm": 0.0912347361445427, + "learning_rate": 4.772936837252191e-06, + "logits/chosen": -0.12195968627929688, + "logits/rejected": 3.932724952697754, + "logps/chosen": -402.1063232421875, + "logps/rejected": -811.5177001953125, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.853856086730957, + "rewards/margins": 16.344602584838867, + "rewards/rejected": -21.19845962524414, + "step": 680 + }, + { + "epoch": 0.4236391912908243, + "grad_norm": 0.3873937129974365, + "learning_rate": 4.771784232365146e-06, + "logits/chosen": 1.799547553062439, + "logits/rejected": 4.590263366699219, + "logps/chosen": -559.97216796875, + "logps/rejected": -1006.1261596679688, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.338591575622559, + "rewards/margins": 23.82638168334961, + "rewards/rejected": -33.16497039794922, + "step": 681 + }, + { + "epoch": 0.42426127527216173, + "grad_norm": 0.7272276282310486, + "learning_rate": 4.770631627478101e-06, + "logits/chosen": 0.4105660915374756, + "logits/rejected": 2.076657772064209, + "logps/chosen": -388.392333984375, + "logps/rejected": -615.2227783203125, + "loss": 0.0074, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.603516578674316, + "rewards/margins": 15.719802856445312, + "rewards/rejected": -21.323318481445312, + "step": 682 + }, + { + "epoch": 0.42488335925349924, + "grad_norm": 1.1333785323586199e-06, + "learning_rate": 4.769479022591056e-06, + "logits/chosen": -0.6940436959266663, + "logits/rejected": 4.295339107513428, + "logps/chosen": -507.3326110839844, + "logps/rejected": -970.3253173828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.027980327606201, + "rewards/margins": 27.017305374145508, + "rewards/rejected": -31.045286178588867, + "step": 683 + }, + { + "epoch": 0.4255054432348367, + "grad_norm": 4.457662726053968e-05, + "learning_rate": 4.768326417704012e-06, + "logits/chosen": -1.7501380443572998, + "logits/rejected": 4.424574851989746, + "logps/chosen": -388.6649169921875, + "logps/rejected": -957.874267578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.324387550354004, + "rewards/margins": 27.12349510192871, + "rewards/rejected": -33.44788360595703, + "step": 684 + }, + { + "epoch": 0.4261275272161742, + "grad_norm": 0.12122859060764313, + "learning_rate": 4.767173812816967e-06, + "logits/chosen": 1.5742546319961548, + "logits/rejected": 4.587541580200195, + "logps/chosen": -466.7994079589844, + "logps/rejected": -852.6675415039062, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.404057025909424, + "rewards/margins": 24.145267486572266, + "rewards/rejected": -30.54932403564453, + "step": 685 + }, + { + "epoch": 0.42674961119751165, + "grad_norm": 6.259092807769775, + "learning_rate": 4.766021207929922e-06, + "logits/chosen": 0.6969764828681946, + "logits/rejected": 4.548662185668945, + "logps/chosen": -486.2441711425781, + "logps/rejected": -815.9784545898438, + "loss": 0.032, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.9217023849487305, + "rewards/margins": 12.386877059936523, + "rewards/rejected": -20.30858039855957, + "step": 686 + }, + { + "epoch": 0.42737169517884915, + "grad_norm": 38.55778884887695, + "learning_rate": 4.764868603042877e-06, + "logits/chosen": 1.018954873085022, + "logits/rejected": 3.2408273220062256, + "logps/chosen": -672.9383544921875, + "logps/rejected": -942.35791015625, + "loss": 1.645, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.722557067871094, + "rewards/margins": 15.812875747680664, + "rewards/rejected": -26.53543472290039, + "step": 687 + }, + { + "epoch": 0.4279937791601866, + "grad_norm": 24.259414672851562, + "learning_rate": 4.7637159981558325e-06, + "logits/chosen": -2.0910446643829346, + "logits/rejected": 1.7110493183135986, + "logps/chosen": -436.27105712890625, + "logps/rejected": -857.93603515625, + "loss": 0.1087, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.506262302398682, + "rewards/margins": 17.343793869018555, + "rewards/rejected": -23.85005760192871, + "step": 688 + }, + { + "epoch": 0.4286158631415241, + "grad_norm": 0.00418486725538969, + "learning_rate": 4.762563393268788e-06, + "logits/chosen": 0.039060741662979126, + "logits/rejected": 3.397256374359131, + "logps/chosen": -334.5584411621094, + "logps/rejected": -723.466064453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.947673797607422, + "rewards/margins": 17.47788429260254, + "rewards/rejected": -22.425556182861328, + "step": 689 + }, + { + "epoch": 0.42923794712286156, + "grad_norm": 34.04841232299805, + "learning_rate": 4.761410788381743e-06, + "logits/chosen": -1.352245807647705, + "logits/rejected": 1.7429472208023071, + "logps/chosen": -442.674560546875, + "logps/rejected": -684.5184326171875, + "loss": 1.6189, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.620567321777344, + "rewards/margins": 9.817951202392578, + "rewards/rejected": -18.438518524169922, + "step": 690 + }, + { + "epoch": 0.42986003110419907, + "grad_norm": 31.715965270996094, + "learning_rate": 4.760258183494698e-06, + "logits/chosen": -1.247018814086914, + "logits/rejected": 3.7910540103912354, + "logps/chosen": -424.660400390625, + "logps/rejected": -836.0987548828125, + "loss": 0.4279, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.494476318359375, + "rewards/margins": 15.016087532043457, + "rewards/rejected": -23.510562896728516, + "step": 691 + }, + { + "epoch": 0.4304821150855365, + "grad_norm": 24.739803314208984, + "learning_rate": 4.759105578607653e-06, + "logits/chosen": -0.5017096400260925, + "logits/rejected": 2.8500189781188965, + "logps/chosen": -446.18585205078125, + "logps/rejected": -784.303466796875, + "loss": 0.2784, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.562870025634766, + "rewards/margins": 13.364456176757812, + "rewards/rejected": -21.927326202392578, + "step": 692 + }, + { + "epoch": 0.431104199066874, + "grad_norm": 44.9724235534668, + "learning_rate": 4.757952973720609e-06, + "logits/chosen": 3.754262924194336, + "logits/rejected": 5.475862979888916, + "logps/chosen": -708.015869140625, + "logps/rejected": -868.138916015625, + "loss": 1.1519, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.583785057067871, + "rewards/margins": 9.499543190002441, + "rewards/rejected": -19.083328247070312, + "step": 693 + }, + { + "epoch": 0.43172628304821153, + "grad_norm": 0.12701572477817535, + "learning_rate": 4.756800368833564e-06, + "logits/chosen": 1.0257909297943115, + "logits/rejected": 1.0529817342758179, + "logps/chosen": -592.5926513671875, + "logps/rejected": -720.8876953125, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.634477615356445, + "rewards/margins": 19.143278121948242, + "rewards/rejected": -26.777755737304688, + "step": 694 + }, + { + "epoch": 0.432348367029549, + "grad_norm": 0.3144236207008362, + "learning_rate": 4.755647763946519e-06, + "logits/chosen": -0.3531043231487274, + "logits/rejected": 3.7984418869018555, + "logps/chosen": -361.6063232421875, + "logps/rejected": -824.85009765625, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.502399921417236, + "rewards/margins": 23.011634826660156, + "rewards/rejected": -29.514034271240234, + "step": 695 + }, + { + "epoch": 0.4329704510108865, + "grad_norm": 0.00020732081611640751, + "learning_rate": 4.754495159059474e-06, + "logits/chosen": -1.5537899732589722, + "logits/rejected": 3.863278388977051, + "logps/chosen": -386.71478271484375, + "logps/rejected": -1015.0911254882812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0640451908111572, + "rewards/margins": 25.591724395751953, + "rewards/rejected": -28.65576934814453, + "step": 696 + }, + { + "epoch": 0.43359253499222394, + "grad_norm": 26.792686462402344, + "learning_rate": 4.7533425541724295e-06, + "logits/chosen": 1.8323582410812378, + "logits/rejected": 3.697693109512329, + "logps/chosen": -591.3922729492188, + "logps/rejected": -806.5018310546875, + "loss": 0.374, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.721050500869751, + "rewards/margins": 13.876240730285645, + "rewards/rejected": -17.5972900390625, + "step": 697 + }, + { + "epoch": 0.43421461897356145, + "grad_norm": 4.399302005767822, + "learning_rate": 4.752189949285385e-06, + "logits/chosen": 0.6278190612792969, + "logits/rejected": 1.133225440979004, + "logps/chosen": -593.8261108398438, + "logps/rejected": -769.5350952148438, + "loss": 0.0245, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.2219953536987305, + "rewards/margins": 14.507144927978516, + "rewards/rejected": -21.729141235351562, + "step": 698 + }, + { + "epoch": 0.4348367029548989, + "grad_norm": 4.496114730834961, + "learning_rate": 4.751037344398341e-06, + "logits/chosen": 0.0551641583442688, + "logits/rejected": 4.235555171966553, + "logps/chosen": -363.8678894042969, + "logps/rejected": -777.250732421875, + "loss": 0.0807, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.3506622314453125, + "rewards/margins": 14.979228973388672, + "rewards/rejected": -22.329891204833984, + "step": 699 + }, + { + "epoch": 0.4354587869362364, + "grad_norm": 0.8819683194160461, + "learning_rate": 4.749884739511296e-06, + "logits/chosen": 0.10031324625015259, + "logits/rejected": 4.287592887878418, + "logps/chosen": -440.3474426269531, + "logps/rejected": -884.9929809570312, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.949248313903809, + "rewards/margins": 17.867116928100586, + "rewards/rejected": -23.816364288330078, + "step": 700 + }, + { + "epoch": 0.43608087091757386, + "grad_norm": 23.288360595703125, + "learning_rate": 4.748732134624251e-06, + "logits/chosen": -1.186615228652954, + "logits/rejected": 3.4784440994262695, + "logps/chosen": -423.2039794921875, + "logps/rejected": -845.6221313476562, + "loss": 0.7393, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.458308219909668, + "rewards/margins": 15.12204360961914, + "rewards/rejected": -22.580352783203125, + "step": 701 + }, + { + "epoch": 0.43670295489891137, + "grad_norm": 0.004663229454308748, + "learning_rate": 4.7475795297372065e-06, + "logits/chosen": 0.0869104266166687, + "logits/rejected": 4.081899166107178, + "logps/chosen": -479.1942443847656, + "logps/rejected": -896.97119140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.93591833114624, + "rewards/margins": 18.201383590698242, + "rewards/rejected": -26.137300491333008, + "step": 702 + }, + { + "epoch": 0.4373250388802488, + "grad_norm": 0.0026992084458470345, + "learning_rate": 4.746426924850162e-06, + "logits/chosen": -2.009183883666992, + "logits/rejected": 3.8174266815185547, + "logps/chosen": -265.8365478515625, + "logps/rejected": -853.8241577148438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.638486862182617, + "rewards/margins": 23.152103424072266, + "rewards/rejected": -25.790592193603516, + "step": 703 + }, + { + "epoch": 0.4379471228615863, + "grad_norm": 0.031216738745570183, + "learning_rate": 4.745274319963117e-06, + "logits/chosen": 2.1903696060180664, + "logits/rejected": 4.714466571807861, + "logps/chosen": -630.603515625, + "logps/rejected": -957.2979736328125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.502143859863281, + "rewards/margins": 16.405481338500977, + "rewards/rejected": -25.907625198364258, + "step": 704 + }, + { + "epoch": 0.4385692068429238, + "grad_norm": 4.766060829162598, + "learning_rate": 4.744121715076072e-06, + "logits/chosen": 0.11997011303901672, + "logits/rejected": 2.390803098678589, + "logps/chosen": -474.9181823730469, + "logps/rejected": -758.4802856445312, + "loss": 0.0293, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.49026107788086, + "rewards/margins": 13.536415100097656, + "rewards/rejected": -22.026674270629883, + "step": 705 + }, + { + "epoch": 0.4391912908242613, + "grad_norm": 30.133831024169922, + "learning_rate": 4.742969110189027e-06, + "logits/chosen": 0.5862630009651184, + "logits/rejected": 1.7295148372650146, + "logps/chosen": -444.6620178222656, + "logps/rejected": -619.6727294921875, + "loss": 0.4622, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.195107460021973, + "rewards/margins": 14.017037391662598, + "rewards/rejected": -20.212146759033203, + "step": 706 + }, + { + "epoch": 0.43981337480559873, + "grad_norm": 0.0015233299927785993, + "learning_rate": 4.741816505301983e-06, + "logits/chosen": 1.5498592853546143, + "logits/rejected": 4.029008865356445, + "logps/chosen": -538.0079345703125, + "logps/rejected": -924.365966796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.708686828613281, + "rewards/margins": 23.275190353393555, + "rewards/rejected": -30.98387908935547, + "step": 707 + }, + { + "epoch": 0.44043545878693624, + "grad_norm": 0.0009342418634332716, + "learning_rate": 4.740663900414938e-06, + "logits/chosen": -0.7223360538482666, + "logits/rejected": 3.4309163093566895, + "logps/chosen": -340.63818359375, + "logps/rejected": -774.0181274414062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.805405616760254, + "rewards/margins": 21.276443481445312, + "rewards/rejected": -28.081846237182617, + "step": 708 + }, + { + "epoch": 0.4410575427682737, + "grad_norm": 46.141357421875, + "learning_rate": 4.739511295527893e-06, + "logits/chosen": -3.716412305831909, + "logits/rejected": 1.5815439224243164, + "logps/chosen": -422.9011535644531, + "logps/rejected": -904.5416870117188, + "loss": 1.7318, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.199352264404297, + "rewards/margins": 17.21719741821289, + "rewards/rejected": -27.416549682617188, + "step": 709 + }, + { + "epoch": 0.4416796267496112, + "grad_norm": 2.966789484024048, + "learning_rate": 4.738358690640848e-06, + "logits/chosen": 2.19631028175354, + "logits/rejected": 4.154239177703857, + "logps/chosen": -558.2725830078125, + "logps/rejected": -827.265625, + "loss": 0.0186, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.756782531738281, + "rewards/margins": 15.467741966247559, + "rewards/rejected": -23.224525451660156, + "step": 710 + }, + { + "epoch": 0.4423017107309487, + "grad_norm": 1.096614956855774, + "learning_rate": 4.7372060857538035e-06, + "logits/chosen": 1.0244200229644775, + "logits/rejected": 4.128070831298828, + "logps/chosen": -523.0098266601562, + "logps/rejected": -862.1829833984375, + "loss": 0.0072, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.989865303039551, + "rewards/margins": 17.196226119995117, + "rewards/rejected": -24.18609046936035, + "step": 711 + }, + { + "epoch": 0.44292379471228616, + "grad_norm": 21.39118194580078, + "learning_rate": 4.736053480866759e-06, + "logits/chosen": 1.2569371461868286, + "logits/rejected": 4.960230350494385, + "logps/chosen": -481.29217529296875, + "logps/rejected": -845.056396484375, + "loss": 0.1824, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.386847019195557, + "rewards/margins": 18.074981689453125, + "rewards/rejected": -23.461828231811523, + "step": 712 + }, + { + "epoch": 0.44354587869362366, + "grad_norm": 0.035147711634635925, + "learning_rate": 4.734900875979714e-06, + "logits/chosen": -2.8533473014831543, + "logits/rejected": 1.125679850578308, + "logps/chosen": -378.7342529296875, + "logps/rejected": -823.3388671875, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.9344024658203125, + "rewards/margins": 23.337167739868164, + "rewards/rejected": -28.271568298339844, + "step": 713 + }, + { + "epoch": 0.4441679626749611, + "grad_norm": 0.0005704350187443197, + "learning_rate": 4.73374827109267e-06, + "logits/chosen": -0.12316238880157471, + "logits/rejected": 3.290395736694336, + "logps/chosen": -390.05584716796875, + "logps/rejected": -771.4310302734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.037465572357178, + "rewards/margins": 19.214134216308594, + "rewards/rejected": -24.251598358154297, + "step": 714 + }, + { + "epoch": 0.4447900466562986, + "grad_norm": 6.9552507400512695, + "learning_rate": 4.732595666205625e-06, + "logits/chosen": -2.466341018676758, + "logits/rejected": 1.6108577251434326, + "logps/chosen": -375.34283447265625, + "logps/rejected": -802.8533325195312, + "loss": 0.0339, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.593360900878906, + "rewards/margins": 15.921603202819824, + "rewards/rejected": -21.514965057373047, + "step": 715 + }, + { + "epoch": 0.4454121306376361, + "grad_norm": 1.741106629371643, + "learning_rate": 4.7314430613185805e-06, + "logits/chosen": 2.07574462890625, + "logits/rejected": 3.481757164001465, + "logps/chosen": -640.0421142578125, + "logps/rejected": -870.4541015625, + "loss": 0.0071, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.261276245117188, + "rewards/margins": 15.212532043457031, + "rewards/rejected": -28.473806381225586, + "step": 716 + }, + { + "epoch": 0.4460342146189736, + "grad_norm": 4.817162789549911e-06, + "learning_rate": 4.730290456431536e-06, + "logits/chosen": -2.971275568008423, + "logits/rejected": 4.351428508758545, + "logps/chosen": -219.6768035888672, + "logps/rejected": -880.5977172851562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8710782527923584, + "rewards/margins": 29.834440231323242, + "rewards/rejected": -32.70552062988281, + "step": 717 + }, + { + "epoch": 0.44665629860031103, + "grad_norm": 34.71223068237305, + "learning_rate": 4.729137851544491e-06, + "logits/chosen": -0.3415360450744629, + "logits/rejected": 3.649252414703369, + "logps/chosen": -492.4614562988281, + "logps/rejected": -833.1641845703125, + "loss": 0.6228, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.140350341796875, + "rewards/margins": 10.404376983642578, + "rewards/rejected": -14.544727325439453, + "step": 718 + }, + { + "epoch": 0.44727838258164854, + "grad_norm": 0.005894318222999573, + "learning_rate": 4.727985246657446e-06, + "logits/chosen": -1.3749679327011108, + "logits/rejected": 3.040403127670288, + "logps/chosen": -446.4466552734375, + "logps/rejected": -924.62939453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.499325275421143, + "rewards/margins": 21.93583106994629, + "rewards/rejected": -27.435157775878906, + "step": 719 + }, + { + "epoch": 0.447900466562986, + "grad_norm": 2.2069828510284424, + "learning_rate": 4.726832641770401e-06, + "logits/chosen": 0.39223602414131165, + "logits/rejected": 2.627257823944092, + "logps/chosen": -487.3267517089844, + "logps/rejected": -755.5730590820312, + "loss": 0.0236, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.579398155212402, + "rewards/margins": 12.08456039428711, + "rewards/rejected": -18.663959503173828, + "step": 720 + }, + { + "epoch": 0.4485225505443235, + "grad_norm": 33.56819152832031, + "learning_rate": 4.725680036883357e-06, + "logits/chosen": 1.559786319732666, + "logits/rejected": 3.7089614868164062, + "logps/chosen": -713.7747192382812, + "logps/rejected": -980.051513671875, + "loss": 0.471, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.505293846130371, + "rewards/margins": 19.567005157470703, + "rewards/rejected": -24.07229995727539, + "step": 721 + }, + { + "epoch": 0.44914463452566095, + "grad_norm": 27.670228958129883, + "learning_rate": 4.724527431996312e-06, + "logits/chosen": 0.33212411403656006, + "logits/rejected": 2.9645965099334717, + "logps/chosen": -450.44378662109375, + "logps/rejected": -721.9329833984375, + "loss": 0.3559, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.267915725708008, + "rewards/margins": 13.11201286315918, + "rewards/rejected": -17.379928588867188, + "step": 722 + }, + { + "epoch": 0.44976671850699845, + "grad_norm": 15.541918754577637, + "learning_rate": 4.723374827109267e-06, + "logits/chosen": 2.854301929473877, + "logits/rejected": 5.9449462890625, + "logps/chosen": -631.2748413085938, + "logps/rejected": -989.6734619140625, + "loss": 0.1159, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.880619049072266, + "rewards/margins": 17.788612365722656, + "rewards/rejected": -26.669231414794922, + "step": 723 + }, + { + "epoch": 0.4503888024883359, + "grad_norm": 0.08411452174186707, + "learning_rate": 4.722222222222222e-06, + "logits/chosen": -0.4340131878852844, + "logits/rejected": 2.539937973022461, + "logps/chosen": -517.1436767578125, + "logps/rejected": -874.4027099609375, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.182528018951416, + "rewards/margins": 19.89919090270996, + "rewards/rejected": -26.08172035217285, + "step": 724 + }, + { + "epoch": 0.4510108864696734, + "grad_norm": 20.3781681060791, + "learning_rate": 4.7210696173351775e-06, + "logits/chosen": 0.23897361755371094, + "logits/rejected": 2.9999685287475586, + "logps/chosen": -700.116943359375, + "logps/rejected": -934.967529296875, + "loss": 0.1813, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.934647560119629, + "rewards/margins": 15.803482055664062, + "rewards/rejected": -23.738128662109375, + "step": 725 + }, + { + "epoch": 0.45163297045101086, + "grad_norm": 29.43364715576172, + "learning_rate": 4.719917012448133e-06, + "logits/chosen": -1.8553903102874756, + "logits/rejected": 1.862307071685791, + "logps/chosen": -480.6034240722656, + "logps/rejected": -942.1726684570312, + "loss": 0.2991, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.97006368637085, + "rewards/margins": 21.751319885253906, + "rewards/rejected": -28.721384048461914, + "step": 726 + }, + { + "epoch": 0.45225505443234837, + "grad_norm": 23.18766212463379, + "learning_rate": 4.718764407561088e-06, + "logits/chosen": 1.378616213798523, + "logits/rejected": 3.979541063308716, + "logps/chosen": -559.097412109375, + "logps/rejected": -870.5625610351562, + "loss": 0.3831, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.625370979309082, + "rewards/margins": 14.113837242126465, + "rewards/rejected": -21.739208221435547, + "step": 727 + }, + { + "epoch": 0.4528771384136858, + "grad_norm": 24.29891014099121, + "learning_rate": 4.717611802674044e-06, + "logits/chosen": 2.059588670730591, + "logits/rejected": 4.025932312011719, + "logps/chosen": -778.895263671875, + "logps/rejected": -1051.254150390625, + "loss": 0.2721, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.523857116699219, + "rewards/margins": 17.883371353149414, + "rewards/rejected": -29.407230377197266, + "step": 728 + }, + { + "epoch": 0.4534992223950233, + "grad_norm": 10.843426704406738, + "learning_rate": 4.716459197786999e-06, + "logits/chosen": -1.1179172992706299, + "logits/rejected": 2.602318525314331, + "logps/chosen": -471.91949462890625, + "logps/rejected": -863.3618774414062, + "loss": 0.1805, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.14882755279541, + "rewards/margins": 18.53677749633789, + "rewards/rejected": -24.685606002807617, + "step": 729 + }, + { + "epoch": 0.45412130637636083, + "grad_norm": 1.3418076038360596, + "learning_rate": 4.7153065928999545e-06, + "logits/chosen": -1.153537631034851, + "logits/rejected": 2.798265218734741, + "logps/chosen": -587.9439697265625, + "logps/rejected": -993.32861328125, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.406091690063477, + "rewards/margins": 23.02769660949707, + "rewards/rejected": -33.43379211425781, + "step": 730 + }, + { + "epoch": 0.4547433903576983, + "grad_norm": 0.011465908959507942, + "learning_rate": 4.71415398801291e-06, + "logits/chosen": -1.5201659202575684, + "logits/rejected": 3.761108160018921, + "logps/chosen": -495.5352783203125, + "logps/rejected": -978.4993286132812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.031967163085938, + "rewards/margins": 23.243043899536133, + "rewards/rejected": -32.2750129699707, + "step": 731 + }, + { + "epoch": 0.4553654743390358, + "grad_norm": 0.019519884139299393, + "learning_rate": 4.713001383125865e-06, + "logits/chosen": -1.4993672370910645, + "logits/rejected": 3.8746728897094727, + "logps/chosen": -348.3289794921875, + "logps/rejected": -954.3119506835938, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.078332901000977, + "rewards/margins": 22.78673553466797, + "rewards/rejected": -30.865070343017578, + "step": 732 + }, + { + "epoch": 0.45598755832037324, + "grad_norm": 0.0006432720110751688, + "learning_rate": 4.71184877823882e-06, + "logits/chosen": 1.0196747779846191, + "logits/rejected": 4.499789714813232, + "logps/chosen": -518.6286010742188, + "logps/rejected": -980.1221923828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.276054382324219, + "rewards/margins": 25.788124084472656, + "rewards/rejected": -33.064178466796875, + "step": 733 + }, + { + "epoch": 0.45660964230171075, + "grad_norm": 0.017226440832018852, + "learning_rate": 4.710696173351775e-06, + "logits/chosen": -1.222594976425171, + "logits/rejected": 4.341372489929199, + "logps/chosen": -471.2843322753906, + "logps/rejected": -997.5906982421875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.164975166320801, + "rewards/margins": 24.127727508544922, + "rewards/rejected": -31.29270362854004, + "step": 734 + }, + { + "epoch": 0.4572317262830482, + "grad_norm": 0.007779096253216267, + "learning_rate": 4.709543568464731e-06, + "logits/chosen": -0.9736397862434387, + "logits/rejected": 2.2716212272644043, + "logps/chosen": -304.4974365234375, + "logps/rejected": -723.11181640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.925656795501709, + "rewards/margins": 21.8261775970459, + "rewards/rejected": -26.751834869384766, + "step": 735 + }, + { + "epoch": 0.4578538102643857, + "grad_norm": 3.236133337020874, + "learning_rate": 4.708390963577686e-06, + "logits/chosen": 0.17622852325439453, + "logits/rejected": 2.8149657249450684, + "logps/chosen": -579.513427734375, + "logps/rejected": -885.9993896484375, + "loss": 0.0174, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.07348346710205, + "rewards/margins": 19.248193740844727, + "rewards/rejected": -28.32167625427246, + "step": 736 + }, + { + "epoch": 0.45847589424572316, + "grad_norm": 0.00014734258002135903, + "learning_rate": 4.707238358690641e-06, + "logits/chosen": 1.3257426023483276, + "logits/rejected": 3.4590463638305664, + "logps/chosen": -559.5299072265625, + "logps/rejected": -946.916748046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.955436706542969, + "rewards/margins": 21.80295181274414, + "rewards/rejected": -35.75838851928711, + "step": 737 + }, + { + "epoch": 0.45909797822706067, + "grad_norm": 21.308420181274414, + "learning_rate": 4.706085753803596e-06, + "logits/chosen": -1.9544880390167236, + "logits/rejected": 4.07980489730835, + "logps/chosen": -432.8442077636719, + "logps/rejected": -980.4149169921875, + "loss": 0.0999, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.972265243530273, + "rewards/margins": 22.069438934326172, + "rewards/rejected": -32.04170608520508, + "step": 738 + }, + { + "epoch": 0.4597200622083981, + "grad_norm": 6.995245456695557, + "learning_rate": 4.7049331489165515e-06, + "logits/chosen": -0.3527810573577881, + "logits/rejected": 3.945213794708252, + "logps/chosen": -425.16424560546875, + "logps/rejected": -800.3833618164062, + "loss": 0.0411, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.171060562133789, + "rewards/margins": 21.52447509765625, + "rewards/rejected": -30.69553565979004, + "step": 739 + }, + { + "epoch": 0.4603421461897356, + "grad_norm": 9.528139114379883, + "learning_rate": 4.703780544029507e-06, + "logits/chosen": 0.8781489729881287, + "logits/rejected": 2.834296464920044, + "logps/chosen": -608.282470703125, + "logps/rejected": -802.6109619140625, + "loss": 0.0268, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.577667236328125, + "rewards/margins": 16.0227108001709, + "rewards/rejected": -27.600378036499023, + "step": 740 + }, + { + "epoch": 0.4609642301710731, + "grad_norm": 0.005674920044839382, + "learning_rate": 4.702627939142462e-06, + "logits/chosen": -2.576509475708008, + "logits/rejected": 3.848076820373535, + "logps/chosen": -365.6551208496094, + "logps/rejected": -1039.408447265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.874650001525879, + "rewards/margins": 30.167984008789062, + "rewards/rejected": -37.04263687133789, + "step": 741 + }, + { + "epoch": 0.4615863141524106, + "grad_norm": 15.865659713745117, + "learning_rate": 4.701475334255417e-06, + "logits/chosen": -0.6655741930007935, + "logits/rejected": 3.148348569869995, + "logps/chosen": -500.2545166015625, + "logps/rejected": -918.5359497070312, + "loss": 0.1022, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.519559860229492, + "rewards/margins": 20.05760955810547, + "rewards/rejected": -27.577167510986328, + "step": 742 + }, + { + "epoch": 0.46220839813374803, + "grad_norm": 0.08761231601238251, + "learning_rate": 4.700322729368373e-06, + "logits/chosen": -0.6104300022125244, + "logits/rejected": 2.0581812858581543, + "logps/chosen": -421.5753173828125, + "logps/rejected": -801.907958984375, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.096579074859619, + "rewards/margins": 20.425188064575195, + "rewards/rejected": -26.521766662597656, + "step": 743 + }, + { + "epoch": 0.46283048211508554, + "grad_norm": 0.027988320216536522, + "learning_rate": 4.6991701244813285e-06, + "logits/chosen": 0.3800036907196045, + "logits/rejected": 3.919179916381836, + "logps/chosen": -517.352294921875, + "logps/rejected": -964.316162109375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.163141250610352, + "rewards/margins": 28.622203826904297, + "rewards/rejected": -38.785343170166016, + "step": 744 + }, + { + "epoch": 0.463452566096423, + "grad_norm": 25.30392837524414, + "learning_rate": 4.698017519594284e-06, + "logits/chosen": -1.4486440420150757, + "logits/rejected": 3.305373191833496, + "logps/chosen": -373.08807373046875, + "logps/rejected": -865.032958984375, + "loss": 0.6425, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.044712543487549, + "rewards/margins": 28.989627838134766, + "rewards/rejected": -35.034339904785156, + "step": 745 + }, + { + "epoch": 0.4640746500777605, + "grad_norm": 5.703428268432617, + "learning_rate": 4.696864914707239e-06, + "logits/chosen": -2.756654739379883, + "logits/rejected": 0.5146583318710327, + "logps/chosen": -502.0623474121094, + "logps/rejected": -884.9734497070312, + "loss": 0.0311, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.868464469909668, + "rewards/margins": 18.628908157348633, + "rewards/rejected": -28.497373580932617, + "step": 746 + }, + { + "epoch": 0.464696734059098, + "grad_norm": 0.549371063709259, + "learning_rate": 4.695712309820194e-06, + "logits/chosen": 0.6102413535118103, + "logits/rejected": 3.507424831390381, + "logps/chosen": -387.8964538574219, + "logps/rejected": -794.0484008789062, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.76818561553955, + "rewards/margins": 23.353492736816406, + "rewards/rejected": -33.12167739868164, + "step": 747 + }, + { + "epoch": 0.46531881804043546, + "grad_norm": 0.001107222051359713, + "learning_rate": 4.694559704933149e-06, + "logits/chosen": -1.4949012994766235, + "logits/rejected": 2.643195390701294, + "logps/chosen": -450.792724609375, + "logps/rejected": -943.675537109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.531936645507812, + "rewards/margins": 24.204355239868164, + "rewards/rejected": -34.736289978027344, + "step": 748 + }, + { + "epoch": 0.46594090202177296, + "grad_norm": 17.563385009765625, + "learning_rate": 4.693407100046105e-06, + "logits/chosen": 1.4797841310501099, + "logits/rejected": 4.371963024139404, + "logps/chosen": -494.5657958984375, + "logps/rejected": -917.1324462890625, + "loss": 0.264, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.093585968017578, + "rewards/margins": 18.366085052490234, + "rewards/rejected": -27.459671020507812, + "step": 749 + }, + { + "epoch": 0.4665629860031104, + "grad_norm": 43.27721405029297, + "learning_rate": 4.69225449515906e-06, + "logits/chosen": 2.0956027507781982, + "logits/rejected": 3.2051877975463867, + "logps/chosen": -626.8250732421875, + "logps/rejected": -921.1513671875, + "loss": 1.0323, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.858763694763184, + "rewards/margins": 16.39712142944336, + "rewards/rejected": -30.25588607788086, + "step": 750 + }, + { + "epoch": 0.4671850699844479, + "grad_norm": 18.37482261657715, + "learning_rate": 4.691101890272015e-06, + "logits/chosen": -0.03599190711975098, + "logits/rejected": 2.4114785194396973, + "logps/chosen": -523.785888671875, + "logps/rejected": -904.9202270507812, + "loss": 0.1681, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.543304443359375, + "rewards/margins": 26.735179901123047, + "rewards/rejected": -36.27848434448242, + "step": 751 + }, + { + "epoch": 0.46780715396578537, + "grad_norm": 2.7367844581604004, + "learning_rate": 4.68994928538497e-06, + "logits/chosen": 2.2475149631500244, + "logits/rejected": 2.533215284347534, + "logps/chosen": -502.916015625, + "logps/rejected": -703.9136352539062, + "loss": 0.0104, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.636620998382568, + "rewards/margins": 16.46638298034668, + "rewards/rejected": -22.103004455566406, + "step": 752 + }, + { + "epoch": 0.4684292379471229, + "grad_norm": 21.466821670532227, + "learning_rate": 4.6887966804979255e-06, + "logits/chosen": 0.2573103904724121, + "logits/rejected": 3.109172821044922, + "logps/chosen": -586.2950439453125, + "logps/rejected": -882.7223510742188, + "loss": 0.1991, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.989780902862549, + "rewards/margins": 17.86110496520996, + "rewards/rejected": -23.85088539123535, + "step": 753 + }, + { + "epoch": 0.46905132192846033, + "grad_norm": 54.392433166503906, + "learning_rate": 4.687644075610881e-06, + "logits/chosen": -2.5727341175079346, + "logits/rejected": 1.3274749517440796, + "logps/chosen": -367.27911376953125, + "logps/rejected": -709.7308349609375, + "loss": 2.162, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.0185112953186035, + "rewards/margins": 17.04226303100586, + "rewards/rejected": -23.060775756835938, + "step": 754 + }, + { + "epoch": 0.46967340590979784, + "grad_norm": 56.05888366699219, + "learning_rate": 4.686491470723836e-06, + "logits/chosen": -0.37654730677604675, + "logits/rejected": 1.5416078567504883, + "logps/chosen": -543.9898681640625, + "logps/rejected": -910.7147216796875, + "loss": 1.9481, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.147024154663086, + "rewards/margins": 21.383563995361328, + "rewards/rejected": -29.530590057373047, + "step": 755 + }, + { + "epoch": 0.4702954898911353, + "grad_norm": 46.91044998168945, + "learning_rate": 4.685338865836791e-06, + "logits/chosen": 1.3805171251296997, + "logits/rejected": 4.201688289642334, + "logps/chosen": -599.8497314453125, + "logps/rejected": -894.1846313476562, + "loss": 1.7776, + "rewards/accuracies": 0.75, + "rewards/chosen": -10.594759941101074, + "rewards/margins": 20.424440383911133, + "rewards/rejected": -31.019201278686523, + "step": 756 + }, + { + "epoch": 0.4709175738724728, + "grad_norm": 8.315458297729492, + "learning_rate": 4.684186260949747e-06, + "logits/chosen": 4.614778518676758, + "logits/rejected": 4.109716415405273, + "logps/chosen": -738.0523681640625, + "logps/rejected": -843.14599609375, + "loss": 0.0443, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.66948413848877, + "rewards/margins": 13.888378143310547, + "rewards/rejected": -25.557861328125, + "step": 757 + }, + { + "epoch": 0.47153965785381025, + "grad_norm": 9.746458053588867, + "learning_rate": 4.6830336560627025e-06, + "logits/chosen": 1.578921914100647, + "logits/rejected": 1.6241142749786377, + "logps/chosen": -752.6221923828125, + "logps/rejected": -848.0787963867188, + "loss": 0.053, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.152379989624023, + "rewards/margins": 15.030645370483398, + "rewards/rejected": -26.183025360107422, + "step": 758 + }, + { + "epoch": 0.47216174183514775, + "grad_norm": 54.19750213623047, + "learning_rate": 4.681881051175658e-06, + "logits/chosen": -2.6089086532592773, + "logits/rejected": 0.563267707824707, + "logps/chosen": -492.4306945800781, + "logps/rejected": -795.8643798828125, + "loss": 1.1181, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.355657577514648, + "rewards/margins": 14.989089965820312, + "rewards/rejected": -22.344745635986328, + "step": 759 + }, + { + "epoch": 0.4727838258164852, + "grad_norm": 40.35515594482422, + "learning_rate": 4.680728446288613e-06, + "logits/chosen": 1.459610939025879, + "logits/rejected": 2.081594228744507, + "logps/chosen": -646.0441284179688, + "logps/rejected": -817.3916015625, + "loss": 0.6381, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.502636909484863, + "rewards/margins": 13.858505249023438, + "rewards/rejected": -24.36114501953125, + "step": 760 + }, + { + "epoch": 0.4734059097978227, + "grad_norm": 37.597110748291016, + "learning_rate": 4.679575841401568e-06, + "logits/chosen": -0.21912920475006104, + "logits/rejected": 4.26984167098999, + "logps/chosen": -525.0760498046875, + "logps/rejected": -860.02294921875, + "loss": 0.5311, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.533904075622559, + "rewards/margins": 13.084606170654297, + "rewards/rejected": -21.61850929260254, + "step": 761 + }, + { + "epoch": 0.47402799377916016, + "grad_norm": 3.999826669692993, + "learning_rate": 4.678423236514523e-06, + "logits/chosen": 1.7163630723953247, + "logits/rejected": 5.152613639831543, + "logps/chosen": -440.76507568359375, + "logps/rejected": -786.729248046875, + "loss": 0.0292, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.547441005706787, + "rewards/margins": 15.236788749694824, + "rewards/rejected": -18.784229278564453, + "step": 762 + }, + { + "epoch": 0.47465007776049767, + "grad_norm": 1.3566317420554697e-06, + "learning_rate": 4.677270631627479e-06, + "logits/chosen": 0.08991807699203491, + "logits/rejected": 4.114513397216797, + "logps/chosen": -538.1998901367188, + "logps/rejected": -1037.4219970703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.77888822555542, + "rewards/margins": 29.567556381225586, + "rewards/rejected": -33.3464469909668, + "step": 763 + }, + { + "epoch": 0.4752721617418352, + "grad_norm": 1.6059880256652832, + "learning_rate": 4.676118026740434e-06, + "logits/chosen": 1.8196675777435303, + "logits/rejected": 4.471251487731934, + "logps/chosen": -545.8761596679688, + "logps/rejected": -786.5194091796875, + "loss": 0.0108, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.208722114562988, + "rewards/margins": 13.019830703735352, + "rewards/rejected": -17.228551864624023, + "step": 764 + }, + { + "epoch": 0.4758942457231726, + "grad_norm": 8.153962135314941, + "learning_rate": 4.674965421853389e-06, + "logits/chosen": 3.151045322418213, + "logits/rejected": 3.54077410697937, + "logps/chosen": -668.2283935546875, + "logps/rejected": -743.080078125, + "loss": 0.06, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.266658782958984, + "rewards/margins": 11.068963050842285, + "rewards/rejected": -16.335622787475586, + "step": 765 + }, + { + "epoch": 0.47651632970451013, + "grad_norm": 41.73101806640625, + "learning_rate": 4.673812816966344e-06, + "logits/chosen": 0.876078188419342, + "logits/rejected": 3.8836395740509033, + "logps/chosen": -591.0263671875, + "logps/rejected": -935.2633666992188, + "loss": 1.4338, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.700626373291016, + "rewards/margins": 18.544769287109375, + "rewards/rejected": -25.24539566040039, + "step": 766 + }, + { + "epoch": 0.4771384136858476, + "grad_norm": 43.17827224731445, + "learning_rate": 4.6726602120792995e-06, + "logits/chosen": -1.3370412588119507, + "logits/rejected": 1.3205068111419678, + "logps/chosen": -465.7358703613281, + "logps/rejected": -738.778076171875, + "loss": 0.7697, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.05903959274292, + "rewards/margins": 13.253592491149902, + "rewards/rejected": -17.312631607055664, + "step": 767 + }, + { + "epoch": 0.4777604976671851, + "grad_norm": 0.40470781922340393, + "learning_rate": 4.671507607192255e-06, + "logits/chosen": 0.4932441711425781, + "logits/rejected": 4.747807025909424, + "logps/chosen": -420.48004150390625, + "logps/rejected": -759.2723999023438, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8618760108947754, + "rewards/margins": 15.449785232543945, + "rewards/rejected": -19.311660766601562, + "step": 768 + }, + { + "epoch": 0.47838258164852254, + "grad_norm": 1.6781436204910278, + "learning_rate": 4.67035500230521e-06, + "logits/chosen": -0.32106083631515503, + "logits/rejected": 2.2426936626434326, + "logps/chosen": -293.17706298828125, + "logps/rejected": -586.2684326171875, + "loss": 0.0242, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.407588243484497, + "rewards/margins": 13.709722518920898, + "rewards/rejected": -16.1173095703125, + "step": 769 + }, + { + "epoch": 0.47900466562986005, + "grad_norm": 0.521558403968811, + "learning_rate": 4.669202397418165e-06, + "logits/chosen": 0.8737245798110962, + "logits/rejected": 2.6280784606933594, + "logps/chosen": -627.6446533203125, + "logps/rejected": -758.8953857421875, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.146016597747803, + "rewards/margins": 14.013367652893066, + "rewards/rejected": -19.15938377380371, + "step": 770 + }, + { + "epoch": 0.4796267496111975, + "grad_norm": 0.2913641333580017, + "learning_rate": 4.66804979253112e-06, + "logits/chosen": -1.9072849750518799, + "logits/rejected": 2.029294013977051, + "logps/chosen": -430.62786865234375, + "logps/rejected": -845.602783203125, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7142653465270996, + "rewards/margins": 19.053903579711914, + "rewards/rejected": -22.76816749572754, + "step": 771 + }, + { + "epoch": 0.480248833592535, + "grad_norm": 8.871782302856445, + "learning_rate": 4.6668971876440765e-06, + "logits/chosen": 0.41620588302612305, + "logits/rejected": 3.4311208724975586, + "logps/chosen": -433.32684326171875, + "logps/rejected": -803.77783203125, + "loss": 0.2005, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.387638092041016, + "rewards/margins": 16.02027702331543, + "rewards/rejected": -20.407915115356445, + "step": 772 + }, + { + "epoch": 0.48087091757387246, + "grad_norm": 1.6098644733428955, + "learning_rate": 4.665744582757032e-06, + "logits/chosen": 3.09476900100708, + "logits/rejected": 2.489342212677002, + "logps/chosen": -642.2359008789062, + "logps/rejected": -781.3652954101562, + "loss": 0.011, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.92470645904541, + "rewards/margins": 16.429569244384766, + "rewards/rejected": -22.35427474975586, + "step": 773 + }, + { + "epoch": 0.48149300155520997, + "grad_norm": 4.467459678649902, + "learning_rate": 4.664591977869987e-06, + "logits/chosen": 0.6544711589813232, + "logits/rejected": 4.949759483337402, + "logps/chosen": -458.5473937988281, + "logps/rejected": -877.2110595703125, + "loss": 0.0369, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.6345696449279785, + "rewards/margins": 20.904634475708008, + "rewards/rejected": -25.539203643798828, + "step": 774 + }, + { + "epoch": 0.4821150855365474, + "grad_norm": 2.7226791381835938, + "learning_rate": 4.663439372982942e-06, + "logits/chosen": -0.25376057624816895, + "logits/rejected": 3.4412503242492676, + "logps/chosen": -515.607177734375, + "logps/rejected": -887.7158813476562, + "loss": 0.0103, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2038459777832031, + "rewards/margins": 14.594051361083984, + "rewards/rejected": -15.797898292541504, + "step": 775 + }, + { + "epoch": 0.4827371695178849, + "grad_norm": 19.828170776367188, + "learning_rate": 4.662286768095897e-06, + "logits/chosen": 2.055224895477295, + "logits/rejected": 3.848604440689087, + "logps/chosen": -650.3723754882812, + "logps/rejected": -948.07470703125, + "loss": 0.1521, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.56473445892334, + "rewards/margins": 17.763263702392578, + "rewards/rejected": -26.3279972076416, + "step": 776 + }, + { + "epoch": 0.4833592534992224, + "grad_norm": 30.034021377563477, + "learning_rate": 4.661134163208853e-06, + "logits/chosen": 0.7829601764678955, + "logits/rejected": 3.1941311359405518, + "logps/chosen": -531.4030151367188, + "logps/rejected": -833.4378051757812, + "loss": 1.231, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.631799221038818, + "rewards/margins": 16.359912872314453, + "rewards/rejected": -21.991710662841797, + "step": 777 + }, + { + "epoch": 0.4839813374805599, + "grad_norm": 26.594362258911133, + "learning_rate": 4.659981558321808e-06, + "logits/chosen": -1.7318233251571655, + "logits/rejected": 0.6387461423873901, + "logps/chosen": -417.7113952636719, + "logps/rejected": -671.1884765625, + "loss": 0.4294, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.4069485664367676, + "rewards/margins": 14.395118713378906, + "rewards/rejected": -16.80206871032715, + "step": 778 + }, + { + "epoch": 0.48460342146189733, + "grad_norm": 0.039601147174835205, + "learning_rate": 4.658828953434763e-06, + "logits/chosen": 1.3546221256256104, + "logits/rejected": 3.462092399597168, + "logps/chosen": -591.6671142578125, + "logps/rejected": -932.0730590820312, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.906269550323486, + "rewards/margins": 20.576311111450195, + "rewards/rejected": -28.482580184936523, + "step": 779 + }, + { + "epoch": 0.48522550544323484, + "grad_norm": 0.25924599170684814, + "learning_rate": 4.657676348547718e-06, + "logits/chosen": -2.0941567420959473, + "logits/rejected": 2.815412759780884, + "logps/chosen": -234.81979370117188, + "logps/rejected": -680.761474609375, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9445247650146484, + "rewards/margins": 19.434349060058594, + "rewards/rejected": -23.37887191772461, + "step": 780 + }, + { + "epoch": 0.4858475894245723, + "grad_norm": 40.54834747314453, + "learning_rate": 4.6565237436606735e-06, + "logits/chosen": -2.5595715045928955, + "logits/rejected": 1.2172040939331055, + "logps/chosen": -449.4385986328125, + "logps/rejected": -810.2548828125, + "loss": 1.0862, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.879364490509033, + "rewards/margins": 14.449647903442383, + "rewards/rejected": -19.329011917114258, + "step": 781 + }, + { + "epoch": 0.4864696734059098, + "grad_norm": 0.00287282164208591, + "learning_rate": 4.655371138773629e-06, + "logits/chosen": -0.0861775279045105, + "logits/rejected": 1.4691548347473145, + "logps/chosen": -417.1506652832031, + "logps/rejected": -736.8383178710938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.55625057220459, + "rewards/margins": 19.743030548095703, + "rewards/rejected": -26.29928207397461, + "step": 782 + }, + { + "epoch": 0.4870917573872473, + "grad_norm": 0.6697667837142944, + "learning_rate": 4.654218533886584e-06, + "logits/chosen": 0.8768869638442993, + "logits/rejected": 3.338918685913086, + "logps/chosen": -578.1866455078125, + "logps/rejected": -951.1362915039062, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.037720680236816, + "rewards/margins": 21.80088996887207, + "rewards/rejected": -27.83860969543457, + "step": 783 + }, + { + "epoch": 0.48771384136858476, + "grad_norm": 13.243257522583008, + "learning_rate": 4.653065928999539e-06, + "logits/chosen": 0.5735681653022766, + "logits/rejected": 1.3145192861557007, + "logps/chosen": -499.7567138671875, + "logps/rejected": -654.567626953125, + "loss": 0.102, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3559441566467285, + "rewards/margins": 11.995672225952148, + "rewards/rejected": -14.351615905761719, + "step": 784 + }, + { + "epoch": 0.48833592534992226, + "grad_norm": 2.240255832672119, + "learning_rate": 4.651913324112494e-06, + "logits/chosen": 0.1266368329524994, + "logits/rejected": 3.817394733428955, + "logps/chosen": -391.1224365234375, + "logps/rejected": -742.4867553710938, + "loss": 0.0115, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3036351203918457, + "rewards/margins": 17.926546096801758, + "rewards/rejected": -20.230178833007812, + "step": 785 + }, + { + "epoch": 0.4889580093312597, + "grad_norm": 0.12976962327957153, + "learning_rate": 4.6507607192254504e-06, + "logits/chosen": 0.6222423315048218, + "logits/rejected": 4.154601097106934, + "logps/chosen": -553.1651000976562, + "logps/rejected": -969.9971923828125, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.439103126525879, + "rewards/margins": 21.052276611328125, + "rewards/rejected": -27.49138069152832, + "step": 786 + }, + { + "epoch": 0.4895800933125972, + "grad_norm": 12.953614234924316, + "learning_rate": 4.649608114338406e-06, + "logits/chosen": -1.1646008491516113, + "logits/rejected": 2.2630698680877686, + "logps/chosen": -446.870361328125, + "logps/rejected": -865.9774169921875, + "loss": 0.0981, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.6477274894714355, + "rewards/margins": 17.280902862548828, + "rewards/rejected": -24.928630828857422, + "step": 787 + }, + { + "epoch": 0.49020217729393467, + "grad_norm": 24.662870407104492, + "learning_rate": 4.648455509451361e-06, + "logits/chosen": 2.1699166297912598, + "logits/rejected": 3.7511253356933594, + "logps/chosen": -555.0864868164062, + "logps/rejected": -872.4273681640625, + "loss": 0.5918, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.620430946350098, + "rewards/margins": 18.23769187927246, + "rewards/rejected": -23.858123779296875, + "step": 788 + }, + { + "epoch": 0.4908242612752722, + "grad_norm": 0.28288745880126953, + "learning_rate": 4.647302904564316e-06, + "logits/chosen": -0.5442591905593872, + "logits/rejected": 3.1663870811462402, + "logps/chosen": -431.5106506347656, + "logps/rejected": -803.7391357421875, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.360430717468262, + "rewards/margins": 13.79015827178955, + "rewards/rejected": -19.150588989257812, + "step": 789 + }, + { + "epoch": 0.49144634525660963, + "grad_norm": 26.9351863861084, + "learning_rate": 4.646150299677271e-06, + "logits/chosen": -0.5746155381202698, + "logits/rejected": 2.7818684577941895, + "logps/chosen": -447.7415466308594, + "logps/rejected": -706.74072265625, + "loss": 0.3673, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.7811388969421387, + "rewards/margins": 11.556182861328125, + "rewards/rejected": -15.337322235107422, + "step": 790 + }, + { + "epoch": 0.49206842923794714, + "grad_norm": 18.7960262298584, + "learning_rate": 4.6449976947902266e-06, + "logits/chosen": -0.4236619472503662, + "logits/rejected": 3.328354835510254, + "logps/chosen": -433.424560546875, + "logps/rejected": -815.5387573242188, + "loss": 0.3204, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.099715232849121, + "rewards/margins": 17.299833297729492, + "rewards/rejected": -23.399547576904297, + "step": 791 + }, + { + "epoch": 0.4926905132192846, + "grad_norm": 0.0008821140509098768, + "learning_rate": 4.643845089903182e-06, + "logits/chosen": -0.3153393268585205, + "logits/rejected": 3.704957962036133, + "logps/chosen": -450.7311096191406, + "logps/rejected": -864.4533081054688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.469179391860962, + "rewards/margins": 22.34432029724121, + "rewards/rejected": -25.813501358032227, + "step": 792 + }, + { + "epoch": 0.4933125972006221, + "grad_norm": 0.9906829595565796, + "learning_rate": 4.642692485016137e-06, + "logits/chosen": -0.16309303045272827, + "logits/rejected": 4.056373596191406, + "logps/chosen": -536.8864135742188, + "logps/rejected": -936.7313232421875, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.904061317443848, + "rewards/margins": 19.377239227294922, + "rewards/rejected": -26.281301498413086, + "step": 793 + }, + { + "epoch": 0.49393468118195955, + "grad_norm": 4.4242730140686035, + "learning_rate": 4.641539880129092e-06, + "logits/chosen": 1.683719515800476, + "logits/rejected": 3.134469747543335, + "logps/chosen": -638.5487060546875, + "logps/rejected": -888.1138916015625, + "loss": 0.0155, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.161971569061279, + "rewards/margins": 17.2728271484375, + "rewards/rejected": -22.434797286987305, + "step": 794 + }, + { + "epoch": 0.49455676516329705, + "grad_norm": 0.9904939532279968, + "learning_rate": 4.6403872752420475e-06, + "logits/chosen": 0.6282204985618591, + "logits/rejected": 2.4573426246643066, + "logps/chosen": -596.8509521484375, + "logps/rejected": -819.6409301757812, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.868796348571777, + "rewards/margins": 16.89417266845703, + "rewards/rejected": -25.762969970703125, + "step": 795 + }, + { + "epoch": 0.4951788491446345, + "grad_norm": 0.050002530217170715, + "learning_rate": 4.639234670355003e-06, + "logits/chosen": 0.26788705587387085, + "logits/rejected": 4.157488822937012, + "logps/chosen": -586.4090576171875, + "logps/rejected": -960.6173095703125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.250540733337402, + "rewards/margins": 18.535598754882812, + "rewards/rejected": -23.78614044189453, + "step": 796 + }, + { + "epoch": 0.495800933125972, + "grad_norm": 0.3074765205383301, + "learning_rate": 4.638082065467958e-06, + "logits/chosen": 2.1239802837371826, + "logits/rejected": 4.563276767730713, + "logps/chosen": -500.58929443359375, + "logps/rejected": -899.6068115234375, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.407536029815674, + "rewards/margins": 16.969867706298828, + "rewards/rejected": -24.377403259277344, + "step": 797 + }, + { + "epoch": 0.49642301710730946, + "grad_norm": 0.8631083965301514, + "learning_rate": 4.636929460580913e-06, + "logits/chosen": 0.07526445388793945, + "logits/rejected": 1.3133467435836792, + "logps/chosen": -547.68994140625, + "logps/rejected": -817.7927856445312, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.602295398712158, + "rewards/margins": 11.569221496582031, + "rewards/rejected": -18.17151641845703, + "step": 798 + }, + { + "epoch": 0.49704510108864697, + "grad_norm": 0.0004819149326067418, + "learning_rate": 4.635776855693868e-06, + "logits/chosen": -1.4631166458129883, + "logits/rejected": 4.0301384925842285, + "logps/chosen": -348.5102233886719, + "logps/rejected": -859.6078491210938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.0662336349487305, + "rewards/margins": 19.39805030822754, + "rewards/rejected": -24.464282989501953, + "step": 799 + }, + { + "epoch": 0.4976671850699845, + "grad_norm": 1.7157622575759888, + "learning_rate": 4.634624250806824e-06, + "logits/chosen": 1.9659500122070312, + "logits/rejected": 3.541919708251953, + "logps/chosen": -578.5076904296875, + "logps/rejected": -742.12841796875, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.218245506286621, + "rewards/margins": 15.700920104980469, + "rewards/rejected": -18.919166564941406, + "step": 800 + }, + { + "epoch": 0.4982892690513219, + "grad_norm": 13.794669151306152, + "learning_rate": 4.63347164591978e-06, + "logits/chosen": 3.571282386779785, + "logits/rejected": 4.697101593017578, + "logps/chosen": -663.9895629882812, + "logps/rejected": -960.5087890625, + "loss": 0.0757, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.03059196472168, + "rewards/margins": 18.64733123779297, + "rewards/rejected": -25.677921295166016, + "step": 801 + }, + { + "epoch": 0.49891135303265943, + "grad_norm": 4.653563022613525, + "learning_rate": 4.632319041032735e-06, + "logits/chosen": -0.7853131890296936, + "logits/rejected": 2.139136552810669, + "logps/chosen": -603.5784912109375, + "logps/rejected": -852.1097412109375, + "loss": 0.0267, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.409650802612305, + "rewards/margins": 11.422354698181152, + "rewards/rejected": -16.83200454711914, + "step": 802 + }, + { + "epoch": 0.4995334370139969, + "grad_norm": 27.104677200317383, + "learning_rate": 4.63116643614569e-06, + "logits/chosen": -1.9991862773895264, + "logits/rejected": 3.2186059951782227, + "logps/chosen": -384.8690185546875, + "logps/rejected": -877.2080078125, + "loss": 0.1709, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.625671863555908, + "rewards/margins": 17.056589126586914, + "rewards/rejected": -22.682260513305664, + "step": 803 + }, + { + "epoch": 0.5001555209953343, + "grad_norm": 0.00779650267213583, + "learning_rate": 4.630013831258645e-06, + "logits/chosen": -1.2525960206985474, + "logits/rejected": 3.466864585876465, + "logps/chosen": -509.2117919921875, + "logps/rejected": -991.336181640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.337059020996094, + "rewards/margins": 22.115440368652344, + "rewards/rejected": -26.45250129699707, + "step": 804 + }, + { + "epoch": 0.5007776049766719, + "grad_norm": 35.081085205078125, + "learning_rate": 4.6288612263716006e-06, + "logits/chosen": 1.4515666961669922, + "logits/rejected": 2.7874021530151367, + "logps/chosen": -558.619140625, + "logps/rejected": -708.9620361328125, + "loss": 0.3746, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.3137013912200928, + "rewards/margins": 13.120676040649414, + "rewards/rejected": -16.434377670288086, + "step": 805 + }, + { + "epoch": 0.5013996889580093, + "grad_norm": 0.13099785149097443, + "learning_rate": 4.627708621484556e-06, + "logits/chosen": -0.2925226092338562, + "logits/rejected": 2.9255149364471436, + "logps/chosen": -508.7183837890625, + "logps/rejected": -914.4775390625, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.891010284423828, + "rewards/margins": 18.918092727661133, + "rewards/rejected": -25.809104919433594, + "step": 806 + }, + { + "epoch": 0.5020217729393468, + "grad_norm": 16.45452880859375, + "learning_rate": 4.626556016597511e-06, + "logits/chosen": 0.7290836572647095, + "logits/rejected": 3.254908323287964, + "logps/chosen": -575.7127685546875, + "logps/rejected": -794.7529296875, + "loss": 0.1445, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.9818434715271, + "rewards/margins": 11.375015258789062, + "rewards/rejected": -19.35685920715332, + "step": 807 + }, + { + "epoch": 0.5026438569206843, + "grad_norm": 1.7199348211288452, + "learning_rate": 4.625403411710466e-06, + "logits/chosen": 4.098562717437744, + "logits/rejected": 5.463818073272705, + "logps/chosen": -594.138671875, + "logps/rejected": -830.6209716796875, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.791903495788574, + "rewards/margins": 15.993047714233398, + "rewards/rejected": -20.784954071044922, + "step": 808 + }, + { + "epoch": 0.5032659409020218, + "grad_norm": 0.016061756759881973, + "learning_rate": 4.6242508068234215e-06, + "logits/chosen": -2.1252896785736084, + "logits/rejected": 2.6811745166778564, + "logps/chosen": -316.495361328125, + "logps/rejected": -750.6249389648438, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.024198055267334, + "rewards/margins": 18.38684844970703, + "rewards/rejected": -23.41104507446289, + "step": 809 + }, + { + "epoch": 0.5038880248833593, + "grad_norm": 1.1880834102630615, + "learning_rate": 4.623098201936377e-06, + "logits/chosen": 2.4408676624298096, + "logits/rejected": 3.9343366622924805, + "logps/chosen": -487.645263671875, + "logps/rejected": -694.6914672851562, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.5457634925842285, + "rewards/margins": 16.69846534729004, + "rewards/rejected": -23.244230270385742, + "step": 810 + }, + { + "epoch": 0.5045101088646967, + "grad_norm": 27.158384323120117, + "learning_rate": 4.621945597049332e-06, + "logits/chosen": 2.4023821353912354, + "logits/rejected": 5.048232078552246, + "logps/chosen": -527.9765625, + "logps/rejected": -850.1030883789062, + "loss": 0.2794, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.261188507080078, + "rewards/margins": 13.181292533874512, + "rewards/rejected": -17.442481994628906, + "step": 811 + }, + { + "epoch": 0.5051321928460342, + "grad_norm": 0.00033380460808984935, + "learning_rate": 4.620792992162287e-06, + "logits/chosen": -1.0781645774841309, + "logits/rejected": 3.9537434577941895, + "logps/chosen": -447.1126708984375, + "logps/rejected": -944.2455444335938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.95174503326416, + "rewards/margins": 22.1113338470459, + "rewards/rejected": -27.063077926635742, + "step": 812 + }, + { + "epoch": 0.5057542768273717, + "grad_norm": 3.74644914700184e-05, + "learning_rate": 4.619640387275242e-06, + "logits/chosen": 1.647376298904419, + "logits/rejected": 3.9168145656585693, + "logps/chosen": -501.5458984375, + "logps/rejected": -923.9151611328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.664668083190918, + "rewards/margins": 23.443607330322266, + "rewards/rejected": -30.108272552490234, + "step": 813 + }, + { + "epoch": 0.5063763608087092, + "grad_norm": 3.1582701467414154e-06, + "learning_rate": 4.618487782388198e-06, + "logits/chosen": -0.49896568059921265, + "logits/rejected": 2.6935436725616455, + "logps/chosen": -384.57977294921875, + "logps/rejected": -788.4315185546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.673390865325928, + "rewards/margins": 22.454517364501953, + "rewards/rejected": -27.127906799316406, + "step": 814 + }, + { + "epoch": 0.5069984447900466, + "grad_norm": 0.014756478369235992, + "learning_rate": 4.617335177501153e-06, + "logits/chosen": 0.6831048727035522, + "logits/rejected": 4.275322437286377, + "logps/chosen": -511.0201110839844, + "logps/rejected": -856.8211669921875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.114786624908447, + "rewards/margins": 18.40111541748047, + "rewards/rejected": -22.51590347290039, + "step": 815 + }, + { + "epoch": 0.5076205287713841, + "grad_norm": 17.718061447143555, + "learning_rate": 4.616182572614109e-06, + "logits/chosen": 1.340681552886963, + "logits/rejected": 3.1322154998779297, + "logps/chosen": -622.8712158203125, + "logps/rejected": -1000.7344360351562, + "loss": 0.108, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.9386420249938965, + "rewards/margins": 22.70752716064453, + "rewards/rejected": -29.646167755126953, + "step": 816 + }, + { + "epoch": 0.5082426127527216, + "grad_norm": 22.104307174682617, + "learning_rate": 4.615029967727064e-06, + "logits/chosen": -2.7415151596069336, + "logits/rejected": 2.7648801803588867, + "logps/chosen": -374.72784423828125, + "logps/rejected": -899.2406616210938, + "loss": 0.2482, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.962153673171997, + "rewards/margins": 18.367874145507812, + "rewards/rejected": -22.330028533935547, + "step": 817 + }, + { + "epoch": 0.5088646967340591, + "grad_norm": 32.42689514160156, + "learning_rate": 4.613877362840019e-06, + "logits/chosen": 1.2072477340698242, + "logits/rejected": 3.5181543827056885, + "logps/chosen": -589.1121215820312, + "logps/rejected": -896.8251953125, + "loss": 0.8031, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.592341899871826, + "rewards/margins": 17.727842330932617, + "rewards/rejected": -24.32018280029297, + "step": 818 + }, + { + "epoch": 0.5094867807153965, + "grad_norm": 11.6628999710083, + "learning_rate": 4.6127247579529746e-06, + "logits/chosen": 1.7232805490493774, + "logits/rejected": 3.776496171951294, + "logps/chosen": -568.9671630859375, + "logps/rejected": -874.9674072265625, + "loss": 0.0466, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.510915756225586, + "rewards/margins": 19.880298614501953, + "rewards/rejected": -28.391212463378906, + "step": 819 + }, + { + "epoch": 0.5101088646967341, + "grad_norm": 0.7271249890327454, + "learning_rate": 4.61157215306593e-06, + "logits/chosen": 3.2940382957458496, + "logits/rejected": 4.654926300048828, + "logps/chosen": -770.0858154296875, + "logps/rejected": -996.1089477539062, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.865810394287109, + "rewards/margins": 19.61363983154297, + "rewards/rejected": -27.479454040527344, + "step": 820 + }, + { + "epoch": 0.5107309486780716, + "grad_norm": 1.074064016342163, + "learning_rate": 4.610419548178885e-06, + "logits/chosen": 1.2415605783462524, + "logits/rejected": 2.915865182876587, + "logps/chosen": -571.21728515625, + "logps/rejected": -850.7125854492188, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.833264350891113, + "rewards/margins": 13.54500961303711, + "rewards/rejected": -21.378273010253906, + "step": 821 + }, + { + "epoch": 0.511353032659409, + "grad_norm": 0.013838615268468857, + "learning_rate": 4.609266943291839e-06, + "logits/chosen": -2.0438036918640137, + "logits/rejected": 1.6256415843963623, + "logps/chosen": -343.5359802246094, + "logps/rejected": -786.7481689453125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.463273763656616, + "rewards/margins": 24.050792694091797, + "rewards/rejected": -27.514068603515625, + "step": 822 + }, + { + "epoch": 0.5119751166407465, + "grad_norm": 0.001235920935869217, + "learning_rate": 4.608114338404795e-06, + "logits/chosen": -0.3681415319442749, + "logits/rejected": 2.907719135284424, + "logps/chosen": -496.8587646484375, + "logps/rejected": -865.68701171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.546807765960693, + "rewards/margins": 21.944412231445312, + "rewards/rejected": -26.49121856689453, + "step": 823 + }, + { + "epoch": 0.512597200622084, + "grad_norm": 12.031455039978027, + "learning_rate": 4.60696173351775e-06, + "logits/chosen": 0.4697137773036957, + "logits/rejected": 4.086321830749512, + "logps/chosen": -567.7445678710938, + "logps/rejected": -843.5440673828125, + "loss": 0.0623, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.043512344360352, + "rewards/margins": 12.916199684143066, + "rewards/rejected": -17.9597110748291, + "step": 824 + }, + { + "epoch": 0.5132192846034215, + "grad_norm": 23.215635299682617, + "learning_rate": 4.605809128630706e-06, + "logits/chosen": 0.749940037727356, + "logits/rejected": 3.3669252395629883, + "logps/chosen": -502.6315612792969, + "logps/rejected": -741.6364135742188, + "loss": 0.6005, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.730608940124512, + "rewards/margins": 13.164880752563477, + "rewards/rejected": -17.895488739013672, + "step": 825 + }, + { + "epoch": 0.5138413685847589, + "grad_norm": 0.6654096841812134, + "learning_rate": 4.604656523743661e-06, + "logits/chosen": 0.640193521976471, + "logits/rejected": 3.969430685043335, + "logps/chosen": -396.957763671875, + "logps/rejected": -793.5340576171875, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.256521701812744, + "rewards/margins": 13.162273406982422, + "rewards/rejected": -17.418794631958008, + "step": 826 + }, + { + "epoch": 0.5144634525660964, + "grad_norm": 22.310516357421875, + "learning_rate": 4.603503918856616e-06, + "logits/chosen": -0.8501242399215698, + "logits/rejected": 3.2708749771118164, + "logps/chosen": -453.19781494140625, + "logps/rejected": -840.7752685546875, + "loss": 0.4233, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.1451218128204346, + "rewards/margins": 17.685771942138672, + "rewards/rejected": -20.830896377563477, + "step": 827 + }, + { + "epoch": 0.5150855365474339, + "grad_norm": 16.61135482788086, + "learning_rate": 4.602351313969572e-06, + "logits/chosen": 1.2462623119354248, + "logits/rejected": 5.105804920196533, + "logps/chosen": -500.0732421875, + "logps/rejected": -918.1619873046875, + "loss": 0.1104, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.89671516418457, + "rewards/margins": 21.1292724609375, + "rewards/rejected": -26.025989532470703, + "step": 828 + }, + { + "epoch": 0.5157076205287714, + "grad_norm": 0.044712089002132416, + "learning_rate": 4.601198709082527e-06, + "logits/chosen": -1.1490451097488403, + "logits/rejected": 2.0136399269104004, + "logps/chosen": -406.8626708984375, + "logps/rejected": -761.128662109375, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.640765190124512, + "rewards/margins": 17.16360855102539, + "rewards/rejected": -24.804372787475586, + "step": 829 + }, + { + "epoch": 0.5163297045101088, + "grad_norm": 15.269174575805664, + "learning_rate": 4.600046104195482e-06, + "logits/chosen": 0.6019724607467651, + "logits/rejected": 3.838488817214966, + "logps/chosen": -618.8802490234375, + "logps/rejected": -999.3819580078125, + "loss": 0.1088, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.692051410675049, + "rewards/margins": 17.41494369506836, + "rewards/rejected": -20.10699462890625, + "step": 830 + }, + { + "epoch": 0.5169517884914463, + "grad_norm": 0.014774742536246777, + "learning_rate": 4.598893499308437e-06, + "logits/chosen": -2.8291544914245605, + "logits/rejected": 3.349138021469116, + "logps/chosen": -300.6230163574219, + "logps/rejected": -764.5264892578125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.240640163421631, + "rewards/margins": 20.8753719329834, + "rewards/rejected": -23.116012573242188, + "step": 831 + }, + { + "epoch": 0.5175738724727839, + "grad_norm": 1.0063591003417969, + "learning_rate": 4.5977408944213925e-06, + "logits/chosen": -0.15792837738990784, + "logits/rejected": 4.523929595947266, + "logps/chosen": -366.80047607421875, + "logps/rejected": -839.95703125, + "loss": 0.0108, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.910074234008789, + "rewards/margins": 15.645296096801758, + "rewards/rejected": -20.555370330810547, + "step": 832 + }, + { + "epoch": 0.5181959564541213, + "grad_norm": 26.251590728759766, + "learning_rate": 4.596588289534348e-06, + "logits/chosen": 1.309356451034546, + "logits/rejected": 3.279106616973877, + "logps/chosen": -526.9833984375, + "logps/rejected": -892.2254028320312, + "loss": 0.2489, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.946220397949219, + "rewards/margins": 24.749940872192383, + "rewards/rejected": -29.69615936279297, + "step": 833 + }, + { + "epoch": 0.5188180404354588, + "grad_norm": 0.0008679351885803044, + "learning_rate": 4.595435684647303e-06, + "logits/chosen": -0.7757160663604736, + "logits/rejected": 3.746737003326416, + "logps/chosen": -412.482666015625, + "logps/rejected": -899.3509521484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5694308280944824, + "rewards/margins": 18.884733200073242, + "rewards/rejected": -22.454164505004883, + "step": 834 + }, + { + "epoch": 0.5194401244167963, + "grad_norm": 0.44425633549690247, + "learning_rate": 4.594283079760258e-06, + "logits/chosen": -1.775087594985962, + "logits/rejected": 2.158067464828491, + "logps/chosen": -380.7835693359375, + "logps/rejected": -724.4786376953125, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.430812835693359, + "rewards/margins": 15.602945327758789, + "rewards/rejected": -22.03375816345215, + "step": 835 + }, + { + "epoch": 0.5200622083981338, + "grad_norm": 1.231345295906067, + "learning_rate": 4.593130474873213e-06, + "logits/chosen": 1.7987587451934814, + "logits/rejected": 3.159287691116333, + "logps/chosen": -584.3314208984375, + "logps/rejected": -882.6793212890625, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.365367889404297, + "rewards/margins": 22.243616104125977, + "rewards/rejected": -30.608985900878906, + "step": 836 + }, + { + "epoch": 0.5206842923794712, + "grad_norm": 0.0006359569961205125, + "learning_rate": 4.591977869986169e-06, + "logits/chosen": -0.4184119701385498, + "logits/rejected": 1.7271060943603516, + "logps/chosen": -530.0023193359375, + "logps/rejected": -953.5392456054688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.454473495483398, + "rewards/margins": 27.001876831054688, + "rewards/rejected": -33.45635223388672, + "step": 837 + }, + { + "epoch": 0.5213063763608087, + "grad_norm": 35.898162841796875, + "learning_rate": 4.590825265099124e-06, + "logits/chosen": 1.0566655397415161, + "logits/rejected": 3.6799302101135254, + "logps/chosen": -493.8349609375, + "logps/rejected": -713.1172485351562, + "loss": 0.251, + "rewards/accuracies": 0.75, + "rewards/chosen": -7.644853115081787, + "rewards/margins": 11.174239158630371, + "rewards/rejected": -18.819091796875, + "step": 838 + }, + { + "epoch": 0.5219284603421462, + "grad_norm": 4.95714573389705e-07, + "learning_rate": 4.589672660212079e-06, + "logits/chosen": -0.5007285475730896, + "logits/rejected": 4.842340469360352, + "logps/chosen": -400.39447021484375, + "logps/rejected": -1001.4553833007812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2126622200012207, + "rewards/margins": 32.23038864135742, + "rewards/rejected": -35.443050384521484, + "step": 839 + }, + { + "epoch": 0.5225505443234837, + "grad_norm": 33.6435546875, + "learning_rate": 4.588520055325035e-06, + "logits/chosen": 2.3923635482788086, + "logits/rejected": 3.9905447959899902, + "logps/chosen": -576.5003051757812, + "logps/rejected": -802.433837890625, + "loss": 1.0211, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.508181095123291, + "rewards/margins": 14.160221099853516, + "rewards/rejected": -21.66840171813965, + "step": 840 + }, + { + "epoch": 0.5231726283048211, + "grad_norm": 3.987846612930298, + "learning_rate": 4.58736745043799e-06, + "logits/chosen": 1.476585030555725, + "logits/rejected": 3.4213404655456543, + "logps/chosen": -489.3512268066406, + "logps/rejected": -753.6704711914062, + "loss": 0.0271, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.481175899505615, + "rewards/margins": 15.506742477416992, + "rewards/rejected": -21.987918853759766, + "step": 841 + }, + { + "epoch": 0.5237947122861586, + "grad_norm": 0.2226215898990631, + "learning_rate": 4.5862148455509456e-06, + "logits/chosen": -0.1877286732196808, + "logits/rejected": 3.0288901329040527, + "logps/chosen": -532.8231811523438, + "logps/rejected": -876.0736694335938, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.797203540802002, + "rewards/margins": 18.92719078063965, + "rewards/rejected": -25.724393844604492, + "step": 842 + }, + { + "epoch": 0.5244167962674962, + "grad_norm": 37.902252197265625, + "learning_rate": 4.585062240663901e-06, + "logits/chosen": 2.5677266120910645, + "logits/rejected": 3.460753917694092, + "logps/chosen": -683.076904296875, + "logps/rejected": -910.3892211914062, + "loss": 0.7027, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.876817226409912, + "rewards/margins": 19.93701171875, + "rewards/rejected": -25.81382942199707, + "step": 843 + }, + { + "epoch": 0.5250388802488336, + "grad_norm": 30.997549057006836, + "learning_rate": 4.583909635776856e-06, + "logits/chosen": 1.2948592901229858, + "logits/rejected": 3.858691453933716, + "logps/chosen": -622.263671875, + "logps/rejected": -937.9315185546875, + "loss": 0.3479, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.006096839904785, + "rewards/margins": 20.229141235351562, + "rewards/rejected": -29.23523712158203, + "step": 844 + }, + { + "epoch": 0.5256609642301711, + "grad_norm": 29.12040138244629, + "learning_rate": 4.582757030889811e-06, + "logits/chosen": 1.0461492538452148, + "logits/rejected": 4.793246269226074, + "logps/chosen": -491.78057861328125, + "logps/rejected": -799.749755859375, + "loss": 0.769, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.587861061096191, + "rewards/margins": 10.82066822052002, + "rewards/rejected": -17.40852928161621, + "step": 845 + }, + { + "epoch": 0.5262830482115085, + "grad_norm": 0.0004020752676296979, + "learning_rate": 4.5816044260027665e-06, + "logits/chosen": -1.9583938121795654, + "logits/rejected": 2.0443408489227295, + "logps/chosen": -410.334228515625, + "logps/rejected": -881.2435913085938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.354313850402832, + "rewards/margins": 25.540056228637695, + "rewards/rejected": -31.89436912536621, + "step": 846 + }, + { + "epoch": 0.5269051321928461, + "grad_norm": 0.008207093924283981, + "learning_rate": 4.580451821115722e-06, + "logits/chosen": 0.43992137908935547, + "logits/rejected": 0.8132442235946655, + "logps/chosen": -625.908203125, + "logps/rejected": -858.2850341796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.035774230957031, + "rewards/margins": 20.796154022216797, + "rewards/rejected": -30.831926345825195, + "step": 847 + }, + { + "epoch": 0.5275272161741835, + "grad_norm": 1.043589691107627e-05, + "learning_rate": 4.579299216228677e-06, + "logits/chosen": -1.982966423034668, + "logits/rejected": 2.495979070663452, + "logps/chosen": -449.64886474609375, + "logps/rejected": -1031.390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.190682888031006, + "rewards/margins": 23.95623016357422, + "rewards/rejected": -30.14691162109375, + "step": 848 + }, + { + "epoch": 0.528149300155521, + "grad_norm": 10.80439281463623, + "learning_rate": 4.578146611341632e-06, + "logits/chosen": 2.5422017574310303, + "logits/rejected": 4.219742774963379, + "logps/chosen": -570.5355224609375, + "logps/rejected": -833.507568359375, + "loss": 0.062, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.7938432693481445, + "rewards/margins": 19.633541107177734, + "rewards/rejected": -26.427385330200195, + "step": 849 + }, + { + "epoch": 0.5287713841368584, + "grad_norm": 0.1870967447757721, + "learning_rate": 4.576994006454587e-06, + "logits/chosen": -2.3501226902008057, + "logits/rejected": 2.8784193992614746, + "logps/chosen": -357.7137451171875, + "logps/rejected": -834.7598876953125, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.8307647705078125, + "rewards/margins": 20.596311569213867, + "rewards/rejected": -28.42707633972168, + "step": 850 + }, + { + "epoch": 0.529393468118196, + "grad_norm": 8.725545883178711, + "learning_rate": 4.575841401567543e-06, + "logits/chosen": 1.067751407623291, + "logits/rejected": 2.8930211067199707, + "logps/chosen": -585.4593505859375, + "logps/rejected": -889.775390625, + "loss": 0.026, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.797390460968018, + "rewards/margins": 16.867427825927734, + "rewards/rejected": -23.664817810058594, + "step": 851 + }, + { + "epoch": 0.5300155520995334, + "grad_norm": 1.1312555074691772, + "learning_rate": 4.574688796680498e-06, + "logits/chosen": -0.3240584135055542, + "logits/rejected": 1.6172173023223877, + "logps/chosen": -376.5591735839844, + "logps/rejected": -594.09375, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.678043842315674, + "rewards/margins": 12.331840515136719, + "rewards/rejected": -18.009883880615234, + "step": 852 + }, + { + "epoch": 0.5306376360808709, + "grad_norm": 2.2278480529785156, + "learning_rate": 4.573536191793453e-06, + "logits/chosen": 0.935234546661377, + "logits/rejected": 3.973618507385254, + "logps/chosen": -482.78704833984375, + "logps/rejected": -930.9962158203125, + "loss": 0.0102, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.005084991455078, + "rewards/margins": 25.713951110839844, + "rewards/rejected": -33.71903610229492, + "step": 853 + }, + { + "epoch": 0.5312597200622085, + "grad_norm": 25.2884464263916, + "learning_rate": 4.572383586906409e-06, + "logits/chosen": -2.831954002380371, + "logits/rejected": 3.0417380332946777, + "logps/chosen": -374.65911865234375, + "logps/rejected": -950.072509765625, + "loss": 1.4801, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.000370025634766, + "rewards/margins": 23.205615997314453, + "rewards/rejected": -31.20598602294922, + "step": 854 + }, + { + "epoch": 0.5318818040435459, + "grad_norm": 0.008000146597623825, + "learning_rate": 4.571230982019364e-06, + "logits/chosen": 0.9278247356414795, + "logits/rejected": 4.047576427459717, + "logps/chosen": -548.40087890625, + "logps/rejected": -874.353271484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.476536750793457, + "rewards/margins": 16.574291229248047, + "rewards/rejected": -27.05082893371582, + "step": 855 + }, + { + "epoch": 0.5325038880248834, + "grad_norm": 10.3145751953125, + "learning_rate": 4.5700783771323196e-06, + "logits/chosen": -2.642561674118042, + "logits/rejected": 1.6923301219940186, + "logps/chosen": -393.0373229980469, + "logps/rejected": -858.5068359375, + "loss": 0.0415, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.599725723266602, + "rewards/margins": 19.435718536376953, + "rewards/rejected": -25.035446166992188, + "step": 856 + }, + { + "epoch": 0.5331259720062208, + "grad_norm": 0.030150998383760452, + "learning_rate": 4.568925772245275e-06, + "logits/chosen": 1.1801414489746094, + "logits/rejected": 3.758105993270874, + "logps/chosen": -503.06121826171875, + "logps/rejected": -868.5562744140625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.317437171936035, + "rewards/margins": 21.45519256591797, + "rewards/rejected": -29.77263069152832, + "step": 857 + }, + { + "epoch": 0.5337480559875584, + "grad_norm": 29.704832077026367, + "learning_rate": 4.56777316735823e-06, + "logits/chosen": -2.479341506958008, + "logits/rejected": 2.4226291179656982, + "logps/chosen": -396.0041809082031, + "logps/rejected": -965.6904296875, + "loss": 0.4235, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.2071075439453125, + "rewards/margins": 19.336469650268555, + "rewards/rejected": -25.5435791015625, + "step": 858 + }, + { + "epoch": 0.5343701399688958, + "grad_norm": 0.005256436299532652, + "learning_rate": 4.566620562471185e-06, + "logits/chosen": -1.4652228355407715, + "logits/rejected": 2.630767345428467, + "logps/chosen": -484.7992858886719, + "logps/rejected": -940.8848266601562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.632268905639648, + "rewards/margins": 24.89701271057129, + "rewards/rejected": -31.529281616210938, + "step": 859 + }, + { + "epoch": 0.5349922239502333, + "grad_norm": 0.4753815829753876, + "learning_rate": 4.5654679575841405e-06, + "logits/chosen": 0.8926655054092407, + "logits/rejected": 3.2669153213500977, + "logps/chosen": -473.52288818359375, + "logps/rejected": -825.9222412109375, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.783342361450195, + "rewards/margins": 21.152517318725586, + "rewards/rejected": -27.93585968017578, + "step": 860 + }, + { + "epoch": 0.5356143079315707, + "grad_norm": 33.34446716308594, + "learning_rate": 4.564315352697096e-06, + "logits/chosen": 0.9703787565231323, + "logits/rejected": 2.1039042472839355, + "logps/chosen": -631.0071411132812, + "logps/rejected": -800.5887451171875, + "loss": 0.3649, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.831226348876953, + "rewards/margins": 12.349831581115723, + "rewards/rejected": -22.181058883666992, + "step": 861 + }, + { + "epoch": 0.5362363919129083, + "grad_norm": 7.412981358356774e-05, + "learning_rate": 4.563162747810051e-06, + "logits/chosen": -0.35365569591522217, + "logits/rejected": 3.4597878456115723, + "logps/chosen": -567.4420166015625, + "logps/rejected": -991.6310424804688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.804872989654541, + "rewards/margins": 25.330028533935547, + "rewards/rejected": -32.13490295410156, + "step": 862 + }, + { + "epoch": 0.5368584758942457, + "grad_norm": 0.021897537633776665, + "learning_rate": 4.562010142923006e-06, + "logits/chosen": 0.643225908279419, + "logits/rejected": 4.038776397705078, + "logps/chosen": -504.0196533203125, + "logps/rejected": -991.962158203125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.733529090881348, + "rewards/margins": 24.76568603515625, + "rewards/rejected": -35.49921798706055, + "step": 863 + }, + { + "epoch": 0.5374805598755832, + "grad_norm": 0.4537505805492401, + "learning_rate": 4.560857538035961e-06, + "logits/chosen": 0.07119336724281311, + "logits/rejected": 2.72735333442688, + "logps/chosen": -516.2216796875, + "logps/rejected": -841.7863159179688, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.701822757720947, + "rewards/margins": 20.65270233154297, + "rewards/rejected": -26.35452651977539, + "step": 864 + }, + { + "epoch": 0.5381026438569206, + "grad_norm": 0.033131957054138184, + "learning_rate": 4.559704933148917e-06, + "logits/chosen": -1.2717825174331665, + "logits/rejected": 3.118110418319702, + "logps/chosen": -524.9382934570312, + "logps/rejected": -973.964111328125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.52232551574707, + "rewards/margins": 20.606990814208984, + "rewards/rejected": -28.129316329956055, + "step": 865 + }, + { + "epoch": 0.5387247278382582, + "grad_norm": 0.0002912423515226692, + "learning_rate": 4.558552328261872e-06, + "logits/chosen": -0.5772674083709717, + "logits/rejected": 2.762540102005005, + "logps/chosen": -496.74420166015625, + "logps/rejected": -960.775390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.3707194328308105, + "rewards/margins": 22.571828842163086, + "rewards/rejected": -29.942546844482422, + "step": 866 + }, + { + "epoch": 0.5393468118195957, + "grad_norm": 0.011110931634902954, + "learning_rate": 4.557399723374827e-06, + "logits/chosen": -0.25224435329437256, + "logits/rejected": 3.20752215385437, + "logps/chosen": -567.7034912109375, + "logps/rejected": -968.6920166015625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.4491548538208, + "rewards/margins": 24.463436126708984, + "rewards/rejected": -32.91259002685547, + "step": 867 + }, + { + "epoch": 0.5399688958009331, + "grad_norm": 18.80489730834961, + "learning_rate": 4.556247118487782e-06, + "logits/chosen": 0.04405069351196289, + "logits/rejected": 2.4502503871917725, + "logps/chosen": -551.693359375, + "logps/rejected": -857.9959106445312, + "loss": 0.1383, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.809330940246582, + "rewards/margins": 18.363243103027344, + "rewards/rejected": -28.172576904296875, + "step": 868 + }, + { + "epoch": 0.5405909797822706, + "grad_norm": 10.304757118225098, + "learning_rate": 4.555094513600738e-06, + "logits/chosen": 0.8795121908187866, + "logits/rejected": 3.748501777648926, + "logps/chosen": -553.6944580078125, + "logps/rejected": -883.4180297851562, + "loss": 0.0432, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.258725166320801, + "rewards/margins": 20.32225799560547, + "rewards/rejected": -25.580982208251953, + "step": 869 + }, + { + "epoch": 0.5412130637636081, + "grad_norm": 17.23099708557129, + "learning_rate": 4.5539419087136936e-06, + "logits/chosen": 0.2523691654205322, + "logits/rejected": 1.9475032091140747, + "logps/chosen": -538.177978515625, + "logps/rejected": -872.0485229492188, + "loss": 0.1189, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.795501708984375, + "rewards/margins": 19.7119140625, + "rewards/rejected": -28.507417678833008, + "step": 870 + }, + { + "epoch": 0.5418351477449456, + "grad_norm": 35.22981643676758, + "learning_rate": 4.552789303826649e-06, + "logits/chosen": 0.7181805372238159, + "logits/rejected": 3.5797152519226074, + "logps/chosen": -681.2504272460938, + "logps/rejected": -976.5474243164062, + "loss": 0.4289, + "rewards/accuracies": 0.875, + "rewards/chosen": -15.748141288757324, + "rewards/margins": 15.885237693786621, + "rewards/rejected": -31.633377075195312, + "step": 871 + }, + { + "epoch": 0.542457231726283, + "grad_norm": 19.412866592407227, + "learning_rate": 4.551636698939604e-06, + "logits/chosen": 0.8360533714294434, + "logits/rejected": 2.686750888824463, + "logps/chosen": -524.9096069335938, + "logps/rejected": -870.4130249023438, + "loss": 0.4212, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.245410919189453, + "rewards/margins": 18.76921272277832, + "rewards/rejected": -28.014623641967773, + "step": 872 + }, + { + "epoch": 0.5430793157076206, + "grad_norm": 31.095205307006836, + "learning_rate": 4.550484094052559e-06, + "logits/chosen": -3.711292266845703, + "logits/rejected": 1.670317530632019, + "logps/chosen": -318.65130615234375, + "logps/rejected": -842.1668090820312, + "loss": 0.5101, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.937169075012207, + "rewards/margins": 26.391700744628906, + "rewards/rejected": -31.32887077331543, + "step": 873 + }, + { + "epoch": 0.543701399688958, + "grad_norm": 0.007474643178284168, + "learning_rate": 4.5493314891655145e-06, + "logits/chosen": -1.8571935892105103, + "logits/rejected": 1.5715970993041992, + "logps/chosen": -418.4180908203125, + "logps/rejected": -763.9039306640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.32398796081543, + "rewards/margins": 18.20618438720703, + "rewards/rejected": -24.53017234802246, + "step": 874 + }, + { + "epoch": 0.5443234836702955, + "grad_norm": 8.715862274169922, + "learning_rate": 4.54817888427847e-06, + "logits/chosen": 1.0062412023544312, + "logits/rejected": 3.670095920562744, + "logps/chosen": -386.5008544921875, + "logps/rejected": -703.3475341796875, + "loss": 0.0449, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.710971832275391, + "rewards/margins": 16.32583236694336, + "rewards/rejected": -22.036806106567383, + "step": 875 + }, + { + "epoch": 0.5449455676516329, + "grad_norm": 36.20487976074219, + "learning_rate": 4.547026279391425e-06, + "logits/chosen": 0.13863390684127808, + "logits/rejected": 3.659630537033081, + "logps/chosen": -503.400390625, + "logps/rejected": -918.284423828125, + "loss": 0.2315, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.179703235626221, + "rewards/margins": 20.5958251953125, + "rewards/rejected": -27.77552604675293, + "step": 876 + }, + { + "epoch": 0.5455676516329705, + "grad_norm": 0.4360642731189728, + "learning_rate": 4.54587367450438e-06, + "logits/chosen": 2.4419362545013428, + "logits/rejected": 3.9887049198150635, + "logps/chosen": -642.28076171875, + "logps/rejected": -993.3703002929688, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.067706108093262, + "rewards/margins": 21.047924041748047, + "rewards/rejected": -29.11562728881836, + "step": 877 + }, + { + "epoch": 0.546189735614308, + "grad_norm": 2.6440608501434326, + "learning_rate": 4.544721069617335e-06, + "logits/chosen": -1.8029651641845703, + "logits/rejected": 2.106647491455078, + "logps/chosen": -499.1551208496094, + "logps/rejected": -1073.3516845703125, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.704458236694336, + "rewards/margins": 27.312103271484375, + "rewards/rejected": -38.016563415527344, + "step": 878 + }, + { + "epoch": 0.5468118195956454, + "grad_norm": 0.3573562800884247, + "learning_rate": 4.543568464730291e-06, + "logits/chosen": 1.9899303913116455, + "logits/rejected": 4.196540355682373, + "logps/chosen": -654.3473510742188, + "logps/rejected": -1023.14501953125, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.193310737609863, + "rewards/margins": 26.175338745117188, + "rewards/rejected": -35.36864471435547, + "step": 879 + }, + { + "epoch": 0.5474339035769828, + "grad_norm": 0.24273815751075745, + "learning_rate": 4.542415859843246e-06, + "logits/chosen": 0.9308842420578003, + "logits/rejected": 2.3023080825805664, + "logps/chosen": -582.0292358398438, + "logps/rejected": -899.563720703125, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.894572257995605, + "rewards/margins": 24.512340545654297, + "rewards/rejected": -33.40690994262695, + "step": 880 + }, + { + "epoch": 0.5480559875583204, + "grad_norm": 2.9039588298473973e-06, + "learning_rate": 4.541263254956201e-06, + "logits/chosen": -0.09958934783935547, + "logits/rejected": 2.7995901107788086, + "logps/chosen": -520.8726806640625, + "logps/rejected": -907.4647216796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.7143473625183105, + "rewards/margins": 24.487667083740234, + "rewards/rejected": -32.2020149230957, + "step": 881 + }, + { + "epoch": 0.5486780715396579, + "grad_norm": 25.56290626525879, + "learning_rate": 4.540110650069156e-06, + "logits/chosen": -0.7153788208961487, + "logits/rejected": 1.400726079940796, + "logps/chosen": -434.67169189453125, + "logps/rejected": -671.6514892578125, + "loss": 0.3784, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.010853290557861, + "rewards/margins": 11.52184009552002, + "rewards/rejected": -17.53269386291504, + "step": 882 + }, + { + "epoch": 0.5493001555209953, + "grad_norm": 0.8233245015144348, + "learning_rate": 4.538958045182112e-06, + "logits/chosen": 1.417142629623413, + "logits/rejected": 3.3376898765563965, + "logps/chosen": -614.8678588867188, + "logps/rejected": -820.2135009765625, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.703726291656494, + "rewards/margins": 20.157958984375, + "rewards/rejected": -27.861684799194336, + "step": 883 + }, + { + "epoch": 0.5499222395023328, + "grad_norm": 0.21891425549983978, + "learning_rate": 4.5378054402950676e-06, + "logits/chosen": 2.8602006435394287, + "logits/rejected": 4.124835968017578, + "logps/chosen": -712.83056640625, + "logps/rejected": -938.74169921875, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.705924987792969, + "rewards/margins": 22.996124267578125, + "rewards/rejected": -34.702049255371094, + "step": 884 + }, + { + "epoch": 0.5505443234836703, + "grad_norm": 0.023627132177352905, + "learning_rate": 4.536652835408023e-06, + "logits/chosen": -2.5472285747528076, + "logits/rejected": 1.7891316413879395, + "logps/chosen": -455.49041748046875, + "logps/rejected": -1010.7542114257812, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.745904922485352, + "rewards/margins": 27.36368179321289, + "rewards/rejected": -34.10958480834961, + "step": 885 + }, + { + "epoch": 0.5511664074650078, + "grad_norm": 0.01632552035152912, + "learning_rate": 4.535500230520978e-06, + "logits/chosen": -0.05308155715465546, + "logits/rejected": 3.6396572589874268, + "logps/chosen": -558.2744750976562, + "logps/rejected": -1079.80615234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.748810768127441, + "rewards/margins": 27.036746978759766, + "rewards/rejected": -36.785560607910156, + "step": 886 + }, + { + "epoch": 0.5517884914463452, + "grad_norm": 58.952552795410156, + "learning_rate": 4.534347625633933e-06, + "logits/chosen": 0.29872390627861023, + "logits/rejected": 1.8410981893539429, + "logps/chosen": -637.9015502929688, + "logps/rejected": -892.9439086914062, + "loss": 1.9597, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.743337631225586, + "rewards/margins": 15.554926872253418, + "rewards/rejected": -29.298263549804688, + "step": 887 + }, + { + "epoch": 0.5524105754276827, + "grad_norm": 0.8052808046340942, + "learning_rate": 4.5331950207468885e-06, + "logits/chosen": 1.1044509410858154, + "logits/rejected": 2.839838981628418, + "logps/chosen": -631.5152587890625, + "logps/rejected": -952.8838500976562, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.291111946105957, + "rewards/margins": 19.244768142700195, + "rewards/rejected": -32.53588104248047, + "step": 888 + }, + { + "epoch": 0.5530326594090202, + "grad_norm": 23.6992130279541, + "learning_rate": 4.532042415859844e-06, + "logits/chosen": -1.5598255395889282, + "logits/rejected": 3.7028071880340576, + "logps/chosen": -507.09930419921875, + "logps/rejected": -1020.5116577148438, + "loss": 0.2016, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.400575637817383, + "rewards/margins": 23.5273494720459, + "rewards/rejected": -33.92792510986328, + "step": 889 + }, + { + "epoch": 0.5536547433903577, + "grad_norm": 0.001044000033289194, + "learning_rate": 4.530889810972799e-06, + "logits/chosen": -0.5535763502120972, + "logits/rejected": 3.151967763900757, + "logps/chosen": -441.44842529296875, + "logps/rejected": -916.7725830078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.017629623413086, + "rewards/margins": 23.890779495239258, + "rewards/rejected": -27.90840721130371, + "step": 890 + }, + { + "epoch": 0.5542768273716951, + "grad_norm": 5.863152980804443, + "learning_rate": 4.529737206085754e-06, + "logits/chosen": 0.11987632513046265, + "logits/rejected": 0.8908869624137878, + "logps/chosen": -618.422607421875, + "logps/rejected": -864.134521484375, + "loss": 0.0251, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.305463790893555, + "rewards/margins": 19.35150146484375, + "rewards/rejected": -28.656967163085938, + "step": 891 + }, + { + "epoch": 0.5548989113530327, + "grad_norm": 0.014139095321297646, + "learning_rate": 4.528584601198709e-06, + "logits/chosen": 2.1482062339782715, + "logits/rejected": 4.1894755363464355, + "logps/chosen": -619.8270263671875, + "logps/rejected": -974.5040893554688, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.460693359375, + "rewards/margins": 23.501544952392578, + "rewards/rejected": -31.962238311767578, + "step": 892 + }, + { + "epoch": 0.5555209953343702, + "grad_norm": 46.96757888793945, + "learning_rate": 4.527431996311665e-06, + "logits/chosen": 0.5948436260223389, + "logits/rejected": 2.237551212310791, + "logps/chosen": -593.1738891601562, + "logps/rejected": -931.5321044921875, + "loss": 1.05, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.85053014755249, + "rewards/margins": 25.84325408935547, + "rewards/rejected": -33.693782806396484, + "step": 893 + }, + { + "epoch": 0.5561430793157076, + "grad_norm": 5.902840614318848, + "learning_rate": 4.52627939142462e-06, + "logits/chosen": 0.7165172100067139, + "logits/rejected": 2.4892489910125732, + "logps/chosen": -592.964111328125, + "logps/rejected": -840.6056518554688, + "loss": 0.0282, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.395855903625488, + "rewards/margins": 14.041229248046875, + "rewards/rejected": -23.43708610534668, + "step": 894 + }, + { + "epoch": 0.5567651632970451, + "grad_norm": 22.181793212890625, + "learning_rate": 4.525126786537575e-06, + "logits/chosen": 2.861790180206299, + "logits/rejected": 3.9013900756835938, + "logps/chosen": -650.225830078125, + "logps/rejected": -826.1898193359375, + "loss": 0.3234, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.422914505004883, + "rewards/margins": 14.123764038085938, + "rewards/rejected": -27.54667854309082, + "step": 895 + }, + { + "epoch": 0.5573872472783826, + "grad_norm": 2.8335178285487927e-05, + "learning_rate": 4.52397418165053e-06, + "logits/chosen": 1.190727949142456, + "logits/rejected": 2.2278220653533936, + "logps/chosen": -605.7880859375, + "logps/rejected": -1063.646240234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.507134437561035, + "rewards/margins": 31.69879150390625, + "rewards/rejected": -40.20592498779297, + "step": 896 + }, + { + "epoch": 0.5580093312597201, + "grad_norm": 0.004062708467245102, + "learning_rate": 4.5228215767634855e-06, + "logits/chosen": -0.09006160497665405, + "logits/rejected": 3.0125908851623535, + "logps/chosen": -552.53759765625, + "logps/rejected": -950.954833984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.288880348205566, + "rewards/margins": 22.93324851989746, + "rewards/rejected": -31.222129821777344, + "step": 897 + }, + { + "epoch": 0.5586314152410575, + "grad_norm": 0.052480533719062805, + "learning_rate": 4.5216689718764415e-06, + "logits/chosen": -0.5029276609420776, + "logits/rejected": 2.6806373596191406, + "logps/chosen": -414.865966796875, + "logps/rejected": -748.1179809570312, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.556739807128906, + "rewards/margins": 23.550384521484375, + "rewards/rejected": -28.107120513916016, + "step": 898 + }, + { + "epoch": 0.559253499222395, + "grad_norm": 0.0029522059485316277, + "learning_rate": 4.520516366989397e-06, + "logits/chosen": -1.5119998455047607, + "logits/rejected": 3.129838466644287, + "logps/chosen": -380.0979919433594, + "logps/rejected": -1028.6463623046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.766282081604004, + "rewards/margins": 34.25803756713867, + "rewards/rejected": -41.02431869506836, + "step": 899 + }, + { + "epoch": 0.5598755832037325, + "grad_norm": 0.31720152497291565, + "learning_rate": 4.519363762102352e-06, + "logits/chosen": -0.17320013046264648, + "logits/rejected": 3.0692298412323, + "logps/chosen": -400.371337890625, + "logps/rejected": -822.8155517578125, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.835075378417969, + "rewards/margins": 17.175716400146484, + "rewards/rejected": -25.010791778564453, + "step": 900 + }, + { + "epoch": 0.56049766718507, + "grad_norm": 2.5494184494018555, + "learning_rate": 4.518211157215307e-06, + "logits/chosen": -0.592503547668457, + "logits/rejected": 3.36773681640625, + "logps/chosen": -465.85687255859375, + "logps/rejected": -930.584228515625, + "loss": 0.0088, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.216737747192383, + "rewards/margins": 25.044021606445312, + "rewards/rejected": -30.260761260986328, + "step": 901 + }, + { + "epoch": 0.5611197511664074, + "grad_norm": 26.77128791809082, + "learning_rate": 4.5170585523282624e-06, + "logits/chosen": 0.34828922152519226, + "logits/rejected": 3.3961830139160156, + "logps/chosen": -468.8632507324219, + "logps/rejected": -877.02880859375, + "loss": 0.2851, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.608033180236816, + "rewards/margins": 16.834678649902344, + "rewards/rejected": -23.442712783813477, + "step": 902 + }, + { + "epoch": 0.5617418351477449, + "grad_norm": 0.055523090064525604, + "learning_rate": 4.515905947441218e-06, + "logits/chosen": -0.8115134835243225, + "logits/rejected": 4.621614933013916, + "logps/chosen": -442.95782470703125, + "logps/rejected": -994.3385620117188, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.80880880355835, + "rewards/margins": 25.47365951538086, + "rewards/rejected": -31.282470703125, + "step": 903 + }, + { + "epoch": 0.5623639191290825, + "grad_norm": 0.0003423531888984144, + "learning_rate": 4.514753342554173e-06, + "logits/chosen": -1.8804258108139038, + "logits/rejected": 3.207822799682617, + "logps/chosen": -318.6630859375, + "logps/rejected": -776.7474365234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.680161952972412, + "rewards/margins": 20.768556594848633, + "rewards/rejected": -26.448719024658203, + "step": 904 + }, + { + "epoch": 0.5629860031104199, + "grad_norm": 1.876424789428711, + "learning_rate": 4.513600737667128e-06, + "logits/chosen": 1.282327651977539, + "logits/rejected": 3.3914713859558105, + "logps/chosen": -579.7039794921875, + "logps/rejected": -843.2907104492188, + "loss": 0.0075, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.453951358795166, + "rewards/margins": 17.213043212890625, + "rewards/rejected": -23.666996002197266, + "step": 905 + }, + { + "epoch": 0.5636080870917574, + "grad_norm": 13.606136322021484, + "learning_rate": 4.512448132780083e-06, + "logits/chosen": 0.7030451893806458, + "logits/rejected": 2.2091081142425537, + "logps/chosen": -486.6806335449219, + "logps/rejected": -760.94482421875, + "loss": 0.0618, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.211647033691406, + "rewards/margins": 14.849355697631836, + "rewards/rejected": -22.061004638671875, + "step": 906 + }, + { + "epoch": 0.5642301710730949, + "grad_norm": 0.00018268085841555148, + "learning_rate": 4.5112955278930386e-06, + "logits/chosen": -1.1317505836486816, + "logits/rejected": 2.438883066177368, + "logps/chosen": -503.29754638671875, + "logps/rejected": -965.1803588867188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7387869358062744, + "rewards/margins": 24.79349708557129, + "rewards/rejected": -27.532285690307617, + "step": 907 + }, + { + "epoch": 0.5648522550544324, + "grad_norm": 3.6311252117156982, + "learning_rate": 4.510142923005994e-06, + "logits/chosen": 1.5224366188049316, + "logits/rejected": 4.215592861175537, + "logps/chosen": -709.0532836914062, + "logps/rejected": -996.6823120117188, + "loss": 0.0152, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.90211296081543, + "rewards/margins": 18.26665496826172, + "rewards/rejected": -29.16876792907715, + "step": 908 + }, + { + "epoch": 0.5654743390357698, + "grad_norm": 1.1938445568084717, + "learning_rate": 4.508990318118949e-06, + "logits/chosen": -1.4121729135513306, + "logits/rejected": 2.583425998687744, + "logps/chosen": -376.30084228515625, + "logps/rejected": -830.406982421875, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.8484039306640625, + "rewards/margins": 18.305500030517578, + "rewards/rejected": -23.15390396118164, + "step": 909 + }, + { + "epoch": 0.5660964230171073, + "grad_norm": 21.51590919494629, + "learning_rate": 4.507837713231904e-06, + "logits/chosen": 2.6975741386413574, + "logits/rejected": 4.030991077423096, + "logps/chosen": -666.0827026367188, + "logps/rejected": -952.5798950195312, + "loss": 0.3689, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.313325881958008, + "rewards/margins": 20.379703521728516, + "rewards/rejected": -26.693031311035156, + "step": 910 + }, + { + "epoch": 0.5667185069984448, + "grad_norm": 32.47568893432617, + "learning_rate": 4.5066851083448595e-06, + "logits/chosen": 0.2974998950958252, + "logits/rejected": 3.011427402496338, + "logps/chosen": -503.1514587402344, + "logps/rejected": -745.7210693359375, + "loss": 0.7872, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.5651936531066895, + "rewards/margins": 14.901344299316406, + "rewards/rejected": -22.466537475585938, + "step": 911 + }, + { + "epoch": 0.5673405909797823, + "grad_norm": 0.0001468830305384472, + "learning_rate": 4.5055325034578155e-06, + "logits/chosen": -0.25655102729797363, + "logits/rejected": 2.659013271331787, + "logps/chosen": -516.9340209960938, + "logps/rejected": -862.8931884765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.951726913452148, + "rewards/margins": 22.00909423828125, + "rewards/rejected": -27.960819244384766, + "step": 912 + }, + { + "epoch": 0.5679626749611197, + "grad_norm": 0.0001624006254132837, + "learning_rate": 4.504379898570771e-06, + "logits/chosen": 1.011976718902588, + "logits/rejected": 4.230520248413086, + "logps/chosen": -440.92352294921875, + "logps/rejected": -846.1481323242188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.438632965087891, + "rewards/margins": 25.73596954345703, + "rewards/rejected": -32.17460632324219, + "step": 913 + }, + { + "epoch": 0.5685847589424572, + "grad_norm": 0.8141604065895081, + "learning_rate": 4.503227293683726e-06, + "logits/chosen": -1.5450491905212402, + "logits/rejected": 4.107460975646973, + "logps/chosen": -444.59075927734375, + "logps/rejected": -966.14013671875, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.689481735229492, + "rewards/margins": 18.89098358154297, + "rewards/rejected": -25.580467224121094, + "step": 914 + }, + { + "epoch": 0.5692068429237948, + "grad_norm": 31.039724349975586, + "learning_rate": 4.502074688796681e-06, + "logits/chosen": 2.0466227531433105, + "logits/rejected": 2.263880491256714, + "logps/chosen": -701.8427734375, + "logps/rejected": -896.3662109375, + "loss": 0.6259, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.721805572509766, + "rewards/margins": 17.82135772705078, + "rewards/rejected": -30.54316520690918, + "step": 915 + }, + { + "epoch": 0.5698289269051322, + "grad_norm": 0.0480804368853569, + "learning_rate": 4.5009220839096364e-06, + "logits/chosen": 0.30877208709716797, + "logits/rejected": 3.4405734539031982, + "logps/chosen": -545.4224243164062, + "logps/rejected": -977.380615234375, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.624667167663574, + "rewards/margins": 28.29396629333496, + "rewards/rejected": -33.91863250732422, + "step": 916 + }, + { + "epoch": 0.5704510108864697, + "grad_norm": 2.4432003498077393, + "learning_rate": 4.499769479022592e-06, + "logits/chosen": -2.2052576541900635, + "logits/rejected": 2.1533780097961426, + "logps/chosen": -324.31756591796875, + "logps/rejected": -803.582275390625, + "loss": 0.0338, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.368386745452881, + "rewards/margins": 25.227500915527344, + "rewards/rejected": -32.595890045166016, + "step": 917 + }, + { + "epoch": 0.5710730948678071, + "grad_norm": 0.05066022649407387, + "learning_rate": 4.498616874135547e-06, + "logits/chosen": -2.10320782661438, + "logits/rejected": 0.7626610994338989, + "logps/chosen": -444.4925842285156, + "logps/rejected": -829.7848510742188, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.30866813659668, + "rewards/margins": 21.65294647216797, + "rewards/rejected": -29.961612701416016, + "step": 918 + }, + { + "epoch": 0.5716951788491447, + "grad_norm": 0.5566099882125854, + "learning_rate": 4.497464269248502e-06, + "logits/chosen": 1.7583600282669067, + "logits/rejected": 2.833300828933716, + "logps/chosen": -627.1986083984375, + "logps/rejected": -866.6060791015625, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.058523178100586, + "rewards/margins": 18.403121948242188, + "rewards/rejected": -27.461645126342773, + "step": 919 + }, + { + "epoch": 0.5723172628304821, + "grad_norm": 8.222851753234863, + "learning_rate": 4.496311664361457e-06, + "logits/chosen": 0.21099892258644104, + "logits/rejected": 2.976849317550659, + "logps/chosen": -485.7565002441406, + "logps/rejected": -840.64794921875, + "loss": 0.0327, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.408513069152832, + "rewards/margins": 13.19969367980957, + "rewards/rejected": -21.608205795288086, + "step": 920 + }, + { + "epoch": 0.5729393468118196, + "grad_norm": 1.9750475530599942e-06, + "learning_rate": 4.4951590594744126e-06, + "logits/chosen": 0.6594505310058594, + "logits/rejected": 4.408999443054199, + "logps/chosen": -609.9363403320312, + "logps/rejected": -1108.749755859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.744295120239258, + "rewards/margins": 28.605003356933594, + "rewards/rejected": -37.34929656982422, + "step": 921 + }, + { + "epoch": 0.573561430793157, + "grad_norm": 0.04551282152533531, + "learning_rate": 4.494006454587368e-06, + "logits/chosen": 1.736369252204895, + "logits/rejected": 4.076783180236816, + "logps/chosen": -623.5065307617188, + "logps/rejected": -960.9025268554688, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.853501319885254, + "rewards/margins": 20.565467834472656, + "rewards/rejected": -29.418968200683594, + "step": 922 + }, + { + "epoch": 0.5741835147744946, + "grad_norm": 0.6607025265693665, + "learning_rate": 4.492853849700323e-06, + "logits/chosen": 1.1568937301635742, + "logits/rejected": 4.127242088317871, + "logps/chosen": -621.6375732421875, + "logps/rejected": -945.704345703125, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.65005874633789, + "rewards/margins": 21.218669891357422, + "rewards/rejected": -31.868732452392578, + "step": 923 + }, + { + "epoch": 0.574805598755832, + "grad_norm": 0.00010709751950344071, + "learning_rate": 4.491701244813278e-06, + "logits/chosen": 1.5515047311782837, + "logits/rejected": 3.1574530601501465, + "logps/chosen": -653.9324340820312, + "logps/rejected": -937.08984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.508325576782227, + "rewards/margins": 21.496139526367188, + "rewards/rejected": -33.00446701049805, + "step": 924 + }, + { + "epoch": 0.5754276827371695, + "grad_norm": 7.271386623382568, + "learning_rate": 4.4905486399262335e-06, + "logits/chosen": 0.06512796878814697, + "logits/rejected": 4.788290023803711, + "logps/chosen": -309.8177185058594, + "logps/rejected": -841.85546875, + "loss": 0.0286, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.533644199371338, + "rewards/margins": 25.78881072998047, + "rewards/rejected": -31.32245635986328, + "step": 925 + }, + { + "epoch": 0.576049766718507, + "grad_norm": 1.5662002563476562, + "learning_rate": 4.489396035039189e-06, + "logits/chosen": -0.054076001048088074, + "logits/rejected": 1.7035764455795288, + "logps/chosen": -501.0464782714844, + "logps/rejected": -795.3419189453125, + "loss": 0.0071, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.691514015197754, + "rewards/margins": 20.460548400878906, + "rewards/rejected": -27.152061462402344, + "step": 926 + }, + { + "epoch": 0.5766718506998445, + "grad_norm": 3.0245060770539567e-05, + "learning_rate": 4.488243430152145e-06, + "logits/chosen": 1.1634256839752197, + "logits/rejected": 3.546384334564209, + "logps/chosen": -582.4814453125, + "logps/rejected": -1055.0667724609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.61699104309082, + "rewards/margins": 27.090225219726562, + "rewards/rejected": -38.70721435546875, + "step": 927 + }, + { + "epoch": 0.577293934681182, + "grad_norm": 0.00030546420020982623, + "learning_rate": 4.4870908252651e-06, + "logits/chosen": 0.3939926028251648, + "logits/rejected": 1.8541518449783325, + "logps/chosen": -529.6817016601562, + "logps/rejected": -912.8052368164062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.74662971496582, + "rewards/margins": 26.91443634033203, + "rewards/rejected": -35.661067962646484, + "step": 928 + }, + { + "epoch": 0.5779160186625194, + "grad_norm": 5.225065251579508e-05, + "learning_rate": 4.485938220378055e-06, + "logits/chosen": 1.2161568403244019, + "logits/rejected": 2.1696629524230957, + "logps/chosen": -686.54931640625, + "logps/rejected": -1048.06396484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.813570022583008, + "rewards/margins": 29.099994659423828, + "rewards/rejected": -39.91356658935547, + "step": 929 + }, + { + "epoch": 0.578538102643857, + "grad_norm": 3.6309118270874023, + "learning_rate": 4.4847856154910104e-06, + "logits/chosen": 1.50335693359375, + "logits/rejected": 3.812343120574951, + "logps/chosen": -531.6221923828125, + "logps/rejected": -818.347900390625, + "loss": 0.018, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.524703979492188, + "rewards/margins": 18.538118362426758, + "rewards/rejected": -29.062822341918945, + "step": 930 + }, + { + "epoch": 0.5791601866251944, + "grad_norm": 0.5539863705635071, + "learning_rate": 4.483633010603966e-06, + "logits/chosen": -2.3044533729553223, + "logits/rejected": 1.220093011856079, + "logps/chosen": -345.4551696777344, + "logps/rejected": -790.6837158203125, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.869375228881836, + "rewards/margins": 20.257827758789062, + "rewards/rejected": -25.12720489501953, + "step": 931 + }, + { + "epoch": 0.5797822706065319, + "grad_norm": 0.11792272329330444, + "learning_rate": 4.482480405716921e-06, + "logits/chosen": -1.1023590564727783, + "logits/rejected": 3.9352211952209473, + "logps/chosen": -422.75665283203125, + "logps/rejected": -827.6967163085938, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.071016788482666, + "rewards/margins": 17.74365997314453, + "rewards/rejected": -21.814678192138672, + "step": 932 + }, + { + "epoch": 0.5804043545878693, + "grad_norm": 17.893983840942383, + "learning_rate": 4.481327800829876e-06, + "logits/chosen": 0.9140981435775757, + "logits/rejected": 3.72688627243042, + "logps/chosen": -569.3828125, + "logps/rejected": -979.8197021484375, + "loss": 0.1286, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.508774757385254, + "rewards/margins": 22.191757202148438, + "rewards/rejected": -32.700531005859375, + "step": 933 + }, + { + "epoch": 0.5810264385692069, + "grad_norm": 0.005547088570892811, + "learning_rate": 4.480175195942831e-06, + "logits/chosen": 1.4913636445999146, + "logits/rejected": 4.0409159660339355, + "logps/chosen": -654.9049682617188, + "logps/rejected": -945.757568359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.859716415405273, + "rewards/margins": 20.024673461914062, + "rewards/rejected": -29.884387969970703, + "step": 934 + }, + { + "epoch": 0.5816485225505443, + "grad_norm": 37.471954345703125, + "learning_rate": 4.4790225910557866e-06, + "logits/chosen": -1.901386022567749, + "logits/rejected": 2.022355318069458, + "logps/chosen": -464.411376953125, + "logps/rejected": -863.151123046875, + "loss": 0.4578, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.3386812210083, + "rewards/margins": 14.769096374511719, + "rewards/rejected": -25.107778549194336, + "step": 935 + }, + { + "epoch": 0.5822706065318818, + "grad_norm": 0.7605171203613281, + "learning_rate": 4.477869986168742e-06, + "logits/chosen": 0.5889070630073547, + "logits/rejected": 2.96616792678833, + "logps/chosen": -525.977783203125, + "logps/rejected": -948.6707153320312, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.921030044555664, + "rewards/margins": 24.97126579284668, + "rewards/rejected": -34.89229965209961, + "step": 936 + }, + { + "epoch": 0.5828926905132192, + "grad_norm": 0.043847814202308655, + "learning_rate": 4.476717381281697e-06, + "logits/chosen": 4.610742568969727, + "logits/rejected": 5.402982234954834, + "logps/chosen": -703.4801025390625, + "logps/rejected": -1112.8795166015625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.42116928100586, + "rewards/margins": 26.29779815673828, + "rewards/rejected": -37.718971252441406, + "step": 937 + }, + { + "epoch": 0.5835147744945568, + "grad_norm": 0.0021270744036883116, + "learning_rate": 4.475564776394652e-06, + "logits/chosen": -2.914405345916748, + "logits/rejected": 3.734607219696045, + "logps/chosen": -344.076416015625, + "logps/rejected": -967.015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.8051252365112305, + "rewards/margins": 28.18408203125, + "rewards/rejected": -34.98920440673828, + "step": 938 + }, + { + "epoch": 0.5841368584758942, + "grad_norm": 14.73412036895752, + "learning_rate": 4.4744121715076075e-06, + "logits/chosen": -0.1586018055677414, + "logits/rejected": 2.3905863761901855, + "logps/chosen": -584.124755859375, + "logps/rejected": -859.9805297851562, + "loss": 0.0357, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.387353897094727, + "rewards/margins": 16.92449378967285, + "rewards/rejected": -30.311847686767578, + "step": 939 + }, + { + "epoch": 0.5847589424572317, + "grad_norm": 2.371389150619507, + "learning_rate": 4.473259566620563e-06, + "logits/chosen": -0.3750753402709961, + "logits/rejected": 2.5787787437438965, + "logps/chosen": -538.4427490234375, + "logps/rejected": -924.25341796875, + "loss": 0.0243, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.576361656188965, + "rewards/margins": 19.905107498168945, + "rewards/rejected": -28.481468200683594, + "step": 940 + }, + { + "epoch": 0.5853810264385692, + "grad_norm": 91.64904022216797, + "learning_rate": 4.472106961733518e-06, + "logits/chosen": 0.6789902448654175, + "logits/rejected": 2.3577969074249268, + "logps/chosen": -772.6759033203125, + "logps/rejected": -923.3992919921875, + "loss": 2.4035, + "rewards/accuracies": 0.875, + "rewards/chosen": -23.004234313964844, + "rewards/margins": 11.870817184448242, + "rewards/rejected": -34.87505340576172, + "step": 941 + }, + { + "epoch": 0.5860031104199067, + "grad_norm": 0.0015563821652904153, + "learning_rate": 4.470954356846474e-06, + "logits/chosen": -0.7656075954437256, + "logits/rejected": 1.6892644166946411, + "logps/chosen": -569.5089111328125, + "logps/rejected": -983.2672119140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.503901481628418, + "rewards/margins": 24.433698654174805, + "rewards/rejected": -37.937599182128906, + "step": 942 + }, + { + "epoch": 0.5866251944012442, + "grad_norm": 0.0011474979110062122, + "learning_rate": 4.469801751959429e-06, + "logits/chosen": 0.09146726131439209, + "logits/rejected": 2.295836925506592, + "logps/chosen": -590.6580200195312, + "logps/rejected": -893.857666015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.355199813842773, + "rewards/margins": 19.084501266479492, + "rewards/rejected": -33.439701080322266, + "step": 943 + }, + { + "epoch": 0.5872472783825816, + "grad_norm": 30.391260147094727, + "learning_rate": 4.4686491470723844e-06, + "logits/chosen": -2.1463727951049805, + "logits/rejected": 0.11997640132904053, + "logps/chosen": -468.9334411621094, + "logps/rejected": -767.435302734375, + "loss": 0.3284, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.860607147216797, + "rewards/margins": 21.03769302368164, + "rewards/rejected": -26.898300170898438, + "step": 944 + }, + { + "epoch": 0.5878693623639192, + "grad_norm": 0.011637120507657528, + "learning_rate": 4.46749654218534e-06, + "logits/chosen": -2.7303953170776367, + "logits/rejected": 2.3290488719940186, + "logps/chosen": -414.9637451171875, + "logps/rejected": -1065.450439453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.247337341308594, + "rewards/margins": 32.224693298339844, + "rewards/rejected": -41.47202682495117, + "step": 945 + }, + { + "epoch": 0.5884914463452566, + "grad_norm": 1.5746371746063232, + "learning_rate": 4.466343937298295e-06, + "logits/chosen": -0.009310126304626465, + "logits/rejected": 1.9677188396453857, + "logps/chosen": -400.695068359375, + "logps/rejected": -643.286865234375, + "loss": 0.0093, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.75554084777832, + "rewards/margins": 15.892524719238281, + "rewards/rejected": -21.6480655670166, + "step": 946 + }, + { + "epoch": 0.5891135303265941, + "grad_norm": 0.0009989278623834252, + "learning_rate": 4.46519133241125e-06, + "logits/chosen": 0.7627098560333252, + "logits/rejected": 3.4856858253479004, + "logps/chosen": -511.3065185546875, + "logps/rejected": -910.0632934570312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.348669052124023, + "rewards/margins": 23.043886184692383, + "rewards/rejected": -31.392555236816406, + "step": 947 + }, + { + "epoch": 0.5897356143079315, + "grad_norm": 0.08712099492549896, + "learning_rate": 4.464038727524205e-06, + "logits/chosen": -1.3138643503189087, + "logits/rejected": 4.858949184417725, + "logps/chosen": -408.7076110839844, + "logps/rejected": -1118.2198486328125, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.463565826416016, + "rewards/margins": 32.61619567871094, + "rewards/rejected": -39.07975769042969, + "step": 948 + }, + { + "epoch": 0.5903576982892691, + "grad_norm": 21.967683792114258, + "learning_rate": 4.4628861226371606e-06, + "logits/chosen": -0.37556570768356323, + "logits/rejected": 2.6165435314178467, + "logps/chosen": -548.4478759765625, + "logps/rejected": -884.171142578125, + "loss": 0.1892, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.981535911560059, + "rewards/margins": 18.82710838317871, + "rewards/rejected": -27.80864715576172, + "step": 949 + }, + { + "epoch": 0.5909797822706065, + "grad_norm": 6.485556241386803e-06, + "learning_rate": 4.461733517750116e-06, + "logits/chosen": 0.7720064520835876, + "logits/rejected": 4.895740509033203, + "logps/chosen": -619.361328125, + "logps/rejected": -1186.765869140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.193835258483887, + "rewards/margins": 32.08253860473633, + "rewards/rejected": -45.276371002197266, + "step": 950 + }, + { + "epoch": 0.591601866251944, + "grad_norm": 0.9476038813591003, + "learning_rate": 4.460580912863071e-06, + "logits/chosen": -0.9694265127182007, + "logits/rejected": 2.774254322052002, + "logps/chosen": -434.80706787109375, + "logps/rejected": -810.2960205078125, + "loss": 0.0069, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.517581462860107, + "rewards/margins": 19.960872650146484, + "rewards/rejected": -27.478456497192383, + "step": 951 + }, + { + "epoch": 0.5922239502332814, + "grad_norm": 0.024504758417606354, + "learning_rate": 4.459428307976026e-06, + "logits/chosen": 3.0849127769470215, + "logits/rejected": 3.723421096801758, + "logps/chosen": -814.4183959960938, + "logps/rejected": -1017.4376220703125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.637451171875, + "rewards/margins": 18.967361450195312, + "rewards/rejected": -33.60481262207031, + "step": 952 + }, + { + "epoch": 0.592846034214619, + "grad_norm": 0.004705091007053852, + "learning_rate": 4.4582757030889815e-06, + "logits/chosen": 1.4308946132659912, + "logits/rejected": 2.767594814300537, + "logps/chosen": -657.2290649414062, + "logps/rejected": -1088.5958251953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.916187286376953, + "rewards/margins": 31.49768829345703, + "rewards/rejected": -41.413875579833984, + "step": 953 + }, + { + "epoch": 0.5934681181959565, + "grad_norm": 28.829410552978516, + "learning_rate": 4.457123098201937e-06, + "logits/chosen": 1.4665814638137817, + "logits/rejected": 3.462592363357544, + "logps/chosen": -622.665771484375, + "logps/rejected": -946.5082397460938, + "loss": 1.3582, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.926738739013672, + "rewards/margins": 17.527084350585938, + "rewards/rejected": -27.45382308959961, + "step": 954 + }, + { + "epoch": 0.5940902021772939, + "grad_norm": 29.489173889160156, + "learning_rate": 4.455970493314892e-06, + "logits/chosen": -1.9026505947113037, + "logits/rejected": 2.029543399810791, + "logps/chosen": -472.3614501953125, + "logps/rejected": -930.0457763671875, + "loss": 1.0627, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.602188587188721, + "rewards/margins": 23.613767623901367, + "rewards/rejected": -31.215957641601562, + "step": 955 + }, + { + "epoch": 0.5947122861586314, + "grad_norm": 0.16023872792720795, + "learning_rate": 4.454817888427848e-06, + "logits/chosen": 1.162771463394165, + "logits/rejected": 4.04050874710083, + "logps/chosen": -575.5117797851562, + "logps/rejected": -988.9061279296875, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.193204879760742, + "rewards/margins": 24.5118408203125, + "rewards/rejected": -34.705047607421875, + "step": 956 + }, + { + "epoch": 0.5953343701399689, + "grad_norm": 0.026538310572504997, + "learning_rate": 4.453665283540803e-06, + "logits/chosen": -0.10087063908576965, + "logits/rejected": 5.041918754577637, + "logps/chosen": -416.1558532714844, + "logps/rejected": -1082.86669921875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.628523349761963, + "rewards/margins": 30.532407760620117, + "rewards/rejected": -38.16093063354492, + "step": 957 + }, + { + "epoch": 0.5959564541213064, + "grad_norm": 37.81728744506836, + "learning_rate": 4.4525126786537576e-06, + "logits/chosen": 1.0219215154647827, + "logits/rejected": 0.9904434680938721, + "logps/chosen": -560.9853515625, + "logps/rejected": -754.2684936523438, + "loss": 0.3818, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.566256523132324, + "rewards/margins": 15.562918663024902, + "rewards/rejected": -28.129173278808594, + "step": 958 + }, + { + "epoch": 0.5965785381026438, + "grad_norm": 0.3736514151096344, + "learning_rate": 4.451360073766713e-06, + "logits/chosen": -0.7447368502616882, + "logits/rejected": 2.2910654544830322, + "logps/chosen": -463.67401123046875, + "logps/rejected": -965.9398193359375, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.4921293258667, + "rewards/margins": 22.65591049194336, + "rewards/rejected": -33.14804458618164, + "step": 959 + }, + { + "epoch": 0.5972006220839814, + "grad_norm": 0.0048063406720757484, + "learning_rate": 4.450207468879668e-06, + "logits/chosen": -0.7846198081970215, + "logits/rejected": 2.181448459625244, + "logps/chosen": -573.2274169921875, + "logps/rejected": -1039.460205078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.714447021484375, + "rewards/margins": 28.0849666595459, + "rewards/rejected": -38.799415588378906, + "step": 960 + }, + { + "epoch": 0.5978227060653188, + "grad_norm": 3.337085008621216, + "learning_rate": 4.449054863992623e-06, + "logits/chosen": 0.2808011770248413, + "logits/rejected": 4.691339492797852, + "logps/chosen": -572.583740234375, + "logps/rejected": -1046.2052001953125, + "loss": 0.016, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.704526901245117, + "rewards/margins": 25.15294647216797, + "rewards/rejected": -35.85747528076172, + "step": 961 + }, + { + "epoch": 0.5984447900466563, + "grad_norm": 0.00023763404169585556, + "learning_rate": 4.4479022591055785e-06, + "logits/chosen": -0.1722264289855957, + "logits/rejected": 3.0525989532470703, + "logps/chosen": -494.31640625, + "logps/rejected": -920.9412841796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.198963165283203, + "rewards/margins": 23.290542602539062, + "rewards/rejected": -32.489505767822266, + "step": 962 + }, + { + "epoch": 0.5990668740279937, + "grad_norm": 0.0672890692949295, + "learning_rate": 4.446749654218534e-06, + "logits/chosen": 0.6471335887908936, + "logits/rejected": 3.0925450325012207, + "logps/chosen": -591.6119384765625, + "logps/rejected": -936.7149658203125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.652125358581543, + "rewards/margins": 22.490100860595703, + "rewards/rejected": -35.14222717285156, + "step": 963 + }, + { + "epoch": 0.5996889580093313, + "grad_norm": 0.0020569032058119774, + "learning_rate": 4.445597049331489e-06, + "logits/chosen": -0.9469920992851257, + "logits/rejected": 3.857593297958374, + "logps/chosen": -392.8443298339844, + "logps/rejected": -893.7676391601562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.6447930335998535, + "rewards/margins": 26.24173355102539, + "rewards/rejected": -33.88652420043945, + "step": 964 + }, + { + "epoch": 0.6003110419906688, + "grad_norm": 0.0019404878839850426, + "learning_rate": 4.444444444444444e-06, + "logits/chosen": 3.261852264404297, + "logits/rejected": 4.384195804595947, + "logps/chosen": -611.6591186523438, + "logps/rejected": -969.6968994140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.406609058380127, + "rewards/margins": 24.724668502807617, + "rewards/rejected": -32.13127899169922, + "step": 965 + }, + { + "epoch": 0.6009331259720062, + "grad_norm": 0.06938864290714264, + "learning_rate": 4.4432918395574e-06, + "logits/chosen": 1.4138062000274658, + "logits/rejected": 2.655439853668213, + "logps/chosen": -639.2680053710938, + "logps/rejected": -933.8479614257812, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.837974548339844, + "rewards/margins": 23.26096534729004, + "rewards/rejected": -33.09893798828125, + "step": 966 + }, + { + "epoch": 0.6015552099533437, + "grad_norm": 3.7771530151367188, + "learning_rate": 4.4421392346703554e-06, + "logits/chosen": -1.438300371170044, + "logits/rejected": 2.7318453788757324, + "logps/chosen": -392.73748779296875, + "logps/rejected": -839.9110107421875, + "loss": 0.0258, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.509245872497559, + "rewards/margins": 22.722030639648438, + "rewards/rejected": -29.231277465820312, + "step": 967 + }, + { + "epoch": 0.6021772939346812, + "grad_norm": 1.8222912549972534, + "learning_rate": 4.440986629783311e-06, + "logits/chosen": -2.3222408294677734, + "logits/rejected": 2.861205816268921, + "logps/chosen": -411.26904296875, + "logps/rejected": -1023.5480346679688, + "loss": 0.0082, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.547379493713379, + "rewards/margins": 29.81708526611328, + "rewards/rejected": -34.364463806152344, + "step": 968 + }, + { + "epoch": 0.6027993779160187, + "grad_norm": 0.17347721755504608, + "learning_rate": 4.439834024896266e-06, + "logits/chosen": 1.14876127243042, + "logits/rejected": 3.9284627437591553, + "logps/chosen": -526.479248046875, + "logps/rejected": -783.3134765625, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.16906452178955, + "rewards/margins": 16.533124923706055, + "rewards/rejected": -25.702190399169922, + "step": 969 + }, + { + "epoch": 0.6034214618973561, + "grad_norm": 0.12882846593856812, + "learning_rate": 4.438681420009221e-06, + "logits/chosen": 1.3886802196502686, + "logits/rejected": 3.3670883178710938, + "logps/chosen": -646.9923095703125, + "logps/rejected": -995.0706176757812, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.358152389526367, + "rewards/margins": 25.910850524902344, + "rewards/rejected": -38.269004821777344, + "step": 970 + }, + { + "epoch": 0.6040435458786936, + "grad_norm": 2.1180777549743652, + "learning_rate": 4.437528815122176e-06, + "logits/chosen": -1.9676039218902588, + "logits/rejected": 1.6668283939361572, + "logps/chosen": -396.4910583496094, + "logps/rejected": -773.7052001953125, + "loss": 0.0077, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.505646705627441, + "rewards/margins": 22.966257095336914, + "rewards/rejected": -29.471904754638672, + "step": 971 + }, + { + "epoch": 0.6046656298600311, + "grad_norm": 0.35996007919311523, + "learning_rate": 4.4363762102351316e-06, + "logits/chosen": -1.3434209823608398, + "logits/rejected": 3.100332260131836, + "logps/chosen": -553.2901000976562, + "logps/rejected": -1007.8410034179688, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.710760593414307, + "rewards/margins": 26.766399383544922, + "rewards/rejected": -33.47716522216797, + "step": 972 + }, + { + "epoch": 0.6052877138413686, + "grad_norm": 0.1532444953918457, + "learning_rate": 4.435223605348087e-06, + "logits/chosen": -1.289564847946167, + "logits/rejected": 2.1508238315582275, + "logps/chosen": -451.6590881347656, + "logps/rejected": -913.523193359375, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.025468349456787, + "rewards/margins": 29.21463394165039, + "rewards/rejected": -36.2401008605957, + "step": 973 + }, + { + "epoch": 0.605909797822706, + "grad_norm": 0.03995480760931969, + "learning_rate": 4.434071000461042e-06, + "logits/chosen": -0.7090196013450623, + "logits/rejected": 3.135948419570923, + "logps/chosen": -515.958740234375, + "logps/rejected": -1091.345947265625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.616611957550049, + "rewards/margins": 27.977787017822266, + "rewards/rejected": -35.594398498535156, + "step": 974 + }, + { + "epoch": 0.6065318818040435, + "grad_norm": 0.0549919418990612, + "learning_rate": 4.432918395573997e-06, + "logits/chosen": -3.5261151790618896, + "logits/rejected": 2.896216869354248, + "logps/chosen": -440.1463623046875, + "logps/rejected": -1085.7508544921875, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.423600673675537, + "rewards/margins": 23.392396926879883, + "rewards/rejected": -30.815998077392578, + "step": 975 + }, + { + "epoch": 0.6071539657853811, + "grad_norm": 2.218489044025773e-06, + "learning_rate": 4.4317657906869525e-06, + "logits/chosen": 1.2783446311950684, + "logits/rejected": 4.227397918701172, + "logps/chosen": -630.39892578125, + "logps/rejected": -1058.150146484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.845111846923828, + "rewards/margins": 26.44537353515625, + "rewards/rejected": -37.29048538208008, + "step": 976 + }, + { + "epoch": 0.6077760497667185, + "grad_norm": 8.354219608008862e-05, + "learning_rate": 4.430613185799908e-06, + "logits/chosen": 1.6482150554656982, + "logits/rejected": 4.484062671661377, + "logps/chosen": -564.74169921875, + "logps/rejected": -963.2635498046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.121161460876465, + "rewards/margins": 20.304988861083984, + "rewards/rejected": -29.426151275634766, + "step": 977 + }, + { + "epoch": 0.608398133748056, + "grad_norm": 4.307034015655518, + "learning_rate": 4.429460580912863e-06, + "logits/chosen": 2.6032145023345947, + "logits/rejected": 4.085454940795898, + "logps/chosen": -634.4119873046875, + "logps/rejected": -1028.705810546875, + "loss": 0.0259, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.905746459960938, + "rewards/margins": 25.430469512939453, + "rewards/rejected": -39.336219787597656, + "step": 978 + }, + { + "epoch": 0.6090202177293935, + "grad_norm": 0.4238090217113495, + "learning_rate": 4.428307976025818e-06, + "logits/chosen": -3.106131076812744, + "logits/rejected": 2.2020976543426514, + "logps/chosen": -386.2730712890625, + "logps/rejected": -872.7025146484375, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.160600662231445, + "rewards/margins": 20.310396194458008, + "rewards/rejected": -28.47100067138672, + "step": 979 + }, + { + "epoch": 0.609642301710731, + "grad_norm": 26.356096267700195, + "learning_rate": 4.427155371138774e-06, + "logits/chosen": -2.559213399887085, + "logits/rejected": 1.2290418148040771, + "logps/chosen": -407.3168029785156, + "logps/rejected": -864.3316650390625, + "loss": 0.1682, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.493479251861572, + "rewards/margins": 27.14444923400879, + "rewards/rejected": -34.63793182373047, + "step": 980 + }, + { + "epoch": 0.6102643856920684, + "grad_norm": 22.799148559570312, + "learning_rate": 4.4260027662517294e-06, + "logits/chosen": 2.6630711555480957, + "logits/rejected": 5.5165886878967285, + "logps/chosen": -714.7571411132812, + "logps/rejected": -1119.52490234375, + "loss": 0.1418, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.795547485351562, + "rewards/margins": 20.45812225341797, + "rewards/rejected": -34.25366973876953, + "step": 981 + }, + { + "epoch": 0.6108864696734059, + "grad_norm": 3.947579685359415e-08, + "learning_rate": 4.424850161364685e-06, + "logits/chosen": -2.429842710494995, + "logits/rejected": 3.5816140174865723, + "logps/chosen": -479.0736083984375, + "logps/rejected": -1232.229248046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.675029754638672, + "rewards/margins": 39.637107849121094, + "rewards/rejected": -47.31214141845703, + "step": 982 + }, + { + "epoch": 0.6115085536547434, + "grad_norm": 0.040997885167598724, + "learning_rate": 4.42369755647764e-06, + "logits/chosen": -1.8641680479049683, + "logits/rejected": 0.8546357154846191, + "logps/chosen": -338.0428466796875, + "logps/rejected": -715.9549560546875, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.433816432952881, + "rewards/margins": 18.383935928344727, + "rewards/rejected": -25.8177547454834, + "step": 983 + }, + { + "epoch": 0.6121306376360809, + "grad_norm": 0.002463550539687276, + "learning_rate": 4.422544951590595e-06, + "logits/chosen": -1.1334680318832397, + "logits/rejected": 2.1662166118621826, + "logps/chosen": -500.00653076171875, + "logps/rejected": -1011.924560546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.910249710083008, + "rewards/margins": 30.985679626464844, + "rewards/rejected": -37.895931243896484, + "step": 984 + }, + { + "epoch": 0.6127527216174183, + "grad_norm": 5.114550590515137, + "learning_rate": 4.42139234670355e-06, + "logits/chosen": 0.6392805576324463, + "logits/rejected": 2.9587857723236084, + "logps/chosen": -700.7958374023438, + "logps/rejected": -1121.68115234375, + "loss": 0.02, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.589431762695312, + "rewards/margins": 27.82578468322754, + "rewards/rejected": -43.41521453857422, + "step": 985 + }, + { + "epoch": 0.6133748055987558, + "grad_norm": 0.00019221102411393076, + "learning_rate": 4.4202397418165056e-06, + "logits/chosen": 0.7990036010742188, + "logits/rejected": 4.720720291137695, + "logps/chosen": -450.42877197265625, + "logps/rejected": -1025.32861328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.870571136474609, + "rewards/margins": 35.376182556152344, + "rewards/rejected": -42.24674987792969, + "step": 986 + }, + { + "epoch": 0.6139968895800934, + "grad_norm": 0.0023970867041498423, + "learning_rate": 4.419087136929461e-06, + "logits/chosen": -0.657366931438446, + "logits/rejected": 2.340158462524414, + "logps/chosen": -488.14508056640625, + "logps/rejected": -866.5415649414062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.836606979370117, + "rewards/margins": 24.216018676757812, + "rewards/rejected": -35.0526237487793, + "step": 987 + }, + { + "epoch": 0.6146189735614308, + "grad_norm": 6.9399729909491725e-06, + "learning_rate": 4.417934532042416e-06, + "logits/chosen": -0.09824991226196289, + "logits/rejected": 2.612088203430176, + "logps/chosen": -665.5135498046875, + "logps/rejected": -1138.546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.184950828552246, + "rewards/margins": 32.675193786621094, + "rewards/rejected": -44.86014175415039, + "step": 988 + }, + { + "epoch": 0.6152410575427683, + "grad_norm": 3.5185718536376953, + "learning_rate": 4.416781927155371e-06, + "logits/chosen": 0.624005913734436, + "logits/rejected": 3.345196485519409, + "logps/chosen": -449.0107116699219, + "logps/rejected": -871.7337036132812, + "loss": 0.012, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.905702590942383, + "rewards/margins": 23.94113540649414, + "rewards/rejected": -32.84683609008789, + "step": 989 + }, + { + "epoch": 0.6158631415241057, + "grad_norm": 21.973644256591797, + "learning_rate": 4.4156293222683265e-06, + "logits/chosen": -0.036577463150024414, + "logits/rejected": 2.1554677486419678, + "logps/chosen": -622.6143798828125, + "logps/rejected": -868.1095581054688, + "loss": 0.1398, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.636955261230469, + "rewards/margins": 20.860532760620117, + "rewards/rejected": -30.497488021850586, + "step": 990 + }, + { + "epoch": 0.6164852255054433, + "grad_norm": 24.79236602783203, + "learning_rate": 4.414476717381282e-06, + "logits/chosen": 2.160615921020508, + "logits/rejected": 3.0709891319274902, + "logps/chosen": -594.5836181640625, + "logps/rejected": -798.706298828125, + "loss": 0.3574, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.179106712341309, + "rewards/margins": 16.09888458251953, + "rewards/rejected": -25.277992248535156, + "step": 991 + }, + { + "epoch": 0.6171073094867807, + "grad_norm": 0.13140863180160522, + "learning_rate": 4.413324112494237e-06, + "logits/chosen": -2.102931499481201, + "logits/rejected": 3.019148588180542, + "logps/chosen": -356.99591064453125, + "logps/rejected": -933.642333984375, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.668631553649902, + "rewards/margins": 25.06151580810547, + "rewards/rejected": -33.73014831542969, + "step": 992 + }, + { + "epoch": 0.6177293934681182, + "grad_norm": 15.942180633544922, + "learning_rate": 4.412171507607192e-06, + "logits/chosen": 2.256786346435547, + "logits/rejected": 2.633397340774536, + "logps/chosen": -646.35205078125, + "logps/rejected": -853.915283203125, + "loss": 0.0991, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.75490951538086, + "rewards/margins": 15.246297836303711, + "rewards/rejected": -24.00120735168457, + "step": 993 + }, + { + "epoch": 0.6183514774494556, + "grad_norm": 0.00010330742225050926, + "learning_rate": 4.411018902720147e-06, + "logits/chosen": -0.5863848924636841, + "logits/rejected": 3.1927225589752197, + "logps/chosen": -504.6681213378906, + "logps/rejected": -1111.0050048828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.162359237670898, + "rewards/margins": 32.09947204589844, + "rewards/rejected": -39.26183319091797, + "step": 994 + }, + { + "epoch": 0.6189735614307932, + "grad_norm": 6.420986652374268, + "learning_rate": 4.4098662978331034e-06, + "logits/chosen": -1.7422621250152588, + "logits/rejected": 1.580471396446228, + "logps/chosen": -557.5867309570312, + "logps/rejected": -980.4736938476562, + "loss": 0.0214, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.687779426574707, + "rewards/margins": 21.697011947631836, + "rewards/rejected": -29.384790420532227, + "step": 995 + }, + { + "epoch": 0.6195956454121306, + "grad_norm": 0.06811324506998062, + "learning_rate": 4.408713692946059e-06, + "logits/chosen": -1.5099706649780273, + "logits/rejected": 1.8679317235946655, + "logps/chosen": -473.080078125, + "logps/rejected": -850.0989990234375, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.807185173034668, + "rewards/margins": 24.385814666748047, + "rewards/rejected": -30.19300079345703, + "step": 996 + }, + { + "epoch": 0.6202177293934681, + "grad_norm": 0.033417295664548874, + "learning_rate": 4.407561088059014e-06, + "logits/chosen": -1.5323376655578613, + "logits/rejected": 3.1901423931121826, + "logps/chosen": -524.0758056640625, + "logps/rejected": -959.96484375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.175868034362793, + "rewards/margins": 24.759984970092773, + "rewards/rejected": -31.935853958129883, + "step": 997 + }, + { + "epoch": 0.6208398133748056, + "grad_norm": 3.334057282700087e-06, + "learning_rate": 4.406408483171969e-06, + "logits/chosen": -0.14260584115982056, + "logits/rejected": 4.323513984680176, + "logps/chosen": -528.5975341796875, + "logps/rejected": -1003.25390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.384839057922363, + "rewards/margins": 27.611757278442383, + "rewards/rejected": -34.99659729003906, + "step": 998 + }, + { + "epoch": 0.6214618973561431, + "grad_norm": 30.795129776000977, + "learning_rate": 4.405255878284924e-06, + "logits/chosen": 0.9537069201469421, + "logits/rejected": 3.2278642654418945, + "logps/chosen": -481.7140197753906, + "logps/rejected": -847.1483764648438, + "loss": 1.1289, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.633514404296875, + "rewards/margins": 20.894466400146484, + "rewards/rejected": -31.527982711791992, + "step": 999 + }, + { + "epoch": 0.6220839813374806, + "grad_norm": 33.78679656982422, + "learning_rate": 4.4041032733978796e-06, + "logits/chosen": 1.469162106513977, + "logits/rejected": 2.2073097229003906, + "logps/chosen": -571.0089111328125, + "logps/rejected": -838.247802734375, + "loss": 1.7458, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.620084762573242, + "rewards/margins": 23.940921783447266, + "rewards/rejected": -34.561004638671875, + "step": 1000 + }, + { + "epoch": 0.622706065318818, + "grad_norm": 0.0004030153213534504, + "learning_rate": 4.402950668510835e-06, + "logits/chosen": 2.196051597595215, + "logits/rejected": 2.730830430984497, + "logps/chosen": -716.089111328125, + "logps/rejected": -981.1859130859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.918209075927734, + "rewards/margins": 24.088151931762695, + "rewards/rejected": -40.00636291503906, + "step": 1001 + }, + { + "epoch": 0.6233281493001556, + "grad_norm": 0.00020120911358390003, + "learning_rate": 4.40179806362379e-06, + "logits/chosen": -1.0908539295196533, + "logits/rejected": 2.8006591796875, + "logps/chosen": -419.78668212890625, + "logps/rejected": -949.0003051757812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.273379802703857, + "rewards/margins": 28.251667022705078, + "rewards/rejected": -34.525047302246094, + "step": 1002 + }, + { + "epoch": 0.623950233281493, + "grad_norm": 1.5236693620681763, + "learning_rate": 4.400645458736745e-06, + "logits/chosen": 0.5440617203712463, + "logits/rejected": 2.4226527214050293, + "logps/chosen": -589.6771850585938, + "logps/rejected": -848.9038696289062, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.010832786560059, + "rewards/margins": 15.059330940246582, + "rewards/rejected": -22.07016372680664, + "step": 1003 + }, + { + "epoch": 0.6245723172628305, + "grad_norm": 30.517330169677734, + "learning_rate": 4.3994928538497005e-06, + "logits/chosen": 2.2613136768341064, + "logits/rejected": 2.4845120906829834, + "logps/chosen": -683.16845703125, + "logps/rejected": -969.8983154296875, + "loss": 0.3333, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.087505340576172, + "rewards/margins": 21.87521743774414, + "rewards/rejected": -32.96272277832031, + "step": 1004 + }, + { + "epoch": 0.6251944012441679, + "grad_norm": 0.007849554531276226, + "learning_rate": 4.398340248962656e-06, + "logits/chosen": 3.205324649810791, + "logits/rejected": 4.223089218139648, + "logps/chosen": -602.585205078125, + "logps/rejected": -911.59521484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.011014938354492, + "rewards/margins": 24.49148178100586, + "rewards/rejected": -32.50249481201172, + "step": 1005 + }, + { + "epoch": 0.6258164852255055, + "grad_norm": 0.010902726091444492, + "learning_rate": 4.397187644075611e-06, + "logits/chosen": 0.14429256319999695, + "logits/rejected": 2.621492624282837, + "logps/chosen": -522.736083984375, + "logps/rejected": -865.6357421875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.426938533782959, + "rewards/margins": 22.33446502685547, + "rewards/rejected": -28.761404037475586, + "step": 1006 + }, + { + "epoch": 0.6264385692068429, + "grad_norm": 13.197189331054688, + "learning_rate": 4.396035039188566e-06, + "logits/chosen": 1.6405010223388672, + "logits/rejected": 1.7997384071350098, + "logps/chosen": -608.0145874023438, + "logps/rejected": -708.7522583007812, + "loss": 0.0802, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.889400482177734, + "rewards/margins": 7.958559036254883, + "rewards/rejected": -20.847959518432617, + "step": 1007 + }, + { + "epoch": 0.6270606531881804, + "grad_norm": 0.01054982841014862, + "learning_rate": 4.394882434301521e-06, + "logits/chosen": -0.7522687315940857, + "logits/rejected": 2.160587787628174, + "logps/chosen": -489.1429748535156, + "logps/rejected": -953.8292236328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.892003059387207, + "rewards/margins": 25.63636016845703, + "rewards/rejected": -34.52836227416992, + "step": 1008 + }, + { + "epoch": 0.6276827371695178, + "grad_norm": 40.35072326660156, + "learning_rate": 4.3937298294144774e-06, + "logits/chosen": -0.01016843318939209, + "logits/rejected": 5.0052595138549805, + "logps/chosen": -512.0286865234375, + "logps/rejected": -1018.2623901367188, + "loss": 0.7274, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.998366355895996, + "rewards/margins": 23.161270141601562, + "rewards/rejected": -28.159637451171875, + "step": 1009 + }, + { + "epoch": 0.6283048211508554, + "grad_norm": 52.293617248535156, + "learning_rate": 4.392577224527433e-06, + "logits/chosen": 1.1491129398345947, + "logits/rejected": 3.0440309047698975, + "logps/chosen": -722.254150390625, + "logps/rejected": -954.6030883789062, + "loss": 1.4045, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.873852729797363, + "rewards/margins": 16.21906852722168, + "rewards/rejected": -27.09292221069336, + "step": 1010 + }, + { + "epoch": 0.6289269051321928, + "grad_norm": 0.0604068785905838, + "learning_rate": 4.391424619640388e-06, + "logits/chosen": 0.5932100415229797, + "logits/rejected": 1.7829232215881348, + "logps/chosen": -480.3331604003906, + "logps/rejected": -731.1142578125, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.753337860107422, + "rewards/margins": 21.87540626525879, + "rewards/rejected": -27.62874412536621, + "step": 1011 + }, + { + "epoch": 0.6295489891135303, + "grad_norm": 40.562896728515625, + "learning_rate": 4.390272014753343e-06, + "logits/chosen": -1.131493330001831, + "logits/rejected": 2.884080410003662, + "logps/chosen": -587.9010009765625, + "logps/rejected": -928.799560546875, + "loss": 1.0836, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.43549633026123, + "rewards/margins": 21.197429656982422, + "rewards/rejected": -29.63292694091797, + "step": 1012 + }, + { + "epoch": 0.6301710730948679, + "grad_norm": 0.0011462063994258642, + "learning_rate": 4.389119409866298e-06, + "logits/chosen": 1.7970309257507324, + "logits/rejected": 3.385439872741699, + "logps/chosen": -638.41015625, + "logps/rejected": -985.9293212890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.416422843933105, + "rewards/margins": 26.92089080810547, + "rewards/rejected": -35.337310791015625, + "step": 1013 + }, + { + "epoch": 0.6307931570762053, + "grad_norm": 0.04923012852668762, + "learning_rate": 4.3879668049792536e-06, + "logits/chosen": -3.66701340675354, + "logits/rejected": 2.5858314037323, + "logps/chosen": -333.7469787597656, + "logps/rejected": -959.65673828125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.30173110961914, + "rewards/margins": 26.938438415527344, + "rewards/rejected": -35.240169525146484, + "step": 1014 + }, + { + "epoch": 0.6314152410575428, + "grad_norm": 11.959614753723145, + "learning_rate": 4.386814200092209e-06, + "logits/chosen": 1.8299262523651123, + "logits/rejected": 3.218519687652588, + "logps/chosen": -626.8076171875, + "logps/rejected": -905.640625, + "loss": 0.0661, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.366762161254883, + "rewards/margins": 18.138498306274414, + "rewards/rejected": -30.505260467529297, + "step": 1015 + }, + { + "epoch": 0.6320373250388802, + "grad_norm": 0.0018947566859424114, + "learning_rate": 4.385661595205164e-06, + "logits/chosen": -0.7899747490882874, + "logits/rejected": 4.134137153625488, + "logps/chosen": -327.9424133300781, + "logps/rejected": -877.3003540039062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7264366149902344, + "rewards/margins": 26.387657165527344, + "rewards/rejected": -30.114093780517578, + "step": 1016 + }, + { + "epoch": 0.6326594090202178, + "grad_norm": 35.56440734863281, + "learning_rate": 4.384508990318119e-06, + "logits/chosen": 2.359657049179077, + "logits/rejected": 2.69771146774292, + "logps/chosen": -820.05224609375, + "logps/rejected": -974.1748657226562, + "loss": 0.6496, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.05585765838623, + "rewards/margins": 15.927709579467773, + "rewards/rejected": -28.983566284179688, + "step": 1017 + }, + { + "epoch": 0.6332814930015552, + "grad_norm": 1.1538484159245854e-06, + "learning_rate": 4.3833563854310744e-06, + "logits/chosen": 3.8544185161590576, + "logits/rejected": 3.564330577850342, + "logps/chosen": -839.0157470703125, + "logps/rejected": -1088.1396484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.0699462890625, + "rewards/margins": 28.169740676879883, + "rewards/rejected": -41.23968505859375, + "step": 1018 + }, + { + "epoch": 0.6339035769828927, + "grad_norm": 6.535756983794272e-05, + "learning_rate": 4.38220378054403e-06, + "logits/chosen": 3.4101524353027344, + "logits/rejected": 3.27657413482666, + "logps/chosen": -806.0890502929688, + "logps/rejected": -1069.5372314453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.686437606811523, + "rewards/margins": 26.94811248779297, + "rewards/rejected": -39.634552001953125, + "step": 1019 + }, + { + "epoch": 0.6345256609642301, + "grad_norm": 8.647769927978516, + "learning_rate": 4.381051175656985e-06, + "logits/chosen": 0.3077080249786377, + "logits/rejected": 2.6129024028778076, + "logps/chosen": -582.3071899414062, + "logps/rejected": -991.269287109375, + "loss": 0.0479, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.514552116394043, + "rewards/margins": 20.795188903808594, + "rewards/rejected": -30.30974006652832, + "step": 1020 + }, + { + "epoch": 0.6351477449455677, + "grad_norm": 28.72833824157715, + "learning_rate": 4.37989857076994e-06, + "logits/chosen": -0.33616751432418823, + "logits/rejected": 2.4276583194732666, + "logps/chosen": -450.0052185058594, + "logps/rejected": -819.1146240234375, + "loss": 0.5203, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.684841632843018, + "rewards/margins": 26.024911880493164, + "rewards/rejected": -32.709754943847656, + "step": 1021 + }, + { + "epoch": 0.6357698289269051, + "grad_norm": 0.21740223467350006, + "learning_rate": 4.378745965882895e-06, + "logits/chosen": 0.4956507682800293, + "logits/rejected": 1.2035505771636963, + "logps/chosen": -541.427490234375, + "logps/rejected": -844.2938232421875, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.296308517456055, + "rewards/margins": 21.546104431152344, + "rewards/rejected": -30.8424129486084, + "step": 1022 + }, + { + "epoch": 0.6363919129082426, + "grad_norm": 2.216559648513794, + "learning_rate": 4.3775933609958506e-06, + "logits/chosen": 2.147252321243286, + "logits/rejected": 4.214204788208008, + "logps/chosen": -626.7850341796875, + "logps/rejected": -1001.1859130859375, + "loss": 0.0097, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.586669921875, + "rewards/margins": 21.359535217285156, + "rewards/rejected": -29.946203231811523, + "step": 1023 + }, + { + "epoch": 0.63701399688958, + "grad_norm": 23.62967872619629, + "learning_rate": 4.376440756108807e-06, + "logits/chosen": 0.6927123069763184, + "logits/rejected": 3.6509900093078613, + "logps/chosen": -519.7496337890625, + "logps/rejected": -928.06396484375, + "loss": 0.5355, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.055206298828125, + "rewards/margins": 18.006229400634766, + "rewards/rejected": -26.061437606811523, + "step": 1024 + }, + { + "epoch": 0.6376360808709176, + "grad_norm": 0.0004841023765038699, + "learning_rate": 4.375288151221762e-06, + "logits/chosen": 1.0736597776412964, + "logits/rejected": 4.022658348083496, + "logps/chosen": -565.89208984375, + "logps/rejected": -976.2791137695312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.828113555908203, + "rewards/margins": 28.443811416625977, + "rewards/rejected": -37.27192687988281, + "step": 1025 + }, + { + "epoch": 0.6382581648522551, + "grad_norm": 49.700984954833984, + "learning_rate": 4.374135546334717e-06, + "logits/chosen": 3.4103102684020996, + "logits/rejected": 2.3115811347961426, + "logps/chosen": -681.27197265625, + "logps/rejected": -830.0875244140625, + "loss": 2.0367, + "rewards/accuracies": 0.75, + "rewards/chosen": -12.577679634094238, + "rewards/margins": 16.04987144470215, + "rewards/rejected": -28.627552032470703, + "step": 1026 + }, + { + "epoch": 0.6388802488335925, + "grad_norm": 0.06918217986822128, + "learning_rate": 4.372982941447672e-06, + "logits/chosen": -1.2822195291519165, + "logits/rejected": 2.634605646133423, + "logps/chosen": -509.82550048828125, + "logps/rejected": -1073.20751953125, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.181923866271973, + "rewards/margins": 29.78413200378418, + "rewards/rejected": -39.96605682373047, + "step": 1027 + }, + { + "epoch": 0.63950233281493, + "grad_norm": 2.3471317291259766, + "learning_rate": 4.3718303365606275e-06, + "logits/chosen": 0.784964919090271, + "logits/rejected": 2.4620285034179688, + "logps/chosen": -525.9091796875, + "logps/rejected": -822.0399169921875, + "loss": 0.0082, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.523943901062012, + "rewards/margins": 22.30738067626953, + "rewards/rejected": -27.83132553100586, + "step": 1028 + }, + { + "epoch": 0.6401244167962675, + "grad_norm": 2.681765920442558e-07, + "learning_rate": 4.370677731673583e-06, + "logits/chosen": 0.8570016622543335, + "logits/rejected": 3.016300678253174, + "logps/chosen": -529.441650390625, + "logps/rejected": -956.5719604492188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.739961624145508, + "rewards/margins": 32.02619934082031, + "rewards/rejected": -40.76616287231445, + "step": 1029 + }, + { + "epoch": 0.640746500777605, + "grad_norm": 3.9567577838897705, + "learning_rate": 4.369525126786538e-06, + "logits/chosen": -3.876920700073242, + "logits/rejected": 1.7937712669372559, + "logps/chosen": -355.7938232421875, + "logps/rejected": -930.0498657226562, + "loss": 0.0304, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.368406295776367, + "rewards/margins": 25.666030883789062, + "rewards/rejected": -29.034439086914062, + "step": 1030 + }, + { + "epoch": 0.6413685847589424, + "grad_norm": 0.2327953577041626, + "learning_rate": 4.368372521899493e-06, + "logits/chosen": 2.2502920627593994, + "logits/rejected": 3.6756107807159424, + "logps/chosen": -637.3325805664062, + "logps/rejected": -1016.4441528320312, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.519428253173828, + "rewards/margins": 23.643564224243164, + "rewards/rejected": -31.162994384765625, + "step": 1031 + }, + { + "epoch": 0.64199066874028, + "grad_norm": 1.140275478363037, + "learning_rate": 4.3672199170124484e-06, + "logits/chosen": 2.291574001312256, + "logits/rejected": 3.5964736938476562, + "logps/chosen": -700.5925903320312, + "logps/rejected": -1020.0197143554688, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.006669998168945, + "rewards/margins": 21.45091438293457, + "rewards/rejected": -31.457584381103516, + "step": 1032 + }, + { + "epoch": 0.6426127527216174, + "grad_norm": 2.2512295246124268, + "learning_rate": 4.366067312125404e-06, + "logits/chosen": 0.2611212432384491, + "logits/rejected": 3.5303797721862793, + "logps/chosen": -486.99786376953125, + "logps/rejected": -896.658203125, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.969782829284668, + "rewards/margins": 21.509904861450195, + "rewards/rejected": -32.47968673706055, + "step": 1033 + }, + { + "epoch": 0.6432348367029549, + "grad_norm": 0.03622567281126976, + "learning_rate": 4.364914707238359e-06, + "logits/chosen": -1.3039064407348633, + "logits/rejected": 3.5805559158325195, + "logps/chosen": -321.51263427734375, + "logps/rejected": -974.6703491210938, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9876842498779297, + "rewards/margins": 33.68848419189453, + "rewards/rejected": -37.676170349121094, + "step": 1034 + }, + { + "epoch": 0.6438569206842923, + "grad_norm": 0.006617639679461718, + "learning_rate": 4.363762102351314e-06, + "logits/chosen": 0.1614302396774292, + "logits/rejected": 3.4580628871917725, + "logps/chosen": -348.7968444824219, + "logps/rejected": -815.583740234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.265744686126709, + "rewards/margins": 26.007923126220703, + "rewards/rejected": -32.27366638183594, + "step": 1035 + }, + { + "epoch": 0.6444790046656299, + "grad_norm": 7.089043140411377, + "learning_rate": 4.362609497464269e-06, + "logits/chosen": 0.5390743017196655, + "logits/rejected": 2.8165862560272217, + "logps/chosen": -610.386474609375, + "logps/rejected": -993.70263671875, + "loss": 0.0791, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.227594375610352, + "rewards/margins": 21.322710037231445, + "rewards/rejected": -32.5503044128418, + "step": 1036 + }, + { + "epoch": 0.6451010886469674, + "grad_norm": 10.708149909973145, + "learning_rate": 4.3614568925772246e-06, + "logits/chosen": 2.2750465869903564, + "logits/rejected": 3.9014241695404053, + "logps/chosen": -666.2877197265625, + "logps/rejected": -926.8017578125, + "loss": 0.0739, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.22494125366211, + "rewards/margins": 16.43631935119629, + "rewards/rejected": -24.66126251220703, + "step": 1037 + }, + { + "epoch": 0.6457231726283048, + "grad_norm": 0.009115724824368954, + "learning_rate": 4.360304287690181e-06, + "logits/chosen": -1.581646203994751, + "logits/rejected": 0.9383874535560608, + "logps/chosen": -342.2408447265625, + "logps/rejected": -759.0081176757812, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.06154727935791, + "rewards/margins": 25.451290130615234, + "rewards/rejected": -31.512836456298828, + "step": 1038 + }, + { + "epoch": 0.6463452566096423, + "grad_norm": 4.3823953888022515e-07, + "learning_rate": 4.359151682803136e-06, + "logits/chosen": 0.9563695192337036, + "logits/rejected": 4.24501895904541, + "logps/chosen": -538.5618286132812, + "logps/rejected": -1025.32177734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.323747634887695, + "rewards/margins": 28.09737205505371, + "rewards/rejected": -38.421119689941406, + "step": 1039 + }, + { + "epoch": 0.6469673405909798, + "grad_norm": 53.47418212890625, + "learning_rate": 4.357999077916091e-06, + "logits/chosen": 3.6600234508514404, + "logits/rejected": 4.332643508911133, + "logps/chosen": -763.8884887695312, + "logps/rejected": -1002.970458984375, + "loss": 3.4196, + "rewards/accuracies": 0.75, + "rewards/chosen": -12.286805152893066, + "rewards/margins": 16.814016342163086, + "rewards/rejected": -29.100818634033203, + "step": 1040 + }, + { + "epoch": 0.6475894245723173, + "grad_norm": 36.0056266784668, + "learning_rate": 4.356846473029046e-06, + "logits/chosen": 0.25408101081848145, + "logits/rejected": 2.7115085124969482, + "logps/chosen": -581.7474365234375, + "logps/rejected": -872.8613891601562, + "loss": 0.3032, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.85314655303955, + "rewards/margins": 20.30516242980957, + "rewards/rejected": -34.15830612182617, + "step": 1041 + }, + { + "epoch": 0.6482115085536547, + "grad_norm": 0.3534427285194397, + "learning_rate": 4.3556938681420015e-06, + "logits/chosen": 0.19164976477622986, + "logits/rejected": 2.4058098793029785, + "logps/chosen": -709.7728271484375, + "logps/rejected": -1034.244384765625, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.23792552947998, + "rewards/margins": 23.737945556640625, + "rewards/rejected": -33.97587585449219, + "step": 1042 + }, + { + "epoch": 0.6488335925349922, + "grad_norm": 35.1163444519043, + "learning_rate": 4.354541263254957e-06, + "logits/chosen": 0.810158371925354, + "logits/rejected": 2.667872428894043, + "logps/chosen": -640.8101196289062, + "logps/rejected": -909.2906494140625, + "loss": 1.0731, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.474588394165039, + "rewards/margins": 19.483205795288086, + "rewards/rejected": -31.957794189453125, + "step": 1043 + }, + { + "epoch": 0.6494556765163297, + "grad_norm": 25.680383682250977, + "learning_rate": 4.353388658367912e-06, + "logits/chosen": -3.0454163551330566, + "logits/rejected": 3.9892959594726562, + "logps/chosen": -402.1624755859375, + "logps/rejected": -987.141357421875, + "loss": 0.2195, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.043094635009766, + "rewards/margins": 29.327125549316406, + "rewards/rejected": -35.37022018432617, + "step": 1044 + }, + { + "epoch": 0.6500777604976672, + "grad_norm": 5.064794540405273, + "learning_rate": 4.352236053480867e-06, + "logits/chosen": 1.396227240562439, + "logits/rejected": 4.078095436096191, + "logps/chosen": -505.2065124511719, + "logps/rejected": -839.9376220703125, + "loss": 0.0199, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.201690673828125, + "rewards/margins": 19.825117111206055, + "rewards/rejected": -26.02680778503418, + "step": 1045 + }, + { + "epoch": 0.6506998444790046, + "grad_norm": 0.008247487246990204, + "learning_rate": 4.3510834485938224e-06, + "logits/chosen": -2.75317120552063, + "logits/rejected": 2.8981680870056152, + "logps/chosen": -273.85064697265625, + "logps/rejected": -738.4959106445312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.633110523223877, + "rewards/margins": 20.329675674438477, + "rewards/rejected": -24.962783813476562, + "step": 1046 + }, + { + "epoch": 0.6513219284603421, + "grad_norm": 6.141415119171143, + "learning_rate": 4.349930843706778e-06, + "logits/chosen": 2.316647529602051, + "logits/rejected": 3.3674442768096924, + "logps/chosen": -659.9111938476562, + "logps/rejected": -934.2586669921875, + "loss": 0.031, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.761323928833008, + "rewards/margins": 20.5927677154541, + "rewards/rejected": -30.35409164428711, + "step": 1047 + }, + { + "epoch": 0.6519440124416797, + "grad_norm": 0.015566140413284302, + "learning_rate": 4.348778238819733e-06, + "logits/chosen": 2.220116138458252, + "logits/rejected": 4.262792110443115, + "logps/chosen": -594.0078735351562, + "logps/rejected": -833.1097412109375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.603310585021973, + "rewards/margins": 16.571250915527344, + "rewards/rejected": -24.174562454223633, + "step": 1048 + }, + { + "epoch": 0.6525660964230171, + "grad_norm": 0.9627737402915955, + "learning_rate": 4.347625633932688e-06, + "logits/chosen": 1.216709017753601, + "logits/rejected": 4.047371864318848, + "logps/chosen": -513.246826171875, + "logps/rejected": -946.6227416992188, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.843459129333496, + "rewards/margins": 19.405704498291016, + "rewards/rejected": -30.249164581298828, + "step": 1049 + }, + { + "epoch": 0.6531881804043546, + "grad_norm": 0.03551546484231949, + "learning_rate": 4.346473029045643e-06, + "logits/chosen": -0.7971693873405457, + "logits/rejected": 2.8314757347106934, + "logps/chosen": -499.04095458984375, + "logps/rejected": -885.7818603515625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.940540313720703, + "rewards/margins": 17.48625946044922, + "rewards/rejected": -26.426803588867188, + "step": 1050 + }, + { + "epoch": 0.6538102643856921, + "grad_norm": 12.715831756591797, + "learning_rate": 4.3453204241585986e-06, + "logits/chosen": 1.9154138565063477, + "logits/rejected": 4.475796699523926, + "logps/chosen": -618.64111328125, + "logps/rejected": -1036.7139892578125, + "loss": 0.1346, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.447593688964844, + "rewards/margins": 20.037174224853516, + "rewards/rejected": -32.48476791381836, + "step": 1051 + }, + { + "epoch": 0.6544323483670296, + "grad_norm": 3.329910396132618e-05, + "learning_rate": 4.344167819271554e-06, + "logits/chosen": 1.1139119863510132, + "logits/rejected": 4.082623481750488, + "logps/chosen": -543.2000122070312, + "logps/rejected": -942.43212890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.591399192810059, + "rewards/margins": 23.034143447875977, + "rewards/rejected": -31.62554168701172, + "step": 1052 + }, + { + "epoch": 0.655054432348367, + "grad_norm": 0.15551453828811646, + "learning_rate": 4.34301521438451e-06, + "logits/chosen": 1.3609882593154907, + "logits/rejected": 4.041923522949219, + "logps/chosen": -533.775146484375, + "logps/rejected": -987.229248046875, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.904748916625977, + "rewards/margins": 26.243389129638672, + "rewards/rejected": -35.14813995361328, + "step": 1053 + }, + { + "epoch": 0.6556765163297045, + "grad_norm": 0.6531232595443726, + "learning_rate": 4.341862609497465e-06, + "logits/chosen": 0.10662335157394409, + "logits/rejected": 2.761049509048462, + "logps/chosen": -542.4237060546875, + "logps/rejected": -890.3243408203125, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.279252052307129, + "rewards/margins": 21.53714942932129, + "rewards/rejected": -28.81639862060547, + "step": 1054 + }, + { + "epoch": 0.656298600311042, + "grad_norm": 0.011171232908964157, + "learning_rate": 4.34071000461042e-06, + "logits/chosen": 1.9401049613952637, + "logits/rejected": 4.1546549797058105, + "logps/chosen": -606.16552734375, + "logps/rejected": -992.0186157226562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.93386173248291, + "rewards/margins": 28.016857147216797, + "rewards/rejected": -36.950721740722656, + "step": 1055 + }, + { + "epoch": 0.6569206842923795, + "grad_norm": 5.907070636749268, + "learning_rate": 4.3395573997233755e-06, + "logits/chosen": -2.723787784576416, + "logits/rejected": 1.073384165763855, + "logps/chosen": -247.42018127441406, + "logps/rejected": -671.1397094726562, + "loss": 0.0333, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.980680465698242, + "rewards/margins": 20.786914825439453, + "rewards/rejected": -23.767597198486328, + "step": 1056 + }, + { + "epoch": 0.6575427682737169, + "grad_norm": 2.8011792892357334e-05, + "learning_rate": 4.338404794836331e-06, + "logits/chosen": -2.1832637786865234, + "logits/rejected": 3.1213631629943848, + "logps/chosen": -317.74371337890625, + "logps/rejected": -867.841552734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.91716194152832, + "rewards/margins": 26.970073699951172, + "rewards/rejected": -31.887237548828125, + "step": 1057 + }, + { + "epoch": 0.6581648522550544, + "grad_norm": 0.0006950918468646705, + "learning_rate": 4.337252189949286e-06, + "logits/chosen": -0.36333489418029785, + "logits/rejected": 2.7499642372131348, + "logps/chosen": -508.69482421875, + "logps/rejected": -994.1200561523438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.09777545928955, + "rewards/margins": 26.999156951904297, + "rewards/rejected": -36.09693145751953, + "step": 1058 + }, + { + "epoch": 0.658786936236392, + "grad_norm": 2.4685513973236084, + "learning_rate": 4.336099585062241e-06, + "logits/chosen": -0.1563507318496704, + "logits/rejected": 3.1660311222076416, + "logps/chosen": -530.2171630859375, + "logps/rejected": -1001.8350219726562, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.256999969482422, + "rewards/margins": 25.335674285888672, + "rewards/rejected": -37.592674255371094, + "step": 1059 + }, + { + "epoch": 0.6594090202177294, + "grad_norm": 0.9450600743293762, + "learning_rate": 4.3349469801751964e-06, + "logits/chosen": 1.899235486984253, + "logits/rejected": 2.9143006801605225, + "logps/chosen": -607.775146484375, + "logps/rejected": -880.9578247070312, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.9500732421875, + "rewards/margins": 20.845109939575195, + "rewards/rejected": -30.795185089111328, + "step": 1060 + }, + { + "epoch": 0.6600311041990669, + "grad_norm": 6.812902450561523, + "learning_rate": 4.333794375288152e-06, + "logits/chosen": 0.020940184593200684, + "logits/rejected": 3.1008946895599365, + "logps/chosen": -641.708251953125, + "logps/rejected": -1082.6326904296875, + "loss": 0.029, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.005695343017578, + "rewards/margins": 22.83586883544922, + "rewards/rejected": -33.8415641784668, + "step": 1061 + }, + { + "epoch": 0.6606531881804043, + "grad_norm": 39.42338180541992, + "learning_rate": 4.332641770401107e-06, + "logits/chosen": 2.0118627548217773, + "logits/rejected": 4.52200984954834, + "logps/chosen": -578.223388671875, + "logps/rejected": -1029.5029296875, + "loss": 0.7323, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.329606056213379, + "rewards/margins": 25.680768966674805, + "rewards/rejected": -36.010372161865234, + "step": 1062 + }, + { + "epoch": 0.6612752721617419, + "grad_norm": 25.15177345275879, + "learning_rate": 4.331489165514062e-06, + "logits/chosen": 2.8278603553771973, + "logits/rejected": 4.047382831573486, + "logps/chosen": -703.8547973632812, + "logps/rejected": -873.251953125, + "loss": 0.1875, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.270652770996094, + "rewards/margins": 9.811416625976562, + "rewards/rejected": -22.082069396972656, + "step": 1063 + }, + { + "epoch": 0.6618973561430793, + "grad_norm": 5.5414453527191654e-05, + "learning_rate": 4.330336560627017e-06, + "logits/chosen": 2.8537893295288086, + "logits/rejected": 5.341554164886475, + "logps/chosen": -699.42626953125, + "logps/rejected": -1039.4193115234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.638263702392578, + "rewards/margins": 21.15549659729004, + "rewards/rejected": -30.793758392333984, + "step": 1064 + }, + { + "epoch": 0.6625194401244168, + "grad_norm": 0.4664863348007202, + "learning_rate": 4.3291839557399726e-06, + "logits/chosen": 1.168850302696228, + "logits/rejected": 4.478352069854736, + "logps/chosen": -556.7830200195312, + "logps/rejected": -1005.1434326171875, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.349123954772949, + "rewards/margins": 23.540437698364258, + "rewards/rejected": -30.889562606811523, + "step": 1065 + }, + { + "epoch": 0.6631415241057543, + "grad_norm": 0.7163582444190979, + "learning_rate": 4.328031350852928e-06, + "logits/chosen": 1.0548595190048218, + "logits/rejected": 3.2712948322296143, + "logps/chosen": -639.1087646484375, + "logps/rejected": -1086.1822509765625, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.083908081054688, + "rewards/margins": 25.394393920898438, + "rewards/rejected": -34.478302001953125, + "step": 1066 + }, + { + "epoch": 0.6637636080870918, + "grad_norm": 33.42515182495117, + "learning_rate": 4.326878745965883e-06, + "logits/chosen": -0.47864389419555664, + "logits/rejected": 2.4255480766296387, + "logps/chosen": -567.6307983398438, + "logps/rejected": -969.3095703125, + "loss": 0.6311, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.092820167541504, + "rewards/margins": 23.71457290649414, + "rewards/rejected": -32.807395935058594, + "step": 1067 + }, + { + "epoch": 0.6643856920684292, + "grad_norm": 12.694299697875977, + "learning_rate": 4.325726141078839e-06, + "logits/chosen": -1.4194493293762207, + "logits/rejected": 3.0020575523376465, + "logps/chosen": -451.8589172363281, + "logps/rejected": -883.7705078125, + "loss": 0.0606, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.017159461975098, + "rewards/margins": 20.179271697998047, + "rewards/rejected": -26.196434020996094, + "step": 1068 + }, + { + "epoch": 0.6650077760497667, + "grad_norm": 0.006870250217616558, + "learning_rate": 4.324573536191794e-06, + "logits/chosen": 1.6804357767105103, + "logits/rejected": 5.030431747436523, + "logps/chosen": -512.820556640625, + "logps/rejected": -1007.7488403320312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.801078796386719, + "rewards/margins": 25.51949691772461, + "rewards/rejected": -34.32057571411133, + "step": 1069 + }, + { + "epoch": 0.6656298600311042, + "grad_norm": 32.28739547729492, + "learning_rate": 4.3234209313047495e-06, + "logits/chosen": 1.4529192447662354, + "logits/rejected": 3.566375494003296, + "logps/chosen": -579.253173828125, + "logps/rejected": -901.6878051757812, + "loss": 0.72, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.97698974609375, + "rewards/margins": 17.470598220825195, + "rewards/rejected": -27.447586059570312, + "step": 1070 + }, + { + "epoch": 0.6662519440124417, + "grad_norm": 0.3848552703857422, + "learning_rate": 4.322268326417705e-06, + "logits/chosen": -1.314362645149231, + "logits/rejected": 3.793914794921875, + "logps/chosen": -444.42431640625, + "logps/rejected": -918.371826171875, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.004027366638184, + "rewards/margins": 23.923250198364258, + "rewards/rejected": -34.927276611328125, + "step": 1071 + }, + { + "epoch": 0.6668740279937792, + "grad_norm": 7.505640983581543, + "learning_rate": 4.32111572153066e-06, + "logits/chosen": 1.2513737678527832, + "logits/rejected": 3.1623566150665283, + "logps/chosen": -580.482177734375, + "logps/rejected": -918.35205078125, + "loss": 0.0222, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.184943199157715, + "rewards/margins": 20.685853958129883, + "rewards/rejected": -27.87079620361328, + "step": 1072 + }, + { + "epoch": 0.6674961119751166, + "grad_norm": 0.02148018218576908, + "learning_rate": 4.319963116643615e-06, + "logits/chosen": -1.3228559494018555, + "logits/rejected": 2.4585723876953125, + "logps/chosen": -362.78375244140625, + "logps/rejected": -723.1258544921875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.308596134185791, + "rewards/margins": 14.433586120605469, + "rewards/rejected": -19.742183685302734, + "step": 1073 + }, + { + "epoch": 0.6681181959564542, + "grad_norm": 3.208101406926289e-05, + "learning_rate": 4.31881051175657e-06, + "logits/chosen": 0.2509106993675232, + "logits/rejected": 3.2502963542938232, + "logps/chosen": -535.8070068359375, + "logps/rejected": -910.6559448242188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.306463718414307, + "rewards/margins": 25.636838912963867, + "rewards/rejected": -31.943300247192383, + "step": 1074 + }, + { + "epoch": 0.6687402799377916, + "grad_norm": 9.964673154172488e-06, + "learning_rate": 4.317657906869526e-06, + "logits/chosen": 1.126412034034729, + "logits/rejected": 4.417489528656006, + "logps/chosen": -470.00775146484375, + "logps/rejected": -927.0919189453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.198090553283691, + "rewards/margins": 26.117660522460938, + "rewards/rejected": -36.31575012207031, + "step": 1075 + }, + { + "epoch": 0.6693623639191291, + "grad_norm": 0.0008977479301393032, + "learning_rate": 4.316505301982481e-06, + "logits/chosen": 0.4949992299079895, + "logits/rejected": 3.2071704864501953, + "logps/chosen": -493.83941650390625, + "logps/rejected": -869.6880493164062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.259034156799316, + "rewards/margins": 23.93576431274414, + "rewards/rejected": -32.19479751586914, + "step": 1076 + }, + { + "epoch": 0.6699844479004665, + "grad_norm": 3.079448938369751, + "learning_rate": 4.315352697095436e-06, + "logits/chosen": 0.8684056401252747, + "logits/rejected": 2.2930994033813477, + "logps/chosen": -553.3845825195312, + "logps/rejected": -812.5850830078125, + "loss": 0.0131, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.49267578125, + "rewards/margins": 17.628101348876953, + "rewards/rejected": -28.120777130126953, + "step": 1077 + }, + { + "epoch": 0.6706065318818041, + "grad_norm": 41.547515869140625, + "learning_rate": 4.314200092208391e-06, + "logits/chosen": 0.5994904041290283, + "logits/rejected": 2.7967746257781982, + "logps/chosen": -611.67626953125, + "logps/rejected": -939.912841796875, + "loss": 0.5201, + "rewards/accuracies": 0.875, + "rewards/chosen": -14.551267623901367, + "rewards/margins": 19.001649856567383, + "rewards/rejected": -33.552913665771484, + "step": 1078 + }, + { + "epoch": 0.6712286158631415, + "grad_norm": 22.828508377075195, + "learning_rate": 4.3130474873213465e-06, + "logits/chosen": -1.1581928730010986, + "logits/rejected": 2.9677014350891113, + "logps/chosen": -476.2674560546875, + "logps/rejected": -992.6043701171875, + "loss": 0.5091, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.894004821777344, + "rewards/margins": 30.105693817138672, + "rewards/rejected": -35.999698638916016, + "step": 1079 + }, + { + "epoch": 0.671850699844479, + "grad_norm": 8.702930450439453, + "learning_rate": 4.311894882434302e-06, + "logits/chosen": 1.4790234565734863, + "logits/rejected": 2.22629451751709, + "logps/chosen": -553.0067138671875, + "logps/rejected": -781.3177490234375, + "loss": 0.0453, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.244466781616211, + "rewards/margins": 18.253623962402344, + "rewards/rejected": -30.498090744018555, + "step": 1080 + }, + { + "epoch": 0.6724727838258164, + "grad_norm": 4.632612705230713, + "learning_rate": 4.310742277547257e-06, + "logits/chosen": -1.0388520956039429, + "logits/rejected": 3.8379063606262207, + "logps/chosen": -415.9241027832031, + "logps/rejected": -862.2066650390625, + "loss": 0.0262, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.942617893218994, + "rewards/margins": 18.617382049560547, + "rewards/rejected": -25.559999465942383, + "step": 1081 + }, + { + "epoch": 0.673094867807154, + "grad_norm": 1.0436056982143782e-05, + "learning_rate": 4.309589672660213e-06, + "logits/chosen": 0.40221107006073, + "logits/rejected": 3.482527017593384, + "logps/chosen": -478.3935546875, + "logps/rejected": -928.32080078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.656597137451172, + "rewards/margins": 24.38768768310547, + "rewards/rejected": -32.044288635253906, + "step": 1082 + }, + { + "epoch": 0.6737169517884914, + "grad_norm": 0.0026403770316392183, + "learning_rate": 4.308437067773168e-06, + "logits/chosen": 1.7504465579986572, + "logits/rejected": 3.3605589866638184, + "logps/chosen": -625.5438232421875, + "logps/rejected": -1023.2490844726562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.362934112548828, + "rewards/margins": 24.983165740966797, + "rewards/rejected": -37.346099853515625, + "step": 1083 + }, + { + "epoch": 0.6743390357698289, + "grad_norm": 1.1153315305709839, + "learning_rate": 4.3072844628861235e-06, + "logits/chosen": -2.518407106399536, + "logits/rejected": 2.908334970474243, + "logps/chosen": -213.45147705078125, + "logps/rejected": -750.7979736328125, + "loss": 0.01, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.212100028991699, + "rewards/margins": 24.980051040649414, + "rewards/rejected": -27.192150115966797, + "step": 1084 + }, + { + "epoch": 0.6749611197511665, + "grad_norm": 1.64064513228368e-05, + "learning_rate": 4.306131857999079e-06, + "logits/chosen": -2.173828363418579, + "logits/rejected": 3.981977939605713, + "logps/chosen": -264.6275634765625, + "logps/rejected": -981.3345947265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.846001625061035, + "rewards/margins": 35.636131286621094, + "rewards/rejected": -40.48213195800781, + "step": 1085 + }, + { + "epoch": 0.6755832037325039, + "grad_norm": 0.5660700798034668, + "learning_rate": 4.304979253112034e-06, + "logits/chosen": -2.6855359077453613, + "logits/rejected": 3.02661395072937, + "logps/chosen": -393.4590759277344, + "logps/rejected": -945.981689453125, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.0285325050354, + "rewards/margins": 25.920177459716797, + "rewards/rejected": -30.948711395263672, + "step": 1086 + }, + { + "epoch": 0.6762052877138414, + "grad_norm": 0.030613282695412636, + "learning_rate": 4.303826648224989e-06, + "logits/chosen": 1.5232200622558594, + "logits/rejected": 4.597107887268066, + "logps/chosen": -372.9260559082031, + "logps/rejected": -692.3011474609375, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.295344352722168, + "rewards/margins": 18.140079498291016, + "rewards/rejected": -26.435422897338867, + "step": 1087 + }, + { + "epoch": 0.6768273716951788, + "grad_norm": 37.187347412109375, + "learning_rate": 4.302674043337944e-06, + "logits/chosen": -2.1207356452941895, + "logits/rejected": 3.0639090538024902, + "logps/chosen": -440.3720703125, + "logps/rejected": -1019.062255859375, + "loss": 0.8417, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.682382583618164, + "rewards/margins": 26.8316650390625, + "rewards/rejected": -36.51404571533203, + "step": 1088 + }, + { + "epoch": 0.6774494556765164, + "grad_norm": 29.836376190185547, + "learning_rate": 4.3015214384509e-06, + "logits/chosen": 1.0716478824615479, + "logits/rejected": 2.154702663421631, + "logps/chosen": -618.8206176757812, + "logps/rejected": -877.2896728515625, + "loss": 0.2427, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.232145309448242, + "rewards/margins": 20.821189880371094, + "rewards/rejected": -29.053335189819336, + "step": 1089 + }, + { + "epoch": 0.6780715396578538, + "grad_norm": 42.209190368652344, + "learning_rate": 4.300368833563855e-06, + "logits/chosen": 1.3273580074310303, + "logits/rejected": 1.303884506225586, + "logps/chosen": -744.8178100585938, + "logps/rejected": -790.72021484375, + "loss": 1.0787, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.935836791992188, + "rewards/margins": 10.744930267333984, + "rewards/rejected": -19.68076515197754, + "step": 1090 + }, + { + "epoch": 0.6786936236391913, + "grad_norm": 0.10251186788082123, + "learning_rate": 4.29921622867681e-06, + "logits/chosen": 0.6547682285308838, + "logits/rejected": 3.9677000045776367, + "logps/chosen": -543.1813354492188, + "logps/rejected": -987.18017578125, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.872847557067871, + "rewards/margins": 27.053184509277344, + "rewards/rejected": -37.92603302001953, + "step": 1091 + }, + { + "epoch": 0.6793157076205287, + "grad_norm": 2.1696592739317566e-05, + "learning_rate": 4.298063623789765e-06, + "logits/chosen": -1.0745490789413452, + "logits/rejected": 3.7233543395996094, + "logps/chosen": -304.859619140625, + "logps/rejected": -808.3275146484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9483261108398438, + "rewards/margins": 24.147890090942383, + "rewards/rejected": -27.096214294433594, + "step": 1092 + }, + { + "epoch": 0.6799377916018663, + "grad_norm": 2.565423011779785, + "learning_rate": 4.2969110189027205e-06, + "logits/chosen": -3.332108497619629, + "logits/rejected": 2.945809841156006, + "logps/chosen": -467.0668640136719, + "logps/rejected": -1202.020263671875, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.956332206726074, + "rewards/margins": 32.6529655456543, + "rewards/rejected": -41.60929870605469, + "step": 1093 + }, + { + "epoch": 0.6805598755832037, + "grad_norm": 8.504562377929688, + "learning_rate": 4.295758414015676e-06, + "logits/chosen": -1.0805021524429321, + "logits/rejected": 3.7121896743774414, + "logps/chosen": -542.5369262695312, + "logps/rejected": -1073.4661865234375, + "loss": 0.0441, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.501508712768555, + "rewards/margins": 22.639549255371094, + "rewards/rejected": -35.14105987548828, + "step": 1094 + }, + { + "epoch": 0.6811819595645412, + "grad_norm": 0.003098748391494155, + "learning_rate": 4.294605809128631e-06, + "logits/chosen": -1.046175241470337, + "logits/rejected": 4.001430511474609, + "logps/chosen": -493.8402404785156, + "logps/rejected": -1004.2360229492188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.581122398376465, + "rewards/margins": 22.447479248046875, + "rewards/rejected": -33.028602600097656, + "step": 1095 + }, + { + "epoch": 0.6818040435458786, + "grad_norm": 0.005420563742518425, + "learning_rate": 4.293453204241586e-06, + "logits/chosen": -2.7170073986053467, + "logits/rejected": 4.104083061218262, + "logps/chosen": -232.8094482421875, + "logps/rejected": -865.2706298828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.027500867843628, + "rewards/margins": 31.173065185546875, + "rewards/rejected": -33.200565338134766, + "step": 1096 + }, + { + "epoch": 0.6824261275272162, + "grad_norm": 19.43113899230957, + "learning_rate": 4.2923005993545414e-06, + "logits/chosen": 2.000507354736328, + "logits/rejected": 3.084726572036743, + "logps/chosen": -587.5668334960938, + "logps/rejected": -902.2487182617188, + "loss": 0.0842, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.952615737915039, + "rewards/margins": 22.29494285583496, + "rewards/rejected": -35.24755859375, + "step": 1097 + }, + { + "epoch": 0.6830482115085537, + "grad_norm": 39.97905349731445, + "learning_rate": 4.291147994467497e-06, + "logits/chosen": -0.7416508197784424, + "logits/rejected": 1.96217679977417, + "logps/chosen": -488.38623046875, + "logps/rejected": -880.5936889648438, + "loss": 0.5121, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.624948024749756, + "rewards/margins": 23.470813751220703, + "rewards/rejected": -31.095762252807617, + "step": 1098 + }, + { + "epoch": 0.6836702954898911, + "grad_norm": 34.53898620605469, + "learning_rate": 4.289995389580452e-06, + "logits/chosen": 0.8552457094192505, + "logits/rejected": 2.1850810050964355, + "logps/chosen": -686.6106567382812, + "logps/rejected": -946.2464599609375, + "loss": 0.4666, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.199734687805176, + "rewards/margins": 21.99437713623047, + "rewards/rejected": -28.194110870361328, + "step": 1099 + }, + { + "epoch": 0.6842923794712286, + "grad_norm": 0.003329685889184475, + "learning_rate": 4.288842784693407e-06, + "logits/chosen": 0.4598864018917084, + "logits/rejected": 4.591629981994629, + "logps/chosen": -522.9757080078125, + "logps/rejected": -1050.53515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.485584259033203, + "rewards/margins": 27.7689151763916, + "rewards/rejected": -38.25450134277344, + "step": 1100 + }, + { + "epoch": 0.6849144634525661, + "grad_norm": 12.893879890441895, + "learning_rate": 4.287690179806362e-06, + "logits/chosen": 0.198805034160614, + "logits/rejected": 2.8921289443969727, + "logps/chosen": -504.9701232910156, + "logps/rejected": -907.6068115234375, + "loss": 0.0672, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.6027445793151855, + "rewards/margins": 25.79681396484375, + "rewards/rejected": -32.399559020996094, + "step": 1101 + }, + { + "epoch": 0.6855365474339036, + "grad_norm": 1.0506843328475952, + "learning_rate": 4.2865375749193176e-06, + "logits/chosen": 0.5944235324859619, + "logits/rejected": 2.503926992416382, + "logps/chosen": -628.24951171875, + "logps/rejected": -913.1500854492188, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.653724670410156, + "rewards/margins": 21.677860260009766, + "rewards/rejected": -34.33158493041992, + "step": 1102 + }, + { + "epoch": 0.686158631415241, + "grad_norm": 0.0006199941853992641, + "learning_rate": 4.285384970032273e-06, + "logits/chosen": -0.5382259488105774, + "logits/rejected": 1.0772992372512817, + "logps/chosen": -622.629638671875, + "logps/rejected": -902.0396118164062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.727117538452148, + "rewards/margins": 21.430234909057617, + "rewards/rejected": -32.157352447509766, + "step": 1103 + }, + { + "epoch": 0.6867807153965786, + "grad_norm": 7.48344612121582, + "learning_rate": 4.284232365145228e-06, + "logits/chosen": 1.367098093032837, + "logits/rejected": 4.316980361938477, + "logps/chosen": -645.3848266601562, + "logps/rejected": -1083.33642578125, + "loss": 0.0259, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.908622741699219, + "rewards/margins": 25.33052635192871, + "rewards/rejected": -39.23915100097656, + "step": 1104 + }, + { + "epoch": 0.687402799377916, + "grad_norm": 0.026603125035762787, + "learning_rate": 4.283079760258183e-06, + "logits/chosen": 0.3288910984992981, + "logits/rejected": 2.3766226768493652, + "logps/chosen": -659.520751953125, + "logps/rejected": -1085.520263671875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.07796859741211, + "rewards/margins": 29.936979293823242, + "rewards/rejected": -44.01494598388672, + "step": 1105 + }, + { + "epoch": 0.6880248833592535, + "grad_norm": 0.002381574595347047, + "learning_rate": 4.281927155371139e-06, + "logits/chosen": -1.4142065048217773, + "logits/rejected": 3.241100311279297, + "logps/chosen": -346.6325378417969, + "logps/rejected": -852.815185546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.068493366241455, + "rewards/margins": 26.232025146484375, + "rewards/rejected": -33.30052185058594, + "step": 1106 + }, + { + "epoch": 0.6886469673405909, + "grad_norm": 0.00010436380398459733, + "learning_rate": 4.2807745504840945e-06, + "logits/chosen": 2.5706067085266113, + "logits/rejected": 4.239389419555664, + "logps/chosen": -624.5850219726562, + "logps/rejected": -1006.6663818359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.321513175964355, + "rewards/margins": 25.764984130859375, + "rewards/rejected": -36.08649826049805, + "step": 1107 + }, + { + "epoch": 0.6892690513219285, + "grad_norm": 31.843982696533203, + "learning_rate": 4.27962194559705e-06, + "logits/chosen": -0.6637098789215088, + "logits/rejected": 5.162286281585693, + "logps/chosen": -434.01593017578125, + "logps/rejected": -1057.7811279296875, + "loss": 0.5937, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.872724533081055, + "rewards/margins": 28.320775985717773, + "rewards/rejected": -36.19350051879883, + "step": 1108 + }, + { + "epoch": 0.689891135303266, + "grad_norm": 47.80104446411133, + "learning_rate": 4.278469340710005e-06, + "logits/chosen": 0.11493664979934692, + "logits/rejected": 4.293163299560547, + "logps/chosen": -686.111083984375, + "logps/rejected": -1146.0205078125, + "loss": 1.3721, + "rewards/accuracies": 0.875, + "rewards/chosen": -17.41529083251953, + "rewards/margins": 23.800018310546875, + "rewards/rejected": -41.215309143066406, + "step": 1109 + }, + { + "epoch": 0.6905132192846034, + "grad_norm": 0.004325952846556902, + "learning_rate": 4.27731673582296e-06, + "logits/chosen": 2.2962706089019775, + "logits/rejected": 4.085445404052734, + "logps/chosen": -549.131591796875, + "logps/rejected": -846.0113525390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.297896385192871, + "rewards/margins": 24.9063663482666, + "rewards/rejected": -33.204261779785156, + "step": 1110 + }, + { + "epoch": 0.6911353032659409, + "grad_norm": 0.001067809178493917, + "learning_rate": 4.2761641309359154e-06, + "logits/chosen": 2.287487506866455, + "logits/rejected": 4.490987300872803, + "logps/chosen": -520.4144897460938, + "logps/rejected": -875.7572631835938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.179506301879883, + "rewards/margins": 23.131309509277344, + "rewards/rejected": -31.310815811157227, + "step": 1111 + }, + { + "epoch": 0.6917573872472784, + "grad_norm": 0.005620141979306936, + "learning_rate": 4.275011526048871e-06, + "logits/chosen": -0.24929457902908325, + "logits/rejected": 3.7390763759613037, + "logps/chosen": -464.5872497558594, + "logps/rejected": -948.9923706054688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.896591186523438, + "rewards/margins": 24.613645553588867, + "rewards/rejected": -34.51023864746094, + "step": 1112 + }, + { + "epoch": 0.6923794712286159, + "grad_norm": 1.4482674598693848, + "learning_rate": 4.273858921161826e-06, + "logits/chosen": -0.8546969294548035, + "logits/rejected": 2.3102991580963135, + "logps/chosen": -460.37567138671875, + "logps/rejected": -781.9107666015625, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.514959335327148, + "rewards/margins": 17.49068832397461, + "rewards/rejected": -27.005647659301758, + "step": 1113 + }, + { + "epoch": 0.6930015552099533, + "grad_norm": 0.015572289004921913, + "learning_rate": 4.272706316274781e-06, + "logits/chosen": -0.16300639510154724, + "logits/rejected": 4.673491954803467, + "logps/chosen": -304.6646728515625, + "logps/rejected": -899.9229736328125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.211049556732178, + "rewards/margins": 29.155414581298828, + "rewards/rejected": -34.36646270751953, + "step": 1114 + }, + { + "epoch": 0.6936236391912908, + "grad_norm": 0.4332619309425354, + "learning_rate": 4.271553711387736e-06, + "logits/chosen": -0.7185725569725037, + "logits/rejected": 2.5487172603607178, + "logps/chosen": -564.232421875, + "logps/rejected": -977.0563354492188, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.150856971740723, + "rewards/margins": 24.530282974243164, + "rewards/rejected": -33.6811408996582, + "step": 1115 + }, + { + "epoch": 0.6942457231726283, + "grad_norm": 0.03866080194711685, + "learning_rate": 4.2704011065006916e-06, + "logits/chosen": -0.015806496143341064, + "logits/rejected": 3.439502477645874, + "logps/chosen": -407.9251403808594, + "logps/rejected": -908.4151000976562, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.0901570320129395, + "rewards/margins": 28.038902282714844, + "rewards/rejected": -33.129058837890625, + "step": 1116 + }, + { + "epoch": 0.6948678071539658, + "grad_norm": 0.0025013545528054237, + "learning_rate": 4.269248501613647e-06, + "logits/chosen": -2.3568055629730225, + "logits/rejected": 3.969569683074951, + "logps/chosen": -347.612060546875, + "logps/rejected": -945.9745483398438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6943399906158447, + "rewards/margins": 22.949398040771484, + "rewards/rejected": -26.643739700317383, + "step": 1117 + }, + { + "epoch": 0.6954898911353032, + "grad_norm": 15.973790168762207, + "learning_rate": 4.268095896726602e-06, + "logits/chosen": 1.2299774885177612, + "logits/rejected": 2.655921697616577, + "logps/chosen": -528.4793090820312, + "logps/rejected": -724.660888671875, + "loss": 0.2251, + "rewards/accuracies": 0.75, + "rewards/chosen": -7.363526344299316, + "rewards/margins": 11.236658096313477, + "rewards/rejected": -18.600183486938477, + "step": 1118 + }, + { + "epoch": 0.6961119751166407, + "grad_norm": 1.4213616847991943, + "learning_rate": 4.266943291839557e-06, + "logits/chosen": -0.38948550820350647, + "logits/rejected": 4.425605773925781, + "logps/chosen": -447.60528564453125, + "logps/rejected": -929.9447021484375, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.339234352111816, + "rewards/margins": 24.30902862548828, + "rewards/rejected": -29.648265838623047, + "step": 1119 + }, + { + "epoch": 0.6967340590979783, + "grad_norm": 0.3841400742530823, + "learning_rate": 4.2657906869525125e-06, + "logits/chosen": -2.0254392623901367, + "logits/rejected": 3.4356141090393066, + "logps/chosen": -338.26055908203125, + "logps/rejected": -872.2510986328125, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.423895835876465, + "rewards/margins": 25.691959381103516, + "rewards/rejected": -32.1158561706543, + "step": 1120 + }, + { + "epoch": 0.6973561430793157, + "grad_norm": 4.166716394138348e-08, + "learning_rate": 4.2646380820654685e-06, + "logits/chosen": 2.1841678619384766, + "logits/rejected": 4.150865077972412, + "logps/chosen": -758.68896484375, + "logps/rejected": -1092.0853271484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.268762588500977, + "rewards/margins": 25.824085235595703, + "rewards/rejected": -36.09284973144531, + "step": 1121 + }, + { + "epoch": 0.6979782270606532, + "grad_norm": 9.886246516543906e-06, + "learning_rate": 4.263485477178424e-06, + "logits/chosen": 1.7041479349136353, + "logits/rejected": 3.58937406539917, + "logps/chosen": -578.86767578125, + "logps/rejected": -935.6997680664062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.358209609985352, + "rewards/margins": 23.363967895507812, + "rewards/rejected": -31.722179412841797, + "step": 1122 + }, + { + "epoch": 0.6986003110419907, + "grad_norm": 4.791447639465332, + "learning_rate": 4.262332872291379e-06, + "logits/chosen": 1.879792332649231, + "logits/rejected": 2.9788360595703125, + "logps/chosen": -565.7600708007812, + "logps/rejected": -867.056396484375, + "loss": 0.0197, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.71964168548584, + "rewards/margins": 20.734661102294922, + "rewards/rejected": -30.454303741455078, + "step": 1123 + }, + { + "epoch": 0.6992223950233282, + "grad_norm": 29.363506317138672, + "learning_rate": 4.261180267404334e-06, + "logits/chosen": 1.0604872703552246, + "logits/rejected": 3.045793294906616, + "logps/chosen": -479.184326171875, + "logps/rejected": -773.6400146484375, + "loss": 0.3521, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.974681854248047, + "rewards/margins": 16.952167510986328, + "rewards/rejected": -21.926849365234375, + "step": 1124 + }, + { + "epoch": 0.6998444790046656, + "grad_norm": 15.334929466247559, + "learning_rate": 4.2600276625172894e-06, + "logits/chosen": -2.5029263496398926, + "logits/rejected": 1.321319341659546, + "logps/chosen": -454.6629943847656, + "logps/rejected": -864.0693359375, + "loss": 0.063, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.473987579345703, + "rewards/margins": 19.154024124145508, + "rewards/rejected": -28.628009796142578, + "step": 1125 + }, + { + "epoch": 0.7004665629860031, + "grad_norm": 7.4878207669826224e-06, + "learning_rate": 4.258875057630245e-06, + "logits/chosen": 2.3144640922546387, + "logits/rejected": 3.8180489540100098, + "logps/chosen": -532.5983276367188, + "logps/rejected": -865.9114990234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.514817237854004, + "rewards/margins": 27.909273147583008, + "rewards/rejected": -37.42408752441406, + "step": 1126 + }, + { + "epoch": 0.7010886469673406, + "grad_norm": 0.5590468645095825, + "learning_rate": 4.2577224527432e-06, + "logits/chosen": 1.154442310333252, + "logits/rejected": 4.903882026672363, + "logps/chosen": -610.6256713867188, + "logps/rejected": -1103.4720458984375, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.860889434814453, + "rewards/margins": 27.493499755859375, + "rewards/rejected": -38.35438919067383, + "step": 1127 + }, + { + "epoch": 0.7017107309486781, + "grad_norm": 0.0033634125720709562, + "learning_rate": 4.256569847856155e-06, + "logits/chosen": -0.3866727948188782, + "logits/rejected": 1.2385426759719849, + "logps/chosen": -459.158935546875, + "logps/rejected": -760.5558471679688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.200139999389648, + "rewards/margins": 20.704795837402344, + "rewards/rejected": -28.904937744140625, + "step": 1128 + }, + { + "epoch": 0.7023328149300155, + "grad_norm": 0.00017669117369223386, + "learning_rate": 4.25541724296911e-06, + "logits/chosen": -0.9125028252601624, + "logits/rejected": 2.94661808013916, + "logps/chosen": -439.4700927734375, + "logps/rejected": -901.3716430664062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.392777919769287, + "rewards/margins": 26.68711280822754, + "rewards/rejected": -34.07988739013672, + "step": 1129 + }, + { + "epoch": 0.702954898911353, + "grad_norm": 20.32399559020996, + "learning_rate": 4.2542646380820656e-06, + "logits/chosen": 2.0383431911468506, + "logits/rejected": 4.066139221191406, + "logps/chosen": -680.9654541015625, + "logps/rejected": -1033.8470458984375, + "loss": 0.1797, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.162495613098145, + "rewards/margins": 18.726539611816406, + "rewards/rejected": -30.889034271240234, + "step": 1130 + }, + { + "epoch": 0.7035769828926906, + "grad_norm": 0.00040257195360027254, + "learning_rate": 4.253112033195021e-06, + "logits/chosen": 1.1132146120071411, + "logits/rejected": 2.3047876358032227, + "logps/chosen": -578.9852905273438, + "logps/rejected": -819.7774658203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.081069946289062, + "rewards/margins": 21.187292098999023, + "rewards/rejected": -30.268362045288086, + "step": 1131 + }, + { + "epoch": 0.704199066874028, + "grad_norm": 0.03850219398736954, + "learning_rate": 4.251959428307976e-06, + "logits/chosen": -2.612558364868164, + "logits/rejected": 2.0713813304901123, + "logps/chosen": -407.8648681640625, + "logps/rejected": -971.5011596679688, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.519382476806641, + "rewards/margins": 29.602420806884766, + "rewards/rejected": -35.121803283691406, + "step": 1132 + }, + { + "epoch": 0.7048211508553655, + "grad_norm": 11.934901237487793, + "learning_rate": 4.250806823420931e-06, + "logits/chosen": -0.5439414978027344, + "logits/rejected": 3.837416172027588, + "logps/chosen": -493.5166320800781, + "logps/rejected": -1030.0224609375, + "loss": 0.0808, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.399980545043945, + "rewards/margins": 22.14865493774414, + "rewards/rejected": -31.54863166809082, + "step": 1133 + }, + { + "epoch": 0.7054432348367029, + "grad_norm": 0.003834787290543318, + "learning_rate": 4.2496542185338864e-06, + "logits/chosen": 0.1230611801147461, + "logits/rejected": 2.813925266265869, + "logps/chosen": -550.717529296875, + "logps/rejected": -868.7197875976562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.865203857421875, + "rewards/margins": 20.2117919921875, + "rewards/rejected": -27.076993942260742, + "step": 1134 + }, + { + "epoch": 0.7060653188180405, + "grad_norm": 0.04540088772773743, + "learning_rate": 4.2485016136468425e-06, + "logits/chosen": -1.5632213354110718, + "logits/rejected": 2.7671170234680176, + "logps/chosen": -450.50579833984375, + "logps/rejected": -946.9651489257812, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.025535583496094, + "rewards/margins": 23.097501754760742, + "rewards/rejected": -33.12303924560547, + "step": 1135 + }, + { + "epoch": 0.7066874027993779, + "grad_norm": 5.21671724319458, + "learning_rate": 4.247349008759798e-06, + "logits/chosen": -2.5789096355438232, + "logits/rejected": 2.2848589420318604, + "logps/chosen": -390.320556640625, + "logps/rejected": -940.9581298828125, + "loss": 0.0129, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.420944213867188, + "rewards/margins": 23.340347290039062, + "rewards/rejected": -31.76129150390625, + "step": 1136 + }, + { + "epoch": 0.7073094867807154, + "grad_norm": 2.545893430709839, + "learning_rate": 4.246196403872753e-06, + "logits/chosen": 1.3946446180343628, + "logits/rejected": 3.885446071624756, + "logps/chosen": -593.1643676757812, + "logps/rejected": -894.2576904296875, + "loss": 0.0122, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.472441673278809, + "rewards/margins": 18.71339225769043, + "rewards/rejected": -28.185832977294922, + "step": 1137 + }, + { + "epoch": 0.7079315707620529, + "grad_norm": 0.9141021370887756, + "learning_rate": 4.245043798985708e-06, + "logits/chosen": -1.6937819719314575, + "logits/rejected": 3.2571754455566406, + "logps/chosen": -420.5745544433594, + "logps/rejected": -904.1632690429688, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.709376335144043, + "rewards/margins": 21.777610778808594, + "rewards/rejected": -29.486984252929688, + "step": 1138 + }, + { + "epoch": 0.7085536547433904, + "grad_norm": 17.139318466186523, + "learning_rate": 4.243891194098663e-06, + "logits/chosen": 0.8341930508613586, + "logits/rejected": 2.6119236946105957, + "logps/chosen": -627.296630859375, + "logps/rejected": -942.0413818359375, + "loss": 0.3848, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.913735389709473, + "rewards/margins": 20.716289520263672, + "rewards/rejected": -32.630027770996094, + "step": 1139 + }, + { + "epoch": 0.7091757387247278, + "grad_norm": 0.12485391646623611, + "learning_rate": 4.242738589211619e-06, + "logits/chosen": 1.4291424751281738, + "logits/rejected": 4.054292678833008, + "logps/chosen": -568.59326171875, + "logps/rejected": -992.9465942382812, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.06442642211914, + "rewards/margins": 25.010478973388672, + "rewards/rejected": -34.07490539550781, + "step": 1140 + }, + { + "epoch": 0.7097978227060653, + "grad_norm": 0.08407150208950043, + "learning_rate": 4.241585984324574e-06, + "logits/chosen": 0.009067535400390625, + "logits/rejected": 3.6980178356170654, + "logps/chosen": -557.7619018554688, + "logps/rejected": -1060.493896484375, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.739320755004883, + "rewards/margins": 27.374553680419922, + "rewards/rejected": -37.11387634277344, + "step": 1141 + }, + { + "epoch": 0.7104199066874028, + "grad_norm": 0.9794238209724426, + "learning_rate": 4.240433379437529e-06, + "logits/chosen": 1.6314430236816406, + "logits/rejected": 3.8010740280151367, + "logps/chosen": -520.548583984375, + "logps/rejected": -840.6610107421875, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.92463493347168, + "rewards/margins": 18.227628707885742, + "rewards/rejected": -27.152263641357422, + "step": 1142 + }, + { + "epoch": 0.7110419906687403, + "grad_norm": 0.0969887226819992, + "learning_rate": 4.239280774550484e-06, + "logits/chosen": 1.1301627159118652, + "logits/rejected": 4.129144668579102, + "logps/chosen": -588.8717651367188, + "logps/rejected": -987.9451904296875, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.679798126220703, + "rewards/margins": 20.80878448486328, + "rewards/rejected": -32.488582611083984, + "step": 1143 + }, + { + "epoch": 0.7116640746500777, + "grad_norm": 0.00018440843268763274, + "learning_rate": 4.2381281696634395e-06, + "logits/chosen": -4.2730536460876465, + "logits/rejected": 1.6310522556304932, + "logps/chosen": -379.5325622558594, + "logps/rejected": -1090.905517578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.479127883911133, + "rewards/margins": 27.126827239990234, + "rewards/rejected": -36.60595703125, + "step": 1144 + }, + { + "epoch": 0.7122861586314152, + "grad_norm": 5.640890321956249e-06, + "learning_rate": 4.236975564776395e-06, + "logits/chosen": -0.5040819048881531, + "logits/rejected": 2.4404137134552, + "logps/chosen": -602.3915405273438, + "logps/rejected": -1003.7799072265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.199384689331055, + "rewards/margins": 25.80965232849121, + "rewards/rejected": -38.009037017822266, + "step": 1145 + }, + { + "epoch": 0.7129082426127528, + "grad_norm": 0.0018288391875103116, + "learning_rate": 4.23582295988935e-06, + "logits/chosen": -2.6323633193969727, + "logits/rejected": 3.2968077659606934, + "logps/chosen": -274.1453857421875, + "logps/rejected": -953.5428466796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.15943717956543, + "rewards/margins": 31.97180938720703, + "rewards/rejected": -38.131248474121094, + "step": 1146 + }, + { + "epoch": 0.7135303265940902, + "grad_norm": 0.5116393566131592, + "learning_rate": 4.234670355002305e-06, + "logits/chosen": 2.3888742923736572, + "logits/rejected": 4.2413763999938965, + "logps/chosen": -573.247314453125, + "logps/rejected": -932.5107421875, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.365524291992188, + "rewards/margins": 23.069791793823242, + "rewards/rejected": -33.43531799316406, + "step": 1147 + }, + { + "epoch": 0.7141524105754277, + "grad_norm": 1.6715589481464121e-06, + "learning_rate": 4.2335177501152604e-06, + "logits/chosen": -1.7874610424041748, + "logits/rejected": 1.3828914165496826, + "logps/chosen": -511.89208984375, + "logps/rejected": -1041.0760498046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.092294692993164, + "rewards/margins": 31.230894088745117, + "rewards/rejected": -45.32318878173828, + "step": 1148 + }, + { + "epoch": 0.7147744945567651, + "grad_norm": 0.0019122587982565165, + "learning_rate": 4.232365145228216e-06, + "logits/chosen": -0.33687490224838257, + "logits/rejected": 2.45595121383667, + "logps/chosen": -480.1834411621094, + "logps/rejected": -849.4967041015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.1947479248046875, + "rewards/margins": 21.8647518157959, + "rewards/rejected": -29.059499740600586, + "step": 1149 + }, + { + "epoch": 0.7153965785381027, + "grad_norm": 1.4362432956695557, + "learning_rate": 4.231212540341172e-06, + "logits/chosen": -0.1586349606513977, + "logits/rejected": 3.885680675506592, + "logps/chosen": -409.5668640136719, + "logps/rejected": -885.145263671875, + "loss": 0.0097, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.473544120788574, + "rewards/margins": 24.60749626159668, + "rewards/rejected": -33.08103942871094, + "step": 1150 + }, + { + "epoch": 0.7160186625194401, + "grad_norm": 0.010208888910710812, + "learning_rate": 4.230059935454127e-06, + "logits/chosen": -0.2658725380897522, + "logits/rejected": 3.370497703552246, + "logps/chosen": -574.7543334960938, + "logps/rejected": -974.1325073242188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.140201568603516, + "rewards/margins": 26.478336334228516, + "rewards/rejected": -35.61853790283203, + "step": 1151 + }, + { + "epoch": 0.7166407465007776, + "grad_norm": 36.79997634887695, + "learning_rate": 4.228907330567082e-06, + "logits/chosen": 1.2792608737945557, + "logits/rejected": 3.2375693321228027, + "logps/chosen": -534.0382690429688, + "logps/rejected": -953.0762939453125, + "loss": 0.1517, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.882672309875488, + "rewards/margins": 26.688556671142578, + "rewards/rejected": -39.57122802734375, + "step": 1152 + }, + { + "epoch": 0.717262830482115, + "grad_norm": 0.035786353051662445, + "learning_rate": 4.227754725680037e-06, + "logits/chosen": -1.173093318939209, + "logits/rejected": 4.787845134735107, + "logps/chosen": -524.6954956054688, + "logps/rejected": -1209.607666015625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.333581924438477, + "rewards/margins": 33.21443176269531, + "rewards/rejected": -48.548011779785156, + "step": 1153 + }, + { + "epoch": 0.7178849144634526, + "grad_norm": 3.625681088692545e-08, + "learning_rate": 4.226602120792993e-06, + "logits/chosen": -1.6249737739562988, + "logits/rejected": 4.376003265380859, + "logps/chosen": -340.1498107910156, + "logps/rejected": -1045.368896484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.608695030212402, + "rewards/margins": 34.630897521972656, + "rewards/rejected": -43.23958969116211, + "step": 1154 + }, + { + "epoch": 0.71850699844479, + "grad_norm": 2.848944689048949e-07, + "learning_rate": 4.225449515905948e-06, + "logits/chosen": 2.577023983001709, + "logits/rejected": 4.594021320343018, + "logps/chosen": -614.6937255859375, + "logps/rejected": -1042.291259765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.032281875610352, + "rewards/margins": 31.21833038330078, + "rewards/rejected": -41.250614166259766, + "step": 1155 + }, + { + "epoch": 0.7191290824261275, + "grad_norm": 38.33723068237305, + "learning_rate": 4.224296911018903e-06, + "logits/chosen": 1.7918505668640137, + "logits/rejected": 2.563223361968994, + "logps/chosen": -577.7158203125, + "logps/rejected": -873.4849243164062, + "loss": 1.1238, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.859672546386719, + "rewards/margins": 15.18758773803711, + "rewards/rejected": -25.047260284423828, + "step": 1156 + }, + { + "epoch": 0.7197511664074651, + "grad_norm": 0.010270086117088795, + "learning_rate": 4.223144306131858e-06, + "logits/chosen": -1.845631718635559, + "logits/rejected": 3.4590423107147217, + "logps/chosen": -422.4835510253906, + "logps/rejected": -1021.2655029296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.028568267822266, + "rewards/margins": 28.59836196899414, + "rewards/rejected": -38.626930236816406, + "step": 1157 + }, + { + "epoch": 0.7203732503888025, + "grad_norm": 39.00583267211914, + "learning_rate": 4.2219917012448135e-06, + "logits/chosen": 2.3478219509124756, + "logits/rejected": 3.523054599761963, + "logps/chosen": -624.9152221679688, + "logps/rejected": -920.3958129882812, + "loss": 1.2437, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.256948471069336, + "rewards/margins": 21.686954498291016, + "rewards/rejected": -34.94390106201172, + "step": 1158 + }, + { + "epoch": 0.72099533437014, + "grad_norm": 26.327835083007812, + "learning_rate": 4.220839096357769e-06, + "logits/chosen": 2.0256595611572266, + "logits/rejected": 4.168939590454102, + "logps/chosen": -581.934814453125, + "logps/rejected": -965.870361328125, + "loss": 0.4675, + "rewards/accuracies": 0.875, + "rewards/chosen": -14.27999496459961, + "rewards/margins": 23.65022850036621, + "rewards/rejected": -37.93022155761719, + "step": 1159 + }, + { + "epoch": 0.7216174183514774, + "grad_norm": 30.75332260131836, + "learning_rate": 4.219686491470724e-06, + "logits/chosen": -1.0962412357330322, + "logits/rejected": 4.113137245178223, + "logps/chosen": -478.94451904296875, + "logps/rejected": -908.378173828125, + "loss": 0.731, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.116175651550293, + "rewards/margins": 23.1622314453125, + "rewards/rejected": -34.278404235839844, + "step": 1160 + }, + { + "epoch": 0.722239502332815, + "grad_norm": 4.19508695602417, + "learning_rate": 4.218533886583679e-06, + "logits/chosen": 1.5392760038375854, + "logits/rejected": 3.2550981044769287, + "logps/chosen": -615.6009521484375, + "logps/rejected": -1001.22021484375, + "loss": 0.0196, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.858214378356934, + "rewards/margins": 21.70871925354004, + "rewards/rejected": -36.566932678222656, + "step": 1161 + }, + { + "epoch": 0.7228615863141524, + "grad_norm": 15.874382019042969, + "learning_rate": 4.2173812816966344e-06, + "logits/chosen": 0.26850560307502747, + "logits/rejected": 1.8894236087799072, + "logps/chosen": -580.573974609375, + "logps/rejected": -969.2291870117188, + "loss": 0.1518, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.876068115234375, + "rewards/margins": 26.33701515197754, + "rewards/rejected": -37.21308135986328, + "step": 1162 + }, + { + "epoch": 0.7234836702954899, + "grad_norm": 0.0005865055718459189, + "learning_rate": 4.21622867680959e-06, + "logits/chosen": -1.2452207803726196, + "logits/rejected": 2.307518482208252, + "logps/chosen": -477.93206787109375, + "logps/rejected": -966.4757690429688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.003711700439453, + "rewards/margins": 25.798137664794922, + "rewards/rejected": -36.801849365234375, + "step": 1163 + }, + { + "epoch": 0.7241057542768273, + "grad_norm": 31.254302978515625, + "learning_rate": 4.215076071922546e-06, + "logits/chosen": -1.0185731649398804, + "logits/rejected": 2.684237480163574, + "logps/chosen": -494.90966796875, + "logps/rejected": -883.1465454101562, + "loss": 0.2146, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.114580154418945, + "rewards/margins": 20.16065788269043, + "rewards/rejected": -30.275238037109375, + "step": 1164 + }, + { + "epoch": 0.7247278382581649, + "grad_norm": 0.04852492734789848, + "learning_rate": 4.213923467035501e-06, + "logits/chosen": 0.2611873149871826, + "logits/rejected": 3.668205738067627, + "logps/chosen": -466.4119873046875, + "logps/rejected": -863.919189453125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.094996452331543, + "rewards/margins": 22.080657958984375, + "rewards/rejected": -31.1756534576416, + "step": 1165 + }, + { + "epoch": 0.7253499222395023, + "grad_norm": 41.72018051147461, + "learning_rate": 4.212770862148456e-06, + "logits/chosen": 0.11253970861434937, + "logits/rejected": 3.1600046157836914, + "logps/chosen": -525.0753784179688, + "logps/rejected": -816.2158813476562, + "loss": 1.3193, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.143072128295898, + "rewards/margins": 22.540931701660156, + "rewards/rejected": -33.68400192260742, + "step": 1166 + }, + { + "epoch": 0.7259720062208398, + "grad_norm": 0.019932212308049202, + "learning_rate": 4.211618257261411e-06, + "logits/chosen": 2.4527406692504883, + "logits/rejected": 3.221583843231201, + "logps/chosen": -673.4598388671875, + "logps/rejected": -895.5223388671875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.391605377197266, + "rewards/margins": 19.719661712646484, + "rewards/rejected": -30.111265182495117, + "step": 1167 + }, + { + "epoch": 0.7265940902021772, + "grad_norm": 0.0018634117441251874, + "learning_rate": 4.210465652374367e-06, + "logits/chosen": 2.2655115127563477, + "logits/rejected": 3.0503270626068115, + "logps/chosen": -653.0009155273438, + "logps/rejected": -930.0877075195312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.022181510925293, + "rewards/margins": 25.00848388671875, + "rewards/rejected": -38.03066635131836, + "step": 1168 + }, + { + "epoch": 0.7272161741835148, + "grad_norm": 4.644537448883057, + "learning_rate": 4.209313047487322e-06, + "logits/chosen": 0.016414497047662735, + "logits/rejected": 4.014350891113281, + "logps/chosen": -466.252197265625, + "logps/rejected": -1043.4276123046875, + "loss": 0.0181, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.585433959960938, + "rewards/margins": 28.32579803466797, + "rewards/rejected": -37.911231994628906, + "step": 1169 + }, + { + "epoch": 0.7278382581648523, + "grad_norm": 47.371559143066406, + "learning_rate": 4.208160442600277e-06, + "logits/chosen": 0.4835778772830963, + "logits/rejected": 2.874391555786133, + "logps/chosen": -633.0033569335938, + "logps/rejected": -1000.6109619140625, + "loss": 0.6942, + "rewards/accuracies": 0.875, + "rewards/chosen": -16.999771118164062, + "rewards/margins": 18.929607391357422, + "rewards/rejected": -35.929378509521484, + "step": 1170 + }, + { + "epoch": 0.7284603421461897, + "grad_norm": 5.057783603668213, + "learning_rate": 4.207007837713232e-06, + "logits/chosen": -0.22269773483276367, + "logits/rejected": 3.5258021354675293, + "logps/chosen": -513.5776977539062, + "logps/rejected": -1006.0166015625, + "loss": 0.0232, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.099183082580566, + "rewards/margins": 25.058616638183594, + "rewards/rejected": -36.157798767089844, + "step": 1171 + }, + { + "epoch": 0.7290824261275272, + "grad_norm": 30.32122802734375, + "learning_rate": 4.2058552328261875e-06, + "logits/chosen": -0.032325103878974915, + "logits/rejected": 3.461878776550293, + "logps/chosen": -534.3825073242188, + "logps/rejected": -879.685302734375, + "loss": 0.269, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.895355224609375, + "rewards/margins": 17.291181564331055, + "rewards/rejected": -27.18653678894043, + "step": 1172 + }, + { + "epoch": 0.7297045101088647, + "grad_norm": 5.359790802001953, + "learning_rate": 4.204702627939143e-06, + "logits/chosen": 0.5488654971122742, + "logits/rejected": 3.726155996322632, + "logps/chosen": -459.772216796875, + "logps/rejected": -815.4249267578125, + "loss": 0.0381, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.793237686157227, + "rewards/margins": 21.808074951171875, + "rewards/rejected": -31.6013126373291, + "step": 1173 + }, + { + "epoch": 0.7303265940902022, + "grad_norm": 7.4824442863464355, + "learning_rate": 4.203550023052098e-06, + "logits/chosen": 0.13619333505630493, + "logits/rejected": 4.627652168273926, + "logps/chosen": -367.88629150390625, + "logps/rejected": -852.6173095703125, + "loss": 0.0497, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.94129467010498, + "rewards/margins": 24.91082000732422, + "rewards/rejected": -33.85211181640625, + "step": 1174 + }, + { + "epoch": 0.7309486780715396, + "grad_norm": 0.19221581518650055, + "learning_rate": 4.202397418165053e-06, + "logits/chosen": 0.3853622376918793, + "logits/rejected": 3.9664766788482666, + "logps/chosen": -639.0650634765625, + "logps/rejected": -1077.99169921875, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.3831148147583, + "rewards/margins": 20.84623908996582, + "rewards/rejected": -30.229352951049805, + "step": 1175 + }, + { + "epoch": 0.7315707620528772, + "grad_norm": 0.00024160981411114335, + "learning_rate": 4.2012448132780084e-06, + "logits/chosen": -3.6084365844726562, + "logits/rejected": 2.1406548023223877, + "logps/chosen": -383.0024108886719, + "logps/rejected": -1027.7147216796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.151406288146973, + "rewards/margins": 29.40199851989746, + "rewards/rejected": -35.55340576171875, + "step": 1176 + }, + { + "epoch": 0.7321928460342146, + "grad_norm": 0.12362519651651382, + "learning_rate": 4.200092208390964e-06, + "logits/chosen": -0.8688986301422119, + "logits/rejected": 3.6696877479553223, + "logps/chosen": -380.8484802246094, + "logps/rejected": -854.9744262695312, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.829305171966553, + "rewards/margins": 21.9825439453125, + "rewards/rejected": -28.811851501464844, + "step": 1177 + }, + { + "epoch": 0.7328149300155521, + "grad_norm": 0.08682712912559509, + "learning_rate": 4.198939603503919e-06, + "logits/chosen": -1.9073578119277954, + "logits/rejected": 2.8443100452423096, + "logps/chosen": -363.9940490722656, + "logps/rejected": -863.8155517578125, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.090331077575684, + "rewards/margins": 23.799715042114258, + "rewards/rejected": -29.890045166015625, + "step": 1178 + }, + { + "epoch": 0.7334370139968895, + "grad_norm": 0.023318586871027946, + "learning_rate": 4.197786998616875e-06, + "logits/chosen": 0.8904905319213867, + "logits/rejected": 2.4922831058502197, + "logps/chosen": -671.4569091796875, + "logps/rejected": -982.808837890625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.915792465209961, + "rewards/margins": 25.762042999267578, + "rewards/rejected": -34.67783737182617, + "step": 1179 + }, + { + "epoch": 0.7340590979782271, + "grad_norm": 16.87903594970703, + "learning_rate": 4.19663439372983e-06, + "logits/chosen": 3.0492799282073975, + "logits/rejected": 4.473697662353516, + "logps/chosen": -709.522216796875, + "logps/rejected": -964.0701904296875, + "loss": 0.2486, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.466982364654541, + "rewards/margins": 19.10428237915039, + "rewards/rejected": -26.571266174316406, + "step": 1180 + }, + { + "epoch": 0.7346811819595646, + "grad_norm": 0.0012558766175061464, + "learning_rate": 4.195481788842785e-06, + "logits/chosen": -0.866086483001709, + "logits/rejected": 3.2224390506744385, + "logps/chosen": -540.8012084960938, + "logps/rejected": -1071.502685546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.356019973754883, + "rewards/margins": 32.99429702758789, + "rewards/rejected": -45.35031509399414, + "step": 1181 + }, + { + "epoch": 0.735303265940902, + "grad_norm": 0.0014930395409464836, + "learning_rate": 4.194329183955741e-06, + "logits/chosen": -0.5875803232192993, + "logits/rejected": 2.544929265975952, + "logps/chosen": -353.6885681152344, + "logps/rejected": -829.4857177734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.364542961120605, + "rewards/margins": 27.51534652709961, + "rewards/rejected": -35.87989044189453, + "step": 1182 + }, + { + "epoch": 0.7359253499222395, + "grad_norm": 0.022201891988515854, + "learning_rate": 4.193176579068696e-06, + "logits/chosen": 0.19210612773895264, + "logits/rejected": 3.1613025665283203, + "logps/chosen": -617.490234375, + "logps/rejected": -954.2713012695312, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.373228073120117, + "rewards/margins": 24.7774600982666, + "rewards/rejected": -35.15068817138672, + "step": 1183 + }, + { + "epoch": 0.736547433903577, + "grad_norm": 1.701123595237732, + "learning_rate": 4.192023974181651e-06, + "logits/chosen": -0.042814530432224274, + "logits/rejected": 3.2058024406433105, + "logps/chosen": -491.13330078125, + "logps/rejected": -948.0253295898438, + "loss": 0.0081, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.885446548461914, + "rewards/margins": 28.518571853637695, + "rewards/rejected": -39.40401840209961, + "step": 1184 + }, + { + "epoch": 0.7371695178849145, + "grad_norm": 0.0002101602149195969, + "learning_rate": 4.190871369294606e-06, + "logits/chosen": 1.0134602785110474, + "logits/rejected": 2.7794322967529297, + "logps/chosen": -631.4107055664062, + "logps/rejected": -1013.0341186523438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.573670387268066, + "rewards/margins": 24.038246154785156, + "rewards/rejected": -35.611915588378906, + "step": 1185 + }, + { + "epoch": 0.7377916018662519, + "grad_norm": 37.85633087158203, + "learning_rate": 4.1897187644075615e-06, + "logits/chosen": 2.350338935852051, + "logits/rejected": 2.9470388889312744, + "logps/chosen": -744.1123046875, + "logps/rejected": -894.8653564453125, + "loss": 0.3682, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.176665306091309, + "rewards/margins": 11.495548248291016, + "rewards/rejected": -21.67221450805664, + "step": 1186 + }, + { + "epoch": 0.7384136858475894, + "grad_norm": 18.087539672851562, + "learning_rate": 4.188566159520517e-06, + "logits/chosen": 1.4269673824310303, + "logits/rejected": 4.23713493347168, + "logps/chosen": -613.511474609375, + "logps/rejected": -1102.857421875, + "loss": 0.1002, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.977934837341309, + "rewards/margins": 28.6257266998291, + "rewards/rejected": -38.60366439819336, + "step": 1187 + }, + { + "epoch": 0.7390357698289269, + "grad_norm": 4.416318461153423e-06, + "learning_rate": 4.187413554633472e-06, + "logits/chosen": 1.2357975244522095, + "logits/rejected": 5.414222240447998, + "logps/chosen": -494.890869140625, + "logps/rejected": -973.6812133789062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.658195972442627, + "rewards/margins": 33.96454620361328, + "rewards/rejected": -39.62274169921875, + "step": 1188 + }, + { + "epoch": 0.7396578538102644, + "grad_norm": 0.07317977398633957, + "learning_rate": 4.186260949746427e-06, + "logits/chosen": -2.1512906551361084, + "logits/rejected": 3.888598918914795, + "logps/chosen": -344.72454833984375, + "logps/rejected": -1022.0072021484375, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.827019691467285, + "rewards/margins": 32.47252655029297, + "rewards/rejected": -40.29954528808594, + "step": 1189 + }, + { + "epoch": 0.7402799377916018, + "grad_norm": 33.14171600341797, + "learning_rate": 4.185108344859382e-06, + "logits/chosen": -2.4892563819885254, + "logits/rejected": 3.2610530853271484, + "logps/chosen": -430.8138122558594, + "logps/rejected": -886.5509643554688, + "loss": 0.5152, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.59311056137085, + "rewards/margins": 19.534095764160156, + "rewards/rejected": -27.12720489501953, + "step": 1190 + }, + { + "epoch": 0.7409020217729394, + "grad_norm": 38.92852020263672, + "learning_rate": 4.183955739972338e-06, + "logits/chosen": 1.1341769695281982, + "logits/rejected": 3.1176342964172363, + "logps/chosen": -613.213134765625, + "logps/rejected": -884.856689453125, + "loss": 0.9723, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.557759284973145, + "rewards/margins": 15.217055320739746, + "rewards/rejected": -26.774816513061523, + "step": 1191 + }, + { + "epoch": 0.7415241057542769, + "grad_norm": 0.0014572658110409975, + "learning_rate": 4.182803135085293e-06, + "logits/chosen": -3.2773330211639404, + "logits/rejected": 2.6450986862182617, + "logps/chosen": -469.614501953125, + "logps/rejected": -1094.7154541015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.044410705566406, + "rewards/margins": 28.91035270690918, + "rewards/rejected": -37.95476531982422, + "step": 1192 + }, + { + "epoch": 0.7421461897356143, + "grad_norm": 51.88095474243164, + "learning_rate": 4.181650530198248e-06, + "logits/chosen": -0.44079411029815674, + "logits/rejected": 0.846016526222229, + "logps/chosen": -553.1190185546875, + "logps/rejected": -895.7581787109375, + "loss": 0.6423, + "rewards/accuracies": 0.75, + "rewards/chosen": -7.631755828857422, + "rewards/margins": 19.03414535522461, + "rewards/rejected": -26.665903091430664, + "step": 1193 + }, + { + "epoch": 0.7427682737169518, + "grad_norm": 1.7541396617889404, + "learning_rate": 4.180497925311204e-06, + "logits/chosen": 2.258620262145996, + "logits/rejected": 2.388881206512451, + "logps/chosen": -714.3488159179688, + "logps/rejected": -937.6739501953125, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.310730934143066, + "rewards/margins": 22.443029403686523, + "rewards/rejected": -35.753761291503906, + "step": 1194 + }, + { + "epoch": 0.7433903576982893, + "grad_norm": 41.518917083740234, + "learning_rate": 4.179345320424159e-06, + "logits/chosen": -0.8975342512130737, + "logits/rejected": 3.0638198852539062, + "logps/chosen": -586.2952880859375, + "logps/rejected": -1060.0189208984375, + "loss": 0.6023, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.759418487548828, + "rewards/margins": 23.457223892211914, + "rewards/rejected": -37.216644287109375, + "step": 1195 + }, + { + "epoch": 0.7440124416796268, + "grad_norm": 0.0007180199609138072, + "learning_rate": 4.178192715537115e-06, + "logits/chosen": 2.2216758728027344, + "logits/rejected": 0.9457840919494629, + "logps/chosen": -644.4976806640625, + "logps/rejected": -935.6669921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.617611885070801, + "rewards/margins": 24.9567928314209, + "rewards/rejected": -32.57440185546875, + "step": 1196 + }, + { + "epoch": 0.7446345256609642, + "grad_norm": 0.6821045875549316, + "learning_rate": 4.17704011065007e-06, + "logits/chosen": -0.23671680688858032, + "logits/rejected": 4.001680850982666, + "logps/chosen": -459.28765869140625, + "logps/rejected": -920.262451171875, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.031026840209961, + "rewards/margins": 26.548620223999023, + "rewards/rejected": -33.57965087890625, + "step": 1197 + }, + { + "epoch": 0.7452566096423017, + "grad_norm": 0.0007749017095193267, + "learning_rate": 4.175887505763025e-06, + "logits/chosen": 0.943214476108551, + "logits/rejected": 3.5111474990844727, + "logps/chosen": -541.9749145507812, + "logps/rejected": -922.7454833984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.438486099243164, + "rewards/margins": 21.518447875976562, + "rewards/rejected": -32.95693588256836, + "step": 1198 + }, + { + "epoch": 0.7458786936236392, + "grad_norm": 0.011360271833837032, + "learning_rate": 4.17473490087598e-06, + "logits/chosen": 0.1720401495695114, + "logits/rejected": 2.3051769733428955, + "logps/chosen": -502.39007568359375, + "logps/rejected": -829.52880859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.472356796264648, + "rewards/margins": 20.189279556274414, + "rewards/rejected": -28.661632537841797, + "step": 1199 + }, + { + "epoch": 0.7465007776049767, + "grad_norm": 0.048153672367334366, + "learning_rate": 4.1735822959889355e-06, + "logits/chosen": 0.5549330711364746, + "logits/rejected": 1.9446189403533936, + "logps/chosen": -512.0660400390625, + "logps/rejected": -945.5157470703125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.108501434326172, + "rewards/margins": 30.116321563720703, + "rewards/rejected": -38.224822998046875, + "step": 1200 + }, + { + "epoch": 0.7471228615863141, + "grad_norm": 0.3604937493801117, + "learning_rate": 4.172429691101891e-06, + "logits/chosen": -1.7650338411331177, + "logits/rejected": 4.217351913452148, + "logps/chosen": -459.73455810546875, + "logps/rejected": -1017.9171752929688, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.848245620727539, + "rewards/margins": 28.195613861083984, + "rewards/rejected": -36.04385757446289, + "step": 1201 + }, + { + "epoch": 0.7477449455676516, + "grad_norm": 8.572696685860137e-08, + "learning_rate": 4.171277086214846e-06, + "logits/chosen": -3.219414710998535, + "logits/rejected": 3.129903793334961, + "logps/chosen": -304.957763671875, + "logps/rejected": -1000.2400512695312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.250598907470703, + "rewards/margins": 32.68410110473633, + "rewards/rejected": -39.93470001220703, + "step": 1202 + }, + { + "epoch": 0.7483670295489891, + "grad_norm": 0.0002947688626591116, + "learning_rate": 4.170124481327801e-06, + "logits/chosen": -4.783797264099121, + "logits/rejected": 0.5106593370437622, + "logps/chosen": -353.1451416015625, + "logps/rejected": -933.575439453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.65771198272705, + "rewards/margins": 30.264278411865234, + "rewards/rejected": -38.92198944091797, + "step": 1203 + }, + { + "epoch": 0.7489891135303266, + "grad_norm": 0.25060757994651794, + "learning_rate": 4.168971876440756e-06, + "logits/chosen": -3.233412981033325, + "logits/rejected": 3.1030588150024414, + "logps/chosen": -330.5057373046875, + "logps/rejected": -871.0208740234375, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.052924394607544, + "rewards/margins": 21.390153884887695, + "rewards/rejected": -23.443077087402344, + "step": 1204 + }, + { + "epoch": 0.749611197511664, + "grad_norm": 26.881370544433594, + "learning_rate": 4.167819271553712e-06, + "logits/chosen": -0.15430384874343872, + "logits/rejected": 0.6273139715194702, + "logps/chosen": -479.899658203125, + "logps/rejected": -717.349365234375, + "loss": 0.3197, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.2665629386901855, + "rewards/margins": 14.102705955505371, + "rewards/rejected": -20.36927032470703, + "step": 1205 + }, + { + "epoch": 0.7502332814930015, + "grad_norm": 26.799030303955078, + "learning_rate": 4.166666666666667e-06, + "logits/chosen": 1.0275671482086182, + "logits/rejected": 3.896846294403076, + "logps/chosen": -617.9913330078125, + "logps/rejected": -1051.7125244140625, + "loss": 0.7833, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.258463859558105, + "rewards/margins": 28.130443572998047, + "rewards/rejected": -40.3889045715332, + "step": 1206 + }, + { + "epoch": 0.7508553654743391, + "grad_norm": 1.8569644453236833e-05, + "learning_rate": 4.165514061779622e-06, + "logits/chosen": -2.7208077907562256, + "logits/rejected": 1.4001092910766602, + "logps/chosen": -469.54425048828125, + "logps/rejected": -927.6661987304688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.268619537353516, + "rewards/margins": 24.516754150390625, + "rewards/rejected": -33.78537368774414, + "step": 1207 + }, + { + "epoch": 0.7514774494556765, + "grad_norm": 0.35053345561027527, + "learning_rate": 4.164361456892578e-06, + "logits/chosen": 1.7314445972442627, + "logits/rejected": 3.289358377456665, + "logps/chosen": -510.5986633300781, + "logps/rejected": -721.63720703125, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.4644880294799805, + "rewards/margins": 17.393329620361328, + "rewards/rejected": -24.857816696166992, + "step": 1208 + }, + { + "epoch": 0.752099533437014, + "grad_norm": 0.00014789852139074355, + "learning_rate": 4.163208852005533e-06, + "logits/chosen": -1.9375808238983154, + "logits/rejected": 4.110387802124023, + "logps/chosen": -506.71685791015625, + "logps/rejected": -1072.865234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.544163227081299, + "rewards/margins": 24.764442443847656, + "rewards/rejected": -32.3086051940918, + "step": 1209 + }, + { + "epoch": 0.7527216174183515, + "grad_norm": 15.979223251342773, + "learning_rate": 4.162056247118489e-06, + "logits/chosen": -2.863640308380127, + "logits/rejected": -0.7609111070632935, + "logps/chosen": -369.8997802734375, + "logps/rejected": -622.5078735351562, + "loss": 0.7803, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.1783270835876465, + "rewards/margins": 17.676910400390625, + "rewards/rejected": -24.85523796081543, + "step": 1210 + }, + { + "epoch": 0.753343701399689, + "grad_norm": 13.629114151000977, + "learning_rate": 4.160903642231444e-06, + "logits/chosen": -1.767147421836853, + "logits/rejected": 3.5947704315185547, + "logps/chosen": -434.3961181640625, + "logps/rejected": -929.360595703125, + "loss": 0.0854, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.125800132751465, + "rewards/margins": 20.63152313232422, + "rewards/rejected": -23.75732421875, + "step": 1211 + }, + { + "epoch": 0.7539657853810264, + "grad_norm": 23.103200912475586, + "learning_rate": 4.159751037344399e-06, + "logits/chosen": 2.1288254261016846, + "logits/rejected": 4.818546772003174, + "logps/chosen": -674.2977294921875, + "logps/rejected": -1065.44775390625, + "loss": 0.1744, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.28523063659668, + "rewards/margins": 26.10309600830078, + "rewards/rejected": -35.38832473754883, + "step": 1212 + }, + { + "epoch": 0.7545878693623639, + "grad_norm": 0.00010452913556946442, + "learning_rate": 4.158598432457354e-06, + "logits/chosen": -0.18677985668182373, + "logits/rejected": 3.669586658477783, + "logps/chosen": -445.21282958984375, + "logps/rejected": -975.4344482421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.2635016441345215, + "rewards/margins": 30.339672088623047, + "rewards/rejected": -35.60317611694336, + "step": 1213 + }, + { + "epoch": 0.7552099533437014, + "grad_norm": 0.026064734905958176, + "learning_rate": 4.1574458275703095e-06, + "logits/chosen": -2.9125752449035645, + "logits/rejected": 2.556290864944458, + "logps/chosen": -416.84442138671875, + "logps/rejected": -1013.16455078125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.045517921447754, + "rewards/margins": 28.890634536743164, + "rewards/rejected": -37.936153411865234, + "step": 1214 + }, + { + "epoch": 0.7558320373250389, + "grad_norm": 0.056555796414613724, + "learning_rate": 4.156293222683265e-06, + "logits/chosen": 0.5654237866401672, + "logits/rejected": 3.7856836318969727, + "logps/chosen": -555.9217529296875, + "logps/rejected": -896.5208740234375, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.224716186523438, + "rewards/margins": 21.14641571044922, + "rewards/rejected": -31.371129989624023, + "step": 1215 + }, + { + "epoch": 0.7564541213063763, + "grad_norm": 27.12925910949707, + "learning_rate": 4.15514061779622e-06, + "logits/chosen": 1.7580722570419312, + "logits/rejected": 4.319855690002441, + "logps/chosen": -604.033447265625, + "logps/rejected": -890.464111328125, + "loss": 0.2599, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.791042804718018, + "rewards/margins": 17.623245239257812, + "rewards/rejected": -25.414289474487305, + "step": 1216 + }, + { + "epoch": 0.7570762052877138, + "grad_norm": 48.94939422607422, + "learning_rate": 4.153988012909175e-06, + "logits/chosen": 0.016593068838119507, + "logits/rejected": 3.1171576976776123, + "logps/chosen": -501.7952880859375, + "logps/rejected": -860.7073364257812, + "loss": 1.7968, + "rewards/accuracies": 0.75, + "rewards/chosen": -12.870309829711914, + "rewards/margins": 20.11650276184082, + "rewards/rejected": -32.98681640625, + "step": 1217 + }, + { + "epoch": 0.7576982892690514, + "grad_norm": 0.00010765832848846912, + "learning_rate": 4.15283540802213e-06, + "logits/chosen": -0.6010594964027405, + "logits/rejected": 4.7762250900268555, + "logps/chosen": -502.10894775390625, + "logps/rejected": -1062.4873046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.751803398132324, + "rewards/margins": 29.6951904296875, + "rewards/rejected": -39.446990966796875, + "step": 1218 + }, + { + "epoch": 0.7583203732503888, + "grad_norm": 29.66597557067871, + "learning_rate": 4.151682803135086e-06, + "logits/chosen": 0.9268134832382202, + "logits/rejected": 2.4630675315856934, + "logps/chosen": -636.5853271484375, + "logps/rejected": -1010.4078369140625, + "loss": 0.3565, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.526751518249512, + "rewards/margins": 27.499366760253906, + "rewards/rejected": -37.02611541748047, + "step": 1219 + }, + { + "epoch": 0.7589424572317263, + "grad_norm": 40.08103561401367, + "learning_rate": 4.150530198248041e-06, + "logits/chosen": -1.796665072441101, + "logits/rejected": 2.3823959827423096, + "logps/chosen": -502.9656982421875, + "logps/rejected": -979.6764526367188, + "loss": 0.5065, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.961488723754883, + "rewards/margins": 22.71531105041504, + "rewards/rejected": -33.67679977416992, + "step": 1220 + }, + { + "epoch": 0.7595645412130637, + "grad_norm": 0.29949039220809937, + "learning_rate": 4.149377593360996e-06, + "logits/chosen": 2.081892728805542, + "logits/rejected": 3.9054107666015625, + "logps/chosen": -588.187744140625, + "logps/rejected": -923.739990234375, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.143125534057617, + "rewards/margins": 25.538053512573242, + "rewards/rejected": -33.68117904663086, + "step": 1221 + }, + { + "epoch": 0.7601866251944013, + "grad_norm": 5.415968189481646e-06, + "learning_rate": 4.148224988473951e-06, + "logits/chosen": 1.3517963886260986, + "logits/rejected": 3.5172603130340576, + "logps/chosen": -586.5282592773438, + "logps/rejected": -1011.800537109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.000934600830078, + "rewards/margins": 27.62999153137207, + "rewards/rejected": -38.630924224853516, + "step": 1222 + }, + { + "epoch": 0.7608087091757387, + "grad_norm": 0.0005389899015426636, + "learning_rate": 4.147072383586907e-06, + "logits/chosen": -0.550157368183136, + "logits/rejected": 1.908506989479065, + "logps/chosen": -415.641357421875, + "logps/rejected": -694.29345703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.685222148895264, + "rewards/margins": 19.236995697021484, + "rewards/rejected": -24.922218322753906, + "step": 1223 + }, + { + "epoch": 0.7614307931570762, + "grad_norm": 0.00261475401930511, + "learning_rate": 4.145919778699863e-06, + "logits/chosen": 0.1322479248046875, + "logits/rejected": 2.452498435974121, + "logps/chosen": -573.089111328125, + "logps/rejected": -906.893310546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.214548110961914, + "rewards/margins": 22.150157928466797, + "rewards/rejected": -32.364707946777344, + "step": 1224 + }, + { + "epoch": 0.7620528771384136, + "grad_norm": 0.026828749105334282, + "learning_rate": 4.144767173812818e-06, + "logits/chosen": 1.557422161102295, + "logits/rejected": 2.9317455291748047, + "logps/chosen": -688.3087158203125, + "logps/rejected": -935.5712890625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.075133323669434, + "rewards/margins": 18.204994201660156, + "rewards/rejected": -30.280128479003906, + "step": 1225 + }, + { + "epoch": 0.7626749611197512, + "grad_norm": 6.467380523681641, + "learning_rate": 4.143614568925773e-06, + "logits/chosen": 0.3639960289001465, + "logits/rejected": 3.995713233947754, + "logps/chosen": -623.2493286132812, + "logps/rejected": -1020.880126953125, + "loss": 0.0351, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.351022720336914, + "rewards/margins": 22.294944763183594, + "rewards/rejected": -33.64596939086914, + "step": 1226 + }, + { + "epoch": 0.7632970451010886, + "grad_norm": 0.0015312153846025467, + "learning_rate": 4.142461964038728e-06, + "logits/chosen": 2.2482800483703613, + "logits/rejected": 3.425135374069214, + "logps/chosen": -578.300048828125, + "logps/rejected": -859.5419921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.633014678955078, + "rewards/margins": 21.133586883544922, + "rewards/rejected": -31.7666015625, + "step": 1227 + }, + { + "epoch": 0.7639191290824261, + "grad_norm": 45.72669982910156, + "learning_rate": 4.1413093591516835e-06, + "logits/chosen": -0.774622917175293, + "logits/rejected": 3.128000259399414, + "logps/chosen": -628.8204345703125, + "logps/rejected": -994.5520629882812, + "loss": 0.7044, + "rewards/accuracies": 0.75, + "rewards/chosen": -10.55477523803711, + "rewards/margins": 14.067876815795898, + "rewards/rejected": -24.62265396118164, + "step": 1228 + }, + { + "epoch": 0.7645412130637637, + "grad_norm": 0.012374775484204292, + "learning_rate": 4.140156754264638e-06, + "logits/chosen": -1.5677317380905151, + "logits/rejected": 3.701002359390259, + "logps/chosen": -402.24896240234375, + "logps/rejected": -841.3854370117188, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.498409271240234, + "rewards/margins": 20.21123504638672, + "rewards/rejected": -25.709644317626953, + "step": 1229 + }, + { + "epoch": 0.7651632970451011, + "grad_norm": 17.667760848999023, + "learning_rate": 4.139004149377593e-06, + "logits/chosen": -0.14382916688919067, + "logits/rejected": 3.2963027954101562, + "logps/chosen": -537.2012939453125, + "logps/rejected": -980.7515869140625, + "loss": 0.0985, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.536566734313965, + "rewards/margins": 25.164365768432617, + "rewards/rejected": -33.70093536376953, + "step": 1230 + }, + { + "epoch": 0.7657853810264386, + "grad_norm": 0.952735185623169, + "learning_rate": 4.137851544490548e-06, + "logits/chosen": 2.5447874069213867, + "logits/rejected": 3.823918104171753, + "logps/chosen": -715.80419921875, + "logps/rejected": -1034.6048583984375, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.001297950744629, + "rewards/margins": 20.011383056640625, + "rewards/rejected": -34.01268005371094, + "step": 1231 + }, + { + "epoch": 0.766407465007776, + "grad_norm": 0.005690231919288635, + "learning_rate": 4.136698939603504e-06, + "logits/chosen": -2.129636287689209, + "logits/rejected": 3.6906516551971436, + "logps/chosen": -313.97076416015625, + "logps/rejected": -797.6610107421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.206093788146973, + "rewards/margins": 20.89019012451172, + "rewards/rejected": -28.096282958984375, + "step": 1232 + }, + { + "epoch": 0.7670295489891136, + "grad_norm": 35.85158920288086, + "learning_rate": 4.13554633471646e-06, + "logits/chosen": -0.5706221461296082, + "logits/rejected": 2.8684823513031006, + "logps/chosen": -525.5751953125, + "logps/rejected": -977.8446044921875, + "loss": 0.8167, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.22457218170166, + "rewards/margins": 23.050174713134766, + "rewards/rejected": -36.274749755859375, + "step": 1233 + }, + { + "epoch": 0.767651632970451, + "grad_norm": 57.710357666015625, + "learning_rate": 4.134393729829415e-06, + "logits/chosen": -0.09655407816171646, + "logits/rejected": 1.2852544784545898, + "logps/chosen": -605.3065185546875, + "logps/rejected": -809.9786376953125, + "loss": 1.7346, + "rewards/accuracies": 0.75, + "rewards/chosen": -13.01135540008545, + "rewards/margins": 13.374024391174316, + "rewards/rejected": -26.385379791259766, + "step": 1234 + }, + { + "epoch": 0.7682737169517885, + "grad_norm": 0.0007348281214945018, + "learning_rate": 4.13324112494237e-06, + "logits/chosen": 2.3951401710510254, + "logits/rejected": 4.68405294418335, + "logps/chosen": -720.020751953125, + "logps/rejected": -1034.022705078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.383231163024902, + "rewards/margins": 21.747941970825195, + "rewards/rejected": -33.13117218017578, + "step": 1235 + }, + { + "epoch": 0.7688958009331259, + "grad_norm": 40.10581970214844, + "learning_rate": 4.132088520055325e-06, + "logits/chosen": -1.058050513267517, + "logits/rejected": 1.4945002794265747, + "logps/chosen": -526.7613525390625, + "logps/rejected": -863.2998657226562, + "loss": 0.6122, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.167625427246094, + "rewards/margins": 21.5457763671875, + "rewards/rejected": -29.713401794433594, + "step": 1236 + }, + { + "epoch": 0.7695178849144635, + "grad_norm": 0.1868850290775299, + "learning_rate": 4.1309359151682805e-06, + "logits/chosen": -0.6735565662384033, + "logits/rejected": 3.166142463684082, + "logps/chosen": -555.5580444335938, + "logps/rejected": -926.4407958984375, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.521209716796875, + "rewards/margins": 24.875356674194336, + "rewards/rejected": -38.396568298339844, + "step": 1237 + }, + { + "epoch": 0.7701399688958009, + "grad_norm": 17.024826049804688, + "learning_rate": 4.129783310281236e-06, + "logits/chosen": -1.321545124053955, + "logits/rejected": 2.6138415336608887, + "logps/chosen": -372.07635498046875, + "logps/rejected": -855.348388671875, + "loss": 0.1021, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.196434020996094, + "rewards/margins": 24.538761138916016, + "rewards/rejected": -30.735193252563477, + "step": 1238 + }, + { + "epoch": 0.7707620528771384, + "grad_norm": 8.435630798339844, + "learning_rate": 4.128630705394191e-06, + "logits/chosen": 2.0260398387908936, + "logits/rejected": 4.694815635681152, + "logps/chosen": -593.011962890625, + "logps/rejected": -928.0355224609375, + "loss": 0.0354, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.318330764770508, + "rewards/margins": 20.424802780151367, + "rewards/rejected": -29.743133544921875, + "step": 1239 + }, + { + "epoch": 0.7713841368584758, + "grad_norm": 15.079669952392578, + "learning_rate": 4.127478100507146e-06, + "logits/chosen": -1.5058603286743164, + "logits/rejected": 3.0221991539001465, + "logps/chosen": -332.8693542480469, + "logps/rejected": -755.5462036132812, + "loss": 0.0755, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.04178524017334, + "rewards/margins": 22.907142639160156, + "rewards/rejected": -26.94892692565918, + "step": 1240 + }, + { + "epoch": 0.7720062208398134, + "grad_norm": 1.4181602001190186, + "learning_rate": 4.1263254956201014e-06, + "logits/chosen": 2.111570119857788, + "logits/rejected": 5.410984039306641, + "logps/chosen": -535.548095703125, + "logps/rejected": -1025.924072265625, + "loss": 0.0064, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.443130016326904, + "rewards/margins": 25.616270065307617, + "rewards/rejected": -33.05940246582031, + "step": 1241 + }, + { + "epoch": 0.7726283048211509, + "grad_norm": 25.272632598876953, + "learning_rate": 4.125172890733057e-06, + "logits/chosen": 0.5199185013771057, + "logits/rejected": 2.5829875469207764, + "logps/chosen": -668.5764770507812, + "logps/rejected": -979.669921875, + "loss": 0.2667, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.801076889038086, + "rewards/margins": 19.066125869750977, + "rewards/rejected": -29.867202758789062, + "step": 1242 + }, + { + "epoch": 0.7732503888024883, + "grad_norm": 7.891100722190458e-06, + "learning_rate": 4.124020285846012e-06, + "logits/chosen": 1.2405672073364258, + "logits/rejected": 4.021305084228516, + "logps/chosen": -650.9105224609375, + "logps/rejected": -1101.25341796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.636184692382812, + "rewards/margins": 27.267887115478516, + "rewards/rejected": -36.904075622558594, + "step": 1243 + }, + { + "epoch": 0.7738724727838259, + "grad_norm": 26.89502716064453, + "learning_rate": 4.122867680958967e-06, + "logits/chosen": 2.664867401123047, + "logits/rejected": 4.436239242553711, + "logps/chosen": -678.9229736328125, + "logps/rejected": -927.5115356445312, + "loss": 0.2157, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.128175735473633, + "rewards/margins": 16.038503646850586, + "rewards/rejected": -26.16668128967285, + "step": 1244 + }, + { + "epoch": 0.7744945567651633, + "grad_norm": 0.000490253500174731, + "learning_rate": 4.121715076071922e-06, + "logits/chosen": 3.488694667816162, + "logits/rejected": 3.4402871131896973, + "logps/chosen": -698.736328125, + "logps/rejected": -955.9376220703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.386505126953125, + "rewards/margins": 21.48382568359375, + "rewards/rejected": -30.870332717895508, + "step": 1245 + }, + { + "epoch": 0.7751166407465008, + "grad_norm": 0.0054618967697024345, + "learning_rate": 4.1205624711848776e-06, + "logits/chosen": -0.03978198766708374, + "logits/rejected": 4.639406204223633, + "logps/chosen": -345.4989013671875, + "logps/rejected": -861.0494384765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.173473358154297, + "rewards/margins": 24.827526092529297, + "rewards/rejected": -32.000999450683594, + "step": 1246 + }, + { + "epoch": 0.7757387247278382, + "grad_norm": 27.640233993530273, + "learning_rate": 4.119409866297834e-06, + "logits/chosen": -1.448029637336731, + "logits/rejected": 2.496913433074951, + "logps/chosen": -540.625732421875, + "logps/rejected": -894.2573852539062, + "loss": 0.4569, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.5917887687683105, + "rewards/margins": 21.170873641967773, + "rewards/rejected": -28.762664794921875, + "step": 1247 + }, + { + "epoch": 0.7763608087091758, + "grad_norm": 0.006316736806184053, + "learning_rate": 4.118257261410789e-06, + "logits/chosen": -2.042257785797119, + "logits/rejected": 3.5098836421966553, + "logps/chosen": -353.7255859375, + "logps/rejected": -886.4546508789062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.367911338806152, + "rewards/margins": 21.32790756225586, + "rewards/rejected": -27.695819854736328, + "step": 1248 + }, + { + "epoch": 0.7769828926905132, + "grad_norm": 8.07496166229248, + "learning_rate": 4.117104656523744e-06, + "logits/chosen": -0.2673183083534241, + "logits/rejected": 3.8783979415893555, + "logps/chosen": -519.2168579101562, + "logps/rejected": -1002.5673828125, + "loss": 0.0399, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.099881172180176, + "rewards/margins": 28.038101196289062, + "rewards/rejected": -38.13798522949219, + "step": 1249 + }, + { + "epoch": 0.7776049766718507, + "grad_norm": 2.958667278289795, + "learning_rate": 4.115952051636699e-06, + "logits/chosen": 0.0354306697845459, + "logits/rejected": 3.6872379779815674, + "logps/chosen": -516.661376953125, + "logps/rejected": -992.490478515625, + "loss": 0.0129, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.733585357666016, + "rewards/margins": 24.6900634765625, + "rewards/rejected": -33.423648834228516, + "step": 1250 + }, + { + "epoch": 0.7782270606531881, + "grad_norm": 6.522555828094482, + "learning_rate": 4.1147994467496545e-06, + "logits/chosen": -0.5028421878814697, + "logits/rejected": 2.9892351627349854, + "logps/chosen": -484.8457336425781, + "logps/rejected": -888.728759765625, + "loss": 0.0462, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.360073089599609, + "rewards/margins": 21.569787979125977, + "rewards/rejected": -27.929861068725586, + "step": 1251 + }, + { + "epoch": 0.7788491446345257, + "grad_norm": 0.0009231179719790816, + "learning_rate": 4.11364684186261e-06, + "logits/chosen": 1.3579542636871338, + "logits/rejected": 3.720470905303955, + "logps/chosen": -570.8295288085938, + "logps/rejected": -911.2254638671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.875762939453125, + "rewards/margins": 23.522167205810547, + "rewards/rejected": -31.397930145263672, + "step": 1252 + }, + { + "epoch": 0.7794712286158632, + "grad_norm": 33.93539810180664, + "learning_rate": 4.112494236975565e-06, + "logits/chosen": -0.39095282554626465, + "logits/rejected": 3.6155381202697754, + "logps/chosen": -434.69134521484375, + "logps/rejected": -829.1648559570312, + "loss": 0.88, + "rewards/accuracies": 0.75, + "rewards/chosen": -6.6295928955078125, + "rewards/margins": 20.17112922668457, + "rewards/rejected": -26.800724029541016, + "step": 1253 + }, + { + "epoch": 0.7800933125972006, + "grad_norm": 0.00012112106196582317, + "learning_rate": 4.11134163208852e-06, + "logits/chosen": 0.8299846649169922, + "logits/rejected": 3.10577654838562, + "logps/chosen": -411.1665344238281, + "logps/rejected": -749.471923828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.046252250671387, + "rewards/margins": 24.529747009277344, + "rewards/rejected": -29.576000213623047, + "step": 1254 + }, + { + "epoch": 0.7807153965785381, + "grad_norm": 0.04176100715994835, + "learning_rate": 4.110189027201475e-06, + "logits/chosen": 3.4682154655456543, + "logits/rejected": 4.857761859893799, + "logps/chosen": -729.2528686523438, + "logps/rejected": -954.7874145507812, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.462285995483398, + "rewards/margins": 15.092519760131836, + "rewards/rejected": -23.554805755615234, + "step": 1255 + }, + { + "epoch": 0.7813374805598756, + "grad_norm": 0.012754272669553757, + "learning_rate": 4.109036422314431e-06, + "logits/chosen": -1.916728138923645, + "logits/rejected": 3.354323625564575, + "logps/chosen": -385.1650390625, + "logps/rejected": -876.616455078125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.575162649154663, + "rewards/margins": 26.097787857055664, + "rewards/rejected": -28.672950744628906, + "step": 1256 + }, + { + "epoch": 0.7819595645412131, + "grad_norm": 3.408075281186029e-05, + "learning_rate": 4.107883817427386e-06, + "logits/chosen": -1.3067548274993896, + "logits/rejected": 2.744790554046631, + "logps/chosen": -485.6220397949219, + "logps/rejected": -959.7124633789062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.141744613647461, + "rewards/margins": 25.00613784790039, + "rewards/rejected": -31.14788055419922, + "step": 1257 + }, + { + "epoch": 0.7825816485225505, + "grad_norm": 30.633358001708984, + "learning_rate": 4.106731212540341e-06, + "logits/chosen": 0.2253757119178772, + "logits/rejected": 3.640742778778076, + "logps/chosen": -466.9993591308594, + "logps/rejected": -941.8668212890625, + "loss": 0.7124, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.170479774475098, + "rewards/margins": 25.03893280029297, + "rewards/rejected": -29.209413528442383, + "step": 1258 + }, + { + "epoch": 0.783203732503888, + "grad_norm": 0.0012299851514399052, + "learning_rate": 4.105578607653296e-06, + "logits/chosen": 0.25474441051483154, + "logits/rejected": 4.737069606781006, + "logps/chosen": -513.61181640625, + "logps/rejected": -1030.615966796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.679970741271973, + "rewards/margins": 24.170072555541992, + "rewards/rejected": -32.85004425048828, + "step": 1259 + }, + { + "epoch": 0.7838258164852255, + "grad_norm": 8.764855010667816e-06, + "learning_rate": 4.1044260027662515e-06, + "logits/chosen": 1.943559169769287, + "logits/rejected": 4.239380836486816, + "logps/chosen": -667.2208251953125, + "logps/rejected": -1133.9271240234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.01846694946289, + "rewards/margins": 32.54998779296875, + "rewards/rejected": -41.56845474243164, + "step": 1260 + }, + { + "epoch": 0.784447900466563, + "grad_norm": 0.08819999545812607, + "learning_rate": 4.103273397879208e-06, + "logits/chosen": -0.5123782157897949, + "logits/rejected": 3.680102825164795, + "logps/chosen": -395.6453857421875, + "logps/rejected": -820.6551513671875, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.562816858291626, + "rewards/margins": 22.052396774291992, + "rewards/rejected": -24.61521339416504, + "step": 1261 + }, + { + "epoch": 0.7850699844479004, + "grad_norm": 1.087512493133545, + "learning_rate": 4.102120792992163e-06, + "logits/chosen": 0.575364351272583, + "logits/rejected": 5.38516092300415, + "logps/chosen": -390.524658203125, + "logps/rejected": -909.357421875, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.065713882446289, + "rewards/margins": 23.575519561767578, + "rewards/rejected": -29.641231536865234, + "step": 1262 + }, + { + "epoch": 0.785692068429238, + "grad_norm": 0.4358244240283966, + "learning_rate": 4.100968188105118e-06, + "logits/chosen": -0.8720681667327881, + "logits/rejected": 3.837329387664795, + "logps/chosen": -467.9682312011719, + "logps/rejected": -962.12744140625, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.806166648864746, + "rewards/margins": 28.562875747680664, + "rewards/rejected": -37.369041442871094, + "step": 1263 + }, + { + "epoch": 0.7863141524105755, + "grad_norm": 0.13888658583164215, + "learning_rate": 4.099815583218073e-06, + "logits/chosen": 1.3139761686325073, + "logits/rejected": 3.096834421157837, + "logps/chosen": -642.6888427734375, + "logps/rejected": -916.2388916015625, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.698795318603516, + "rewards/margins": 20.658552169799805, + "rewards/rejected": -32.35734939575195, + "step": 1264 + }, + { + "epoch": 0.7869362363919129, + "grad_norm": 0.20697729289531708, + "learning_rate": 4.0986629783310285e-06, + "logits/chosen": -1.3422939777374268, + "logits/rejected": 3.9096124172210693, + "logps/chosen": -477.9691162109375, + "logps/rejected": -1044.06787109375, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.874945640563965, + "rewards/margins": 23.77069854736328, + "rewards/rejected": -30.645645141601562, + "step": 1265 + }, + { + "epoch": 0.7875583203732504, + "grad_norm": 25.782604217529297, + "learning_rate": 4.097510373443984e-06, + "logits/chosen": 1.302040696144104, + "logits/rejected": 4.496374130249023, + "logps/chosen": -546.2277221679688, + "logps/rejected": -870.092041015625, + "loss": 0.4589, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.486966609954834, + "rewards/margins": 18.784543991088867, + "rewards/rejected": -26.27151107788086, + "step": 1266 + }, + { + "epoch": 0.7881804043545879, + "grad_norm": 5.894190311431885, + "learning_rate": 4.096357768556939e-06, + "logits/chosen": 1.1210216283798218, + "logits/rejected": 3.163902759552002, + "logps/chosen": -623.9061889648438, + "logps/rejected": -906.975830078125, + "loss": 0.0295, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.462145805358887, + "rewards/margins": 18.545324325561523, + "rewards/rejected": -28.007469177246094, + "step": 1267 + }, + { + "epoch": 0.7888024883359254, + "grad_norm": 5.6784025218803436e-05, + "learning_rate": 4.095205163669894e-06, + "logits/chosen": -1.7827244997024536, + "logits/rejected": 2.870236396789551, + "logps/chosen": -451.1767578125, + "logps/rejected": -966.4337158203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.05531120300293, + "rewards/margins": 30.899620056152344, + "rewards/rejected": -38.954933166503906, + "step": 1268 + }, + { + "epoch": 0.7894245723172628, + "grad_norm": 26.408723831176758, + "learning_rate": 4.094052558782849e-06, + "logits/chosen": 0.2130802422761917, + "logits/rejected": 3.3761491775512695, + "logps/chosen": -543.6114501953125, + "logps/rejected": -946.999267578125, + "loss": 0.3535, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.727877616882324, + "rewards/margins": 23.667865753173828, + "rewards/rejected": -32.39574432373047, + "step": 1269 + }, + { + "epoch": 0.7900466562986003, + "grad_norm": 0.0007318244897760451, + "learning_rate": 4.092899953895805e-06, + "logits/chosen": 1.295201301574707, + "logits/rejected": 3.541900157928467, + "logps/chosen": -644.5004272460938, + "logps/rejected": -1030.76123046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.541393756866455, + "rewards/margins": 28.21343421936035, + "rewards/rejected": -33.75482940673828, + "step": 1270 + }, + { + "epoch": 0.7906687402799378, + "grad_norm": 0.8289233446121216, + "learning_rate": 4.09174734900876e-06, + "logits/chosen": 1.6525609493255615, + "logits/rejected": 4.1350908279418945, + "logps/chosen": -319.18353271484375, + "logps/rejected": -693.30517578125, + "loss": 0.0099, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.030683517456055, + "rewards/margins": 20.20093536376953, + "rewards/rejected": -25.231618881225586, + "step": 1271 + }, + { + "epoch": 0.7912908242612753, + "grad_norm": 1.632172703742981, + "learning_rate": 4.090594744121715e-06, + "logits/chosen": 3.0184199810028076, + "logits/rejected": 4.094862937927246, + "logps/chosen": -606.4073486328125, + "logps/rejected": -852.579833984375, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.487676620483398, + "rewards/margins": 18.483928680419922, + "rewards/rejected": -27.971603393554688, + "step": 1272 + }, + { + "epoch": 0.7919129082426127, + "grad_norm": 0.05077657476067543, + "learning_rate": 4.08944213923467e-06, + "logits/chosen": 3.8291378021240234, + "logits/rejected": 3.1013073921203613, + "logps/chosen": -732.06884765625, + "logps/rejected": -885.6846923828125, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.131711959838867, + "rewards/margins": 19.296707153320312, + "rewards/rejected": -33.42841720581055, + "step": 1273 + }, + { + "epoch": 0.7925349922239502, + "grad_norm": 23.887598037719727, + "learning_rate": 4.0882895343476255e-06, + "logits/chosen": -0.9498767852783203, + "logits/rejected": 1.325081467628479, + "logps/chosen": -477.6418762207031, + "logps/rejected": -833.086181640625, + "loss": 0.4625, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.639581680297852, + "rewards/margins": 16.080799102783203, + "rewards/rejected": -24.720382690429688, + "step": 1274 + }, + { + "epoch": 0.7931570762052877, + "grad_norm": 28.14263153076172, + "learning_rate": 4.087136929460581e-06, + "logits/chosen": 1.22007417678833, + "logits/rejected": 5.139258861541748, + "logps/chosen": -536.0076904296875, + "logps/rejected": -923.4759521484375, + "loss": 0.6636, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.12144660949707, + "rewards/margins": 16.953052520751953, + "rewards/rejected": -28.074499130249023, + "step": 1275 + }, + { + "epoch": 0.7937791601866252, + "grad_norm": 0.00033927810727618635, + "learning_rate": 4.085984324573537e-06, + "logits/chosen": 1.6169593334197998, + "logits/rejected": 4.11783504486084, + "logps/chosen": -577.6982421875, + "logps/rejected": -1023.56396484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.645487785339355, + "rewards/margins": 30.665348052978516, + "rewards/rejected": -40.31083297729492, + "step": 1276 + }, + { + "epoch": 0.7944012441679627, + "grad_norm": 0.0009849730413407087, + "learning_rate": 4.084831719686492e-06, + "logits/chosen": 5.122714519500732, + "logits/rejected": 5.933152675628662, + "logps/chosen": -818.3211059570312, + "logps/rejected": -1011.8892822265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.776374816894531, + "rewards/margins": 21.75531768798828, + "rewards/rejected": -32.53169250488281, + "step": 1277 + }, + { + "epoch": 0.7950233281493001, + "grad_norm": 1.222702980041504, + "learning_rate": 4.083679114799447e-06, + "logits/chosen": 0.6945499777793884, + "logits/rejected": 2.474721908569336, + "logps/chosen": -430.6312255859375, + "logps/rejected": -710.2374267578125, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.810029983520508, + "rewards/margins": 17.72357177734375, + "rewards/rejected": -25.533601760864258, + "step": 1278 + }, + { + "epoch": 0.7956454121306377, + "grad_norm": 0.0002225928328698501, + "learning_rate": 4.0825265099124025e-06, + "logits/chosen": 0.8473066091537476, + "logits/rejected": 5.379339218139648, + "logps/chosen": -543.3826293945312, + "logps/rejected": -1045.0135498046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.165830612182617, + "rewards/margins": 25.138229370117188, + "rewards/rejected": -33.30405807495117, + "step": 1279 + }, + { + "epoch": 0.7962674961119751, + "grad_norm": 3.7584469318389893, + "learning_rate": 4.081373905025358e-06, + "logits/chosen": -2.6725735664367676, + "logits/rejected": 2.786515951156616, + "logps/chosen": -283.8596496582031, + "logps/rejected": -721.12841796875, + "loss": 0.0336, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.44124698638916, + "rewards/margins": 19.681657791137695, + "rewards/rejected": -24.122905731201172, + "step": 1280 + }, + { + "epoch": 0.7968895800933126, + "grad_norm": 0.00032045444822870195, + "learning_rate": 4.080221300138313e-06, + "logits/chosen": 0.8776124119758606, + "logits/rejected": 4.913902282714844, + "logps/chosen": -560.6947631835938, + "logps/rejected": -1074.2099609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.016932487487793, + "rewards/margins": 28.982444763183594, + "rewards/rejected": -37.99938201904297, + "step": 1281 + }, + { + "epoch": 0.7975116640746501, + "grad_norm": 0.02013535052537918, + "learning_rate": 4.079068695251268e-06, + "logits/chosen": 1.071070909500122, + "logits/rejected": 3.490199565887451, + "logps/chosen": -558.4388427734375, + "logps/rejected": -955.5740356445312, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.795027732849121, + "rewards/margins": 21.521804809570312, + "rewards/rejected": -28.31683349609375, + "step": 1282 + }, + { + "epoch": 0.7981337480559876, + "grad_norm": 0.3003612160682678, + "learning_rate": 4.077916090364223e-06, + "logits/chosen": 0.02880948781967163, + "logits/rejected": 3.635495185852051, + "logps/chosen": -549.6759033203125, + "logps/rejected": -964.7757568359375, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.934090614318848, + "rewards/margins": 24.060932159423828, + "rewards/rejected": -34.995025634765625, + "step": 1283 + }, + { + "epoch": 0.798755832037325, + "grad_norm": 0.021271033212542534, + "learning_rate": 4.076763485477179e-06, + "logits/chosen": 0.04239767789840698, + "logits/rejected": 2.0946714878082275, + "logps/chosen": -625.83837890625, + "logps/rejected": -981.59423828125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.937152862548828, + "rewards/margins": 23.588687896728516, + "rewards/rejected": -29.525840759277344, + "step": 1284 + }, + { + "epoch": 0.7993779160186625, + "grad_norm": 31.318174362182617, + "learning_rate": 4.075610880590134e-06, + "logits/chosen": 2.3726325035095215, + "logits/rejected": 2.115004301071167, + "logps/chosen": -538.9212646484375, + "logps/rejected": -803.7254638671875, + "loss": 1.1857, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.018959045410156, + "rewards/margins": 18.249353408813477, + "rewards/rejected": -29.268310546875, + "step": 1285 + }, + { + "epoch": 0.8, + "grad_norm": 9.438876152038574, + "learning_rate": 4.074458275703089e-06, + "logits/chosen": -1.8963449001312256, + "logits/rejected": 2.997347354888916, + "logps/chosen": -343.0146179199219, + "logps/rejected": -801.9140014648438, + "loss": 0.1191, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.2926506996154785, + "rewards/margins": 18.306407928466797, + "rewards/rejected": -25.59906005859375, + "step": 1286 + }, + { + "epoch": 0.8006220839813375, + "grad_norm": 10.629618644714355, + "learning_rate": 4.073305670816044e-06, + "logits/chosen": -1.7159764766693115, + "logits/rejected": 3.7132809162139893, + "logps/chosen": -352.5853271484375, + "logps/rejected": -948.1820068359375, + "loss": 0.1264, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.144214630126953, + "rewards/margins": 26.92259979248047, + "rewards/rejected": -35.06681442260742, + "step": 1287 + }, + { + "epoch": 0.801244167962675, + "grad_norm": 1.6232259273529053, + "learning_rate": 4.0721530659289995e-06, + "logits/chosen": 1.8202488422393799, + "logits/rejected": 5.256498336791992, + "logps/chosen": -496.8153076171875, + "logps/rejected": -939.01953125, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.51845645904541, + "rewards/margins": 27.129898071289062, + "rewards/rejected": -36.64834976196289, + "step": 1288 + }, + { + "epoch": 0.8018662519440124, + "grad_norm": 2.7157328128814697, + "learning_rate": 4.071000461041955e-06, + "logits/chosen": 1.093205213546753, + "logits/rejected": 4.126691818237305, + "logps/chosen": -497.1220397949219, + "logps/rejected": -887.37060546875, + "loss": 0.0119, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.251227855682373, + "rewards/margins": 22.370820999145508, + "rewards/rejected": -29.62204933166504, + "step": 1289 + }, + { + "epoch": 0.80248833592535, + "grad_norm": 0.0017288104863837361, + "learning_rate": 4.06984785615491e-06, + "logits/chosen": -2.63922381401062, + "logits/rejected": 2.14827823638916, + "logps/chosen": -384.6424560546875, + "logps/rejected": -1019.2603149414062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.489599704742432, + "rewards/margins": 26.801565170288086, + "rewards/rejected": -34.29116439819336, + "step": 1290 + }, + { + "epoch": 0.8031104199066874, + "grad_norm": 0.3165653347969055, + "learning_rate": 4.068695251267866e-06, + "logits/chosen": -0.5058521628379822, + "logits/rejected": 3.0581393241882324, + "logps/chosen": -632.1590576171875, + "logps/rejected": -1066.4691162109375, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.409887313842773, + "rewards/margins": 26.200271606445312, + "rewards/rejected": -34.61016082763672, + "step": 1291 + }, + { + "epoch": 0.8037325038880249, + "grad_norm": 0.0003989443648606539, + "learning_rate": 4.067542646380821e-06, + "logits/chosen": -0.6932083964347839, + "logits/rejected": 2.834740161895752, + "logps/chosen": -439.7989196777344, + "logps/rejected": -1053.9024658203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.509608745574951, + "rewards/margins": 31.004009246826172, + "rewards/rejected": -36.51361846923828, + "step": 1292 + }, + { + "epoch": 0.8043545878693623, + "grad_norm": 0.0012794709764420986, + "learning_rate": 4.0663900414937765e-06, + "logits/chosen": 0.4577113091945648, + "logits/rejected": 3.1413795948028564, + "logps/chosen": -636.5763549804688, + "logps/rejected": -1040.104736328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.54163932800293, + "rewards/margins": 28.04217529296875, + "rewards/rejected": -39.58381271362305, + "step": 1293 + }, + { + "epoch": 0.8049766718506999, + "grad_norm": 0.010304873809218407, + "learning_rate": 4.065237436606732e-06, + "logits/chosen": 0.19404852390289307, + "logits/rejected": 4.620615005493164, + "logps/chosen": -519.24853515625, + "logps/rejected": -1041.069091796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.314437866210938, + "rewards/margins": 24.46490478515625, + "rewards/rejected": -32.77934265136719, + "step": 1294 + }, + { + "epoch": 0.8055987558320373, + "grad_norm": 28.267431259155273, + "learning_rate": 4.064084831719687e-06, + "logits/chosen": -1.7566967010498047, + "logits/rejected": 2.2776732444763184, + "logps/chosen": -419.6002197265625, + "logps/rejected": -839.655517578125, + "loss": 0.1848, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.208789825439453, + "rewards/margins": 19.25501251220703, + "rewards/rejected": -28.463802337646484, + "step": 1295 + }, + { + "epoch": 0.8062208398133748, + "grad_norm": 37.10252380371094, + "learning_rate": 4.062932226832642e-06, + "logits/chosen": 2.2791783809661865, + "logits/rejected": 4.953285217285156, + "logps/chosen": -517.1919555664062, + "logps/rejected": -895.48779296875, + "loss": 0.4776, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.959544658660889, + "rewards/margins": 20.814197540283203, + "rewards/rejected": -28.77374267578125, + "step": 1296 + }, + { + "epoch": 0.8068429237947123, + "grad_norm": 0.35563036799430847, + "learning_rate": 4.061779621945597e-06, + "logits/chosen": -0.17489880323410034, + "logits/rejected": 3.205916404724121, + "logps/chosen": -325.92034912109375, + "logps/rejected": -804.7338256835938, + "loss": 0.0097, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.247753143310547, + "rewards/margins": 23.912797927856445, + "rewards/rejected": -31.16054916381836, + "step": 1297 + }, + { + "epoch": 0.8074650077760498, + "grad_norm": 21.93829917907715, + "learning_rate": 4.060627017058553e-06, + "logits/chosen": -0.9875195026397705, + "logits/rejected": 2.943293571472168, + "logps/chosen": -536.0520629882812, + "logps/rejected": -937.1588134765625, + "loss": 0.1268, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.072823524475098, + "rewards/margins": 22.72176742553711, + "rewards/rejected": -30.794591903686523, + "step": 1298 + }, + { + "epoch": 0.8080870917573872, + "grad_norm": 5.47617491974961e-05, + "learning_rate": 4.059474412171508e-06, + "logits/chosen": 1.7214620113372803, + "logits/rejected": 4.240743637084961, + "logps/chosen": -623.8914184570312, + "logps/rejected": -996.2498168945312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.60321044921875, + "rewards/margins": 24.35324478149414, + "rewards/rejected": -33.95645523071289, + "step": 1299 + }, + { + "epoch": 0.8087091757387247, + "grad_norm": 42.22055435180664, + "learning_rate": 4.058321807284463e-06, + "logits/chosen": 0.30668437480926514, + "logits/rejected": 2.93937087059021, + "logps/chosen": -671.924072265625, + "logps/rejected": -990.665283203125, + "loss": 0.871, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.068768501281738, + "rewards/margins": 17.43556785583496, + "rewards/rejected": -27.504337310791016, + "step": 1300 + }, + { + "epoch": 0.8093312597200623, + "grad_norm": 0.3986184597015381, + "learning_rate": 4.057169202397418e-06, + "logits/chosen": -1.8530347347259521, + "logits/rejected": 2.342902183532715, + "logps/chosen": -344.0250549316406, + "logps/rejected": -775.0608520507812, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.663173198699951, + "rewards/margins": 27.073396682739258, + "rewards/rejected": -30.736572265625, + "step": 1301 + }, + { + "epoch": 0.8099533437013997, + "grad_norm": 0.01742498017847538, + "learning_rate": 4.0560165975103735e-06, + "logits/chosen": -0.03469623625278473, + "logits/rejected": 3.8866472244262695, + "logps/chosen": -395.10205078125, + "logps/rejected": -897.292236328125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.772964954376221, + "rewards/margins": 22.193988800048828, + "rewards/rejected": -29.96695327758789, + "step": 1302 + }, + { + "epoch": 0.8105754276827372, + "grad_norm": 3.09145289065782e-05, + "learning_rate": 4.054863992623329e-06, + "logits/chosen": -0.405514657497406, + "logits/rejected": 2.8145222663879395, + "logps/chosen": -432.71527099609375, + "logps/rejected": -894.4420166015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.114578247070312, + "rewards/margins": 24.84501838684082, + "rewards/rejected": -32.9595947265625, + "step": 1303 + }, + { + "epoch": 0.8111975116640746, + "grad_norm": 0.15704073011875153, + "learning_rate": 4.053711387736284e-06, + "logits/chosen": 0.05295020341873169, + "logits/rejected": 4.433631420135498, + "logps/chosen": -493.4110107421875, + "logps/rejected": -917.6148681640625, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.961913585662842, + "rewards/margins": 22.568294525146484, + "rewards/rejected": -30.530208587646484, + "step": 1304 + }, + { + "epoch": 0.8118195956454122, + "grad_norm": 0.05151224881410599, + "learning_rate": 4.05255878284924e-06, + "logits/chosen": -1.0337135791778564, + "logits/rejected": 2.5338025093078613, + "logps/chosen": -373.13446044921875, + "logps/rejected": -814.0833740234375, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.4312238693237305, + "rewards/margins": 24.39572525024414, + "rewards/rejected": -28.826946258544922, + "step": 1305 + }, + { + "epoch": 0.8124416796267496, + "grad_norm": 3.894642304658191e-06, + "learning_rate": 4.051406177962195e-06, + "logits/chosen": 0.11323362588882446, + "logits/rejected": 5.9528961181640625, + "logps/chosen": -481.096435546875, + "logps/rejected": -1122.715087890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.372435092926025, + "rewards/margins": 27.15767478942871, + "rewards/rejected": -34.53010940551758, + "step": 1306 + }, + { + "epoch": 0.8130637636080871, + "grad_norm": 1.2801079719793051e-05, + "learning_rate": 4.0502535730751505e-06, + "logits/chosen": -1.0122219324111938, + "logits/rejected": 2.4837958812713623, + "logps/chosen": -411.2268981933594, + "logps/rejected": -929.159423828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.501988410949707, + "rewards/margins": 28.270177841186523, + "rewards/rejected": -36.77216339111328, + "step": 1307 + }, + { + "epoch": 0.8136858475894245, + "grad_norm": 0.003478578059002757, + "learning_rate": 4.049100968188106e-06, + "logits/chosen": -1.201788067817688, + "logits/rejected": 2.739528179168701, + "logps/chosen": -561.0819091796875, + "logps/rejected": -1004.871337890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.666325569152832, + "rewards/margins": 24.24523162841797, + "rewards/rejected": -33.911556243896484, + "step": 1308 + }, + { + "epoch": 0.8143079315707621, + "grad_norm": 35.751625061035156, + "learning_rate": 4.047948363301061e-06, + "logits/chosen": 1.2579597234725952, + "logits/rejected": 3.93234920501709, + "logps/chosen": -611.4188232421875, + "logps/rejected": -933.8849487304688, + "loss": 0.8951, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.106334686279297, + "rewards/margins": 18.69715118408203, + "rewards/rejected": -30.803485870361328, + "step": 1309 + }, + { + "epoch": 0.8149300155520995, + "grad_norm": 0.08502575755119324, + "learning_rate": 4.046795758414016e-06, + "logits/chosen": 2.097734212875366, + "logits/rejected": 4.242420673370361, + "logps/chosen": -593.9769287109375, + "logps/rejected": -998.7327880859375, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.55612564086914, + "rewards/margins": 24.9191837310791, + "rewards/rejected": -34.47530746459961, + "step": 1310 + }, + { + "epoch": 0.815552099533437, + "grad_norm": 10.52176284790039, + "learning_rate": 4.045643153526971e-06, + "logits/chosen": -0.6155394315719604, + "logits/rejected": 2.2130019664764404, + "logps/chosen": -541.8138427734375, + "logps/rejected": -1002.4979858398438, + "loss": 0.049, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.962281227111816, + "rewards/margins": 29.961563110351562, + "rewards/rejected": -37.92384338378906, + "step": 1311 + }, + { + "epoch": 0.8161741835147744, + "grad_norm": 18.777523040771484, + "learning_rate": 4.044490548639927e-06, + "logits/chosen": -0.9691512584686279, + "logits/rejected": 2.1284027099609375, + "logps/chosen": -523.74365234375, + "logps/rejected": -879.116943359375, + "loss": 0.3622, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.904512405395508, + "rewards/margins": 19.846830368041992, + "rewards/rejected": -27.751344680786133, + "step": 1312 + }, + { + "epoch": 0.816796267496112, + "grad_norm": 30.227996826171875, + "learning_rate": 4.043337943752882e-06, + "logits/chosen": 0.47516682744026184, + "logits/rejected": 2.2081222534179688, + "logps/chosen": -482.805908203125, + "logps/rejected": -740.0989379882812, + "loss": 0.3119, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.329744338989258, + "rewards/margins": 23.253252029418945, + "rewards/rejected": -31.582996368408203, + "step": 1313 + }, + { + "epoch": 0.8174183514774495, + "grad_norm": 0.0792866200208664, + "learning_rate": 4.042185338865837e-06, + "logits/chosen": -1.4188458919525146, + "logits/rejected": 2.5889883041381836, + "logps/chosen": -361.36846923828125, + "logps/rejected": -814.2896728515625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.850963115692139, + "rewards/margins": 22.003490447998047, + "rewards/rejected": -27.854455947875977, + "step": 1314 + }, + { + "epoch": 0.8180404354587869, + "grad_norm": 0.00015036317927297205, + "learning_rate": 4.041032733978792e-06, + "logits/chosen": -2.443631172180176, + "logits/rejected": 3.8295273780822754, + "logps/chosen": -400.744384765625, + "logps/rejected": -1040.559814453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.101746082305908, + "rewards/margins": 30.8872127532959, + "rewards/rejected": -36.98896026611328, + "step": 1315 + }, + { + "epoch": 0.8186625194401245, + "grad_norm": 0.001937979948706925, + "learning_rate": 4.0398801290917475e-06, + "logits/chosen": -2.559013843536377, + "logits/rejected": 1.6643040180206299, + "logps/chosen": -383.789306640625, + "logps/rejected": -887.9030151367188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.655877113342285, + "rewards/margins": 26.765106201171875, + "rewards/rejected": -32.420982360839844, + "step": 1316 + }, + { + "epoch": 0.8192846034214619, + "grad_norm": 0.5322608351707458, + "learning_rate": 4.038727524204703e-06, + "logits/chosen": 3.073065757751465, + "logits/rejected": 3.1308679580688477, + "logps/chosen": -611.296142578125, + "logps/rejected": -846.6818237304688, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.426542282104492, + "rewards/margins": 20.219778060913086, + "rewards/rejected": -29.646320343017578, + "step": 1317 + }, + { + "epoch": 0.8199066874027994, + "grad_norm": 0.12200061976909637, + "learning_rate": 4.037574919317658e-06, + "logits/chosen": 0.1979970932006836, + "logits/rejected": 3.523850440979004, + "logps/chosen": -410.6728210449219, + "logps/rejected": -758.5045166015625, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.334256172180176, + "rewards/margins": 18.89111328125, + "rewards/rejected": -25.225370407104492, + "step": 1318 + }, + { + "epoch": 0.8205287713841368, + "grad_norm": 41.30738067626953, + "learning_rate": 4.036422314430613e-06, + "logits/chosen": 1.2874574661254883, + "logits/rejected": 2.957958936691284, + "logps/chosen": -643.2376098632812, + "logps/rejected": -870.0418090820312, + "loss": 0.5965, + "rewards/accuracies": 0.625, + "rewards/chosen": -10.196017265319824, + "rewards/margins": 16.791812896728516, + "rewards/rejected": -26.98782730102539, + "step": 1319 + }, + { + "epoch": 0.8211508553654744, + "grad_norm": 13.898651123046875, + "learning_rate": 4.035269709543569e-06, + "logits/chosen": -0.14826762676239014, + "logits/rejected": 2.940274238586426, + "logps/chosen": -498.494384765625, + "logps/rejected": -841.1326904296875, + "loss": 0.0608, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.115964412689209, + "rewards/margins": 20.87997055053711, + "rewards/rejected": -27.995935440063477, + "step": 1320 + }, + { + "epoch": 0.8217729393468118, + "grad_norm": 0.00021476426627486944, + "learning_rate": 4.0341171046565245e-06, + "logits/chosen": 1.9943870306015015, + "logits/rejected": 4.891172885894775, + "logps/chosen": -547.0040283203125, + "logps/rejected": -987.03564453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.015552997589111, + "rewards/margins": 29.849987030029297, + "rewards/rejected": -33.86553955078125, + "step": 1321 + }, + { + "epoch": 0.8223950233281493, + "grad_norm": 0.2498874068260193, + "learning_rate": 4.03296449976948e-06, + "logits/chosen": 0.19681772589683533, + "logits/rejected": 2.519896984100342, + "logps/chosen": -482.88604736328125, + "logps/rejected": -794.16650390625, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.21995735168457, + "rewards/margins": 18.27970314025879, + "rewards/rejected": -23.49966049194336, + "step": 1322 + }, + { + "epoch": 0.8230171073094867, + "grad_norm": 0.48097652196884155, + "learning_rate": 4.031811894882435e-06, + "logits/chosen": 0.03272548317909241, + "logits/rejected": 2.3512604236602783, + "logps/chosen": -544.5303955078125, + "logps/rejected": -919.6380615234375, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.7990083694458, + "rewards/margins": 20.45400047302246, + "rewards/rejected": -31.253009796142578, + "step": 1323 + }, + { + "epoch": 0.8236391912908243, + "grad_norm": 18.469717025756836, + "learning_rate": 4.03065928999539e-06, + "logits/chosen": 1.2011947631835938, + "logits/rejected": 2.6470980644226074, + "logps/chosen": -474.2806091308594, + "logps/rejected": -750.458984375, + "loss": 0.3039, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.48334264755249, + "rewards/margins": 17.305072784423828, + "rewards/rejected": -22.788414001464844, + "step": 1324 + }, + { + "epoch": 0.8242612752721618, + "grad_norm": 0.008889297023415565, + "learning_rate": 4.029506685108345e-06, + "logits/chosen": -0.05120176076889038, + "logits/rejected": 3.0984973907470703, + "logps/chosen": -528.498046875, + "logps/rejected": -984.384765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.832124710083008, + "rewards/margins": 21.603178024291992, + "rewards/rejected": -29.435304641723633, + "step": 1325 + }, + { + "epoch": 0.8248833592534992, + "grad_norm": 32.134124755859375, + "learning_rate": 4.028354080221301e-06, + "logits/chosen": 1.6112223863601685, + "logits/rejected": 3.236201763153076, + "logps/chosen": -515.0714721679688, + "logps/rejected": -771.778564453125, + "loss": 0.3432, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.943190574645996, + "rewards/margins": 17.906421661376953, + "rewards/rejected": -23.849613189697266, + "step": 1326 + }, + { + "epoch": 0.8255054432348367, + "grad_norm": 0.06956858187913895, + "learning_rate": 4.027201475334256e-06, + "logits/chosen": 2.438955068588257, + "logits/rejected": 3.4251158237457275, + "logps/chosen": -608.2529296875, + "logps/rejected": -849.7999267578125, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.219766616821289, + "rewards/margins": 18.12618637084961, + "rewards/rejected": -27.34595489501953, + "step": 1327 + }, + { + "epoch": 0.8261275272161742, + "grad_norm": 2.3918983060866594e-07, + "learning_rate": 4.026048870447211e-06, + "logits/chosen": 1.0514161586761475, + "logits/rejected": 2.963059425354004, + "logps/chosen": -494.9385070800781, + "logps/rejected": -828.365234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.151393890380859, + "rewards/margins": 27.91655158996582, + "rewards/rejected": -33.06794357299805, + "step": 1328 + }, + { + "epoch": 0.8267496111975117, + "grad_norm": 7.566370010375977, + "learning_rate": 4.024896265560166e-06, + "logits/chosen": 0.3350151777267456, + "logits/rejected": 3.3889219760894775, + "logps/chosen": -504.8259582519531, + "logps/rejected": -994.1041259765625, + "loss": 0.0359, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.070315361022949, + "rewards/margins": 25.839262008666992, + "rewards/rejected": -31.909576416015625, + "step": 1329 + }, + { + "epoch": 0.8273716951788491, + "grad_norm": 0.982587993144989, + "learning_rate": 4.0237436606731215e-06, + "logits/chosen": 0.6158033013343811, + "logits/rejected": 4.1185383796691895, + "logps/chosen": -592.6993408203125, + "logps/rejected": -937.3599853515625, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.00186538696289, + "rewards/margins": 17.788156509399414, + "rewards/rejected": -26.790019989013672, + "step": 1330 + }, + { + "epoch": 0.8279937791601866, + "grad_norm": 0.037367358803749084, + "learning_rate": 4.022591055786077e-06, + "logits/chosen": 0.18107163906097412, + "logits/rejected": 4.079789638519287, + "logps/chosen": -401.80194091796875, + "logps/rejected": -841.3532104492188, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.279299736022949, + "rewards/margins": 18.987918853759766, + "rewards/rejected": -24.26721954345703, + "step": 1331 + }, + { + "epoch": 0.8286158631415241, + "grad_norm": 0.060256477445364, + "learning_rate": 4.021438450899032e-06, + "logits/chosen": 1.9197179079055786, + "logits/rejected": 4.6027984619140625, + "logps/chosen": -545.1920166015625, + "logps/rejected": -982.877685546875, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.839240074157715, + "rewards/margins": 21.339420318603516, + "rewards/rejected": -27.17866325378418, + "step": 1332 + }, + { + "epoch": 0.8292379471228616, + "grad_norm": 0.0030993474647402763, + "learning_rate": 4.020285846011987e-06, + "logits/chosen": 2.615834951400757, + "logits/rejected": 4.404464244842529, + "logps/chosen": -659.1356811523438, + "logps/rejected": -1015.8995971679688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.51331901550293, + "rewards/margins": 24.97059440612793, + "rewards/rejected": -32.483917236328125, + "step": 1333 + }, + { + "epoch": 0.829860031104199, + "grad_norm": 31.577295303344727, + "learning_rate": 4.019133241124943e-06, + "logits/chosen": 0.41338852047920227, + "logits/rejected": 2.415388584136963, + "logps/chosen": -541.7307739257812, + "logps/rejected": -757.7950439453125, + "loss": 0.4238, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.685833930969238, + "rewards/margins": 14.005095481872559, + "rewards/rejected": -19.690927505493164, + "step": 1334 + }, + { + "epoch": 0.8304821150855366, + "grad_norm": 31.769001007080078, + "learning_rate": 4.0179806362378985e-06, + "logits/chosen": 0.11144089698791504, + "logits/rejected": 4.3588714599609375, + "logps/chosen": -499.26263427734375, + "logps/rejected": -887.7661743164062, + "loss": 0.3706, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.3436174392700195, + "rewards/margins": 19.843544006347656, + "rewards/rejected": -25.187162399291992, + "step": 1335 + }, + { + "epoch": 0.831104199066874, + "grad_norm": 11.622576713562012, + "learning_rate": 4.016828031350854e-06, + "logits/chosen": 1.747812032699585, + "logits/rejected": 3.772061824798584, + "logps/chosen": -461.38385009765625, + "logps/rejected": -747.3123168945312, + "loss": 0.1702, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.077765941619873, + "rewards/margins": 15.385506629943848, + "rewards/rejected": -19.463272094726562, + "step": 1336 + }, + { + "epoch": 0.8317262830482115, + "grad_norm": 1.9541438817977905, + "learning_rate": 4.015675426463809e-06, + "logits/chosen": -2.268688678741455, + "logits/rejected": 1.4469751119613647, + "logps/chosen": -414.74066162109375, + "logps/rejected": -820.842529296875, + "loss": 0.0112, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.688546657562256, + "rewards/margins": 19.483699798583984, + "rewards/rejected": -25.1722469329834, + "step": 1337 + }, + { + "epoch": 0.832348367029549, + "grad_norm": 4.007698589703068e-05, + "learning_rate": 4.014522821576764e-06, + "logits/chosen": 2.887434482574463, + "logits/rejected": 4.472775936126709, + "logps/chosen": -543.4795532226562, + "logps/rejected": -864.4800415039062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.919847011566162, + "rewards/margins": 24.65685272216797, + "rewards/rejected": -29.576698303222656, + "step": 1338 + }, + { + "epoch": 0.8329704510108865, + "grad_norm": 0.002910461975261569, + "learning_rate": 4.013370216689719e-06, + "logits/chosen": 1.7962815761566162, + "logits/rejected": 0.9302045106887817, + "logps/chosen": -607.7379150390625, + "logps/rejected": -662.3109130859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.729726314544678, + "rewards/margins": 16.888690948486328, + "rewards/rejected": -21.61841583251953, + "step": 1339 + }, + { + "epoch": 0.833592534992224, + "grad_norm": 2.4401133487117477e-05, + "learning_rate": 4.012217611802675e-06, + "logits/chosen": 0.7974531650543213, + "logits/rejected": 3.5518081188201904, + "logps/chosen": -415.49932861328125, + "logps/rejected": -802.1265869140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4340407848358154, + "rewards/margins": 24.112585067749023, + "rewards/rejected": -26.546627044677734, + "step": 1340 + }, + { + "epoch": 0.8342146189735614, + "grad_norm": 0.8045310974121094, + "learning_rate": 4.01106500691563e-06, + "logits/chosen": 2.564563274383545, + "logits/rejected": 3.640261173248291, + "logps/chosen": -601.762451171875, + "logps/rejected": -823.6668701171875, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.859514236450195, + "rewards/margins": 20.506380081176758, + "rewards/rejected": -26.365894317626953, + "step": 1341 + }, + { + "epoch": 0.8348367029548989, + "grad_norm": 1.1494208574295044, + "learning_rate": 4.009912402028585e-06, + "logits/chosen": -0.538671612739563, + "logits/rejected": 2.3395137786865234, + "logps/chosen": -535.9871826171875, + "logps/rejected": -866.9425048828125, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.740545749664307, + "rewards/margins": 15.756071090698242, + "rewards/rejected": -20.49661636352539, + "step": 1342 + }, + { + "epoch": 0.8354587869362364, + "grad_norm": 8.666638677823357e-06, + "learning_rate": 4.00875979714154e-06, + "logits/chosen": -1.9475653171539307, + "logits/rejected": 2.8373095989227295, + "logps/chosen": -410.46331787109375, + "logps/rejected": -874.9161987304688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.119300365447998, + "rewards/margins": 24.03900909423828, + "rewards/rejected": -30.158308029174805, + "step": 1343 + }, + { + "epoch": 0.8360808709175739, + "grad_norm": 35.65961456298828, + "learning_rate": 4.0076071922544955e-06, + "logits/chosen": 1.6600673198699951, + "logits/rejected": 3.5128393173217773, + "logps/chosen": -575.7931518554688, + "logps/rejected": -894.3558349609375, + "loss": 0.4741, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.155614852905273, + "rewards/margins": 18.48762321472168, + "rewards/rejected": -25.643238067626953, + "step": 1344 + }, + { + "epoch": 0.8367029548989113, + "grad_norm": 0.0019475392764434218, + "learning_rate": 4.006454587367451e-06, + "logits/chosen": -2.3781747817993164, + "logits/rejected": 3.6053688526153564, + "logps/chosen": -367.33099365234375, + "logps/rejected": -857.9221801757812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.666664123535156, + "rewards/margins": 22.033830642700195, + "rewards/rejected": -30.70049476623535, + "step": 1345 + }, + { + "epoch": 0.8373250388802488, + "grad_norm": 0.07737136632204056, + "learning_rate": 4.005301982480406e-06, + "logits/chosen": 2.302640676498413, + "logits/rejected": 2.480597972869873, + "logps/chosen": -547.6039428710938, + "logps/rejected": -700.687255859375, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.121016263961792, + "rewards/margins": 14.329965591430664, + "rewards/rejected": -17.45098304748535, + "step": 1346 + }, + { + "epoch": 0.8379471228615863, + "grad_norm": 8.023022651672363, + "learning_rate": 4.004149377593361e-06, + "logits/chosen": 1.4918807744979858, + "logits/rejected": 5.198164939880371, + "logps/chosen": -551.644287109375, + "logps/rejected": -944.3262939453125, + "loss": 0.0563, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.576295375823975, + "rewards/margins": 17.830055236816406, + "rewards/rejected": -23.406349182128906, + "step": 1347 + }, + { + "epoch": 0.8385692068429238, + "grad_norm": 9.650162610341795e-06, + "learning_rate": 4.002996772706316e-06, + "logits/chosen": -1.5291988849639893, + "logits/rejected": 3.2551000118255615, + "logps/chosen": -291.2911376953125, + "logps/rejected": -817.490966796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.394601821899414, + "rewards/margins": 24.574989318847656, + "rewards/rejected": -30.969589233398438, + "step": 1348 + }, + { + "epoch": 0.8391912908242612, + "grad_norm": 2.9257236747071147e-05, + "learning_rate": 4.0018441678192725e-06, + "logits/chosen": -2.260589122772217, + "logits/rejected": 2.732025146484375, + "logps/chosen": -261.58526611328125, + "logps/rejected": -809.49951171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9103120565414429, + "rewards/margins": 25.605756759643555, + "rewards/rejected": -27.516071319580078, + "step": 1349 + }, + { + "epoch": 0.8398133748055988, + "grad_norm": 4.390152753330767e-05, + "learning_rate": 4.000691562932228e-06, + "logits/chosen": 2.0370168685913086, + "logits/rejected": 2.6412336826324463, + "logps/chosen": -591.8038330078125, + "logps/rejected": -994.63232421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.617895126342773, + "rewards/margins": 27.143917083740234, + "rewards/rejected": -35.761810302734375, + "step": 1350 + }, + { + "epoch": 0.8404354587869363, + "grad_norm": 0.0004083360836375505, + "learning_rate": 3.999538958045183e-06, + "logits/chosen": -2.8748371601104736, + "logits/rejected": 3.718611717224121, + "logps/chosen": -303.1407470703125, + "logps/rejected": -864.2947998046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.6751580238342285, + "rewards/margins": 20.303796768188477, + "rewards/rejected": -25.978954315185547, + "step": 1351 + }, + { + "epoch": 0.8410575427682737, + "grad_norm": 0.0017260868335142732, + "learning_rate": 3.998386353158138e-06, + "logits/chosen": 0.10009878873825073, + "logits/rejected": 3.7453651428222656, + "logps/chosen": -399.87725830078125, + "logps/rejected": -858.879638671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.825431823730469, + "rewards/margins": 23.454038619995117, + "rewards/rejected": -29.279468536376953, + "step": 1352 + }, + { + "epoch": 0.8416796267496112, + "grad_norm": 18.463964462280273, + "learning_rate": 3.997233748271093e-06, + "logits/chosen": -0.7138807773590088, + "logits/rejected": 3.2711589336395264, + "logps/chosen": -509.6426086425781, + "logps/rejected": -1053.80029296875, + "loss": 0.1009, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.224634170532227, + "rewards/margins": 35.433074951171875, + "rewards/rejected": -39.657711029052734, + "step": 1353 + }, + { + "epoch": 0.8423017107309487, + "grad_norm": 0.012860018759965897, + "learning_rate": 3.996081143384049e-06, + "logits/chosen": 3.318115711212158, + "logits/rejected": 4.649235725402832, + "logps/chosen": -688.559326171875, + "logps/rejected": -973.8826293945312, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.674195766448975, + "rewards/margins": 22.215259552001953, + "rewards/rejected": -29.889455795288086, + "step": 1354 + }, + { + "epoch": 0.8429237947122862, + "grad_norm": 2.4849356350387097e-07, + "learning_rate": 3.994928538497004e-06, + "logits/chosen": -3.6064767837524414, + "logits/rejected": 3.288621425628662, + "logps/chosen": -288.0089111328125, + "logps/rejected": -1011.349609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1988747119903564, + "rewards/margins": 35.37522888183594, + "rewards/rejected": -36.574100494384766, + "step": 1355 + }, + { + "epoch": 0.8435458786936236, + "grad_norm": 6.193071365356445, + "learning_rate": 3.993775933609959e-06, + "logits/chosen": -3.4069361686706543, + "logits/rejected": 2.3285210132598877, + "logps/chosen": -267.35748291015625, + "logps/rejected": -765.3587646484375, + "loss": 0.0229, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.777052402496338, + "rewards/margins": 20.48461151123047, + "rewards/rejected": -26.26166534423828, + "step": 1356 + }, + { + "epoch": 0.8441679626749611, + "grad_norm": 2.2144156446302077e-06, + "learning_rate": 3.992623328722914e-06, + "logits/chosen": -1.2399766445159912, + "logits/rejected": 4.118406772613525, + "logps/chosen": -283.79296875, + "logps/rejected": -785.6634521484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6745691299438477, + "rewards/margins": 23.444019317626953, + "rewards/rejected": -26.118587493896484, + "step": 1357 + }, + { + "epoch": 0.8447900466562986, + "grad_norm": 0.5156397819519043, + "learning_rate": 3.9914707238358695e-06, + "logits/chosen": 0.3204643726348877, + "logits/rejected": 2.7228527069091797, + "logps/chosen": -560.1771850585938, + "logps/rejected": -1028.784912109375, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.866544723510742, + "rewards/margins": 23.669208526611328, + "rewards/rejected": -33.53575134277344, + "step": 1358 + }, + { + "epoch": 0.8454121306376361, + "grad_norm": 0.00017892532923724502, + "learning_rate": 3.990318118948825e-06, + "logits/chosen": -2.0653839111328125, + "logits/rejected": 3.424193859100342, + "logps/chosen": -285.17431640625, + "logps/rejected": -915.456787109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.734927177429199, + "rewards/margins": 29.654401779174805, + "rewards/rejected": -34.38932800292969, + "step": 1359 + }, + { + "epoch": 0.8460342146189735, + "grad_norm": 0.012640786357223988, + "learning_rate": 3.98916551406178e-06, + "logits/chosen": 2.023895025253296, + "logits/rejected": 4.745113849639893, + "logps/chosen": -416.2607421875, + "logps/rejected": -926.1375732421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5701878070831299, + "rewards/margins": 21.946216583251953, + "rewards/rejected": -23.516407012939453, + "step": 1360 + }, + { + "epoch": 0.846656298600311, + "grad_norm": 35.73863983154297, + "learning_rate": 3.988012909174735e-06, + "logits/chosen": -0.7985638380050659, + "logits/rejected": 1.3513717651367188, + "logps/chosen": -588.6486206054688, + "logps/rejected": -883.7210693359375, + "loss": 0.4721, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.495615005493164, + "rewards/margins": 19.96755599975586, + "rewards/rejected": -28.463171005249023, + "step": 1361 + }, + { + "epoch": 0.8472783825816486, + "grad_norm": 3.837876558303833, + "learning_rate": 3.98686030428769e-06, + "logits/chosen": -4.322826385498047, + "logits/rejected": 1.2892265319824219, + "logps/chosen": -353.0385437011719, + "logps/rejected": -832.596923828125, + "loss": 0.0138, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.767023086547852, + "rewards/margins": 19.498014450073242, + "rewards/rejected": -25.265037536621094, + "step": 1362 + }, + { + "epoch": 0.847900466562986, + "grad_norm": 4.668637120630592e-05, + "learning_rate": 3.9857076994006465e-06, + "logits/chosen": 2.5119447708129883, + "logits/rejected": 4.515449047088623, + "logps/chosen": -661.9925537109375, + "logps/rejected": -1010.0653686523438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.104189872741699, + "rewards/margins": 25.521949768066406, + "rewards/rejected": -31.626140594482422, + "step": 1363 + }, + { + "epoch": 0.8485225505443235, + "grad_norm": 26.245492935180664, + "learning_rate": 3.984555094513602e-06, + "logits/chosen": 0.29489636421203613, + "logits/rejected": 3.7235867977142334, + "logps/chosen": -556.9881591796875, + "logps/rejected": -929.4679565429688, + "loss": 0.2556, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.547537803649902, + "rewards/margins": 22.821739196777344, + "rewards/rejected": -31.369277954101562, + "step": 1364 + }, + { + "epoch": 0.8491446345256609, + "grad_norm": 3.216511686332524e-05, + "learning_rate": 3.983402489626556e-06, + "logits/chosen": 0.3073354959487915, + "logits/rejected": 3.334766149520874, + "logps/chosen": -539.6654052734375, + "logps/rejected": -937.9322509765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.841401100158691, + "rewards/margins": 27.675031661987305, + "rewards/rejected": -32.51643371582031, + "step": 1365 + }, + { + "epoch": 0.8497667185069985, + "grad_norm": 19.330078125, + "learning_rate": 3.982249884739511e-06, + "logits/chosen": 0.2854769229888916, + "logits/rejected": 2.968214988708496, + "logps/chosen": -527.625244140625, + "logps/rejected": -923.3499145507812, + "loss": 0.0658, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.42944622039795, + "rewards/margins": 21.908905029296875, + "rewards/rejected": -33.33835220336914, + "step": 1366 + }, + { + "epoch": 0.8503888024883359, + "grad_norm": 0.020990528166294098, + "learning_rate": 3.9810972798524665e-06, + "logits/chosen": 1.3086750507354736, + "logits/rejected": 3.233398914337158, + "logps/chosen": -608.595947265625, + "logps/rejected": -1012.1083374023438, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.629014015197754, + "rewards/margins": 26.847347259521484, + "rewards/rejected": -33.47636032104492, + "step": 1367 + }, + { + "epoch": 0.8510108864696734, + "grad_norm": 0.0055344197899103165, + "learning_rate": 3.979944674965422e-06, + "logits/chosen": 1.164971947669983, + "logits/rejected": 4.53289270401001, + "logps/chosen": -633.4853515625, + "logps/rejected": -986.58251953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.878882884979248, + "rewards/margins": 25.460460662841797, + "rewards/rejected": -31.339344024658203, + "step": 1368 + }, + { + "epoch": 0.8516329704510109, + "grad_norm": 0.05928613618016243, + "learning_rate": 3.978792070078377e-06, + "logits/chosen": 2.948124647140503, + "logits/rejected": 3.4460105895996094, + "logps/chosen": -740.6045532226562, + "logps/rejected": -955.6398315429688, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.457581520080566, + "rewards/margins": 21.442768096923828, + "rewards/rejected": -28.90035057067871, + "step": 1369 + }, + { + "epoch": 0.8522550544323484, + "grad_norm": 10.205842018127441, + "learning_rate": 3.977639465191332e-06, + "logits/chosen": 2.0318384170532227, + "logits/rejected": 3.432497024536133, + "logps/chosen": -605.9002685546875, + "logps/rejected": -772.0656127929688, + "loss": 0.0569, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.010197877883911, + "rewards/margins": 10.323250770568848, + "rewards/rejected": -13.333450317382812, + "step": 1370 + }, + { + "epoch": 0.8528771384136858, + "grad_norm": 1.9100292921066284, + "learning_rate": 3.976486860304287e-06, + "logits/chosen": 1.0926796197891235, + "logits/rejected": 4.672023296356201, + "logps/chosen": -490.7054443359375, + "logps/rejected": -892.81591796875, + "loss": 0.0171, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.199094772338867, + "rewards/margins": 17.715091705322266, + "rewards/rejected": -25.914186477661133, + "step": 1371 + }, + { + "epoch": 0.8534992223950233, + "grad_norm": 0.035171184688806534, + "learning_rate": 3.975334255417243e-06, + "logits/chosen": -2.5091309547424316, + "logits/rejected": 3.1034936904907227, + "logps/chosen": -389.6402893066406, + "logps/rejected": -885.12939453125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.678465843200684, + "rewards/margins": 21.970054626464844, + "rewards/rejected": -26.648521423339844, + "step": 1372 + }, + { + "epoch": 0.8541213063763609, + "grad_norm": 2.5301403999328613, + "learning_rate": 3.974181650530199e-06, + "logits/chosen": -0.04623675346374512, + "logits/rejected": 2.8056583404541016, + "logps/chosen": -357.27496337890625, + "logps/rejected": -749.208984375, + "loss": 0.0105, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.000709533691406, + "rewards/margins": 20.618499755859375, + "rewards/rejected": -24.61920928955078, + "step": 1373 + }, + { + "epoch": 0.8547433903576983, + "grad_norm": 23.893159866333008, + "learning_rate": 3.973029045643154e-06, + "logits/chosen": 2.059338092803955, + "logits/rejected": 3.9922189712524414, + "logps/chosen": -662.5907592773438, + "logps/rejected": -978.7205810546875, + "loss": 0.1882, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.927433967590332, + "rewards/margins": 19.52408218383789, + "rewards/rejected": -26.451515197753906, + "step": 1374 + }, + { + "epoch": 0.8553654743390358, + "grad_norm": 0.0024465518072247505, + "learning_rate": 3.971876440756109e-06, + "logits/chosen": 0.2747696042060852, + "logits/rejected": 4.139282703399658, + "logps/chosen": -435.94207763671875, + "logps/rejected": -831.924560546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.307514190673828, + "rewards/margins": 16.516843795776367, + "rewards/rejected": -23.824356079101562, + "step": 1375 + }, + { + "epoch": 0.8559875583203732, + "grad_norm": 1.0032643871227265e-07, + "learning_rate": 3.970723835869064e-06, + "logits/chosen": -0.6229652166366577, + "logits/rejected": 5.645219326019287, + "logps/chosen": -434.7634582519531, + "logps/rejected": -1070.2586669921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9480323791503906, + "rewards/margins": 32.68678283691406, + "rewards/rejected": -35.63481903076172, + "step": 1376 + }, + { + "epoch": 0.8566096423017108, + "grad_norm": 0.0009331091423518956, + "learning_rate": 3.96957123098202e-06, + "logits/chosen": 1.4642635583877563, + "logits/rejected": 3.1478796005249023, + "logps/chosen": -506.8468017578125, + "logps/rejected": -809.0045166015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.069552898406982, + "rewards/margins": 19.62548828125, + "rewards/rejected": -26.69504165649414, + "step": 1377 + }, + { + "epoch": 0.8572317262830482, + "grad_norm": 0.1357276737689972, + "learning_rate": 3.968418626094975e-06, + "logits/chosen": 2.1965925693511963, + "logits/rejected": 3.75270676612854, + "logps/chosen": -529.9581298828125, + "logps/rejected": -775.684814453125, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.251548767089844, + "rewards/margins": 17.33985710144043, + "rewards/rejected": -23.591405868530273, + "step": 1378 + }, + { + "epoch": 0.8578538102643857, + "grad_norm": 18.46625518798828, + "learning_rate": 3.96726602120793e-06, + "logits/chosen": -0.16634273529052734, + "logits/rejected": 2.594822883605957, + "logps/chosen": -565.5756225585938, + "logps/rejected": -926.483642578125, + "loss": 0.1594, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.502753257751465, + "rewards/margins": 20.557899475097656, + "rewards/rejected": -27.060651779174805, + "step": 1379 + }, + { + "epoch": 0.8584758942457231, + "grad_norm": 0.018732385709881783, + "learning_rate": 3.966113416320885e-06, + "logits/chosen": -0.2522784471511841, + "logits/rejected": 2.6350338459014893, + "logps/chosen": -499.401123046875, + "logps/rejected": -913.3438720703125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.384436130523682, + "rewards/margins": 24.542402267456055, + "rewards/rejected": -30.92683982849121, + "step": 1380 + }, + { + "epoch": 0.8590979782270607, + "grad_norm": 2.273261547088623, + "learning_rate": 3.9649608114338405e-06, + "logits/chosen": -1.5306123495101929, + "logits/rejected": 2.4383885860443115, + "logps/chosen": -394.1301574707031, + "logps/rejected": -719.6019287109375, + "loss": 0.0118, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.67462158203125, + "rewards/margins": 16.036300659179688, + "rewards/rejected": -25.710922241210938, + "step": 1381 + }, + { + "epoch": 0.8597200622083981, + "grad_norm": 0.14083699882030487, + "learning_rate": 3.963808206546796e-06, + "logits/chosen": 1.4821040630340576, + "logits/rejected": 3.562310218811035, + "logps/chosen": -630.3826904296875, + "logps/rejected": -1096.625732421875, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.841997146606445, + "rewards/margins": 29.525222778320312, + "rewards/rejected": -38.367218017578125, + "step": 1382 + }, + { + "epoch": 0.8603421461897356, + "grad_norm": 0.49102070927619934, + "learning_rate": 3.962655601659751e-06, + "logits/chosen": -1.6309912204742432, + "logits/rejected": 2.7926807403564453, + "logps/chosen": -375.1983947753906, + "logps/rejected": -995.9594116210938, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.588959693908691, + "rewards/margins": 31.954254150390625, + "rewards/rejected": -37.543212890625, + "step": 1383 + }, + { + "epoch": 0.860964230171073, + "grad_norm": 1.9696420431137085, + "learning_rate": 3.961502996772706e-06, + "logits/chosen": 1.5886688232421875, + "logits/rejected": 4.186314582824707, + "logps/chosen": -483.3215637207031, + "logps/rejected": -956.5115966796875, + "loss": 0.0087, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.810182571411133, + "rewards/margins": 23.55510711669922, + "rewards/rejected": -29.36528968811035, + "step": 1384 + }, + { + "epoch": 0.8615863141524106, + "grad_norm": 3.1318552494049072, + "learning_rate": 3.960350391885661e-06, + "logits/chosen": 1.087369680404663, + "logits/rejected": 1.6794929504394531, + "logps/chosen": -620.4326171875, + "logps/rejected": -852.4151000976562, + "loss": 0.0154, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.948686599731445, + "rewards/margins": 20.768470764160156, + "rewards/rejected": -29.71715545654297, + "step": 1385 + }, + { + "epoch": 0.862208398133748, + "grad_norm": 7.456227467628196e-05, + "learning_rate": 3.959197786998617e-06, + "logits/chosen": -0.2758218050003052, + "logits/rejected": 3.6374728679656982, + "logps/chosen": -510.7708740234375, + "logps/rejected": -986.722412109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.5473737716674805, + "rewards/margins": 26.101940155029297, + "rewards/rejected": -32.649314880371094, + "step": 1386 + }, + { + "epoch": 0.8628304821150855, + "grad_norm": 0.1047658622264862, + "learning_rate": 3.958045182111573e-06, + "logits/chosen": 0.1710960417985916, + "logits/rejected": 2.208815336227417, + "logps/chosen": -611.6002197265625, + "logps/rejected": -1039.8455810546875, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.992301940917969, + "rewards/margins": 29.51941680908203, + "rewards/rejected": -39.51171875, + "step": 1387 + }, + { + "epoch": 0.8634525660964231, + "grad_norm": 0.0017270062817260623, + "learning_rate": 3.956892577224528e-06, + "logits/chosen": 0.7946970462799072, + "logits/rejected": 1.492017388343811, + "logps/chosen": -634.5062255859375, + "logps/rejected": -867.8536376953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.7527494430542, + "rewards/margins": 20.600440979003906, + "rewards/rejected": -31.353191375732422, + "step": 1388 + }, + { + "epoch": 0.8640746500777605, + "grad_norm": 23.987987518310547, + "learning_rate": 3.955739972337483e-06, + "logits/chosen": -1.741621494293213, + "logits/rejected": 2.635464906692505, + "logps/chosen": -403.216552734375, + "logps/rejected": -912.2803955078125, + "loss": 0.1765, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.126017570495605, + "rewards/margins": 22.75326919555664, + "rewards/rejected": -30.879287719726562, + "step": 1389 + }, + { + "epoch": 0.864696734059098, + "grad_norm": 3.4604811668395996, + "learning_rate": 3.954587367450438e-06, + "logits/chosen": -1.273258090019226, + "logits/rejected": 3.6706578731536865, + "logps/chosen": -359.53375244140625, + "logps/rejected": -844.9307861328125, + "loss": 0.1569, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.742606163024902, + "rewards/margins": 26.624191284179688, + "rewards/rejected": -32.366798400878906, + "step": 1390 + }, + { + "epoch": 0.8653188180404354, + "grad_norm": 8.28245174488984e-05, + "learning_rate": 3.953434762563394e-06, + "logits/chosen": 0.8197730779647827, + "logits/rejected": 3.010805606842041, + "logps/chosen": -598.204345703125, + "logps/rejected": -1058.3233642578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.552316188812256, + "rewards/margins": 30.922182083129883, + "rewards/rejected": -37.47449493408203, + "step": 1391 + }, + { + "epoch": 0.865940902021773, + "grad_norm": 36.1810417175293, + "learning_rate": 3.952282157676349e-06, + "logits/chosen": 4.309150695800781, + "logits/rejected": 6.092033386230469, + "logps/chosen": -792.2166137695312, + "logps/rejected": -1117.3037109375, + "loss": 0.6571, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.301803588867188, + "rewards/margins": 18.393749237060547, + "rewards/rejected": -28.695552825927734, + "step": 1392 + }, + { + "epoch": 0.8665629860031104, + "grad_norm": 7.91036436567083e-05, + "learning_rate": 3.951129552789304e-06, + "logits/chosen": -0.3365095853805542, + "logits/rejected": 3.304396629333496, + "logps/chosen": -554.6954345703125, + "logps/rejected": -1004.7761840820312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.135865211486816, + "rewards/margins": 28.570690155029297, + "rewards/rejected": -37.70655822753906, + "step": 1393 + }, + { + "epoch": 0.8671850699844479, + "grad_norm": 8.95252513885498, + "learning_rate": 3.949976947902259e-06, + "logits/chosen": -1.6988708972930908, + "logits/rejected": 2.7335429191589355, + "logps/chosen": -381.9712829589844, + "logps/rejected": -769.756103515625, + "loss": 0.0344, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.227147579193115, + "rewards/margins": 21.232027053833008, + "rewards/rejected": -27.459177017211914, + "step": 1394 + }, + { + "epoch": 0.8678071539657853, + "grad_norm": 0.00047960656229406595, + "learning_rate": 3.9488243430152145e-06, + "logits/chosen": 1.272325873374939, + "logits/rejected": 4.848588466644287, + "logps/chosen": -473.9206237792969, + "logps/rejected": -895.236328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.872984886169434, + "rewards/margins": 26.865549087524414, + "rewards/rejected": -33.73853302001953, + "step": 1395 + }, + { + "epoch": 0.8684292379471229, + "grad_norm": 32.17039489746094, + "learning_rate": 3.94767173812817e-06, + "logits/chosen": 1.9787907600402832, + "logits/rejected": 5.032547473907471, + "logps/chosen": -563.3594360351562, + "logps/rejected": -925.163330078125, + "loss": 0.8672, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.483692169189453, + "rewards/margins": 18.271421432495117, + "rewards/rejected": -25.755115509033203, + "step": 1396 + }, + { + "epoch": 0.8690513219284604, + "grad_norm": 34.24311447143555, + "learning_rate": 3.946519133241125e-06, + "logits/chosen": -1.7461861371994019, + "logits/rejected": 1.7403262853622437, + "logps/chosen": -553.9136352539062, + "logps/rejected": -946.935302734375, + "loss": 0.2911, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.655351638793945, + "rewards/margins": 22.64749526977539, + "rewards/rejected": -31.302845001220703, + "step": 1397 + }, + { + "epoch": 0.8696734059097978, + "grad_norm": 29.2575740814209, + "learning_rate": 3.94536652835408e-06, + "logits/chosen": 0.19249913096427917, + "logits/rejected": 3.8216664791107178, + "logps/chosen": -437.7361145019531, + "logps/rejected": -935.0411376953125, + "loss": 0.4214, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.826973915100098, + "rewards/margins": 25.423913955688477, + "rewards/rejected": -30.250885009765625, + "step": 1398 + }, + { + "epoch": 0.8702954898911353, + "grad_norm": 28.614089965820312, + "learning_rate": 3.944213923467035e-06, + "logits/chosen": 1.4811843633651733, + "logits/rejected": 4.661951065063477, + "logps/chosen": -563.6171875, + "logps/rejected": -973.4630126953125, + "loss": 0.9317, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.618762969970703, + "rewards/margins": 22.27410888671875, + "rewards/rejected": -26.892871856689453, + "step": 1399 + }, + { + "epoch": 0.8709175738724728, + "grad_norm": 0.4589652717113495, + "learning_rate": 3.943061318579991e-06, + "logits/chosen": 0.9266824722290039, + "logits/rejected": 2.3370745182037354, + "logps/chosen": -574.8289184570312, + "logps/rejected": -844.5990600585938, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.61984920501709, + "rewards/margins": 17.409948348999023, + "rewards/rejected": -27.029796600341797, + "step": 1400 + }, + { + "epoch": 0.8715396578538103, + "grad_norm": 27.476842880249023, + "learning_rate": 3.941908713692946e-06, + "logits/chosen": 1.1406919956207275, + "logits/rejected": 3.6667251586914062, + "logps/chosen": -604.9240112304688, + "logps/rejected": -743.1671142578125, + "loss": 0.4692, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.29455852508545, + "rewards/margins": 12.168575286865234, + "rewards/rejected": -20.463132858276367, + "step": 1401 + }, + { + "epoch": 0.8721617418351477, + "grad_norm": 17.606111526489258, + "learning_rate": 3.940756108805902e-06, + "logits/chosen": 1.7271714210510254, + "logits/rejected": 3.771793842315674, + "logps/chosen": -625.2269287109375, + "logps/rejected": -941.1549072265625, + "loss": 0.1015, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.325542449951172, + "rewards/margins": 25.147600173950195, + "rewards/rejected": -32.473140716552734, + "step": 1402 + }, + { + "epoch": 0.8727838258164852, + "grad_norm": 0.0003857784904539585, + "learning_rate": 3.939603503918857e-06, + "logits/chosen": -1.2490999698638916, + "logits/rejected": 2.748063087463379, + "logps/chosen": -391.865966796875, + "logps/rejected": -883.5145263671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.2403740882873535, + "rewards/margins": 22.9473876953125, + "rewards/rejected": -27.187763214111328, + "step": 1403 + }, + { + "epoch": 0.8734059097978227, + "grad_norm": 16.675168991088867, + "learning_rate": 3.938450899031812e-06, + "logits/chosen": -0.7203967571258545, + "logits/rejected": 2.4281888008117676, + "logps/chosen": -406.6605224609375, + "logps/rejected": -737.8378295898438, + "loss": 0.147, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.123994827270508, + "rewards/margins": 14.674833297729492, + "rewards/rejected": -18.798828125, + "step": 1404 + }, + { + "epoch": 0.8740279937791602, + "grad_norm": 0.03721201419830322, + "learning_rate": 3.937298294144768e-06, + "logits/chosen": -1.2863411903381348, + "logits/rejected": 5.00859260559082, + "logps/chosen": -428.8861083984375, + "logps/rejected": -1081.917724609375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.048243522644043, + "rewards/margins": 28.324411392211914, + "rewards/rejected": -35.372657775878906, + "step": 1405 + }, + { + "epoch": 0.8746500777604976, + "grad_norm": 24.92822265625, + "learning_rate": 3.936145689257723e-06, + "logits/chosen": 1.4132871627807617, + "logits/rejected": 2.852602481842041, + "logps/chosen": -564.416748046875, + "logps/rejected": -827.8515014648438, + "loss": 0.2008, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.818817138671875, + "rewards/margins": 17.714767456054688, + "rewards/rejected": -25.533586502075195, + "step": 1406 + }, + { + "epoch": 0.8752721617418352, + "grad_norm": 6.125887870788574, + "learning_rate": 3.934993084370678e-06, + "logits/chosen": -1.2724628448486328, + "logits/rejected": 3.4561619758605957, + "logps/chosen": -458.8982238769531, + "logps/rejected": -891.2723388671875, + "loss": 0.0211, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.694085121154785, + "rewards/margins": 23.815200805664062, + "rewards/rejected": -33.50928497314453, + "step": 1407 + }, + { + "epoch": 0.8758942457231726, + "grad_norm": 0.11236572265625, + "learning_rate": 3.933840479483633e-06, + "logits/chosen": -1.3016149997711182, + "logits/rejected": 2.4662973880767822, + "logps/chosen": -342.9910583496094, + "logps/rejected": -818.015380859375, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.7304840087890625, + "rewards/margins": 26.967788696289062, + "rewards/rejected": -33.698272705078125, + "step": 1408 + }, + { + "epoch": 0.8765163297045101, + "grad_norm": 41.05025100708008, + "learning_rate": 3.9326878745965885e-06, + "logits/chosen": 0.767713189125061, + "logits/rejected": 4.092617511749268, + "logps/chosen": -576.6070556640625, + "logps/rejected": -1000.982421875, + "loss": 0.9999, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.53645133972168, + "rewards/margins": 25.861608505249023, + "rewards/rejected": -35.3980598449707, + "step": 1409 + }, + { + "epoch": 0.8771384136858476, + "grad_norm": 0.00035387437674216926, + "learning_rate": 3.931535269709544e-06, + "logits/chosen": -0.6754996180534363, + "logits/rejected": 4.191483497619629, + "logps/chosen": -348.2604675292969, + "logps/rejected": -981.4669189453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.8978424072265625, + "rewards/margins": 30.909826278686523, + "rewards/rejected": -36.80767059326172, + "step": 1410 + }, + { + "epoch": 0.8777604976671851, + "grad_norm": 0.04639606177806854, + "learning_rate": 3.930382664822499e-06, + "logits/chosen": 0.5568681955337524, + "logits/rejected": 3.2366063594818115, + "logps/chosen": -395.332763671875, + "logps/rejected": -721.87109375, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.578215599060059, + "rewards/margins": 20.79397201538086, + "rewards/rejected": -26.372188568115234, + "step": 1411 + }, + { + "epoch": 0.8783825816485226, + "grad_norm": 16.890092849731445, + "learning_rate": 3.929230059935454e-06, + "logits/chosen": 1.8669103384017944, + "logits/rejected": 2.9881675243377686, + "logps/chosen": -683.0795288085938, + "logps/rejected": -946.185546875, + "loss": 0.0631, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.376381874084473, + "rewards/margins": 20.531028747558594, + "rewards/rejected": -30.90740966796875, + "step": 1412 + }, + { + "epoch": 0.87900466562986, + "grad_norm": 18.696821212768555, + "learning_rate": 3.928077455048409e-06, + "logits/chosen": 1.8616418838500977, + "logits/rejected": 3.8562190532684326, + "logps/chosen": -602.8616943359375, + "logps/rejected": -1019.068603515625, + "loss": 0.2133, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.290316581726074, + "rewards/margins": 27.61571502685547, + "rewards/rejected": -33.90603256225586, + "step": 1413 + }, + { + "epoch": 0.8796267496111975, + "grad_norm": 0.017830608412623405, + "learning_rate": 3.926924850161365e-06, + "logits/chosen": -1.609785795211792, + "logits/rejected": 3.152907609939575, + "logps/chosen": -446.86395263671875, + "logps/rejected": -1040.036865234375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.331204414367676, + "rewards/margins": 32.17317199707031, + "rewards/rejected": -39.50437545776367, + "step": 1414 + }, + { + "epoch": 0.880248833592535, + "grad_norm": 0.01888345554471016, + "learning_rate": 3.92577224527432e-06, + "logits/chosen": 0.08871287107467651, + "logits/rejected": 3.6794910430908203, + "logps/chosen": -519.1620483398438, + "logps/rejected": -920.4866333007812, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.557089805603027, + "rewards/margins": 16.47603988647461, + "rewards/rejected": -25.033130645751953, + "step": 1415 + }, + { + "epoch": 0.8808709175738725, + "grad_norm": 0.00010415662109153345, + "learning_rate": 3.924619640387275e-06, + "logits/chosen": -0.11627277731895447, + "logits/rejected": 2.4083340167999268, + "logps/chosen": -451.6080322265625, + "logps/rejected": -901.6516723632812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.407066345214844, + "rewards/margins": 26.67092514038086, + "rewards/rejected": -34.07799530029297, + "step": 1416 + }, + { + "epoch": 0.8814930015552099, + "grad_norm": 0.021583333611488342, + "learning_rate": 3.923467035500231e-06, + "logits/chosen": 1.4402005672454834, + "logits/rejected": 2.0313756465911865, + "logps/chosen": -656.1533813476562, + "logps/rejected": -1004.5067138671875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.986201286315918, + "rewards/margins": 25.95221710205078, + "rewards/rejected": -36.938419342041016, + "step": 1417 + }, + { + "epoch": 0.8821150855365474, + "grad_norm": 0.7738665342330933, + "learning_rate": 3.922314430613186e-06, + "logits/chosen": 3.2806267738342285, + "logits/rejected": 3.2303647994995117, + "logps/chosen": -668.647216796875, + "logps/rejected": -898.922119140625, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.184161186218262, + "rewards/margins": 24.993419647216797, + "rewards/rejected": -34.17757797241211, + "step": 1418 + }, + { + "epoch": 0.882737169517885, + "grad_norm": 23.072006225585938, + "learning_rate": 3.921161825726142e-06, + "logits/chosen": 0.755908727645874, + "logits/rejected": 2.168006658554077, + "logps/chosen": -565.05810546875, + "logps/rejected": -842.9892578125, + "loss": 0.4122, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.335588455200195, + "rewards/margins": 20.874221801757812, + "rewards/rejected": -26.20981216430664, + "step": 1419 + }, + { + "epoch": 0.8833592534992224, + "grad_norm": 0.002978462493047118, + "learning_rate": 3.920009220839097e-06, + "logits/chosen": -0.32014715671539307, + "logits/rejected": 1.7139149904251099, + "logps/chosen": -513.91455078125, + "logps/rejected": -936.3826293945312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.227300643920898, + "rewards/margins": 27.83060073852539, + "rewards/rejected": -32.057899475097656, + "step": 1420 + }, + { + "epoch": 0.8839813374805598, + "grad_norm": 30.52089500427246, + "learning_rate": 3.918856615952052e-06, + "logits/chosen": 2.750991106033325, + "logits/rejected": 2.399867534637451, + "logps/chosen": -659.213623046875, + "logps/rejected": -921.962158203125, + "loss": 0.2245, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.95782470703125, + "rewards/margins": 23.880416870117188, + "rewards/rejected": -32.83824157714844, + "step": 1421 + }, + { + "epoch": 0.8846034214618974, + "grad_norm": 0.259468138217926, + "learning_rate": 3.917704011065007e-06, + "logits/chosen": 1.1308553218841553, + "logits/rejected": 2.8450875282287598, + "logps/chosen": -667.422607421875, + "logps/rejected": -841.1114501953125, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.422941207885742, + "rewards/margins": 15.405401229858398, + "rewards/rejected": -24.82834243774414, + "step": 1422 + }, + { + "epoch": 0.8852255054432349, + "grad_norm": 10.124845504760742, + "learning_rate": 3.9165514061779625e-06, + "logits/chosen": 1.7226544618606567, + "logits/rejected": 4.723477840423584, + "logps/chosen": -534.2507934570312, + "logps/rejected": -866.4068603515625, + "loss": 0.0391, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.586525917053223, + "rewards/margins": 19.207855224609375, + "rewards/rejected": -27.79438018798828, + "step": 1423 + }, + { + "epoch": 0.8858475894245723, + "grad_norm": 0.558952808380127, + "learning_rate": 3.915398801290918e-06, + "logits/chosen": -0.2260989248752594, + "logits/rejected": 2.9631381034851074, + "logps/chosen": -496.76678466796875, + "logps/rejected": -953.5576782226562, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.71063232421875, + "rewards/margins": 23.282506942749023, + "rewards/rejected": -31.993141174316406, + "step": 1424 + }, + { + "epoch": 0.8864696734059098, + "grad_norm": 16.527063369750977, + "learning_rate": 3.914246196403873e-06, + "logits/chosen": 3.2674198150634766, + "logits/rejected": 5.55305290222168, + "logps/chosen": -580.180908203125, + "logps/rejected": -1039.1260986328125, + "loss": 0.1108, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.649737358093262, + "rewards/margins": 31.834278106689453, + "rewards/rejected": -42.48401641845703, + "step": 1425 + }, + { + "epoch": 0.8870917573872473, + "grad_norm": 0.042061157524585724, + "learning_rate": 3.913093591516828e-06, + "logits/chosen": 0.06921637058258057, + "logits/rejected": 3.0379743576049805, + "logps/chosen": -557.9514770507812, + "logps/rejected": -997.2664794921875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.284543514251709, + "rewards/margins": 23.125932693481445, + "rewards/rejected": -30.41047477722168, + "step": 1426 + }, + { + "epoch": 0.8877138413685848, + "grad_norm": 1.3195871114730835, + "learning_rate": 3.911940986629783e-06, + "logits/chosen": 1.0034469366073608, + "logits/rejected": 3.320380687713623, + "logps/chosen": -512.767822265625, + "logps/rejected": -833.2984619140625, + "loss": 0.0101, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.851565361022949, + "rewards/margins": 18.35439109802246, + "rewards/rejected": -25.20595932006836, + "step": 1427 + }, + { + "epoch": 0.8883359253499222, + "grad_norm": 0.3945462703704834, + "learning_rate": 3.910788381742739e-06, + "logits/chosen": -1.1836017370224, + "logits/rejected": 3.2389326095581055, + "logps/chosen": -339.166748046875, + "logps/rejected": -919.89599609375, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.06601333618164, + "rewards/margins": 26.293392181396484, + "rewards/rejected": -34.359405517578125, + "step": 1428 + }, + { + "epoch": 0.8889580093312597, + "grad_norm": 5.651898391079158e-06, + "learning_rate": 3.909635776855694e-06, + "logits/chosen": -2.461566686630249, + "logits/rejected": 3.693530321121216, + "logps/chosen": -466.4064636230469, + "logps/rejected": -1155.3829345703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.28778076171875, + "rewards/margins": 35.794368743896484, + "rewards/rejected": -43.08214569091797, + "step": 1429 + }, + { + "epoch": 0.8895800933125972, + "grad_norm": 3.520720565575175e-05, + "learning_rate": 3.908483171968649e-06, + "logits/chosen": 2.5713906288146973, + "logits/rejected": 3.6118719577789307, + "logps/chosen": -649.1976318359375, + "logps/rejected": -1010.8802490234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.335611343383789, + "rewards/margins": 29.236289978027344, + "rewards/rejected": -38.571903228759766, + "step": 1430 + }, + { + "epoch": 0.8902021772939347, + "grad_norm": 0.06428039073944092, + "learning_rate": 3.907330567081605e-06, + "logits/chosen": -0.25640755891799927, + "logits/rejected": 2.151702404022217, + "logps/chosen": -601.4017944335938, + "logps/rejected": -966.105224609375, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.176769256591797, + "rewards/margins": 27.328582763671875, + "rewards/rejected": -35.505348205566406, + "step": 1431 + }, + { + "epoch": 0.8908242612752721, + "grad_norm": 0.6199120879173279, + "learning_rate": 3.90617796219456e-06, + "logits/chosen": 2.1671009063720703, + "logits/rejected": 3.3351383209228516, + "logps/chosen": -604.8416748046875, + "logps/rejected": -902.930419921875, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.571571350097656, + "rewards/margins": 23.950511932373047, + "rewards/rejected": -32.5220832824707, + "step": 1432 + }, + { + "epoch": 0.8914463452566096, + "grad_norm": 2.534109115600586, + "learning_rate": 3.905025357307516e-06, + "logits/chosen": -1.5430231094360352, + "logits/rejected": 2.451169490814209, + "logps/chosen": -502.9190979003906, + "logps/rejected": -919.2320556640625, + "loss": 0.0141, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.408283233642578, + "rewards/margins": 18.22637176513672, + "rewards/rejected": -26.634654998779297, + "step": 1433 + }, + { + "epoch": 0.8920684292379472, + "grad_norm": 0.4797965884208679, + "learning_rate": 3.903872752420471e-06, + "logits/chosen": -0.4005916714668274, + "logits/rejected": 3.6412811279296875, + "logps/chosen": -618.4359130859375, + "logps/rejected": -1001.5445556640625, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.378423690795898, + "rewards/margins": 21.48226547241211, + "rewards/rejected": -32.86069107055664, + "step": 1434 + }, + { + "epoch": 0.8926905132192846, + "grad_norm": 33.51510238647461, + "learning_rate": 3.902720147533426e-06, + "logits/chosen": 0.43302229046821594, + "logits/rejected": 3.2057642936706543, + "logps/chosen": -321.2042236328125, + "logps/rejected": -509.62615966796875, + "loss": 0.4201, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.494220733642578, + "rewards/margins": 10.129779815673828, + "rewards/rejected": -17.624000549316406, + "step": 1435 + }, + { + "epoch": 0.8933125972006221, + "grad_norm": 0.0015452936058863997, + "learning_rate": 3.901567542646381e-06, + "logits/chosen": 1.8499112129211426, + "logits/rejected": 0.8464174866676331, + "logps/chosen": -620.2147216796875, + "logps/rejected": -714.9520263671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.805253028869629, + "rewards/margins": 18.28689193725586, + "rewards/rejected": -27.092144012451172, + "step": 1436 + }, + { + "epoch": 0.8939346811819595, + "grad_norm": 22.18968391418457, + "learning_rate": 3.9004149377593365e-06, + "logits/chosen": 1.2555292844772339, + "logits/rejected": 4.441434860229492, + "logps/chosen": -566.209228515625, + "logps/rejected": -969.3631591796875, + "loss": 0.1819, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.221857070922852, + "rewards/margins": 23.401203155517578, + "rewards/rejected": -33.6230583190918, + "step": 1437 + }, + { + "epoch": 0.8945567651632971, + "grad_norm": 0.0015446230536326766, + "learning_rate": 3.899262332872292e-06, + "logits/chosen": 0.08676552772521973, + "logits/rejected": 2.6135525703430176, + "logps/chosen": -444.3998107910156, + "logps/rejected": -775.56591796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.776384353637695, + "rewards/margins": 18.37545394897461, + "rewards/rejected": -25.151840209960938, + "step": 1438 + }, + { + "epoch": 0.8951788491446345, + "grad_norm": 20.202762603759766, + "learning_rate": 3.898109727985247e-06, + "logits/chosen": -2.568183422088623, + "logits/rejected": 2.953566312789917, + "logps/chosen": -487.15130615234375, + "logps/rejected": -918.3087158203125, + "loss": 0.1565, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.887203216552734, + "rewards/margins": 16.490564346313477, + "rewards/rejected": -26.377765655517578, + "step": 1439 + }, + { + "epoch": 0.895800933125972, + "grad_norm": 23.546581268310547, + "learning_rate": 3.896957123098202e-06, + "logits/chosen": 1.4027090072631836, + "logits/rejected": 5.3587188720703125, + "logps/chosen": -492.6435546875, + "logps/rejected": -915.1826782226562, + "loss": 0.2141, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.819850921630859, + "rewards/margins": 22.011383056640625, + "rewards/rejected": -28.831233978271484, + "step": 1440 + }, + { + "epoch": 0.8964230171073095, + "grad_norm": 0.7575953602790833, + "learning_rate": 3.895804518211157e-06, + "logits/chosen": 1.5283441543579102, + "logits/rejected": 4.948976039886475, + "logps/chosen": -611.407470703125, + "logps/rejected": -1001.5595092773438, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.188275337219238, + "rewards/margins": 19.500892639160156, + "rewards/rejected": -28.689167022705078, + "step": 1441 + }, + { + "epoch": 0.897045101088647, + "grad_norm": 0.41505351662635803, + "learning_rate": 3.894651913324113e-06, + "logits/chosen": 1.9158740043640137, + "logits/rejected": 3.627244472503662, + "logps/chosen": -689.9222412109375, + "logps/rejected": -981.9581298828125, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.818538665771484, + "rewards/margins": 17.018362045288086, + "rewards/rejected": -22.836902618408203, + "step": 1442 + }, + { + "epoch": 0.8976671850699844, + "grad_norm": 0.14478036761283875, + "learning_rate": 3.893499308437068e-06, + "logits/chosen": 0.055893540382385254, + "logits/rejected": 2.3074419498443604, + "logps/chosen": -507.1900939941406, + "logps/rejected": -802.4224853515625, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.000877380371094, + "rewards/margins": 20.082687377929688, + "rewards/rejected": -26.08356475830078, + "step": 1443 + }, + { + "epoch": 0.8982892690513219, + "grad_norm": 3.7201330087555107e-06, + "learning_rate": 3.892346703550023e-06, + "logits/chosen": -1.7229702472686768, + "logits/rejected": 3.9929304122924805, + "logps/chosen": -359.07342529296875, + "logps/rejected": -1021.4876098632812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.652368545532227, + "rewards/margins": 32.61327362060547, + "rewards/rejected": -40.26564025878906, + "step": 1444 + }, + { + "epoch": 0.8989113530326595, + "grad_norm": 0.00015392265049740672, + "learning_rate": 3.891194098662978e-06, + "logits/chosen": -1.7059762477874756, + "logits/rejected": 2.676309585571289, + "logps/chosen": -373.9096374511719, + "logps/rejected": -994.46923828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.087657928466797, + "rewards/margins": 29.257225036621094, + "rewards/rejected": -37.34488296508789, + "step": 1445 + }, + { + "epoch": 0.8995334370139969, + "grad_norm": 4.5255632400512695, + "learning_rate": 3.890041493775934e-06, + "logits/chosen": 2.433472156524658, + "logits/rejected": 3.5981287956237793, + "logps/chosen": -639.4766845703125, + "logps/rejected": -950.7479858398438, + "loss": 0.0187, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.024510383605957, + "rewards/margins": 22.969099044799805, + "rewards/rejected": -34.99361038208008, + "step": 1446 + }, + { + "epoch": 0.9001555209953344, + "grad_norm": 24.062063217163086, + "learning_rate": 3.88888888888889e-06, + "logits/chosen": 3.22426700592041, + "logits/rejected": 5.0054030418396, + "logps/chosen": -579.1534423828125, + "logps/rejected": -883.1697387695312, + "loss": 0.1114, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.831135272979736, + "rewards/margins": 21.391578674316406, + "rewards/rejected": -28.222713470458984, + "step": 1447 + }, + { + "epoch": 0.9007776049766718, + "grad_norm": 9.99648982542567e-05, + "learning_rate": 3.887736284001845e-06, + "logits/chosen": -1.6023995876312256, + "logits/rejected": 4.028076648712158, + "logps/chosen": -378.3094482421875, + "logps/rejected": -946.167724609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.24447774887085, + "rewards/margins": 29.51643943786621, + "rewards/rejected": -36.76091766357422, + "step": 1448 + }, + { + "epoch": 0.9013996889580094, + "grad_norm": 6.691157341003418, + "learning_rate": 3.8865836791148e-06, + "logits/chosen": 1.337373971939087, + "logits/rejected": 3.3517513275146484, + "logps/chosen": -594.4137573242188, + "logps/rejected": -1030.775634765625, + "loss": 0.0322, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.6077680587768555, + "rewards/margins": 24.80924415588379, + "rewards/rejected": -32.41700744628906, + "step": 1449 + }, + { + "epoch": 0.9020217729393468, + "grad_norm": 0.0007598842494189739, + "learning_rate": 3.885431074227755e-06, + "logits/chosen": 0.9249590039253235, + "logits/rejected": 2.9459688663482666, + "logps/chosen": -374.2164306640625, + "logps/rejected": -668.7687377929688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.251091480255127, + "rewards/margins": 16.075878143310547, + "rewards/rejected": -20.32697105407715, + "step": 1450 + }, + { + "epoch": 0.9026438569206843, + "grad_norm": 0.0011190170189365745, + "learning_rate": 3.8842784693407105e-06, + "logits/chosen": -0.2251252681016922, + "logits/rejected": 3.9785103797912598, + "logps/chosen": -447.6703186035156, + "logps/rejected": -956.0332641601562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.806117296218872, + "rewards/margins": 30.147293090820312, + "rewards/rejected": -33.95341110229492, + "step": 1451 + }, + { + "epoch": 0.9032659409020217, + "grad_norm": 14.530166625976562, + "learning_rate": 3.883125864453666e-06, + "logits/chosen": 1.0380009412765503, + "logits/rejected": 4.930631637573242, + "logps/chosen": -557.4342041015625, + "logps/rejected": -968.177978515625, + "loss": 0.0801, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.003500938415527, + "rewards/margins": 20.134029388427734, + "rewards/rejected": -32.13752746582031, + "step": 1452 + }, + { + "epoch": 0.9038880248833593, + "grad_norm": 33.54362106323242, + "learning_rate": 3.881973259566621e-06, + "logits/chosen": -1.212352991104126, + "logits/rejected": 2.3545548915863037, + "logps/chosen": -355.854736328125, + "logps/rejected": -823.626220703125, + "loss": 0.9964, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.923611164093018, + "rewards/margins": 25.169193267822266, + "rewards/rejected": -32.092803955078125, + "step": 1453 + }, + { + "epoch": 0.9045101088646967, + "grad_norm": 36.934478759765625, + "learning_rate": 3.880820654679576e-06, + "logits/chosen": -0.6643639802932739, + "logits/rejected": 3.543142080307007, + "logps/chosen": -580.7192993164062, + "logps/rejected": -1017.1659545898438, + "loss": 1.1885, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.022318840026855, + "rewards/margins": 23.986433029174805, + "rewards/rejected": -35.008750915527344, + "step": 1454 + }, + { + "epoch": 0.9051321928460342, + "grad_norm": 18.589052200317383, + "learning_rate": 3.879668049792531e-06, + "logits/chosen": -0.07116609811782837, + "logits/rejected": 3.4631710052490234, + "logps/chosen": -381.1407470703125, + "logps/rejected": -778.381103515625, + "loss": 0.0831, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.6300368309021, + "rewards/margins": 24.78908920288086, + "rewards/rejected": -29.41912841796875, + "step": 1455 + }, + { + "epoch": 0.9057542768273716, + "grad_norm": 20.014081954956055, + "learning_rate": 3.878515444905487e-06, + "logits/chosen": 0.9744256734848022, + "logits/rejected": 3.516145944595337, + "logps/chosen": -598.3582763671875, + "logps/rejected": -893.91845703125, + "loss": 0.2065, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.06954574584961, + "rewards/margins": 14.647180557250977, + "rewards/rejected": -24.716726303100586, + "step": 1456 + }, + { + "epoch": 0.9063763608087092, + "grad_norm": 0.7139878869056702, + "learning_rate": 3.877362840018442e-06, + "logits/chosen": -1.604265570640564, + "logits/rejected": 2.56709623336792, + "logps/chosen": -441.33447265625, + "logps/rejected": -902.9589233398438, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.15466594696045, + "rewards/margins": 26.56195640563965, + "rewards/rejected": -34.71662139892578, + "step": 1457 + }, + { + "epoch": 0.9069984447900467, + "grad_norm": 7.607209408888593e-05, + "learning_rate": 3.876210235131397e-06, + "logits/chosen": -0.12832467257976532, + "logits/rejected": 3.708998680114746, + "logps/chosen": -383.6351013183594, + "logps/rejected": -780.0354614257812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.427809238433838, + "rewards/margins": 17.41489028930664, + "rewards/rejected": -21.84269905090332, + "step": 1458 + }, + { + "epoch": 0.9076205287713841, + "grad_norm": 0.0008937264792621136, + "learning_rate": 3.875057630244352e-06, + "logits/chosen": -1.8712693452835083, + "logits/rejected": 2.955657482147217, + "logps/chosen": -354.0270690917969, + "logps/rejected": -975.659912109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.730475425720215, + "rewards/margins": 28.753705978393555, + "rewards/rejected": -35.48418045043945, + "step": 1459 + }, + { + "epoch": 0.9082426127527217, + "grad_norm": 0.16675208508968353, + "learning_rate": 3.873905025357308e-06, + "logits/chosen": -1.975113034248352, + "logits/rejected": 3.2344610691070557, + "logps/chosen": -351.33624267578125, + "logps/rejected": -852.4354248046875, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.95164680480957, + "rewards/margins": 20.913976669311523, + "rewards/rejected": -25.865623474121094, + "step": 1460 + }, + { + "epoch": 0.9088646967340591, + "grad_norm": 0.006307605188339949, + "learning_rate": 3.872752420470264e-06, + "logits/chosen": 0.9668619632720947, + "logits/rejected": 4.2649102210998535, + "logps/chosen": -405.6862487792969, + "logps/rejected": -878.3807983398438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.950449466705322, + "rewards/margins": 27.552230834960938, + "rewards/rejected": -34.502681732177734, + "step": 1461 + }, + { + "epoch": 0.9094867807153966, + "grad_norm": 0.003334360895678401, + "learning_rate": 3.871599815583219e-06, + "logits/chosen": -3.593207836151123, + "logits/rejected": 2.388051986694336, + "logps/chosen": -346.031982421875, + "logps/rejected": -930.6376953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.312501907348633, + "rewards/margins": 27.05126953125, + "rewards/rejected": -32.363773345947266, + "step": 1462 + }, + { + "epoch": 0.910108864696734, + "grad_norm": 12.493054389953613, + "learning_rate": 3.870447210696174e-06, + "logits/chosen": 0.8264709711074829, + "logits/rejected": 3.445063591003418, + "logps/chosen": -599.697998046875, + "logps/rejected": -1038.114990234375, + "loss": 0.0691, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.2918062210083, + "rewards/margins": 29.403207778930664, + "rewards/rejected": -37.69501495361328, + "step": 1463 + }, + { + "epoch": 0.9107309486780716, + "grad_norm": 5.892303943634033, + "learning_rate": 3.869294605809129e-06, + "logits/chosen": 2.750220537185669, + "logits/rejected": 4.762117385864258, + "logps/chosen": -643.5029907226562, + "logps/rejected": -941.763427734375, + "loss": 0.0254, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.043200492858887, + "rewards/margins": 20.676185607910156, + "rewards/rejected": -26.71938705444336, + "step": 1464 + }, + { + "epoch": 0.911353032659409, + "grad_norm": 0.4347749948501587, + "learning_rate": 3.8681420009220845e-06, + "logits/chosen": -0.7005534172058105, + "logits/rejected": 3.7231321334838867, + "logps/chosen": -591.067626953125, + "logps/rejected": -1191.287841796875, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.792040824890137, + "rewards/margins": 29.695411682128906, + "rewards/rejected": -43.48745346069336, + "step": 1465 + }, + { + "epoch": 0.9119751166407465, + "grad_norm": 35.06818771362305, + "learning_rate": 3.86698939603504e-06, + "logits/chosen": -0.31166747212409973, + "logits/rejected": 2.879892587661743, + "logps/chosen": -587.843505859375, + "logps/rejected": -1006.7908325195312, + "loss": 0.5774, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.827130317687988, + "rewards/margins": 23.738933563232422, + "rewards/rejected": -33.566062927246094, + "step": 1466 + }, + { + "epoch": 0.9125972006220839, + "grad_norm": 1.881179923657328e-05, + "learning_rate": 3.865836791147995e-06, + "logits/chosen": 0.7096707820892334, + "logits/rejected": 3.158099889755249, + "logps/chosen": -459.2037353515625, + "logps/rejected": -847.1054077148438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.284489631652832, + "rewards/margins": 27.181751251220703, + "rewards/rejected": -35.46623992919922, + "step": 1467 + }, + { + "epoch": 0.9132192846034215, + "grad_norm": 0.25267404317855835, + "learning_rate": 3.86468418626095e-06, + "logits/chosen": 0.695156991481781, + "logits/rejected": 4.628496170043945, + "logps/chosen": -482.04547119140625, + "logps/rejected": -1053.236083984375, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.188310146331787, + "rewards/margins": 29.162036895751953, + "rewards/rejected": -36.350345611572266, + "step": 1468 + }, + { + "epoch": 0.913841368584759, + "grad_norm": 45.62656784057617, + "learning_rate": 3.863531581373905e-06, + "logits/chosen": 0.6176962852478027, + "logits/rejected": 4.680180549621582, + "logps/chosen": -467.5579833984375, + "logps/rejected": -935.610595703125, + "loss": 1.2717, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.91832447052002, + "rewards/margins": 21.834449768066406, + "rewards/rejected": -31.752771377563477, + "step": 1469 + }, + { + "epoch": 0.9144634525660964, + "grad_norm": 2.9416592121124268, + "learning_rate": 3.862378976486861e-06, + "logits/chosen": 1.5381639003753662, + "logits/rejected": 2.0583224296569824, + "logps/chosen": -576.9505004882812, + "logps/rejected": -878.991943359375, + "loss": 0.0123, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.187313079833984, + "rewards/margins": 21.081104278564453, + "rewards/rejected": -32.26841735839844, + "step": 1470 + }, + { + "epoch": 0.9150855365474339, + "grad_norm": 29.226638793945312, + "learning_rate": 3.861226371599816e-06, + "logits/chosen": -0.7698392868041992, + "logits/rejected": 2.5437121391296387, + "logps/chosen": -477.9600830078125, + "logps/rejected": -946.5003051757812, + "loss": 0.5726, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.625056266784668, + "rewards/margins": 24.998863220214844, + "rewards/rejected": -33.62391662597656, + "step": 1471 + }, + { + "epoch": 0.9157076205287714, + "grad_norm": 42.338714599609375, + "learning_rate": 3.860073766712771e-06, + "logits/chosen": -1.9399776458740234, + "logits/rejected": 1.7948557138442993, + "logps/chosen": -435.4498291015625, + "logps/rejected": -864.4039916992188, + "loss": 0.588, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.043302536010742, + "rewards/margins": 22.714567184448242, + "rewards/rejected": -31.757871627807617, + "step": 1472 + }, + { + "epoch": 0.9163297045101089, + "grad_norm": 1.9460109967894823e-08, + "learning_rate": 3.858921161825726e-06, + "logits/chosen": -1.515357255935669, + "logits/rejected": 4.758892059326172, + "logps/chosen": -382.9659118652344, + "logps/rejected": -1205.88232421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.439264297485352, + "rewards/margins": 39.36029052734375, + "rewards/rejected": -43.799556732177734, + "step": 1473 + }, + { + "epoch": 0.9169517884914463, + "grad_norm": 0.0579552948474884, + "learning_rate": 3.8577685569386815e-06, + "logits/chosen": 2.0449562072753906, + "logits/rejected": 3.7542054653167725, + "logps/chosen": -670.1748657226562, + "logps/rejected": -958.8605346679688, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.759854316711426, + "rewards/margins": 21.060232162475586, + "rewards/rejected": -29.820087432861328, + "step": 1474 + }, + { + "epoch": 0.9175738724727839, + "grad_norm": 0.2062915861606598, + "learning_rate": 3.8566159520516376e-06, + "logits/chosen": 0.2341582179069519, + "logits/rejected": 3.144930124282837, + "logps/chosen": -577.846435546875, + "logps/rejected": -970.8562622070312, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.450343132019043, + "rewards/margins": 20.801151275634766, + "rewards/rejected": -31.251493453979492, + "step": 1475 + }, + { + "epoch": 0.9181959564541213, + "grad_norm": 0.007952794432640076, + "learning_rate": 3.855463347164593e-06, + "logits/chosen": 0.30161577463150024, + "logits/rejected": 3.102041244506836, + "logps/chosen": -563.1438598632812, + "logps/rejected": -955.537841796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.881620407104492, + "rewards/margins": 22.008378982543945, + "rewards/rejected": -31.889997482299805, + "step": 1476 + }, + { + "epoch": 0.9188180404354588, + "grad_norm": 7.2674581907961056e-09, + "learning_rate": 3.854310742277548e-06, + "logits/chosen": -3.6798012256622314, + "logits/rejected": 2.85562801361084, + "logps/chosen": -301.4128723144531, + "logps/rejected": -986.750244140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.19948673248291, + "rewards/margins": 34.309539794921875, + "rewards/rejected": -38.50902557373047, + "step": 1477 + }, + { + "epoch": 0.9194401244167962, + "grad_norm": 28.337913513183594, + "learning_rate": 3.853158137390503e-06, + "logits/chosen": -0.632785439491272, + "logits/rejected": 5.028564453125, + "logps/chosen": -345.7908935546875, + "logps/rejected": -858.96337890625, + "loss": 0.5314, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.487540245056152, + "rewards/margins": 20.401901245117188, + "rewards/rejected": -25.889440536499023, + "step": 1478 + }, + { + "epoch": 0.9200622083981338, + "grad_norm": 20.7117919921875, + "learning_rate": 3.8520055325034585e-06, + "logits/chosen": -2.059865713119507, + "logits/rejected": 3.133439540863037, + "logps/chosen": -542.2882080078125, + "logps/rejected": -954.7108154296875, + "loss": 0.1002, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.028863906860352, + "rewards/margins": 21.37855339050293, + "rewards/rejected": -31.40741729736328, + "step": 1479 + }, + { + "epoch": 0.9206842923794712, + "grad_norm": 6.594052314758301, + "learning_rate": 3.850852927616414e-06, + "logits/chosen": 0.9150658845901489, + "logits/rejected": 2.945340156555176, + "logps/chosen": -579.0211181640625, + "logps/rejected": -941.65283203125, + "loss": 0.0262, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.479080677032471, + "rewards/margins": 20.222869873046875, + "rewards/rejected": -27.70195198059082, + "step": 1480 + }, + { + "epoch": 0.9213063763608087, + "grad_norm": 27.423505783081055, + "learning_rate": 3.849700322729369e-06, + "logits/chosen": 1.4268765449523926, + "logits/rejected": 2.176144599914551, + "logps/chosen": -381.7594909667969, + "logps/rejected": -644.2747192382812, + "loss": 0.8959, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.049097537994385, + "rewards/margins": 18.318859100341797, + "rewards/rejected": -23.367958068847656, + "step": 1481 + }, + { + "epoch": 0.9219284603421461, + "grad_norm": 0.021746966987848282, + "learning_rate": 3.848547717842324e-06, + "logits/chosen": -0.9644485712051392, + "logits/rejected": 0.720867395401001, + "logps/chosen": -620.6685791015625, + "logps/rejected": -1114.089111328125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.114725112915039, + "rewards/margins": 27.469085693359375, + "rewards/rejected": -37.58380889892578, + "step": 1482 + }, + { + "epoch": 0.9225505443234837, + "grad_norm": 7.061457381496439e-06, + "learning_rate": 3.847395112955279e-06, + "logits/chosen": -1.9248368740081787, + "logits/rejected": 3.2156076431274414, + "logps/chosen": -459.8319396972656, + "logps/rejected": -1093.714599609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.586408615112305, + "rewards/margins": 35.24261474609375, + "rewards/rejected": -46.82902526855469, + "step": 1483 + }, + { + "epoch": 0.9231726283048212, + "grad_norm": 1.888171027530916e-05, + "learning_rate": 3.846242508068235e-06, + "logits/chosen": 2.8101515769958496, + "logits/rejected": 4.89240837097168, + "logps/chosen": -612.6300659179688, + "logps/rejected": -977.2906494140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.669929504394531, + "rewards/margins": 28.796072006225586, + "rewards/rejected": -36.465999603271484, + "step": 1484 + }, + { + "epoch": 0.9237947122861586, + "grad_norm": 1.2461988262657542e-05, + "learning_rate": 3.84508990318119e-06, + "logits/chosen": 1.7340483665466309, + "logits/rejected": 2.461452007293701, + "logps/chosen": -687.24658203125, + "logps/rejected": -1035.656005859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.96574592590332, + "rewards/margins": 32.33479690551758, + "rewards/rejected": -39.30054473876953, + "step": 1485 + }, + { + "epoch": 0.9244167962674961, + "grad_norm": 41.07564926147461, + "learning_rate": 3.843937298294145e-06, + "logits/chosen": 1.1576273441314697, + "logits/rejected": 3.3022708892822266, + "logps/chosen": -500.56304931640625, + "logps/rejected": -773.8848876953125, + "loss": 0.6384, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.655431747436523, + "rewards/margins": 11.647870063781738, + "rewards/rejected": -24.303302764892578, + "step": 1486 + }, + { + "epoch": 0.9250388802488336, + "grad_norm": 0.00020040707022417337, + "learning_rate": 3.8427846934071e-06, + "logits/chosen": 0.566953718662262, + "logits/rejected": 5.4430155754089355, + "logps/chosen": -459.21337890625, + "logps/rejected": -1017.710205078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.9802422523498535, + "rewards/margins": 29.258119583129883, + "rewards/rejected": -34.238365173339844, + "step": 1487 + }, + { + "epoch": 0.9256609642301711, + "grad_norm": 1.0271210670471191, + "learning_rate": 3.8416320885200555e-06, + "logits/chosen": 0.3819788098335266, + "logits/rejected": 4.326751708984375, + "logps/chosen": -546.4390869140625, + "logps/rejected": -1124.6964111328125, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.056602478027344, + "rewards/margins": 34.257320404052734, + "rewards/rejected": -43.31392288208008, + "step": 1488 + }, + { + "epoch": 0.9262830482115085, + "grad_norm": 1.6125093679875135e-05, + "learning_rate": 3.8404794836330116e-06, + "logits/chosen": -0.38977789878845215, + "logits/rejected": 3.8898396492004395, + "logps/chosen": -374.86932373046875, + "logps/rejected": -950.9157104492188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.441466331481934, + "rewards/margins": 26.366893768310547, + "rewards/rejected": -32.8083610534668, + "step": 1489 + }, + { + "epoch": 0.926905132192846, + "grad_norm": 0.35164663195610046, + "learning_rate": 3.839326878745967e-06, + "logits/chosen": 0.4527741074562073, + "logits/rejected": 4.156494140625, + "logps/chosen": -468.0583801269531, + "logps/rejected": -877.856201171875, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.627846717834473, + "rewards/margins": 23.385486602783203, + "rewards/rejected": -31.013330459594727, + "step": 1490 + }, + { + "epoch": 0.9275272161741835, + "grad_norm": 0.315719336271286, + "learning_rate": 3.838174273858922e-06, + "logits/chosen": 0.0138932466506958, + "logits/rejected": 2.9424490928649902, + "logps/chosen": -483.2838134765625, + "logps/rejected": -983.056396484375, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.161341667175293, + "rewards/margins": 27.726139068603516, + "rewards/rejected": -36.887481689453125, + "step": 1491 + }, + { + "epoch": 0.928149300155521, + "grad_norm": 4.0229817386716604e-05, + "learning_rate": 3.837021668971877e-06, + "logits/chosen": -1.5416885614395142, + "logits/rejected": 2.365922689437866, + "logps/chosen": -419.181884765625, + "logps/rejected": -830.7940063476562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.081663131713867, + "rewards/margins": 18.408069610595703, + "rewards/rejected": -24.48973274230957, + "step": 1492 + }, + { + "epoch": 0.9287713841368584, + "grad_norm": 1.0233517969027162e-05, + "learning_rate": 3.8358690640848325e-06, + "logits/chosen": -1.0449734926223755, + "logits/rejected": 2.454464912414551, + "logps/chosen": -428.5576477050781, + "logps/rejected": -974.4542236328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.8700408935546875, + "rewards/margins": 29.465404510498047, + "rewards/rejected": -34.335445404052734, + "step": 1493 + }, + { + "epoch": 0.929393468118196, + "grad_norm": 0.00012100357707822695, + "learning_rate": 3.834716459197788e-06, + "logits/chosen": -2.1408612728118896, + "logits/rejected": 2.4791533946990967, + "logps/chosen": -360.2967834472656, + "logps/rejected": -843.7381591796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.2425336837768555, + "rewards/margins": 23.199928283691406, + "rewards/rejected": -27.442462921142578, + "step": 1494 + }, + { + "epoch": 0.9300155520995335, + "grad_norm": 0.26367151737213135, + "learning_rate": 3.833563854310743e-06, + "logits/chosen": 1.8193261623382568, + "logits/rejected": 3.4168057441711426, + "logps/chosen": -586.12158203125, + "logps/rejected": -1028.549072265625, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.226459503173828, + "rewards/margins": 24.45671844482422, + "rewards/rejected": -39.68317794799805, + "step": 1495 + }, + { + "epoch": 0.9306376360808709, + "grad_norm": 3.7611985206604004, + "learning_rate": 3.832411249423698e-06, + "logits/chosen": -1.5775599479675293, + "logits/rejected": 1.8469675779342651, + "logps/chosen": -503.709716796875, + "logps/rejected": -838.383544921875, + "loss": 0.0232, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.134449481964111, + "rewards/margins": 11.143117904663086, + "rewards/rejected": -18.27756690979004, + "step": 1496 + }, + { + "epoch": 0.9312597200622084, + "grad_norm": 0.019973743706941605, + "learning_rate": 3.831258644536653e-06, + "logits/chosen": 0.1289931833744049, + "logits/rejected": 4.267576217651367, + "logps/chosen": -532.5215454101562, + "logps/rejected": -971.3292846679688, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.559263229370117, + "rewards/margins": 21.614898681640625, + "rewards/rejected": -31.174163818359375, + "step": 1497 + }, + { + "epoch": 0.9318818040435459, + "grad_norm": 28.844974517822266, + "learning_rate": 3.830106039649609e-06, + "logits/chosen": -1.243943691253662, + "logits/rejected": 4.444502830505371, + "logps/chosen": -427.8436279296875, + "logps/rejected": -1023.44580078125, + "loss": 0.3356, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.57684326171875, + "rewards/margins": 24.601680755615234, + "rewards/rejected": -32.178524017333984, + "step": 1498 + }, + { + "epoch": 0.9325038880248834, + "grad_norm": 0.011191684752702713, + "learning_rate": 3.828953434762564e-06, + "logits/chosen": 1.799675464630127, + "logits/rejected": 2.868252754211426, + "logps/chosen": -638.8121948242188, + "logps/rejected": -927.4824829101562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.192627906799316, + "rewards/margins": 24.738967895507812, + "rewards/rejected": -32.93159484863281, + "step": 1499 + }, + { + "epoch": 0.9331259720062208, + "grad_norm": 0.0029204629827290773, + "learning_rate": 3.827800829875519e-06, + "logits/chosen": -1.8656450510025024, + "logits/rejected": 4.460055351257324, + "logps/chosen": -455.1031799316406, + "logps/rejected": -1073.0537109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.718934535980225, + "rewards/margins": 23.455421447753906, + "rewards/rejected": -30.174358367919922, + "step": 1500 + }, + { + "epoch": 0.9337480559875583, + "grad_norm": 25.7710018157959, + "learning_rate": 3.826648224988474e-06, + "logits/chosen": 1.2761621475219727, + "logits/rejected": 3.3277039527893066, + "logps/chosen": -503.96881103515625, + "logps/rejected": -754.3949584960938, + "loss": 0.2218, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.906332015991211, + "rewards/margins": 16.754884719848633, + "rewards/rejected": -27.66121482849121, + "step": 1501 + }, + { + "epoch": 0.9343701399688958, + "grad_norm": 0.059097982943058014, + "learning_rate": 3.8254956201014295e-06, + "logits/chosen": 1.6621503829956055, + "logits/rejected": 3.892489433288574, + "logps/chosen": -591.111328125, + "logps/rejected": -929.8556518554688, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.30999231338501, + "rewards/margins": 23.505043029785156, + "rewards/rejected": -29.815034866333008, + "step": 1502 + }, + { + "epoch": 0.9349922239502333, + "grad_norm": 4.284040187485516e-06, + "learning_rate": 3.824343015214385e-06, + "logits/chosen": 0.3832207918167114, + "logits/rejected": 2.669308662414551, + "logps/chosen": -499.63031005859375, + "logps/rejected": -856.37939453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.689759254455566, + "rewards/margins": 24.031463623046875, + "rewards/rejected": -33.721221923828125, + "step": 1503 + }, + { + "epoch": 0.9356143079315707, + "grad_norm": 9.027886699186638e-05, + "learning_rate": 3.82319041032734e-06, + "logits/chosen": -2.296435594558716, + "logits/rejected": 3.5077037811279297, + "logps/chosen": -212.14871215820312, + "logps/rejected": -839.968505859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.725435733795166, + "rewards/margins": 32.51136016845703, + "rewards/rejected": -34.236793518066406, + "step": 1504 + }, + { + "epoch": 0.9362363919129082, + "grad_norm": 0.11467395722866058, + "learning_rate": 3.822037805440295e-06, + "logits/chosen": 1.1032499074935913, + "logits/rejected": 3.6253840923309326, + "logps/chosen": -658.2777099609375, + "logps/rejected": -1048.64013671875, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.855783462524414, + "rewards/margins": 22.73939323425293, + "rewards/rejected": -33.595176696777344, + "step": 1505 + }, + { + "epoch": 0.9368584758942458, + "grad_norm": 17.38050651550293, + "learning_rate": 3.82088520055325e-06, + "logits/chosen": 0.569591224193573, + "logits/rejected": 2.40620493888855, + "logps/chosen": -527.115234375, + "logps/rejected": -804.364013671875, + "loss": 0.1106, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.470925331115723, + "rewards/margins": 20.458789825439453, + "rewards/rejected": -28.929716110229492, + "step": 1506 + }, + { + "epoch": 0.9374805598755832, + "grad_norm": 2.645406084411661e-06, + "learning_rate": 3.819732595666206e-06, + "logits/chosen": 1.03667151927948, + "logits/rejected": 2.616377592086792, + "logps/chosen": -580.8767700195312, + "logps/rejected": -976.0105590820312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.645763397216797, + "rewards/margins": 31.013835906982422, + "rewards/rejected": -38.65959930419922, + "step": 1507 + }, + { + "epoch": 0.9381026438569207, + "grad_norm": 0.3618963658809662, + "learning_rate": 3.818579990779161e-06, + "logits/chosen": 1.1675472259521484, + "logits/rejected": 2.6586809158325195, + "logps/chosen": -697.2937622070312, + "logps/rejected": -1109.1123046875, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.925567626953125, + "rewards/margins": 25.126672744750977, + "rewards/rejected": -32.05223846435547, + "step": 1508 + }, + { + "epoch": 0.9387247278382581, + "grad_norm": 0.0021237938199192286, + "learning_rate": 3.817427385892116e-06, + "logits/chosen": -2.1299657821655273, + "logits/rejected": 1.384620189666748, + "logps/chosen": -426.447509765625, + "logps/rejected": -796.8826904296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.058237552642822, + "rewards/margins": 20.120403289794922, + "rewards/rejected": -25.17864227294922, + "step": 1509 + }, + { + "epoch": 0.9393468118195957, + "grad_norm": 3.2634453773498535, + "learning_rate": 3.816274781005071e-06, + "logits/chosen": -0.11459943652153015, + "logits/rejected": 3.2260124683380127, + "logps/chosen": -421.47979736328125, + "logps/rejected": -808.63720703125, + "loss": 0.0123, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.018412113189697, + "rewards/margins": 19.303525924682617, + "rewards/rejected": -26.321937561035156, + "step": 1510 + }, + { + "epoch": 0.9399688958009331, + "grad_norm": 0.0048073939979076385, + "learning_rate": 3.8151221761180265e-06, + "logits/chosen": 1.1135034561157227, + "logits/rejected": 4.2871599197387695, + "logps/chosen": -391.2454833984375, + "logps/rejected": -800.1884155273438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.755030632019043, + "rewards/margins": 24.084712982177734, + "rewards/rejected": -29.83974266052246, + "step": 1511 + }, + { + "epoch": 0.9405909797822706, + "grad_norm": 3.793792963027954, + "learning_rate": 3.813969571230982e-06, + "logits/chosen": 2.290614604949951, + "logits/rejected": 3.953888177871704, + "logps/chosen": -527.8026123046875, + "logps/rejected": -860.7406005859375, + "loss": 0.0158, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.779989719390869, + "rewards/margins": 21.989734649658203, + "rewards/rejected": -28.769725799560547, + "step": 1512 + }, + { + "epoch": 0.9412130637636081, + "grad_norm": 2.0059680537087843e-05, + "learning_rate": 3.8128169663439374e-06, + "logits/chosen": -0.5008499622344971, + "logits/rejected": 3.417922019958496, + "logps/chosen": -479.7211608886719, + "logps/rejected": -1060.9326171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.907632827758789, + "rewards/margins": 34.93695068359375, + "rewards/rejected": -43.84458541870117, + "step": 1513 + }, + { + "epoch": 0.9418351477449456, + "grad_norm": 28.518451690673828, + "learning_rate": 3.8116643614568926e-06, + "logits/chosen": 2.51342511177063, + "logits/rejected": 3.519273281097412, + "logps/chosen": -710.60791015625, + "logps/rejected": -995.9493408203125, + "loss": 0.1775, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.318286895751953, + "rewards/margins": 25.26766586303711, + "rewards/rejected": -33.58595275878906, + "step": 1514 + }, + { + "epoch": 0.942457231726283, + "grad_norm": 30.555561065673828, + "learning_rate": 3.810511756569848e-06, + "logits/chosen": -1.0337927341461182, + "logits/rejected": 4.793352127075195, + "logps/chosen": -364.50323486328125, + "logps/rejected": -910.0731811523438, + "loss": 1.2915, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.789246559143066, + "rewards/margins": 18.14818000793457, + "rewards/rejected": -22.93742561340332, + "step": 1515 + }, + { + "epoch": 0.9430793157076205, + "grad_norm": 5.560061300258212e-09, + "learning_rate": 3.809359151682803e-06, + "logits/chosen": 1.3392835855484009, + "logits/rejected": 4.3378682136535645, + "logps/chosen": -580.5053100585938, + "logps/rejected": -1080.9197998046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.431815147399902, + "rewards/margins": 34.50389099121094, + "rewards/rejected": -42.935707092285156, + "step": 1516 + }, + { + "epoch": 0.943701399688958, + "grad_norm": 0.00020487657457124442, + "learning_rate": 3.8082065467957587e-06, + "logits/chosen": -1.4015358686447144, + "logits/rejected": 1.7667157649993896, + "logps/chosen": -367.30902099609375, + "logps/rejected": -756.1085815429688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.350221633911133, + "rewards/margins": 23.439151763916016, + "rewards/rejected": -28.789371490478516, + "step": 1517 + }, + { + "epoch": 0.9443234836702955, + "grad_norm": 29.253721237182617, + "learning_rate": 3.807053941908714e-06, + "logits/chosen": -1.1160175800323486, + "logits/rejected": 3.391474723815918, + "logps/chosen": -453.40234375, + "logps/rejected": -814.8978271484375, + "loss": 0.301, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.987330913543701, + "rewards/margins": 16.134384155273438, + "rewards/rejected": -21.121715545654297, + "step": 1518 + }, + { + "epoch": 0.944945567651633, + "grad_norm": 1.373146414756775, + "learning_rate": 3.805901337021669e-06, + "logits/chosen": 1.4696600437164307, + "logits/rejected": 1.907806396484375, + "logps/chosen": -587.392333984375, + "logps/rejected": -867.681884765625, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.671820640563965, + "rewards/margins": 20.09083366394043, + "rewards/rejected": -27.762653350830078, + "step": 1519 + }, + { + "epoch": 0.9455676516329704, + "grad_norm": 0.1505119949579239, + "learning_rate": 3.8047487321346244e-06, + "logits/chosen": -0.6864757537841797, + "logits/rejected": 2.311563014984131, + "logps/chosen": -537.08984375, + "logps/rejected": -973.3675537109375, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.800401210784912, + "rewards/margins": 29.207828521728516, + "rewards/rejected": -37.00823211669922, + "step": 1520 + }, + { + "epoch": 0.946189735614308, + "grad_norm": 7.776718848617747e-05, + "learning_rate": 3.8035961272475796e-06, + "logits/chosen": 1.3452345132827759, + "logits/rejected": 3.4356236457824707, + "logps/chosen": -501.22119140625, + "logps/rejected": -830.6551513671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.098999500274658, + "rewards/margins": 22.240488052368164, + "rewards/rejected": -28.339487075805664, + "step": 1521 + }, + { + "epoch": 0.9468118195956454, + "grad_norm": 0.03515046462416649, + "learning_rate": 3.802443522360535e-06, + "logits/chosen": -2.4966351985931396, + "logits/rejected": 4.5510101318359375, + "logps/chosen": -336.7701416015625, + "logps/rejected": -1093.436767578125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.726394176483154, + "rewards/margins": 35.122657775878906, + "rewards/rejected": -41.84905242919922, + "step": 1522 + }, + { + "epoch": 0.9474339035769829, + "grad_norm": 4.1857827454805374e-05, + "learning_rate": 3.80129091747349e-06, + "logits/chosen": 0.1419128179550171, + "logits/rejected": 4.158993244171143, + "logps/chosen": -575.00390625, + "logps/rejected": -1083.60546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.428834915161133, + "rewards/margins": 30.497568130493164, + "rewards/rejected": -41.92639923095703, + "step": 1523 + }, + { + "epoch": 0.9480559875583203, + "grad_norm": 0.005177459679543972, + "learning_rate": 3.8001383125864457e-06, + "logits/chosen": 0.1375989019870758, + "logits/rejected": 2.984259605407715, + "logps/chosen": -510.2491455078125, + "logps/rejected": -893.9998779296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.213672637939453, + "rewards/margins": 26.06291961669922, + "rewards/rejected": -32.27659225463867, + "step": 1524 + }, + { + "epoch": 0.9486780715396579, + "grad_norm": 37.03826904296875, + "learning_rate": 3.798985707699401e-06, + "logits/chosen": 0.5167028307914734, + "logits/rejected": 3.305917739868164, + "logps/chosen": -637.629150390625, + "logps/rejected": -1092.0506591796875, + "loss": 1.057, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.100252151489258, + "rewards/margins": 23.94793128967285, + "rewards/rejected": -36.04818344116211, + "step": 1525 + }, + { + "epoch": 0.9493001555209953, + "grad_norm": 0.01407893281430006, + "learning_rate": 3.797833102812356e-06, + "logits/chosen": -1.060408115386963, + "logits/rejected": 4.336411952972412, + "logps/chosen": -326.1235046386719, + "logps/rejected": -876.7889404296875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.4298095703125, + "rewards/margins": 23.324308395385742, + "rewards/rejected": -28.754119873046875, + "step": 1526 + }, + { + "epoch": 0.9499222395023328, + "grad_norm": 43.05733871459961, + "learning_rate": 3.7966804979253114e-06, + "logits/chosen": -0.4274927079677582, + "logits/rejected": 4.001701354980469, + "logps/chosen": -360.96649169921875, + "logps/rejected": -897.93408203125, + "loss": 1.0622, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.60791540145874, + "rewards/margins": 26.19590950012207, + "rewards/rejected": -30.80382537841797, + "step": 1527 + }, + { + "epoch": 0.9505443234836704, + "grad_norm": 0.0020403736270964146, + "learning_rate": 3.7955278930382666e-06, + "logits/chosen": 0.3815556764602661, + "logits/rejected": 4.2986369132995605, + "logps/chosen": -537.9657592773438, + "logps/rejected": -1013.0636596679688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.577686309814453, + "rewards/margins": 27.301551818847656, + "rewards/rejected": -35.87923812866211, + "step": 1528 + }, + { + "epoch": 0.9511664074650078, + "grad_norm": 0.0007894797599874437, + "learning_rate": 3.794375288151222e-06, + "logits/chosen": 0.7989105582237244, + "logits/rejected": 4.2252912521362305, + "logps/chosen": -527.30322265625, + "logps/rejected": -973.3763427734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.929349899291992, + "rewards/margins": 28.399248123168945, + "rewards/rejected": -36.32859802246094, + "step": 1529 + }, + { + "epoch": 0.9517884914463453, + "grad_norm": 0.00019009016978088766, + "learning_rate": 3.793222683264177e-06, + "logits/chosen": 1.2304333448410034, + "logits/rejected": 4.029991626739502, + "logps/chosen": -571.3868408203125, + "logps/rejected": -926.7520141601562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.0893449783325195, + "rewards/margins": 19.844104766845703, + "rewards/rejected": -26.933452606201172, + "step": 1530 + }, + { + "epoch": 0.9524105754276827, + "grad_norm": 0.0637696161866188, + "learning_rate": 3.7920700783771323e-06, + "logits/chosen": 2.5368003845214844, + "logits/rejected": 4.378127574920654, + "logps/chosen": -632.891845703125, + "logps/rejected": -1011.991455078125, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.220358848571777, + "rewards/margins": 26.87921714782715, + "rewards/rejected": -35.09957504272461, + "step": 1531 + }, + { + "epoch": 0.9530326594090203, + "grad_norm": 0.38544222712516785, + "learning_rate": 3.790917473490088e-06, + "logits/chosen": -0.9029761552810669, + "logits/rejected": 3.0724494457244873, + "logps/chosen": -549.3706665039062, + "logps/rejected": -1025.44677734375, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.191251754760742, + "rewards/margins": 22.814775466918945, + "rewards/rejected": -31.006027221679688, + "step": 1532 + }, + { + "epoch": 0.9536547433903577, + "grad_norm": 34.8115234375, + "learning_rate": 3.789764868603043e-06, + "logits/chosen": 1.5727530717849731, + "logits/rejected": 5.6106414794921875, + "logps/chosen": -443.1806640625, + "logps/rejected": -793.2718505859375, + "loss": 0.4784, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.441493511199951, + "rewards/margins": 17.451435089111328, + "rewards/rejected": -23.892925262451172, + "step": 1533 + }, + { + "epoch": 0.9542768273716952, + "grad_norm": 0.00016758311539888382, + "learning_rate": 3.7886122637159984e-06, + "logits/chosen": 1.5388946533203125, + "logits/rejected": 4.52772331237793, + "logps/chosen": -596.45068359375, + "logps/rejected": -1042.272216796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.116270065307617, + "rewards/margins": 28.782930374145508, + "rewards/rejected": -36.899200439453125, + "step": 1534 + }, + { + "epoch": 0.9548989113530326, + "grad_norm": 0.010228978469967842, + "learning_rate": 3.7874596588289536e-06, + "logits/chosen": 2.3814992904663086, + "logits/rejected": 3.1295394897460938, + "logps/chosen": -621.3070068359375, + "logps/rejected": -937.0552368164062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.042771816253662, + "rewards/margins": 26.261520385742188, + "rewards/rejected": -32.304290771484375, + "step": 1535 + }, + { + "epoch": 0.9555209953343702, + "grad_norm": 0.0004916785983368754, + "learning_rate": 3.786307053941909e-06, + "logits/chosen": -0.32576048374176025, + "logits/rejected": 2.7030093669891357, + "logps/chosen": -467.4586181640625, + "logps/rejected": -823.8536376953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.294271945953369, + "rewards/margins": 24.553098678588867, + "rewards/rejected": -31.847370147705078, + "step": 1536 + }, + { + "epoch": 0.9561430793157076, + "grad_norm": 2.831015110015869, + "learning_rate": 3.785154449054864e-06, + "logits/chosen": 0.12055912613868713, + "logits/rejected": 2.6946396827697754, + "logps/chosen": -471.73199462890625, + "logps/rejected": -810.045166015625, + "loss": 0.0137, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.989734649658203, + "rewards/margins": 22.072750091552734, + "rewards/rejected": -30.062484741210938, + "step": 1537 + }, + { + "epoch": 0.9567651632970451, + "grad_norm": 0.1458519995212555, + "learning_rate": 3.7840018441678193e-06, + "logits/chosen": -1.1733816862106323, + "logits/rejected": 3.1834568977355957, + "logps/chosen": -345.3623352050781, + "logps/rejected": -824.790771484375, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.023584842681885, + "rewards/margins": 25.241973876953125, + "rewards/rejected": -30.26555824279785, + "step": 1538 + }, + { + "epoch": 0.9573872472783825, + "grad_norm": 0.05512907728552818, + "learning_rate": 3.782849239280775e-06, + "logits/chosen": -1.629906177520752, + "logits/rejected": 0.8318213820457458, + "logps/chosen": -447.51483154296875, + "logps/rejected": -814.4088134765625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.437944412231445, + "rewards/margins": 23.343429565429688, + "rewards/rejected": -29.7813720703125, + "step": 1539 + }, + { + "epoch": 0.9580093312597201, + "grad_norm": 3.436262545619684e-07, + "learning_rate": 3.78169663439373e-06, + "logits/chosen": -0.798626184463501, + "logits/rejected": 3.8584299087524414, + "logps/chosen": -544.6287841796875, + "logps/rejected": -1154.15478515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.424917697906494, + "rewards/margins": 31.494213104248047, + "rewards/rejected": -37.919132232666016, + "step": 1540 + }, + { + "epoch": 0.9586314152410575, + "grad_norm": 5.306674779603782e-07, + "learning_rate": 3.7805440295066854e-06, + "logits/chosen": -1.1544137001037598, + "logits/rejected": 4.800648212432861, + "logps/chosen": -271.69183349609375, + "logps/rejected": -902.1212768554688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7454826831817627, + "rewards/margins": 29.29214096069336, + "rewards/rejected": -32.037620544433594, + "step": 1541 + }, + { + "epoch": 0.959253499222395, + "grad_norm": 7.686992168426514, + "learning_rate": 3.7793914246196406e-06, + "logits/chosen": 0.2284245491027832, + "logits/rejected": 2.3884739875793457, + "logps/chosen": -577.6343994140625, + "logps/rejected": -819.865234375, + "loss": 0.0294, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.3329057693481445, + "rewards/margins": 20.446739196777344, + "rewards/rejected": -27.779645919799805, + "step": 1542 + }, + { + "epoch": 0.9598755832037325, + "grad_norm": 0.03158849850296974, + "learning_rate": 3.778238819732596e-06, + "logits/chosen": -3.3085033893585205, + "logits/rejected": 2.410858154296875, + "logps/chosen": -369.4300537109375, + "logps/rejected": -1009.00634765625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.852804660797119, + "rewards/margins": 29.572834014892578, + "rewards/rejected": -33.42564010620117, + "step": 1543 + }, + { + "epoch": 0.96049766718507, + "grad_norm": 0.8256038427352905, + "learning_rate": 3.777086214845551e-06, + "logits/chosen": 0.9084221720695496, + "logits/rejected": 3.3126614093780518, + "logps/chosen": -562.0577392578125, + "logps/rejected": -833.6492309570312, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.350132942199707, + "rewards/margins": 18.237110137939453, + "rewards/rejected": -24.587242126464844, + "step": 1544 + }, + { + "epoch": 0.9611197511664075, + "grad_norm": 0.02136695571243763, + "learning_rate": 3.7759336099585063e-06, + "logits/chosen": -0.20459318161010742, + "logits/rejected": 3.8194947242736816, + "logps/chosen": -334.81689453125, + "logps/rejected": -757.7464599609375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.10609769821167, + "rewards/margins": 27.78612518310547, + "rewards/rejected": -29.892223358154297, + "step": 1545 + }, + { + "epoch": 0.9617418351477449, + "grad_norm": 0.021653296425938606, + "learning_rate": 3.774781005071462e-06, + "logits/chosen": 0.6639248132705688, + "logits/rejected": 4.86984920501709, + "logps/chosen": -418.23291015625, + "logps/rejected": -902.3069458007812, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.3227763175964355, + "rewards/margins": 27.793251037597656, + "rewards/rejected": -34.11602783203125, + "step": 1546 + }, + { + "epoch": 0.9623639191290825, + "grad_norm": 0.8193944692611694, + "learning_rate": 3.773628400184417e-06, + "logits/chosen": -1.013139009475708, + "logits/rejected": 1.9905011653900146, + "logps/chosen": -330.7326354980469, + "logps/rejected": -710.2229614257812, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1681528091430664, + "rewards/margins": 18.948850631713867, + "rewards/rejected": -22.11700439453125, + "step": 1547 + }, + { + "epoch": 0.9629860031104199, + "grad_norm": 0.0021697860211133957, + "learning_rate": 3.7724757952973724e-06, + "logits/chosen": -0.6302704215049744, + "logits/rejected": 3.592517614364624, + "logps/chosen": -392.3840026855469, + "logps/rejected": -913.2039794921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.25330924987793, + "rewards/margins": 28.252193450927734, + "rewards/rejected": -32.50550079345703, + "step": 1548 + }, + { + "epoch": 0.9636080870917574, + "grad_norm": 0.0017832452431321144, + "learning_rate": 3.7713231904103276e-06, + "logits/chosen": 2.577538013458252, + "logits/rejected": 4.379067420959473, + "logps/chosen": -711.4339599609375, + "logps/rejected": -950.3943481445312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.664702415466309, + "rewards/margins": 21.427143096923828, + "rewards/rejected": -28.091846466064453, + "step": 1549 + }, + { + "epoch": 0.9642301710730948, + "grad_norm": 0.010014387778937817, + "learning_rate": 3.770170585523283e-06, + "logits/chosen": 1.4134823083877563, + "logits/rejected": 3.6276726722717285, + "logps/chosen": -564.8404541015625, + "logps/rejected": -985.745849609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.489076614379883, + "rewards/margins": 26.61454963684082, + "rewards/rejected": -34.1036262512207, + "step": 1550 + }, + { + "epoch": 0.9648522550544324, + "grad_norm": 0.14533917605876923, + "learning_rate": 3.769017980636238e-06, + "logits/chosen": 1.69158935546875, + "logits/rejected": 4.186570644378662, + "logps/chosen": -550.7415771484375, + "logps/rejected": -960.3980712890625, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.251543998718262, + "rewards/margins": 28.392061233520508, + "rewards/rejected": -35.64360046386719, + "step": 1551 + }, + { + "epoch": 0.9654743390357698, + "grad_norm": 3.336298704147339, + "learning_rate": 3.7678653757491933e-06, + "logits/chosen": -2.7250447273254395, + "logits/rejected": 2.549415111541748, + "logps/chosen": -451.37615966796875, + "logps/rejected": -1036.66748046875, + "loss": 0.0125, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.669151306152344, + "rewards/margins": 29.32394027709961, + "rewards/rejected": -36.99309158325195, + "step": 1552 + }, + { + "epoch": 0.9660964230171073, + "grad_norm": 0.28479263186454773, + "learning_rate": 3.766712770862149e-06, + "logits/chosen": 1.810473084449768, + "logits/rejected": 2.76468563079834, + "logps/chosen": -551.802734375, + "logps/rejected": -898.9547119140625, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.322225570678711, + "rewards/margins": 22.87482452392578, + "rewards/rejected": -32.197052001953125, + "step": 1553 + }, + { + "epoch": 0.9667185069984447, + "grad_norm": 0.0004268392804078758, + "learning_rate": 3.765560165975104e-06, + "logits/chosen": 2.188840866088867, + "logits/rejected": 4.608028411865234, + "logps/chosen": -606.5416870117188, + "logps/rejected": -998.5819091796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.792022705078125, + "rewards/margins": 22.136215209960938, + "rewards/rejected": -31.928239822387695, + "step": 1554 + }, + { + "epoch": 0.9673405909797823, + "grad_norm": 0.03662179782986641, + "learning_rate": 3.7644075610880594e-06, + "logits/chosen": 1.1045016050338745, + "logits/rejected": 4.016812324523926, + "logps/chosen": -576.162353515625, + "logps/rejected": -997.515625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.6547722816467285, + "rewards/margins": 24.034767150878906, + "rewards/rejected": -31.68954086303711, + "step": 1555 + }, + { + "epoch": 0.9679626749611198, + "grad_norm": 0.42050519585609436, + "learning_rate": 3.7632549562010146e-06, + "logits/chosen": 1.1344008445739746, + "logits/rejected": 3.8797502517700195, + "logps/chosen": -490.56060791015625, + "logps/rejected": -869.088134765625, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.672945022583008, + "rewards/margins": 23.849864959716797, + "rewards/rejected": -33.52280807495117, + "step": 1556 + }, + { + "epoch": 0.9685847589424572, + "grad_norm": 8.319174230564386e-05, + "learning_rate": 3.76210235131397e-06, + "logits/chosen": -1.3127119541168213, + "logits/rejected": 2.8727219104766846, + "logps/chosen": -462.66583251953125, + "logps/rejected": -927.9498901367188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.315878391265869, + "rewards/margins": 25.330734252929688, + "rewards/rejected": -31.6466121673584, + "step": 1557 + }, + { + "epoch": 0.9692068429237947, + "grad_norm": 24.195049285888672, + "learning_rate": 3.760949746426925e-06, + "logits/chosen": 3.776907444000244, + "logits/rejected": 5.161801338195801, + "logps/chosen": -647.3763427734375, + "logps/rejected": -823.6619873046875, + "loss": 0.2038, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.529730796813965, + "rewards/margins": 16.111204147338867, + "rewards/rejected": -24.640933990478516, + "step": 1558 + }, + { + "epoch": 0.9698289269051322, + "grad_norm": 0.14263981580734253, + "learning_rate": 3.7597971415398803e-06, + "logits/chosen": 0.6714162826538086, + "logits/rejected": 3.141258716583252, + "logps/chosen": -566.3853149414062, + "logps/rejected": -1052.81103515625, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.304349899291992, + "rewards/margins": 30.776416778564453, + "rewards/rejected": -41.08076858520508, + "step": 1559 + }, + { + "epoch": 0.9704510108864697, + "grad_norm": 0.06349503993988037, + "learning_rate": 3.7586445366528355e-06, + "logits/chosen": 3.69173526763916, + "logits/rejected": 4.2286505699157715, + "logps/chosen": -663.752685546875, + "logps/rejected": -996.7243041992188, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.244875907897949, + "rewards/margins": 24.53765296936035, + "rewards/rejected": -31.782527923583984, + "step": 1560 + }, + { + "epoch": 0.9710730948678071, + "grad_norm": 7.525384902954102, + "learning_rate": 3.757491931765791e-06, + "logits/chosen": -0.16161316633224487, + "logits/rejected": 0.01133960485458374, + "logps/chosen": -527.05859375, + "logps/rejected": -715.83984375, + "loss": 0.0335, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.3956475257873535, + "rewards/margins": 16.443391799926758, + "rewards/rejected": -21.839038848876953, + "step": 1561 + }, + { + "epoch": 0.9716951788491446, + "grad_norm": 0.00021120811288710684, + "learning_rate": 3.7563393268787464e-06, + "logits/chosen": -0.11896657943725586, + "logits/rejected": 3.7753396034240723, + "logps/chosen": -415.9942626953125, + "logps/rejected": -981.8436279296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.215848445892334, + "rewards/margins": 32.04558563232422, + "rewards/rejected": -37.261436462402344, + "step": 1562 + }, + { + "epoch": 0.9723172628304821, + "grad_norm": 7.325221538543701, + "learning_rate": 3.7551867219917016e-06, + "logits/chosen": 0.15939027070999146, + "logits/rejected": 1.709261417388916, + "logps/chosen": -630.5030517578125, + "logps/rejected": -840.1243286132812, + "loss": 0.0356, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.626913070678711, + "rewards/margins": 15.032745361328125, + "rewards/rejected": -28.659658432006836, + "step": 1563 + }, + { + "epoch": 0.9729393468118196, + "grad_norm": 0.1605205535888672, + "learning_rate": 3.754034117104657e-06, + "logits/chosen": 1.4628174304962158, + "logits/rejected": 4.365293502807617, + "logps/chosen": -595.6699829101562, + "logps/rejected": -947.2667236328125, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.8566484451293945, + "rewards/margins": 21.407180786132812, + "rewards/rejected": -29.26382827758789, + "step": 1564 + }, + { + "epoch": 0.973561430793157, + "grad_norm": 2.649874448776245, + "learning_rate": 3.752881512217612e-06, + "logits/chosen": -4.495944976806641, + "logits/rejected": 2.500077247619629, + "logps/chosen": -270.2780456542969, + "logps/rejected": -841.184326171875, + "loss": 0.016, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3575921058654785, + "rewards/margins": 22.65192413330078, + "rewards/rejected": -26.009517669677734, + "step": 1565 + }, + { + "epoch": 0.9741835147744946, + "grad_norm": 0.049845147877931595, + "learning_rate": 3.7517289073305673e-06, + "logits/chosen": 1.8272593021392822, + "logits/rejected": 3.8699135780334473, + "logps/chosen": -470.1072692871094, + "logps/rejected": -900.577392578125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.356943130493164, + "rewards/margins": 22.48064422607422, + "rewards/rejected": -28.83758544921875, + "step": 1566 + }, + { + "epoch": 0.9748055987558321, + "grad_norm": 0.03664658963680267, + "learning_rate": 3.7505763024435225e-06, + "logits/chosen": 2.496447801589966, + "logits/rejected": 3.1999642848968506, + "logps/chosen": -694.1896362304688, + "logps/rejected": -1058.082275390625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.669995307922363, + "rewards/margins": 21.938507080078125, + "rewards/rejected": -32.60850143432617, + "step": 1567 + }, + { + "epoch": 0.9754276827371695, + "grad_norm": 1.3099258467264008e-05, + "learning_rate": 3.749423697556478e-06, + "logits/chosen": 1.693497896194458, + "logits/rejected": 3.8283023834228516, + "logps/chosen": -559.7762451171875, + "logps/rejected": -944.8941040039062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.410549163818359, + "rewards/margins": 24.579004287719727, + "rewards/rejected": -31.989551544189453, + "step": 1568 + }, + { + "epoch": 0.976049766718507, + "grad_norm": 0.0269145630300045, + "learning_rate": 3.7482710926694334e-06, + "logits/chosen": 1.4291343688964844, + "logits/rejected": 2.6044325828552246, + "logps/chosen": -597.8721923828125, + "logps/rejected": -844.9185180664062, + "loss": 0.0866, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.501439571380615, + "rewards/margins": 19.320228576660156, + "rewards/rejected": -26.821666717529297, + "step": 1569 + }, + { + "epoch": 0.9766718506998445, + "grad_norm": 11.221540451049805, + "learning_rate": 3.7471184877823886e-06, + "logits/chosen": -2.356743335723877, + "logits/rejected": 0.9585628509521484, + "logps/chosen": -409.2689514160156, + "logps/rejected": -786.9153442382812, + "loss": 0.0673, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4099254608154297, + "rewards/margins": 18.11174201965332, + "rewards/rejected": -21.52166748046875, + "step": 1570 + }, + { + "epoch": 0.977293934681182, + "grad_norm": 0.6764800548553467, + "learning_rate": 3.745965882895344e-06, + "logits/chosen": -0.759571373462677, + "logits/rejected": 2.580238103866577, + "logps/chosen": -517.606689453125, + "logps/rejected": -927.9739379882812, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.846333980560303, + "rewards/margins": 23.722732543945312, + "rewards/rejected": -30.56906509399414, + "step": 1571 + }, + { + "epoch": 0.9779160186625194, + "grad_norm": 0.004113756585866213, + "learning_rate": 3.744813278008299e-06, + "logits/chosen": 0.08926576375961304, + "logits/rejected": 3.5595028400421143, + "logps/chosen": -414.646240234375, + "logps/rejected": -768.23193359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.433351039886475, + "rewards/margins": 15.794235229492188, + "rewards/rejected": -23.22758674621582, + "step": 1572 + }, + { + "epoch": 0.9785381026438569, + "grad_norm": 28.806289672851562, + "learning_rate": 3.7436606731212543e-06, + "logits/chosen": -3.4051802158355713, + "logits/rejected": 4.699017524719238, + "logps/chosen": -343.5263366699219, + "logps/rejected": -1104.3868408203125, + "loss": 0.5611, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.073864936828613, + "rewards/margins": 31.18024253845215, + "rewards/rejected": -39.25410461425781, + "step": 1573 + }, + { + "epoch": 0.9791601866251944, + "grad_norm": 13.794449806213379, + "learning_rate": 3.7425080682342095e-06, + "logits/chosen": 1.9040722846984863, + "logits/rejected": 2.7815659046173096, + "logps/chosen": -543.7911376953125, + "logps/rejected": -792.92333984375, + "loss": 0.0745, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.056597709655762, + "rewards/margins": 22.866260528564453, + "rewards/rejected": -30.92285919189453, + "step": 1574 + }, + { + "epoch": 0.9797822706065319, + "grad_norm": 1.1686525344848633, + "learning_rate": 3.741355463347165e-06, + "logits/chosen": -0.2909442186355591, + "logits/rejected": 2.8880224227905273, + "logps/chosen": -557.0026245117188, + "logps/rejected": -963.135498046875, + "loss": 0.0082, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.541722297668457, + "rewards/margins": 24.39960289001465, + "rewards/rejected": -32.941322326660156, + "step": 1575 + }, + { + "epoch": 0.9804043545878693, + "grad_norm": 19.24813461303711, + "learning_rate": 3.7402028584601204e-06, + "logits/chosen": 2.339660406112671, + "logits/rejected": 2.4095168113708496, + "logps/chosen": -645.5527954101562, + "logps/rejected": -958.1924438476562, + "loss": 0.1106, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.453535079956055, + "rewards/margins": 22.29770278930664, + "rewards/rejected": -33.75123977661133, + "step": 1576 + }, + { + "epoch": 0.9810264385692068, + "grad_norm": 0.04252305626869202, + "learning_rate": 3.7390502535730756e-06, + "logits/chosen": -0.9952183961868286, + "logits/rejected": 2.1234028339385986, + "logps/chosen": -267.9193420410156, + "logps/rejected": -676.8464965820312, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.890249252319336, + "rewards/margins": 26.59477996826172, + "rewards/rejected": -29.485031127929688, + "step": 1577 + }, + { + "epoch": 0.9816485225505444, + "grad_norm": 0.010037853382527828, + "learning_rate": 3.737897648686031e-06, + "logits/chosen": -0.8864223957061768, + "logits/rejected": 3.127655267715454, + "logps/chosen": -308.0932312011719, + "logps/rejected": -827.228271484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.313598871231079, + "rewards/margins": 30.997886657714844, + "rewards/rejected": -34.311485290527344, + "step": 1578 + }, + { + "epoch": 0.9822706065318818, + "grad_norm": 7.811158866388723e-05, + "learning_rate": 3.736745043798986e-06, + "logits/chosen": -2.4733104705810547, + "logits/rejected": 2.745809555053711, + "logps/chosen": -557.5784912109375, + "logps/rejected": -1105.34423828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.980599880218506, + "rewards/margins": 29.99527931213379, + "rewards/rejected": -35.97587966918945, + "step": 1579 + }, + { + "epoch": 0.9828926905132193, + "grad_norm": 27.442073822021484, + "learning_rate": 3.7355924389119413e-06, + "logits/chosen": -0.202871173620224, + "logits/rejected": 2.683882713317871, + "logps/chosen": -478.58892822265625, + "logps/rejected": -933.044677734375, + "loss": 0.217, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.578217506408691, + "rewards/margins": 20.964635848999023, + "rewards/rejected": -27.5428524017334, + "step": 1580 + }, + { + "epoch": 0.9835147744945568, + "grad_norm": 0.003448404837399721, + "learning_rate": 3.7344398340248965e-06, + "logits/chosen": -0.1981586217880249, + "logits/rejected": 3.0170540809631348, + "logps/chosen": -482.3427429199219, + "logps/rejected": -849.2074584960938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.227471828460693, + "rewards/margins": 25.559865951538086, + "rewards/rejected": -32.78733825683594, + "step": 1581 + }, + { + "epoch": 0.9841368584758943, + "grad_norm": 25.31926918029785, + "learning_rate": 3.7332872291378517e-06, + "logits/chosen": 1.5486270189285278, + "logits/rejected": 3.3535120487213135, + "logps/chosen": -537.72119140625, + "logps/rejected": -846.4741821289062, + "loss": 0.3553, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.907859802246094, + "rewards/margins": 20.8604736328125, + "rewards/rejected": -30.768333435058594, + "step": 1582 + }, + { + "epoch": 0.9847589424572317, + "grad_norm": 0.2907392680644989, + "learning_rate": 3.7321346242508073e-06, + "logits/chosen": 2.166731119155884, + "logits/rejected": 3.08122181892395, + "logps/chosen": -654.4718017578125, + "logps/rejected": -1042.1151123046875, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.967266082763672, + "rewards/margins": 25.029521942138672, + "rewards/rejected": -34.99678421020508, + "step": 1583 + }, + { + "epoch": 0.9853810264385692, + "grad_norm": 48.268089294433594, + "learning_rate": 3.7309820193637626e-06, + "logits/chosen": 0.6497130990028381, + "logits/rejected": 3.1703836917877197, + "logps/chosen": -576.513916015625, + "logps/rejected": -972.1256103515625, + "loss": 0.5888, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.059791564941406, + "rewards/margins": 23.662952423095703, + "rewards/rejected": -36.72274398803711, + "step": 1584 + }, + { + "epoch": 0.9860031104199067, + "grad_norm": 0.12340130656957626, + "learning_rate": 3.729829414476718e-06, + "logits/chosen": -1.3608407974243164, + "logits/rejected": 3.115481376647949, + "logps/chosen": -305.034423828125, + "logps/rejected": -866.042724609375, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.087643623352051, + "rewards/margins": 27.852447509765625, + "rewards/rejected": -32.94009017944336, + "step": 1585 + }, + { + "epoch": 0.9866251944012442, + "grad_norm": 0.01151037123054266, + "learning_rate": 3.728676809589673e-06, + "logits/chosen": -0.01191103458404541, + "logits/rejected": 4.217233180999756, + "logps/chosen": -447.7720031738281, + "logps/rejected": -1023.5369873046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.726734161376953, + "rewards/margins": 28.46622657775879, + "rewards/rejected": -34.19295883178711, + "step": 1586 + }, + { + "epoch": 0.9872472783825816, + "grad_norm": 8.350014013558393e-08, + "learning_rate": 3.7275242047026282e-06, + "logits/chosen": -1.0805532932281494, + "logits/rejected": 2.784482479095459, + "logps/chosen": -338.6806640625, + "logps/rejected": -867.11376953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.449554443359375, + "rewards/margins": 31.410503387451172, + "rewards/rejected": -37.86006164550781, + "step": 1587 + }, + { + "epoch": 0.9878693623639191, + "grad_norm": 0.25290653109550476, + "learning_rate": 3.7263715998155835e-06, + "logits/chosen": -1.5539438724517822, + "logits/rejected": 4.337480545043945, + "logps/chosen": -450.3958740234375, + "logps/rejected": -1068.654052734375, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.97259521484375, + "rewards/margins": 27.88930320739746, + "rewards/rejected": -35.86189651489258, + "step": 1588 + }, + { + "epoch": 0.9884914463452567, + "grad_norm": 1.1513815678654282e-07, + "learning_rate": 3.7252189949285387e-06, + "logits/chosen": -0.6780804395675659, + "logits/rejected": 3.5184125900268555, + "logps/chosen": -455.1690368652344, + "logps/rejected": -1012.3525390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.412100791931152, + "rewards/margins": 34.602020263671875, + "rewards/rejected": -44.01411819458008, + "step": 1589 + }, + { + "epoch": 0.9891135303265941, + "grad_norm": 0.03904994949698448, + "learning_rate": 3.7240663900414943e-06, + "logits/chosen": 0.6309325695037842, + "logits/rejected": 2.775806427001953, + "logps/chosen": -541.5093383789062, + "logps/rejected": -961.15380859375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.032151222229004, + "rewards/margins": 29.603219985961914, + "rewards/rejected": -36.635372161865234, + "step": 1590 + }, + { + "epoch": 0.9897356143079316, + "grad_norm": 2.7226502652411e-07, + "learning_rate": 3.7229137851544496e-06, + "logits/chosen": -1.4838294982910156, + "logits/rejected": 3.9604568481445312, + "logps/chosen": -425.91741943359375, + "logps/rejected": -1060.175048828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.890406608581543, + "rewards/margins": 33.74592208862305, + "rewards/rejected": -41.636329650878906, + "step": 1591 + }, + { + "epoch": 0.990357698289269, + "grad_norm": 0.00038821130874566734, + "learning_rate": 3.721761180267405e-06, + "logits/chosen": 3.0226492881774902, + "logits/rejected": 3.96284818649292, + "logps/chosen": -622.4625244140625, + "logps/rejected": -963.4630126953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.260305404663086, + "rewards/margins": 29.045948028564453, + "rewards/rejected": -36.306251525878906, + "step": 1592 + }, + { + "epoch": 0.9909797822706066, + "grad_norm": 21.46782684326172, + "learning_rate": 3.72060857538036e-06, + "logits/chosen": 0.38420045375823975, + "logits/rejected": 2.7506988048553467, + "logps/chosen": -505.051025390625, + "logps/rejected": -888.744384765625, + "loss": 0.1871, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.530044555664062, + "rewards/margins": 24.28890037536621, + "rewards/rejected": -33.818946838378906, + "step": 1593 + }, + { + "epoch": 0.991601866251944, + "grad_norm": 24.086999893188477, + "learning_rate": 3.7194559704933152e-06, + "logits/chosen": 2.784050464630127, + "logits/rejected": 4.437070369720459, + "logps/chosen": -740.00634765625, + "logps/rejected": -1098.108642578125, + "loss": 0.1265, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.069332122802734, + "rewards/margins": 26.843494415283203, + "rewards/rejected": -38.91282653808594, + "step": 1594 + }, + { + "epoch": 0.9922239502332815, + "grad_norm": 0.003621672512963414, + "learning_rate": 3.7183033656062705e-06, + "logits/chosen": -1.716835379600525, + "logits/rejected": 4.3016862869262695, + "logps/chosen": -372.40069580078125, + "logps/rejected": -949.9512329101562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.385269165039062, + "rewards/margins": 27.320537567138672, + "rewards/rejected": -36.705806732177734, + "step": 1595 + }, + { + "epoch": 0.9928460342146189, + "grad_norm": 7.639461994171143, + "learning_rate": 3.7171507607192257e-06, + "logits/chosen": 2.2062644958496094, + "logits/rejected": 3.4240951538085938, + "logps/chosen": -706.8009033203125, + "logps/rejected": -994.1138305664062, + "loss": 0.0531, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.54970932006836, + "rewards/margins": 16.25775718688965, + "rewards/rejected": -24.807464599609375, + "step": 1596 + }, + { + "epoch": 0.9934681181959565, + "grad_norm": 6.353683090765117e-08, + "learning_rate": 3.7159981558321813e-06, + "logits/chosen": -0.1960483193397522, + "logits/rejected": 2.1023592948913574, + "logps/chosen": -530.5663452148438, + "logps/rejected": -991.4660034179688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.981578826904297, + "rewards/margins": 29.86503028869629, + "rewards/rejected": -39.84661102294922, + "step": 1597 + }, + { + "epoch": 0.9940902021772939, + "grad_norm": 18.419109344482422, + "learning_rate": 3.7148455509451366e-06, + "logits/chosen": 0.2364797592163086, + "logits/rejected": 2.677727460861206, + "logps/chosen": -561.347412109375, + "logps/rejected": -885.8209838867188, + "loss": 0.1108, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.789037704467773, + "rewards/margins": 22.291229248046875, + "rewards/rejected": -31.080265045166016, + "step": 1598 + }, + { + "epoch": 0.9947122861586314, + "grad_norm": 0.12117072939872742, + "learning_rate": 3.713692946058092e-06, + "logits/chosen": 1.5356340408325195, + "logits/rejected": 3.313687801361084, + "logps/chosen": -543.3634643554688, + "logps/rejected": -787.9395751953125, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.35412883758545, + "rewards/margins": 17.061670303344727, + "rewards/rejected": -27.415800094604492, + "step": 1599 + }, + { + "epoch": 0.995334370139969, + "grad_norm": 0.0006846533506177366, + "learning_rate": 3.712540341171047e-06, + "logits/chosen": -0.7549853324890137, + "logits/rejected": 3.749081611633301, + "logps/chosen": -353.02239990234375, + "logps/rejected": -909.6260986328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.173841953277588, + "rewards/margins": 24.962791442871094, + "rewards/rejected": -31.136634826660156, + "step": 1600 + }, + { + "epoch": 0.9959564541213064, + "grad_norm": 39.99483108520508, + "learning_rate": 3.7113877362840022e-06, + "logits/chosen": 1.2029293775558472, + "logits/rejected": 3.3627114295959473, + "logps/chosen": -645.314697265625, + "logps/rejected": -1047.4544677734375, + "loss": 0.669, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.72260856628418, + "rewards/margins": 27.31245994567871, + "rewards/rejected": -37.035064697265625, + "step": 1601 + }, + { + "epoch": 0.9965785381026439, + "grad_norm": 49.874942779541016, + "learning_rate": 3.7102351313969575e-06, + "logits/chosen": 3.284135103225708, + "logits/rejected": 4.036801815032959, + "logps/chosen": -687.740478515625, + "logps/rejected": -827.6663818359375, + "loss": 0.9122, + "rewards/accuracies": 0.75, + "rewards/chosen": -6.97233247756958, + "rewards/margins": 11.839042663574219, + "rewards/rejected": -18.81137466430664, + "step": 1602 + }, + { + "epoch": 0.9972006220839813, + "grad_norm": 0.06565108150243759, + "learning_rate": 3.7090825265099127e-06, + "logits/chosen": -0.45381850004196167, + "logits/rejected": 4.070195198059082, + "logps/chosen": -459.11358642578125, + "logps/rejected": -935.2645874023438, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.786318778991699, + "rewards/margins": 21.61214256286621, + "rewards/rejected": -28.398460388183594, + "step": 1603 + }, + { + "epoch": 0.9978227060653189, + "grad_norm": 0.16008152067661285, + "learning_rate": 3.7079299216228683e-06, + "logits/chosen": 2.136279582977295, + "logits/rejected": 4.350179672241211, + "logps/chosen": -650.0744018554688, + "logps/rejected": -1078.3236083984375, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.329696655273438, + "rewards/margins": 26.71537208557129, + "rewards/rejected": -35.04507064819336, + "step": 1604 + }, + { + "epoch": 0.9984447900466563, + "grad_norm": 32.31972122192383, + "learning_rate": 3.7067773167358236e-06, + "logits/chosen": 1.3715128898620605, + "logits/rejected": 4.225821495056152, + "logps/chosen": -534.2005615234375, + "logps/rejected": -910.2581787109375, + "loss": 0.6488, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.50748348236084, + "rewards/margins": 17.401165008544922, + "rewards/rejected": -26.908649444580078, + "step": 1605 + }, + { + "epoch": 0.9990668740279938, + "grad_norm": 0.004076329059898853, + "learning_rate": 3.705624711848779e-06, + "logits/chosen": 0.3134296238422394, + "logits/rejected": 2.1142117977142334, + "logps/chosen": -437.3398132324219, + "logps/rejected": -795.037109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.274627685546875, + "rewards/margins": 22.731220245361328, + "rewards/rejected": -32.0058479309082, + "step": 1606 + }, + { + "epoch": 0.9996889580093312, + "grad_norm": 6.227400263014715e-06, + "learning_rate": 3.704472106961734e-06, + "logits/chosen": -0.9058216214179993, + "logits/rejected": 4.364875793457031, + "logps/chosen": -474.5135498046875, + "logps/rejected": -1069.215576171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.431096076965332, + "rewards/margins": 33.49390411376953, + "rewards/rejected": -40.92500305175781, + "step": 1607 + }, + { + "epoch": 1.0003110419906687, + "grad_norm": 34.91269302368164, + "learning_rate": 3.7033195020746892e-06, + "logits/chosen": 1.389174461364746, + "logits/rejected": 4.184999942779541, + "logps/chosen": -401.0694580078125, + "logps/rejected": -723.718994140625, + "loss": 0.7746, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.645594596862793, + "rewards/margins": 20.324663162231445, + "rewards/rejected": -27.970258712768555, + "step": 1608 + }, + { + "epoch": 1.0009331259720062, + "grad_norm": 0.023849627003073692, + "learning_rate": 3.7021668971876445e-06, + "logits/chosen": -0.6653918027877808, + "logits/rejected": 3.9303970336914062, + "logps/chosen": -543.9305419921875, + "logps/rejected": -1017.9217529296875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.253396511077881, + "rewards/margins": 29.039730072021484, + "rewards/rejected": -35.293128967285156, + "step": 1609 + }, + { + "epoch": 1.0015552099533438, + "grad_norm": 5.411730307969265e-05, + "learning_rate": 3.7010142923005997e-06, + "logits/chosen": -0.11129330098628998, + "logits/rejected": 3.716736316680908, + "logps/chosen": -332.31414794921875, + "logps/rejected": -754.6508178710938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.006335735321045, + "rewards/margins": 23.23809814453125, + "rewards/rejected": -29.24443244934082, + "step": 1610 + }, + { + "epoch": 1.0021772939346811, + "grad_norm": 0.9929457306861877, + "learning_rate": 3.699861687413555e-06, + "logits/chosen": 0.5281230211257935, + "logits/rejected": 3.874539852142334, + "logps/chosen": -519.0057373046875, + "logps/rejected": -1060.0328369140625, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.74343204498291, + "rewards/margins": 31.822721481323242, + "rewards/rejected": -39.56615447998047, + "step": 1611 + }, + { + "epoch": 1.0027993779160187, + "grad_norm": 1.0761016607284546, + "learning_rate": 3.6987090825265106e-06, + "logits/chosen": -0.8074854612350464, + "logits/rejected": 4.119511127471924, + "logps/chosen": -493.4097900390625, + "logps/rejected": -956.7243041992188, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.330320358276367, + "rewards/margins": 21.3006591796875, + "rewards/rejected": -28.630979537963867, + "step": 1612 + }, + { + "epoch": 1.003421461897356, + "grad_norm": 0.0010593491606414318, + "learning_rate": 3.6975564776394658e-06, + "logits/chosen": -1.6595462560653687, + "logits/rejected": 3.404412269592285, + "logps/chosen": -308.19329833984375, + "logps/rejected": -925.4331665039062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.133266448974609, + "rewards/margins": 31.814796447753906, + "rewards/rejected": -36.948062896728516, + "step": 1613 + }, + { + "epoch": 1.0040435458786936, + "grad_norm": 0.1719919741153717, + "learning_rate": 3.696403872752421e-06, + "logits/chosen": 1.4374184608459473, + "logits/rejected": 4.8389716148376465, + "logps/chosen": -508.57843017578125, + "logps/rejected": -896.6202392578125, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.462780475616455, + "rewards/margins": 22.043060302734375, + "rewards/rejected": -27.505840301513672, + "step": 1614 + }, + { + "epoch": 1.0046656298600312, + "grad_norm": 0.00047982099931687117, + "learning_rate": 3.6952512678653762e-06, + "logits/chosen": -0.11289626359939575, + "logits/rejected": 5.051069259643555, + "logps/chosen": -386.2294921875, + "logps/rejected": -910.8284301757812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.725472450256348, + "rewards/margins": 25.839317321777344, + "rewards/rejected": -33.564788818359375, + "step": 1615 + }, + { + "epoch": 1.0052877138413685, + "grad_norm": 0.684409499168396, + "learning_rate": 3.6940986629783315e-06, + "logits/chosen": 1.061985969543457, + "logits/rejected": 1.488252878189087, + "logps/chosen": -473.2418212890625, + "logps/rejected": -788.3059692382812, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.849893569946289, + "rewards/margins": 23.360258102416992, + "rewards/rejected": -33.21015167236328, + "step": 1616 + }, + { + "epoch": 1.005909797822706, + "grad_norm": 0.01652875542640686, + "learning_rate": 3.6929460580912867e-06, + "logits/chosen": -1.0540319681167603, + "logits/rejected": 2.3286192417144775, + "logps/chosen": -477.8441162109375, + "logps/rejected": -959.3941040039062, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.00267219543457, + "rewards/margins": 23.850139617919922, + "rewards/rejected": -32.85280990600586, + "step": 1617 + }, + { + "epoch": 1.0065318818040436, + "grad_norm": 2.5337561737615033e-07, + "learning_rate": 3.691793453204242e-06, + "logits/chosen": -0.5124354362487793, + "logits/rejected": 4.590569019317627, + "logps/chosen": -465.9842834472656, + "logps/rejected": -1202.935546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.984279632568359, + "rewards/margins": 38.834861755371094, + "rewards/rejected": -45.81914138793945, + "step": 1618 + }, + { + "epoch": 1.007153965785381, + "grad_norm": 2.6867403984069824, + "learning_rate": 3.6906408483171976e-06, + "logits/chosen": 0.7521177530288696, + "logits/rejected": 5.3029465675354, + "logps/chosen": -548.4725341796875, + "logps/rejected": -1041.0906982421875, + "loss": 0.0123, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.513587951660156, + "rewards/margins": 21.835956573486328, + "rewards/rejected": -31.34954261779785, + "step": 1619 + }, + { + "epoch": 1.0077760497667185, + "grad_norm": 0.004673125222325325, + "learning_rate": 3.6894882434301528e-06, + "logits/chosen": 0.1786353588104248, + "logits/rejected": 3.4529829025268555, + "logps/chosen": -371.00592041015625, + "logps/rejected": -769.5985717773438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.893185615539551, + "rewards/margins": 24.657033920288086, + "rewards/rejected": -31.550220489501953, + "step": 1620 + }, + { + "epoch": 1.008398133748056, + "grad_norm": 0.0009540282189846039, + "learning_rate": 3.688335638543108e-06, + "logits/chosen": -0.4206286072731018, + "logits/rejected": 2.911111831665039, + "logps/chosen": -376.7239990234375, + "logps/rejected": -911.2799072265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.087729454040527, + "rewards/margins": 30.201828002929688, + "rewards/rejected": -35.28955841064453, + "step": 1621 + }, + { + "epoch": 1.0090202177293934, + "grad_norm": 5.449791206046939e-05, + "learning_rate": 3.6871830336560632e-06, + "logits/chosen": 1.2042280435562134, + "logits/rejected": 5.087825775146484, + "logps/chosen": -480.3478698730469, + "logps/rejected": -862.7271118164062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.974275588989258, + "rewards/margins": 22.0208797454834, + "rewards/rejected": -27.995153427124023, + "step": 1622 + }, + { + "epoch": 1.009642301710731, + "grad_norm": 19.46409034729004, + "learning_rate": 3.6860304287690185e-06, + "logits/chosen": 3.0977351665496826, + "logits/rejected": 2.2187840938568115, + "logps/chosen": -602.4797973632812, + "logps/rejected": -827.6873168945312, + "loss": 0.1975, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.518498420715332, + "rewards/margins": 18.173076629638672, + "rewards/rejected": -27.691574096679688, + "step": 1623 + }, + { + "epoch": 1.0102643856920683, + "grad_norm": 14.768061637878418, + "learning_rate": 3.6848778238819737e-06, + "logits/chosen": -0.5324491858482361, + "logits/rejected": 3.6249537467956543, + "logps/chosen": -440.9612121582031, + "logps/rejected": -942.3751831054688, + "loss": 0.1718, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.06298828125, + "rewards/margins": 22.763708114624023, + "rewards/rejected": -32.826698303222656, + "step": 1624 + }, + { + "epoch": 1.010886469673406, + "grad_norm": 0.0009539787424728274, + "learning_rate": 3.683725218994929e-06, + "logits/chosen": -2.3274078369140625, + "logits/rejected": 4.436672687530518, + "logps/chosen": -315.9355773925781, + "logps/rejected": -1044.0924072265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.2964043617248535, + "rewards/margins": 34.74640655517578, + "rewards/rejected": -39.042808532714844, + "step": 1625 + }, + { + "epoch": 1.0115085536547435, + "grad_norm": 0.07945874333381653, + "learning_rate": 3.6825726141078846e-06, + "logits/chosen": 0.2944844663143158, + "logits/rejected": 3.081148624420166, + "logps/chosen": -613.04931640625, + "logps/rejected": -1093.5924072265625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.141210556030273, + "rewards/margins": 28.32501983642578, + "rewards/rejected": -40.46623229980469, + "step": 1626 + }, + { + "epoch": 1.0121306376360808, + "grad_norm": 7.767158649585326e-07, + "learning_rate": 3.6814200092208398e-06, + "logits/chosen": 0.2727533280849457, + "logits/rejected": 2.755222797393799, + "logps/chosen": -553.1746826171875, + "logps/rejected": -915.7510375976562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.928220748901367, + "rewards/margins": 24.597822189331055, + "rewards/rejected": -30.52604103088379, + "step": 1627 + }, + { + "epoch": 1.0127527216174184, + "grad_norm": 1.0329980850219727, + "learning_rate": 3.680267404333795e-06, + "logits/chosen": 0.5199713110923767, + "logits/rejected": 3.451162815093994, + "logps/chosen": -573.7638549804688, + "logps/rejected": -1039.6966552734375, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.805469512939453, + "rewards/margins": 24.30244255065918, + "rewards/rejected": -35.10791015625, + "step": 1628 + }, + { + "epoch": 1.013374805598756, + "grad_norm": 5.4592834203504026e-05, + "learning_rate": 3.6791147994467502e-06, + "logits/chosen": -0.6406729221343994, + "logits/rejected": 3.3449177742004395, + "logps/chosen": -435.84527587890625, + "logps/rejected": -979.3944091796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.502603530883789, + "rewards/margins": 30.276081085205078, + "rewards/rejected": -39.7786865234375, + "step": 1629 + }, + { + "epoch": 1.0139968895800933, + "grad_norm": 23.031963348388672, + "learning_rate": 3.6779621945597055e-06, + "logits/chosen": 2.074857473373413, + "logits/rejected": 4.605471611022949, + "logps/chosen": -614.27490234375, + "logps/rejected": -1021.327880859375, + "loss": 0.151, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.717360019683838, + "rewards/margins": 22.947689056396484, + "rewards/rejected": -30.66505241394043, + "step": 1630 + }, + { + "epoch": 1.0146189735614308, + "grad_norm": 28.341402053833008, + "learning_rate": 3.6768095896726607e-06, + "logits/chosen": 0.06842297315597534, + "logits/rejected": 3.198674440383911, + "logps/chosen": -429.0621643066406, + "logps/rejected": -785.5848388671875, + "loss": 0.3921, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.456113815307617, + "rewards/margins": 12.912420272827148, + "rewards/rejected": -19.368534088134766, + "step": 1631 + }, + { + "epoch": 1.0152410575427682, + "grad_norm": 0.001189008355140686, + "learning_rate": 3.675656984785616e-06, + "logits/chosen": 1.5726920366287231, + "logits/rejected": 4.682127952575684, + "logps/chosen": -489.81280517578125, + "logps/rejected": -987.9769897460938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.758681297302246, + "rewards/margins": 29.487350463867188, + "rewards/rejected": -36.24603271484375, + "step": 1632 + }, + { + "epoch": 1.0158631415241057, + "grad_norm": 0.0009915231494233012, + "learning_rate": 3.674504379898571e-06, + "logits/chosen": -0.04778742790222168, + "logits/rejected": 3.202317237854004, + "logps/chosen": -492.9322509765625, + "logps/rejected": -1061.7415771484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.842962265014648, + "rewards/margins": 33.83513259887695, + "rewards/rejected": -42.67809295654297, + "step": 1633 + }, + { + "epoch": 1.0164852255054433, + "grad_norm": 0.020902851596474648, + "learning_rate": 3.6733517750115268e-06, + "logits/chosen": 1.1798291206359863, + "logits/rejected": 3.560368299484253, + "logps/chosen": -612.0977783203125, + "logps/rejected": -1176.016357421875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.882665157318115, + "rewards/margins": 33.82012176513672, + "rewards/rejected": -40.70278549194336, + "step": 1634 + }, + { + "epoch": 1.0171073094867806, + "grad_norm": 1.8183759450912476, + "learning_rate": 3.672199170124482e-06, + "logits/chosen": 2.5154342651367188, + "logits/rejected": 2.281287670135498, + "logps/chosen": -606.6002197265625, + "logps/rejected": -919.1095581054688, + "loss": 0.0075, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.341569900512695, + "rewards/margins": 22.567405700683594, + "rewards/rejected": -33.908973693847656, + "step": 1635 + }, + { + "epoch": 1.0177293934681182, + "grad_norm": 2.0949930679137196e-10, + "learning_rate": 3.671046565237437e-06, + "logits/chosen": 1.1360574960708618, + "logits/rejected": 3.4395339488983154, + "logps/chosen": -571.862548828125, + "logps/rejected": -1135.98388671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.192299842834473, + "rewards/margins": 36.66862106323242, + "rewards/rejected": -44.86091995239258, + "step": 1636 + }, + { + "epoch": 1.0183514774494558, + "grad_norm": 2.616175413131714, + "learning_rate": 3.669893960350392e-06, + "logits/chosen": -0.20007240772247314, + "logits/rejected": 3.70963716506958, + "logps/chosen": -440.68817138671875, + "logps/rejected": -851.4669189453125, + "loss": 0.0123, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.582137107849121, + "rewards/margins": 23.428585052490234, + "rewards/rejected": -31.010723114013672, + "step": 1637 + }, + { + "epoch": 1.018973561430793, + "grad_norm": 0.011836409568786621, + "learning_rate": 3.6687413554633473e-06, + "logits/chosen": 1.411272406578064, + "logits/rejected": 3.4393310546875, + "logps/chosen": -656.6214599609375, + "logps/rejected": -992.0446166992188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.456025123596191, + "rewards/margins": 30.13931655883789, + "rewards/rejected": -34.595340728759766, + "step": 1638 + }, + { + "epoch": 1.0195956454121307, + "grad_norm": 0.24777275323867798, + "learning_rate": 3.6675887505763025e-06, + "logits/chosen": -0.15388274192810059, + "logits/rejected": 0.12217582017183304, + "logps/chosen": -670.36572265625, + "logps/rejected": -854.996337890625, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.72994613647461, + "rewards/margins": 20.67226219177246, + "rewards/rejected": -30.402206420898438, + "step": 1639 + }, + { + "epoch": 1.0202177293934682, + "grad_norm": 0.6539616584777832, + "learning_rate": 3.6664361456892577e-06, + "logits/chosen": 0.7842258214950562, + "logits/rejected": 3.279789924621582, + "logps/chosen": -583.931640625, + "logps/rejected": -1048.6573486328125, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.54281759262085, + "rewards/margins": 29.190086364746094, + "rewards/rejected": -36.73290252685547, + "step": 1640 + }, + { + "epoch": 1.0208398133748056, + "grad_norm": 0.44214633107185364, + "learning_rate": 3.665283540802213e-06, + "logits/chosen": 2.359795331954956, + "logits/rejected": 3.069380760192871, + "logps/chosen": -659.0450439453125, + "logps/rejected": -957.125732421875, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.856813430786133, + "rewards/margins": 23.456443786621094, + "rewards/rejected": -32.313255310058594, + "step": 1641 + }, + { + "epoch": 1.0214618973561431, + "grad_norm": 0.011518875136971474, + "learning_rate": 3.664130935915168e-06, + "logits/chosen": 3.3660264015197754, + "logits/rejected": 4.603273391723633, + "logps/chosen": -709.8883056640625, + "logps/rejected": -965.43896484375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.867781639099121, + "rewards/margins": 22.071300506591797, + "rewards/rejected": -30.939083099365234, + "step": 1642 + }, + { + "epoch": 1.0220839813374805, + "grad_norm": 0.0006374081131070852, + "learning_rate": 3.662978331028124e-06, + "logits/chosen": -0.2983798682689667, + "logits/rejected": 4.081971168518066, + "logps/chosen": -327.46661376953125, + "logps/rejected": -816.51806640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7871956825256348, + "rewards/margins": 23.058956146240234, + "rewards/rejected": -26.846153259277344, + "step": 1643 + }, + { + "epoch": 1.022706065318818, + "grad_norm": 0.633435845375061, + "learning_rate": 3.661825726141079e-06, + "logits/chosen": -1.1860074996948242, + "logits/rejected": 2.1363470554351807, + "logps/chosen": -608.3544311523438, + "logps/rejected": -986.3404541015625, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.85249662399292, + "rewards/margins": 27.040531158447266, + "rewards/rejected": -34.893028259277344, + "step": 1644 + }, + { + "epoch": 1.0233281493001556, + "grad_norm": 0.00012436254473868757, + "learning_rate": 3.6606731212540342e-06, + "logits/chosen": 0.9427947998046875, + "logits/rejected": 2.397813081741333, + "logps/chosen": -606.085693359375, + "logps/rejected": -1083.9486083984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.637308120727539, + "rewards/margins": 34.03984069824219, + "rewards/rejected": -43.677146911621094, + "step": 1645 + }, + { + "epoch": 1.023950233281493, + "grad_norm": 2.4389617465203628e-05, + "learning_rate": 3.6595205163669895e-06, + "logits/chosen": 1.5660756826400757, + "logits/rejected": 2.541382312774658, + "logps/chosen": -635.6356201171875, + "logps/rejected": -952.6004028320312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.896754264831543, + "rewards/margins": 22.892070770263672, + "rewards/rejected": -33.78882598876953, + "step": 1646 + }, + { + "epoch": 1.0245723172628305, + "grad_norm": 0.00010650245530996472, + "learning_rate": 3.6583679114799447e-06, + "logits/chosen": -1.616473913192749, + "logits/rejected": 2.7548515796661377, + "logps/chosen": -359.27069091796875, + "logps/rejected": -915.1429443359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.845514297485352, + "rewards/margins": 27.412410736083984, + "rewards/rejected": -32.2579231262207, + "step": 1647 + }, + { + "epoch": 1.025194401244168, + "grad_norm": 0.15823419392108917, + "learning_rate": 3.6572153065929e-06, + "logits/chosen": 1.3099300861358643, + "logits/rejected": 5.212904930114746, + "logps/chosen": -537.3043823242188, + "logps/rejected": -903.5443725585938, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.36880111694336, + "rewards/margins": 16.472347259521484, + "rewards/rejected": -25.841148376464844, + "step": 1648 + }, + { + "epoch": 1.0258164852255054, + "grad_norm": 6.6609704845177475e-06, + "learning_rate": 3.656062701705855e-06, + "logits/chosen": 1.4306132793426514, + "logits/rejected": 3.6266322135925293, + "logps/chosen": -444.43133544921875, + "logps/rejected": -858.3787841796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.317989349365234, + "rewards/margins": 30.24403190612793, + "rewards/rejected": -34.56201934814453, + "step": 1649 + }, + { + "epoch": 1.026438569206843, + "grad_norm": 0.005263295955955982, + "learning_rate": 3.654910096818811e-06, + "logits/chosen": 0.2962612509727478, + "logits/rejected": 3.644659996032715, + "logps/chosen": -574.5421752929688, + "logps/rejected": -990.66162109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.722488403320312, + "rewards/margins": 26.464990615844727, + "rewards/rejected": -36.187477111816406, + "step": 1650 + }, + { + "epoch": 1.0270606531881805, + "grad_norm": 0.08479459583759308, + "learning_rate": 3.653757491931766e-06, + "logits/chosen": -0.08327645063400269, + "logits/rejected": 3.943711519241333, + "logps/chosen": -545.4989013671875, + "logps/rejected": -958.6614990234375, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.588180541992188, + "rewards/margins": 22.37979507446289, + "rewards/rejected": -30.96797752380371, + "step": 1651 + }, + { + "epoch": 1.0276827371695179, + "grad_norm": 3.4717464814093546e-07, + "learning_rate": 3.6526048870447212e-06, + "logits/chosen": -1.4334676265716553, + "logits/rejected": 3.306443452835083, + "logps/chosen": -395.6757507324219, + "logps/rejected": -912.210205078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.050688743591309, + "rewards/margins": 27.281938552856445, + "rewards/rejected": -31.332626342773438, + "step": 1652 + }, + { + "epoch": 1.0283048211508554, + "grad_norm": 0.04955174773931503, + "learning_rate": 3.6514522821576765e-06, + "logits/chosen": 0.1008802056312561, + "logits/rejected": 2.2651162147521973, + "logps/chosen": -557.874267578125, + "logps/rejected": -865.3643798828125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.999269962310791, + "rewards/margins": 18.858959197998047, + "rewards/rejected": -26.85822868347168, + "step": 1653 + }, + { + "epoch": 1.0289269051321928, + "grad_norm": 24.3441162109375, + "learning_rate": 3.6502996772706317e-06, + "logits/chosen": 0.04064282774925232, + "logits/rejected": 3.0165932178497314, + "logps/chosen": -483.45697021484375, + "logps/rejected": -857.9864501953125, + "loss": 0.2728, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.3655900955200195, + "rewards/margins": 18.789987564086914, + "rewards/rejected": -24.155576705932617, + "step": 1654 + }, + { + "epoch": 1.0295489891135303, + "grad_norm": 54.452606201171875, + "learning_rate": 3.649147072383587e-06, + "logits/chosen": -1.2691631317138672, + "logits/rejected": 1.3112772703170776, + "logps/chosen": -548.9378051757812, + "logps/rejected": -790.1439208984375, + "loss": 1.3937, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.830726623535156, + "rewards/margins": 15.580018043518066, + "rewards/rejected": -24.410743713378906, + "step": 1655 + }, + { + "epoch": 1.0301710730948679, + "grad_norm": 1.790919542312622, + "learning_rate": 3.647994467496542e-06, + "logits/chosen": -1.0112364292144775, + "logits/rejected": 0.8678668737411499, + "logps/chosen": -378.75933837890625, + "logps/rejected": -695.4180908203125, + "loss": 0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.413599967956543, + "rewards/margins": 19.858667373657227, + "rewards/rejected": -27.272268295288086, + "step": 1656 + }, + { + "epoch": 1.0307931570762052, + "grad_norm": 3.013064088008832e-06, + "learning_rate": 3.6468418626094974e-06, + "logits/chosen": -3.579735279083252, + "logits/rejected": 0.7812553644180298, + "logps/chosen": -366.45904541015625, + "logps/rejected": -877.666259765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.8892035484313965, + "rewards/margins": 25.258516311645508, + "rewards/rejected": -33.14772033691406, + "step": 1657 + }, + { + "epoch": 1.0314152410575428, + "grad_norm": 0.20314611494541168, + "learning_rate": 3.645689257722453e-06, + "logits/chosen": 0.3334569036960602, + "logits/rejected": 2.605698347091675, + "logps/chosen": -470.14068603515625, + "logps/rejected": -883.865478515625, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.243642807006836, + "rewards/margins": 25.31780242919922, + "rewards/rejected": -34.56144714355469, + "step": 1658 + }, + { + "epoch": 1.0320373250388803, + "grad_norm": 4.763148542252793e-09, + "learning_rate": 3.6445366528354082e-06, + "logits/chosen": -0.9464988112449646, + "logits/rejected": 3.2320268154144287, + "logps/chosen": -583.88232421875, + "logps/rejected": -1142.690673828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.410359382629395, + "rewards/margins": 33.93184280395508, + "rewards/rejected": -45.342201232910156, + "step": 1659 + }, + { + "epoch": 1.0326594090202177, + "grad_norm": 3.7136945724487305, + "learning_rate": 3.6433840479483635e-06, + "logits/chosen": 0.5876110792160034, + "logits/rejected": 3.5559744834899902, + "logps/chosen": -435.9635009765625, + "logps/rejected": -796.0913696289062, + "loss": 0.0169, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.127996444702148, + "rewards/margins": 24.278839111328125, + "rewards/rejected": -31.406837463378906, + "step": 1660 + }, + { + "epoch": 1.0332814930015553, + "grad_norm": 6.777904033660889, + "learning_rate": 3.6422314430613187e-06, + "logits/chosen": 2.48170804977417, + "logits/rejected": 4.341867923736572, + "logps/chosen": -720.7596435546875, + "logps/rejected": -1063.3670654296875, + "loss": 0.0417, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.72669506072998, + "rewards/margins": 22.279956817626953, + "rewards/rejected": -33.00665283203125, + "step": 1661 + }, + { + "epoch": 1.0339035769828926, + "grad_norm": 31.819677352905273, + "learning_rate": 3.641078838174274e-06, + "logits/chosen": 0.8430761694908142, + "logits/rejected": 2.8223979473114014, + "logps/chosen": -605.6489868164062, + "logps/rejected": -917.00927734375, + "loss": 0.6779, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.731306076049805, + "rewards/margins": 23.4366455078125, + "rewards/rejected": -33.16795349121094, + "step": 1662 + }, + { + "epoch": 1.0345256609642302, + "grad_norm": 0.5945461392402649, + "learning_rate": 3.639926233287229e-06, + "logits/chosen": 1.1385351419448853, + "logits/rejected": 3.092228412628174, + "logps/chosen": -531.9583740234375, + "logps/rejected": -990.5872802734375, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.152070045471191, + "rewards/margins": 28.98786163330078, + "rewards/rejected": -38.139930725097656, + "step": 1663 + }, + { + "epoch": 1.0351477449455677, + "grad_norm": 0.0029474389739334583, + "learning_rate": 3.6387736284001844e-06, + "logits/chosen": 2.3572731018066406, + "logits/rejected": 3.6039628982543945, + "logps/chosen": -572.6138916015625, + "logps/rejected": -994.5134887695312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.076838493347168, + "rewards/margins": 30.933841705322266, + "rewards/rejected": -37.01068115234375, + "step": 1664 + }, + { + "epoch": 1.035769828926905, + "grad_norm": 30.376829147338867, + "learning_rate": 3.63762102351314e-06, + "logits/chosen": -0.8222250938415527, + "logits/rejected": 3.142303943634033, + "logps/chosen": -321.65899658203125, + "logps/rejected": -653.8709716796875, + "loss": 0.4004, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.204082489013672, + "rewards/margins": 15.132912635803223, + "rewards/rejected": -22.33699607849121, + "step": 1665 + }, + { + "epoch": 1.0363919129082426, + "grad_norm": 0.008387638255953789, + "learning_rate": 3.6364684186260952e-06, + "logits/chosen": 3.4995431900024414, + "logits/rejected": 5.597842216491699, + "logps/chosen": -749.5764770507812, + "logps/rejected": -1146.525146484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.341553211212158, + "rewards/margins": 25.269424438476562, + "rewards/rejected": -31.610977172851562, + "step": 1666 + }, + { + "epoch": 1.0370139968895802, + "grad_norm": 25.438154220581055, + "learning_rate": 3.6353158137390505e-06, + "logits/chosen": 0.6028562784194946, + "logits/rejected": 4.041049003601074, + "logps/chosen": -510.02557373046875, + "logps/rejected": -868.3908081054688, + "loss": 0.1768, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.452632427215576, + "rewards/margins": 18.50664520263672, + "rewards/rejected": -25.959278106689453, + "step": 1667 + }, + { + "epoch": 1.0376360808709175, + "grad_norm": 7.822577953338623, + "learning_rate": 3.6341632088520057e-06, + "logits/chosen": -3.6562254428863525, + "logits/rejected": 0.9931538701057434, + "logps/chosen": -330.18707275390625, + "logps/rejected": -787.677734375, + "loss": 0.0213, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.651616096496582, + "rewards/margins": 20.370330810546875, + "rewards/rejected": -25.021947860717773, + "step": 1668 + }, + { + "epoch": 1.038258164852255, + "grad_norm": 4.106832981109619, + "learning_rate": 3.633010603964961e-06, + "logits/chosen": -0.2826972007751465, + "logits/rejected": 3.079453468322754, + "logps/chosen": -547.0927734375, + "logps/rejected": -959.0612182617188, + "loss": 0.0168, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.533456802368164, + "rewards/margins": 22.27680206298828, + "rewards/rejected": -31.810256958007812, + "step": 1669 + }, + { + "epoch": 1.0388802488335926, + "grad_norm": 4.633700370788574, + "learning_rate": 3.631857999077916e-06, + "logits/chosen": -0.9866232872009277, + "logits/rejected": 3.467489719390869, + "logps/chosen": -432.56134033203125, + "logps/rejected": -881.6924438476562, + "loss": 0.0104, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.791879177093506, + "rewards/margins": 22.992813110351562, + "rewards/rejected": -30.784690856933594, + "step": 1670 + }, + { + "epoch": 1.03950233281493, + "grad_norm": 1.7924748659133911, + "learning_rate": 3.6307053941908714e-06, + "logits/chosen": 1.0629141330718994, + "logits/rejected": 3.884521722793579, + "logps/chosen": -445.9981994628906, + "logps/rejected": -854.4365844726562, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.354894638061523, + "rewards/margins": 24.29794692993164, + "rewards/rejected": -30.652841567993164, + "step": 1671 + }, + { + "epoch": 1.0401244167962675, + "grad_norm": 0.001384186209179461, + "learning_rate": 3.629552789303827e-06, + "logits/chosen": 1.3695205450057983, + "logits/rejected": 4.970004081726074, + "logps/chosen": -430.239501953125, + "logps/rejected": -870.437744140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.031839370727539, + "rewards/margins": 24.595035552978516, + "rewards/rejected": -29.626874923706055, + "step": 1672 + }, + { + "epoch": 1.0407465007776049, + "grad_norm": 0.32689252495765686, + "learning_rate": 3.6284001844167822e-06, + "logits/chosen": -0.18244385719299316, + "logits/rejected": 2.874420166015625, + "logps/chosen": -481.1783447265625, + "logps/rejected": -967.6571044921875, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.099664688110352, + "rewards/margins": 25.957063674926758, + "rewards/rejected": -33.05672836303711, + "step": 1673 + }, + { + "epoch": 1.0413685847589425, + "grad_norm": 28.884672164916992, + "learning_rate": 3.6272475795297375e-06, + "logits/chosen": -0.16030162572860718, + "logits/rejected": 3.1493980884552, + "logps/chosen": -402.1942443847656, + "logps/rejected": -800.2150268554688, + "loss": 0.4067, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.243395805358887, + "rewards/margins": 16.908164978027344, + "rewards/rejected": -23.151561737060547, + "step": 1674 + }, + { + "epoch": 1.04199066874028, + "grad_norm": 0.03992212936282158, + "learning_rate": 3.6260949746426927e-06, + "logits/chosen": 0.6602403521537781, + "logits/rejected": 3.8015923500061035, + "logps/chosen": -679.17236328125, + "logps/rejected": -1099.7989501953125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.684561729431152, + "rewards/margins": 26.821062088012695, + "rewards/rejected": -36.50562286376953, + "step": 1675 + }, + { + "epoch": 1.0426127527216174, + "grad_norm": 34.57837677001953, + "learning_rate": 3.624942369755648e-06, + "logits/chosen": 0.16994890570640564, + "logits/rejected": 1.8535521030426025, + "logps/chosen": -510.3432922363281, + "logps/rejected": -723.0161743164062, + "loss": 0.6064, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.824977874755859, + "rewards/margins": 18.82767105102539, + "rewards/rejected": -23.652650833129883, + "step": 1676 + }, + { + "epoch": 1.043234836702955, + "grad_norm": 4.1895599365234375, + "learning_rate": 3.623789764868603e-06, + "logits/chosen": -0.6701554656028748, + "logits/rejected": 4.756260871887207, + "logps/chosen": -426.80133056640625, + "logps/rejected": -956.4772338867188, + "loss": 0.0173, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.39846658706665, + "rewards/margins": 22.706832885742188, + "rewards/rejected": -27.105300903320312, + "step": 1677 + }, + { + "epoch": 1.0438569206842925, + "grad_norm": 0.04196924343705177, + "learning_rate": 3.6226371599815584e-06, + "logits/chosen": 0.7270650267601013, + "logits/rejected": 1.8944915533065796, + "logps/chosen": -611.7488403320312, + "logps/rejected": -976.1044921875, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.306270599365234, + "rewards/margins": 25.221054077148438, + "rewards/rejected": -32.52732467651367, + "step": 1678 + }, + { + "epoch": 1.0444790046656298, + "grad_norm": 2.5633633136749268, + "learning_rate": 3.621484555094514e-06, + "logits/chosen": 2.270385503768921, + "logits/rejected": 5.121720314025879, + "logps/chosen": -519.3915405273438, + "logps/rejected": -906.0528564453125, + "loss": 0.0183, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.927509307861328, + "rewards/margins": 23.416658401489258, + "rewards/rejected": -33.34416580200195, + "step": 1679 + }, + { + "epoch": 1.0451010886469674, + "grad_norm": 0.004999854601919651, + "learning_rate": 3.6203319502074692e-06, + "logits/chosen": -0.6083928346633911, + "logits/rejected": 3.2550253868103027, + "logps/chosen": -405.646484375, + "logps/rejected": -903.578857421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4245429039001465, + "rewards/margins": 26.661523818969727, + "rewards/rejected": -29.08606719970703, + "step": 1680 + }, + { + "epoch": 1.0457231726283047, + "grad_norm": 4.50816260126885e-05, + "learning_rate": 3.6191793453204245e-06, + "logits/chosen": -0.9008429050445557, + "logits/rejected": 1.5069544315338135, + "logps/chosen": -469.40972900390625, + "logps/rejected": -793.4561157226562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.385979652404785, + "rewards/margins": 20.223291397094727, + "rewards/rejected": -26.609272003173828, + "step": 1681 + }, + { + "epoch": 1.0463452566096423, + "grad_norm": 7.654258728027344, + "learning_rate": 3.6180267404333797e-06, + "logits/chosen": -0.44024014472961426, + "logits/rejected": 1.088015079498291, + "logps/chosen": -698.7821044921875, + "logps/rejected": -1008.3251953125, + "loss": 0.0356, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.793458938598633, + "rewards/margins": 23.57197380065918, + "rewards/rejected": -33.36543273925781, + "step": 1682 + }, + { + "epoch": 1.0469673405909798, + "grad_norm": 0.00012983712076675147, + "learning_rate": 3.616874135546335e-06, + "logits/chosen": -2.612804412841797, + "logits/rejected": 1.900937557220459, + "logps/chosen": -354.28045654296875, + "logps/rejected": -788.4923706054688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.724903106689453, + "rewards/margins": 25.43047523498535, + "rewards/rejected": -30.155378341674805, + "step": 1683 + }, + { + "epoch": 1.0475894245723172, + "grad_norm": 0.007622862234711647, + "learning_rate": 3.61572153065929e-06, + "logits/chosen": 1.4981238842010498, + "logits/rejected": 3.9929189682006836, + "logps/chosen": -509.0369567871094, + "logps/rejected": -943.208740234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.061955451965332, + "rewards/margins": 24.140560150146484, + "rewards/rejected": -33.2025146484375, + "step": 1684 + }, + { + "epoch": 1.0482115085536547, + "grad_norm": 25.75448226928711, + "learning_rate": 3.6145689257722454e-06, + "logits/chosen": 2.079399585723877, + "logits/rejected": 2.769184112548828, + "logps/chosen": -719.6904907226562, + "logps/rejected": -1111.2332763671875, + "loss": 0.1932, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.611705780029297, + "rewards/margins": 29.136491775512695, + "rewards/rejected": -40.748199462890625, + "step": 1685 + }, + { + "epoch": 1.0488335925349923, + "grad_norm": 6.400043275789358e-06, + "learning_rate": 3.6134163208852006e-06, + "logits/chosen": 0.7596876621246338, + "logits/rejected": 2.1933321952819824, + "logps/chosen": -483.5215759277344, + "logps/rejected": -831.635009765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.2516350746154785, + "rewards/margins": 24.35067367553711, + "rewards/rejected": -30.602312088012695, + "step": 1686 + }, + { + "epoch": 1.0494556765163296, + "grad_norm": 7.343322067754343e-06, + "learning_rate": 3.6122637159981562e-06, + "logits/chosen": -2.8173489570617676, + "logits/rejected": 3.3061482906341553, + "logps/chosen": -377.90399169921875, + "logps/rejected": -955.3590698242188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.850915908813477, + "rewards/margins": 30.7342529296875, + "rewards/rejected": -35.585166931152344, + "step": 1687 + }, + { + "epoch": 1.0500777604976672, + "grad_norm": 0.28369632363319397, + "learning_rate": 3.6111111111111115e-06, + "logits/chosen": 1.0880281925201416, + "logits/rejected": 3.659137725830078, + "logps/chosen": -371.4967346191406, + "logps/rejected": -714.4507446289062, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.510320663452148, + "rewards/margins": 24.008285522460938, + "rewards/rejected": -28.518606185913086, + "step": 1688 + }, + { + "epoch": 1.0506998444790048, + "grad_norm": 0.8588642477989197, + "learning_rate": 3.6099585062240667e-06, + "logits/chosen": 0.7619567513465881, + "logits/rejected": 3.8341445922851562, + "logps/chosen": -413.8572998046875, + "logps/rejected": -709.1582641601562, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.043118953704834, + "rewards/margins": 17.45237922668457, + "rewards/rejected": -24.495498657226562, + "step": 1689 + }, + { + "epoch": 1.0513219284603421, + "grad_norm": 0.015677358955144882, + "learning_rate": 3.608805901337022e-06, + "logits/chosen": -1.9144474267959595, + "logits/rejected": 4.085642337799072, + "logps/chosen": -452.252197265625, + "logps/rejected": -1200.351318359375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.386390686035156, + "rewards/margins": 35.35425567626953, + "rewards/rejected": -48.74065017700195, + "step": 1690 + }, + { + "epoch": 1.0519440124416797, + "grad_norm": 0.030642762780189514, + "learning_rate": 3.607653296449977e-06, + "logits/chosen": -3.1585073471069336, + "logits/rejected": -0.3589048981666565, + "logps/chosen": -465.46905517578125, + "logps/rejected": -877.997314453125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.650146484375, + "rewards/margins": 23.00937271118164, + "rewards/rejected": -32.65951919555664, + "step": 1691 + }, + { + "epoch": 1.052566096423017, + "grad_norm": 0.0374862477183342, + "learning_rate": 3.6065006915629324e-06, + "logits/chosen": -0.11048626899719238, + "logits/rejected": 3.808638095855713, + "logps/chosen": -430.2825622558594, + "logps/rejected": -957.116943359375, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.346963882446289, + "rewards/margins": 26.467803955078125, + "rewards/rejected": -33.81476974487305, + "step": 1692 + }, + { + "epoch": 1.0531881804043546, + "grad_norm": 0.0897771343588829, + "learning_rate": 3.6053480866758876e-06, + "logits/chosen": -0.038357075303792953, + "logits/rejected": 1.691274642944336, + "logps/chosen": -655.2930908203125, + "logps/rejected": -917.3342895507812, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.112953186035156, + "rewards/margins": 17.01007843017578, + "rewards/rejected": -27.123031616210938, + "step": 1693 + }, + { + "epoch": 1.0538102643856921, + "grad_norm": 0.006796684116125107, + "learning_rate": 3.6041954817888432e-06, + "logits/chosen": -0.4090842008590698, + "logits/rejected": 0.5765342712402344, + "logps/chosen": -350.09661865234375, + "logps/rejected": -703.912353515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.090215682983398, + "rewards/margins": 24.583942413330078, + "rewards/rejected": -33.674156188964844, + "step": 1694 + }, + { + "epoch": 1.0544323483670295, + "grad_norm": 2.254205355711747e-05, + "learning_rate": 3.6030428769017985e-06, + "logits/chosen": -1.0816428661346436, + "logits/rejected": 3.604936361312866, + "logps/chosen": -298.3710021972656, + "logps/rejected": -861.7861938476562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.984401702880859, + "rewards/margins": 27.764816284179688, + "rewards/rejected": -33.74921798706055, + "step": 1695 + }, + { + "epoch": 1.055054432348367, + "grad_norm": 9.45249485084787e-05, + "learning_rate": 3.6018902720147537e-06, + "logits/chosen": 0.5957896113395691, + "logits/rejected": 4.191173553466797, + "logps/chosen": -351.27838134765625, + "logps/rejected": -839.6342163085938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.228531837463379, + "rewards/margins": 27.103302001953125, + "rewards/rejected": -32.33183288574219, + "step": 1696 + }, + { + "epoch": 1.0556765163297046, + "grad_norm": 1.1835973262786865, + "learning_rate": 3.600737667127709e-06, + "logits/chosen": -0.7743030786514282, + "logits/rejected": 3.2317123413085938, + "logps/chosen": -473.1207275390625, + "logps/rejected": -919.996826171875, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.106958389282227, + "rewards/margins": 20.596567153930664, + "rewards/rejected": -30.70352554321289, + "step": 1697 + }, + { + "epoch": 1.056298600311042, + "grad_norm": 0.09530337899923325, + "learning_rate": 3.599585062240664e-06, + "logits/chosen": 0.988264799118042, + "logits/rejected": 4.399881839752197, + "logps/chosen": -569.2879028320312, + "logps/rejected": -964.9010009765625, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.983223915100098, + "rewards/margins": 26.619644165039062, + "rewards/rejected": -33.602867126464844, + "step": 1698 + }, + { + "epoch": 1.0569206842923795, + "grad_norm": 0.04010099917650223, + "learning_rate": 3.5984324573536193e-06, + "logits/chosen": 1.159104585647583, + "logits/rejected": 4.351685523986816, + "logps/chosen": -564.5264892578125, + "logps/rejected": -908.9020385742188, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.197347640991211, + "rewards/margins": 18.77018928527832, + "rewards/rejected": -30.96753692626953, + "step": 1699 + }, + { + "epoch": 1.0575427682737168, + "grad_norm": 0.00016707685426808894, + "learning_rate": 3.5972798524665746e-06, + "logits/chosen": -0.8789552450180054, + "logits/rejected": 2.5933098793029785, + "logps/chosen": -367.5953369140625, + "logps/rejected": -794.2536010742188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.060921669006348, + "rewards/margins": 25.133541107177734, + "rewards/rejected": -31.1944637298584, + "step": 1700 + }, + { + "epoch": 1.0581648522550544, + "grad_norm": 0.0005692985141649842, + "learning_rate": 3.5961272475795302e-06, + "logits/chosen": 1.1418954133987427, + "logits/rejected": 4.3281450271606445, + "logps/chosen": -504.8585205078125, + "logps/rejected": -1039.9560546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.611676216125488, + "rewards/margins": 33.13249206542969, + "rewards/rejected": -38.74416732788086, + "step": 1701 + }, + { + "epoch": 1.058786936236392, + "grad_norm": 25.801237106323242, + "learning_rate": 3.5949746426924854e-06, + "logits/chosen": 0.6158846616744995, + "logits/rejected": 3.2959022521972656, + "logps/chosen": -553.5155639648438, + "logps/rejected": -928.0843505859375, + "loss": 0.2626, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.362363815307617, + "rewards/margins": 24.19908905029297, + "rewards/rejected": -32.56145477294922, + "step": 1702 + }, + { + "epoch": 1.0594090202177293, + "grad_norm": 0.4599364101886749, + "learning_rate": 3.5938220378054407e-06, + "logits/chosen": -2.900252342224121, + "logits/rejected": 1.8551745414733887, + "logps/chosen": -292.502197265625, + "logps/rejected": -844.1199951171875, + "loss": 0.0069, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.056050777435303, + "rewards/margins": 27.006755828857422, + "rewards/rejected": -32.062808990478516, + "step": 1703 + }, + { + "epoch": 1.0600311041990669, + "grad_norm": 1.3544723515224177e-05, + "learning_rate": 3.592669432918396e-06, + "logits/chosen": 1.1340935230255127, + "logits/rejected": 4.3828887939453125, + "logps/chosen": -554.423828125, + "logps/rejected": -1023.113525390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.717758178710938, + "rewards/margins": 26.753938674926758, + "rewards/rejected": -38.47169876098633, + "step": 1704 + }, + { + "epoch": 1.0606531881804044, + "grad_norm": 0.0009403342264704406, + "learning_rate": 3.591516828031351e-06, + "logits/chosen": 3.1635279655456543, + "logits/rejected": 3.546613931655884, + "logps/chosen": -670.4608154296875, + "logps/rejected": -1008.8966064453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.73913288116455, + "rewards/margins": 24.537559509277344, + "rewards/rejected": -39.27669143676758, + "step": 1705 + }, + { + "epoch": 1.0612752721617418, + "grad_norm": 0.11844262480735779, + "learning_rate": 3.5903642231443063e-06, + "logits/chosen": -1.9967526197433472, + "logits/rejected": -0.313257098197937, + "logps/chosen": -472.6290588378906, + "logps/rejected": -765.5889892578125, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.75210189819336, + "rewards/margins": 18.136417388916016, + "rewards/rejected": -28.888519287109375, + "step": 1706 + }, + { + "epoch": 1.0618973561430793, + "grad_norm": 11.051610946655273, + "learning_rate": 3.5892116182572616e-06, + "logits/chosen": 2.81465744972229, + "logits/rejected": 3.082577705383301, + "logps/chosen": -748.4796142578125, + "logps/rejected": -1038.966064453125, + "loss": 0.0563, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.436058044433594, + "rewards/margins": 26.616634368896484, + "rewards/rejected": -38.05268859863281, + "step": 1707 + }, + { + "epoch": 1.062519440124417, + "grad_norm": 0.0004929989227093756, + "learning_rate": 3.588059013370217e-06, + "logits/chosen": -0.6941190958023071, + "logits/rejected": 4.581844329833984, + "logps/chosen": -417.68316650390625, + "logps/rejected": -1064.1141357421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.887452125549316, + "rewards/margins": 31.676464080810547, + "rewards/rejected": -39.56391906738281, + "step": 1708 + }, + { + "epoch": 1.0631415241057542, + "grad_norm": 0.06487241387367249, + "learning_rate": 3.5869064084831724e-06, + "logits/chosen": -1.3791570663452148, + "logits/rejected": 3.591864585876465, + "logps/chosen": -283.37664794921875, + "logps/rejected": -892.575927734375, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.048565864562988, + "rewards/margins": 31.034317016601562, + "rewards/rejected": -36.082881927490234, + "step": 1709 + }, + { + "epoch": 1.0637636080870918, + "grad_norm": 1.2701135347015224e-05, + "learning_rate": 3.5857538035961277e-06, + "logits/chosen": -3.2900099754333496, + "logits/rejected": 2.401799201965332, + "logps/chosen": -363.16302490234375, + "logps/rejected": -969.2822265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.9275922775268555, + "rewards/margins": 31.4039306640625, + "rewards/rejected": -37.33152389526367, + "step": 1710 + }, + { + "epoch": 1.0643856920684291, + "grad_norm": 1.2971514706805465e-06, + "learning_rate": 3.584601198709083e-06, + "logits/chosen": 0.3413795232772827, + "logits/rejected": 4.562448501586914, + "logps/chosen": -470.3218994140625, + "logps/rejected": -1056.247314453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.446603775024414, + "rewards/margins": 33.22271728515625, + "rewards/rejected": -41.66931915283203, + "step": 1711 + }, + { + "epoch": 1.0650077760497667, + "grad_norm": 0.004861316177994013, + "learning_rate": 3.583448593822038e-06, + "logits/chosen": 0.6895210146903992, + "logits/rejected": 3.1214401721954346, + "logps/chosen": -584.0718383789062, + "logps/rejected": -966.9913940429688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.424615859985352, + "rewards/margins": 28.374963760375977, + "rewards/rejected": -38.799583435058594, + "step": 1712 + }, + { + "epoch": 1.0656298600311043, + "grad_norm": 1.0662657022476196, + "learning_rate": 3.5822959889349933e-06, + "logits/chosen": 0.13189710676670074, + "logits/rejected": 3.2180283069610596, + "logps/chosen": -615.065185546875, + "logps/rejected": -1035.4063720703125, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.89202880859375, + "rewards/margins": 25.611513137817383, + "rewards/rejected": -37.503543853759766, + "step": 1713 + }, + { + "epoch": 1.0662519440124416, + "grad_norm": 0.06123606860637665, + "learning_rate": 3.5811433840479486e-06, + "logits/chosen": 0.0728345513343811, + "logits/rejected": 2.7082386016845703, + "logps/chosen": -618.5989990234375, + "logps/rejected": -1061.659423828125, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.811964988708496, + "rewards/margins": 26.231304168701172, + "rewards/rejected": -34.043270111083984, + "step": 1714 + }, + { + "epoch": 1.0668740279937792, + "grad_norm": 0.47655490040779114, + "learning_rate": 3.579990779160904e-06, + "logits/chosen": -0.4628029465675354, + "logits/rejected": 3.919542074203491, + "logps/chosen": -585.7105712890625, + "logps/rejected": -1029.9808349609375, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.386682510375977, + "rewards/margins": 21.445314407348633, + "rewards/rejected": -34.83199691772461, + "step": 1715 + }, + { + "epoch": 1.0674961119751167, + "grad_norm": 0.0005064127617515624, + "learning_rate": 3.5788381742738594e-06, + "logits/chosen": -0.7007519006729126, + "logits/rejected": 3.573007583618164, + "logps/chosen": -193.0349578857422, + "logps/rejected": -660.3382568359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.492222547531128, + "rewards/margins": 21.314695358276367, + "rewards/rejected": -24.806917190551758, + "step": 1716 + }, + { + "epoch": 1.068118195956454, + "grad_norm": 0.0011604634346440434, + "learning_rate": 3.5776855693868147e-06, + "logits/chosen": -0.47038382291793823, + "logits/rejected": 3.3902623653411865, + "logps/chosen": -441.37841796875, + "logps/rejected": -1046.822265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.868171691894531, + "rewards/margins": 31.64148712158203, + "rewards/rejected": -39.50965881347656, + "step": 1717 + }, + { + "epoch": 1.0687402799377916, + "grad_norm": 2.1240060329437256, + "learning_rate": 3.57653296449977e-06, + "logits/chosen": 0.4144650101661682, + "logits/rejected": 3.6955389976501465, + "logps/chosen": -497.20538330078125, + "logps/rejected": -924.2775268554688, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.828954696655273, + "rewards/margins": 21.07897186279297, + "rewards/rejected": -30.90792465209961, + "step": 1718 + }, + { + "epoch": 1.069362363919129, + "grad_norm": 47.840118408203125, + "learning_rate": 3.575380359612725e-06, + "logits/chosen": 1.5221973657608032, + "logits/rejected": 4.614435195922852, + "logps/chosen": -642.9663696289062, + "logps/rejected": -1006.8882446289062, + "loss": 1.9259, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.217392921447754, + "rewards/margins": 26.158397674560547, + "rewards/rejected": -36.375789642333984, + "step": 1719 + }, + { + "epoch": 1.0699844479004665, + "grad_norm": 0.0057447003200650215, + "learning_rate": 3.5742277547256803e-06, + "logits/chosen": -1.6053816080093384, + "logits/rejected": 3.832913398742676, + "logps/chosen": -497.01678466796875, + "logps/rejected": -1073.96142578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.212861061096191, + "rewards/margins": 27.787372589111328, + "rewards/rejected": -38.00023651123047, + "step": 1720 + }, + { + "epoch": 1.070606531881804, + "grad_norm": 0.00011915850336663425, + "learning_rate": 3.5730751498386356e-06, + "logits/chosen": 2.358245611190796, + "logits/rejected": 3.673961877822876, + "logps/chosen": -703.9837646484375, + "logps/rejected": -1114.497802734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.243408203125, + "rewards/margins": 32.560333251953125, + "rewards/rejected": -43.803741455078125, + "step": 1721 + }, + { + "epoch": 1.0712286158631414, + "grad_norm": 0.002361687133088708, + "learning_rate": 3.571922544951591e-06, + "logits/chosen": 0.057686299085617065, + "logits/rejected": 4.59140682220459, + "logps/chosen": -405.74224853515625, + "logps/rejected": -1008.8343505859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.059910774230957, + "rewards/margins": 30.22999382019043, + "rewards/rejected": -37.28990173339844, + "step": 1722 + }, + { + "epoch": 1.071850699844479, + "grad_norm": 0.14930595457553864, + "learning_rate": 3.5707699400645464e-06, + "logits/chosen": 0.9290033578872681, + "logits/rejected": 3.07366943359375, + "logps/chosen": -557.7803955078125, + "logps/rejected": -876.728759765625, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.407896041870117, + "rewards/margins": 18.68692398071289, + "rewards/rejected": -27.09482192993164, + "step": 1723 + }, + { + "epoch": 1.0724727838258166, + "grad_norm": 1.5788064899879828e-07, + "learning_rate": 3.5696173351775017e-06, + "logits/chosen": 3.5497095584869385, + "logits/rejected": 4.858210563659668, + "logps/chosen": -683.814208984375, + "logps/rejected": -1127.4013671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.025189399719238, + "rewards/margins": 35.34825134277344, + "rewards/rejected": -47.37343978881836, + "step": 1724 + }, + { + "epoch": 1.073094867807154, + "grad_norm": 5.755589008331299, + "learning_rate": 3.568464730290457e-06, + "logits/chosen": 1.5815749168395996, + "logits/rejected": 2.7180745601654053, + "logps/chosen": -666.0305786132812, + "logps/rejected": -947.807373046875, + "loss": 0.0198, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.947914123535156, + "rewards/margins": 19.84267234802246, + "rewards/rejected": -31.790584564208984, + "step": 1725 + }, + { + "epoch": 1.0737169517884915, + "grad_norm": 0.0012952083488926291, + "learning_rate": 3.567312125403412e-06, + "logits/chosen": -1.330794095993042, + "logits/rejected": 4.020459175109863, + "logps/chosen": -424.281494140625, + "logps/rejected": -1116.1873779296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.352400779724121, + "rewards/margins": 33.686038970947266, + "rewards/rejected": -41.03843688964844, + "step": 1726 + }, + { + "epoch": 1.074339035769829, + "grad_norm": 0.009987055324018002, + "learning_rate": 3.5661595205163673e-06, + "logits/chosen": 1.846252202987671, + "logits/rejected": 3.3140053749084473, + "logps/chosen": -703.4176025390625, + "logps/rejected": -1091.683837890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.896821022033691, + "rewards/margins": 30.977752685546875, + "rewards/rejected": -42.87457275390625, + "step": 1727 + }, + { + "epoch": 1.0749611197511664, + "grad_norm": 0.7924360632896423, + "learning_rate": 3.5650069156293226e-06, + "logits/chosen": -0.5128199458122253, + "logits/rejected": 2.4827842712402344, + "logps/chosen": -590.5065307617188, + "logps/rejected": -997.1490478515625, + "loss": 0.0128, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.042410850524902, + "rewards/margins": 26.186431884765625, + "rewards/rejected": -35.228843688964844, + "step": 1728 + }, + { + "epoch": 1.075583203732504, + "grad_norm": 0.13733063638210297, + "learning_rate": 3.5638543107422778e-06, + "logits/chosen": 2.4186601638793945, + "logits/rejected": 3.949766159057617, + "logps/chosen": -703.2379760742188, + "logps/rejected": -1119.8372802734375, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.43460464477539, + "rewards/margins": 30.021484375, + "rewards/rejected": -40.45608901977539, + "step": 1729 + }, + { + "epoch": 1.0762052877138413, + "grad_norm": 2.674889628906385e-06, + "learning_rate": 3.5627017058552334e-06, + "logits/chosen": 1.08864426612854, + "logits/rejected": 3.4560294151306152, + "logps/chosen": -562.939208984375, + "logps/rejected": -1042.669677734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.059733390808105, + "rewards/margins": 28.375022888183594, + "rewards/rejected": -36.434757232666016, + "step": 1730 + }, + { + "epoch": 1.0768273716951788, + "grad_norm": 0.05376862734556198, + "learning_rate": 3.5615491009681887e-06, + "logits/chosen": 1.5392053127288818, + "logits/rejected": 4.284621238708496, + "logps/chosen": -443.6164855957031, + "logps/rejected": -851.0294799804688, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.83315372467041, + "rewards/margins": 21.720170974731445, + "rewards/rejected": -28.553325653076172, + "step": 1731 + }, + { + "epoch": 1.0774494556765164, + "grad_norm": 7.555571210104972e-05, + "learning_rate": 3.560396496081144e-06, + "logits/chosen": -1.2423949241638184, + "logits/rejected": 2.552158832550049, + "logps/chosen": -444.58331298828125, + "logps/rejected": -878.7606201171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.577056407928467, + "rewards/margins": 29.53194808959961, + "rewards/rejected": -37.109004974365234, + "step": 1732 + }, + { + "epoch": 1.0780715396578537, + "grad_norm": 0.08181095123291016, + "learning_rate": 3.559243891194099e-06, + "logits/chosen": 1.7129337787628174, + "logits/rejected": 3.8680801391601562, + "logps/chosen": -610.336669921875, + "logps/rejected": -952.7198486328125, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.26831579208374, + "rewards/margins": 18.939006805419922, + "rewards/rejected": -26.20732307434082, + "step": 1733 + }, + { + "epoch": 1.0786936236391913, + "grad_norm": 0.15843388438224792, + "learning_rate": 3.5580912863070543e-06, + "logits/chosen": 1.7182284593582153, + "logits/rejected": 3.604914903640747, + "logps/chosen": -526.2333984375, + "logps/rejected": -879.4224853515625, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.743529319763184, + "rewards/margins": 24.613826751708984, + "rewards/rejected": -33.357357025146484, + "step": 1734 + }, + { + "epoch": 1.0793157076205289, + "grad_norm": 0.1478980928659439, + "learning_rate": 3.5569386814200096e-06, + "logits/chosen": -3.3239331245422363, + "logits/rejected": 2.918520450592041, + "logps/chosen": -378.73895263671875, + "logps/rejected": -867.5570068359375, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.589836120605469, + "rewards/margins": 21.376054763793945, + "rewards/rejected": -29.96588897705078, + "step": 1735 + }, + { + "epoch": 1.0799377916018662, + "grad_norm": 39.71168518066406, + "learning_rate": 3.5557860765329648e-06, + "logits/chosen": -0.2881968021392822, + "logits/rejected": 2.109957695007324, + "logps/chosen": -569.751220703125, + "logps/rejected": -1004.1986083984375, + "loss": 0.5705, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.916030883789062, + "rewards/margins": 23.231706619262695, + "rewards/rejected": -35.14773941040039, + "step": 1736 + }, + { + "epoch": 1.0805598755832038, + "grad_norm": 37.14696502685547, + "learning_rate": 3.55463347164592e-06, + "logits/chosen": -0.9090088605880737, + "logits/rejected": 3.865264415740967, + "logps/chosen": -479.4161682128906, + "logps/rejected": -1054.5419921875, + "loss": 0.7049, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.393864631652832, + "rewards/margins": 24.767200469970703, + "rewards/rejected": -33.16106414794922, + "step": 1737 + }, + { + "epoch": 1.081181959564541, + "grad_norm": 0.012204733677208424, + "learning_rate": 3.5534808667588757e-06, + "logits/chosen": 1.1127996444702148, + "logits/rejected": 3.1084322929382324, + "logps/chosen": -617.5895385742188, + "logps/rejected": -993.505859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.355681419372559, + "rewards/margins": 26.18058204650879, + "rewards/rejected": -34.53626251220703, + "step": 1738 + }, + { + "epoch": 1.0818040435458787, + "grad_norm": 1.4145355635264423e-05, + "learning_rate": 3.552328261871831e-06, + "logits/chosen": 0.434572696685791, + "logits/rejected": 2.9870080947875977, + "logps/chosen": -521.6600341796875, + "logps/rejected": -987.229248046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.352119445800781, + "rewards/margins": 28.358474731445312, + "rewards/rejected": -35.710594177246094, + "step": 1739 + }, + { + "epoch": 1.0824261275272162, + "grad_norm": 0.00888131745159626, + "learning_rate": 3.551175656984786e-06, + "logits/chosen": -1.785019874572754, + "logits/rejected": 3.1531331539154053, + "logps/chosen": -370.5480651855469, + "logps/rejected": -925.7297973632812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.955384731292725, + "rewards/margins": 20.881046295166016, + "rewards/rejected": -25.8364315032959, + "step": 1740 + }, + { + "epoch": 1.0830482115085536, + "grad_norm": 52.913543701171875, + "learning_rate": 3.5500230520977413e-06, + "logits/chosen": 0.27387386560440063, + "logits/rejected": 2.122863292694092, + "logps/chosen": -518.5341186523438, + "logps/rejected": -820.9344482421875, + "loss": 1.1936, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.488155364990234, + "rewards/margins": 17.69202423095703, + "rewards/rejected": -26.180179595947266, + "step": 1741 + }, + { + "epoch": 1.0836702954898911, + "grad_norm": 0.0007935499306768179, + "learning_rate": 3.5488704472106966e-06, + "logits/chosen": 0.46489566564559937, + "logits/rejected": 2.1978495121002197, + "logps/chosen": -661.927734375, + "logps/rejected": -1023.1362915039062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.307547569274902, + "rewards/margins": 29.69561004638672, + "rewards/rejected": -39.00315475463867, + "step": 1742 + }, + { + "epoch": 1.0842923794712287, + "grad_norm": 0.004735798109322786, + "learning_rate": 3.5477178423236518e-06, + "logits/chosen": -0.2975190281867981, + "logits/rejected": 4.6708784103393555, + "logps/chosen": -363.0914306640625, + "logps/rejected": -1009.577880859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.831550121307373, + "rewards/margins": 28.918603897094727, + "rewards/rejected": -32.750152587890625, + "step": 1743 + }, + { + "epoch": 1.084914463452566, + "grad_norm": 2.3199174404144287, + "learning_rate": 3.546565237436607e-06, + "logits/chosen": 1.5427138805389404, + "logits/rejected": 4.5314202308654785, + "logps/chosen": -506.8985900878906, + "logps/rejected": -889.401611328125, + "loss": 0.0098, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.435576915740967, + "rewards/margins": 23.069461822509766, + "rewards/rejected": -30.505037307739258, + "step": 1744 + }, + { + "epoch": 1.0855365474339036, + "grad_norm": 25.25603485107422, + "learning_rate": 3.5454126325495627e-06, + "logits/chosen": 1.1176072359085083, + "logits/rejected": 2.8879714012145996, + "logps/chosen": -601.0845336914062, + "logps/rejected": -935.34716796875, + "loss": 0.1484, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.336665153503418, + "rewards/margins": 25.64136505126953, + "rewards/rejected": -29.978031158447266, + "step": 1745 + }, + { + "epoch": 1.0861586314152412, + "grad_norm": 0.0002335595345357433, + "learning_rate": 3.544260027662518e-06, + "logits/chosen": -0.33439940214157104, + "logits/rejected": 3.145282745361328, + "logps/chosen": -417.33099365234375, + "logps/rejected": -870.7086791992188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.937946319580078, + "rewards/margins": 25.857038497924805, + "rewards/rejected": -29.794986724853516, + "step": 1746 + }, + { + "epoch": 1.0867807153965785, + "grad_norm": 37.67134094238281, + "learning_rate": 3.543107422775473e-06, + "logits/chosen": 2.3051233291625977, + "logits/rejected": 5.67755126953125, + "logps/chosen": -426.3350830078125, + "logps/rejected": -745.7735595703125, + "loss": 1.4902, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.041298866271973, + "rewards/margins": 14.599910736083984, + "rewards/rejected": -19.641210556030273, + "step": 1747 + }, + { + "epoch": 1.087402799377916, + "grad_norm": 27.38837432861328, + "learning_rate": 3.5419548178884283e-06, + "logits/chosen": 1.921449065208435, + "logits/rejected": 3.4902946949005127, + "logps/chosen": -623.1732788085938, + "logps/rejected": -968.2666015625, + "loss": 0.156, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.072044372558594, + "rewards/margins": 21.721195220947266, + "rewards/rejected": -30.793241500854492, + "step": 1748 + }, + { + "epoch": 1.0880248833592534, + "grad_norm": 0.005325262900441885, + "learning_rate": 3.5408022130013836e-06, + "logits/chosen": -0.21698921918869019, + "logits/rejected": 1.6183581352233887, + "logps/chosen": -479.52520751953125, + "logps/rejected": -765.736083984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.421666145324707, + "rewards/margins": 21.242387771606445, + "rewards/rejected": -25.664051055908203, + "step": 1749 + }, + { + "epoch": 1.088646967340591, + "grad_norm": 29.812423706054688, + "learning_rate": 3.5396496081143388e-06, + "logits/chosen": 2.655641555786133, + "logits/rejected": 3.9962892532348633, + "logps/chosen": -595.0345458984375, + "logps/rejected": -831.300537109375, + "loss": 0.324, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.994194030761719, + "rewards/margins": 15.970343589782715, + "rewards/rejected": -24.96453857421875, + "step": 1750 + }, + { + "epoch": 1.0892690513219285, + "grad_norm": 0.00018733121396508068, + "learning_rate": 3.538497003227294e-06, + "logits/chosen": 2.5199174880981445, + "logits/rejected": 4.3708176612854, + "logps/chosen": -560.2379760742188, + "logps/rejected": -865.1304931640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.548035621643066, + "rewards/margins": 25.743770599365234, + "rewards/rejected": -30.291807174682617, + "step": 1751 + }, + { + "epoch": 1.0898911353032659, + "grad_norm": 0.00782470591366291, + "learning_rate": 3.5373443983402496e-06, + "logits/chosen": -0.2565594017505646, + "logits/rejected": 1.5670526027679443, + "logps/chosen": -402.8641662597656, + "logps/rejected": -719.1917724609375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.71183967590332, + "rewards/margins": 18.037317276000977, + "rewards/rejected": -22.749156951904297, + "step": 1752 + }, + { + "epoch": 1.0905132192846034, + "grad_norm": 6.2403564453125, + "learning_rate": 3.536191793453205e-06, + "logits/chosen": 1.9860265254974365, + "logits/rejected": 3.790701389312744, + "logps/chosen": -570.9795532226562, + "logps/rejected": -994.5543212890625, + "loss": 0.0267, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.19058895111084, + "rewards/margins": 28.27306365966797, + "rewards/rejected": -34.463653564453125, + "step": 1753 + }, + { + "epoch": 1.091135303265941, + "grad_norm": 0.0002669897803571075, + "learning_rate": 3.53503918856616e-06, + "logits/chosen": -1.019551396369934, + "logits/rejected": 3.5526912212371826, + "logps/chosen": -352.00616455078125, + "logps/rejected": -853.19921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6608293056488037, + "rewards/margins": 24.306198120117188, + "rewards/rejected": -27.967025756835938, + "step": 1754 + }, + { + "epoch": 1.0917573872472783, + "grad_norm": 0.5336105227470398, + "learning_rate": 3.5338865836791153e-06, + "logits/chosen": 0.23992976546287537, + "logits/rejected": 4.342520713806152, + "logps/chosen": -516.0511474609375, + "logps/rejected": -1050.3311767578125, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.2335124015808105, + "rewards/margins": 25.32655906677246, + "rewards/rejected": -29.560073852539062, + "step": 1755 + }, + { + "epoch": 1.092379471228616, + "grad_norm": 21.57744026184082, + "learning_rate": 3.5327339787920705e-06, + "logits/chosen": -4.463540554046631, + "logits/rejected": 0.9967468976974487, + "logps/chosen": -313.4986267089844, + "logps/rejected": -840.9525146484375, + "loss": 0.1341, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.787725448608398, + "rewards/margins": 21.620792388916016, + "rewards/rejected": -27.408519744873047, + "step": 1756 + }, + { + "epoch": 1.0930015552099532, + "grad_norm": 0.03455764427781105, + "learning_rate": 3.5315813739050258e-06, + "logits/chosen": -0.07754494249820709, + "logits/rejected": 2.7861900329589844, + "logps/chosen": -501.96295166015625, + "logps/rejected": -887.67822265625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.042675971984863, + "rewards/margins": 20.873672485351562, + "rewards/rejected": -28.91634750366211, + "step": 1757 + }, + { + "epoch": 1.0936236391912908, + "grad_norm": 0.006460077594965696, + "learning_rate": 3.530428769017981e-06, + "logits/chosen": -0.666114330291748, + "logits/rejected": 3.3996047973632812, + "logps/chosen": -448.8841247558594, + "logps/rejected": -914.3242797851562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.196425914764404, + "rewards/margins": 28.60602378845215, + "rewards/rejected": -33.802452087402344, + "step": 1758 + }, + { + "epoch": 1.0942457231726284, + "grad_norm": 0.28905969858169556, + "learning_rate": 3.5292761641309362e-06, + "logits/chosen": 2.0526375770568848, + "logits/rejected": 4.210762977600098, + "logps/chosen": -552.855712890625, + "logps/rejected": -991.937255859375, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.942243576049805, + "rewards/margins": 27.496761322021484, + "rewards/rejected": -34.439002990722656, + "step": 1759 + }, + { + "epoch": 1.0948678071539657, + "grad_norm": 3.9752464294433594, + "learning_rate": 3.528123559243892e-06, + "logits/chosen": 0.26489585638046265, + "logits/rejected": 4.406553745269775, + "logps/chosen": -454.697021484375, + "logps/rejected": -943.942138671875, + "loss": 0.0217, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.372507095336914, + "rewards/margins": 25.299306869506836, + "rewards/rejected": -28.67181396484375, + "step": 1760 + }, + { + "epoch": 1.0954898911353033, + "grad_norm": 0.10441815853118896, + "learning_rate": 3.526970954356847e-06, + "logits/chosen": -0.5542750954627991, + "logits/rejected": 1.8777621984481812, + "logps/chosen": -410.62689208984375, + "logps/rejected": -739.4208984375, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8234503269195557, + "rewards/margins": 20.886167526245117, + "rewards/rejected": -24.709617614746094, + "step": 1761 + }, + { + "epoch": 1.0961119751166408, + "grad_norm": 0.13274742662906647, + "learning_rate": 3.5258183494698023e-06, + "logits/chosen": 0.48963046073913574, + "logits/rejected": 4.257356643676758, + "logps/chosen": -567.223388671875, + "logps/rejected": -1009.8548583984375, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.456854343414307, + "rewards/margins": 22.50156593322754, + "rewards/rejected": -29.95842170715332, + "step": 1762 + }, + { + "epoch": 1.0967340590979782, + "grad_norm": 3.6613287193176802e-06, + "learning_rate": 3.5246657445827575e-06, + "logits/chosen": 2.103224754333496, + "logits/rejected": 3.8015027046203613, + "logps/chosen": -587.1085205078125, + "logps/rejected": -925.3865966796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.701694965362549, + "rewards/margins": 27.56255340576172, + "rewards/rejected": -32.26424789428711, + "step": 1763 + }, + { + "epoch": 1.0973561430793157, + "grad_norm": 0.017274130135774612, + "learning_rate": 3.5235131396957128e-06, + "logits/chosen": 0.8076609373092651, + "logits/rejected": 4.674169063568115, + "logps/chosen": -450.8760986328125, + "logps/rejected": -957.8709716796875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.7160773277282715, + "rewards/margins": 27.142183303833008, + "rewards/rejected": -34.85826110839844, + "step": 1764 + }, + { + "epoch": 1.0979782270606533, + "grad_norm": 0.0009655518224462867, + "learning_rate": 3.522360534808668e-06, + "logits/chosen": 0.0774463415145874, + "logits/rejected": 2.751673460006714, + "logps/chosen": -576.5747680664062, + "logps/rejected": -1003.846923828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.677048683166504, + "rewards/margins": 28.99536895751953, + "rewards/rejected": -38.67241668701172, + "step": 1765 + }, + { + "epoch": 1.0986003110419906, + "grad_norm": 0.015237356536090374, + "learning_rate": 3.5212079299216232e-06, + "logits/chosen": -0.023343242704868317, + "logits/rejected": 3.3576998710632324, + "logps/chosen": -518.5059204101562, + "logps/rejected": -920.527099609375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.050875663757324, + "rewards/margins": 23.865657806396484, + "rewards/rejected": -29.91653060913086, + "step": 1766 + }, + { + "epoch": 1.0992223950233282, + "grad_norm": 33.6476936340332, + "learning_rate": 3.520055325034579e-06, + "logits/chosen": -0.05070209503173828, + "logits/rejected": 4.106517791748047, + "logps/chosen": -587.09130859375, + "logps/rejected": -1130.2176513671875, + "loss": 0.4541, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.864964485168457, + "rewards/margins": 25.942523956298828, + "rewards/rejected": -33.80748748779297, + "step": 1767 + }, + { + "epoch": 1.0998444790046655, + "grad_norm": 8.172510570148006e-05, + "learning_rate": 3.518902720147534e-06, + "logits/chosen": 1.3177427053451538, + "logits/rejected": 2.9541220664978027, + "logps/chosen": -635.0423583984375, + "logps/rejected": -1084.8924560546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.324959754943848, + "rewards/margins": 28.07250213623047, + "rewards/rejected": -40.3974609375, + "step": 1768 + }, + { + "epoch": 1.100466562986003, + "grad_norm": 0.029781443998217583, + "learning_rate": 3.5177501152604893e-06, + "logits/chosen": -0.6049349904060364, + "logits/rejected": 3.1094350814819336, + "logps/chosen": -471.39276123046875, + "logps/rejected": -894.5453491210938, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.47968864440918, + "rewards/margins": 23.859102249145508, + "rewards/rejected": -30.338787078857422, + "step": 1769 + }, + { + "epoch": 1.1010886469673407, + "grad_norm": 4.44408655166626, + "learning_rate": 3.5165975103734445e-06, + "logits/chosen": 0.391635537147522, + "logits/rejected": 2.7561874389648438, + "logps/chosen": -515.3228149414062, + "logps/rejected": -884.0820922851562, + "loss": 0.0179, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.186380386352539, + "rewards/margins": 16.5716495513916, + "rewards/rejected": -24.758028030395508, + "step": 1770 + }, + { + "epoch": 1.101710730948678, + "grad_norm": 0.00016401683387812227, + "learning_rate": 3.5154449054863993e-06, + "logits/chosen": 0.2484072893857956, + "logits/rejected": 3.562471628189087, + "logps/chosen": -561.8585815429688, + "logps/rejected": -1126.3258056640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.577777862548828, + "rewards/margins": 31.752361297607422, + "rewards/rejected": -42.33013916015625, + "step": 1771 + }, + { + "epoch": 1.1023328149300156, + "grad_norm": 6.723960268573137e-06, + "learning_rate": 3.5142923005993546e-06, + "logits/chosen": -0.14800924062728882, + "logits/rejected": 4.816242218017578, + "logps/chosen": -402.42181396484375, + "logps/rejected": -910.867431640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.692221641540527, + "rewards/margins": 23.456363677978516, + "rewards/rejected": -31.14858627319336, + "step": 1772 + }, + { + "epoch": 1.1029548989113531, + "grad_norm": 8.856857311911881e-05, + "learning_rate": 3.51313969571231e-06, + "logits/chosen": 0.9645895957946777, + "logits/rejected": 3.0167455673217773, + "logps/chosen": -675.44091796875, + "logps/rejected": -964.310791015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.163752555847168, + "rewards/margins": 26.749027252197266, + "rewards/rejected": -34.91278076171875, + "step": 1773 + }, + { + "epoch": 1.1035769828926905, + "grad_norm": 25.04258918762207, + "learning_rate": 3.511987090825265e-06, + "logits/chosen": -0.3559788465499878, + "logits/rejected": 2.854271411895752, + "logps/chosen": -573.7998657226562, + "logps/rejected": -945.9089965820312, + "loss": 0.311, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.527469635009766, + "rewards/margins": 23.415010452270508, + "rewards/rejected": -34.942481994628906, + "step": 1774 + }, + { + "epoch": 1.104199066874028, + "grad_norm": 0.0004723109886981547, + "learning_rate": 3.5108344859382202e-06, + "logits/chosen": 0.11427342891693115, + "logits/rejected": 2.153625011444092, + "logps/chosen": -475.01019287109375, + "logps/rejected": -929.59228515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.516587257385254, + "rewards/margins": 28.553447723388672, + "rewards/rejected": -34.07003402709961, + "step": 1775 + }, + { + "epoch": 1.1048211508553654, + "grad_norm": 27.63475227355957, + "learning_rate": 3.509681881051176e-06, + "logits/chosen": 3.1944663524627686, + "logits/rejected": 3.0088977813720703, + "logps/chosen": -710.120849609375, + "logps/rejected": -900.2386474609375, + "loss": 0.3799, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.0547027587890625, + "rewards/margins": 22.974761962890625, + "rewards/rejected": -30.029464721679688, + "step": 1776 + }, + { + "epoch": 1.105443234836703, + "grad_norm": 26.656400680541992, + "learning_rate": 3.508529276164131e-06, + "logits/chosen": 1.5964024066925049, + "logits/rejected": 3.5252232551574707, + "logps/chosen": -550.2457885742188, + "logps/rejected": -974.29296875, + "loss": 0.2396, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.295560836791992, + "rewards/margins": 23.832962036132812, + "rewards/rejected": -32.12852096557617, + "step": 1777 + }, + { + "epoch": 1.1060653188180405, + "grad_norm": 0.04470802843570709, + "learning_rate": 3.5073766712770863e-06, + "logits/chosen": 1.9093613624572754, + "logits/rejected": 3.0826358795166016, + "logps/chosen": -693.091552734375, + "logps/rejected": -931.0435180664062, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.754042625427246, + "rewards/margins": 23.099082946777344, + "rewards/rejected": -34.853126525878906, + "step": 1778 + }, + { + "epoch": 1.1066874027993778, + "grad_norm": 0.3163846433162689, + "learning_rate": 3.5062240663900416e-06, + "logits/chosen": -1.5664516687393188, + "logits/rejected": 2.738992214202881, + "logps/chosen": -354.3194885253906, + "logps/rejected": -797.6347045898438, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3946616649627686, + "rewards/margins": 20.815555572509766, + "rewards/rejected": -24.21021842956543, + "step": 1779 + }, + { + "epoch": 1.1073094867807154, + "grad_norm": 32.263362884521484, + "learning_rate": 3.505071461502997e-06, + "logits/chosen": 0.2788873314857483, + "logits/rejected": 3.975221633911133, + "logps/chosen": -567.035888671875, + "logps/rejected": -931.5633544921875, + "loss": 0.5395, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.741927146911621, + "rewards/margins": 21.19263458251953, + "rewards/rejected": -28.93456268310547, + "step": 1780 + }, + { + "epoch": 1.107931570762053, + "grad_norm": 0.0009872695663943887, + "learning_rate": 3.503918856615952e-06, + "logits/chosen": 0.8488771915435791, + "logits/rejected": 1.7789608240127563, + "logps/chosen": -445.4822692871094, + "logps/rejected": -737.8762817382812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.745743274688721, + "rewards/margins": 21.692352294921875, + "rewards/rejected": -27.438098907470703, + "step": 1781 + }, + { + "epoch": 1.1085536547433903, + "grad_norm": 0.0002924785949289799, + "learning_rate": 3.5027662517289072e-06, + "logits/chosen": 0.3400723934173584, + "logits/rejected": 3.4457757472991943, + "logps/chosen": -407.4837646484375, + "logps/rejected": -932.68603515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.976365089416504, + "rewards/margins": 27.15723419189453, + "rewards/rejected": -32.13359832763672, + "step": 1782 + }, + { + "epoch": 1.1091757387247279, + "grad_norm": 1.0559955626376905e-05, + "learning_rate": 3.5016136468418625e-06, + "logits/chosen": -1.7059123516082764, + "logits/rejected": 3.296755790710449, + "logps/chosen": -360.22650146484375, + "logps/rejected": -902.837158203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.966699123382568, + "rewards/margins": 26.81707000732422, + "rewards/rejected": -34.78376770019531, + "step": 1783 + }, + { + "epoch": 1.1097978227060654, + "grad_norm": 5.097897529602051, + "learning_rate": 3.500461041954818e-06, + "logits/chosen": 1.7751491069793701, + "logits/rejected": 3.033304452896118, + "logps/chosen": -599.8610229492188, + "logps/rejected": -737.4515380859375, + "loss": 0.0582, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.404728412628174, + "rewards/margins": 13.427257537841797, + "rewards/rejected": -20.831985473632812, + "step": 1784 + }, + { + "epoch": 1.1104199066874028, + "grad_norm": 0.36752787232398987, + "learning_rate": 3.4993084370677733e-06, + "logits/chosen": 0.8674039244651794, + "logits/rejected": 3.6712758541107178, + "logps/chosen": -614.0510864257812, + "logps/rejected": -977.8643798828125, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.517800331115723, + "rewards/margins": 21.28858757019043, + "rewards/rejected": -30.80638885498047, + "step": 1785 + }, + { + "epoch": 1.1110419906687403, + "grad_norm": 7.078679686856049e-07, + "learning_rate": 3.4981558321807286e-06, + "logits/chosen": 1.386345386505127, + "logits/rejected": 3.601700782775879, + "logps/chosen": -534.5748291015625, + "logps/rejected": -879.3649291992188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.662722587585449, + "rewards/margins": 22.788742065429688, + "rewards/rejected": -27.45146369934082, + "step": 1786 + }, + { + "epoch": 1.1116640746500777, + "grad_norm": 0.01572837121784687, + "learning_rate": 3.4970032272936838e-06, + "logits/chosen": 1.7985820770263672, + "logits/rejected": 2.970552682876587, + "logps/chosen": -631.8236694335938, + "logps/rejected": -1082.2847900390625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.598766803741455, + "rewards/margins": 28.447805404663086, + "rewards/rejected": -36.04656982421875, + "step": 1787 + }, + { + "epoch": 1.1122861586314152, + "grad_norm": 0.0024721594527363777, + "learning_rate": 3.495850622406639e-06, + "logits/chosen": -1.2049006223678589, + "logits/rejected": 3.378148317337036, + "logps/chosen": -365.1925964355469, + "logps/rejected": -871.2085571289062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.7494306564331055, + "rewards/margins": 27.50840187072754, + "rewards/rejected": -32.25783157348633, + "step": 1788 + }, + { + "epoch": 1.1129082426127528, + "grad_norm": 0.0008935022633522749, + "learning_rate": 3.4946980175195942e-06, + "logits/chosen": -1.4913283586502075, + "logits/rejected": 3.5099782943725586, + "logps/chosen": -498.25164794921875, + "logps/rejected": -1032.5533447265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7877275943756104, + "rewards/margins": 30.057601928710938, + "rewards/rejected": -31.84532928466797, + "step": 1789 + }, + { + "epoch": 1.1135303265940901, + "grad_norm": 0.45027968287467957, + "learning_rate": 3.4935454126325495e-06, + "logits/chosen": 0.22388708591461182, + "logits/rejected": 4.521572113037109, + "logps/chosen": -420.34503173828125, + "logps/rejected": -930.3162841796875, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.024784088134766, + "rewards/margins": 24.955102920532227, + "rewards/rejected": -30.97988510131836, + "step": 1790 + }, + { + "epoch": 1.1141524105754277, + "grad_norm": 16.772062301635742, + "learning_rate": 3.492392807745505e-06, + "logits/chosen": 1.6867791414260864, + "logits/rejected": 2.316291332244873, + "logps/chosen": -534.1275024414062, + "logps/rejected": -781.1571044921875, + "loss": 0.0837, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.479506492614746, + "rewards/margins": 22.093637466430664, + "rewards/rejected": -31.573143005371094, + "step": 1791 + }, + { + "epoch": 1.1147744945567652, + "grad_norm": 2.7089431285858154, + "learning_rate": 3.4912402028584603e-06, + "logits/chosen": -1.008533000946045, + "logits/rejected": 2.780308246612549, + "logps/chosen": -537.2943725585938, + "logps/rejected": -1070.39501953125, + "loss": 0.0119, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.919048309326172, + "rewards/margins": 26.721981048583984, + "rewards/rejected": -35.641029357910156, + "step": 1792 + }, + { + "epoch": 1.1153965785381026, + "grad_norm": 0.09999026358127594, + "learning_rate": 3.4900875979714156e-06, + "logits/chosen": 0.9722707271575928, + "logits/rejected": 2.4060893058776855, + "logps/chosen": -599.2125244140625, + "logps/rejected": -940.9347534179688, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.931610584259033, + "rewards/margins": 24.689077377319336, + "rewards/rejected": -31.62068748474121, + "step": 1793 + }, + { + "epoch": 1.1160186625194402, + "grad_norm": 0.0830949917435646, + "learning_rate": 3.4889349930843708e-06, + "logits/chosen": 1.924202799797058, + "logits/rejected": 3.6883726119995117, + "logps/chosen": -553.5717163085938, + "logps/rejected": -947.3374633789062, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.690497398376465, + "rewards/margins": 26.857397079467773, + "rewards/rejected": -32.54789733886719, + "step": 1794 + }, + { + "epoch": 1.1166407465007775, + "grad_norm": 2.811249032674823e-06, + "learning_rate": 3.487782388197326e-06, + "logits/chosen": -1.5614333152770996, + "logits/rejected": 1.486207365989685, + "logps/chosen": -335.30596923828125, + "logps/rejected": -901.7238159179688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4471044540405273, + "rewards/margins": 29.40472984313965, + "rewards/rejected": -32.85183334350586, + "step": 1795 + }, + { + "epoch": 1.117262830482115, + "grad_norm": 0.00018924751202575862, + "learning_rate": 3.4866297833102812e-06, + "logits/chosen": -1.4683024883270264, + "logits/rejected": 5.106406211853027, + "logps/chosen": -420.83184814453125, + "logps/rejected": -1123.9346923828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.011943817138672, + "rewards/margins": 36.865928649902344, + "rewards/rejected": -42.877872467041016, + "step": 1796 + }, + { + "epoch": 1.1178849144634526, + "grad_norm": 0.023054877296090126, + "learning_rate": 3.4854771784232365e-06, + "logits/chosen": -0.9309723973274231, + "logits/rejected": 4.368443965911865, + "logps/chosen": -336.5071716308594, + "logps/rejected": -876.109375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.8464765548706055, + "rewards/margins": 22.422441482543945, + "rewards/rejected": -27.268917083740234, + "step": 1797 + }, + { + "epoch": 1.11850699844479, + "grad_norm": 0.00115344044752419, + "learning_rate": 3.484324573536192e-06, + "logits/chosen": -1.9685173034667969, + "logits/rejected": 4.014041423797607, + "logps/chosen": -358.6552734375, + "logps/rejected": -1036.917724609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.902338981628418, + "rewards/margins": 30.619861602783203, + "rewards/rejected": -35.52220153808594, + "step": 1798 + }, + { + "epoch": 1.1191290824261275, + "grad_norm": 29.647974014282227, + "learning_rate": 3.4831719686491473e-06, + "logits/chosen": 0.758859395980835, + "logits/rejected": 3.8699092864990234, + "logps/chosen": -597.4893798828125, + "logps/rejected": -891.358154296875, + "loss": 0.1947, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.706351280212402, + "rewards/margins": 18.08136749267578, + "rewards/rejected": -26.7877197265625, + "step": 1799 + }, + { + "epoch": 1.119751166407465, + "grad_norm": 1.148080173152266e-05, + "learning_rate": 3.4820193637621026e-06, + "logits/chosen": -1.1118121147155762, + "logits/rejected": 4.116186141967773, + "logps/chosen": -457.7972106933594, + "logps/rejected": -1084.82958984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.23808479309082, + "rewards/margins": 31.739526748657227, + "rewards/rejected": -37.97761154174805, + "step": 1800 + }, + { + "epoch": 1.1203732503888024, + "grad_norm": 0.008707517758011818, + "learning_rate": 3.4808667588750578e-06, + "logits/chosen": -0.4364929497241974, + "logits/rejected": 3.4837522506713867, + "logps/chosen": -444.2327575683594, + "logps/rejected": -858.1317749023438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.276412963867188, + "rewards/margins": 18.92943572998047, + "rewards/rejected": -29.205846786499023, + "step": 1801 + }, + { + "epoch": 1.12099533437014, + "grad_norm": 11.017598152160645, + "learning_rate": 3.479714153988013e-06, + "logits/chosen": -1.9811372756958008, + "logits/rejected": 2.453932523727417, + "logps/chosen": -341.76824951171875, + "logps/rejected": -778.7388916015625, + "loss": 0.2198, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.120153903961182, + "rewards/margins": 24.861141204833984, + "rewards/rejected": -28.98129653930664, + "step": 1802 + }, + { + "epoch": 1.1216174183514775, + "grad_norm": 0.12693174183368683, + "learning_rate": 3.4785615491009682e-06, + "logits/chosen": 1.7608217000961304, + "logits/rejected": 4.857548713684082, + "logps/chosen": -597.873291015625, + "logps/rejected": -1057.2740478515625, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.650867462158203, + "rewards/margins": 31.496370315551758, + "rewards/rejected": -40.147239685058594, + "step": 1803 + }, + { + "epoch": 1.1222395023328149, + "grad_norm": 20.185020446777344, + "learning_rate": 3.4774089442139235e-06, + "logits/chosen": 1.1995744705200195, + "logits/rejected": 2.085767984390259, + "logps/chosen": -576.979248046875, + "logps/rejected": -812.4219970703125, + "loss": 0.1261, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.986093521118164, + "rewards/margins": 15.25129508972168, + "rewards/rejected": -21.237390518188477, + "step": 1804 + }, + { + "epoch": 1.1228615863141524, + "grad_norm": 0.004299265798181295, + "learning_rate": 3.476256339326879e-06, + "logits/chosen": 1.5160009860992432, + "logits/rejected": 4.4680047035217285, + "logps/chosen": -594.9562377929688, + "logps/rejected": -942.6099243164062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.65675163269043, + "rewards/margins": 21.242027282714844, + "rewards/rejected": -32.89877700805664, + "step": 1805 + }, + { + "epoch": 1.1234836702954898, + "grad_norm": 8.741262435913086, + "learning_rate": 3.4751037344398343e-06, + "logits/chosen": -1.0180206298828125, + "logits/rejected": 4.231414318084717, + "logps/chosen": -440.399169921875, + "logps/rejected": -888.4819946289062, + "loss": 0.0463, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.514710426330566, + "rewards/margins": 19.439733505249023, + "rewards/rejected": -23.954442977905273, + "step": 1806 + }, + { + "epoch": 1.1241057542768274, + "grad_norm": 0.0063452026806771755, + "learning_rate": 3.4739511295527896e-06, + "logits/chosen": 2.629770278930664, + "logits/rejected": 3.7640302181243896, + "logps/chosen": -643.5936279296875, + "logps/rejected": -977.7257080078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.321480751037598, + "rewards/margins": 25.136545181274414, + "rewards/rejected": -32.45802688598633, + "step": 1807 + }, + { + "epoch": 1.124727838258165, + "grad_norm": 5.7119975090026855, + "learning_rate": 3.4727985246657448e-06, + "logits/chosen": -0.7087704539299011, + "logits/rejected": 2.7257461547851562, + "logps/chosen": -487.17840576171875, + "logps/rejected": -791.4234619140625, + "loss": 0.0396, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.3370232582092285, + "rewards/margins": 14.614947319030762, + "rewards/rejected": -19.95197105407715, + "step": 1808 + }, + { + "epoch": 1.1253499222395023, + "grad_norm": 0.023234574124217033, + "learning_rate": 3.4716459197787e-06, + "logits/chosen": 0.3555334806442261, + "logits/rejected": 2.6853690147399902, + "logps/chosen": -598.6488037109375, + "logps/rejected": -956.1478271484375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7893404960632324, + "rewards/margins": 23.659915924072266, + "rewards/rejected": -26.449256896972656, + "step": 1809 + }, + { + "epoch": 1.1259720062208398, + "grad_norm": 0.003041701391339302, + "learning_rate": 3.4704933148916552e-06, + "logits/chosen": 2.4652957916259766, + "logits/rejected": 4.392395973205566, + "logps/chosen": -628.3650512695312, + "logps/rejected": -973.9765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.876049041748047, + "rewards/margins": 28.1997127532959, + "rewards/rejected": -35.07575988769531, + "step": 1810 + }, + { + "epoch": 1.1265940902021774, + "grad_norm": 0.15895365178585052, + "learning_rate": 3.4693407100046105e-06, + "logits/chosen": -0.8710925579071045, + "logits/rejected": 2.300516128540039, + "logps/chosen": -419.5194396972656, + "logps/rejected": -712.6929321289062, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.929962158203125, + "rewards/margins": 13.017057418823242, + "rewards/rejected": -15.947019577026367, + "step": 1811 + }, + { + "epoch": 1.1272161741835147, + "grad_norm": 12.767523765563965, + "learning_rate": 3.4681881051175657e-06, + "logits/chosen": -1.348296880722046, + "logits/rejected": 3.6018869876861572, + "logps/chosen": -453.49639892578125, + "logps/rejected": -975.9755859375, + "loss": 0.0632, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.690885066986084, + "rewards/margins": 29.5626220703125, + "rewards/rejected": -35.253509521484375, + "step": 1812 + }, + { + "epoch": 1.1278382581648523, + "grad_norm": 0.8971028923988342, + "learning_rate": 3.4670355002305213e-06, + "logits/chosen": -0.9237762689590454, + "logits/rejected": 2.268069267272949, + "logps/chosen": -493.8834228515625, + "logps/rejected": -852.326416015625, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8636460304260254, + "rewards/margins": 20.109888076782227, + "rewards/rejected": -23.973533630371094, + "step": 1813 + }, + { + "epoch": 1.1284603421461896, + "grad_norm": 9.861696243286133, + "learning_rate": 3.4658828953434765e-06, + "logits/chosen": -1.5349409580230713, + "logits/rejected": 3.1588385105133057, + "logps/chosen": -334.5981140136719, + "logps/rejected": -751.074462890625, + "loss": 0.0537, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.684638977050781, + "rewards/margins": 21.330123901367188, + "rewards/rejected": -26.0147647857666, + "step": 1814 + }, + { + "epoch": 1.1290824261275272, + "grad_norm": 0.10593397915363312, + "learning_rate": 3.4647302904564318e-06, + "logits/chosen": -0.2577982544898987, + "logits/rejected": 1.7699002027511597, + "logps/chosen": -580.86181640625, + "logps/rejected": -899.0047607421875, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.7143940925598145, + "rewards/margins": 23.71632194519043, + "rewards/rejected": -31.430715560913086, + "step": 1815 + }, + { + "epoch": 1.1297045101088647, + "grad_norm": 0.03057820163667202, + "learning_rate": 3.463577685569387e-06, + "logits/chosen": 2.243663787841797, + "logits/rejected": 4.455940246582031, + "logps/chosen": -518.3104248046875, + "logps/rejected": -925.2927856445312, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.088144302368164, + "rewards/margins": 24.08509063720703, + "rewards/rejected": -30.173236846923828, + "step": 1816 + }, + { + "epoch": 1.130326594090202, + "grad_norm": 0.29756349325180054, + "learning_rate": 3.4624250806823422e-06, + "logits/chosen": 1.827619194984436, + "logits/rejected": 3.9997007846832275, + "logps/chosen": -611.642578125, + "logps/rejected": -1030.654296875, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.628411293029785, + "rewards/margins": 25.694616317749023, + "rewards/rejected": -32.323028564453125, + "step": 1817 + }, + { + "epoch": 1.1309486780715396, + "grad_norm": 0.008640704676508904, + "learning_rate": 3.4612724757952974e-06, + "logits/chosen": 1.3161953687667847, + "logits/rejected": 3.7667105197906494, + "logps/chosen": -409.8922119140625, + "logps/rejected": -770.370849609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.131985187530518, + "rewards/margins": 24.006284713745117, + "rewards/rejected": -30.138267517089844, + "step": 1818 + }, + { + "epoch": 1.1315707620528772, + "grad_norm": 0.05921082943677902, + "learning_rate": 3.4601198709082527e-06, + "logits/chosen": 2.588874101638794, + "logits/rejected": 2.696937084197998, + "logps/chosen": -628.57568359375, + "logps/rejected": -915.2703857421875, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.494431495666504, + "rewards/margins": 29.17119026184082, + "rewards/rejected": -37.66562271118164, + "step": 1819 + }, + { + "epoch": 1.1321928460342146, + "grad_norm": 18.75832748413086, + "learning_rate": 3.4589672660212083e-06, + "logits/chosen": -0.10499536991119385, + "logits/rejected": 2.8624749183654785, + "logps/chosen": -397.4913330078125, + "logps/rejected": -832.5264892578125, + "loss": 0.0655, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.647233486175537, + "rewards/margins": 26.839122772216797, + "rewards/rejected": -33.486358642578125, + "step": 1820 + }, + { + "epoch": 1.1328149300155521, + "grad_norm": 0.5162553191184998, + "learning_rate": 3.4578146611341635e-06, + "logits/chosen": -0.5121818780899048, + "logits/rejected": 4.498852252960205, + "logps/chosen": -433.69439697265625, + "logps/rejected": -1014.330322265625, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.915709972381592, + "rewards/margins": 24.85306167602539, + "rewards/rejected": -30.76877212524414, + "step": 1821 + }, + { + "epoch": 1.1334370139968897, + "grad_norm": 16.169828414916992, + "learning_rate": 3.4566620562471188e-06, + "logits/chosen": -2.0768256187438965, + "logits/rejected": 3.2809956073760986, + "logps/chosen": -361.49493408203125, + "logps/rejected": -873.9121704101562, + "loss": 0.1493, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6932404041290283, + "rewards/margins": 18.26701545715332, + "rewards/rejected": -19.960254669189453, + "step": 1822 + }, + { + "epoch": 1.134059097978227, + "grad_norm": 2.1423237323760986, + "learning_rate": 3.455509451360074e-06, + "logits/chosen": 1.41920804977417, + "logits/rejected": 4.007068634033203, + "logps/chosen": -520.28076171875, + "logps/rejected": -855.2392578125, + "loss": 0.0122, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.254186153411865, + "rewards/margins": 23.409135818481445, + "rewards/rejected": -28.6633243560791, + "step": 1823 + }, + { + "epoch": 1.1346811819595646, + "grad_norm": 40.92685317993164, + "learning_rate": 3.4543568464730292e-06, + "logits/chosen": -0.6169062852859497, + "logits/rejected": 0.6554781198501587, + "logps/chosen": -348.8257141113281, + "logps/rejected": -584.2406005859375, + "loss": 1.5563, + "rewards/accuracies": 0.75, + "rewards/chosen": -6.060636043548584, + "rewards/margins": 13.35194206237793, + "rewards/rejected": -19.412578582763672, + "step": 1824 + }, + { + "epoch": 1.1353032659409021, + "grad_norm": 25.226947784423828, + "learning_rate": 3.4532042415859844e-06, + "logits/chosen": -2.031526565551758, + "logits/rejected": 0.9291627407073975, + "logps/chosen": -439.3551025390625, + "logps/rejected": -834.6775512695312, + "loss": 0.2199, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.6891093254089355, + "rewards/margins": 20.3415584564209, + "rewards/rejected": -25.03066635131836, + "step": 1825 + }, + { + "epoch": 1.1359253499222395, + "grad_norm": 0.34828466176986694, + "learning_rate": 3.4520516366989397e-06, + "logits/chosen": -0.45988568663597107, + "logits/rejected": 4.3835344314575195, + "logps/chosen": -523.087890625, + "logps/rejected": -990.68603515625, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.440042495727539, + "rewards/margins": 24.660259246826172, + "rewards/rejected": -30.100299835205078, + "step": 1826 + }, + { + "epoch": 1.136547433903577, + "grad_norm": 0.009784924797713757, + "learning_rate": 3.4508990318118953e-06, + "logits/chosen": 1.2427332401275635, + "logits/rejected": 5.04477596282959, + "logps/chosen": -479.31439208984375, + "logps/rejected": -879.8087158203125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.576506614685059, + "rewards/margins": 22.55929183959961, + "rewards/rejected": -31.135799407958984, + "step": 1827 + }, + { + "epoch": 1.1371695178849144, + "grad_norm": 0.010587329044938087, + "learning_rate": 3.4497464269248505e-06, + "logits/chosen": 1.4068046808242798, + "logits/rejected": 4.196283340454102, + "logps/chosen": -501.2727355957031, + "logps/rejected": -985.1683349609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.856316566467285, + "rewards/margins": 28.490116119384766, + "rewards/rejected": -34.346435546875, + "step": 1828 + }, + { + "epoch": 1.137791601866252, + "grad_norm": 0.047132086008787155, + "learning_rate": 3.4485938220378058e-06, + "logits/chosen": 2.6061911582946777, + "logits/rejected": 3.4840335845947266, + "logps/chosen": -646.5637817382812, + "logps/rejected": -892.6546630859375, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.78953742980957, + "rewards/margins": 22.944318771362305, + "rewards/rejected": -28.733856201171875, + "step": 1829 + }, + { + "epoch": 1.1384136858475895, + "grad_norm": 0.00011491310579003766, + "learning_rate": 3.447441217150761e-06, + "logits/chosen": 3.098417043685913, + "logits/rejected": 4.135062217712402, + "logps/chosen": -601.7989501953125, + "logps/rejected": -902.9705810546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.078056335449219, + "rewards/margins": 26.534677505493164, + "rewards/rejected": -32.612735748291016, + "step": 1830 + }, + { + "epoch": 1.1390357698289268, + "grad_norm": 4.412220001220703, + "learning_rate": 3.4462886122637162e-06, + "logits/chosen": 1.1942940950393677, + "logits/rejected": 2.7071847915649414, + "logps/chosen": -393.26165771484375, + "logps/rejected": -777.9138793945312, + "loss": 0.0713, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.756586074829102, + "rewards/margins": 20.164287567138672, + "rewards/rejected": -24.920875549316406, + "step": 1831 + }, + { + "epoch": 1.1396578538102644, + "grad_norm": 0.0028339342679828405, + "learning_rate": 3.4451360073766714e-06, + "logits/chosen": 0.46175873279571533, + "logits/rejected": 4.249823570251465, + "logps/chosen": -444.9114074707031, + "logps/rejected": -845.7836303710938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.111077308654785, + "rewards/margins": 18.86326026916504, + "rewards/rejected": -22.97433853149414, + "step": 1832 + }, + { + "epoch": 1.1402799377916017, + "grad_norm": 21.523834228515625, + "learning_rate": 3.4439834024896267e-06, + "logits/chosen": 0.8463013172149658, + "logits/rejected": 3.7445366382598877, + "logps/chosen": -480.28466796875, + "logps/rejected": -760.3237915039062, + "loss": 0.1944, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.874420166015625, + "rewards/margins": 18.69571304321289, + "rewards/rejected": -25.570131301879883, + "step": 1833 + }, + { + "epoch": 1.1409020217729393, + "grad_norm": 0.8458882570266724, + "learning_rate": 3.442830797602582e-06, + "logits/chosen": 1.6432691812515259, + "logits/rejected": 4.076677322387695, + "logps/chosen": -539.7843017578125, + "logps/rejected": -844.2291259765625, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.506782531738281, + "rewards/margins": 13.569256782531738, + "rewards/rejected": -19.076038360595703, + "step": 1834 + }, + { + "epoch": 1.1415241057542769, + "grad_norm": 0.08397660404443741, + "learning_rate": 3.4416781927155375e-06, + "logits/chosen": -0.7760798335075378, + "logits/rejected": 3.8760299682617188, + "logps/chosen": -462.271484375, + "logps/rejected": -995.9507446289062, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.9000444412231445, + "rewards/margins": 27.665067672729492, + "rewards/rejected": -32.56511306762695, + "step": 1835 + }, + { + "epoch": 1.1421461897356142, + "grad_norm": 30.610496520996094, + "learning_rate": 3.4405255878284928e-06, + "logits/chosen": 1.3444571495056152, + "logits/rejected": 2.9977803230285645, + "logps/chosen": -615.3402709960938, + "logps/rejected": -886.0873413085938, + "loss": 0.2224, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.009075164794922, + "rewards/margins": 18.689491271972656, + "rewards/rejected": -24.698566436767578, + "step": 1836 + }, + { + "epoch": 1.1427682737169518, + "grad_norm": 0.008456461131572723, + "learning_rate": 3.439372982941448e-06, + "logits/chosen": 0.7336775064468384, + "logits/rejected": 1.3986623287200928, + "logps/chosen": -559.4346923828125, + "logps/rejected": -851.5731201171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.957766056060791, + "rewards/margins": 24.573040008544922, + "rewards/rejected": -31.530807495117188, + "step": 1837 + }, + { + "epoch": 1.1433903576982893, + "grad_norm": 0.3551645576953888, + "learning_rate": 3.4382203780544032e-06, + "logits/chosen": -1.7950788736343384, + "logits/rejected": 0.5798860788345337, + "logps/chosen": -513.7240600585938, + "logps/rejected": -909.632568359375, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.312442779541016, + "rewards/margins": 27.34845733642578, + "rewards/rejected": -32.66089630126953, + "step": 1838 + }, + { + "epoch": 1.1440124416796267, + "grad_norm": 0.0067468322813510895, + "learning_rate": 3.4370677731673584e-06, + "logits/chosen": -0.35771840810775757, + "logits/rejected": 1.6661145687103271, + "logps/chosen": -544.712890625, + "logps/rejected": -888.5681762695312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.579408168792725, + "rewards/margins": 24.83259391784668, + "rewards/rejected": -29.412002563476562, + "step": 1839 + }, + { + "epoch": 1.1446345256609642, + "grad_norm": 0.06160581111907959, + "learning_rate": 3.4359151682803137e-06, + "logits/chosen": 1.519280195236206, + "logits/rejected": 4.197678565979004, + "logps/chosen": -560.5360107421875, + "logps/rejected": -958.4043579101562, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.82316780090332, + "rewards/margins": 23.101886749267578, + "rewards/rejected": -28.925052642822266, + "step": 1840 + }, + { + "epoch": 1.1452566096423018, + "grad_norm": 0.05726956948637962, + "learning_rate": 3.434762563393269e-06, + "logits/chosen": -1.2474236488342285, + "logits/rejected": 3.9748692512512207, + "logps/chosen": -507.4255065917969, + "logps/rejected": -922.804443359375, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.531657218933105, + "rewards/margins": 16.375091552734375, + "rewards/rejected": -24.90674591064453, + "step": 1841 + }, + { + "epoch": 1.1458786936236391, + "grad_norm": 0.13852474093437195, + "learning_rate": 3.4336099585062245e-06, + "logits/chosen": -4.124720573425293, + "logits/rejected": 1.3218566179275513, + "logps/chosen": -302.812255859375, + "logps/rejected": -713.38720703125, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.925245761871338, + "rewards/margins": 17.303882598876953, + "rewards/rejected": -21.229129791259766, + "step": 1842 + }, + { + "epoch": 1.1465007776049767, + "grad_norm": 0.4648420810699463, + "learning_rate": 3.4324573536191798e-06, + "logits/chosen": -1.682816982269287, + "logits/rejected": 1.9900355339050293, + "logps/chosen": -366.4227600097656, + "logps/rejected": -789.5729370117188, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.685972213745117, + "rewards/margins": 22.045013427734375, + "rewards/rejected": -27.730985641479492, + "step": 1843 + }, + { + "epoch": 1.1471228615863143, + "grad_norm": 0.00197424809448421, + "learning_rate": 3.431304748732135e-06, + "logits/chosen": 2.2375340461730957, + "logits/rejected": 3.6597280502319336, + "logps/chosen": -543.592529296875, + "logps/rejected": -841.5507202148438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5524721145629883, + "rewards/margins": 21.644725799560547, + "rewards/rejected": -25.19719696044922, + "step": 1844 + }, + { + "epoch": 1.1477449455676516, + "grad_norm": 0.0012396638048812747, + "learning_rate": 3.43015214384509e-06, + "logits/chosen": 0.10678932815790176, + "logits/rejected": 2.318019390106201, + "logps/chosen": -518.6724853515625, + "logps/rejected": -880.78759765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.50995397567749, + "rewards/margins": 25.387535095214844, + "rewards/rejected": -29.89748764038086, + "step": 1845 + }, + { + "epoch": 1.1483670295489892, + "grad_norm": 29.93779945373535, + "learning_rate": 3.4289995389580454e-06, + "logits/chosen": -0.42190590500831604, + "logits/rejected": 3.944661855697632, + "logps/chosen": -453.431640625, + "logps/rejected": -858.2440185546875, + "loss": 0.3281, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.201959133148193, + "rewards/margins": 17.08089256286621, + "rewards/rejected": -24.282852172851562, + "step": 1846 + }, + { + "epoch": 1.1489891135303265, + "grad_norm": 0.016757098957896233, + "learning_rate": 3.4278469340710007e-06, + "logits/chosen": -0.9321748614311218, + "logits/rejected": 2.595136880874634, + "logps/chosen": -430.6005859375, + "logps/rejected": -932.3299560546875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7968358993530273, + "rewards/margins": 21.861114501953125, + "rewards/rejected": -25.657949447631836, + "step": 1847 + }, + { + "epoch": 1.149611197511664, + "grad_norm": 0.001826676307246089, + "learning_rate": 3.426694329183956e-06, + "logits/chosen": -1.5225608348846436, + "logits/rejected": 3.197697162628174, + "logps/chosen": -296.51165771484375, + "logps/rejected": -825.5092163085938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.746575117111206, + "rewards/margins": 27.655593872070312, + "rewards/rejected": -29.40216827392578, + "step": 1848 + }, + { + "epoch": 1.1502332814930016, + "grad_norm": 0.0038855511229485273, + "learning_rate": 3.4255417242969115e-06, + "logits/chosen": -2.6227917671203613, + "logits/rejected": 0.8911744356155396, + "logps/chosen": -415.6127014160156, + "logps/rejected": -1022.6885375976562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.036016464233398, + "rewards/margins": 29.356048583984375, + "rewards/rejected": -37.392066955566406, + "step": 1849 + }, + { + "epoch": 1.150855365474339, + "grad_norm": 30.342679977416992, + "learning_rate": 3.4243891194098668e-06, + "logits/chosen": 1.395837426185608, + "logits/rejected": 2.462480306625366, + "logps/chosen": -599.7311401367188, + "logps/rejected": -745.6565551757812, + "loss": 0.6978, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.861150741577148, + "rewards/margins": 15.441508293151855, + "rewards/rejected": -22.302658081054688, + "step": 1850 + }, + { + "epoch": 1.1514774494556765, + "grad_norm": 10.274881362915039, + "learning_rate": 3.423236514522822e-06, + "logits/chosen": 3.5531959533691406, + "logits/rejected": 3.5783658027648926, + "logps/chosen": -626.026123046875, + "logps/rejected": -849.3629150390625, + "loss": 0.0342, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.679840087890625, + "rewards/margins": 17.586177825927734, + "rewards/rejected": -27.26601791381836, + "step": 1851 + }, + { + "epoch": 1.1520995334370139, + "grad_norm": 0.01562790386378765, + "learning_rate": 3.422083909635777e-06, + "logits/chosen": 1.6220135688781738, + "logits/rejected": 3.173398971557617, + "logps/chosen": -565.2597045898438, + "logps/rejected": -797.6683349609375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.182493209838867, + "rewards/margins": 15.873281478881836, + "rewards/rejected": -22.055774688720703, + "step": 1852 + }, + { + "epoch": 1.1527216174183514, + "grad_norm": 0.02775227278470993, + "learning_rate": 3.4209313047487324e-06, + "logits/chosen": 0.7502641677856445, + "logits/rejected": 4.694336414337158, + "logps/chosen": -500.84454345703125, + "logps/rejected": -958.1452026367188, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.311468124389648, + "rewards/margins": 25.65319061279297, + "rewards/rejected": -30.964656829833984, + "step": 1853 + }, + { + "epoch": 1.153343701399689, + "grad_norm": 0.004239192698150873, + "learning_rate": 3.4197786998616877e-06, + "logits/chosen": 1.1329727172851562, + "logits/rejected": 4.304278373718262, + "logps/chosen": -465.35028076171875, + "logps/rejected": -992.7420043945312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.760903358459473, + "rewards/margins": 27.45376968383789, + "rewards/rejected": -33.21466827392578, + "step": 1854 + }, + { + "epoch": 1.1539657853810263, + "grad_norm": 36.458282470703125, + "learning_rate": 3.418626094974643e-06, + "logits/chosen": 0.939897894859314, + "logits/rejected": 3.1979446411132812, + "logps/chosen": -555.7550659179688, + "logps/rejected": -973.84814453125, + "loss": 0.5565, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.7667133808135986, + "rewards/margins": 26.34735107421875, + "rewards/rejected": -29.114065170288086, + "step": 1855 + }, + { + "epoch": 1.154587869362364, + "grad_norm": 0.10218100994825363, + "learning_rate": 3.4174734900875985e-06, + "logits/chosen": 2.230012893676758, + "logits/rejected": 1.621048927307129, + "logps/chosen": -566.50341796875, + "logps/rejected": -781.8897094726562, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.9751152992248535, + "rewards/margins": 17.062904357910156, + "rewards/rejected": -24.038021087646484, + "step": 1856 + }, + { + "epoch": 1.1552099533437015, + "grad_norm": 3.060608833038714e-07, + "learning_rate": 3.4163208852005538e-06, + "logits/chosen": 0.6016669273376465, + "logits/rejected": 2.2606773376464844, + "logps/chosen": -630.81396484375, + "logps/rejected": -930.8638916015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.966922760009766, + "rewards/margins": 28.5898380279541, + "rewards/rejected": -39.5567626953125, + "step": 1857 + }, + { + "epoch": 1.1558320373250388, + "grad_norm": 7.1231184005737305, + "learning_rate": 3.415168280313509e-06, + "logits/chosen": -0.9377905130386353, + "logits/rejected": 2.894376277923584, + "logps/chosen": -480.42840576171875, + "logps/rejected": -777.5912475585938, + "loss": 0.0491, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.04439115524292, + "rewards/margins": 14.577075958251953, + "rewards/rejected": -19.6214656829834, + "step": 1858 + }, + { + "epoch": 1.1564541213063764, + "grad_norm": 14.957558631896973, + "learning_rate": 3.414015675426464e-06, + "logits/chosen": 1.6655893325805664, + "logits/rejected": 4.472172260284424, + "logps/chosen": -559.6851196289062, + "logps/rejected": -858.532958984375, + "loss": 0.139, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.541600227355957, + "rewards/margins": 15.499042510986328, + "rewards/rejected": -23.04064178466797, + "step": 1859 + }, + { + "epoch": 1.157076205287714, + "grad_norm": 0.31053587794303894, + "learning_rate": 3.4128630705394194e-06, + "logits/chosen": 2.565433979034424, + "logits/rejected": 3.6672003269195557, + "logps/chosen": -529.284912109375, + "logps/rejected": -770.8364868164062, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.860030174255371, + "rewards/margins": 14.366354942321777, + "rewards/rejected": -21.22638511657715, + "step": 1860 + }, + { + "epoch": 1.1576982892690513, + "grad_norm": 0.01324005238711834, + "learning_rate": 3.4117104656523747e-06, + "logits/chosen": -0.9546438455581665, + "logits/rejected": 3.5789406299591064, + "logps/chosen": -428.26800537109375, + "logps/rejected": -955.6097412109375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.1026291847229, + "rewards/margins": 27.42490005493164, + "rewards/rejected": -32.52752685546875, + "step": 1861 + }, + { + "epoch": 1.1583203732503888, + "grad_norm": 2.0153255453347896e-10, + "learning_rate": 3.41055786076533e-06, + "logits/chosen": 0.16465669870376587, + "logits/rejected": 4.746971130371094, + "logps/chosen": -483.0347595214844, + "logps/rejected": -1162.955078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.803442478179932, + "rewards/margins": 39.28862380981445, + "rewards/rejected": -46.092063903808594, + "step": 1862 + }, + { + "epoch": 1.1589424572317264, + "grad_norm": 1.7264849816456262e-07, + "learning_rate": 3.409405255878285e-06, + "logits/chosen": -1.883517861366272, + "logits/rejected": 3.036733388900757, + "logps/chosen": -385.6026611328125, + "logps/rejected": -886.4443359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.124665260314941, + "rewards/margins": 31.6492977142334, + "rewards/rejected": -35.773963928222656, + "step": 1863 + }, + { + "epoch": 1.1595645412130637, + "grad_norm": 0.017328694462776184, + "learning_rate": 3.4082526509912408e-06, + "logits/chosen": -1.9856454133987427, + "logits/rejected": 4.194329261779785, + "logps/chosen": -361.50079345703125, + "logps/rejected": -1048.82275390625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6735262870788574, + "rewards/margins": 32.9036750793457, + "rewards/rejected": -36.57720184326172, + "step": 1864 + }, + { + "epoch": 1.1601866251944013, + "grad_norm": 0.021417386829853058, + "learning_rate": 3.407100046104196e-06, + "logits/chosen": -2.5190682411193848, + "logits/rejected": 1.7409802675247192, + "logps/chosen": -420.822998046875, + "logps/rejected": -982.8463134765625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.529491424560547, + "rewards/margins": 26.406007766723633, + "rewards/rejected": -35.93549728393555, + "step": 1865 + }, + { + "epoch": 1.1608087091757386, + "grad_norm": 2.4923558157752268e-05, + "learning_rate": 3.405947441217151e-06, + "logits/chosen": -1.5282564163208008, + "logits/rejected": 1.6944283246994019, + "logps/chosen": -333.0899353027344, + "logps/rejected": -779.9032592773438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1282002925872803, + "rewards/margins": 28.565494537353516, + "rewards/rejected": -31.693696975708008, + "step": 1866 + }, + { + "epoch": 1.1614307931570762, + "grad_norm": 8.579933166503906, + "learning_rate": 3.4047948363301064e-06, + "logits/chosen": 0.9272339940071106, + "logits/rejected": 3.0300166606903076, + "logps/chosen": -622.0635986328125, + "logps/rejected": -927.7623291015625, + "loss": 0.0467, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.8049421310424805, + "rewards/margins": 20.665813446044922, + "rewards/rejected": -27.470754623413086, + "step": 1867 + }, + { + "epoch": 1.1620528771384138, + "grad_norm": 0.16499559581279755, + "learning_rate": 3.4036422314430616e-06, + "logits/chosen": 0.13868463039398193, + "logits/rejected": 1.8279064893722534, + "logps/chosen": -570.3772583007812, + "logps/rejected": -917.9130249023438, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.679691314697266, + "rewards/margins": 21.65399742126465, + "rewards/rejected": -30.333688735961914, + "step": 1868 + }, + { + "epoch": 1.162674961119751, + "grad_norm": 4.282171249389648, + "learning_rate": 3.402489626556017e-06, + "logits/chosen": 1.3641959428787231, + "logits/rejected": 2.3401095867156982, + "logps/chosen": -680.9366455078125, + "logps/rejected": -977.0753173828125, + "loss": 0.0242, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.871400833129883, + "rewards/margins": 18.216270446777344, + "rewards/rejected": -27.087669372558594, + "step": 1869 + }, + { + "epoch": 1.1632970451010887, + "grad_norm": 0.30975544452667236, + "learning_rate": 3.401337021668972e-06, + "logits/chosen": -0.5252646207809448, + "logits/rejected": 1.5963172912597656, + "logps/chosen": -546.5020751953125, + "logps/rejected": -842.185546875, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.25322151184082, + "rewards/margins": 20.49350929260254, + "rewards/rejected": -27.746732711791992, + "step": 1870 + }, + { + "epoch": 1.163919129082426, + "grad_norm": 0.0698472410440445, + "learning_rate": 3.4001844167819277e-06, + "logits/chosen": -0.9682607054710388, + "logits/rejected": 3.4010274410247803, + "logps/chosen": -492.4614562988281, + "logps/rejected": -1111.92236328125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.805222511291504, + "rewards/margins": 35.77224349975586, + "rewards/rejected": -41.57746505737305, + "step": 1871 + }, + { + "epoch": 1.1645412130637636, + "grad_norm": 18.648984909057617, + "learning_rate": 3.399031811894883e-06, + "logits/chosen": 0.6838645935058594, + "logits/rejected": 2.589054584503174, + "logps/chosen": -500.4499816894531, + "logps/rejected": -837.994384765625, + "loss": 0.1269, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.229727745056152, + "rewards/margins": 16.818801879882812, + "rewards/rejected": -24.04853057861328, + "step": 1872 + }, + { + "epoch": 1.1651632970451011, + "grad_norm": 7.5405064308142755e-06, + "learning_rate": 3.397879207007838e-06, + "logits/chosen": -1.3973257541656494, + "logits/rejected": 0.9450278878211975, + "logps/chosen": -473.623779296875, + "logps/rejected": -899.1165771484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.336153268814087, + "rewards/margins": 30.845565795898438, + "rewards/rejected": -34.18171691894531, + "step": 1873 + }, + { + "epoch": 1.1657853810264385, + "grad_norm": 0.12114159017801285, + "learning_rate": 3.3967266021207934e-06, + "logits/chosen": 3.0597760677337646, + "logits/rejected": 4.02023983001709, + "logps/chosen": -646.0111083984375, + "logps/rejected": -886.9459228515625, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.5245513916015625, + "rewards/margins": 20.911964416503906, + "rewards/rejected": -28.43651580810547, + "step": 1874 + }, + { + "epoch": 1.166407465007776, + "grad_norm": 1.1679210662841797, + "learning_rate": 3.3955739972337486e-06, + "logits/chosen": 0.360098659992218, + "logits/rejected": 2.741698741912842, + "logps/chosen": -627.6326904296875, + "logps/rejected": -949.013671875, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.880720138549805, + "rewards/margins": 21.968727111816406, + "rewards/rejected": -32.849449157714844, + "step": 1875 + }, + { + "epoch": 1.1670295489891136, + "grad_norm": 20.22886085510254, + "learning_rate": 3.394421392346704e-06, + "logits/chosen": 1.9235471487045288, + "logits/rejected": 4.863920211791992, + "logps/chosen": -573.7352294921875, + "logps/rejected": -1049.0943603515625, + "loss": 0.221, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.95592212677002, + "rewards/margins": 27.50440788269043, + "rewards/rejected": -37.4603271484375, + "step": 1876 + }, + { + "epoch": 1.167651632970451, + "grad_norm": 1.4595121683669277e-05, + "learning_rate": 3.393268787459659e-06, + "logits/chosen": -2.712231397628784, + "logits/rejected": 1.803051471710205, + "logps/chosen": -349.8004150390625, + "logps/rejected": -857.34228515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.230327129364014, + "rewards/margins": 29.79707145690918, + "rewards/rejected": -36.02739715576172, + "step": 1877 + }, + { + "epoch": 1.1682737169517885, + "grad_norm": 22.88018035888672, + "learning_rate": 3.3921161825726147e-06, + "logits/chosen": 1.118056297302246, + "logits/rejected": 3.7516231536865234, + "logps/chosen": -641.5447998046875, + "logps/rejected": -1088.7965087890625, + "loss": 0.3303, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.877915382385254, + "rewards/margins": 25.538475036621094, + "rewards/rejected": -34.4163932800293, + "step": 1878 + }, + { + "epoch": 1.168895800933126, + "grad_norm": 0.9193662405014038, + "learning_rate": 3.39096357768557e-06, + "logits/chosen": 1.4894222021102905, + "logits/rejected": 4.886639595031738, + "logps/chosen": -579.7659301757812, + "logps/rejected": -1037.610595703125, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.003030776977539, + "rewards/margins": 23.57931137084961, + "rewards/rejected": -37.58234405517578, + "step": 1879 + }, + { + "epoch": 1.1695178849144634, + "grad_norm": 8.398724555969238, + "learning_rate": 3.389810972798525e-06, + "logits/chosen": 1.810590147972107, + "logits/rejected": 4.150803089141846, + "logps/chosen": -519.09326171875, + "logps/rejected": -977.6038818359375, + "loss": 0.0846, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.65777587890625, + "rewards/margins": 32.098777770996094, + "rewards/rejected": -37.756553649902344, + "step": 1880 + }, + { + "epoch": 1.170139968895801, + "grad_norm": 0.16134734451770782, + "learning_rate": 3.3886583679114804e-06, + "logits/chosen": -2.5285017490386963, + "logits/rejected": 2.1500158309936523, + "logps/chosen": -376.177001953125, + "logps/rejected": -864.7513427734375, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.267976760864258, + "rewards/margins": 24.168720245361328, + "rewards/rejected": -33.43669891357422, + "step": 1881 + }, + { + "epoch": 1.1707620528771385, + "grad_norm": 0.00012305942072998732, + "learning_rate": 3.3875057630244356e-06, + "logits/chosen": -0.5643349885940552, + "logits/rejected": 3.5905985832214355, + "logps/chosen": -445.88201904296875, + "logps/rejected": -1024.126708984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.768947601318359, + "rewards/margins": 31.707120895385742, + "rewards/rejected": -38.47606658935547, + "step": 1882 + }, + { + "epoch": 1.1713841368584759, + "grad_norm": 0.018285546451807022, + "learning_rate": 3.386353158137391e-06, + "logits/chosen": -1.7855366468429565, + "logits/rejected": 2.1956353187561035, + "logps/chosen": -345.2109375, + "logps/rejected": -812.0648193359375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.950940132141113, + "rewards/margins": 26.879283905029297, + "rewards/rejected": -33.830223083496094, + "step": 1883 + }, + { + "epoch": 1.1720062208398134, + "grad_norm": 5.884623169549741e-05, + "learning_rate": 3.385200553250346e-06, + "logits/chosen": -0.8889130353927612, + "logits/rejected": 4.409315586090088, + "logps/chosen": -434.36590576171875, + "logps/rejected": -1086.90625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.007776737213135, + "rewards/margins": 31.600605010986328, + "rewards/rejected": -37.60838317871094, + "step": 1884 + }, + { + "epoch": 1.1726283048211508, + "grad_norm": 0.03610111027956009, + "learning_rate": 3.3840479483633013e-06, + "logits/chosen": 0.03821098804473877, + "logits/rejected": 1.735025405883789, + "logps/chosen": -539.9411010742188, + "logps/rejected": -877.2424926757812, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.749122619628906, + "rewards/margins": 24.988826751708984, + "rewards/rejected": -31.737951278686523, + "step": 1885 + }, + { + "epoch": 1.1732503888024883, + "grad_norm": 2.511057937226724e-05, + "learning_rate": 3.382895343476257e-06, + "logits/chosen": -0.6976618766784668, + "logits/rejected": 5.024380683898926, + "logps/chosen": -392.182861328125, + "logps/rejected": -1032.8702392578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2829113006591797, + "rewards/margins": 29.962074279785156, + "rewards/rejected": -33.24498748779297, + "step": 1886 + }, + { + "epoch": 1.173872472783826, + "grad_norm": 0.3009835183620453, + "learning_rate": 3.381742738589212e-06, + "logits/chosen": 0.5339366793632507, + "logits/rejected": 3.618192195892334, + "logps/chosen": -487.1822204589844, + "logps/rejected": -911.4076538085938, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.047733306884766, + "rewards/margins": 20.27420425415039, + "rewards/rejected": -26.321937561035156, + "step": 1887 + }, + { + "epoch": 1.1744945567651632, + "grad_norm": 13.114173889160156, + "learning_rate": 3.3805901337021674e-06, + "logits/chosen": -0.8705597519874573, + "logits/rejected": 3.0176239013671875, + "logps/chosen": -430.07037353515625, + "logps/rejected": -851.323486328125, + "loss": 0.0511, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.196186065673828, + "rewards/margins": 23.563753128051758, + "rewards/rejected": -29.75994110107422, + "step": 1888 + }, + { + "epoch": 1.1751166407465008, + "grad_norm": 15.261913299560547, + "learning_rate": 3.3794375288151226e-06, + "logits/chosen": -1.91538667678833, + "logits/rejected": 3.901822090148926, + "logps/chosen": -502.11639404296875, + "logps/rejected": -1063.5262451171875, + "loss": 0.0575, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.721534729003906, + "rewards/margins": 28.344356536865234, + "rewards/rejected": -39.065895080566406, + "step": 1889 + }, + { + "epoch": 1.1757387247278381, + "grad_norm": 0.00018090769299305975, + "learning_rate": 3.378284923928078e-06, + "logits/chosen": -2.1836063861846924, + "logits/rejected": 3.5650699138641357, + "logps/chosen": -461.0611572265625, + "logps/rejected": -1103.5301513671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.961669921875, + "rewards/margins": 30.819530487060547, + "rewards/rejected": -39.78120040893555, + "step": 1890 + }, + { + "epoch": 1.1763608087091757, + "grad_norm": 6.663813591003418, + "learning_rate": 3.377132319041033e-06, + "logits/chosen": -0.012041866779327393, + "logits/rejected": 3.2969517707824707, + "logps/chosen": -520.4405517578125, + "logps/rejected": -839.4365844726562, + "loss": 0.0301, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.451179504394531, + "rewards/margins": 17.711742401123047, + "rewards/rejected": -25.162921905517578, + "step": 1891 + }, + { + "epoch": 1.1769828926905133, + "grad_norm": 0.0020499967504292727, + "learning_rate": 3.3759797141539883e-06, + "logits/chosen": 0.5658305883407593, + "logits/rejected": 2.8545844554901123, + "logps/chosen": -587.4124755859375, + "logps/rejected": -868.9716796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.186939239501953, + "rewards/margins": 18.542457580566406, + "rewards/rejected": -28.72939682006836, + "step": 1892 + }, + { + "epoch": 1.1776049766718506, + "grad_norm": 0.056425973773002625, + "learning_rate": 3.374827109266944e-06, + "logits/chosen": 1.9432047605514526, + "logits/rejected": 5.135171890258789, + "logps/chosen": -534.0836181640625, + "logps/rejected": -919.96484375, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.417974472045898, + "rewards/margins": 22.20659828186035, + "rewards/rejected": -30.62457275390625, + "step": 1893 + }, + { + "epoch": 1.1782270606531882, + "grad_norm": 5.256790609564632e-05, + "learning_rate": 3.373674504379899e-06, + "logits/chosen": 0.011599451303482056, + "logits/rejected": 2.3024370670318604, + "logps/chosen": -513.41259765625, + "logps/rejected": -879.0580444335938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.419273376464844, + "rewards/margins": 26.65741539001465, + "rewards/rejected": -35.076690673828125, + "step": 1894 + }, + { + "epoch": 1.1788491446345257, + "grad_norm": 0.29891255497932434, + "learning_rate": 3.3725218994928544e-06, + "logits/chosen": -1.412156581878662, + "logits/rejected": 1.3909084796905518, + "logps/chosen": -525.9658203125, + "logps/rejected": -961.8241577148438, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.266643524169922, + "rewards/margins": 25.69481086730957, + "rewards/rejected": -31.961456298828125, + "step": 1895 + }, + { + "epoch": 1.179471228615863, + "grad_norm": 6.161828517913818, + "learning_rate": 3.3713692946058096e-06, + "logits/chosen": -0.9599625468254089, + "logits/rejected": 3.51469087600708, + "logps/chosen": -455.3974609375, + "logps/rejected": -972.5230102539062, + "loss": 0.0321, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.252844333648682, + "rewards/margins": 24.31687355041504, + "rewards/rejected": -30.569719314575195, + "step": 1896 + }, + { + "epoch": 1.1800933125972006, + "grad_norm": 16.62723159790039, + "learning_rate": 3.370216689718765e-06, + "logits/chosen": 0.051849544048309326, + "logits/rejected": 2.560445785522461, + "logps/chosen": -513.8330078125, + "logps/rejected": -841.5488891601562, + "loss": 0.1151, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.442409992218018, + "rewards/margins": 20.9028377532959, + "rewards/rejected": -28.34524917602539, + "step": 1897 + }, + { + "epoch": 1.1807153965785382, + "grad_norm": 0.0016077188774943352, + "learning_rate": 3.36906408483172e-06, + "logits/chosen": -0.7170097231864929, + "logits/rejected": 4.153207778930664, + "logps/chosen": -476.2669982910156, + "logps/rejected": -1010.2525634765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.360616683959961, + "rewards/margins": 26.90864372253418, + "rewards/rejected": -33.269264221191406, + "step": 1898 + }, + { + "epoch": 1.1813374805598755, + "grad_norm": 0.009510945528745651, + "learning_rate": 3.3679114799446753e-06, + "logits/chosen": 1.9269828796386719, + "logits/rejected": 4.278368949890137, + "logps/chosen": -580.8291625976562, + "logps/rejected": -914.4641723632812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.361428260803223, + "rewards/margins": 25.89552116394043, + "rewards/rejected": -36.25695037841797, + "step": 1899 + }, + { + "epoch": 1.181959564541213, + "grad_norm": 0.16183213889598846, + "learning_rate": 3.366758875057631e-06, + "logits/chosen": -1.8444448709487915, + "logits/rejected": 3.2564399242401123, + "logps/chosen": -402.34051513671875, + "logps/rejected": -958.4459228515625, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.02139139175415, + "rewards/margins": 23.725345611572266, + "rewards/rejected": -28.746734619140625, + "step": 1900 + }, + { + "epoch": 1.1825816485225507, + "grad_norm": 9.202930328910952e-08, + "learning_rate": 3.365606270170586e-06, + "logits/chosen": 0.11875700950622559, + "logits/rejected": 4.793677806854248, + "logps/chosen": -453.58819580078125, + "logps/rejected": -1055.4208984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.494197845458984, + "rewards/margins": 36.492919921875, + "rewards/rejected": -42.98711395263672, + "step": 1901 + }, + { + "epoch": 1.183203732503888, + "grad_norm": 0.012897444888949394, + "learning_rate": 3.3644536652835414e-06, + "logits/chosen": -0.3415604829788208, + "logits/rejected": 3.5608906745910645, + "logps/chosen": -443.36767578125, + "logps/rejected": -957.35888671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.884511947631836, + "rewards/margins": 33.34904479980469, + "rewards/rejected": -43.233558654785156, + "step": 1902 + }, + { + "epoch": 1.1838258164852256, + "grad_norm": 0.0005375120672397316, + "learning_rate": 3.3633010603964966e-06, + "logits/chosen": -1.8501484394073486, + "logits/rejected": 3.881671905517578, + "logps/chosen": -455.2761535644531, + "logps/rejected": -1120.222900390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.41702651977539, + "rewards/margins": 33.15281677246094, + "rewards/rejected": -41.569847106933594, + "step": 1903 + }, + { + "epoch": 1.184447900466563, + "grad_norm": 1.0201901197433472, + "learning_rate": 3.362148455509452e-06, + "logits/chosen": 0.9012954235076904, + "logits/rejected": 3.5016984939575195, + "logps/chosen": -504.5726623535156, + "logps/rejected": -815.4268798828125, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.356898307800293, + "rewards/margins": 18.643299102783203, + "rewards/rejected": -28.00019645690918, + "step": 1904 + }, + { + "epoch": 1.1850699844479005, + "grad_norm": 2.634040594100952, + "learning_rate": 3.360995850622407e-06, + "logits/chosen": 1.6875536441802979, + "logits/rejected": 4.519953727722168, + "logps/chosen": -580.7654418945312, + "logps/rejected": -1029.687744140625, + "loss": 0.0123, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.505790710449219, + "rewards/margins": 27.653100967407227, + "rewards/rejected": -38.15888977050781, + "step": 1905 + }, + { + "epoch": 1.185692068429238, + "grad_norm": 27.720535278320312, + "learning_rate": 3.3598432457353623e-06, + "logits/chosen": 2.4027135372161865, + "logits/rejected": 3.5298256874084473, + "logps/chosen": -580.3115844726562, + "logps/rejected": -849.4248046875, + "loss": 0.6196, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.953683853149414, + "rewards/margins": 22.228294372558594, + "rewards/rejected": -33.181976318359375, + "step": 1906 + }, + { + "epoch": 1.1863141524105754, + "grad_norm": 0.0001605092256795615, + "learning_rate": 3.358690640848317e-06, + "logits/chosen": 1.2494337558746338, + "logits/rejected": 3.3936190605163574, + "logps/chosen": -534.42236328125, + "logps/rejected": -982.61865234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.883479118347168, + "rewards/margins": 31.28057861328125, + "rewards/rejected": -38.164058685302734, + "step": 1907 + }, + { + "epoch": 1.186936236391913, + "grad_norm": 4.926142196381988e-07, + "learning_rate": 3.3575380359612723e-06, + "logits/chosen": -1.3139442205429077, + "logits/rejected": 2.4188690185546875, + "logps/chosen": -367.416015625, + "logps/rejected": -865.8370361328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.059207916259766, + "rewards/margins": 31.45437240600586, + "rewards/rejected": -40.513580322265625, + "step": 1908 + }, + { + "epoch": 1.1875583203732503, + "grad_norm": 0.0003280296514276415, + "learning_rate": 3.3563854310742276e-06, + "logits/chosen": -0.26524052023887634, + "logits/rejected": 2.9963579177856445, + "logps/chosen": -569.2260131835938, + "logps/rejected": -1066.4149169921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.808199882507324, + "rewards/margins": 28.098133087158203, + "rewards/rejected": -35.906333923339844, + "step": 1909 + }, + { + "epoch": 1.1881804043545878, + "grad_norm": 0.014919915236532688, + "learning_rate": 3.355232826187183e-06, + "logits/chosen": 2.3412349224090576, + "logits/rejected": 4.641690254211426, + "logps/chosen": -598.1677856445312, + "logps/rejected": -1043.271240234375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.240728378295898, + "rewards/margins": 25.26788330078125, + "rewards/rejected": -32.50861358642578, + "step": 1910 + }, + { + "epoch": 1.1888024883359254, + "grad_norm": 2.226551581996361e-10, + "learning_rate": 3.3540802213001384e-06, + "logits/chosen": 3.548205852508545, + "logits/rejected": 3.9359588623046875, + "logps/chosen": -690.6002807617188, + "logps/rejected": -1156.86572265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.449170112609863, + "rewards/margins": 35.990379333496094, + "rewards/rejected": -48.439544677734375, + "step": 1911 + }, + { + "epoch": 1.1894245723172627, + "grad_norm": 0.0006320227403193712, + "learning_rate": 3.3529276164130937e-06, + "logits/chosen": 2.9576265811920166, + "logits/rejected": 1.8555573225021362, + "logps/chosen": -705.2396240234375, + "logps/rejected": -898.2821044921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.850976943969727, + "rewards/margins": 17.06894874572754, + "rewards/rejected": -29.9199275970459, + "step": 1912 + }, + { + "epoch": 1.1900466562986003, + "grad_norm": 0.008909384720027447, + "learning_rate": 3.351775011526049e-06, + "logits/chosen": -3.500988006591797, + "logits/rejected": 3.632211446762085, + "logps/chosen": -272.4637756347656, + "logps/rejected": -955.69091796875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5750045776367188, + "rewards/margins": 33.8862419128418, + "rewards/rejected": -37.461246490478516, + "step": 1913 + }, + { + "epoch": 1.1906687402799379, + "grad_norm": 52.2058219909668, + "learning_rate": 3.350622406639004e-06, + "logits/chosen": 0.48060905933380127, + "logits/rejected": 2.518648147583008, + "logps/chosen": -588.2604370117188, + "logps/rejected": -919.8138427734375, + "loss": 0.5477, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.316235542297363, + "rewards/margins": 21.10009765625, + "rewards/rejected": -33.41633605957031, + "step": 1914 + }, + { + "epoch": 1.1912908242612752, + "grad_norm": 0.004597253166139126, + "learning_rate": 3.3494698017519593e-06, + "logits/chosen": -0.6631978750228882, + "logits/rejected": 3.513437271118164, + "logps/chosen": -484.0467224121094, + "logps/rejected": -1010.7188720703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.64289665222168, + "rewards/margins": 30.839702606201172, + "rewards/rejected": -39.48259735107422, + "step": 1915 + }, + { + "epoch": 1.1919129082426128, + "grad_norm": 0.00033162301406264305, + "learning_rate": 3.3483171968649146e-06, + "logits/chosen": 1.9419931173324585, + "logits/rejected": 4.727841377258301, + "logps/chosen": -546.1142578125, + "logps/rejected": -1015.4229736328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.172233581542969, + "rewards/margins": 29.4285888671875, + "rewards/rejected": -37.60082244873047, + "step": 1916 + }, + { + "epoch": 1.1925349922239503, + "grad_norm": 2.137709617614746, + "learning_rate": 3.34716459197787e-06, + "logits/chosen": 1.4818284511566162, + "logits/rejected": 4.6252312660217285, + "logps/chosen": -506.4788513183594, + "logps/rejected": -930.1427612304688, + "loss": 0.0853, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.089163780212402, + "rewards/margins": 22.086009979248047, + "rewards/rejected": -32.175174713134766, + "step": 1917 + }, + { + "epoch": 1.1931570762052877, + "grad_norm": 0.007987787947058678, + "learning_rate": 3.3460119870908254e-06, + "logits/chosen": 0.12977349758148193, + "logits/rejected": 1.7750893831253052, + "logps/chosen": -578.8599853515625, + "logps/rejected": -882.8643188476562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.8722429275512695, + "rewards/margins": 24.44931411743164, + "rewards/rejected": -31.321556091308594, + "step": 1918 + }, + { + "epoch": 1.1937791601866252, + "grad_norm": 0.645113468170166, + "learning_rate": 3.3448593822037807e-06, + "logits/chosen": -1.326491117477417, + "logits/rejected": 2.144299268722534, + "logps/chosen": -461.71722412109375, + "logps/rejected": -866.579833984375, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.663753986358643, + "rewards/margins": 25.251678466796875, + "rewards/rejected": -32.91543197631836, + "step": 1919 + }, + { + "epoch": 1.1944012441679628, + "grad_norm": 0.18914903700351715, + "learning_rate": 3.343706777316736e-06, + "logits/chosen": 3.1509156227111816, + "logits/rejected": 4.0067291259765625, + "logps/chosen": -537.1878662109375, + "logps/rejected": -864.8395385742188, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.262554168701172, + "rewards/margins": 20.671634674072266, + "rewards/rejected": -30.93419075012207, + "step": 1920 + }, + { + "epoch": 1.1950233281493001, + "grad_norm": 0.03683840483427048, + "learning_rate": 3.342554172429691e-06, + "logits/chosen": -2.908719062805176, + "logits/rejected": 1.6505014896392822, + "logps/chosen": -387.63677978515625, + "logps/rejected": -857.525390625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.601930618286133, + "rewards/margins": 25.352493286132812, + "rewards/rejected": -34.95442199707031, + "step": 1921 + }, + { + "epoch": 1.1956454121306377, + "grad_norm": 2.0521059036254883, + "learning_rate": 3.3414015675426463e-06, + "logits/chosen": -0.8097438812255859, + "logits/rejected": 2.9124770164489746, + "logps/chosen": -482.3450927734375, + "logps/rejected": -1000.04248046875, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.175515651702881, + "rewards/margins": 22.42913246154785, + "rewards/rejected": -28.60464859008789, + "step": 1922 + }, + { + "epoch": 1.196267496111975, + "grad_norm": 0.00045063262223266065, + "learning_rate": 3.3402489626556016e-06, + "logits/chosen": 1.2557860612869263, + "logits/rejected": 3.385568380355835, + "logps/chosen": -530.5950927734375, + "logps/rejected": -909.8157958984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.130796432495117, + "rewards/margins": 27.050382614135742, + "rewards/rejected": -35.18117904663086, + "step": 1923 + }, + { + "epoch": 1.1968895800933126, + "grad_norm": 2.0517676446729638e-08, + "learning_rate": 3.339096357768557e-06, + "logits/chosen": -0.006276607513427734, + "logits/rejected": 3.593689441680908, + "logps/chosen": -383.19757080078125, + "logps/rejected": -871.0137939453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.104961395263672, + "rewards/margins": 32.07358169555664, + "rewards/rejected": -36.17854309082031, + "step": 1924 + }, + { + "epoch": 1.1975116640746502, + "grad_norm": 0.0016482784412801266, + "learning_rate": 3.3379437528815124e-06, + "logits/chosen": 0.5384297370910645, + "logits/rejected": 3.214430809020996, + "logps/chosen": -566.6683349609375, + "logps/rejected": -984.3301391601562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.574440002441406, + "rewards/margins": 23.82293701171875, + "rewards/rejected": -33.397377014160156, + "step": 1925 + }, + { + "epoch": 1.1981337480559875, + "grad_norm": 27.448835372924805, + "learning_rate": 3.3367911479944676e-06, + "logits/chosen": 0.10056604444980621, + "logits/rejected": 3.597458839416504, + "logps/chosen": -456.06005859375, + "logps/rejected": -866.9276123046875, + "loss": 0.571, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.5373406410217285, + "rewards/margins": 24.73653221130371, + "rewards/rejected": -31.27387237548828, + "step": 1926 + }, + { + "epoch": 1.198755832037325, + "grad_norm": 34.07020950317383, + "learning_rate": 3.335638543107423e-06, + "logits/chosen": -0.8443750143051147, + "logits/rejected": 2.3590760231018066, + "logps/chosen": -502.44146728515625, + "logps/rejected": -899.0219116210938, + "loss": 1.0365, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.6267242431640625, + "rewards/margins": 19.69811248779297, + "rewards/rejected": -24.32483673095703, + "step": 1927 + }, + { + "epoch": 1.1993779160186626, + "grad_norm": 0.00026728963712230325, + "learning_rate": 3.334485938220378e-06, + "logits/chosen": 0.41143798828125, + "logits/rejected": 3.4771077632904053, + "logps/chosen": -672.9883422851562, + "logps/rejected": -1171.4521484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.810882568359375, + "rewards/margins": 35.20409393310547, + "rewards/rejected": -43.01497268676758, + "step": 1928 + }, + { + "epoch": 1.2, + "grad_norm": 2.0605654071914614e-08, + "learning_rate": 3.3333333333333333e-06, + "logits/chosen": 0.14419078826904297, + "logits/rejected": 3.9184060096740723, + "logps/chosen": -515.4486694335938, + "logps/rejected": -1113.77783203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.166099548339844, + "rewards/margins": 39.462459564208984, + "rewards/rejected": -47.628562927246094, + "step": 1929 + }, + { + "epoch": 1.2006220839813375, + "grad_norm": 0.08832728117704391, + "learning_rate": 3.3321807284462885e-06, + "logits/chosen": 0.09780023992061615, + "logits/rejected": 4.137458801269531, + "logps/chosen": -517.83642578125, + "logps/rejected": -946.2546997070312, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.975343704223633, + "rewards/margins": 27.883960723876953, + "rewards/rejected": -37.85930633544922, + "step": 1930 + }, + { + "epoch": 1.2012441679626749, + "grad_norm": 0.004210162442177534, + "learning_rate": 3.331028123559244e-06, + "logits/chosen": -1.5383262634277344, + "logits/rejected": 4.4926958084106445, + "logps/chosen": -306.3968505859375, + "logps/rejected": -1023.9912109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.66239857673645, + "rewards/margins": 36.54762268066406, + "rewards/rejected": -40.21002197265625, + "step": 1931 + }, + { + "epoch": 1.2018662519440124, + "grad_norm": 0.007443973794579506, + "learning_rate": 3.3298755186721994e-06, + "logits/chosen": 0.7929803133010864, + "logits/rejected": 4.572413921356201, + "logps/chosen": -468.37957763671875, + "logps/rejected": -869.6012573242188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.426292896270752, + "rewards/margins": 21.67800521850586, + "rewards/rejected": -27.104297637939453, + "step": 1932 + }, + { + "epoch": 1.20248833592535, + "grad_norm": 0.05187014490365982, + "learning_rate": 3.3287229137851546e-06, + "logits/chosen": 0.013190984725952148, + "logits/rejected": 2.615093231201172, + "logps/chosen": -338.7560729980469, + "logps/rejected": -766.6841430664062, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.282043933868408, + "rewards/margins": 25.907127380371094, + "rewards/rejected": -30.189170837402344, + "step": 1933 + }, + { + "epoch": 1.2031104199066873, + "grad_norm": 0.5828655958175659, + "learning_rate": 3.32757030889811e-06, + "logits/chosen": 0.724718451499939, + "logits/rejected": 3.5598180294036865, + "logps/chosen": -377.14056396484375, + "logps/rejected": -799.251708984375, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.795290946960449, + "rewards/margins": 22.91904067993164, + "rewards/rejected": -28.714330673217773, + "step": 1934 + }, + { + "epoch": 1.2037325038880249, + "grad_norm": 0.00023630520445294678, + "learning_rate": 3.326417704011065e-06, + "logits/chosen": -0.6954635381698608, + "logits/rejected": 3.3336634635925293, + "logps/chosen": -451.3642578125, + "logps/rejected": -940.421142578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.785922050476074, + "rewards/margins": 25.902555465698242, + "rewards/rejected": -30.688478469848633, + "step": 1935 + }, + { + "epoch": 1.2043545878693624, + "grad_norm": 0.0010673885699361563, + "learning_rate": 3.3252650991240203e-06, + "logits/chosen": -1.998305082321167, + "logits/rejected": 3.031818389892578, + "logps/chosen": -428.25927734375, + "logps/rejected": -1018.7134399414062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.27209186553955, + "rewards/margins": 32.9398307800293, + "rewards/rejected": -43.2119255065918, + "step": 1936 + }, + { + "epoch": 1.2049766718506998, + "grad_norm": 1.6674846410751343, + "learning_rate": 3.3241124942369755e-06, + "logits/chosen": -1.2776638269424438, + "logits/rejected": 3.502741813659668, + "logps/chosen": -463.2667236328125, + "logps/rejected": -874.223876953125, + "loss": 0.007, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.820012092590332, + "rewards/margins": 20.7093505859375, + "rewards/rejected": -25.529361724853516, + "step": 1937 + }, + { + "epoch": 1.2055987558320373, + "grad_norm": 0.02726488746702671, + "learning_rate": 3.3229598893499308e-06, + "logits/chosen": 0.24503111839294434, + "logits/rejected": 4.084878921508789, + "logps/chosen": -399.0093994140625, + "logps/rejected": -825.203369140625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.999037742614746, + "rewards/margins": 23.390485763549805, + "rewards/rejected": -29.3895263671875, + "step": 1938 + }, + { + "epoch": 1.206220839813375, + "grad_norm": 0.00054695934522897, + "learning_rate": 3.3218072844628864e-06, + "logits/chosen": 2.297356128692627, + "logits/rejected": 3.3070321083068848, + "logps/chosen": -587.7952880859375, + "logps/rejected": -1102.398193359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.056290626525879, + "rewards/margins": 34.74806213378906, + "rewards/rejected": -45.804351806640625, + "step": 1939 + }, + { + "epoch": 1.2068429237947123, + "grad_norm": 2.3318307399749756, + "learning_rate": 3.3206546795758416e-06, + "logits/chosen": 0.43444839119911194, + "logits/rejected": 2.4421451091766357, + "logps/chosen": -514.5410766601562, + "logps/rejected": -874.9398803710938, + "loss": 0.0091, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.247170448303223, + "rewards/margins": 22.54619026184082, + "rewards/rejected": -32.793357849121094, + "step": 1940 + }, + { + "epoch": 1.2074650077760498, + "grad_norm": 0.00011700214236043394, + "learning_rate": 3.319502074688797e-06, + "logits/chosen": 0.9871399402618408, + "logits/rejected": 3.244748592376709, + "logps/chosen": -435.0483703613281, + "logps/rejected": -841.668701171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.72576904296875, + "rewards/margins": 29.82848358154297, + "rewards/rejected": -34.55425262451172, + "step": 1941 + }, + { + "epoch": 1.2080870917573872, + "grad_norm": 2.920883893966675, + "learning_rate": 3.318349469801752e-06, + "logits/chosen": -0.21452677249908447, + "logits/rejected": 4.4931230545043945, + "logps/chosen": -367.2193603515625, + "logps/rejected": -1028.98046875, + "loss": 0.0713, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.970757484436035, + "rewards/margins": 35.071495056152344, + "rewards/rejected": -42.04225158691406, + "step": 1942 + }, + { + "epoch": 1.2087091757387247, + "grad_norm": 0.03972639888525009, + "learning_rate": 3.3171968649147073e-06, + "logits/chosen": 0.8004956245422363, + "logits/rejected": 3.5307986736297607, + "logps/chosen": -599.837890625, + "logps/rejected": -1062.3994140625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.409638404846191, + "rewards/margins": 28.89748764038086, + "rewards/rejected": -40.307125091552734, + "step": 1943 + }, + { + "epoch": 1.2093312597200623, + "grad_norm": 0.03882686793804169, + "learning_rate": 3.3160442600276625e-06, + "logits/chosen": 3.156482458114624, + "logits/rejected": 3.9867730140686035, + "logps/chosen": -716.43994140625, + "logps/rejected": -1105.8594970703125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.058252334594727, + "rewards/margins": 29.100160598754883, + "rewards/rejected": -39.15841293334961, + "step": 1944 + }, + { + "epoch": 1.2099533437013996, + "grad_norm": 0.02182290330529213, + "learning_rate": 3.3148916551406178e-06, + "logits/chosen": -0.6206588745117188, + "logits/rejected": 3.366884231567383, + "logps/chosen": -562.1417236328125, + "logps/rejected": -1199.1522216796875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.860138893127441, + "rewards/margins": 36.03993225097656, + "rewards/rejected": -42.90007019042969, + "step": 1945 + }, + { + "epoch": 1.2105754276827372, + "grad_norm": 0.00022302680008579046, + "learning_rate": 3.3137390502535734e-06, + "logits/chosen": 3.027329444885254, + "logits/rejected": 4.128240585327148, + "logps/chosen": -577.2239990234375, + "logps/rejected": -878.79150390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.890042304992676, + "rewards/margins": 27.70443344116211, + "rewards/rejected": -37.59447479248047, + "step": 1946 + }, + { + "epoch": 1.2111975116640747, + "grad_norm": 0.008291305974125862, + "learning_rate": 3.3125864453665286e-06, + "logits/chosen": 0.27542710304260254, + "logits/rejected": 4.529088973999023, + "logps/chosen": -476.2554931640625, + "logps/rejected": -1009.677490234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.14692497253418, + "rewards/margins": 31.135059356689453, + "rewards/rejected": -43.281982421875, + "step": 1947 + }, + { + "epoch": 1.211819595645412, + "grad_norm": 0.00021095202828291804, + "learning_rate": 3.311433840479484e-06, + "logits/chosen": 1.0152184963226318, + "logits/rejected": 4.031658172607422, + "logps/chosen": -593.0845336914062, + "logps/rejected": -943.8492431640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.655757904052734, + "rewards/margins": 27.139101028442383, + "rewards/rejected": -33.794857025146484, + "step": 1948 + }, + { + "epoch": 1.2124416796267496, + "grad_norm": 0.08369395136833191, + "learning_rate": 3.310281235592439e-06, + "logits/chosen": -1.5338207483291626, + "logits/rejected": 3.6264917850494385, + "logps/chosen": -494.798828125, + "logps/rejected": -950.973388671875, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.699420928955078, + "rewards/margins": 25.905492782592773, + "rewards/rejected": -37.60491180419922, + "step": 1949 + }, + { + "epoch": 1.213063763608087, + "grad_norm": 2.5580344200134277, + "learning_rate": 3.3091286307053943e-06, + "logits/chosen": 0.913398265838623, + "logits/rejected": 3.289531707763672, + "logps/chosen": -484.4707946777344, + "logps/rejected": -813.2208251953125, + "loss": 0.0113, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.331765174865723, + "rewards/margins": 19.53480339050293, + "rewards/rejected": -30.86656951904297, + "step": 1950 + }, + { + "epoch": 1.2136858475894245, + "grad_norm": 0.050781961530447006, + "learning_rate": 3.3079760258183495e-06, + "logits/chosen": 0.06513766199350357, + "logits/rejected": 3.91764760017395, + "logps/chosen": -417.3397521972656, + "logps/rejected": -851.27978515625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.4588847160339355, + "rewards/margins": 18.167699813842773, + "rewards/rejected": -25.626585006713867, + "step": 1951 + }, + { + "epoch": 1.2143079315707621, + "grad_norm": 13.09532356262207, + "learning_rate": 3.3068234209313048e-06, + "logits/chosen": -0.8919419646263123, + "logits/rejected": 3.8915514945983887, + "logps/chosen": -444.9429626464844, + "logps/rejected": -1011.6834106445312, + "loss": 0.047, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.357451438903809, + "rewards/margins": 25.754016876220703, + "rewards/rejected": -35.11146926879883, + "step": 1952 + }, + { + "epoch": 1.2149300155520995, + "grad_norm": 0.04654877260327339, + "learning_rate": 3.3056708160442604e-06, + "logits/chosen": -1.6564157009124756, + "logits/rejected": 2.609309673309326, + "logps/chosen": -329.3642578125, + "logps/rejected": -767.1329956054688, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.011837959289551, + "rewards/margins": 18.270030975341797, + "rewards/rejected": -22.281869888305664, + "step": 1953 + }, + { + "epoch": 1.215552099533437, + "grad_norm": 1.1246953010559082, + "learning_rate": 3.3045182111572156e-06, + "logits/chosen": 1.621370792388916, + "logits/rejected": 4.352920055389404, + "logps/chosen": -653.8741455078125, + "logps/rejected": -1105.3143310546875, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.642403602600098, + "rewards/margins": 29.800912857055664, + "rewards/rejected": -44.44331741333008, + "step": 1954 + }, + { + "epoch": 1.2161741835147746, + "grad_norm": 0.0006063711480237544, + "learning_rate": 3.303365606270171e-06, + "logits/chosen": -0.577562153339386, + "logits/rejected": 4.346451759338379, + "logps/chosen": -534.3248291015625, + "logps/rejected": -1122.94580078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.442867279052734, + "rewards/margins": 32.12034606933594, + "rewards/rejected": -43.56321334838867, + "step": 1955 + }, + { + "epoch": 1.216796267496112, + "grad_norm": 1.4868415594100952, + "learning_rate": 3.302213001383126e-06, + "logits/chosen": 1.4747681617736816, + "logits/rejected": 3.2828774452209473, + "logps/chosen": -673.1659545898438, + "logps/rejected": -1035.533203125, + "loss": 0.0088, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.805810928344727, + "rewards/margins": 22.413631439208984, + "rewards/rejected": -36.21944046020508, + "step": 1956 + }, + { + "epoch": 1.2174183514774495, + "grad_norm": 0.013984655030071735, + "learning_rate": 3.3010603964960813e-06, + "logits/chosen": 2.234647512435913, + "logits/rejected": 3.8705615997314453, + "logps/chosen": -657.6099243164062, + "logps/rejected": -959.9219970703125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.698150634765625, + "rewards/margins": 24.213985443115234, + "rewards/rejected": -34.912139892578125, + "step": 1957 + }, + { + "epoch": 1.218040435458787, + "grad_norm": 30.901208877563477, + "learning_rate": 3.2999077916090365e-06, + "logits/chosen": -0.880326509475708, + "logits/rejected": 3.0242815017700195, + "logps/chosen": -587.2315673828125, + "logps/rejected": -973.1295776367188, + "loss": 0.1456, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.446496963500977, + "rewards/margins": 20.663053512573242, + "rewards/rejected": -30.10955047607422, + "step": 1958 + }, + { + "epoch": 1.2186625194401244, + "grad_norm": 0.33974507451057434, + "learning_rate": 3.2987551867219918e-06, + "logits/chosen": 2.2866456508636475, + "logits/rejected": 3.3971104621887207, + "logps/chosen": -551.7151489257812, + "logps/rejected": -930.0809326171875, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.42250919342041, + "rewards/margins": 25.322158813476562, + "rewards/rejected": -36.744667053222656, + "step": 1959 + }, + { + "epoch": 1.219284603421462, + "grad_norm": 0.031288594007492065, + "learning_rate": 3.297602581834947e-06, + "logits/chosen": -1.6816962957382202, + "logits/rejected": 1.167089819908142, + "logps/chosen": -536.8770751953125, + "logps/rejected": -976.789306640625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.510819435119629, + "rewards/margins": 25.894309997558594, + "rewards/rejected": -33.405128479003906, + "step": 1960 + }, + { + "epoch": 1.2199066874027993, + "grad_norm": 0.018396716564893723, + "learning_rate": 3.2964499769479026e-06, + "logits/chosen": 2.6608729362487793, + "logits/rejected": 3.563199520111084, + "logps/chosen": -745.041748046875, + "logps/rejected": -1060.1583251953125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.359828948974609, + "rewards/margins": 28.92093276977539, + "rewards/rejected": -36.28076171875, + "step": 1961 + }, + { + "epoch": 1.2205287713841368, + "grad_norm": 0.0005018580704927444, + "learning_rate": 3.295297372060858e-06, + "logits/chosen": -1.0413039922714233, + "logits/rejected": 2.514838457107544, + "logps/chosen": -389.07940673828125, + "logps/rejected": -1057.867919921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.18116283416748, + "rewards/margins": 42.0729866027832, + "rewards/rejected": -52.254150390625, + "step": 1962 + }, + { + "epoch": 1.2211508553654744, + "grad_norm": 4.92830122311716e-06, + "learning_rate": 3.294144767173813e-06, + "logits/chosen": 0.7838652729988098, + "logits/rejected": 3.6844985485076904, + "logps/chosen": -555.4747314453125, + "logps/rejected": -1020.4698486328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.917938232421875, + "rewards/margins": 30.011611938476562, + "rewards/rejected": -40.92955017089844, + "step": 1963 + }, + { + "epoch": 1.2217729393468117, + "grad_norm": 0.00024959229631349444, + "learning_rate": 3.2929921622867683e-06, + "logits/chosen": -0.8331981897354126, + "logits/rejected": 2.0492069721221924, + "logps/chosen": -357.7832946777344, + "logps/rejected": -851.5213623046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.565192222595215, + "rewards/margins": 34.37034606933594, + "rewards/rejected": -38.93553924560547, + "step": 1964 + }, + { + "epoch": 1.2223950233281493, + "grad_norm": 10.026619911193848, + "learning_rate": 3.2918395573997235e-06, + "logits/chosen": 1.6410657167434692, + "logits/rejected": 3.0660173892974854, + "logps/chosen": -495.8389892578125, + "logps/rejected": -751.0767822265625, + "loss": 0.0597, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.825559616088867, + "rewards/margins": 16.955659866333008, + "rewards/rejected": -24.781219482421875, + "step": 1965 + }, + { + "epoch": 1.2230171073094869, + "grad_norm": 0.003237862139940262, + "learning_rate": 3.2906869525126788e-06, + "logits/chosen": -0.04731714725494385, + "logits/rejected": 1.0842527151107788, + "logps/chosen": -526.9979858398438, + "logps/rejected": -806.66064453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.047800064086914, + "rewards/margins": 18.901748657226562, + "rewards/rejected": -28.949548721313477, + "step": 1966 + }, + { + "epoch": 1.2236391912908242, + "grad_norm": 0.037597037851810455, + "learning_rate": 3.289534347625634e-06, + "logits/chosen": 1.0335030555725098, + "logits/rejected": 3.654738426208496, + "logps/chosen": -643.566162109375, + "logps/rejected": -1147.51953125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.852840423583984, + "rewards/margins": 31.7597599029541, + "rewards/rejected": -42.61259841918945, + "step": 1967 + }, + { + "epoch": 1.2242612752721618, + "grad_norm": 1.8747437934507616e-05, + "learning_rate": 3.2883817427385896e-06, + "logits/chosen": 0.3725661039352417, + "logits/rejected": 2.0947065353393555, + "logps/chosen": -600.5690307617188, + "logps/rejected": -1068.793212890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.553552627563477, + "rewards/margins": 31.938674926757812, + "rewards/rejected": -42.492225646972656, + "step": 1968 + }, + { + "epoch": 1.2248833592534991, + "grad_norm": 0.0005407995195128024, + "learning_rate": 3.287229137851545e-06, + "logits/chosen": -1.3575810194015503, + "logits/rejected": 1.826256513595581, + "logps/chosen": -450.8987121582031, + "logps/rejected": -844.3551635742188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.845052719116211, + "rewards/margins": 22.745574951171875, + "rewards/rejected": -27.59062957763672, + "step": 1969 + }, + { + "epoch": 1.2255054432348367, + "grad_norm": 0.02516097202897072, + "learning_rate": 3.2860765329645e-06, + "logits/chosen": -0.5253548622131348, + "logits/rejected": 2.288137435913086, + "logps/chosen": -445.89776611328125, + "logps/rejected": -847.6441650390625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.445370674133301, + "rewards/margins": 23.657224655151367, + "rewards/rejected": -31.102596282958984, + "step": 1970 + }, + { + "epoch": 1.2261275272161742, + "grad_norm": 0.004528042860329151, + "learning_rate": 3.2849239280774553e-06, + "logits/chosen": -0.6673039197921753, + "logits/rejected": 4.811821937561035, + "logps/chosen": -388.5225524902344, + "logps/rejected": -1090.91552734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.815102577209473, + "rewards/margins": 32.85487365722656, + "rewards/rejected": -40.66997528076172, + "step": 1971 + }, + { + "epoch": 1.2267496111975116, + "grad_norm": 27.828004837036133, + "learning_rate": 3.2837713231904105e-06, + "logits/chosen": -0.22079074382781982, + "logits/rejected": 1.5481947660446167, + "logps/chosen": -481.0959167480469, + "logps/rejected": -800.5830078125, + "loss": 0.3633, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.814485549926758, + "rewards/margins": 17.76893424987793, + "rewards/rejected": -27.583419799804688, + "step": 1972 + }, + { + "epoch": 1.2273716951788491, + "grad_norm": 1.7662297295828466e-06, + "learning_rate": 3.2826187183033658e-06, + "logits/chosen": -1.9919651746749878, + "logits/rejected": 3.906245231628418, + "logps/chosen": -400.87591552734375, + "logps/rejected": -1072.261962890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.9460577964782715, + "rewards/margins": 34.99480438232422, + "rewards/rejected": -41.94086456298828, + "step": 1973 + }, + { + "epoch": 1.2279937791601867, + "grad_norm": 36.82212829589844, + "learning_rate": 3.281466113416321e-06, + "logits/chosen": 1.6460797786712646, + "logits/rejected": 4.4644365310668945, + "logps/chosen": -531.8225708007812, + "logps/rejected": -957.0294189453125, + "loss": 1.1201, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.023094177246094, + "rewards/margins": 26.032180786132812, + "rewards/rejected": -36.055274963378906, + "step": 1974 + }, + { + "epoch": 1.228615863141524, + "grad_norm": 0.004306244198232889, + "learning_rate": 3.2803135085292766e-06, + "logits/chosen": 0.5558570027351379, + "logits/rejected": 1.4123010635375977, + "logps/chosen": -666.03662109375, + "logps/rejected": -970.247314453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.882766723632812, + "rewards/margins": 25.52187156677246, + "rewards/rejected": -39.404640197753906, + "step": 1975 + }, + { + "epoch": 1.2292379471228616, + "grad_norm": 2.109964370727539, + "learning_rate": 3.279160903642232e-06, + "logits/chosen": -0.5743099451065063, + "logits/rejected": 1.87367844581604, + "logps/chosen": -483.983154296875, + "logps/rejected": -847.1797485351562, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.363353729248047, + "rewards/margins": 26.407093048095703, + "rewards/rejected": -33.77044677734375, + "step": 1976 + }, + { + "epoch": 1.2298600311041992, + "grad_norm": 0.10898027569055557, + "learning_rate": 3.278008298755187e-06, + "logits/chosen": 1.553083062171936, + "logits/rejected": 2.812044143676758, + "logps/chosen": -544.963134765625, + "logps/rejected": -787.267578125, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.337335586547852, + "rewards/margins": 15.037760734558105, + "rewards/rejected": -23.375097274780273, + "step": 1977 + }, + { + "epoch": 1.2304821150855365, + "grad_norm": 1.6577892303466797, + "learning_rate": 3.2768556938681423e-06, + "logits/chosen": -0.6032044887542725, + "logits/rejected": 2.0228164196014404, + "logps/chosen": -337.15655517578125, + "logps/rejected": -684.0770874023438, + "loss": 0.0371, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.9896721839904785, + "rewards/margins": 17.63288688659668, + "rewards/rejected": -22.622560501098633, + "step": 1978 + }, + { + "epoch": 1.231104199066874, + "grad_norm": 0.0017263826448470354, + "learning_rate": 3.2757030889810975e-06, + "logits/chosen": -0.22079506516456604, + "logits/rejected": 4.21293830871582, + "logps/chosen": -463.0474853515625, + "logps/rejected": -1015.839111328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.130630016326904, + "rewards/margins": 31.987415313720703, + "rewards/rejected": -38.118045806884766, + "step": 1979 + }, + { + "epoch": 1.2317262830482114, + "grad_norm": 0.0001258711126865819, + "learning_rate": 3.2745504840940528e-06, + "logits/chosen": 0.06448210775852203, + "logits/rejected": 3.2480227947235107, + "logps/chosen": -513.82177734375, + "logps/rejected": -953.921630859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.06885051727295, + "rewards/margins": 28.184560775756836, + "rewards/rejected": -38.25341033935547, + "step": 1980 + }, + { + "epoch": 1.232348367029549, + "grad_norm": 0.003937113098800182, + "learning_rate": 3.273397879207008e-06, + "logits/chosen": -0.053459495306015015, + "logits/rejected": 2.767993927001953, + "logps/chosen": -453.16973876953125, + "logps/rejected": -907.441162109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.106377601623535, + "rewards/margins": 22.306228637695312, + "rewards/rejected": -31.41260528564453, + "step": 1981 + }, + { + "epoch": 1.2329704510108865, + "grad_norm": 2.3660552501678467, + "learning_rate": 3.2722452743199636e-06, + "logits/chosen": 0.03287597745656967, + "logits/rejected": 2.237107753753662, + "logps/chosen": -492.45489501953125, + "logps/rejected": -861.8848876953125, + "loss": 0.0103, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.626747131347656, + "rewards/margins": 24.731849670410156, + "rewards/rejected": -36.35859680175781, + "step": 1982 + }, + { + "epoch": 1.2335925349922239, + "grad_norm": 41.7318115234375, + "learning_rate": 3.271092669432919e-06, + "logits/chosen": 0.4554020166397095, + "logits/rejected": 3.25327730178833, + "logps/chosen": -608.2691650390625, + "logps/rejected": -1019.5993041992188, + "loss": 1.0557, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.943293571472168, + "rewards/margins": 25.79165267944336, + "rewards/rejected": -37.734947204589844, + "step": 1983 + }, + { + "epoch": 1.2342146189735614, + "grad_norm": 0.2129361927509308, + "learning_rate": 3.269940064545874e-06, + "logits/chosen": -0.07215934991836548, + "logits/rejected": 2.628373384475708, + "logps/chosen": -618.8433227539062, + "logps/rejected": -1128.62060546875, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.97716236114502, + "rewards/margins": 31.908266067504883, + "rewards/rejected": -40.88542938232422, + "step": 1984 + }, + { + "epoch": 1.234836702954899, + "grad_norm": 25.82171058654785, + "learning_rate": 3.2687874596588293e-06, + "logits/chosen": 1.560760736465454, + "logits/rejected": 2.470482587814331, + "logps/chosen": -685.6170654296875, + "logps/rejected": -946.1788330078125, + "loss": 0.2324, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.528305053710938, + "rewards/margins": 20.159406661987305, + "rewards/rejected": -28.687711715698242, + "step": 1985 + }, + { + "epoch": 1.2354587869362363, + "grad_norm": 0.010561628267168999, + "learning_rate": 3.2676348547717845e-06, + "logits/chosen": 0.25866788625717163, + "logits/rejected": 2.066354751586914, + "logps/chosen": -548.3967895507812, + "logps/rejected": -968.6322021484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.815115928649902, + "rewards/margins": 26.51719856262207, + "rewards/rejected": -39.332313537597656, + "step": 1986 + }, + { + "epoch": 1.236080870917574, + "grad_norm": 39.62969207763672, + "learning_rate": 3.2664822498847397e-06, + "logits/chosen": 1.6390573978424072, + "logits/rejected": 3.757746696472168, + "logps/chosen": -676.7535400390625, + "logps/rejected": -942.8658447265625, + "loss": 0.7047, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.365790367126465, + "rewards/margins": 15.35942268371582, + "rewards/rejected": -23.72521209716797, + "step": 1987 + }, + { + "epoch": 1.2367029548989112, + "grad_norm": 2.783613681793213, + "learning_rate": 3.265329644997695e-06, + "logits/chosen": 1.4804677963256836, + "logits/rejected": 2.917877197265625, + "logps/chosen": -530.9738159179688, + "logps/rejected": -870.9774169921875, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.374899387359619, + "rewards/margins": 26.20675277709961, + "rewards/rejected": -33.5816535949707, + "step": 1988 + }, + { + "epoch": 1.2373250388802488, + "grad_norm": 0.20069032907485962, + "learning_rate": 3.26417704011065e-06, + "logits/chosen": 0.017816901206970215, + "logits/rejected": 3.8825366497039795, + "logps/chosen": -483.43939208984375, + "logps/rejected": -911.643798828125, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.31286096572876, + "rewards/margins": 21.46490478515625, + "rewards/rejected": -26.77776527404785, + "step": 1989 + }, + { + "epoch": 1.2379471228615864, + "grad_norm": 0.6583080887794495, + "learning_rate": 3.263024435223606e-06, + "logits/chosen": 3.9078497886657715, + "logits/rejected": 3.138378381729126, + "logps/chosen": -599.2481079101562, + "logps/rejected": -749.1458740234375, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.0813570022583, + "rewards/margins": 19.185405731201172, + "rewards/rejected": -27.26676368713379, + "step": 1990 + }, + { + "epoch": 1.2385692068429237, + "grad_norm": 0.01927441544830799, + "learning_rate": 3.261871830336561e-06, + "logits/chosen": 2.222565174102783, + "logits/rejected": 4.206331729888916, + "logps/chosen": -617.39990234375, + "logps/rejected": -958.8414306640625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.376999378204346, + "rewards/margins": 23.93924331665039, + "rewards/rejected": -28.316242218017578, + "step": 1991 + }, + { + "epoch": 1.2391912908242613, + "grad_norm": 3.1013777256011963, + "learning_rate": 3.2607192254495163e-06, + "logits/chosen": -1.4709454774856567, + "logits/rejected": 1.9248082637786865, + "logps/chosen": -505.6230773925781, + "logps/rejected": -811.3922119140625, + "loss": 0.0109, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.807146072387695, + "rewards/margins": 13.65088939666748, + "rewards/rejected": -24.45803451538086, + "step": 1992 + }, + { + "epoch": 1.2398133748055988, + "grad_norm": 25.103227615356445, + "learning_rate": 3.2595666205624715e-06, + "logits/chosen": 0.7945823073387146, + "logits/rejected": 4.353921413421631, + "logps/chosen": -557.3357543945312, + "logps/rejected": -973.616943359375, + "loss": 0.3781, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.110764503479004, + "rewards/margins": 25.25164222717285, + "rewards/rejected": -33.362403869628906, + "step": 1993 + }, + { + "epoch": 1.2404354587869362, + "grad_norm": 6.928995571797714e-05, + "learning_rate": 3.2584140156754267e-06, + "logits/chosen": -1.121807336807251, + "logits/rejected": 3.6673836708068848, + "logps/chosen": -458.9044189453125, + "logps/rejected": -1019.62060546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.898647785186768, + "rewards/margins": 24.41216278076172, + "rewards/rejected": -30.31081199645996, + "step": 1994 + }, + { + "epoch": 1.2410575427682737, + "grad_norm": 0.7680820226669312, + "learning_rate": 3.257261410788382e-06, + "logits/chosen": -1.2524921894073486, + "logits/rejected": 0.39433813095092773, + "logps/chosen": -464.68634033203125, + "logps/rejected": -828.046142578125, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.972171306610107, + "rewards/margins": 24.00033187866211, + "rewards/rejected": -30.97249984741211, + "step": 1995 + }, + { + "epoch": 1.2416796267496113, + "grad_norm": 4.20468268202967e-06, + "learning_rate": 3.256108805901337e-06, + "logits/chosen": 1.3287625312805176, + "logits/rejected": 2.1750547885894775, + "logps/chosen": -659.4127197265625, + "logps/rejected": -1102.378662109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.000626564025879, + "rewards/margins": 29.679622650146484, + "rewards/rejected": -42.68025207519531, + "step": 1996 + }, + { + "epoch": 1.2423017107309486, + "grad_norm": 29.853729248046875, + "learning_rate": 3.254956201014293e-06, + "logits/chosen": -2.877234697341919, + "logits/rejected": 3.3251304626464844, + "logps/chosen": -391.4083557128906, + "logps/rejected": -1055.4921875, + "loss": 0.318, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.726519584655762, + "rewards/margins": 30.075284957885742, + "rewards/rejected": -36.80180358886719, + "step": 1997 + }, + { + "epoch": 1.2429237947122862, + "grad_norm": 0.00010009088146034628, + "learning_rate": 3.253803596127248e-06, + "logits/chosen": 0.9821017384529114, + "logits/rejected": 3.299887180328369, + "logps/chosen": -577.6727294921875, + "logps/rejected": -1019.5888671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.769001007080078, + "rewards/margins": 27.716604232788086, + "rewards/rejected": -33.48560333251953, + "step": 1998 + }, + { + "epoch": 1.2435458786936238, + "grad_norm": 1.9293725927127525e-05, + "learning_rate": 3.2526509912402033e-06, + "logits/chosen": -0.9194085001945496, + "logits/rejected": 1.708883285522461, + "logps/chosen": -453.50048828125, + "logps/rejected": -840.7008056640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.381048202514648, + "rewards/margins": 25.372150421142578, + "rewards/rejected": -30.753196716308594, + "step": 1999 + }, + { + "epoch": 1.244167962674961, + "grad_norm": 0.05470879748463631, + "learning_rate": 3.2514983863531585e-06, + "logits/chosen": -2.1505768299102783, + "logits/rejected": 3.283268928527832, + "logps/chosen": -362.8547668457031, + "logps/rejected": -830.670166015625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.526420593261719, + "rewards/margins": 21.226747512817383, + "rewards/rejected": -27.7531681060791, + "step": 2000 + }, + { + "epoch": 1.2447900466562987, + "grad_norm": 3.966289520263672, + "learning_rate": 3.2503457814661137e-06, + "logits/chosen": -2.3252477645874023, + "logits/rejected": 1.5557923316955566, + "logps/chosen": -499.74041748046875, + "logps/rejected": -917.0076904296875, + "loss": 0.0199, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.588899612426758, + "rewards/margins": 19.9823055267334, + "rewards/rejected": -30.571205139160156, + "step": 2001 + }, + { + "epoch": 1.245412130637636, + "grad_norm": 0.007073727436363697, + "learning_rate": 3.249193176579069e-06, + "logits/chosen": -2.3345444202423096, + "logits/rejected": 0.8063172698020935, + "logps/chosen": -391.07745361328125, + "logps/rejected": -796.2643432617188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.851896286010742, + "rewards/margins": 20.569223403930664, + "rewards/rejected": -28.421119689941406, + "step": 2002 + }, + { + "epoch": 1.2460342146189736, + "grad_norm": 8.042104309424758e-05, + "learning_rate": 3.248040571692024e-06, + "logits/chosen": -0.987544596195221, + "logits/rejected": 2.0049033164978027, + "logps/chosen": -505.7494201660156, + "logps/rejected": -1006.3981323242188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.076696872711182, + "rewards/margins": 24.556446075439453, + "rewards/rejected": -31.633142471313477, + "step": 2003 + }, + { + "epoch": 1.2466562986003111, + "grad_norm": 7.321867145514051e-15, + "learning_rate": 3.24688796680498e-06, + "logits/chosen": -0.5694581270217896, + "logits/rejected": 3.1910927295684814, + "logps/chosen": -467.1485595703125, + "logps/rejected": -1130.46142578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.254088401794434, + "rewards/margins": 44.00873565673828, + "rewards/rejected": -48.26282501220703, + "step": 2004 + }, + { + "epoch": 1.2472783825816485, + "grad_norm": 7.13050667400239e-06, + "learning_rate": 3.245735361917935e-06, + "logits/chosen": -0.6603339910507202, + "logits/rejected": 2.0062170028686523, + "logps/chosen": -396.538818359375, + "logps/rejected": -916.5963134765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.292914867401123, + "rewards/margins": 31.59789276123047, + "rewards/rejected": -35.89080810546875, + "step": 2005 + }, + { + "epoch": 1.247900466562986, + "grad_norm": 0.019461452960968018, + "learning_rate": 3.2445827570308903e-06, + "logits/chosen": -2.6960866451263428, + "logits/rejected": 3.560029983520508, + "logps/chosen": -458.8998107910156, + "logps/rejected": -1042.880859375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.916061401367188, + "rewards/margins": 28.842018127441406, + "rewards/rejected": -38.75807571411133, + "step": 2006 + }, + { + "epoch": 1.2485225505443234, + "grad_norm": 3.5628645420074463, + "learning_rate": 3.2434301521438455e-06, + "logits/chosen": 1.0278129577636719, + "logits/rejected": 1.1622637510299683, + "logps/chosen": -554.2152099609375, + "logps/rejected": -736.61376953125, + "loss": 0.012, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.3448333740234375, + "rewards/margins": 14.856306076049805, + "rewards/rejected": -21.201141357421875, + "step": 2007 + }, + { + "epoch": 1.249144634525661, + "grad_norm": 0.07135644555091858, + "learning_rate": 3.2422775472568007e-06, + "logits/chosen": 1.733124017715454, + "logits/rejected": 4.378563404083252, + "logps/chosen": -640.43896484375, + "logps/rejected": -1003.464599609375, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.873009204864502, + "rewards/margins": 19.59971046447754, + "rewards/rejected": -25.472719192504883, + "step": 2008 + }, + { + "epoch": 1.2497667185069985, + "grad_norm": 7.073273877722386e-07, + "learning_rate": 3.241124942369756e-06, + "logits/chosen": 0.5708224773406982, + "logits/rejected": 3.2701048851013184, + "logps/chosen": -563.2235717773438, + "logps/rejected": -1009.9588012695312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.070895195007324, + "rewards/margins": 29.836795806884766, + "rewards/rejected": -35.907691955566406, + "step": 2009 + }, + { + "epoch": 1.2503888024883358, + "grad_norm": 0.0315566249191761, + "learning_rate": 3.239972337482711e-06, + "logits/chosen": 1.7904354333877563, + "logits/rejected": 1.5742547512054443, + "logps/chosen": -606.237060546875, + "logps/rejected": -813.814208984375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.600082874298096, + "rewards/margins": 21.077608108520508, + "rewards/rejected": -27.677692413330078, + "step": 2010 + }, + { + "epoch": 1.2510108864696734, + "grad_norm": 14.575484275817871, + "learning_rate": 3.2388197325956664e-06, + "logits/chosen": 0.15945017337799072, + "logits/rejected": 2.334524154663086, + "logps/chosen": -357.0718078613281, + "logps/rejected": -729.5298461914062, + "loss": 0.1145, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7053083777427673, + "rewards/margins": 26.521766662597656, + "rewards/rejected": -27.227073669433594, + "step": 2011 + }, + { + "epoch": 1.251632970451011, + "grad_norm": 4.577037543640472e-05, + "learning_rate": 3.237667127708622e-06, + "logits/chosen": -0.05641406774520874, + "logits/rejected": 3.0679242610931396, + "logps/chosen": -470.065185546875, + "logps/rejected": -942.2273559570312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.128070831298828, + "rewards/margins": 29.991792678833008, + "rewards/rejected": -38.11986541748047, + "step": 2012 + }, + { + "epoch": 1.2522550544323483, + "grad_norm": 36.018550872802734, + "learning_rate": 3.2365145228215773e-06, + "logits/chosen": 3.5689821243286133, + "logits/rejected": 2.6732563972473145, + "logps/chosen": -845.01318359375, + "logps/rejected": -962.8516845703125, + "loss": 1.7042, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.484785079956055, + "rewards/margins": 17.063976287841797, + "rewards/rejected": -29.54875946044922, + "step": 2013 + }, + { + "epoch": 1.2528771384136859, + "grad_norm": 4.052655640407465e-06, + "learning_rate": 3.2353619179345325e-06, + "logits/chosen": 1.3075627088546753, + "logits/rejected": 5.013179779052734, + "logps/chosen": -513.4508056640625, + "logps/rejected": -1001.1782836914062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.508722305297852, + "rewards/margins": 29.324068069458008, + "rewards/rejected": -37.83279037475586, + "step": 2014 + }, + { + "epoch": 1.2534992223950234, + "grad_norm": 0.396076500415802, + "learning_rate": 3.2342093130474877e-06, + "logits/chosen": -0.08282530307769775, + "logits/rejected": 2.28505802154541, + "logps/chosen": -409.0604248046875, + "logps/rejected": -751.6499633789062, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.9696044921875, + "rewards/margins": 19.8177547454834, + "rewards/rejected": -25.7873592376709, + "step": 2015 + }, + { + "epoch": 1.2541213063763608, + "grad_norm": 22.250051498413086, + "learning_rate": 3.233056708160443e-06, + "logits/chosen": 2.5599923133850098, + "logits/rejected": 2.784825325012207, + "logps/chosen": -707.2561645507812, + "logps/rejected": -902.107666015625, + "loss": 0.1687, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.913710594177246, + "rewards/margins": 21.890016555786133, + "rewards/rejected": -29.803726196289062, + "step": 2016 + }, + { + "epoch": 1.2547433903576983, + "grad_norm": 3.3847265967779094e-07, + "learning_rate": 3.231904103273398e-06, + "logits/chosen": -0.3941129446029663, + "logits/rejected": 2.676504135131836, + "logps/chosen": -527.0136108398438, + "logps/rejected": -865.6759033203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.5630974769592285, + "rewards/margins": 25.08944320678711, + "rewards/rejected": -29.652542114257812, + "step": 2017 + }, + { + "epoch": 1.255365474339036, + "grad_norm": 1.4325063228607178, + "learning_rate": 3.2307514983863534e-06, + "logits/chosen": -0.9161220192909241, + "logits/rejected": 2.9123919010162354, + "logps/chosen": -465.49871826171875, + "logps/rejected": -955.76611328125, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.465087890625, + "rewards/margins": 24.841506958007812, + "rewards/rejected": -30.306594848632812, + "step": 2018 + }, + { + "epoch": 1.2559875583203732, + "grad_norm": 0.0013318108394742012, + "learning_rate": 3.229598893499309e-06, + "logits/chosen": 3.270078659057617, + "logits/rejected": 3.45019268989563, + "logps/chosen": -590.9571533203125, + "logps/rejected": -902.8822021484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.184449672698975, + "rewards/margins": 22.32142448425293, + "rewards/rejected": -29.505874633789062, + "step": 2019 + }, + { + "epoch": 1.2566096423017108, + "grad_norm": 0.0002312797005288303, + "learning_rate": 3.2284462886122643e-06, + "logits/chosen": 1.7641077041625977, + "logits/rejected": 3.2676451206207275, + "logps/chosen": -574.0552978515625, + "logps/rejected": -958.4447631835938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.362613677978516, + "rewards/margins": 22.630130767822266, + "rewards/rejected": -28.99274444580078, + "step": 2020 + }, + { + "epoch": 1.2572317262830481, + "grad_norm": 16.431461334228516, + "learning_rate": 3.2272936837252195e-06, + "logits/chosen": 1.4729183912277222, + "logits/rejected": 4.335656642913818, + "logps/chosen": -618.861572265625, + "logps/rejected": -970.4544677734375, + "loss": 0.1198, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.866339683532715, + "rewards/margins": 20.629886627197266, + "rewards/rejected": -23.496227264404297, + "step": 2021 + }, + { + "epoch": 1.2578538102643857, + "grad_norm": 2.040773868560791, + "learning_rate": 3.2261410788381747e-06, + "logits/chosen": 0.7448826432228088, + "logits/rejected": 3.73989200592041, + "logps/chosen": -474.6575012207031, + "logps/rejected": -845.82080078125, + "loss": 0.0088, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.902726173400879, + "rewards/margins": 24.317073822021484, + "rewards/rejected": -30.219797134399414, + "step": 2022 + }, + { + "epoch": 1.258475894245723, + "grad_norm": 0.001415720907971263, + "learning_rate": 3.22498847395113e-06, + "logits/chosen": -3.1525511741638184, + "logits/rejected": 3.938098430633545, + "logps/chosen": -251.63796997070312, + "logps/rejected": -826.9188842773438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.161817789077759, + "rewards/margins": 26.74517059326172, + "rewards/rejected": -28.906986236572266, + "step": 2023 + }, + { + "epoch": 1.2590979782270606, + "grad_norm": 2.495192766189575, + "learning_rate": 3.223835869064085e-06, + "logits/chosen": 1.8843852281570435, + "logits/rejected": 3.638036012649536, + "logps/chosen": -599.0542602539062, + "logps/rejected": -920.1378784179688, + "loss": 0.0113, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.660336494445801, + "rewards/margins": 21.504348754882812, + "rewards/rejected": -29.164684295654297, + "step": 2024 + }, + { + "epoch": 1.2597200622083982, + "grad_norm": 0.10371783375740051, + "learning_rate": 3.2226832641770404e-06, + "logits/chosen": -3.003751754760742, + "logits/rejected": 2.675790548324585, + "logps/chosen": -427.89923095703125, + "logps/rejected": -978.2462158203125, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.760486125946045, + "rewards/margins": 27.6287841796875, + "rewards/rejected": -32.38926696777344, + "step": 2025 + }, + { + "epoch": 1.2603421461897355, + "grad_norm": 12.984769821166992, + "learning_rate": 3.221530659289996e-06, + "logits/chosen": 0.3211978077888489, + "logits/rejected": 2.3837990760803223, + "logps/chosen": -542.7652587890625, + "logps/rejected": -1028.8651123046875, + "loss": 0.048, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.041831970214844, + "rewards/margins": 29.209022521972656, + "rewards/rejected": -34.2508544921875, + "step": 2026 + }, + { + "epoch": 1.260964230171073, + "grad_norm": 0.0963178277015686, + "learning_rate": 3.2203780544029513e-06, + "logits/chosen": 1.8055074214935303, + "logits/rejected": 4.252056121826172, + "logps/chosen": -537.7628173828125, + "logps/rejected": -976.2918701171875, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.120723724365234, + "rewards/margins": 26.42084503173828, + "rewards/rejected": -34.54157257080078, + "step": 2027 + }, + { + "epoch": 1.2615863141524106, + "grad_norm": 0.031602680683135986, + "learning_rate": 3.2192254495159065e-06, + "logits/chosen": 0.5145872235298157, + "logits/rejected": 3.8570258617401123, + "logps/chosen": -490.4646301269531, + "logps/rejected": -889.964599609375, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.190765857696533, + "rewards/margins": 19.79231071472168, + "rewards/rejected": -23.983078002929688, + "step": 2028 + }, + { + "epoch": 1.262208398133748, + "grad_norm": 3.2892661094665527, + "learning_rate": 3.2180728446288617e-06, + "logits/chosen": 2.532592296600342, + "logits/rejected": 2.6494524478912354, + "logps/chosen": -501.0936279296875, + "logps/rejected": -779.4171752929688, + "loss": 0.0194, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.345937728881836, + "rewards/margins": 20.526443481445312, + "rewards/rejected": -31.872379302978516, + "step": 2029 + }, + { + "epoch": 1.2628304821150855, + "grad_norm": 0.1048092469573021, + "learning_rate": 3.216920239741817e-06, + "logits/chosen": 0.8485032320022583, + "logits/rejected": 2.9057791233062744, + "logps/chosen": -572.03857421875, + "logps/rejected": -965.0180053710938, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7810775637626648, + "rewards/margins": 28.03656005859375, + "rewards/rejected": -28.817636489868164, + "step": 2030 + }, + { + "epoch": 1.263452566096423, + "grad_norm": 0.04916305094957352, + "learning_rate": 3.215767634854772e-06, + "logits/chosen": 2.4191346168518066, + "logits/rejected": 5.252843379974365, + "logps/chosen": -629.9830932617188, + "logps/rejected": -1004.4718627929688, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.305938243865967, + "rewards/margins": 24.819896697998047, + "rewards/rejected": -31.125835418701172, + "step": 2031 + }, + { + "epoch": 1.2640746500777604, + "grad_norm": 4.426191298989579e-05, + "learning_rate": 3.2146150299677274e-06, + "logits/chosen": -0.6191399097442627, + "logits/rejected": 2.3437392711639404, + "logps/chosen": -436.3004150390625, + "logps/rejected": -949.069580078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.960773468017578, + "rewards/margins": 28.112300872802734, + "rewards/rejected": -37.07307052612305, + "step": 2032 + }, + { + "epoch": 1.264696734059098, + "grad_norm": 0.016834422945976257, + "learning_rate": 3.213462425080683e-06, + "logits/chosen": 1.7167022228240967, + "logits/rejected": 2.2886548042297363, + "logps/chosen": -659.6353759765625, + "logps/rejected": -891.0438842773438, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.525177001953125, + "rewards/margins": 16.286649703979492, + "rewards/rejected": -26.811824798583984, + "step": 2033 + }, + { + "epoch": 1.2653188180404356, + "grad_norm": 0.1283029019832611, + "learning_rate": 3.2123098201936383e-06, + "logits/chosen": -2.6232731342315674, + "logits/rejected": 2.2754905223846436, + "logps/chosen": -330.6903381347656, + "logps/rejected": -941.403564453125, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9191455841064453, + "rewards/margins": 31.921056747436523, + "rewards/rejected": -33.84020233154297, + "step": 2034 + }, + { + "epoch": 1.265940902021773, + "grad_norm": 2.896956357290037e-05, + "learning_rate": 3.2111572153065935e-06, + "logits/chosen": 0.8080716729164124, + "logits/rejected": 2.0364251136779785, + "logps/chosen": -618.687255859375, + "logps/rejected": -909.19580078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.720874309539795, + "rewards/margins": 23.601850509643555, + "rewards/rejected": -29.32272720336914, + "step": 2035 + }, + { + "epoch": 1.2665629860031105, + "grad_norm": 0.4231943190097809, + "learning_rate": 3.2100046104195487e-06, + "logits/chosen": 0.10635495185852051, + "logits/rejected": 2.2073330879211426, + "logps/chosen": -602.9811401367188, + "logps/rejected": -936.6310424804688, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.511858940124512, + "rewards/margins": 18.06280517578125, + "rewards/rejected": -29.574663162231445, + "step": 2036 + }, + { + "epoch": 1.267185069984448, + "grad_norm": 0.10137329250574112, + "learning_rate": 3.208852005532504e-06, + "logits/chosen": 0.14811724424362183, + "logits/rejected": 3.2426066398620605, + "logps/chosen": -468.83148193359375, + "logps/rejected": -961.1331176757812, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.441210746765137, + "rewards/margins": 29.445789337158203, + "rewards/rejected": -35.887001037597656, + "step": 2037 + }, + { + "epoch": 1.2678071539657854, + "grad_norm": 0.0012554771965369582, + "learning_rate": 3.207699400645459e-06, + "logits/chosen": -1.054654598236084, + "logits/rejected": 2.8366193771362305, + "logps/chosen": -340.777099609375, + "logps/rejected": -852.3719482421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.394537925720215, + "rewards/margins": 27.311038970947266, + "rewards/rejected": -31.705581665039062, + "step": 2038 + }, + { + "epoch": 1.268429237947123, + "grad_norm": 0.002670434070751071, + "learning_rate": 3.2065467957584144e-06, + "logits/chosen": 0.28375720977783203, + "logits/rejected": 4.042474269866943, + "logps/chosen": -600.4718627929688, + "logps/rejected": -1158.3646240234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.513896942138672, + "rewards/margins": 30.927602767944336, + "rewards/rejected": -37.44150161743164, + "step": 2039 + }, + { + "epoch": 1.2690513219284603, + "grad_norm": 3.808347901212983e-05, + "learning_rate": 3.2053941908713696e-06, + "logits/chosen": 1.5050125122070312, + "logits/rejected": 4.318361759185791, + "logps/chosen": -664.9024658203125, + "logps/rejected": -1053.49609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.091887950897217, + "rewards/margins": 28.934024810791016, + "rewards/rejected": -35.025909423828125, + "step": 2040 + }, + { + "epoch": 1.2696734059097978, + "grad_norm": 0.009404631331562996, + "learning_rate": 3.2042415859843253e-06, + "logits/chosen": -1.170180320739746, + "logits/rejected": 2.024054765701294, + "logps/chosen": -408.93048095703125, + "logps/rejected": -817.9940185546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.462571620941162, + "rewards/margins": 18.640281677246094, + "rewards/rejected": -25.102855682373047, + "step": 2041 + }, + { + "epoch": 1.2702954898911352, + "grad_norm": 1.738207538437564e-05, + "learning_rate": 3.2030889810972796e-06, + "logits/chosen": 2.111595392227173, + "logits/rejected": 4.570797443389893, + "logps/chosen": -592.1729125976562, + "logps/rejected": -1171.582763671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.525852203369141, + "rewards/margins": 41.40595626831055, + "rewards/rejected": -48.93180847167969, + "step": 2042 + }, + { + "epoch": 1.2709175738724727, + "grad_norm": 1.4109896421432495, + "learning_rate": 3.2019363762102353e-06, + "logits/chosen": 0.4325029253959656, + "logits/rejected": 3.582610607147217, + "logps/chosen": -439.2037658691406, + "logps/rejected": -752.7412109375, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.916749477386475, + "rewards/margins": 15.604118347167969, + "rewards/rejected": -21.5208683013916, + "step": 2043 + }, + { + "epoch": 1.2715396578538103, + "grad_norm": 0.002375447889789939, + "learning_rate": 3.2007837713231905e-06, + "logits/chosen": 0.35185641050338745, + "logits/rejected": 2.958373785018921, + "logps/chosen": -416.5084228515625, + "logps/rejected": -772.28125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.487848281860352, + "rewards/margins": 21.232372283935547, + "rewards/rejected": -26.72022247314453, + "step": 2044 + }, + { + "epoch": 1.2721617418351476, + "grad_norm": 0.1174873635172844, + "learning_rate": 3.1996311664361457e-06, + "logits/chosen": -0.5361908078193665, + "logits/rejected": 3.4455199241638184, + "logps/chosen": -454.63323974609375, + "logps/rejected": -859.8018798828125, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.4316816329956055, + "rewards/margins": 16.827882766723633, + "rewards/rejected": -21.259565353393555, + "step": 2045 + }, + { + "epoch": 1.2727838258164852, + "grad_norm": 0.022169746458530426, + "learning_rate": 3.198478561549101e-06, + "logits/chosen": 1.50562584400177, + "logits/rejected": 2.5826327800750732, + "logps/chosen": -629.9881591796875, + "logps/rejected": -940.7921142578125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.677554607391357, + "rewards/margins": 23.858470916748047, + "rewards/rejected": -29.536026000976562, + "step": 2046 + }, + { + "epoch": 1.2734059097978228, + "grad_norm": 0.00024391288752667606, + "learning_rate": 3.197325956662056e-06, + "logits/chosen": -0.16936111450195312, + "logits/rejected": 3.6262879371643066, + "logps/chosen": -429.77386474609375, + "logps/rejected": -943.689453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.290872573852539, + "rewards/margins": 28.654075622558594, + "rewards/rejected": -32.944950103759766, + "step": 2047 + }, + { + "epoch": 1.27402799377916, + "grad_norm": 0.09606008976697922, + "learning_rate": 3.1961733517750114e-06, + "logits/chosen": -1.716057538986206, + "logits/rejected": 2.7073190212249756, + "logps/chosen": -440.507080078125, + "logps/rejected": -845.6986083984375, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.6731438636779785, + "rewards/margins": 22.331708908081055, + "rewards/rejected": -28.004854202270508, + "step": 2048 + }, + { + "epoch": 1.2746500777604977, + "grad_norm": 0.04162227362394333, + "learning_rate": 3.1950207468879666e-06, + "logits/chosen": -2.305849075317383, + "logits/rejected": 3.7094991207122803, + "logps/chosen": -292.4382019042969, + "logps/rejected": -912.4019775390625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.582938194274902, + "rewards/margins": 30.001415252685547, + "rewards/rejected": -34.5843505859375, + "step": 2049 + }, + { + "epoch": 1.2752721617418352, + "grad_norm": 7.29586124420166, + "learning_rate": 3.1938681420009223e-06, + "logits/chosen": 0.8130073547363281, + "logits/rejected": 2.7934420108795166, + "logps/chosen": -439.5497131347656, + "logps/rejected": -721.2268676757812, + "loss": 0.0395, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.392171859741211, + "rewards/margins": 15.094999313354492, + "rewards/rejected": -24.487171173095703, + "step": 2050 + }, + { + "epoch": 1.2758942457231726, + "grad_norm": 0.008065351285040379, + "learning_rate": 3.1927155371138775e-06, + "logits/chosen": 1.056755781173706, + "logits/rejected": 4.707803726196289, + "logps/chosen": -500.83953857421875, + "logps/rejected": -960.3767700195312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.4384870529174805, + "rewards/margins": 24.951709747314453, + "rewards/rejected": -31.390199661254883, + "step": 2051 + }, + { + "epoch": 1.2765163297045101, + "grad_norm": 43.08720779418945, + "learning_rate": 3.1915629322268327e-06, + "logits/chosen": 2.101077079772949, + "logits/rejected": 3.4182801246643066, + "logps/chosen": -597.8524169921875, + "logps/rejected": -868.7628784179688, + "loss": 1.6764, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.14328670501709, + "rewards/margins": 16.80899429321289, + "rewards/rejected": -22.952281951904297, + "step": 2052 + }, + { + "epoch": 1.2771384136858477, + "grad_norm": 0.9350723624229431, + "learning_rate": 3.190410327339788e-06, + "logits/chosen": 0.829081118106842, + "logits/rejected": 3.3746399879455566, + "logps/chosen": -456.2769470214844, + "logps/rejected": -832.5189208984375, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.106916427612305, + "rewards/margins": 25.035396575927734, + "rewards/rejected": -32.142311096191406, + "step": 2053 + }, + { + "epoch": 1.277760497667185, + "grad_norm": 1.491163730621338, + "learning_rate": 3.189257722452743e-06, + "logits/chosen": -0.5811960101127625, + "logits/rejected": 3.256408214569092, + "logps/chosen": -448.40478515625, + "logps/rejected": -795.5897827148438, + "loss": 0.0159, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.50742769241333, + "rewards/margins": 18.371578216552734, + "rewards/rejected": -24.879005432128906, + "step": 2054 + }, + { + "epoch": 1.2783825816485226, + "grad_norm": 0.00022475777950603515, + "learning_rate": 3.1881051175656984e-06, + "logits/chosen": -2.1531898975372314, + "logits/rejected": 1.8872551918029785, + "logps/chosen": -377.0928039550781, + "logps/rejected": -985.4249267578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.719932556152344, + "rewards/margins": 31.08294677734375, + "rewards/rejected": -37.802879333496094, + "step": 2055 + }, + { + "epoch": 1.2790046656298601, + "grad_norm": 0.0981423407793045, + "learning_rate": 3.1869525126786536e-06, + "logits/chosen": 0.9427670240402222, + "logits/rejected": 3.304424285888672, + "logps/chosen": -557.6295166015625, + "logps/rejected": -969.7200927734375, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.98606014251709, + "rewards/margins": 25.926830291748047, + "rewards/rejected": -32.91289138793945, + "step": 2056 + }, + { + "epoch": 1.2796267496111975, + "grad_norm": 0.7095301151275635, + "learning_rate": 3.1857999077916093e-06, + "logits/chosen": -1.1271562576293945, + "logits/rejected": 3.539489984512329, + "logps/chosen": -500.1939697265625, + "logps/rejected": -919.2050170898438, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.236118316650391, + "rewards/margins": 19.31410026550293, + "rewards/rejected": -23.550216674804688, + "step": 2057 + }, + { + "epoch": 1.280248833592535, + "grad_norm": 0.01117491815239191, + "learning_rate": 3.1846473029045645e-06, + "logits/chosen": 0.3624744415283203, + "logits/rejected": 2.193741798400879, + "logps/chosen": -464.18096923828125, + "logps/rejected": -894.273681640625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.566399574279785, + "rewards/margins": 26.122310638427734, + "rewards/rejected": -32.68871307373047, + "step": 2058 + }, + { + "epoch": 1.2808709175738724, + "grad_norm": 0.6809660196304321, + "learning_rate": 3.1834946980175197e-06, + "logits/chosen": -0.000639304518699646, + "logits/rejected": 1.8177388906478882, + "logps/chosen": -540.7205810546875, + "logps/rejected": -900.8763427734375, + "loss": 0.0072, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.371767044067383, + "rewards/margins": 22.083099365234375, + "rewards/rejected": -32.45486831665039, + "step": 2059 + }, + { + "epoch": 1.28149300155521, + "grad_norm": 0.001697477768175304, + "learning_rate": 3.182342093130475e-06, + "logits/chosen": -0.06266975402832031, + "logits/rejected": 3.489635467529297, + "logps/chosen": -470.419677734375, + "logps/rejected": -1003.0845947265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.994956970214844, + "rewards/margins": 27.230785369873047, + "rewards/rejected": -35.22574234008789, + "step": 2060 + }, + { + "epoch": 1.2821150855365475, + "grad_norm": 0.18616902828216553, + "learning_rate": 3.18118948824343e-06, + "logits/chosen": 1.1356843709945679, + "logits/rejected": 4.710424423217773, + "logps/chosen": -583.5643920898438, + "logps/rejected": -959.2979125976562, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.797614097595215, + "rewards/margins": 23.102134704589844, + "rewards/rejected": -27.89974594116211, + "step": 2061 + }, + { + "epoch": 1.2827371695178849, + "grad_norm": 0.047538481652736664, + "learning_rate": 3.1800368833563854e-06, + "logits/chosen": -1.8425778150558472, + "logits/rejected": 2.8900485038757324, + "logps/chosen": -361.5750732421875, + "logps/rejected": -856.4237670898438, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.557888984680176, + "rewards/margins": 26.233121871948242, + "rewards/rejected": -31.791013717651367, + "step": 2062 + }, + { + "epoch": 1.2833592534992224, + "grad_norm": 0.30076029896736145, + "learning_rate": 3.1788842784693406e-06, + "logits/chosen": 1.5041686296463013, + "logits/rejected": 3.6929330825805664, + "logps/chosen": -591.1065673828125, + "logps/rejected": -993.000244140625, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.452116012573242, + "rewards/margins": 27.180265426635742, + "rewards/rejected": -35.632381439208984, + "step": 2063 + }, + { + "epoch": 1.2839813374805598, + "grad_norm": 0.7477201819419861, + "learning_rate": 3.177731673582296e-06, + "logits/chosen": -0.05225837230682373, + "logits/rejected": 4.111126899719238, + "logps/chosen": -477.04266357421875, + "logps/rejected": -881.5646362304688, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.437117099761963, + "rewards/margins": 25.369081497192383, + "rewards/rejected": -31.80620002746582, + "step": 2064 + }, + { + "epoch": 1.2846034214618973, + "grad_norm": 3.26867825606314e-06, + "learning_rate": 3.1765790686952515e-06, + "logits/chosen": 0.5494682788848877, + "logits/rejected": 3.7316155433654785, + "logps/chosen": -527.177001953125, + "logps/rejected": -987.5484619140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.781964302062988, + "rewards/margins": 29.342601776123047, + "rewards/rejected": -36.12456512451172, + "step": 2065 + }, + { + "epoch": 1.2852255054432349, + "grad_norm": 1.0592074431770016e-05, + "learning_rate": 3.1754264638082067e-06, + "logits/chosen": 1.2520337104797363, + "logits/rejected": 5.836635589599609, + "logps/chosen": -480.50396728515625, + "logps/rejected": -1135.91796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.007936000823975, + "rewards/margins": 32.592288970947266, + "rewards/rejected": -36.600223541259766, + "step": 2066 + }, + { + "epoch": 1.2858475894245722, + "grad_norm": 2.0127735137939453, + "learning_rate": 3.174273858921162e-06, + "logits/chosen": 2.7770450115203857, + "logits/rejected": 3.0169568061828613, + "logps/chosen": -601.7747802734375, + "logps/rejected": -867.8875732421875, + "loss": 0.0089, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.067495346069336, + "rewards/margins": 22.419506072998047, + "rewards/rejected": -29.487001419067383, + "step": 2067 + }, + { + "epoch": 1.2864696734059098, + "grad_norm": 0.13479429483413696, + "learning_rate": 3.173121254034117e-06, + "logits/chosen": 1.5869368314743042, + "logits/rejected": 4.738080978393555, + "logps/chosen": -596.6153564453125, + "logps/rejected": -1134.51904296875, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.250545501708984, + "rewards/margins": 30.297420501708984, + "rewards/rejected": -39.5479621887207, + "step": 2068 + }, + { + "epoch": 1.2870917573872473, + "grad_norm": 6.155482769012451, + "learning_rate": 3.1719686491470724e-06, + "logits/chosen": -1.9126255512237549, + "logits/rejected": 1.934863805770874, + "logps/chosen": -307.49249267578125, + "logps/rejected": -780.62939453125, + "loss": 0.0281, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.279009819030762, + "rewards/margins": 22.31053924560547, + "rewards/rejected": -27.589548110961914, + "step": 2069 + }, + { + "epoch": 1.2877138413685847, + "grad_norm": 0.27120184898376465, + "learning_rate": 3.1708160442600276e-06, + "logits/chosen": 2.8405706882476807, + "logits/rejected": 4.2547526359558105, + "logps/chosen": -458.87255859375, + "logps/rejected": -801.9660034179688, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.665666580200195, + "rewards/margins": 23.629499435424805, + "rewards/rejected": -29.295169830322266, + "step": 2070 + }, + { + "epoch": 1.2883359253499223, + "grad_norm": 36.809181213378906, + "learning_rate": 3.169663439372983e-06, + "logits/chosen": -3.2923147678375244, + "logits/rejected": 3.318225383758545, + "logps/chosen": -266.0099182128906, + "logps/rejected": -859.7506103515625, + "loss": 0.9804, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.480389595031738, + "rewards/margins": 24.918094635009766, + "rewards/rejected": -32.39848327636719, + "step": 2071 + }, + { + "epoch": 1.2889580093312598, + "grad_norm": 0.0019073631847277284, + "learning_rate": 3.1685108344859385e-06, + "logits/chosen": 2.2889578342437744, + "logits/rejected": 3.743168354034424, + "logps/chosen": -539.8269653320312, + "logps/rejected": -848.6666870117188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.808744430541992, + "rewards/margins": 21.052974700927734, + "rewards/rejected": -26.86172103881836, + "step": 2072 + }, + { + "epoch": 1.2895800933125972, + "grad_norm": 0.00458146259188652, + "learning_rate": 3.1673582295988937e-06, + "logits/chosen": -0.6084625720977783, + "logits/rejected": 2.6312525272369385, + "logps/chosen": -308.7406005859375, + "logps/rejected": -774.83544921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.150224208831787, + "rewards/margins": 26.105060577392578, + "rewards/rejected": -31.255285263061523, + "step": 2073 + }, + { + "epoch": 1.2902021772939347, + "grad_norm": 7.325793266296387, + "learning_rate": 3.166205624711849e-06, + "logits/chosen": 0.36518922448158264, + "logits/rejected": 1.6637272834777832, + "logps/chosen": -547.444580078125, + "logps/rejected": -901.6878662109375, + "loss": 0.0313, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.453319549560547, + "rewards/margins": 28.862653732299805, + "rewards/rejected": -37.315975189208984, + "step": 2074 + }, + { + "epoch": 1.2908242612752723, + "grad_norm": 25.79266357421875, + "learning_rate": 3.165053019824804e-06, + "logits/chosen": -0.021634042263031006, + "logits/rejected": 2.2119054794311523, + "logps/chosen": -526.3079223632812, + "logps/rejected": -965.0336303710938, + "loss": 0.2514, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.492626190185547, + "rewards/margins": 30.373445510864258, + "rewards/rejected": -38.86606979370117, + "step": 2075 + }, + { + "epoch": 1.2914463452566096, + "grad_norm": 0.0180280189961195, + "learning_rate": 3.1639004149377594e-06, + "logits/chosen": 2.165846824645996, + "logits/rejected": 4.180533409118652, + "logps/chosen": -627.2410278320312, + "logps/rejected": -1168.7398681640625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.25662899017334, + "rewards/margins": 32.789207458496094, + "rewards/rejected": -44.045833587646484, + "step": 2076 + }, + { + "epoch": 1.2920684292379472, + "grad_norm": 0.0044320556335151196, + "learning_rate": 3.1627478100507146e-06, + "logits/chosen": -1.9515249729156494, + "logits/rejected": 2.808650016784668, + "logps/chosen": -472.1334228515625, + "logps/rejected": -1063.12548828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.644009590148926, + "rewards/margins": 31.643199920654297, + "rewards/rejected": -42.287208557128906, + "step": 2077 + }, + { + "epoch": 1.2926905132192845, + "grad_norm": 0.0055711762979626656, + "learning_rate": 3.16159520516367e-06, + "logits/chosen": 0.9135769605636597, + "logits/rejected": 3.6809005737304688, + "logps/chosen": -445.4312744140625, + "logps/rejected": -779.880859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.659487247467041, + "rewards/margins": 16.80797004699707, + "rewards/rejected": -24.467456817626953, + "step": 2078 + }, + { + "epoch": 1.293312597200622, + "grad_norm": 2.7579030990600586, + "learning_rate": 3.1604426002766255e-06, + "logits/chosen": -2.535278797149658, + "logits/rejected": 2.1650233268737793, + "logps/chosen": -480.51025390625, + "logps/rejected": -1007.449462890625, + "loss": 0.0858, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.463523864746094, + "rewards/margins": 27.31024742126465, + "rewards/rejected": -38.773773193359375, + "step": 2079 + }, + { + "epoch": 1.2939346811819596, + "grad_norm": 1.0793917226692429e-07, + "learning_rate": 3.1592899953895807e-06, + "logits/chosen": 1.1380003690719604, + "logits/rejected": 4.078077793121338, + "logps/chosen": -603.9254150390625, + "logps/rejected": -1046.822998046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.813261032104492, + "rewards/margins": 32.562355041503906, + "rewards/rejected": -41.375614166259766, + "step": 2080 + }, + { + "epoch": 1.294556765163297, + "grad_norm": 0.0008141865837387741, + "learning_rate": 3.158137390502536e-06, + "logits/chosen": 0.8326452970504761, + "logits/rejected": 2.2659921646118164, + "logps/chosen": -609.6137084960938, + "logps/rejected": -971.3287963867188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.248361587524414, + "rewards/margins": 27.811277389526367, + "rewards/rejected": -40.05963897705078, + "step": 2081 + }, + { + "epoch": 1.2951788491446345, + "grad_norm": 34.80479049682617, + "learning_rate": 3.156984785615491e-06, + "logits/chosen": 1.3409345149993896, + "logits/rejected": 3.883260726928711, + "logps/chosen": -566.5382080078125, + "logps/rejected": -915.5721435546875, + "loss": 1.045, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.75674057006836, + "rewards/margins": 24.255878448486328, + "rewards/rejected": -34.01261901855469, + "step": 2082 + }, + { + "epoch": 1.2958009331259719, + "grad_norm": 0.1624457836151123, + "learning_rate": 3.1558321807284464e-06, + "logits/chosen": 1.185449481010437, + "logits/rejected": 1.4735817909240723, + "logps/chosen": -448.03564453125, + "logps/rejected": -671.8951416015625, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.0474700927734375, + "rewards/margins": 20.61285400390625, + "rewards/rejected": -25.66032600402832, + "step": 2083 + }, + { + "epoch": 1.2964230171073094, + "grad_norm": 5.242217957857065e-05, + "learning_rate": 3.1546795758414016e-06, + "logits/chosen": -3.8523991107940674, + "logits/rejected": 2.827188491821289, + "logps/chosen": -233.0387725830078, + "logps/rejected": -933.7574462890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.143452167510986, + "rewards/margins": 33.942142486572266, + "rewards/rejected": -39.085594177246094, + "step": 2084 + }, + { + "epoch": 1.297045101088647, + "grad_norm": 3.0982849352767516e-07, + "learning_rate": 3.153526970954357e-06, + "logits/chosen": 2.814451217651367, + "logits/rejected": 5.035799026489258, + "logps/chosen": -613.6903076171875, + "logps/rejected": -1098.580078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.112676620483398, + "rewards/margins": 32.56719207763672, + "rewards/rejected": -42.679874420166016, + "step": 2085 + }, + { + "epoch": 1.2976671850699844, + "grad_norm": 0.005450894124805927, + "learning_rate": 3.152374366067312e-06, + "logits/chosen": -0.5371922254562378, + "logits/rejected": 3.8709936141967773, + "logps/chosen": -378.77783203125, + "logps/rejected": -962.4693603515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.809730529785156, + "rewards/margins": 30.568540573120117, + "rewards/rejected": -37.378273010253906, + "step": 2086 + }, + { + "epoch": 1.298289269051322, + "grad_norm": 4.169853687286377, + "learning_rate": 3.1512217611802677e-06, + "logits/chosen": 0.9278308153152466, + "logits/rejected": 4.105837345123291, + "logps/chosen": -345.00360107421875, + "logps/rejected": -775.925048828125, + "loss": 0.0248, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.518424987792969, + "rewards/margins": 26.84881591796875, + "rewards/rejected": -32.36724090576172, + "step": 2087 + }, + { + "epoch": 1.2989113530326595, + "grad_norm": 0.3264186978340149, + "learning_rate": 3.150069156293223e-06, + "logits/chosen": 1.8526872396469116, + "logits/rejected": 3.779759168624878, + "logps/chosen": -618.3271484375, + "logps/rejected": -1036.8548583984375, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.674715995788574, + "rewards/margins": 28.104290008544922, + "rewards/rejected": -37.77900695800781, + "step": 2088 + }, + { + "epoch": 1.2995334370139968, + "grad_norm": 7.297713756561279, + "learning_rate": 3.148916551406178e-06, + "logits/chosen": 1.2116997241973877, + "logits/rejected": 2.26175594329834, + "logps/chosen": -562.7557983398438, + "logps/rejected": -918.6259155273438, + "loss": 0.0187, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.19647216796875, + "rewards/margins": 26.213180541992188, + "rewards/rejected": -33.40965270996094, + "step": 2089 + }, + { + "epoch": 1.3001555209953344, + "grad_norm": 0.004468827974051237, + "learning_rate": 3.1477639465191334e-06, + "logits/chosen": -1.1220444440841675, + "logits/rejected": 3.41408109664917, + "logps/chosen": -421.54791259765625, + "logps/rejected": -948.6719970703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.602625846862793, + "rewards/margins": 27.19068145751953, + "rewards/rejected": -33.793304443359375, + "step": 2090 + }, + { + "epoch": 1.300777604976672, + "grad_norm": 0.007508592680096626, + "learning_rate": 3.1466113416320886e-06, + "logits/chosen": -0.01274651288986206, + "logits/rejected": 2.5498158931732178, + "logps/chosen": -466.1578369140625, + "logps/rejected": -758.73583984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.210990905761719, + "rewards/margins": 18.896699905395508, + "rewards/rejected": -28.107690811157227, + "step": 2091 + }, + { + "epoch": 1.3013996889580093, + "grad_norm": 2.6341097354888916, + "learning_rate": 3.145458736745044e-06, + "logits/chosen": 1.0074734687805176, + "logits/rejected": 3.3519134521484375, + "logps/chosen": -627.6580810546875, + "logps/rejected": -977.8060302734375, + "loss": 0.0097, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.129983425140381, + "rewards/margins": 24.711997985839844, + "rewards/rejected": -31.841981887817383, + "step": 2092 + }, + { + "epoch": 1.3020217729393468, + "grad_norm": 0.10764219611883163, + "learning_rate": 3.144306131857999e-06, + "logits/chosen": -2.0879392623901367, + "logits/rejected": 1.9700127840042114, + "logps/chosen": -452.25634765625, + "logps/rejected": -1075.676025390625, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.861340522766113, + "rewards/margins": 39.19294738769531, + "rewards/rejected": -47.054283142089844, + "step": 2093 + }, + { + "epoch": 1.3026438569206844, + "grad_norm": 0.0006069698138162494, + "learning_rate": 3.1431535269709547e-06, + "logits/chosen": -0.44354957342147827, + "logits/rejected": 3.16389536857605, + "logps/chosen": -556.4852294921875, + "logps/rejected": -1139.32373046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.457627296447754, + "rewards/margins": 37.76030349731445, + "rewards/rejected": -48.217933654785156, + "step": 2094 + }, + { + "epoch": 1.3032659409020217, + "grad_norm": 1.178689956665039, + "learning_rate": 3.14200092208391e-06, + "logits/chosen": -0.7547582387924194, + "logits/rejected": 3.3931565284729004, + "logps/chosen": -524.9329833984375, + "logps/rejected": -1077.71826171875, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.029295921325684, + "rewards/margins": 28.463600158691406, + "rewards/rejected": -39.492897033691406, + "step": 2095 + }, + { + "epoch": 1.3038880248833593, + "grad_norm": 14.788235664367676, + "learning_rate": 3.140848317196865e-06, + "logits/chosen": -0.28261828422546387, + "logits/rejected": 1.6292152404785156, + "logps/chosen": -532.36962890625, + "logps/rejected": -860.1507568359375, + "loss": 0.4315, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.120588779449463, + "rewards/margins": 19.741498947143555, + "rewards/rejected": -26.86208724975586, + "step": 2096 + }, + { + "epoch": 1.3045101088646969, + "grad_norm": 30.001577377319336, + "learning_rate": 3.1396957123098204e-06, + "logits/chosen": 1.9691777229309082, + "logits/rejected": 3.434938430786133, + "logps/chosen": -684.923828125, + "logps/rejected": -1013.4309692382812, + "loss": 0.6869, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.90506362915039, + "rewards/margins": 24.885982513427734, + "rewards/rejected": -37.79104232788086, + "step": 2097 + }, + { + "epoch": 1.3051321928460342, + "grad_norm": 0.1871831715106964, + "learning_rate": 3.1385431074227756e-06, + "logits/chosen": 2.1880619525909424, + "logits/rejected": 1.8200218677520752, + "logps/chosen": -679.3081665039062, + "logps/rejected": -855.6882934570312, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.308370590209961, + "rewards/margins": 20.151187896728516, + "rewards/rejected": -32.45956039428711, + "step": 2098 + }, + { + "epoch": 1.3057542768273718, + "grad_norm": 0.2509006857872009, + "learning_rate": 3.137390502535731e-06, + "logits/chosen": 1.5480281114578247, + "logits/rejected": 2.9846410751342773, + "logps/chosen": -634.6478881835938, + "logps/rejected": -973.6765747070312, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.49044418334961, + "rewards/margins": 22.26751708984375, + "rewards/rejected": -32.757957458496094, + "step": 2099 + }, + { + "epoch": 1.3063763608087091, + "grad_norm": 0.05295734107494354, + "learning_rate": 3.136237897648686e-06, + "logits/chosen": 2.5485236644744873, + "logits/rejected": 3.1285977363586426, + "logps/chosen": -528.86865234375, + "logps/rejected": -914.2305297851562, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.828814506530762, + "rewards/margins": 29.736709594726562, + "rewards/rejected": -37.565521240234375, + "step": 2100 + }, + { + "epoch": 1.3069984447900467, + "grad_norm": 0.00011729019752237946, + "learning_rate": 3.1350852927616417e-06, + "logits/chosen": 1.2746424674987793, + "logits/rejected": 4.064460754394531, + "logps/chosen": -570.7022705078125, + "logps/rejected": -1051.89892578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.794236660003662, + "rewards/margins": 36.453895568847656, + "rewards/rejected": -44.24812698364258, + "step": 2101 + }, + { + "epoch": 1.307620528771384, + "grad_norm": 0.013876068405807018, + "learning_rate": 3.133932687874597e-06, + "logits/chosen": 1.3518435955047607, + "logits/rejected": 1.6746224164962769, + "logps/chosen": -567.3868408203125, + "logps/rejected": -852.9873657226562, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.235878944396973, + "rewards/margins": 25.40787696838379, + "rewards/rejected": -37.64375686645508, + "step": 2102 + }, + { + "epoch": 1.3082426127527216, + "grad_norm": 0.7705238461494446, + "learning_rate": 3.132780082987552e-06, + "logits/chosen": 0.999545156955719, + "logits/rejected": 4.350939750671387, + "logps/chosen": -371.6129150390625, + "logps/rejected": -756.1754150390625, + "loss": 0.0098, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.465909481048584, + "rewards/margins": 21.080066680908203, + "rewards/rejected": -26.545976638793945, + "step": 2103 + }, + { + "epoch": 1.3088646967340591, + "grad_norm": 1.206705927848816, + "learning_rate": 3.1316274781005074e-06, + "logits/chosen": 0.7771980166435242, + "logits/rejected": 2.899505853652954, + "logps/chosen": -496.0504455566406, + "logps/rejected": -950.00341796875, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.855134010314941, + "rewards/margins": 29.67104721069336, + "rewards/rejected": -36.52618408203125, + "step": 2104 + }, + { + "epoch": 1.3094867807153965, + "grad_norm": 0.00644198153167963, + "learning_rate": 3.1304748732134626e-06, + "logits/chosen": 0.1656215786933899, + "logits/rejected": 4.405486106872559, + "logps/chosen": -458.61248779296875, + "logps/rejected": -1091.378662109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.653053283691406, + "rewards/margins": 34.8778076171875, + "rewards/rejected": -45.530860900878906, + "step": 2105 + }, + { + "epoch": 1.310108864696734, + "grad_norm": 7.407296657562256, + "learning_rate": 3.129322268326418e-06, + "logits/chosen": -0.6084730625152588, + "logits/rejected": 3.4701526165008545, + "logps/chosen": -413.8167419433594, + "logps/rejected": -819.1229858398438, + "loss": 0.0395, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.87021255493164, + "rewards/margins": 20.633195877075195, + "rewards/rejected": -29.503406524658203, + "step": 2106 + }, + { + "epoch": 1.3107309486780716, + "grad_norm": 9.020928700920194e-06, + "learning_rate": 3.128169663439373e-06, + "logits/chosen": -0.03172177076339722, + "logits/rejected": 2.916116237640381, + "logps/chosen": -515.1375122070312, + "logps/rejected": -1125.2947998046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.552846908569336, + "rewards/margins": 36.16227722167969, + "rewards/rejected": -48.715126037597656, + "step": 2107 + }, + { + "epoch": 1.311353032659409, + "grad_norm": 0.9298233389854431, + "learning_rate": 3.1270170585523287e-06, + "logits/chosen": 2.7141852378845215, + "logits/rejected": 1.5818345546722412, + "logps/chosen": -704.413818359375, + "logps/rejected": -827.8814697265625, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.471658706665039, + "rewards/margins": 18.957651138305664, + "rewards/rejected": -29.429309844970703, + "step": 2108 + }, + { + "epoch": 1.3119751166407465, + "grad_norm": 0.10091857612133026, + "learning_rate": 3.125864453665284e-06, + "logits/chosen": 2.0753657817840576, + "logits/rejected": 2.2879538536071777, + "logps/chosen": -658.8688354492188, + "logps/rejected": -899.2041015625, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.374801635742188, + "rewards/margins": 26.37302017211914, + "rewards/rejected": -39.74782180786133, + "step": 2109 + }, + { + "epoch": 1.312597200622084, + "grad_norm": 0.4145754873752594, + "learning_rate": 3.124711848778239e-06, + "logits/chosen": -0.41198205947875977, + "logits/rejected": 3.2668869495391846, + "logps/chosen": -540.3209838867188, + "logps/rejected": -990.2476806640625, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.664708137512207, + "rewards/margins": 28.692848205566406, + "rewards/rejected": -38.35755920410156, + "step": 2110 + }, + { + "epoch": 1.3132192846034214, + "grad_norm": 0.03950352221727371, + "learning_rate": 3.1235592438911944e-06, + "logits/chosen": 2.346403121948242, + "logits/rejected": 3.843334436416626, + "logps/chosen": -665.7584838867188, + "logps/rejected": -1120.620361328125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.084131240844727, + "rewards/margins": 28.030925750732422, + "rewards/rejected": -41.11505889892578, + "step": 2111 + }, + { + "epoch": 1.313841368584759, + "grad_norm": 0.01574692316353321, + "learning_rate": 3.1224066390041496e-06, + "logits/chosen": 0.11658996343612671, + "logits/rejected": 2.8681399822235107, + "logps/chosen": -657.4076538085938, + "logps/rejected": -1048.323486328125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.495312690734863, + "rewards/margins": 27.018035888671875, + "rewards/rejected": -34.51334762573242, + "step": 2112 + }, + { + "epoch": 1.3144634525660965, + "grad_norm": 0.24601449072360992, + "learning_rate": 3.121254034117105e-06, + "logits/chosen": -1.321711540222168, + "logits/rejected": 0.7253201603889465, + "logps/chosen": -473.1822509765625, + "logps/rejected": -928.5146484375, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.915327548980713, + "rewards/margins": 29.833887100219727, + "rewards/rejected": -36.74921417236328, + "step": 2113 + }, + { + "epoch": 1.3150855365474339, + "grad_norm": 6.262076567509212e-06, + "learning_rate": 3.12010142923006e-06, + "logits/chosen": 2.3087410926818848, + "logits/rejected": 4.198649883270264, + "logps/chosen": -656.1389770507812, + "logps/rejected": -1184.9796142578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.827698707580566, + "rewards/margins": 35.55638885498047, + "rewards/rejected": -48.38408660888672, + "step": 2114 + }, + { + "epoch": 1.3157076205287714, + "grad_norm": 0.3786371648311615, + "learning_rate": 3.1189488243430153e-06, + "logits/chosen": 1.2803771495819092, + "logits/rejected": 3.8872060775756836, + "logps/chosen": -535.787841796875, + "logps/rejected": -1055.6212158203125, + "loss": 0.0093, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.432448863983154, + "rewards/margins": 30.718673706054688, + "rewards/rejected": -38.151123046875, + "step": 2115 + }, + { + "epoch": 1.316329704510109, + "grad_norm": 0.00028373984969221056, + "learning_rate": 3.117796219455971e-06, + "logits/chosen": 0.8943437933921814, + "logits/rejected": 2.8500447273254395, + "logps/chosen": -649.5738525390625, + "logps/rejected": -1057.764892578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.27314567565918, + "rewards/margins": 30.976238250732422, + "rewards/rejected": -42.24938201904297, + "step": 2116 + }, + { + "epoch": 1.3169517884914463, + "grad_norm": 24.275968551635742, + "learning_rate": 3.116643614568926e-06, + "logits/chosen": -0.27282553911209106, + "logits/rejected": 1.769002914428711, + "logps/chosen": -585.8446044921875, + "logps/rejected": -944.812255859375, + "loss": 0.3463, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.907154083251953, + "rewards/margins": 23.340641021728516, + "rewards/rejected": -35.24779510498047, + "step": 2117 + }, + { + "epoch": 1.317573872472784, + "grad_norm": 0.0063476222567260265, + "learning_rate": 3.1154910096818814e-06, + "logits/chosen": -0.006000339984893799, + "logits/rejected": 3.46754789352417, + "logps/chosen": -503.677734375, + "logps/rejected": -1099.6572265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.680063247680664, + "rewards/margins": 30.042665481567383, + "rewards/rejected": -37.72273254394531, + "step": 2118 + }, + { + "epoch": 1.3181959564541212, + "grad_norm": 5.0704827308654785, + "learning_rate": 3.1143384047948366e-06, + "logits/chosen": -0.21951770782470703, + "logits/rejected": 1.3027235269546509, + "logps/chosen": -553.710693359375, + "logps/rejected": -946.5357055664062, + "loss": 0.0174, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.494546890258789, + "rewards/margins": 23.37753677368164, + "rewards/rejected": -30.872081756591797, + "step": 2119 + }, + { + "epoch": 1.3188180404354588, + "grad_norm": 0.000527180265635252, + "learning_rate": 3.113185799907792e-06, + "logits/chosen": 0.8772927522659302, + "logits/rejected": 5.24975061416626, + "logps/chosen": -441.1733093261719, + "logps/rejected": -981.8777465820312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.961986541748047, + "rewards/margins": 30.325220108032227, + "rewards/rejected": -36.287208557128906, + "step": 2120 + }, + { + "epoch": 1.3194401244167961, + "grad_norm": 6.20682158114505e-06, + "learning_rate": 3.112033195020747e-06, + "logits/chosen": 1.3164989948272705, + "logits/rejected": 2.95279598236084, + "logps/chosen": -661.6497802734375, + "logps/rejected": -1091.271728515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.640320777893066, + "rewards/margins": 34.3283576965332, + "rewards/rejected": -46.96868133544922, + "step": 2121 + }, + { + "epoch": 1.3200622083981337, + "grad_norm": 1.8734179735183716, + "learning_rate": 3.1108805901337023e-06, + "logits/chosen": 1.941436767578125, + "logits/rejected": 1.3771246671676636, + "logps/chosen": -673.208251953125, + "logps/rejected": -903.9393920898438, + "loss": 0.0112, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.356344223022461, + "rewards/margins": 22.33736801147461, + "rewards/rejected": -32.69371032714844, + "step": 2122 + }, + { + "epoch": 1.3206842923794713, + "grad_norm": 1.8553708287072368e-05, + "learning_rate": 3.109727985246658e-06, + "logits/chosen": -3.6621267795562744, + "logits/rejected": 0.9122580289840698, + "logps/chosen": -348.76800537109375, + "logps/rejected": -936.0460815429688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.951865196228027, + "rewards/margins": 31.755855560302734, + "rewards/rejected": -40.70772171020508, + "step": 2123 + }, + { + "epoch": 1.3213063763608086, + "grad_norm": 0.3098766505718231, + "learning_rate": 3.108575380359613e-06, + "logits/chosen": 0.4674542546272278, + "logits/rejected": 2.539536714553833, + "logps/chosen": -612.90576171875, + "logps/rejected": -1040.7362060546875, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.947325706481934, + "rewards/margins": 32.84162521362305, + "rewards/rejected": -45.78894805908203, + "step": 2124 + }, + { + "epoch": 1.3219284603421462, + "grad_norm": 0.00044224029988981783, + "learning_rate": 3.1074227754725684e-06, + "logits/chosen": -2.9238626956939697, + "logits/rejected": 2.93160343170166, + "logps/chosen": -186.1780548095703, + "logps/rejected": -841.7645874023438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.421435356140137, + "rewards/margins": 32.04075622558594, + "rewards/rejected": -36.462196350097656, + "step": 2125 + }, + { + "epoch": 1.3225505443234837, + "grad_norm": 0.8636406064033508, + "learning_rate": 3.1062701705855236e-06, + "logits/chosen": 2.3500113487243652, + "logits/rejected": 3.3730363845825195, + "logps/chosen": -606.7012939453125, + "logps/rejected": -1062.3568115234375, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.823650360107422, + "rewards/margins": 30.047855377197266, + "rewards/rejected": -38.87150573730469, + "step": 2126 + }, + { + "epoch": 1.323172628304821, + "grad_norm": 0.0007554941112175584, + "learning_rate": 3.105117565698479e-06, + "logits/chosen": -1.9400715827941895, + "logits/rejected": 3.186816692352295, + "logps/chosen": -492.09014892578125, + "logps/rejected": -1054.965087890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.473559856414795, + "rewards/margins": 26.097862243652344, + "rewards/rejected": -32.57142639160156, + "step": 2127 + }, + { + "epoch": 1.3237947122861586, + "grad_norm": 38.66023635864258, + "learning_rate": 3.103964960811434e-06, + "logits/chosen": 2.6481308937072754, + "logits/rejected": 1.8837988376617432, + "logps/chosen": -702.989501953125, + "logps/rejected": -984.9061889648438, + "loss": 0.4483, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.173105239868164, + "rewards/margins": 24.799110412597656, + "rewards/rejected": -35.97221374511719, + "step": 2128 + }, + { + "epoch": 1.3244167962674962, + "grad_norm": 0.9219430685043335, + "learning_rate": 3.1028123559243893e-06, + "logits/chosen": 0.7498334646224976, + "logits/rejected": 4.0075788497924805, + "logps/chosen": -537.6915893554688, + "logps/rejected": -943.332275390625, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.623586654663086, + "rewards/margins": 25.398311614990234, + "rewards/rejected": -34.02189636230469, + "step": 2129 + }, + { + "epoch": 1.3250388802488335, + "grad_norm": 1.8907319088157237e-07, + "learning_rate": 3.101659751037345e-06, + "logits/chosen": -0.394875168800354, + "logits/rejected": 4.241382598876953, + "logps/chosen": -342.15899658203125, + "logps/rejected": -979.307861328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.465579986572266, + "rewards/margins": 36.069435119628906, + "rewards/rejected": -43.53501510620117, + "step": 2130 + }, + { + "epoch": 1.325660964230171, + "grad_norm": 1.4963295459747314, + "learning_rate": 3.1005071461503e-06, + "logits/chosen": 3.006678581237793, + "logits/rejected": 4.0752854347229, + "logps/chosen": -644.6529541015625, + "logps/rejected": -862.7306518554688, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.216716766357422, + "rewards/margins": 15.411653518676758, + "rewards/rejected": -28.628368377685547, + "step": 2131 + }, + { + "epoch": 1.3262830482115087, + "grad_norm": 0.0001043678421410732, + "learning_rate": 3.0993545412632554e-06, + "logits/chosen": -0.3442641496658325, + "logits/rejected": 3.0216991901397705, + "logps/chosen": -512.071533203125, + "logps/rejected": -1114.519287109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.777825355529785, + "rewards/margins": 38.027313232421875, + "rewards/rejected": -44.805137634277344, + "step": 2132 + }, + { + "epoch": 1.326905132192846, + "grad_norm": 0.0006537793087773025, + "learning_rate": 3.0982019363762106e-06, + "logits/chosen": 3.7730226516723633, + "logits/rejected": 4.204200744628906, + "logps/chosen": -813.5851440429688, + "logps/rejected": -1144.5123291015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.097002983093262, + "rewards/margins": 32.316932678222656, + "rewards/rejected": -47.41393280029297, + "step": 2133 + }, + { + "epoch": 1.3275272161741836, + "grad_norm": 0.019565237686038017, + "learning_rate": 3.097049331489166e-06, + "logits/chosen": -2.0026626586914062, + "logits/rejected": 2.547441005706787, + "logps/chosen": -440.7296142578125, + "logps/rejected": -934.6153564453125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.003240585327148, + "rewards/margins": 26.617525100708008, + "rewards/rejected": -35.620765686035156, + "step": 2134 + }, + { + "epoch": 1.3281493001555211, + "grad_norm": 3.7515264921239577e-06, + "learning_rate": 3.095896726602121e-06, + "logits/chosen": 0.791428804397583, + "logits/rejected": 4.45504093170166, + "logps/chosen": -493.91546630859375, + "logps/rejected": -1070.8453369140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.852304458618164, + "rewards/margins": 29.491228103637695, + "rewards/rejected": -40.34353256225586, + "step": 2135 + }, + { + "epoch": 1.3287713841368585, + "grad_norm": 0.002371502574533224, + "learning_rate": 3.0947441217150763e-06, + "logits/chosen": 2.1727709770202637, + "logits/rejected": 1.9977059364318848, + "logps/chosen": -679.185302734375, + "logps/rejected": -975.188720703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.928152084350586, + "rewards/margins": 27.553071975708008, + "rewards/rejected": -38.481224060058594, + "step": 2136 + }, + { + "epoch": 1.329393468118196, + "grad_norm": 0.56076979637146, + "learning_rate": 3.0935915168280315e-06, + "logits/chosen": -0.7301803827285767, + "logits/rejected": 3.738680124282837, + "logps/chosen": -526.186279296875, + "logps/rejected": -996.2095947265625, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.729106903076172, + "rewards/margins": 23.9970703125, + "rewards/rejected": -34.72617721557617, + "step": 2137 + }, + { + "epoch": 1.3300155520995334, + "grad_norm": 0.00125672179274261, + "learning_rate": 3.092438911940987e-06, + "logits/chosen": 0.5604817867279053, + "logits/rejected": 4.069615840911865, + "logps/chosen": -403.9148254394531, + "logps/rejected": -942.8673706054688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.626916885375977, + "rewards/margins": 32.32008361816406, + "rewards/rejected": -39.946998596191406, + "step": 2138 + }, + { + "epoch": 1.330637636080871, + "grad_norm": 34.205047607421875, + "learning_rate": 3.0912863070539424e-06, + "logits/chosen": 0.6298574209213257, + "logits/rejected": 2.4317867755889893, + "logps/chosen": -627.36767578125, + "logps/rejected": -1017.8082885742188, + "loss": 0.7327, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.587239265441895, + "rewards/margins": 23.058414459228516, + "rewards/rejected": -33.645652770996094, + "step": 2139 + }, + { + "epoch": 1.3312597200622083, + "grad_norm": 0.0010638857493177056, + "learning_rate": 3.0901337021668976e-06, + "logits/chosen": -0.36331796646118164, + "logits/rejected": 3.6444251537323, + "logps/chosen": -563.85693359375, + "logps/rejected": -1249.5921630859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.383662223815918, + "rewards/margins": 41.731117248535156, + "rewards/rejected": -53.114776611328125, + "step": 2140 + }, + { + "epoch": 1.3318818040435458, + "grad_norm": 13.8467435836792, + "learning_rate": 3.088981097279853e-06, + "logits/chosen": 0.953113317489624, + "logits/rejected": 2.9334874153137207, + "logps/chosen": -620.501220703125, + "logps/rejected": -894.0372314453125, + "loss": 0.1024, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.037420272827148, + "rewards/margins": 14.888923645019531, + "rewards/rejected": -23.92634391784668, + "step": 2141 + }, + { + "epoch": 1.3325038880248834, + "grad_norm": 2.1303920220816508e-05, + "learning_rate": 3.087828492392808e-06, + "logits/chosen": -2.658367156982422e-05, + "logits/rejected": 4.296390056610107, + "logps/chosen": -483.7230224609375, + "logps/rejected": -1070.751708984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.992851257324219, + "rewards/margins": 26.837543487548828, + "rewards/rejected": -38.83039474487305, + "step": 2142 + }, + { + "epoch": 1.3331259720062207, + "grad_norm": 0.026947133243083954, + "learning_rate": 3.0866758875057633e-06, + "logits/chosen": 0.3194420337677002, + "logits/rejected": 3.87138032913208, + "logps/chosen": -482.6922607421875, + "logps/rejected": -992.799560546875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.412920951843262, + "rewards/margins": 28.275657653808594, + "rewards/rejected": -37.68857955932617, + "step": 2143 + }, + { + "epoch": 1.3337480559875583, + "grad_norm": 0.0005353665328584611, + "learning_rate": 3.0855232826187185e-06, + "logits/chosen": 2.0463151931762695, + "logits/rejected": 3.850377082824707, + "logps/chosen": -562.7142944335938, + "logps/rejected": -857.8176879882812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.175778865814209, + "rewards/margins": 25.96051025390625, + "rewards/rejected": -33.13629150390625, + "step": 2144 + }, + { + "epoch": 1.3343701399688959, + "grad_norm": 0.0006094225682318211, + "learning_rate": 3.084370677731674e-06, + "logits/chosen": -2.2982234954833984, + "logits/rejected": 2.2199718952178955, + "logps/chosen": -375.00616455078125, + "logps/rejected": -919.08740234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9371609687805176, + "rewards/margins": 26.919570922851562, + "rewards/rejected": -30.85672950744629, + "step": 2145 + }, + { + "epoch": 1.3349922239502332, + "grad_norm": 0.0008352459408342838, + "learning_rate": 3.0832180728446294e-06, + "logits/chosen": -1.811676263809204, + "logits/rejected": 3.494459867477417, + "logps/chosen": -418.7713317871094, + "logps/rejected": -1007.1888427734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.297712326049805, + "rewards/margins": 27.783876419067383, + "rewards/rejected": -35.08158874511719, + "step": 2146 + }, + { + "epoch": 1.3356143079315708, + "grad_norm": 0.03766888752579689, + "learning_rate": 3.0820654679575846e-06, + "logits/chosen": -0.016537785530090332, + "logits/rejected": 3.032536506652832, + "logps/chosen": -383.07684326171875, + "logps/rejected": -800.5468139648438, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.415544033050537, + "rewards/margins": 24.178037643432617, + "rewards/rejected": -30.593582153320312, + "step": 2147 + }, + { + "epoch": 1.3362363919129083, + "grad_norm": 1.5738648176193237, + "learning_rate": 3.08091286307054e-06, + "logits/chosen": 1.2972499132156372, + "logits/rejected": 3.6453158855438232, + "logps/chosen": -636.6319580078125, + "logps/rejected": -1055.734130859375, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.79327392578125, + "rewards/margins": 32.944515228271484, + "rewards/rejected": -43.73778533935547, + "step": 2148 + }, + { + "epoch": 1.3368584758942457, + "grad_norm": 0.9019051194190979, + "learning_rate": 3.079760258183495e-06, + "logits/chosen": -0.11323332786560059, + "logits/rejected": 4.938979148864746, + "logps/chosen": -527.7478637695312, + "logps/rejected": -1196.08447265625, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.183340072631836, + "rewards/margins": 36.72466278076172, + "rewards/rejected": -49.90800476074219, + "step": 2149 + }, + { + "epoch": 1.3374805598755832, + "grad_norm": 0.0704350695014, + "learning_rate": 3.0786076532964503e-06, + "logits/chosen": -2.881014823913574, + "logits/rejected": 1.5524474382400513, + "logps/chosen": -409.4421081542969, + "logps/rejected": -975.3092651367188, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.2625885009765625, + "rewards/margins": 32.731468200683594, + "rewards/rejected": -39.994056701660156, + "step": 2150 + }, + { + "epoch": 1.3381026438569208, + "grad_norm": 0.001212852424941957, + "learning_rate": 3.0774550484094055e-06, + "logits/chosen": -3.0680320262908936, + "logits/rejected": 2.949629306793213, + "logps/chosen": -383.16021728515625, + "logps/rejected": -1086.001708984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.776760101318359, + "rewards/margins": 36.22696304321289, + "rewards/rejected": -44.00372314453125, + "step": 2151 + }, + { + "epoch": 1.3387247278382581, + "grad_norm": 4.288447856903076, + "learning_rate": 3.076302443522361e-06, + "logits/chosen": -0.5125648379325867, + "logits/rejected": 2.735118865966797, + "logps/chosen": -511.6773986816406, + "logps/rejected": -967.4842529296875, + "loss": 0.0179, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.024867057800293, + "rewards/margins": 30.234031677246094, + "rewards/rejected": -38.25889587402344, + "step": 2152 + }, + { + "epoch": 1.3393468118195957, + "grad_norm": 0.00015137945592869073, + "learning_rate": 3.0751498386353164e-06, + "logits/chosen": 0.21446776390075684, + "logits/rejected": 1.9205048084259033, + "logps/chosen": -533.0106201171875, + "logps/rejected": -891.5570068359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.088725566864014, + "rewards/margins": 26.396310806274414, + "rewards/rejected": -33.48503494262695, + "step": 2153 + }, + { + "epoch": 1.3399688958009333, + "grad_norm": 0.18598617613315582, + "learning_rate": 3.0739972337482716e-06, + "logits/chosen": -0.05675274133682251, + "logits/rejected": 4.564008712768555, + "logps/chosen": -496.07373046875, + "logps/rejected": -1063.553955078125, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.981276988983154, + "rewards/margins": 31.187776565551758, + "rewards/rejected": -39.16905212402344, + "step": 2154 + }, + { + "epoch": 1.3405909797822706, + "grad_norm": 12.508747100830078, + "learning_rate": 3.072844628861227e-06, + "logits/chosen": 2.6598260402679443, + "logits/rejected": 3.2036752700805664, + "logps/chosen": -674.0693969726562, + "logps/rejected": -987.4519653320312, + "loss": 0.0455, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.833343505859375, + "rewards/margins": 25.156932830810547, + "rewards/rejected": -37.990272521972656, + "step": 2155 + }, + { + "epoch": 1.3412130637636082, + "grad_norm": 0.00395676214247942, + "learning_rate": 3.071692023974182e-06, + "logits/chosen": 2.0138673782348633, + "logits/rejected": 3.2959272861480713, + "logps/chosen": -664.734375, + "logps/rejected": -1090.194580078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.462017059326172, + "rewards/margins": 29.33136749267578, + "rewards/rejected": -39.79338836669922, + "step": 2156 + }, + { + "epoch": 1.3418351477449455, + "grad_norm": 0.0016368265496566892, + "learning_rate": 3.0705394190871373e-06, + "logits/chosen": 2.031752347946167, + "logits/rejected": 3.387629270553589, + "logps/chosen": -595.8892211914062, + "logps/rejected": -908.4937744140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.228726387023926, + "rewards/margins": 29.409114837646484, + "rewards/rejected": -40.637840270996094, + "step": 2157 + }, + { + "epoch": 1.342457231726283, + "grad_norm": 0.004761462565511465, + "learning_rate": 3.0693868142000925e-06, + "logits/chosen": -0.8495367765426636, + "logits/rejected": 1.2057145833969116, + "logps/chosen": -457.0412292480469, + "logps/rejected": -904.3390502929688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.196920394897461, + "rewards/margins": 26.140365600585938, + "rewards/rejected": -34.33728790283203, + "step": 2158 + }, + { + "epoch": 1.3430793157076204, + "grad_norm": 0.00018156910664401948, + "learning_rate": 3.068234209313048e-06, + "logits/chosen": 0.3538955748081207, + "logits/rejected": 2.821544647216797, + "logps/chosen": -508.57733154296875, + "logps/rejected": -989.5989990234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.294981002807617, + "rewards/margins": 30.52140998840332, + "rewards/rejected": -37.81639099121094, + "step": 2159 + }, + { + "epoch": 1.343701399688958, + "grad_norm": 0.0011041401885449886, + "learning_rate": 3.0670816044260034e-06, + "logits/chosen": 2.2572083473205566, + "logits/rejected": 4.512560844421387, + "logps/chosen": -734.511474609375, + "logps/rejected": -1160.5986328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.02027702331543, + "rewards/margins": 28.191675186157227, + "rewards/rejected": -41.211952209472656, + "step": 2160 + }, + { + "epoch": 1.3443234836702955, + "grad_norm": 0.02089731954038143, + "learning_rate": 3.0659289995389586e-06, + "logits/chosen": 1.5649492740631104, + "logits/rejected": 4.971779823303223, + "logps/chosen": -527.366943359375, + "logps/rejected": -990.109375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.970163345336914, + "rewards/margins": 29.34923553466797, + "rewards/rejected": -37.31939697265625, + "step": 2161 + }, + { + "epoch": 1.3449455676516329, + "grad_norm": 0.01778881810605526, + "learning_rate": 3.064776394651914e-06, + "logits/chosen": -0.14575600624084473, + "logits/rejected": 3.174208641052246, + "logps/chosen": -595.9559326171875, + "logps/rejected": -1114.135498046875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.597963333129883, + "rewards/margins": 32.00989532470703, + "rewards/rejected": -44.60785675048828, + "step": 2162 + }, + { + "epoch": 1.3455676516329704, + "grad_norm": 0.06671002507209778, + "learning_rate": 3.063623789764869e-06, + "logits/chosen": 0.020184874534606934, + "logits/rejected": 0.91424161195755, + "logps/chosen": -740.4108276367188, + "logps/rejected": -928.4967651367188, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.36511993408203, + "rewards/margins": 15.735336303710938, + "rewards/rejected": -32.10045623779297, + "step": 2163 + }, + { + "epoch": 1.346189735614308, + "grad_norm": 5.080242156982422, + "learning_rate": 3.0624711848778243e-06, + "logits/chosen": -0.9118480682373047, + "logits/rejected": 3.6349895000457764, + "logps/chosen": -563.9638671875, + "logps/rejected": -1191.065185546875, + "loss": 0.0222, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.759162902832031, + "rewards/margins": 34.771461486816406, + "rewards/rejected": -46.53062057495117, + "step": 2164 + }, + { + "epoch": 1.3468118195956453, + "grad_norm": 5.798954589408822e-06, + "learning_rate": 3.0613185799907795e-06, + "logits/chosen": 0.6963211894035339, + "logits/rejected": 3.031797409057617, + "logps/chosen": -366.27276611328125, + "logps/rejected": -834.5653076171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.694557189941406, + "rewards/margins": 26.892271041870117, + "rewards/rejected": -31.58682632446289, + "step": 2165 + }, + { + "epoch": 1.347433903576983, + "grad_norm": 31.725595474243164, + "learning_rate": 3.0601659751037347e-06, + "logits/chosen": 2.5356428623199463, + "logits/rejected": 5.059682846069336, + "logps/chosen": -436.6680908203125, + "logps/rejected": -781.17529296875, + "loss": 0.3965, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.763218879699707, + "rewards/margins": 19.586204528808594, + "rewards/rejected": -28.349422454833984, + "step": 2166 + }, + { + "epoch": 1.3480559875583205, + "grad_norm": 4.612741947174072, + "learning_rate": 3.0590133702166904e-06, + "logits/chosen": 0.2573816180229187, + "logits/rejected": 2.5070643424987793, + "logps/chosen": -609.2000122070312, + "logps/rejected": -941.3594970703125, + "loss": 0.017, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.779262065887451, + "rewards/margins": 19.19269561767578, + "rewards/rejected": -26.971956253051758, + "step": 2167 + }, + { + "epoch": 1.3486780715396578, + "grad_norm": 0.3579530715942383, + "learning_rate": 3.0578607653296456e-06, + "logits/chosen": 0.7475700378417969, + "logits/rejected": 2.640726327896118, + "logps/chosen": -698.0152587890625, + "logps/rejected": -1077.4521484375, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.386947631835938, + "rewards/margins": 27.110260009765625, + "rewards/rejected": -40.49720764160156, + "step": 2168 + }, + { + "epoch": 1.3493001555209954, + "grad_norm": 0.0044502979144454, + "learning_rate": 3.056708160442601e-06, + "logits/chosen": -0.43040260672569275, + "logits/rejected": 2.405294895172119, + "logps/chosen": -550.3878173828125, + "logps/rejected": -1116.6385498046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.990240097045898, + "rewards/margins": 32.76185607910156, + "rewards/rejected": -43.752098083496094, + "step": 2169 + }, + { + "epoch": 1.349922239502333, + "grad_norm": 0.044596899300813675, + "learning_rate": 3.055555555555556e-06, + "logits/chosen": 2.8219692707061768, + "logits/rejected": 3.6726090908050537, + "logps/chosen": -737.0827026367188, + "logps/rejected": -1036.586181640625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.100625038146973, + "rewards/margins": 22.17905616760254, + "rewards/rejected": -35.27967834472656, + "step": 2170 + }, + { + "epoch": 1.3505443234836703, + "grad_norm": 3.109398312517442e-05, + "learning_rate": 3.0544029506685113e-06, + "logits/chosen": 0.12296566367149353, + "logits/rejected": 3.4189023971557617, + "logps/chosen": -581.9413452148438, + "logps/rejected": -1092.4742431640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.76758098602295, + "rewards/margins": 31.79904556274414, + "rewards/rejected": -44.566627502441406, + "step": 2171 + }, + { + "epoch": 1.3511664074650078, + "grad_norm": 6.899026629980654e-05, + "learning_rate": 3.0532503457814665e-06, + "logits/chosen": -0.5253732204437256, + "logits/rejected": 2.619370460510254, + "logps/chosen": -349.04693603515625, + "logps/rejected": -896.3671264648438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.893255233764648, + "rewards/margins": 34.224151611328125, + "rewards/rejected": -40.117408752441406, + "step": 2172 + }, + { + "epoch": 1.3517884914463454, + "grad_norm": 17.062660217285156, + "learning_rate": 3.0520977408944217e-06, + "logits/chosen": 0.9396522045135498, + "logits/rejected": 2.4779770374298096, + "logps/chosen": -412.9204406738281, + "logps/rejected": -755.146728515625, + "loss": 0.1, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.589811325073242, + "rewards/margins": 23.087783813476562, + "rewards/rejected": -28.677597045898438, + "step": 2173 + }, + { + "epoch": 1.3524105754276827, + "grad_norm": 0.1276179552078247, + "learning_rate": 3.0509451360073774e-06, + "logits/chosen": -2.0715835094451904, + "logits/rejected": 2.1000967025756836, + "logps/chosen": -390.0458068847656, + "logps/rejected": -878.71826171875, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.325183868408203, + "rewards/margins": 26.3515625, + "rewards/rejected": -34.67674255371094, + "step": 2174 + }, + { + "epoch": 1.3530326594090203, + "grad_norm": 0.5153205990791321, + "learning_rate": 3.0497925311203326e-06, + "logits/chosen": -1.6525423526763916, + "logits/rejected": 3.016981601715088, + "logps/chosen": -440.17242431640625, + "logps/rejected": -1001.20947265625, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.2114839553833, + "rewards/margins": 30.331432342529297, + "rewards/rejected": -38.54291534423828, + "step": 2175 + }, + { + "epoch": 1.3536547433903576, + "grad_norm": 1.2170327863714192e-05, + "learning_rate": 3.048639926233288e-06, + "logits/chosen": -1.190338373184204, + "logits/rejected": 1.2228800058364868, + "logps/chosen": -388.32745361328125, + "logps/rejected": -796.8060913085938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.56209659576416, + "rewards/margins": 26.223255157470703, + "rewards/rejected": -31.78535270690918, + "step": 2176 + }, + { + "epoch": 1.3542768273716952, + "grad_norm": 36.89498519897461, + "learning_rate": 3.047487321346243e-06, + "logits/chosen": -1.2961982488632202, + "logits/rejected": 2.1595730781555176, + "logps/chosen": -483.5260009765625, + "logps/rejected": -938.668701171875, + "loss": 0.3598, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.170467376708984, + "rewards/margins": 22.213472366333008, + "rewards/rejected": -33.383941650390625, + "step": 2177 + }, + { + "epoch": 1.3548989113530325, + "grad_norm": 0.0018010102212429047, + "learning_rate": 3.046334716459198e-06, + "logits/chosen": 1.577118158340454, + "logits/rejected": 4.508591651916504, + "logps/chosen": -512.4389038085938, + "logps/rejected": -965.859619140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.248443603515625, + "rewards/margins": 28.630136489868164, + "rewards/rejected": -42.878578186035156, + "step": 2178 + }, + { + "epoch": 1.35552099533437, + "grad_norm": 0.009161030873656273, + "learning_rate": 3.045182111572153e-06, + "logits/chosen": -1.5993404388427734, + "logits/rejected": 1.1790560483932495, + "logps/chosen": -384.3905334472656, + "logps/rejected": -770.4791259765625, + "loss": 0.0866, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.886144638061523, + "rewards/margins": 26.454479217529297, + "rewards/rejected": -32.34062194824219, + "step": 2179 + }, + { + "epoch": 1.3561430793157077, + "grad_norm": 0.004069062415510416, + "learning_rate": 3.0440295066851083e-06, + "logits/chosen": 2.374175548553467, + "logits/rejected": 4.294281959533691, + "logps/chosen": -632.2175903320312, + "logps/rejected": -1098.2437744140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.008170127868652, + "rewards/margins": 35.68601989746094, + "rewards/rejected": -49.69418716430664, + "step": 2180 + }, + { + "epoch": 1.356765163297045, + "grad_norm": 4.699228286743164, + "learning_rate": 3.0428769017980635e-06, + "logits/chosen": 1.171891450881958, + "logits/rejected": 3.271820545196533, + "logps/chosen": -440.36456298828125, + "logps/rejected": -847.413818359375, + "loss": 0.0516, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.948079109191895, + "rewards/margins": 21.653432846069336, + "rewards/rejected": -31.601512908935547, + "step": 2181 + }, + { + "epoch": 1.3573872472783826, + "grad_norm": 0.0018416978418827057, + "learning_rate": 3.0417242969110187e-06, + "logits/chosen": -1.469376802444458, + "logits/rejected": 4.174241065979004, + "logps/chosen": -350.88006591796875, + "logps/rejected": -962.7076416015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.410130500793457, + "rewards/margins": 26.457324981689453, + "rewards/rejected": -33.867454528808594, + "step": 2182 + }, + { + "epoch": 1.3580093312597201, + "grad_norm": 14.689421653747559, + "learning_rate": 3.0405716920239744e-06, + "logits/chosen": -1.2712656259536743, + "logits/rejected": 2.44496488571167, + "logps/chosen": -562.570556640625, + "logps/rejected": -999.324462890625, + "loss": 0.0671, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.03051471710205, + "rewards/margins": 25.950546264648438, + "rewards/rejected": -33.98106002807617, + "step": 2183 + }, + { + "epoch": 1.3586314152410575, + "grad_norm": 6.13081192568643e-06, + "learning_rate": 3.0394190871369296e-06, + "logits/chosen": -0.9931305646896362, + "logits/rejected": 3.7858331203460693, + "logps/chosen": -382.7630310058594, + "logps/rejected": -1043.0816650390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.340179443359375, + "rewards/margins": 38.07415771484375, + "rewards/rejected": -45.414337158203125, + "step": 2184 + }, + { + "epoch": 1.359253499222395, + "grad_norm": 0.6616275310516357, + "learning_rate": 3.038266482249885e-06, + "logits/chosen": -1.0988692045211792, + "logits/rejected": 2.3249878883361816, + "logps/chosen": -549.3353271484375, + "logps/rejected": -909.2433471679688, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.345589637756348, + "rewards/margins": 23.204883575439453, + "rewards/rejected": -32.550472259521484, + "step": 2185 + }, + { + "epoch": 1.3598755832037326, + "grad_norm": 0.0020644681062549353, + "learning_rate": 3.03711387736284e-06, + "logits/chosen": -0.09544289112091064, + "logits/rejected": 2.8298497200012207, + "logps/chosen": -523.290771484375, + "logps/rejected": -1006.2669067382812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.667964935302734, + "rewards/margins": 35.17736053466797, + "rewards/rejected": -43.8453254699707, + "step": 2186 + }, + { + "epoch": 1.36049766718507, + "grad_norm": 0.0008767215767875314, + "learning_rate": 3.0359612724757953e-06, + "logits/chosen": 0.11832370609045029, + "logits/rejected": 2.8614234924316406, + "logps/chosen": -575.4088134765625, + "logps/rejected": -970.757080078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.645832061767578, + "rewards/margins": 28.8430233001709, + "rewards/rejected": -37.48885726928711, + "step": 2187 + }, + { + "epoch": 1.3611197511664075, + "grad_norm": 0.10958057641983032, + "learning_rate": 3.0348086675887505e-06, + "logits/chosen": -0.055881351232528687, + "logits/rejected": 3.112879514694214, + "logps/chosen": -474.2701110839844, + "logps/rejected": -868.6067504882812, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.301572322845459, + "rewards/margins": 23.456018447875977, + "rewards/rejected": -29.757593154907227, + "step": 2188 + }, + { + "epoch": 1.361741835147745, + "grad_norm": 14.898518562316895, + "learning_rate": 3.0336560627017057e-06, + "logits/chosen": 2.1643824577331543, + "logits/rejected": 4.367372512817383, + "logps/chosen": -676.5453491210938, + "logps/rejected": -1104.552001953125, + "loss": 0.0652, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.872103691101074, + "rewards/margins": 28.61599349975586, + "rewards/rejected": -41.48809814453125, + "step": 2189 + }, + { + "epoch": 1.3623639191290824, + "grad_norm": 12.740869522094727, + "learning_rate": 3.032503457814661e-06, + "logits/chosen": 0.9304056167602539, + "logits/rejected": 4.536799430847168, + "logps/chosen": -492.8013610839844, + "logps/rejected": -1083.08544921875, + "loss": 0.0633, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.854243755340576, + "rewards/margins": 35.24501419067383, + "rewards/rejected": -43.0992546081543, + "step": 2190 + }, + { + "epoch": 1.36298600311042, + "grad_norm": 6.474887868535006e-06, + "learning_rate": 3.0313508529276166e-06, + "logits/chosen": 0.4632197618484497, + "logits/rejected": 3.4487247467041016, + "logps/chosen": -522.9790649414062, + "logps/rejected": -915.0444946289062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.24857234954834, + "rewards/margins": 23.84241485595703, + "rewards/rejected": -32.09098434448242, + "step": 2191 + }, + { + "epoch": 1.3636080870917575, + "grad_norm": 0.3108627498149872, + "learning_rate": 3.030198248040572e-06, + "logits/chosen": 1.1985173225402832, + "logits/rejected": 3.053633451461792, + "logps/chosen": -631.7606201171875, + "logps/rejected": -1198.51416015625, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.875808715820312, + "rewards/margins": 36.570396423339844, + "rewards/rejected": -50.446205139160156, + "step": 2192 + }, + { + "epoch": 1.3642301710730949, + "grad_norm": 0.0007350373198278248, + "learning_rate": 3.029045643153527e-06, + "logits/chosen": -1.5113818645477295, + "logits/rejected": 0.9358669519424438, + "logps/chosen": -529.31591796875, + "logps/rejected": -954.6470947265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.916925430297852, + "rewards/margins": 34.9250602722168, + "rewards/rejected": -42.841983795166016, + "step": 2193 + }, + { + "epoch": 1.3648522550544324, + "grad_norm": 0.00030861847335472703, + "learning_rate": 3.0278930382664823e-06, + "logits/chosen": -0.05734395980834961, + "logits/rejected": 2.462827682495117, + "logps/chosen": -528.8324584960938, + "logps/rejected": -969.40625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.957712173461914, + "rewards/margins": 32.38273620605469, + "rewards/rejected": -45.340450286865234, + "step": 2194 + }, + { + "epoch": 1.3654743390357698, + "grad_norm": 4.3081145122414455e-05, + "learning_rate": 3.0267404333794375e-06, + "logits/chosen": -0.979878306388855, + "logits/rejected": 4.725745677947998, + "logps/chosen": -534.77880859375, + "logps/rejected": -1191.40234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.238239288330078, + "rewards/margins": 26.146331787109375, + "rewards/rejected": -34.38457107543945, + "step": 2195 + }, + { + "epoch": 1.3660964230171073, + "grad_norm": 0.001050711958669126, + "learning_rate": 3.0255878284923927e-06, + "logits/chosen": 2.1874237060546875, + "logits/rejected": 2.7432210445404053, + "logps/chosen": -759.297119140625, + "logps/rejected": -1180.341064453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.559810638427734, + "rewards/margins": 32.7918586730957, + "rewards/rejected": -43.35166931152344, + "step": 2196 + }, + { + "epoch": 1.3667185069984447, + "grad_norm": 0.07033015042543411, + "learning_rate": 3.024435223605348e-06, + "logits/chosen": 0.10440731048583984, + "logits/rejected": 2.772296190261841, + "logps/chosen": -421.4175109863281, + "logps/rejected": -813.867919921875, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.586875915527344, + "rewards/margins": 25.79877281188965, + "rewards/rejected": -33.385650634765625, + "step": 2197 + }, + { + "epoch": 1.3673405909797822, + "grad_norm": 0.018981391564011574, + "learning_rate": 3.0232826187183036e-06, + "logits/chosen": 0.21775102615356445, + "logits/rejected": 3.7370357513427734, + "logps/chosen": -497.3377990722656, + "logps/rejected": -1062.53076171875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.430828094482422, + "rewards/margins": 30.548934936523438, + "rewards/rejected": -39.97976303100586, + "step": 2198 + }, + { + "epoch": 1.3679626749611198, + "grad_norm": 0.007361208088696003, + "learning_rate": 3.022130013831259e-06, + "logits/chosen": 1.6204208135604858, + "logits/rejected": 4.520246505737305, + "logps/chosen": -634.2261962890625, + "logps/rejected": -1120.288818359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.430303573608398, + "rewards/margins": 32.23867416381836, + "rewards/rejected": -42.668975830078125, + "step": 2199 + }, + { + "epoch": 1.3685847589424571, + "grad_norm": 0.0001982577668968588, + "learning_rate": 3.020977408944214e-06, + "logits/chosen": 2.498847007751465, + "logits/rejected": 4.539471626281738, + "logps/chosen": -700.4442138671875, + "logps/rejected": -987.4329833984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.197793960571289, + "rewards/margins": 21.45416259765625, + "rewards/rejected": -34.65195846557617, + "step": 2200 + }, + { + "epoch": 1.3692068429237947, + "grad_norm": 6.363242732732033e-08, + "learning_rate": 3.0198248040571693e-06, + "logits/chosen": 1.5000331401824951, + "logits/rejected": 2.548959732055664, + "logps/chosen": -638.2463989257812, + "logps/rejected": -1020.43310546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.709440231323242, + "rewards/margins": 31.604660034179688, + "rewards/rejected": -44.3140983581543, + "step": 2201 + }, + { + "epoch": 1.3698289269051322, + "grad_norm": 0.0008398335776291788, + "learning_rate": 3.0186721991701245e-06, + "logits/chosen": -0.9833596348762512, + "logits/rejected": 4.07463264465332, + "logps/chosen": -460.53094482421875, + "logps/rejected": -1072.9697265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.126919746398926, + "rewards/margins": 28.681936264038086, + "rewards/rejected": -38.80885696411133, + "step": 2202 + }, + { + "epoch": 1.3704510108864696, + "grad_norm": 0.11784781515598297, + "learning_rate": 3.0175195942830797e-06, + "logits/chosen": -1.933821678161621, + "logits/rejected": 2.0009396076202393, + "logps/chosen": -407.39422607421875, + "logps/rejected": -861.2830810546875, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.528697967529297, + "rewards/margins": 26.151098251342773, + "rewards/rejected": -33.67979431152344, + "step": 2203 + }, + { + "epoch": 1.3710730948678072, + "grad_norm": 0.1820145547389984, + "learning_rate": 3.016366989396035e-06, + "logits/chosen": -2.080866813659668, + "logits/rejected": 3.588825225830078, + "logps/chosen": -406.08258056640625, + "logps/rejected": -1154.141357421875, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.055073738098145, + "rewards/margins": 35.42332077026367, + "rewards/rejected": -44.478397369384766, + "step": 2204 + }, + { + "epoch": 1.3716951788491447, + "grad_norm": 0.861481785774231, + "learning_rate": 3.0152143845089906e-06, + "logits/chosen": 1.8138333559036255, + "logits/rejected": 2.679360866546631, + "logps/chosen": -702.68603515625, + "logps/rejected": -949.3597412109375, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.829734802246094, + "rewards/margins": 21.387401580810547, + "rewards/rejected": -33.217140197753906, + "step": 2205 + }, + { + "epoch": 1.372317262830482, + "grad_norm": 3.076730763496016e-07, + "learning_rate": 3.014061779621946e-06, + "logits/chosen": 0.05164957046508789, + "logits/rejected": 2.419743537902832, + "logps/chosen": -531.583984375, + "logps/rejected": -1120.018798828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.508309364318848, + "rewards/margins": 40.142372131347656, + "rewards/rejected": -50.65068054199219, + "step": 2206 + }, + { + "epoch": 1.3729393468118196, + "grad_norm": 3.052052761631785e-07, + "learning_rate": 3.012909174734901e-06, + "logits/chosen": 0.6604109406471252, + "logits/rejected": 3.84415340423584, + "logps/chosen": -599.694091796875, + "logps/rejected": -1072.8603515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.875895977020264, + "rewards/margins": 30.14890480041504, + "rewards/rejected": -37.024803161621094, + "step": 2207 + }, + { + "epoch": 1.3735614307931572, + "grad_norm": 0.050999533385038376, + "learning_rate": 3.0117565698478563e-06, + "logits/chosen": 0.33914634585380554, + "logits/rejected": 4.714583396911621, + "logps/chosen": -534.7384033203125, + "logps/rejected": -1071.54931640625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.826058387756348, + "rewards/margins": 28.858766555786133, + "rewards/rejected": -38.68482208251953, + "step": 2208 + }, + { + "epoch": 1.3741835147744945, + "grad_norm": 2.4684137315489352e-05, + "learning_rate": 3.0106039649608115e-06, + "logits/chosen": 1.701751947402954, + "logits/rejected": 3.701206684112549, + "logps/chosen": -615.7250366210938, + "logps/rejected": -1081.4918212890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.088967323303223, + "rewards/margins": 31.6005802154541, + "rewards/rejected": -43.68954849243164, + "step": 2209 + }, + { + "epoch": 1.374805598755832, + "grad_norm": 4.353829363007433e-12, + "learning_rate": 3.0094513600737667e-06, + "logits/chosen": 0.5934591293334961, + "logits/rejected": 3.640597343444824, + "logps/chosen": -633.1799926757812, + "logps/rejected": -1189.581298828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.416163444519043, + "rewards/margins": 41.63801574707031, + "rewards/rejected": -54.054176330566406, + "step": 2210 + }, + { + "epoch": 1.3754276827371696, + "grad_norm": 3.4501149654388428, + "learning_rate": 3.008298755186722e-06, + "logits/chosen": 0.6663314700126648, + "logits/rejected": 3.7698488235473633, + "logps/chosen": -592.5361938476562, + "logps/rejected": -1084.20751953125, + "loss": 0.0209, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.985772132873535, + "rewards/margins": 28.351715087890625, + "rewards/rejected": -39.33749008178711, + "step": 2211 + }, + { + "epoch": 1.376049766718507, + "grad_norm": 1.214347004890442, + "learning_rate": 3.007146150299677e-06, + "logits/chosen": -0.8623265027999878, + "logits/rejected": 1.3673484325408936, + "logps/chosen": -475.67144775390625, + "logps/rejected": -960.241943359375, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.342723846435547, + "rewards/margins": 32.329654693603516, + "rewards/rejected": -43.67237854003906, + "step": 2212 + }, + { + "epoch": 1.3766718506998445, + "grad_norm": 3.2612618383609515e-07, + "learning_rate": 3.005993545412633e-06, + "logits/chosen": -4.33530330657959, + "logits/rejected": 2.7184455394744873, + "logps/chosen": -312.84344482421875, + "logps/rejected": -1029.19580078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.19746732711792, + "rewards/margins": 33.2601318359375, + "rewards/rejected": -39.457603454589844, + "step": 2213 + }, + { + "epoch": 1.3772939346811819, + "grad_norm": 0.0015476691769436002, + "learning_rate": 3.004840940525588e-06, + "logits/chosen": -2.4688735008239746, + "logits/rejected": 1.4806026220321655, + "logps/chosen": -441.9232482910156, + "logps/rejected": -934.3365478515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.864378929138184, + "rewards/margins": 26.260881423950195, + "rewards/rejected": -33.12525939941406, + "step": 2214 + }, + { + "epoch": 1.3779160186625194, + "grad_norm": 0.014643407426774502, + "learning_rate": 3.0036883356385433e-06, + "logits/chosen": 3.419661521911621, + "logits/rejected": 4.806012153625488, + "logps/chosen": -704.5573120117188, + "logps/rejected": -1011.6292724609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.705732345581055, + "rewards/margins": 21.165294647216797, + "rewards/rejected": -36.871028900146484, + "step": 2215 + }, + { + "epoch": 1.3785381026438568, + "grad_norm": 8.841948509216309, + "learning_rate": 3.0025357307514985e-06, + "logits/chosen": -0.05596870183944702, + "logits/rejected": 1.6367905139923096, + "logps/chosen": -547.1561889648438, + "logps/rejected": -957.1211547851562, + "loss": 0.0528, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.281091690063477, + "rewards/margins": 22.269014358520508, + "rewards/rejected": -31.55010986328125, + "step": 2216 + }, + { + "epoch": 1.3791601866251944, + "grad_norm": 51.19075012207031, + "learning_rate": 3.0013831258644537e-06, + "logits/chosen": 0.614906907081604, + "logits/rejected": 3.123727798461914, + "logps/chosen": -648.155029296875, + "logps/rejected": -1151.25146484375, + "loss": 2.2017, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.067476272583008, + "rewards/margins": 29.206144332885742, + "rewards/rejected": -40.273624420166016, + "step": 2217 + }, + { + "epoch": 1.379782270606532, + "grad_norm": 29.52722930908203, + "learning_rate": 3.000230520977409e-06, + "logits/chosen": -0.36638355255126953, + "logits/rejected": 2.1038315296173096, + "logps/chosen": -515.594970703125, + "logps/rejected": -849.2973022460938, + "loss": 0.3916, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.794394493103027, + "rewards/margins": 23.82428741455078, + "rewards/rejected": -32.618682861328125, + "step": 2218 + }, + { + "epoch": 1.3804043545878693, + "grad_norm": 1.017388105392456, + "learning_rate": 2.999077916090364e-06, + "logits/chosen": 2.244713306427002, + "logits/rejected": 2.9050168991088867, + "logps/chosen": -666.0272216796875, + "logps/rejected": -969.8905029296875, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.936464309692383, + "rewards/margins": 22.269237518310547, + "rewards/rejected": -35.20570373535156, + "step": 2219 + }, + { + "epoch": 1.3810264385692068, + "grad_norm": 0.00017945458239410073, + "learning_rate": 2.99792531120332e-06, + "logits/chosen": 0.7347161173820496, + "logits/rejected": 3.7560269832611084, + "logps/chosen": -568.7210083007812, + "logps/rejected": -1096.6600341796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.303465843200684, + "rewards/margins": 33.7734375, + "rewards/rejected": -41.076904296875, + "step": 2220 + }, + { + "epoch": 1.3816485225505444, + "grad_norm": 0.021343346685171127, + "learning_rate": 2.996772706316275e-06, + "logits/chosen": 1.6646852493286133, + "logits/rejected": 4.067060470581055, + "logps/chosen": -628.833740234375, + "logps/rejected": -937.606689453125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.903544425964355, + "rewards/margins": 19.44011688232422, + "rewards/rejected": -31.34366226196289, + "step": 2221 + }, + { + "epoch": 1.3822706065318817, + "grad_norm": 0.30674031376838684, + "learning_rate": 2.9956201014292303e-06, + "logits/chosen": 2.5471351146698, + "logits/rejected": 2.4241745471954346, + "logps/chosen": -661.6102294921875, + "logps/rejected": -892.48095703125, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.633338928222656, + "rewards/margins": 27.561359405517578, + "rewards/rejected": -38.1947021484375, + "step": 2222 + }, + { + "epoch": 1.3828926905132193, + "grad_norm": 0.3236640989780426, + "learning_rate": 2.9944674965421855e-06, + "logits/chosen": -1.1065866947174072, + "logits/rejected": 1.563366413116455, + "logps/chosen": -471.33837890625, + "logps/rejected": -941.787109375, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.373132705688477, + "rewards/margins": 27.12007713317871, + "rewards/rejected": -37.49320983886719, + "step": 2223 + }, + { + "epoch": 1.3835147744945568, + "grad_norm": 1.721951961517334, + "learning_rate": 2.9933148916551407e-06, + "logits/chosen": 2.6324095726013184, + "logits/rejected": 3.3818371295928955, + "logps/chosen": -613.0682983398438, + "logps/rejected": -748.551025390625, + "loss": 0.0105, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.490086555480957, + "rewards/margins": 12.575876235961914, + "rewards/rejected": -27.065963745117188, + "step": 2224 + }, + { + "epoch": 1.3841368584758942, + "grad_norm": 0.0003514946438372135, + "learning_rate": 2.992162286768096e-06, + "logits/chosen": -1.2207633256912231, + "logits/rejected": 2.467137336730957, + "logps/chosen": -384.5999450683594, + "logps/rejected": -804.305908203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.229488372802734, + "rewards/margins": 23.717012405395508, + "rewards/rejected": -31.946500778198242, + "step": 2225 + }, + { + "epoch": 1.3847589424572317, + "grad_norm": 7.888747692108154, + "learning_rate": 2.991009681881051e-06, + "logits/chosen": 0.040084779262542725, + "logits/rejected": 4.101715087890625, + "logps/chosen": -499.83514404296875, + "logps/rejected": -1046.8726806640625, + "loss": 0.0338, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.456901550292969, + "rewards/margins": 27.2252254486084, + "rewards/rejected": -36.682125091552734, + "step": 2226 + }, + { + "epoch": 1.3853810264385693, + "grad_norm": 0.031036915257573128, + "learning_rate": 2.989857076994007e-06, + "logits/chosen": -3.403718948364258, + "logits/rejected": 2.1908183097839355, + "logps/chosen": -233.61251831054688, + "logps/rejected": -785.1973876953125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.809695720672607, + "rewards/margins": 24.013778686523438, + "rewards/rejected": -30.823474884033203, + "step": 2227 + }, + { + "epoch": 1.3860031104199066, + "grad_norm": 0.4252808392047882, + "learning_rate": 2.988704472106962e-06, + "logits/chosen": 0.7094401717185974, + "logits/rejected": 1.9696271419525146, + "logps/chosen": -528.63037109375, + "logps/rejected": -785.7152099609375, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.194915294647217, + "rewards/margins": 25.390228271484375, + "rewards/rejected": -32.58514404296875, + "step": 2228 + }, + { + "epoch": 1.3866251944012442, + "grad_norm": 0.004287133924663067, + "learning_rate": 2.9875518672199173e-06, + "logits/chosen": -1.7506518363952637, + "logits/rejected": 1.3359475135803223, + "logps/chosen": -471.49822998046875, + "logps/rejected": -945.1353759765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.74657154083252, + "rewards/margins": 30.829132080078125, + "rewards/rejected": -39.57570266723633, + "step": 2229 + }, + { + "epoch": 1.3872472783825818, + "grad_norm": 0.012623314745724201, + "learning_rate": 2.9863992623328725e-06, + "logits/chosen": -0.06425750255584717, + "logits/rejected": 2.667785167694092, + "logps/chosen": -570.6763916015625, + "logps/rejected": -1034.1314697265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.794634819030762, + "rewards/margins": 22.76612091064453, + "rewards/rejected": -34.560752868652344, + "step": 2230 + }, + { + "epoch": 1.3878693623639191, + "grad_norm": 0.05512861907482147, + "learning_rate": 2.9852466574458277e-06, + "logits/chosen": 0.703946590423584, + "logits/rejected": 4.535602569580078, + "logps/chosen": -480.8987121582031, + "logps/rejected": -936.260986328125, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.488350868225098, + "rewards/margins": 27.323925018310547, + "rewards/rejected": -35.81227493286133, + "step": 2231 + }, + { + "epoch": 1.3884914463452567, + "grad_norm": 2.850918008334702e-06, + "learning_rate": 2.984094052558783e-06, + "logits/chosen": -3.057971715927124, + "logits/rejected": 4.859375, + "logps/chosen": -320.9295959472656, + "logps/rejected": -1208.96435546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.41721248626709, + "rewards/margins": 38.850830078125, + "rewards/rejected": -45.26803970336914, + "step": 2232 + }, + { + "epoch": 1.389113530326594, + "grad_norm": 13.858604431152344, + "learning_rate": 2.982941447671738e-06, + "logits/chosen": -0.5668371319770813, + "logits/rejected": 5.378746509552002, + "logps/chosen": -516.4816284179688, + "logps/rejected": -1168.6309814453125, + "loss": 0.0561, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.723061561584473, + "rewards/margins": 32.517276763916016, + "rewards/rejected": -44.24034118652344, + "step": 2233 + }, + { + "epoch": 1.3897356143079316, + "grad_norm": 0.018682241439819336, + "learning_rate": 2.981788842784694e-06, + "logits/chosen": -0.5560173392295837, + "logits/rejected": 4.388740062713623, + "logps/chosen": -478.714111328125, + "logps/rejected": -1018.8369140625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.856765270233154, + "rewards/margins": 26.526676177978516, + "rewards/rejected": -34.38343811035156, + "step": 2234 + }, + { + "epoch": 1.390357698289269, + "grad_norm": 0.00024261536600533873, + "learning_rate": 2.980636237897649e-06, + "logits/chosen": 0.7792164087295532, + "logits/rejected": 4.847288131713867, + "logps/chosen": -501.6375732421875, + "logps/rejected": -1087.42626953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.4685640335083, + "rewards/margins": 34.76447296142578, + "rewards/rejected": -44.23303985595703, + "step": 2235 + }, + { + "epoch": 1.3909797822706065, + "grad_norm": 0.3464057147502899, + "learning_rate": 2.9794836330106043e-06, + "logits/chosen": 1.542374849319458, + "logits/rejected": 1.6514575481414795, + "logps/chosen": -570.9237670898438, + "logps/rejected": -815.7724609375, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.963167190551758, + "rewards/margins": 26.82428741455078, + "rewards/rejected": -35.78745651245117, + "step": 2236 + }, + { + "epoch": 1.391601866251944, + "grad_norm": 8.877638816833496, + "learning_rate": 2.9783310281235595e-06, + "logits/chosen": 2.5546209812164307, + "logits/rejected": 2.9814341068267822, + "logps/chosen": -670.451171875, + "logps/rejected": -997.579833984375, + "loss": 0.0395, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.581817626953125, + "rewards/margins": 26.853092193603516, + "rewards/rejected": -39.434906005859375, + "step": 2237 + }, + { + "epoch": 1.3922239502332814, + "grad_norm": 6.948227405548096, + "learning_rate": 2.9771784232365147e-06, + "logits/chosen": 1.135973334312439, + "logits/rejected": 3.1979124546051025, + "logps/chosen": -561.4859619140625, + "logps/rejected": -965.5455322265625, + "loss": 0.0361, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.59466552734375, + "rewards/margins": 27.393587112426758, + "rewards/rejected": -31.988248825073242, + "step": 2238 + }, + { + "epoch": 1.392846034214619, + "grad_norm": 0.023626163601875305, + "learning_rate": 2.97602581834947e-06, + "logits/chosen": 1.0914082527160645, + "logits/rejected": 2.423067569732666, + "logps/chosen": -627.2195434570312, + "logps/rejected": -938.776123046875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.695022583007812, + "rewards/margins": 28.72723960876465, + "rewards/rejected": -40.42226028442383, + "step": 2239 + }, + { + "epoch": 1.3934681181959565, + "grad_norm": 0.019772088155150414, + "learning_rate": 2.974873213462425e-06, + "logits/chosen": 1.4812792539596558, + "logits/rejected": 3.1909940242767334, + "logps/chosen": -725.069091796875, + "logps/rejected": -1122.466796875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.017687797546387, + "rewards/margins": 28.807287216186523, + "rewards/rejected": -40.824974060058594, + "step": 2240 + }, + { + "epoch": 1.3940902021772938, + "grad_norm": 16.312362670898438, + "learning_rate": 2.9737206085753804e-06, + "logits/chosen": 0.18421989679336548, + "logits/rejected": 1.6313143968582153, + "logps/chosen": -666.5517578125, + "logps/rejected": -926.6065063476562, + "loss": 0.0804, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.068418502807617, + "rewards/margins": 20.901081085205078, + "rewards/rejected": -29.969497680664062, + "step": 2241 + }, + { + "epoch": 1.3947122861586314, + "grad_norm": 16.657617568969727, + "learning_rate": 2.972568003688336e-06, + "logits/chosen": -0.5953693389892578, + "logits/rejected": 1.4768779277801514, + "logps/chosen": -424.75750732421875, + "logps/rejected": -771.7853393554688, + "loss": 0.0893, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.179218769073486, + "rewards/margins": 23.26014518737793, + "rewards/rejected": -30.43936538696289, + "step": 2242 + }, + { + "epoch": 1.395334370139969, + "grad_norm": 1.2527492376790406e-09, + "learning_rate": 2.9714153988012913e-06, + "logits/chosen": -1.4872426986694336, + "logits/rejected": 3.9436469078063965, + "logps/chosen": -395.34527587890625, + "logps/rejected": -1169.1190185546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.897050857543945, + "rewards/margins": 39.89738082885742, + "rewards/rejected": -48.79443359375, + "step": 2243 + }, + { + "epoch": 1.3959564541213063, + "grad_norm": 1.4284208260662012e-09, + "learning_rate": 2.9702627939142465e-06, + "logits/chosen": 2.7672107219696045, + "logits/rejected": 4.6261372566223145, + "logps/chosen": -765.1670532226562, + "logps/rejected": -1209.5228271484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.569221496582031, + "rewards/margins": 39.72999572753906, + "rewards/rejected": -48.299217224121094, + "step": 2244 + }, + { + "epoch": 1.3965785381026439, + "grad_norm": 32.85478973388672, + "learning_rate": 2.9691101890272017e-06, + "logits/chosen": -2.1204891204833984, + "logits/rejected": 2.243065595626831, + "logps/chosen": -384.271728515625, + "logps/rejected": -898.5313720703125, + "loss": 1.0508, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.96297836303711, + "rewards/margins": 19.47112274169922, + "rewards/rejected": -33.43409729003906, + "step": 2245 + }, + { + "epoch": 1.3972006220839814, + "grad_norm": 5.930408477783203, + "learning_rate": 2.967957584140157e-06, + "logits/chosen": 0.24247993528842926, + "logits/rejected": 4.5272369384765625, + "logps/chosen": -508.9815368652344, + "logps/rejected": -1157.3570556640625, + "loss": 0.0249, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.666451454162598, + "rewards/margins": 38.191650390625, + "rewards/rejected": -47.85810089111328, + "step": 2246 + }, + { + "epoch": 1.3978227060653188, + "grad_norm": 9.793144272407517e-05, + "learning_rate": 2.966804979253112e-06, + "logits/chosen": 0.7921026945114136, + "logits/rejected": 4.247562885284424, + "logps/chosen": -560.756591796875, + "logps/rejected": -1108.50927734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.537893295288086, + "rewards/margins": 34.58842468261719, + "rewards/rejected": -43.126312255859375, + "step": 2247 + }, + { + "epoch": 1.3984447900466563, + "grad_norm": 0.8745850324630737, + "learning_rate": 2.9656523743660674e-06, + "logits/chosen": 0.8842285871505737, + "logits/rejected": 2.2051475048065186, + "logps/chosen": -557.5842895507812, + "logps/rejected": -865.9134521484375, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.312744140625, + "rewards/margins": 24.951759338378906, + "rewards/rejected": -35.264503479003906, + "step": 2248 + }, + { + "epoch": 1.399066874027994, + "grad_norm": 0.029392994940280914, + "learning_rate": 2.964499769479023e-06, + "logits/chosen": 1.2681587934494019, + "logits/rejected": 4.473254203796387, + "logps/chosen": -582.9454956054688, + "logps/rejected": -1216.732666015625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.867018699645996, + "rewards/margins": 34.12983322143555, + "rewards/rejected": -45.996849060058594, + "step": 2249 + }, + { + "epoch": 1.3996889580093312, + "grad_norm": 0.3804221451282501, + "learning_rate": 2.9633471645919783e-06, + "logits/chosen": -1.6605898141860962, + "logits/rejected": 3.4152517318725586, + "logps/chosen": -508.3944091796875, + "logps/rejected": -1163.526611328125, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.227392196655273, + "rewards/margins": 37.94681167602539, + "rewards/rejected": -51.1742057800293, + "step": 2250 + }, + { + "epoch": 1.4003110419906688, + "grad_norm": 2.7944657698952824e-10, + "learning_rate": 2.9621945597049335e-06, + "logits/chosen": -1.5698888301849365, + "logits/rejected": 3.95448637008667, + "logps/chosen": -407.74139404296875, + "logps/rejected": -1109.8446044921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.576188087463379, + "rewards/margins": 38.68739318847656, + "rewards/rejected": -45.263580322265625, + "step": 2251 + }, + { + "epoch": 1.4009331259720061, + "grad_norm": 1.1104753017425537, + "learning_rate": 2.9610419548178887e-06, + "logits/chosen": 0.010650875978171825, + "logits/rejected": 1.6669846773147583, + "logps/chosen": -573.3395385742188, + "logps/rejected": -972.3338623046875, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.772941589355469, + "rewards/margins": 27.10225486755371, + "rewards/rejected": -36.87519454956055, + "step": 2252 + }, + { + "epoch": 1.4015552099533437, + "grad_norm": 0.0018733566394075751, + "learning_rate": 2.959889349930844e-06, + "logits/chosen": 3.3101394176483154, + "logits/rejected": 5.083858013153076, + "logps/chosen": -556.2260131835938, + "logps/rejected": -927.8904418945312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.198858261108398, + "rewards/margins": 26.321956634521484, + "rewards/rejected": -34.52081298828125, + "step": 2253 + }, + { + "epoch": 1.402177293934681, + "grad_norm": 0.9949904084205627, + "learning_rate": 2.958736745043799e-06, + "logits/chosen": 0.13389703631401062, + "logits/rejected": 2.4801292419433594, + "logps/chosen": -541.17041015625, + "logps/rejected": -893.1530151367188, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.718971252441406, + "rewards/margins": 24.820383071899414, + "rewards/rejected": -36.53935241699219, + "step": 2254 + }, + { + "epoch": 1.4027993779160186, + "grad_norm": 1.041222731146263e-06, + "learning_rate": 2.9575841401567544e-06, + "logits/chosen": 0.26652705669403076, + "logits/rejected": 4.539582252502441, + "logps/chosen": -585.6783447265625, + "logps/rejected": -1185.69580078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.327654838562012, + "rewards/margins": 33.012325286865234, + "rewards/rejected": -45.33998107910156, + "step": 2255 + }, + { + "epoch": 1.4034214618973562, + "grad_norm": 2.361824044783134e-06, + "learning_rate": 2.95643153526971e-06, + "logits/chosen": 0.5165107846260071, + "logits/rejected": 3.6015896797180176, + "logps/chosen": -619.5923461914062, + "logps/rejected": -1163.2398681640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.083619117736816, + "rewards/margins": 40.63835144042969, + "rewards/rejected": -50.72196960449219, + "step": 2256 + }, + { + "epoch": 1.4040435458786935, + "grad_norm": 2.7180160486750538e-06, + "learning_rate": 2.9552789303826653e-06, + "logits/chosen": -1.3893684148788452, + "logits/rejected": 3.102480888366699, + "logps/chosen": -550.6753540039062, + "logps/rejected": -1261.0013427734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.005228042602539, + "rewards/margins": 39.15861511230469, + "rewards/rejected": -53.16384506225586, + "step": 2257 + }, + { + "epoch": 1.404665629860031, + "grad_norm": 0.007969174534082413, + "learning_rate": 2.9541263254956205e-06, + "logits/chosen": 2.2342042922973633, + "logits/rejected": 2.6852080821990967, + "logps/chosen": -699.7215576171875, + "logps/rejected": -965.824462890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.429476737976074, + "rewards/margins": 26.209827423095703, + "rewards/rejected": -40.639305114746094, + "step": 2258 + }, + { + "epoch": 1.4052877138413686, + "grad_norm": 3.5950510209659114e-05, + "learning_rate": 2.9529737206085757e-06, + "logits/chosen": 1.4159845113754272, + "logits/rejected": 3.3737049102783203, + "logps/chosen": -601.9073486328125, + "logps/rejected": -980.881103515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.271068572998047, + "rewards/margins": 29.318340301513672, + "rewards/rejected": -41.58940887451172, + "step": 2259 + }, + { + "epoch": 1.405909797822706, + "grad_norm": 0.0010115044424310327, + "learning_rate": 2.951821115721531e-06, + "logits/chosen": 0.3141559064388275, + "logits/rejected": 3.4854958057403564, + "logps/chosen": -485.7015686035156, + "logps/rejected": -900.7958984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.961186408996582, + "rewards/margins": 25.101022720336914, + "rewards/rejected": -34.06221008300781, + "step": 2260 + }, + { + "epoch": 1.4065318818040435, + "grad_norm": 0.007804466411471367, + "learning_rate": 2.950668510834486e-06, + "logits/chosen": 0.22595328092575073, + "logits/rejected": 3.9139413833618164, + "logps/chosen": -598.7637939453125, + "logps/rejected": -1206.5079345703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.31296443939209, + "rewards/margins": 35.79632568359375, + "rewards/rejected": -44.10929489135742, + "step": 2261 + }, + { + "epoch": 1.407153965785381, + "grad_norm": 29.254375457763672, + "learning_rate": 2.9495159059474414e-06, + "logits/chosen": -2.053928852081299, + "logits/rejected": 3.762558698654175, + "logps/chosen": -494.0701904296875, + "logps/rejected": -1236.5478515625, + "loss": 0.2308, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.962211608886719, + "rewards/margins": 37.9639778137207, + "rewards/rejected": -48.926185607910156, + "step": 2262 + }, + { + "epoch": 1.4077760497667184, + "grad_norm": 0.03585294261574745, + "learning_rate": 2.9483633010603966e-06, + "logits/chosen": 2.6447174549102783, + "logits/rejected": 3.4780094623565674, + "logps/chosen": -561.7684326171875, + "logps/rejected": -900.8305053710938, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.810640335083008, + "rewards/margins": 23.371841430664062, + "rewards/rejected": -37.18247985839844, + "step": 2263 + }, + { + "epoch": 1.408398133748056, + "grad_norm": 9.315655915997922e-05, + "learning_rate": 2.9472106961733522e-06, + "logits/chosen": -0.3833075761795044, + "logits/rejected": 2.9224395751953125, + "logps/chosen": -433.1549377441406, + "logps/rejected": -898.0396728515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.635376930236816, + "rewards/margins": 28.6442928314209, + "rewards/rejected": -37.27967071533203, + "step": 2264 + }, + { + "epoch": 1.4090202177293936, + "grad_norm": 0.024829663336277008, + "learning_rate": 2.9460580912863075e-06, + "logits/chosen": -1.8885233402252197, + "logits/rejected": 1.543205976486206, + "logps/chosen": -534.0154418945312, + "logps/rejected": -1084.5830078125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.128021240234375, + "rewards/margins": 30.66397476196289, + "rewards/rejected": -41.7919921875, + "step": 2265 + }, + { + "epoch": 1.409642301710731, + "grad_norm": 0.21217453479766846, + "learning_rate": 2.9449054863992627e-06, + "logits/chosen": -0.9531707763671875, + "logits/rejected": 2.394754409790039, + "logps/chosen": -617.4303588867188, + "logps/rejected": -1086.7529296875, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.659804344177246, + "rewards/margins": 28.16089630126953, + "rewards/rejected": -35.820701599121094, + "step": 2266 + }, + { + "epoch": 1.4102643856920685, + "grad_norm": 1.5152208106883336e-05, + "learning_rate": 2.943752881512218e-06, + "logits/chosen": -0.8151946067810059, + "logits/rejected": 3.052356004714966, + "logps/chosen": -365.26904296875, + "logps/rejected": -995.769775390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.043517112731934, + "rewards/margins": 35.24461364746094, + "rewards/rejected": -40.28812789916992, + "step": 2267 + }, + { + "epoch": 1.410886469673406, + "grad_norm": 21.003597259521484, + "learning_rate": 2.942600276625173e-06, + "logits/chosen": 1.9905128479003906, + "logits/rejected": 4.831020355224609, + "logps/chosen": -658.3651123046875, + "logps/rejected": -1095.9422607421875, + "loss": 0.1103, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.824920654296875, + "rewards/margins": 30.134084701538086, + "rewards/rejected": -39.95900344848633, + "step": 2268 + }, + { + "epoch": 1.4115085536547434, + "grad_norm": 0.0016211661277338862, + "learning_rate": 2.9414476717381284e-06, + "logits/chosen": 0.7444226741790771, + "logits/rejected": 4.623043060302734, + "logps/chosen": -614.892333984375, + "logps/rejected": -1186.7662353515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.60790729522705, + "rewards/margins": 33.84346008300781, + "rewards/rejected": -43.45137023925781, + "step": 2269 + }, + { + "epoch": 1.412130637636081, + "grad_norm": 0.00010323274182155728, + "learning_rate": 2.9402950668510836e-06, + "logits/chosen": 1.018390417098999, + "logits/rejected": 4.094212532043457, + "logps/chosen": -384.023681640625, + "logps/rejected": -894.3253173828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.826488494873047, + "rewards/margins": 29.77437400817871, + "rewards/rejected": -37.60086441040039, + "step": 2270 + }, + { + "epoch": 1.4127527216174183, + "grad_norm": 22.7961483001709, + "learning_rate": 2.9391424619640392e-06, + "logits/chosen": -0.5670567154884338, + "logits/rejected": 0.5044700503349304, + "logps/chosen": -561.7535400390625, + "logps/rejected": -897.264892578125, + "loss": 0.0895, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.388435363769531, + "rewards/margins": 25.081241607666016, + "rewards/rejected": -35.46967697143555, + "step": 2271 + }, + { + "epoch": 1.4133748055987558, + "grad_norm": 2.3609633445739746, + "learning_rate": 2.9379898570769945e-06, + "logits/chosen": 0.33859747648239136, + "logits/rejected": 4.584858417510986, + "logps/chosen": -459.1435546875, + "logps/rejected": -949.4611206054688, + "loss": 0.0101, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.196245193481445, + "rewards/margins": 22.02639389038086, + "rewards/rejected": -32.22264099121094, + "step": 2272 + }, + { + "epoch": 1.4139968895800932, + "grad_norm": 0.01303979940712452, + "learning_rate": 2.9368372521899497e-06, + "logits/chosen": 0.0018563270568847656, + "logits/rejected": 2.0656213760375977, + "logps/chosen": -534.7927856445312, + "logps/rejected": -904.1492919921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.918089389801025, + "rewards/margins": 28.851530075073242, + "rewards/rejected": -35.76961898803711, + "step": 2273 + }, + { + "epoch": 1.4146189735614307, + "grad_norm": 31.436941146850586, + "learning_rate": 2.935684647302905e-06, + "logits/chosen": 2.9472086429595947, + "logits/rejected": 3.870821237564087, + "logps/chosen": -646.843505859375, + "logps/rejected": -1069.711669921875, + "loss": 0.3968, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.343791007995605, + "rewards/margins": 31.182849884033203, + "rewards/rejected": -44.526641845703125, + "step": 2274 + }, + { + "epoch": 1.4152410575427683, + "grad_norm": 2.008999217650853e-06, + "learning_rate": 2.93453204241586e-06, + "logits/chosen": -2.290863037109375, + "logits/rejected": 2.3186676502227783, + "logps/chosen": -367.4084777832031, + "logps/rejected": -951.1954345703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.120453357696533, + "rewards/margins": 29.2360782623291, + "rewards/rejected": -34.356529235839844, + "step": 2275 + }, + { + "epoch": 1.4158631415241056, + "grad_norm": 0.04517332464456558, + "learning_rate": 2.9333794375288154e-06, + "logits/chosen": -0.20349359512329102, + "logits/rejected": 1.8677873611450195, + "logps/chosen": -428.3921203613281, + "logps/rejected": -739.129150390625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.071450710296631, + "rewards/margins": 20.734577178955078, + "rewards/rejected": -27.8060302734375, + "step": 2276 + }, + { + "epoch": 1.4164852255054432, + "grad_norm": 0.04162021726369858, + "learning_rate": 2.9322268326417706e-06, + "logits/chosen": 0.4728636145591736, + "logits/rejected": 3.5390806198120117, + "logps/chosen": -489.19610595703125, + "logps/rejected": -1045.33056640625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.421915054321289, + "rewards/margins": 34.723148345947266, + "rewards/rejected": -43.14506149291992, + "step": 2277 + }, + { + "epoch": 1.4171073094867808, + "grad_norm": 0.0008894521743059158, + "learning_rate": 2.9310742277547262e-06, + "logits/chosen": -0.6772791147232056, + "logits/rejected": 2.196185350418091, + "logps/chosen": -344.69866943359375, + "logps/rejected": -848.551025390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.582999229431152, + "rewards/margins": 31.66111183166504, + "rewards/rejected": -37.244110107421875, + "step": 2278 + }, + { + "epoch": 1.417729393468118, + "grad_norm": 6.018635811955164e-09, + "learning_rate": 2.9299216228676815e-06, + "logits/chosen": 0.21843993663787842, + "logits/rejected": 4.459475517272949, + "logps/chosen": -431.35394287109375, + "logps/rejected": -998.3802490234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.352250099182129, + "rewards/margins": 32.244117736816406, + "rewards/rejected": -42.59636688232422, + "step": 2279 + }, + { + "epoch": 1.4183514774494557, + "grad_norm": 2.377005512244068e-05, + "learning_rate": 2.9287690179806367e-06, + "logits/chosen": 0.881413459777832, + "logits/rejected": 3.4689924716949463, + "logps/chosen": -461.482666015625, + "logps/rejected": -943.070068359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.424365997314453, + "rewards/margins": 29.584659576416016, + "rewards/rejected": -38.00902557373047, + "step": 2280 + }, + { + "epoch": 1.4189735614307932, + "grad_norm": 3.8007092371117324e-05, + "learning_rate": 2.927616413093592e-06, + "logits/chosen": 0.7045872211456299, + "logits/rejected": 2.315516710281372, + "logps/chosen": -603.4990234375, + "logps/rejected": -907.1836547851562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.995718955993652, + "rewards/margins": 25.444580078125, + "rewards/rejected": -34.44029998779297, + "step": 2281 + }, + { + "epoch": 1.4195956454121306, + "grad_norm": 42.845584869384766, + "learning_rate": 2.926463808206547e-06, + "logits/chosen": 0.6564313173294067, + "logits/rejected": 2.4238367080688477, + "logps/chosen": -567.2952880859375, + "logps/rejected": -792.8505859375, + "loss": 0.7587, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.14471435546875, + "rewards/margins": 19.311264038085938, + "rewards/rejected": -28.455978393554688, + "step": 2282 + }, + { + "epoch": 1.4202177293934681, + "grad_norm": 2.9977618964949215e-07, + "learning_rate": 2.9253112033195024e-06, + "logits/chosen": -2.066939115524292, + "logits/rejected": 4.052614212036133, + "logps/chosen": -263.885498046875, + "logps/rejected": -1071.652099609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.650771141052246, + "rewards/margins": 40.807212829589844, + "rewards/rejected": -45.457984924316406, + "step": 2283 + }, + { + "epoch": 1.4208398133748057, + "grad_norm": 49.3410530090332, + "learning_rate": 2.9241585984324576e-06, + "logits/chosen": 2.585322856903076, + "logits/rejected": 4.387059211730957, + "logps/chosen": -590.73486328125, + "logps/rejected": -903.32080078125, + "loss": 0.9985, + "rewards/accuracies": 0.75, + "rewards/chosen": -9.027231216430664, + "rewards/margins": 23.2886905670166, + "rewards/rejected": -32.315921783447266, + "step": 2284 + }, + { + "epoch": 1.421461897356143, + "grad_norm": 0.18115392327308655, + "learning_rate": 2.9230059935454132e-06, + "logits/chosen": 0.1543576568365097, + "logits/rejected": 2.600271701812744, + "logps/chosen": -613.9583740234375, + "logps/rejected": -1017.9857788085938, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.766103744506836, + "rewards/margins": 22.681135177612305, + "rewards/rejected": -31.447237014770508, + "step": 2285 + }, + { + "epoch": 1.4220839813374806, + "grad_norm": 0.007985980249941349, + "learning_rate": 2.9218533886583685e-06, + "logits/chosen": 0.11727690696716309, + "logits/rejected": 1.414735198020935, + "logps/chosen": -555.1712036132812, + "logps/rejected": -842.997314453125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.974403381347656, + "rewards/margins": 23.567520141601562, + "rewards/rejected": -33.54192352294922, + "step": 2286 + }, + { + "epoch": 1.4227060653188182, + "grad_norm": 0.11219903826713562, + "learning_rate": 2.9207007837713237e-06, + "logits/chosen": 1.3696763515472412, + "logits/rejected": 3.7046830654144287, + "logps/chosen": -551.7432250976562, + "logps/rejected": -996.8988647460938, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.567296981811523, + "rewards/margins": 30.2374267578125, + "rewards/rejected": -41.804725646972656, + "step": 2287 + }, + { + "epoch": 1.4233281493001555, + "grad_norm": 0.0010436129523441195, + "learning_rate": 2.919548178884279e-06, + "logits/chosen": -1.4171717166900635, + "logits/rejected": 2.6286206245422363, + "logps/chosen": -534.484130859375, + "logps/rejected": -1125.8145751953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.40506362915039, + "rewards/margins": 30.196651458740234, + "rewards/rejected": -38.601715087890625, + "step": 2288 + }, + { + "epoch": 1.423950233281493, + "grad_norm": 7.079758506733924e-05, + "learning_rate": 2.918395573997234e-06, + "logits/chosen": 0.5658026933670044, + "logits/rejected": 3.566821575164795, + "logps/chosen": -626.707275390625, + "logps/rejected": -1013.3258056640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.590632438659668, + "rewards/margins": 25.71282958984375, + "rewards/rejected": -34.303462982177734, + "step": 2289 + }, + { + "epoch": 1.4245723172628304, + "grad_norm": 0.00018804903083946556, + "learning_rate": 2.9172429691101894e-06, + "logits/chosen": 1.4859108924865723, + "logits/rejected": 4.400835990905762, + "logps/chosen": -660.5794677734375, + "logps/rejected": -1268.1781005859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.740732192993164, + "rewards/margins": 40.89760971069336, + "rewards/rejected": -50.638336181640625, + "step": 2290 + }, + { + "epoch": 1.425194401244168, + "grad_norm": 1.8972766399383545, + "learning_rate": 2.9160903642231446e-06, + "logits/chosen": 0.980305016040802, + "logits/rejected": 3.3063576221466064, + "logps/chosen": -612.8570556640625, + "logps/rejected": -1093.697998046875, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.83894157409668, + "rewards/margins": 33.148048400878906, + "rewards/rejected": -41.98699188232422, + "step": 2291 + }, + { + "epoch": 1.4258164852255055, + "grad_norm": 1.6478937864303589, + "learning_rate": 2.9149377593361e-06, + "logits/chosen": -0.0209181010723114, + "logits/rejected": 3.4129858016967773, + "logps/chosen": -578.6953735351562, + "logps/rejected": -907.123046875, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.504940986633301, + "rewards/margins": 20.077247619628906, + "rewards/rejected": -26.582189559936523, + "step": 2292 + }, + { + "epoch": 1.4264385692068429, + "grad_norm": 0.07732792943716049, + "learning_rate": 2.9137851544490555e-06, + "logits/chosen": -0.7631025314331055, + "logits/rejected": 4.451732635498047, + "logps/chosen": -446.68280029296875, + "logps/rejected": -1024.406494140625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.762255668640137, + "rewards/margins": 31.770727157592773, + "rewards/rejected": -40.532981872558594, + "step": 2293 + }, + { + "epoch": 1.4270606531881804, + "grad_norm": 4.2958705307682976e-05, + "learning_rate": 2.9126325495620107e-06, + "logits/chosen": 0.22674143314361572, + "logits/rejected": 3.4945809841156006, + "logps/chosen": -500.32562255859375, + "logps/rejected": -1025.9798583984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.780029296875, + "rewards/margins": 27.635435104370117, + "rewards/rejected": -36.41546630859375, + "step": 2294 + }, + { + "epoch": 1.4276827371695178, + "grad_norm": 42.8979377746582, + "learning_rate": 2.911479944674966e-06, + "logits/chosen": -0.6848071217536926, + "logits/rejected": 2.201697587966919, + "logps/chosen": -536.333251953125, + "logps/rejected": -903.2261962890625, + "loss": 0.6641, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.489903450012207, + "rewards/margins": 22.598129272460938, + "rewards/rejected": -32.08803176879883, + "step": 2295 + }, + { + "epoch": 1.4283048211508553, + "grad_norm": 0.030604323372244835, + "learning_rate": 2.910327339787921e-06, + "logits/chosen": -0.04041486978530884, + "logits/rejected": 2.474832057952881, + "logps/chosen": -581.4979248046875, + "logps/rejected": -952.0086059570312, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.742212295532227, + "rewards/margins": 24.62486457824707, + "rewards/rejected": -33.36707305908203, + "step": 2296 + }, + { + "epoch": 1.428926905132193, + "grad_norm": 9.851161166807287e-07, + "learning_rate": 2.9091747349008764e-06, + "logits/chosen": 0.3778786063194275, + "logits/rejected": 1.9946703910827637, + "logps/chosen": -456.3149108886719, + "logps/rejected": -777.9240112304688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.893744468688965, + "rewards/margins": 26.18206787109375, + "rewards/rejected": -30.075809478759766, + "step": 2297 + }, + { + "epoch": 1.4295489891135302, + "grad_norm": 0.0991005226969719, + "learning_rate": 2.9080221300138316e-06, + "logits/chosen": 2.706174373626709, + "logits/rejected": 4.151269912719727, + "logps/chosen": -758.4832763671875, + "logps/rejected": -1072.541015625, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.922719955444336, + "rewards/margins": 25.964866638183594, + "rewards/rejected": -34.88758850097656, + "step": 2298 + }, + { + "epoch": 1.4301710730948678, + "grad_norm": 0.07812916487455368, + "learning_rate": 2.906869525126787e-06, + "logits/chosen": -0.47702324390411377, + "logits/rejected": 3.2301926612854004, + "logps/chosen": -474.9299011230469, + "logps/rejected": -996.20556640625, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.653593063354492, + "rewards/margins": 23.850337982177734, + "rewards/rejected": -33.503929138183594, + "step": 2299 + }, + { + "epoch": 1.4307931570762054, + "grad_norm": 0.00029026303673163056, + "learning_rate": 2.9057169202397425e-06, + "logits/chosen": -0.5452579259872437, + "logits/rejected": 4.23468542098999, + "logps/chosen": -490.8875427246094, + "logps/rejected": -1042.40869140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.9663872718811035, + "rewards/margins": 32.5419807434082, + "rewards/rejected": -39.50836944580078, + "step": 2300 + }, + { + "epoch": 1.4314152410575427, + "grad_norm": 30.1119384765625, + "learning_rate": 2.9045643153526977e-06, + "logits/chosen": -2.7672839164733887, + "logits/rejected": 2.42836594581604, + "logps/chosen": -380.5882568359375, + "logps/rejected": -959.0693359375, + "loss": 0.2293, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.109725952148438, + "rewards/margins": 24.500015258789062, + "rewards/rejected": -32.6097412109375, + "step": 2301 + }, + { + "epoch": 1.4320373250388803, + "grad_norm": 4.354133129119873, + "learning_rate": 2.903411710465653e-06, + "logits/chosen": 1.5805208683013916, + "logits/rejected": 3.6228115558624268, + "logps/chosen": -873.7197265625, + "logps/rejected": -1153.635498046875, + "loss": 0.0107, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.920944213867188, + "rewards/margins": 16.957725524902344, + "rewards/rejected": -32.87866973876953, + "step": 2302 + }, + { + "epoch": 1.4326594090202178, + "grad_norm": 0.0239068903028965, + "learning_rate": 2.902259105578608e-06, + "logits/chosen": -2.2274341583251953, + "logits/rejected": 0.8469206094741821, + "logps/chosen": -358.734130859375, + "logps/rejected": -833.1475219726562, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.413700580596924, + "rewards/margins": 25.46902084350586, + "rewards/rejected": -32.882720947265625, + "step": 2303 + }, + { + "epoch": 1.4332814930015552, + "grad_norm": 0.7325589656829834, + "learning_rate": 2.9011065006915634e-06, + "logits/chosen": -0.23840701580047607, + "logits/rejected": 2.6510872840881348, + "logps/chosen": -502.33648681640625, + "logps/rejected": -865.9973754882812, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.252693176269531, + "rewards/margins": 16.553138732910156, + "rewards/rejected": -25.805830001831055, + "step": 2304 + }, + { + "epoch": 1.4339035769828927, + "grad_norm": 2.370480537414551, + "learning_rate": 2.8999538958045186e-06, + "logits/chosen": -0.7831411361694336, + "logits/rejected": 3.6987407207489014, + "logps/chosen": -384.1199951171875, + "logps/rejected": -955.3935546875, + "loss": 0.0227, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.02347469329834, + "rewards/margins": 23.749711990356445, + "rewards/rejected": -32.77318572998047, + "step": 2305 + }, + { + "epoch": 1.4345256609642303, + "grad_norm": 0.9532006978988647, + "learning_rate": 2.898801290917474e-06, + "logits/chosen": 1.7668182849884033, + "logits/rejected": 4.3650007247924805, + "logps/chosen": -664.02099609375, + "logps/rejected": -1065.537353515625, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.304007530212402, + "rewards/margins": 27.521072387695312, + "rewards/rejected": -35.82508087158203, + "step": 2306 + }, + { + "epoch": 1.4351477449455676, + "grad_norm": 0.30767178535461426, + "learning_rate": 2.8976486860304295e-06, + "logits/chosen": 3.121201515197754, + "logits/rejected": 3.2817912101745605, + "logps/chosen": -619.9653930664062, + "logps/rejected": -893.48779296875, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.179896354675293, + "rewards/margins": 22.813602447509766, + "rewards/rejected": -31.993501663208008, + "step": 2307 + }, + { + "epoch": 1.4357698289269052, + "grad_norm": 2.5073416054510744e-06, + "learning_rate": 2.8964960811433847e-06, + "logits/chosen": 0.9362043142318726, + "logits/rejected": 3.051908493041992, + "logps/chosen": -525.4351196289062, + "logps/rejected": -1072.8572998046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.091745376586914, + "rewards/margins": 34.52039337158203, + "rewards/rejected": -43.61213684082031, + "step": 2308 + }, + { + "epoch": 1.4363919129082425, + "grad_norm": 1.418189525604248, + "learning_rate": 2.89534347625634e-06, + "logits/chosen": 2.421281576156616, + "logits/rejected": 4.215733051300049, + "logps/chosen": -625.8364868164062, + "logps/rejected": -989.9420776367188, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.793773651123047, + "rewards/margins": 25.186986923217773, + "rewards/rejected": -34.98076248168945, + "step": 2309 + }, + { + "epoch": 1.43701399688958, + "grad_norm": 0.0014669963857159019, + "learning_rate": 2.894190871369295e-06, + "logits/chosen": 0.5397623181343079, + "logits/rejected": 3.229332447052002, + "logps/chosen": -356.3837585449219, + "logps/rejected": -761.350830078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.371195316314697, + "rewards/margins": 19.48046875, + "rewards/rejected": -26.85166358947754, + "step": 2310 + }, + { + "epoch": 1.4376360808709177, + "grad_norm": 0.5468123555183411, + "learning_rate": 2.8930382664822504e-06, + "logits/chosen": 2.6708908081054688, + "logits/rejected": 3.487905502319336, + "logps/chosen": -646.0723876953125, + "logps/rejected": -917.741943359375, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.835456848144531, + "rewards/margins": 22.48293113708496, + "rewards/rejected": -31.31838607788086, + "step": 2311 + }, + { + "epoch": 1.438258164852255, + "grad_norm": 0.00034044316271319985, + "learning_rate": 2.8918856615952056e-06, + "logits/chosen": 0.7117612361907959, + "logits/rejected": 4.120718479156494, + "logps/chosen": -488.3861389160156, + "logps/rejected": -1005.724853515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.154720306396484, + "rewards/margins": 28.170984268188477, + "rewards/rejected": -40.325706481933594, + "step": 2312 + }, + { + "epoch": 1.4388802488335926, + "grad_norm": 0.001113589503802359, + "learning_rate": 2.890733056708161e-06, + "logits/chosen": -0.4939787685871124, + "logits/rejected": 3.5605859756469727, + "logps/chosen": -464.910888671875, + "logps/rejected": -1003.0372924804688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.48483657836914, + "rewards/margins": 29.896778106689453, + "rewards/rejected": -38.381614685058594, + "step": 2313 + }, + { + "epoch": 1.43950233281493, + "grad_norm": 7.407980918884277, + "learning_rate": 2.8895804518211156e-06, + "logits/chosen": 2.0751850605010986, + "logits/rejected": 3.233450412750244, + "logps/chosen": -584.488037109375, + "logps/rejected": -939.03076171875, + "loss": 0.0264, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.532438278198242, + "rewards/margins": 26.578365325927734, + "rewards/rejected": -38.11080551147461, + "step": 2314 + }, + { + "epoch": 1.4401244167962675, + "grad_norm": 0.00016063018119893968, + "learning_rate": 2.888427846934071e-06, + "logits/chosen": -3.001732110977173, + "logits/rejected": 0.1540934443473816, + "logps/chosen": -333.6917724609375, + "logps/rejected": -832.804443359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.484884738922119, + "rewards/margins": 26.101276397705078, + "rewards/rejected": -33.58616256713867, + "step": 2315 + }, + { + "epoch": 1.440746500777605, + "grad_norm": 0.0016659300308674574, + "learning_rate": 2.887275242047026e-06, + "logits/chosen": 0.07998377084732056, + "logits/rejected": 4.144624710083008, + "logps/chosen": -367.6268310546875, + "logps/rejected": -895.8948974609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.288524150848389, + "rewards/margins": 32.77602767944336, + "rewards/rejected": -40.064552307128906, + "step": 2316 + }, + { + "epoch": 1.4413685847589424, + "grad_norm": 0.0014177010161802173, + "learning_rate": 2.8861226371599817e-06, + "logits/chosen": -2.2402052879333496, + "logits/rejected": 4.038663864135742, + "logps/chosen": -441.38323974609375, + "logps/rejected": -1127.3525390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.360583782196045, + "rewards/margins": 32.027435302734375, + "rewards/rejected": -39.388023376464844, + "step": 2317 + }, + { + "epoch": 1.44199066874028, + "grad_norm": 0.0028884029015898705, + "learning_rate": 2.884970032272937e-06, + "logits/chosen": 1.2739320993423462, + "logits/rejected": 4.6315460205078125, + "logps/chosen": -595.5584716796875, + "logps/rejected": -1157.019775390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.700339317321777, + "rewards/margins": 33.080440521240234, + "rewards/rejected": -41.780784606933594, + "step": 2318 + }, + { + "epoch": 1.4426127527216175, + "grad_norm": 0.28165727853775024, + "learning_rate": 2.883817427385892e-06, + "logits/chosen": 1.276587724685669, + "logits/rejected": 3.1559951305389404, + "logps/chosen": -727.439453125, + "logps/rejected": -1092.28369140625, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.861395835876465, + "rewards/margins": 26.8516845703125, + "rewards/rejected": -37.71308135986328, + "step": 2319 + }, + { + "epoch": 1.4432348367029548, + "grad_norm": 0.00015634715964552015, + "learning_rate": 2.8826648224988474e-06, + "logits/chosen": -0.27201998233795166, + "logits/rejected": 4.20585823059082, + "logps/chosen": -564.0928955078125, + "logps/rejected": -1196.4190673828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.63508415222168, + "rewards/margins": 36.40064239501953, + "rewards/rejected": -47.03572463989258, + "step": 2320 + }, + { + "epoch": 1.4438569206842924, + "grad_norm": 59.58882522583008, + "learning_rate": 2.8815122176118026e-06, + "logits/chosen": 0.428855836391449, + "logits/rejected": 2.460395336151123, + "logps/chosen": -533.8165283203125, + "logps/rejected": -808.375244140625, + "loss": 1.2072, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.576982498168945, + "rewards/margins": 16.39482879638672, + "rewards/rejected": -28.971811294555664, + "step": 2321 + }, + { + "epoch": 1.44447900466563, + "grad_norm": 0.0001684689341345802, + "learning_rate": 2.880359612724758e-06, + "logits/chosen": 1.0388171672821045, + "logits/rejected": 1.971908450126648, + "logps/chosen": -688.4964599609375, + "logps/rejected": -1014.7935180664062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.04210090637207, + "rewards/margins": 26.734935760498047, + "rewards/rejected": -38.777034759521484, + "step": 2322 + }, + { + "epoch": 1.4451010886469673, + "grad_norm": 2.0579311239998788e-05, + "learning_rate": 2.879207007837713e-06, + "logits/chosen": -0.8547032475471497, + "logits/rejected": 3.58233380317688, + "logps/chosen": -504.6190490722656, + "logps/rejected": -1175.1273193359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.368605613708496, + "rewards/margins": 39.4583740234375, + "rewards/rejected": -47.82698059082031, + "step": 2323 + }, + { + "epoch": 1.4457231726283049, + "grad_norm": 1.1889311224422272e-07, + "learning_rate": 2.8780544029506687e-06, + "logits/chosen": 0.19701743125915527, + "logits/rejected": 2.2015702724456787, + "logps/chosen": -473.6476745605469, + "logps/rejected": -878.5316772460938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.067363739013672, + "rewards/margins": 26.477081298828125, + "rewards/rejected": -35.54444122314453, + "step": 2324 + }, + { + "epoch": 1.4463452566096424, + "grad_norm": 49.5518798828125, + "learning_rate": 2.876901798063624e-06, + "logits/chosen": 2.072795867919922, + "logits/rejected": 5.290923118591309, + "logps/chosen": -674.4661254882812, + "logps/rejected": -1051.1849365234375, + "loss": 2.2263, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.104040145874023, + "rewards/margins": 23.05379295349121, + "rewards/rejected": -36.157833099365234, + "step": 2325 + }, + { + "epoch": 1.4469673405909798, + "grad_norm": 1.7055537700653076, + "learning_rate": 2.875749193176579e-06, + "logits/chosen": 2.415715456008911, + "logits/rejected": 3.685410499572754, + "logps/chosen": -636.4686889648438, + "logps/rejected": -1046.666015625, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.71038818359375, + "rewards/margins": 28.964778900146484, + "rewards/rejected": -41.67516326904297, + "step": 2326 + }, + { + "epoch": 1.4475894245723173, + "grad_norm": 2.623699799642054e-07, + "learning_rate": 2.8745965882895344e-06, + "logits/chosen": -0.019688010215759277, + "logits/rejected": 1.9653702974319458, + "logps/chosen": -652.8423461914062, + "logps/rejected": -1085.557373046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.358902931213379, + "rewards/margins": 37.280296325683594, + "rewards/rejected": -48.63920211791992, + "step": 2327 + }, + { + "epoch": 1.4482115085536549, + "grad_norm": 0.5979498624801636, + "learning_rate": 2.8734439834024896e-06, + "logits/chosen": 1.2116461992263794, + "logits/rejected": 4.837029457092285, + "logps/chosen": -471.36785888671875, + "logps/rejected": -965.8245849609375, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.9149322509765625, + "rewards/margins": 30.018962860107422, + "rewards/rejected": -36.93389892578125, + "step": 2328 + }, + { + "epoch": 1.4488335925349922, + "grad_norm": 7.315420447184806e-08, + "learning_rate": 2.872291378515445e-06, + "logits/chosen": 1.088128924369812, + "logits/rejected": 3.6922049522399902, + "logps/chosen": -553.4324951171875, + "logps/rejected": -1014.8348999023438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.668688774108887, + "rewards/margins": 28.717819213867188, + "rewards/rejected": -39.38650894165039, + "step": 2329 + }, + { + "epoch": 1.4494556765163298, + "grad_norm": 7.4477925300598145, + "learning_rate": 2.8711387736284e-06, + "logits/chosen": 0.9433234333992004, + "logits/rejected": 2.2451412677764893, + "logps/chosen": -598.5908203125, + "logps/rejected": -948.7597045898438, + "loss": 0.0358, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.012178421020508, + "rewards/margins": 24.346149444580078, + "rewards/rejected": -36.35832595825195, + "step": 2330 + }, + { + "epoch": 1.4500777604976671, + "grad_norm": 0.0051433308981359005, + "learning_rate": 2.8699861687413557e-06, + "logits/chosen": -1.3141753673553467, + "logits/rejected": 3.2086472511291504, + "logps/chosen": -493.376953125, + "logps/rejected": -1056.125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.405976295471191, + "rewards/margins": 25.69782066345215, + "rewards/rejected": -35.10379409790039, + "step": 2331 + }, + { + "epoch": 1.4506998444790047, + "grad_norm": 0.0010937333572655916, + "learning_rate": 2.868833563854311e-06, + "logits/chosen": 1.9706964492797852, + "logits/rejected": 4.621064186096191, + "logps/chosen": -528.7137451171875, + "logps/rejected": -984.925048828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.555310249328613, + "rewards/margins": 31.49054718017578, + "rewards/rejected": -41.045860290527344, + "step": 2332 + }, + { + "epoch": 1.451321928460342, + "grad_norm": 1.4169393580232281e-05, + "learning_rate": 2.867680958967266e-06, + "logits/chosen": -0.5840214490890503, + "logits/rejected": 2.413980007171631, + "logps/chosen": -461.7684020996094, + "logps/rejected": -960.8614501953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.614622116088867, + "rewards/margins": 30.289093017578125, + "rewards/rejected": -36.903717041015625, + "step": 2333 + }, + { + "epoch": 1.4519440124416796, + "grad_norm": 29.227153778076172, + "learning_rate": 2.8665283540802214e-06, + "logits/chosen": -1.9500712156295776, + "logits/rejected": 2.810960292816162, + "logps/chosen": -483.628662109375, + "logps/rejected": -1260.641357421875, + "loss": 0.7312, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.263227462768555, + "rewards/margins": 42.335426330566406, + "rewards/rejected": -54.598655700683594, + "step": 2334 + }, + { + "epoch": 1.4525660964230172, + "grad_norm": 0.21363773941993713, + "learning_rate": 2.8653757491931766e-06, + "logits/chosen": -1.4968746900558472, + "logits/rejected": 4.372405052185059, + "logps/chosen": -456.53900146484375, + "logps/rejected": -1048.5850830078125, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.634637355804443, + "rewards/margins": 28.53285789489746, + "rewards/rejected": -35.16749572753906, + "step": 2335 + }, + { + "epoch": 1.4531881804043545, + "grad_norm": 0.19996534287929535, + "learning_rate": 2.864223144306132e-06, + "logits/chosen": -0.618783712387085, + "logits/rejected": 2.2022411823272705, + "logps/chosen": -461.9523010253906, + "logps/rejected": -760.0111083984375, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.399519920349121, + "rewards/margins": 19.92711639404297, + "rewards/rejected": -27.326637268066406, + "step": 2336 + }, + { + "epoch": 1.453810264385692, + "grad_norm": 0.21690642833709717, + "learning_rate": 2.863070539419087e-06, + "logits/chosen": 0.9195233583450317, + "logits/rejected": 3.0173544883728027, + "logps/chosen": -619.1990966796875, + "logps/rejected": -891.9837036132812, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.781450271606445, + "rewards/margins": 20.42620849609375, + "rewards/rejected": -32.20765686035156, + "step": 2337 + }, + { + "epoch": 1.4544323483670296, + "grad_norm": 0.003023615339770913, + "learning_rate": 2.8619179345320423e-06, + "logits/chosen": -0.1441906839609146, + "logits/rejected": 4.001985549926758, + "logps/chosen": -365.0175476074219, + "logps/rejected": -927.1415405273438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.58237075805664, + "rewards/margins": 24.85255241394043, + "rewards/rejected": -33.43492126464844, + "step": 2338 + }, + { + "epoch": 1.455054432348367, + "grad_norm": 0.0015379984397441149, + "learning_rate": 2.860765329644998e-06, + "logits/chosen": 3.7588369846343994, + "logits/rejected": 4.855842113494873, + "logps/chosen": -703.90234375, + "logps/rejected": -1005.2516479492188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.806050300598145, + "rewards/margins": 25.48776626586914, + "rewards/rejected": -35.29381561279297, + "step": 2339 + }, + { + "epoch": 1.4556765163297045, + "grad_norm": 0.0017797609325498343, + "learning_rate": 2.859612724757953e-06, + "logits/chosen": 0.6893689036369324, + "logits/rejected": 2.4680938720703125, + "logps/chosen": -492.37890625, + "logps/rejected": -1000.2102661132812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.166633605957031, + "rewards/margins": 31.155921936035156, + "rewards/rejected": -40.32255172729492, + "step": 2340 + }, + { + "epoch": 1.456298600311042, + "grad_norm": 9.442226655664854e-06, + "learning_rate": 2.8584601198709084e-06, + "logits/chosen": -2.1514220237731934, + "logits/rejected": 3.0572056770324707, + "logps/chosen": -334.06134033203125, + "logps/rejected": -919.5453491210938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.310533046722412, + "rewards/margins": 28.655559539794922, + "rewards/rejected": -34.96609115600586, + "step": 2341 + }, + { + "epoch": 1.4569206842923794, + "grad_norm": 4.326364040374756, + "learning_rate": 2.8573075149838636e-06, + "logits/chosen": 3.9358277320861816, + "logits/rejected": 4.717712879180908, + "logps/chosen": -596.55810546875, + "logps/rejected": -774.9546508789062, + "loss": 0.0755, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.885163307189941, + "rewards/margins": 19.06470489501953, + "rewards/rejected": -26.949867248535156, + "step": 2342 + }, + { + "epoch": 1.457542768273717, + "grad_norm": 0.00039138575084507465, + "learning_rate": 2.856154910096819e-06, + "logits/chosen": 1.4116837978363037, + "logits/rejected": 3.5277867317199707, + "logps/chosen": -538.282958984375, + "logps/rejected": -876.333740234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.566531658172607, + "rewards/margins": 24.43173599243164, + "rewards/rejected": -31.99827003479004, + "step": 2343 + }, + { + "epoch": 1.4581648522550545, + "grad_norm": 2.0726189613342285, + "learning_rate": 2.855002305209774e-06, + "logits/chosen": 2.4261767864227295, + "logits/rejected": 4.365809440612793, + "logps/chosen": -493.1600646972656, + "logps/rejected": -836.7879028320312, + "loss": 0.0091, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.41954231262207, + "rewards/margins": 22.17431640625, + "rewards/rejected": -29.593856811523438, + "step": 2344 + }, + { + "epoch": 1.4587869362363919, + "grad_norm": 0.3835785388946533, + "learning_rate": 2.8538497003227293e-06, + "logits/chosen": -0.7283627390861511, + "logits/rejected": 2.8423304557800293, + "logps/chosen": -544.6996459960938, + "logps/rejected": -972.6627197265625, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.42279577255249, + "rewards/margins": 25.784692764282227, + "rewards/rejected": -32.207489013671875, + "step": 2345 + }, + { + "epoch": 1.4594090202177294, + "grad_norm": 0.3456924855709076, + "learning_rate": 2.852697095435685e-06, + "logits/chosen": 1.5055906772613525, + "logits/rejected": 3.6082897186279297, + "logps/chosen": -594.1642456054688, + "logps/rejected": -965.9072875976562, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.597909927368164, + "rewards/margins": 24.235122680664062, + "rewards/rejected": -32.83303451538086, + "step": 2346 + }, + { + "epoch": 1.460031104199067, + "grad_norm": 0.37873873114585876, + "learning_rate": 2.85154449054864e-06, + "logits/chosen": 0.9466390609741211, + "logits/rejected": 3.1475629806518555, + "logps/chosen": -460.72149658203125, + "logps/rejected": -825.5015869140625, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.996981620788574, + "rewards/margins": 24.10140609741211, + "rewards/rejected": -33.098388671875, + "step": 2347 + }, + { + "epoch": 1.4606531881804043, + "grad_norm": 9.690855979919434, + "learning_rate": 2.8503918856615954e-06, + "logits/chosen": -2.765359401702881, + "logits/rejected": 1.0825361013412476, + "logps/chosen": -454.8573303222656, + "logps/rejected": -916.665283203125, + "loss": 0.0908, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.064667224884033, + "rewards/margins": 24.96207618713379, + "rewards/rejected": -32.0267448425293, + "step": 2348 + }, + { + "epoch": 1.461275272161742, + "grad_norm": 0.14686737954616547, + "learning_rate": 2.8492392807745506e-06, + "logits/chosen": -0.8677732944488525, + "logits/rejected": 2.7581467628479004, + "logps/chosen": -310.7578125, + "logps/rejected": -701.1024780273438, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.958008289337158, + "rewards/margins": 18.651893615722656, + "rewards/rejected": -23.609901428222656, + "step": 2349 + }, + { + "epoch": 1.4618973561430793, + "grad_norm": 0.001136072096414864, + "learning_rate": 2.848086675887506e-06, + "logits/chosen": 1.9676111936569214, + "logits/rejected": 4.360522270202637, + "logps/chosen": -757.5195922851562, + "logps/rejected": -1163.509521484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.460611343383789, + "rewards/margins": 27.386371612548828, + "rewards/rejected": -41.84698486328125, + "step": 2350 + }, + { + "epoch": 1.4625194401244168, + "grad_norm": 0.0009974318090826273, + "learning_rate": 2.846934071000461e-06, + "logits/chosen": 2.166215419769287, + "logits/rejected": 4.188321113586426, + "logps/chosen": -620.139892578125, + "logps/rejected": -986.1962890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.4574174880981445, + "rewards/margins": 23.120960235595703, + "rewards/rejected": -30.578380584716797, + "step": 2351 + }, + { + "epoch": 1.4631415241057542, + "grad_norm": 1.2499357461929321, + "learning_rate": 2.8457814661134163e-06, + "logits/chosen": 0.13246458768844604, + "logits/rejected": 2.2372920513153076, + "logps/chosen": -439.4615173339844, + "logps/rejected": -887.6943359375, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.784399032592773, + "rewards/margins": 30.512107849121094, + "rewards/rejected": -39.2965087890625, + "step": 2352 + }, + { + "epoch": 1.4637636080870917, + "grad_norm": 35.474639892578125, + "learning_rate": 2.844628861226372e-06, + "logits/chosen": 1.5007219314575195, + "logits/rejected": 3.3255417346954346, + "logps/chosen": -552.41259765625, + "logps/rejected": -902.187255859375, + "loss": 0.5765, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.2433037757873535, + "rewards/margins": 23.953628540039062, + "rewards/rejected": -30.19693374633789, + "step": 2353 + }, + { + "epoch": 1.4643856920684293, + "grad_norm": 0.017408454790711403, + "learning_rate": 2.843476256339327e-06, + "logits/chosen": -4.301645278930664, + "logits/rejected": 0.49649691581726074, + "logps/chosen": -350.7183532714844, + "logps/rejected": -843.7274169921875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.945578575134277, + "rewards/margins": 28.868030548095703, + "rewards/rejected": -33.8136100769043, + "step": 2354 + }, + { + "epoch": 1.4650077760497666, + "grad_norm": 3.2871270179748535, + "learning_rate": 2.8423236514522824e-06, + "logits/chosen": 0.1433262825012207, + "logits/rejected": 1.8936108350753784, + "logps/chosen": -549.3184814453125, + "logps/rejected": -852.3326416015625, + "loss": 0.0131, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.849102973937988, + "rewards/margins": 22.03537940979004, + "rewards/rejected": -33.88447952270508, + "step": 2355 + }, + { + "epoch": 1.4656298600311042, + "grad_norm": 3.6574772821040824e-05, + "learning_rate": 2.8411710465652376e-06, + "logits/chosen": -2.466205596923828, + "logits/rejected": 2.333536148071289, + "logps/chosen": -354.087890625, + "logps/rejected": -1063.402099609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.740349292755127, + "rewards/margins": 38.1416015625, + "rewards/rejected": -44.881954193115234, + "step": 2356 + }, + { + "epoch": 1.4662519440124417, + "grad_norm": 0.0006809753249399364, + "learning_rate": 2.840018441678193e-06, + "logits/chosen": 1.7734744548797607, + "logits/rejected": 4.814844131469727, + "logps/chosen": -519.6259155273438, + "logps/rejected": -1013.4092407226562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.913437843322754, + "rewards/margins": 28.07283592224121, + "rewards/rejected": -35.98627471923828, + "step": 2357 + }, + { + "epoch": 1.466874027993779, + "grad_norm": 0.11370241641998291, + "learning_rate": 2.838865836791148e-06, + "logits/chosen": 1.595003366470337, + "logits/rejected": 3.9001097679138184, + "logps/chosen": -567.801513671875, + "logps/rejected": -948.3878784179688, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.882749557495117, + "rewards/margins": 24.700820922851562, + "rewards/rejected": -31.583572387695312, + "step": 2358 + }, + { + "epoch": 1.4674961119751166, + "grad_norm": 18.743146896362305, + "learning_rate": 2.8377132319041033e-06, + "logits/chosen": 0.6721723079681396, + "logits/rejected": 3.64046049118042, + "logps/chosen": -472.2651672363281, + "logps/rejected": -921.5863037109375, + "loss": 0.1213, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.745011329650879, + "rewards/margins": 23.231292724609375, + "rewards/rejected": -31.976303100585938, + "step": 2359 + }, + { + "epoch": 1.4681181959564542, + "grad_norm": 0.0008493398199789226, + "learning_rate": 2.836560627017059e-06, + "logits/chosen": 0.07488232851028442, + "logits/rejected": 2.289384603500366, + "logps/chosen": -517.2086181640625, + "logps/rejected": -871.466064453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.47705602645874, + "rewards/margins": 22.173673629760742, + "rewards/rejected": -29.65073013305664, + "step": 2360 + }, + { + "epoch": 1.4687402799377915, + "grad_norm": 0.00484496122226119, + "learning_rate": 2.835408022130014e-06, + "logits/chosen": -0.6169958114624023, + "logits/rejected": 3.451166868209839, + "logps/chosen": -461.492431640625, + "logps/rejected": -915.6463623046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.851531028747559, + "rewards/margins": 24.448511123657227, + "rewards/rejected": -33.30004119873047, + "step": 2361 + }, + { + "epoch": 1.469362363919129, + "grad_norm": 0.004142931196838617, + "learning_rate": 2.8342554172429694e-06, + "logits/chosen": 2.3081841468811035, + "logits/rejected": 2.309966564178467, + "logps/chosen": -713.6710205078125, + "logps/rejected": -979.0992431640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.44482421875, + "rewards/margins": 25.762096405029297, + "rewards/rejected": -35.2069206237793, + "step": 2362 + }, + { + "epoch": 1.4699844479004667, + "grad_norm": 22.25066566467285, + "learning_rate": 2.8331028123559246e-06, + "logits/chosen": -2.046438694000244, + "logits/rejected": 2.8966598510742188, + "logps/chosen": -330.8453369140625, + "logps/rejected": -708.985595703125, + "loss": 0.1171, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.468657493591309, + "rewards/margins": 15.020978927612305, + "rewards/rejected": -21.489635467529297, + "step": 2363 + }, + { + "epoch": 1.470606531881804, + "grad_norm": 0.6886146664619446, + "learning_rate": 2.83195020746888e-06, + "logits/chosen": 0.38471102714538574, + "logits/rejected": 3.2078094482421875, + "logps/chosen": -600.56396484375, + "logps/rejected": -885.523193359375, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.067018508911133, + "rewards/margins": 15.421733856201172, + "rewards/rejected": -24.488752365112305, + "step": 2364 + }, + { + "epoch": 1.4712286158631416, + "grad_norm": 0.13812090456485748, + "learning_rate": 2.830797602581835e-06, + "logits/chosen": 0.7420355081558228, + "logits/rejected": 1.6655001640319824, + "logps/chosen": -575.5955810546875, + "logps/rejected": -820.6859130859375, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.355046272277832, + "rewards/margins": 20.670995712280273, + "rewards/rejected": -31.02604103088379, + "step": 2365 + }, + { + "epoch": 1.4718506998444791, + "grad_norm": 4.970563531969674e-05, + "learning_rate": 2.8296449976947903e-06, + "logits/chosen": -0.8490327596664429, + "logits/rejected": 3.9662766456604004, + "logps/chosen": -535.6664428710938, + "logps/rejected": -1070.893310546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.740767478942871, + "rewards/margins": 26.906150817871094, + "rewards/rejected": -36.64691925048828, + "step": 2366 + }, + { + "epoch": 1.4724727838258165, + "grad_norm": 0.0006937950383871794, + "learning_rate": 2.8284923928077455e-06, + "logits/chosen": -0.45125436782836914, + "logits/rejected": 2.927877902984619, + "logps/chosen": -407.20904541015625, + "logps/rejected": -850.866943359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.919216156005859, + "rewards/margins": 28.38515281677246, + "rewards/rejected": -34.30436706542969, + "step": 2367 + }, + { + "epoch": 1.473094867807154, + "grad_norm": 4.14547061920166, + "learning_rate": 2.827339787920701e-06, + "logits/chosen": 1.7581843137741089, + "logits/rejected": 3.994076728820801, + "logps/chosen": -676.138671875, + "logps/rejected": -961.73974609375, + "loss": 0.0235, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.300919532775879, + "rewards/margins": 17.93962860107422, + "rewards/rejected": -29.24054527282715, + "step": 2368 + }, + { + "epoch": 1.4737169517884914, + "grad_norm": 7.2804209594323765e-06, + "learning_rate": 2.8261871830336564e-06, + "logits/chosen": 0.9338542222976685, + "logits/rejected": 2.5756583213806152, + "logps/chosen": -636.443603515625, + "logps/rejected": -1058.41796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.650700569152832, + "rewards/margins": 31.83884048461914, + "rewards/rejected": -40.489540100097656, + "step": 2369 + }, + { + "epoch": 1.474339035769829, + "grad_norm": 0.13495945930480957, + "learning_rate": 2.8250345781466116e-06, + "logits/chosen": -0.1379644274711609, + "logits/rejected": 4.099135398864746, + "logps/chosen": -435.06072998046875, + "logps/rejected": -1011.3875732421875, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.849440574645996, + "rewards/margins": 29.290332794189453, + "rewards/rejected": -34.139774322509766, + "step": 2370 + }, + { + "epoch": 1.4749611197511663, + "grad_norm": 34.87321853637695, + "learning_rate": 2.823881973259567e-06, + "logits/chosen": 0.5016416907310486, + "logits/rejected": 3.4392948150634766, + "logps/chosen": -574.3455810546875, + "logps/rejected": -914.9444580078125, + "loss": 0.8498, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.708126068115234, + "rewards/margins": 23.428733825683594, + "rewards/rejected": -33.136863708496094, + "step": 2371 + }, + { + "epoch": 1.4755832037325038, + "grad_norm": 4.791884862243023e-07, + "learning_rate": 2.822729368372522e-06, + "logits/chosen": 2.531529188156128, + "logits/rejected": 3.9152517318725586, + "logps/chosen": -632.7254638671875, + "logps/rejected": -985.106201171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.28458309173584, + "rewards/margins": 27.402944564819336, + "rewards/rejected": -34.687530517578125, + "step": 2372 + }, + { + "epoch": 1.4762052877138414, + "grad_norm": 0.00026815864839591086, + "learning_rate": 2.8215767634854773e-06, + "logits/chosen": 0.4294940233230591, + "logits/rejected": 2.460738182067871, + "logps/chosen": -562.446044921875, + "logps/rejected": -992.881591796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.796026229858398, + "rewards/margins": 30.108896255493164, + "rewards/rejected": -37.90492248535156, + "step": 2373 + }, + { + "epoch": 1.4768273716951787, + "grad_norm": 16.857969284057617, + "learning_rate": 2.8204241585984325e-06, + "logits/chosen": 0.3010619282722473, + "logits/rejected": 2.009814739227295, + "logps/chosen": -580.1434326171875, + "logps/rejected": -995.7860107421875, + "loss": 0.1045, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.051149368286133, + "rewards/margins": 28.202312469482422, + "rewards/rejected": -39.25346374511719, + "step": 2374 + }, + { + "epoch": 1.4774494556765163, + "grad_norm": 0.00010196808580076322, + "learning_rate": 2.819271553711388e-06, + "logits/chosen": 0.31497353315353394, + "logits/rejected": 2.954072952270508, + "logps/chosen": -638.5074462890625, + "logps/rejected": -1035.220703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.442541122436523, + "rewards/margins": 24.339162826538086, + "rewards/rejected": -36.781707763671875, + "step": 2375 + }, + { + "epoch": 1.4780715396578539, + "grad_norm": 16.672515869140625, + "learning_rate": 2.8181189488243434e-06, + "logits/chosen": 0.9463157057762146, + "logits/rejected": 2.8403208255767822, + "logps/chosen": -544.5169677734375, + "logps/rejected": -791.8751831054688, + "loss": 0.0848, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.374601364135742, + "rewards/margins": 18.103866577148438, + "rewards/rejected": -25.47846794128418, + "step": 2376 + }, + { + "epoch": 1.4786936236391912, + "grad_norm": 0.0009514009580016136, + "learning_rate": 2.8169663439372986e-06, + "logits/chosen": 0.3465360999107361, + "logits/rejected": 2.0250134468078613, + "logps/chosen": -541.4112548828125, + "logps/rejected": -1036.6761474609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.47393798828125, + "rewards/margins": 31.21034049987793, + "rewards/rejected": -43.68428039550781, + "step": 2377 + }, + { + "epoch": 1.4793157076205288, + "grad_norm": 0.003587897401303053, + "learning_rate": 2.815813739050254e-06, + "logits/chosen": 0.45701614022254944, + "logits/rejected": 4.645224571228027, + "logps/chosen": -472.3916931152344, + "logps/rejected": -922.6441650390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.343234539031982, + "rewards/margins": 19.303733825683594, + "rewards/rejected": -26.646968841552734, + "step": 2378 + }, + { + "epoch": 1.4799377916018663, + "grad_norm": 1.2445507049560547, + "learning_rate": 2.814661134163209e-06, + "logits/chosen": 0.3036624789237976, + "logits/rejected": 4.275981426239014, + "logps/chosen": -469.282958984375, + "logps/rejected": -1064.331298828125, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.483030319213867, + "rewards/margins": 35.476776123046875, + "rewards/rejected": -43.95980453491211, + "step": 2379 + }, + { + "epoch": 1.4805598755832037, + "grad_norm": 6.064235549274599e-07, + "learning_rate": 2.8135085292761642e-06, + "logits/chosen": -0.9457730054855347, + "logits/rejected": 3.1594090461730957, + "logps/chosen": -401.4171142578125, + "logps/rejected": -926.5023803710938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.529729127883911, + "rewards/margins": 27.780242919921875, + "rewards/rejected": -30.309972763061523, + "step": 2380 + }, + { + "epoch": 1.4811819595645412, + "grad_norm": 1.4774746894836426, + "learning_rate": 2.8123559243891195e-06, + "logits/chosen": -1.9062919616699219, + "logits/rejected": 4.1249895095825195, + "logps/chosen": -458.60137939453125, + "logps/rejected": -1110.5926513671875, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.662704467773438, + "rewards/margins": 28.99850845336914, + "rewards/rejected": -38.66121292114258, + "step": 2381 + }, + { + "epoch": 1.4818040435458788, + "grad_norm": 0.015088031068444252, + "learning_rate": 2.811203319502075e-06, + "logits/chosen": -0.5311957597732544, + "logits/rejected": 0.9081388115882874, + "logps/chosen": -559.6624755859375, + "logps/rejected": -943.973876953125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.011931419372559, + "rewards/margins": 30.31785774230957, + "rewards/rejected": -39.32978820800781, + "step": 2382 + }, + { + "epoch": 1.4824261275272161, + "grad_norm": 0.1541348546743393, + "learning_rate": 2.8100507146150303e-06, + "logits/chosen": 0.4068309962749481, + "logits/rejected": 2.128772735595703, + "logps/chosen": -577.6309204101562, + "logps/rejected": -1012.070556640625, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.201326370239258, + "rewards/margins": 29.582557678222656, + "rewards/rejected": -37.78388214111328, + "step": 2383 + }, + { + "epoch": 1.4830482115085537, + "grad_norm": 9.980174464629954e-10, + "learning_rate": 2.8088981097279856e-06, + "logits/chosen": -1.31401789188385, + "logits/rejected": 2.629791021347046, + "logps/chosen": -518.3517456054688, + "logps/rejected": -1158.66015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.827581405639648, + "rewards/margins": 32.846656799316406, + "rewards/rejected": -46.67423629760742, + "step": 2384 + }, + { + "epoch": 1.4836702954898913, + "grad_norm": 54.381248474121094, + "learning_rate": 2.807745504840941e-06, + "logits/chosen": -0.3003752827644348, + "logits/rejected": 3.1268844604492188, + "logps/chosen": -413.068603515625, + "logps/rejected": -895.3505859375, + "loss": 1.0718, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.030431747436523, + "rewards/margins": 25.447994232177734, + "rewards/rejected": -33.478424072265625, + "step": 2385 + }, + { + "epoch": 1.4842923794712286, + "grad_norm": 19.62732696533203, + "learning_rate": 2.806592899953896e-06, + "logits/chosen": 0.8926951885223389, + "logits/rejected": 2.025569438934326, + "logps/chosen": -564.5762329101562, + "logps/rejected": -894.4605712890625, + "loss": 0.0952, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.947629928588867, + "rewards/margins": 26.661720275878906, + "rewards/rejected": -37.609352111816406, + "step": 2386 + }, + { + "epoch": 1.4849144634525662, + "grad_norm": 0.08074700832366943, + "learning_rate": 2.8054402950668512e-06, + "logits/chosen": -2.5688016414642334, + "logits/rejected": 2.8136441707611084, + "logps/chosen": -407.8417053222656, + "logps/rejected": -908.9759521484375, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.713031768798828, + "rewards/margins": 24.314756393432617, + "rewards/rejected": -30.027786254882812, + "step": 2387 + }, + { + "epoch": 1.4855365474339035, + "grad_norm": 6.001190211435414e-09, + "learning_rate": 2.8042876901798065e-06, + "logits/chosen": 0.46303045749664307, + "logits/rejected": 4.923661231994629, + "logps/chosen": -580.6307373046875, + "logps/rejected": -1194.9835205078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.170421600341797, + "rewards/margins": 36.211002349853516, + "rewards/rejected": -45.38142395019531, + "step": 2388 + }, + { + "epoch": 1.486158631415241, + "grad_norm": 3.6265132427215576, + "learning_rate": 2.8031350852927617e-06, + "logits/chosen": 0.7024605870246887, + "logits/rejected": 3.2725701332092285, + "logps/chosen": -553.1900024414062, + "logps/rejected": -950.600830078125, + "loss": 0.0149, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.3401360511779785, + "rewards/margins": 20.61620330810547, + "rewards/rejected": -26.956340789794922, + "step": 2389 + }, + { + "epoch": 1.4867807153965784, + "grad_norm": 29.599863052368164, + "learning_rate": 2.8019824804057173e-06, + "logits/chosen": 0.4895651340484619, + "logits/rejected": 2.5251917839050293, + "logps/chosen": -574.4183959960938, + "logps/rejected": -955.8038330078125, + "loss": 0.1737, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.665485382080078, + "rewards/margins": 21.212478637695312, + "rewards/rejected": -34.877960205078125, + "step": 2390 + }, + { + "epoch": 1.487402799377916, + "grad_norm": 0.0006918342551216483, + "learning_rate": 2.8008298755186726e-06, + "logits/chosen": -0.6636053323745728, + "logits/rejected": 2.680243968963623, + "logps/chosen": -415.1027526855469, + "logps/rejected": -902.0926513671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.417555809020996, + "rewards/margins": 26.878646850585938, + "rewards/rejected": -37.296199798583984, + "step": 2391 + }, + { + "epoch": 1.4880248833592535, + "grad_norm": 0.006915316917002201, + "learning_rate": 2.799677270631628e-06, + "logits/chosen": -3.2200422286987305, + "logits/rejected": 0.09259417653083801, + "logps/chosen": -331.16986083984375, + "logps/rejected": -823.7048950195312, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.002388954162598, + "rewards/margins": 33.96484375, + "rewards/rejected": -40.96723556518555, + "step": 2392 + }, + { + "epoch": 1.4886469673405909, + "grad_norm": 22.634033203125, + "learning_rate": 2.798524665744583e-06, + "logits/chosen": -2.392181873321533, + "logits/rejected": 3.8128175735473633, + "logps/chosen": -507.95855712890625, + "logps/rejected": -1193.1182861328125, + "loss": 0.1397, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.660463809967041, + "rewards/margins": 31.40123748779297, + "rewards/rejected": -38.061702728271484, + "step": 2393 + }, + { + "epoch": 1.4892690513219284, + "grad_norm": 8.997950553894043, + "learning_rate": 2.7973720608575382e-06, + "logits/chosen": -0.7331479787826538, + "logits/rejected": 4.469404697418213, + "logps/chosen": -514.8806762695312, + "logps/rejected": -1097.8677978515625, + "loss": 0.048, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.211360931396484, + "rewards/margins": 28.35735321044922, + "rewards/rejected": -35.5687141418457, + "step": 2394 + }, + { + "epoch": 1.489891135303266, + "grad_norm": 13.998196601867676, + "learning_rate": 2.7962194559704935e-06, + "logits/chosen": 1.6632962226867676, + "logits/rejected": 5.319121360778809, + "logps/chosen": -552.4025268554688, + "logps/rejected": -1164.188720703125, + "loss": 0.0704, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.791094779968262, + "rewards/margins": 33.89095687866211, + "rewards/rejected": -44.68205261230469, + "step": 2395 + }, + { + "epoch": 1.4905132192846033, + "grad_norm": 39.58431625366211, + "learning_rate": 2.7950668510834487e-06, + "logits/chosen": -0.7600829005241394, + "logits/rejected": 4.01608419418335, + "logps/chosen": -558.3819580078125, + "logps/rejected": -1057.7418212890625, + "loss": 0.2663, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.818014621734619, + "rewards/margins": 28.740753173828125, + "rewards/rejected": -35.55876922607422, + "step": 2396 + }, + { + "epoch": 1.491135303265941, + "grad_norm": 0.0052307723090052605, + "learning_rate": 2.7939142461964043e-06, + "logits/chosen": -0.843339204788208, + "logits/rejected": 2.3956422805786133, + "logps/chosen": -650.19873046875, + "logps/rejected": -1152.5457763671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.485787391662598, + "rewards/margins": 31.080204010009766, + "rewards/rejected": -45.56599426269531, + "step": 2397 + }, + { + "epoch": 1.4917573872472785, + "grad_norm": 6.963534815440653e-08, + "learning_rate": 2.7927616413093596e-06, + "logits/chosen": -2.2730250358581543, + "logits/rejected": 2.440248489379883, + "logps/chosen": -517.8671875, + "logps/rejected": -1140.94873046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.223237037658691, + "rewards/margins": 35.51857376098633, + "rewards/rejected": -46.74181365966797, + "step": 2398 + }, + { + "epoch": 1.4923794712286158, + "grad_norm": 11.254664421081543, + "learning_rate": 2.791609036422315e-06, + "logits/chosen": 0.6596075892448425, + "logits/rejected": 2.2641704082489014, + "logps/chosen": -540.0404052734375, + "logps/rejected": -805.3961181640625, + "loss": 0.1092, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.902894973754883, + "rewards/margins": 15.764644622802734, + "rewards/rejected": -25.667539596557617, + "step": 2399 + }, + { + "epoch": 1.4930015552099534, + "grad_norm": 14.557866096496582, + "learning_rate": 2.79045643153527e-06, + "logits/chosen": 0.24826712906360626, + "logits/rejected": 2.864927291870117, + "logps/chosen": -531.9868774414062, + "logps/rejected": -907.9967041015625, + "loss": 0.123, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.463442802429199, + "rewards/margins": 25.134521484375, + "rewards/rejected": -32.59796142578125, + "step": 2400 + }, + { + "epoch": 1.493623639191291, + "grad_norm": 17.49602699279785, + "learning_rate": 2.7893038266482252e-06, + "logits/chosen": 1.243196725845337, + "logits/rejected": 4.108986854553223, + "logps/chosen": -593.2510986328125, + "logps/rejected": -947.8580322265625, + "loss": 0.0633, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.749677658081055, + "rewards/margins": 21.538341522216797, + "rewards/rejected": -32.288021087646484, + "step": 2401 + }, + { + "epoch": 1.4942457231726283, + "grad_norm": 9.584007329976885e-07, + "learning_rate": 2.7881512217611805e-06, + "logits/chosen": 2.3788280487060547, + "logits/rejected": 3.814729690551758, + "logps/chosen": -558.6822509765625, + "logps/rejected": -915.1446533203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.254256248474121, + "rewards/margins": 29.16737937927246, + "rewards/rejected": -38.42163848876953, + "step": 2402 + }, + { + "epoch": 1.4948678071539658, + "grad_norm": 0.0007437972817569971, + "learning_rate": 2.7869986168741357e-06, + "logits/chosen": -2.4514694213867188, + "logits/rejected": 2.2673568725585938, + "logps/chosen": -317.77044677734375, + "logps/rejected": -846.106689453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.250749588012695, + "rewards/margins": 26.648483276367188, + "rewards/rejected": -31.899234771728516, + "step": 2403 + }, + { + "epoch": 1.4954898911353034, + "grad_norm": 2.4952751118689775e-05, + "learning_rate": 2.7858460119870913e-06, + "logits/chosen": 0.03027331829071045, + "logits/rejected": 3.5894298553466797, + "logps/chosen": -426.458251953125, + "logps/rejected": -831.8810424804688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.599156379699707, + "rewards/margins": 19.644079208374023, + "rewards/rejected": -28.243236541748047, + "step": 2404 + }, + { + "epoch": 1.4961119751166407, + "grad_norm": 0.021602489054203033, + "learning_rate": 2.7846934071000466e-06, + "logits/chosen": -0.24435366690158844, + "logits/rejected": 2.2700464725494385, + "logps/chosen": -568.7886962890625, + "logps/rejected": -955.20751953125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.295473098754883, + "rewards/margins": 25.87757682800293, + "rewards/rejected": -38.17304992675781, + "step": 2405 + }, + { + "epoch": 1.4967340590979783, + "grad_norm": 49.02490997314453, + "learning_rate": 2.7835408022130018e-06, + "logits/chosen": -0.45938390493392944, + "logits/rejected": 1.8616430759429932, + "logps/chosen": -490.0686340332031, + "logps/rejected": -981.237060546875, + "loss": 0.4472, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.393717765808105, + "rewards/margins": 27.700580596923828, + "rewards/rejected": -36.094303131103516, + "step": 2406 + }, + { + "epoch": 1.4973561430793156, + "grad_norm": 1.6386419534683228, + "learning_rate": 2.782388197325957e-06, + "logits/chosen": -0.5539921522140503, + "logits/rejected": 2.1543989181518555, + "logps/chosen": -465.9166259765625, + "logps/rejected": -878.362548828125, + "loss": 0.0072, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.275735378265381, + "rewards/margins": 26.394481658935547, + "rewards/rejected": -32.67021560668945, + "step": 2407 + }, + { + "epoch": 1.4979782270606532, + "grad_norm": 0.004477610811591148, + "learning_rate": 2.7812355924389122e-06, + "logits/chosen": -0.5151447057723999, + "logits/rejected": 3.5004169940948486, + "logps/chosen": -492.4542236328125, + "logps/rejected": -857.807861328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.714780807495117, + "rewards/margins": 21.09079360961914, + "rewards/rejected": -31.805574417114258, + "step": 2408 + }, + { + "epoch": 1.4986003110419905, + "grad_norm": 0.049627888947725296, + "learning_rate": 2.7800829875518675e-06, + "logits/chosen": 2.9913344383239746, + "logits/rejected": 3.396819591522217, + "logps/chosen": -797.1928100585938, + "logps/rejected": -1062.549072265625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.408965110778809, + "rewards/margins": 27.537857055664062, + "rewards/rejected": -39.94682312011719, + "step": 2409 + }, + { + "epoch": 1.499222395023328, + "grad_norm": 0.9994378089904785, + "learning_rate": 2.7789303826648227e-06, + "logits/chosen": -0.025385677814483643, + "logits/rejected": 2.2981178760528564, + "logps/chosen": -588.76025390625, + "logps/rejected": -896.0594482421875, + "loss": 0.0077, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.430543899536133, + "rewards/margins": 17.48183250427246, + "rewards/rejected": -28.912376403808594, + "step": 2410 + }, + { + "epoch": 1.4998444790046657, + "grad_norm": 3.7340683937072754, + "learning_rate": 2.7777777777777783e-06, + "logits/chosen": -2.0321695804595947, + "logits/rejected": 1.818973183631897, + "logps/chosen": -518.645751953125, + "logps/rejected": -1016.1298217773438, + "loss": 0.0147, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.84179973602295, + "rewards/margins": 25.71894645690918, + "rewards/rejected": -35.56074523925781, + "step": 2411 + }, + { + "epoch": 1.500466562986003, + "grad_norm": 0.8598123788833618, + "learning_rate": 2.7766251728907336e-06, + "logits/chosen": 2.643171787261963, + "logits/rejected": 3.5609869956970215, + "logps/chosen": -689.77197265625, + "logps/rejected": -989.2908935546875, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.747472763061523, + "rewards/margins": 24.951494216918945, + "rewards/rejected": -38.69896697998047, + "step": 2412 + }, + { + "epoch": 1.5010886469673406, + "grad_norm": 0.0394817516207695, + "learning_rate": 2.7754725680036888e-06, + "logits/chosen": -0.7106583714485168, + "logits/rejected": 1.8233951330184937, + "logps/chosen": -603.9002685546875, + "logps/rejected": -1166.8470458984375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.380064964294434, + "rewards/margins": 37.42149353027344, + "rewards/rejected": -46.80155944824219, + "step": 2413 + }, + { + "epoch": 1.5017107309486781, + "grad_norm": 0.15796667337417603, + "learning_rate": 2.774319963116644e-06, + "logits/chosen": 0.41977572441101074, + "logits/rejected": 2.346480369567871, + "logps/chosen": -530.1688232421875, + "logps/rejected": -940.331298828125, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.626280784606934, + "rewards/margins": 26.990314483642578, + "rewards/rejected": -37.61659622192383, + "step": 2414 + }, + { + "epoch": 1.5023328149300155, + "grad_norm": 35.47513198852539, + "learning_rate": 2.7731673582295992e-06, + "logits/chosen": 0.3368704319000244, + "logits/rejected": 2.681093454360962, + "logps/chosen": -392.08807373046875, + "logps/rejected": -673.9613647460938, + "loss": 0.343, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.13431453704834, + "rewards/margins": 20.258684158325195, + "rewards/rejected": -28.39299964904785, + "step": 2415 + }, + { + "epoch": 1.502954898911353, + "grad_norm": 0.008840728551149368, + "learning_rate": 2.7720147533425545e-06, + "logits/chosen": 1.6901668310165405, + "logits/rejected": 3.677003860473633, + "logps/chosen": -688.974609375, + "logps/rejected": -1004.6647338867188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.885746002197266, + "rewards/margins": 22.908794403076172, + "rewards/rejected": -33.79454040527344, + "step": 2416 + }, + { + "epoch": 1.5035769828926906, + "grad_norm": 0.00016085940296761692, + "learning_rate": 2.7708621484555097e-06, + "logits/chosen": -0.6000373363494873, + "logits/rejected": 2.4729881286621094, + "logps/chosen": -447.89678955078125, + "logps/rejected": -982.514892578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.45212173461914, + "rewards/margins": 33.85790252685547, + "rewards/rejected": -42.310020446777344, + "step": 2417 + }, + { + "epoch": 1.504199066874028, + "grad_norm": 6.961882172618061e-05, + "learning_rate": 2.769709543568465e-06, + "logits/chosen": 1.9195138216018677, + "logits/rejected": 3.281461715698242, + "logps/chosen": -664.70751953125, + "logps/rejected": -1080.923095703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.098590850830078, + "rewards/margins": 28.69293212890625, + "rewards/rejected": -38.79152297973633, + "step": 2418 + }, + { + "epoch": 1.5048211508553655, + "grad_norm": 0.03905373439192772, + "learning_rate": 2.7685569386814206e-06, + "logits/chosen": 0.5288473963737488, + "logits/rejected": 2.606009006500244, + "logps/chosen": -588.6915283203125, + "logps/rejected": -1013.2470092773438, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.714137077331543, + "rewards/margins": 31.635147094726562, + "rewards/rejected": -41.34928512573242, + "step": 2419 + }, + { + "epoch": 1.505443234836703, + "grad_norm": 51.066200256347656, + "learning_rate": 2.7674043337943758e-06, + "logits/chosen": 0.3447500765323639, + "logits/rejected": 2.5495049953460693, + "logps/chosen": -701.102294921875, + "logps/rejected": -1111.52685546875, + "loss": 0.7873, + "rewards/accuracies": 0.875, + "rewards/chosen": -15.652717590332031, + "rewards/margins": 34.65187072753906, + "rewards/rejected": -50.30458450317383, + "step": 2420 + }, + { + "epoch": 1.5060653188180404, + "grad_norm": 4.544334411621094, + "learning_rate": 2.766251728907331e-06, + "logits/chosen": -0.7580356001853943, + "logits/rejected": 2.633445978164673, + "logps/chosen": -559.7318725585938, + "logps/rejected": -1010.6101684570312, + "loss": 0.017, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.76829719543457, + "rewards/margins": 29.79266357421875, + "rewards/rejected": -40.56096267700195, + "step": 2421 + }, + { + "epoch": 1.506687402799378, + "grad_norm": 0.5855236649513245, + "learning_rate": 2.7650991240202862e-06, + "logits/chosen": 1.4826244115829468, + "logits/rejected": 5.066828727722168, + "logps/chosen": -537.537353515625, + "logps/rejected": -894.7590942382812, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.093587875366211, + "rewards/margins": 23.76456642150879, + "rewards/rejected": -32.858154296875, + "step": 2422 + }, + { + "epoch": 1.5073094867807155, + "grad_norm": 7.518645617210495e-08, + "learning_rate": 2.7639465191332415e-06, + "logits/chosen": 0.8941356539726257, + "logits/rejected": 3.034396171569824, + "logps/chosen": -535.9212646484375, + "logps/rejected": -1033.3935546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.346717834472656, + "rewards/margins": 33.61785125732422, + "rewards/rejected": -41.964569091796875, + "step": 2423 + }, + { + "epoch": 1.5079315707620529, + "grad_norm": 2.77945876121521, + "learning_rate": 2.7627939142461967e-06, + "logits/chosen": 0.5460835695266724, + "logits/rejected": 4.352009296417236, + "logps/chosen": -500.4707946777344, + "logps/rejected": -1004.465576171875, + "loss": 0.009, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.690991401672363, + "rewards/margins": 25.8135986328125, + "rewards/rejected": -35.50458908081055, + "step": 2424 + }, + { + "epoch": 1.5085536547433902, + "grad_norm": 9.093402475457424e-09, + "learning_rate": 2.761641309359152e-06, + "logits/chosen": -3.632844924926758, + "logits/rejected": 3.0162880420684814, + "logps/chosen": -412.83270263671875, + "logps/rejected": -1213.8994140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.727034091949463, + "rewards/margins": 44.87699890136719, + "rewards/rejected": -51.604034423828125, + "step": 2425 + }, + { + "epoch": 1.509175738724728, + "grad_norm": 0.19694973528385162, + "learning_rate": 2.7604887044721076e-06, + "logits/chosen": 0.878905177116394, + "logits/rejected": 3.315678596496582, + "logps/chosen": -607.8718872070312, + "logps/rejected": -988.4095458984375, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.330745697021484, + "rewards/margins": 26.40803337097168, + "rewards/rejected": -36.7387809753418, + "step": 2426 + }, + { + "epoch": 1.5097978227060653, + "grad_norm": 1.872478060249705e-05, + "learning_rate": 2.7593360995850628e-06, + "logits/chosen": -0.5195698738098145, + "logits/rejected": 1.6151765584945679, + "logps/chosen": -601.8635864257812, + "logps/rejected": -897.02099609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.940296173095703, + "rewards/margins": 25.409744262695312, + "rewards/rejected": -35.350040435791016, + "step": 2427 + }, + { + "epoch": 1.5104199066874027, + "grad_norm": 35.12553024291992, + "learning_rate": 2.758183494698018e-06, + "logits/chosen": -0.8151949644088745, + "logits/rejected": 2.569201707839966, + "logps/chosen": -508.86138916015625, + "logps/rejected": -909.9303588867188, + "loss": 0.4958, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.214035034179688, + "rewards/margins": 23.001964569091797, + "rewards/rejected": -33.215999603271484, + "step": 2428 + }, + { + "epoch": 1.5110419906687402, + "grad_norm": 3.7998273372650146, + "learning_rate": 2.7570308898109732e-06, + "logits/chosen": 1.9752259254455566, + "logits/rejected": 4.596410274505615, + "logps/chosen": -713.7828369140625, + "logps/rejected": -1202.2008056640625, + "loss": 0.0159, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.797355651855469, + "rewards/margins": 27.169727325439453, + "rewards/rejected": -38.96708297729492, + "step": 2429 + }, + { + "epoch": 1.5116640746500778, + "grad_norm": 27.40531349182129, + "learning_rate": 2.7558782849239285e-06, + "logits/chosen": 2.0464298725128174, + "logits/rejected": 4.918285369873047, + "logps/chosen": -549.6773681640625, + "logps/rejected": -924.9179077148438, + "loss": 0.1815, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.397310256958008, + "rewards/margins": 23.025856018066406, + "rewards/rejected": -34.42316436767578, + "step": 2430 + }, + { + "epoch": 1.5122861586314151, + "grad_norm": 8.02556037902832, + "learning_rate": 2.7547256800368837e-06, + "logits/chosen": 0.1492936760187149, + "logits/rejected": 1.9480422735214233, + "logps/chosen": -600.2176513671875, + "logps/rejected": -991.26806640625, + "loss": 0.0326, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.710877418518066, + "rewards/margins": 25.57059097290039, + "rewards/rejected": -35.28146743774414, + "step": 2431 + }, + { + "epoch": 1.5129082426127527, + "grad_norm": 0.0011324110673740506, + "learning_rate": 2.753573075149839e-06, + "logits/chosen": -2.1036806106567383, + "logits/rejected": 2.825481414794922, + "logps/chosen": -573.4466552734375, + "logps/rejected": -1138.679931640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.326138496398926, + "rewards/margins": 28.386465072631836, + "rewards/rejected": -42.71260452270508, + "step": 2432 + }, + { + "epoch": 1.5135303265940903, + "grad_norm": 0.0010255653178319335, + "learning_rate": 2.7524204702627945e-06, + "logits/chosen": 2.479342460632324, + "logits/rejected": 4.253589630126953, + "logps/chosen": -537.177001953125, + "logps/rejected": -864.4534301757812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.309935569763184, + "rewards/margins": 23.93538475036621, + "rewards/rejected": -32.24531936645508, + "step": 2433 + }, + { + "epoch": 1.5141524105754276, + "grad_norm": 1.3664502773735876e-07, + "learning_rate": 2.7512678653757498e-06, + "logits/chosen": -2.0502357482910156, + "logits/rejected": 3.057553768157959, + "logps/chosen": -561.5479125976562, + "logps/rejected": -1400.86767578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.96968936920166, + "rewards/margins": 49.919158935546875, + "rewards/rejected": -59.888851165771484, + "step": 2434 + }, + { + "epoch": 1.5147744945567652, + "grad_norm": 1.4722614878337481e-06, + "learning_rate": 2.750115260488705e-06, + "logits/chosen": 0.7835877537727356, + "logits/rejected": 3.7953410148620605, + "logps/chosen": -514.9024047851562, + "logps/rejected": -1092.03271484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.49221420288086, + "rewards/margins": 32.406341552734375, + "rewards/rejected": -40.898555755615234, + "step": 2435 + }, + { + "epoch": 1.5153965785381027, + "grad_norm": 0.000481670256704092, + "learning_rate": 2.7489626556016602e-06, + "logits/chosen": 1.926286220550537, + "logits/rejected": 4.122426986694336, + "logps/chosen": -615.120361328125, + "logps/rejected": -1095.592529296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.4006242752075195, + "rewards/margins": 35.62769317626953, + "rewards/rejected": -42.0283203125, + "step": 2436 + }, + { + "epoch": 1.51601866251944, + "grad_norm": 0.13158220052719116, + "learning_rate": 2.7478100507146154e-06, + "logits/chosen": 2.438539505004883, + "logits/rejected": 3.7112181186676025, + "logps/chosen": -540.5350952148438, + "logps/rejected": -866.4390869140625, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.910808563232422, + "rewards/margins": 23.664012908935547, + "rewards/rejected": -30.574819564819336, + "step": 2437 + }, + { + "epoch": 1.5166407465007776, + "grad_norm": 1.392264485359192, + "learning_rate": 2.7466574458275707e-06, + "logits/chosen": 1.2793818712234497, + "logits/rejected": 3.091960906982422, + "logps/chosen": -674.7471313476562, + "logps/rejected": -1019.5752563476562, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.084941864013672, + "rewards/margins": 23.3023624420166, + "rewards/rejected": -39.387306213378906, + "step": 2438 + }, + { + "epoch": 1.5172628304821152, + "grad_norm": 22.623865127563477, + "learning_rate": 2.745504840940526e-06, + "logits/chosen": 0.5542949438095093, + "logits/rejected": 1.8071708679199219, + "logps/chosen": -590.0474243164062, + "logps/rejected": -921.1709594726562, + "loss": 0.2623, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.720653533935547, + "rewards/margins": 23.469879150390625, + "rewards/rejected": -37.19053649902344, + "step": 2439 + }, + { + "epoch": 1.5178849144634525, + "grad_norm": 6.497164577012882e-05, + "learning_rate": 2.744352236053481e-06, + "logits/chosen": 0.5079193115234375, + "logits/rejected": 2.893798351287842, + "logps/chosen": -563.1099853515625, + "logps/rejected": -988.0438232421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.849778175354004, + "rewards/margins": 30.442718505859375, + "rewards/rejected": -42.29249572753906, + "step": 2440 + }, + { + "epoch": 1.51850699844479, + "grad_norm": 3.525198221206665, + "learning_rate": 2.7431996311664368e-06, + "logits/chosen": 1.9516997337341309, + "logits/rejected": 2.452531337738037, + "logps/chosen": -795.6206665039062, + "logps/rejected": -1220.8349609375, + "loss": 0.0125, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.373698234558105, + "rewards/margins": 30.371505737304688, + "rewards/rejected": -45.745201110839844, + "step": 2441 + }, + { + "epoch": 1.5191290824261277, + "grad_norm": 0.2017599195241928, + "learning_rate": 2.742047026279392e-06, + "logits/chosen": -0.42940008640289307, + "logits/rejected": 2.5336532592773438, + "logps/chosen": -372.24713134765625, + "logps/rejected": -885.3436279296875, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.162657737731934, + "rewards/margins": 28.138328552246094, + "rewards/rejected": -35.300987243652344, + "step": 2442 + }, + { + "epoch": 1.519751166407465, + "grad_norm": 7.490428970413632e-07, + "learning_rate": 2.7408944213923472e-06, + "logits/chosen": 0.3331374526023865, + "logits/rejected": 3.346038818359375, + "logps/chosen": -534.9227294921875, + "logps/rejected": -993.5345458984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.115105628967285, + "rewards/margins": 27.73911476135254, + "rewards/rejected": -38.854217529296875, + "step": 2443 + }, + { + "epoch": 1.5203732503888023, + "grad_norm": 0.21721665561199188, + "learning_rate": 2.7397418165053024e-06, + "logits/chosen": 0.08835101127624512, + "logits/rejected": 1.9229445457458496, + "logps/chosen": -594.24609375, + "logps/rejected": -945.4306640625, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.304940223693848, + "rewards/margins": 23.95694351196289, + "rewards/rejected": -33.261878967285156, + "step": 2444 + }, + { + "epoch": 1.5209953343701401, + "grad_norm": 0.0029869587160646915, + "learning_rate": 2.7385892116182577e-06, + "logits/chosen": -0.9350767135620117, + "logits/rejected": 2.4976651668548584, + "logps/chosen": -440.30133056640625, + "logps/rejected": -1002.122314453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.273009300231934, + "rewards/margins": 29.780683517456055, + "rewards/rejected": -39.05369186401367, + "step": 2445 + }, + { + "epoch": 1.5216174183514775, + "grad_norm": 0.018518727272748947, + "learning_rate": 2.737436606731213e-06, + "logits/chosen": -1.7265733480453491, + "logits/rejected": 3.20708966255188, + "logps/chosen": -387.7981872558594, + "logps/rejected": -950.6136474609375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.578176498413086, + "rewards/margins": 31.040130615234375, + "rewards/rejected": -39.618309020996094, + "step": 2446 + }, + { + "epoch": 1.5222395023328148, + "grad_norm": 1.1724967956542969, + "learning_rate": 2.736284001844168e-06, + "logits/chosen": 0.09461307525634766, + "logits/rejected": 3.0579278469085693, + "logps/chosen": -470.50006103515625, + "logps/rejected": -860.4100341796875, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.176168441772461, + "rewards/margins": 23.08479118347168, + "rewards/rejected": -31.26095962524414, + "step": 2447 + }, + { + "epoch": 1.5228615863141524, + "grad_norm": 0.5167633891105652, + "learning_rate": 2.7351313969571238e-06, + "logits/chosen": 1.8386075496673584, + "logits/rejected": 2.3939671516418457, + "logps/chosen": -635.957763671875, + "logps/rejected": -812.0810546875, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.093191146850586, + "rewards/margins": 19.466598510742188, + "rewards/rejected": -32.55978775024414, + "step": 2448 + }, + { + "epoch": 1.52348367029549, + "grad_norm": 0.06474962085485458, + "learning_rate": 2.733978792070078e-06, + "logits/chosen": 0.4039455056190491, + "logits/rejected": 3.9968762397766113, + "logps/chosen": -536.2996826171875, + "logps/rejected": -1120.7918701171875, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.25309944152832, + "rewards/margins": 29.0009822845459, + "rewards/rejected": -36.25408172607422, + "step": 2449 + }, + { + "epoch": 1.5241057542768273, + "grad_norm": 1.1716054359567352e-05, + "learning_rate": 2.732826187183034e-06, + "logits/chosen": 2.6479225158691406, + "logits/rejected": 3.409113645553589, + "logps/chosen": -719.8601684570312, + "logps/rejected": -981.4459228515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.668313980102539, + "rewards/margins": 25.581754684448242, + "rewards/rejected": -36.250064849853516, + "step": 2450 + }, + { + "epoch": 1.5247278382581648, + "grad_norm": 0.0022444785572588444, + "learning_rate": 2.731673582295989e-06, + "logits/chosen": 0.8789412975311279, + "logits/rejected": 1.1855617761611938, + "logps/chosen": -638.7889404296875, + "logps/rejected": -830.7925415039062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.223556518554688, + "rewards/margins": 21.04960060119629, + "rewards/rejected": -33.273155212402344, + "step": 2451 + }, + { + "epoch": 1.5253499222395024, + "grad_norm": 1.4134977845969843e-06, + "learning_rate": 2.7305209774089442e-06, + "logits/chosen": -0.17751866579055786, + "logits/rejected": 3.011488437652588, + "logps/chosen": -519.5287475585938, + "logps/rejected": -1034.9293212890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.822946548461914, + "rewards/margins": 32.01959228515625, + "rewards/rejected": -45.8425407409668, + "step": 2452 + }, + { + "epoch": 1.5259720062208397, + "grad_norm": 1.7072126823336475e-08, + "learning_rate": 2.7293683725218995e-06, + "logits/chosen": -0.6224250793457031, + "logits/rejected": 3.233564853668213, + "logps/chosen": -584.4635620117188, + "logps/rejected": -1117.71728515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.048900604248047, + "rewards/margins": 32.317787170410156, + "rewards/rejected": -41.36669158935547, + "step": 2453 + }, + { + "epoch": 1.5265940902021773, + "grad_norm": 1.3031844900979195e-05, + "learning_rate": 2.7282157676348547e-06, + "logits/chosen": 3.390267848968506, + "logits/rejected": 4.746077537536621, + "logps/chosen": -832.5244140625, + "logps/rejected": -1280.4586181640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.035869598388672, + "rewards/margins": 36.55662155151367, + "rewards/rejected": -52.592491149902344, + "step": 2454 + }, + { + "epoch": 1.5272161741835149, + "grad_norm": 0.4124351441860199, + "learning_rate": 2.72706316274781e-06, + "logits/chosen": 1.1203548908233643, + "logits/rejected": 1.8139748573303223, + "logps/chosen": -593.9185791015625, + "logps/rejected": -884.8736572265625, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.489469528198242, + "rewards/margins": 25.796775817871094, + "rewards/rejected": -33.2862434387207, + "step": 2455 + }, + { + "epoch": 1.5278382581648522, + "grad_norm": 2.913459062576294, + "learning_rate": 2.725910557860765e-06, + "logits/chosen": 1.5062038898468018, + "logits/rejected": 3.94240665435791, + "logps/chosen": -485.2527770996094, + "logps/rejected": -734.7945556640625, + "loss": 0.0075, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.377772331237793, + "rewards/margins": 17.07063865661621, + "rewards/rejected": -26.448410034179688, + "step": 2456 + }, + { + "epoch": 1.5284603421461898, + "grad_norm": 1.1580379009246826, + "learning_rate": 2.724757952973721e-06, + "logits/chosen": -1.4506340026855469, + "logits/rejected": 1.637460708618164, + "logps/chosen": -512.544921875, + "logps/rejected": -956.2432861328125, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.50003719329834, + "rewards/margins": 24.926979064941406, + "rewards/rejected": -33.42701721191406, + "step": 2457 + }, + { + "epoch": 1.5290824261275273, + "grad_norm": 0.837179958820343, + "learning_rate": 2.723605348086676e-06, + "logits/chosen": -2.831498622894287, + "logits/rejected": 1.7902092933654785, + "logps/chosen": -421.67547607421875, + "logps/rejected": -812.24609375, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.50699234008789, + "rewards/margins": 21.368635177612305, + "rewards/rejected": -30.875625610351562, + "step": 2458 + }, + { + "epoch": 1.5297045101088647, + "grad_norm": 0.04172046482563019, + "learning_rate": 2.7224527431996312e-06, + "logits/chosen": -3.2224364280700684, + "logits/rejected": 3.2389562129974365, + "logps/chosen": -271.430419921875, + "logps/rejected": -872.8142700195312, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.220527648925781, + "rewards/margins": 31.60157012939453, + "rewards/rejected": -37.82209777832031, + "step": 2459 + }, + { + "epoch": 1.5303265940902022, + "grad_norm": 0.00036741772782988846, + "learning_rate": 2.7213001383125865e-06, + "logits/chosen": -0.7951878905296326, + "logits/rejected": 2.4324963092803955, + "logps/chosen": -371.46630859375, + "logps/rejected": -739.7484741210938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.226638317108154, + "rewards/margins": 19.29977798461914, + "rewards/rejected": -25.526412963867188, + "step": 2460 + }, + { + "epoch": 1.5309486780715398, + "grad_norm": 33.964141845703125, + "learning_rate": 2.7201475334255417e-06, + "logits/chosen": 1.0293166637420654, + "logits/rejected": 3.4800872802734375, + "logps/chosen": -537.6439819335938, + "logps/rejected": -1003.99462890625, + "loss": 0.4305, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.282417297363281, + "rewards/margins": 34.82405090332031, + "rewards/rejected": -43.10646438598633, + "step": 2461 + }, + { + "epoch": 1.5315707620528771, + "grad_norm": 0.007006136234849691, + "learning_rate": 2.718994928538497e-06, + "logits/chosen": 0.6648842692375183, + "logits/rejected": -0.3263876438140869, + "logps/chosen": -736.0245361328125, + "logps/rejected": -854.2238159179688, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.421032905578613, + "rewards/margins": 23.76239585876465, + "rewards/rejected": -31.183427810668945, + "step": 2462 + }, + { + "epoch": 1.5321928460342145, + "grad_norm": 0.023654131218791008, + "learning_rate": 2.717842323651452e-06, + "logits/chosen": -0.5091513991355896, + "logits/rejected": 2.5750484466552734, + "logps/chosen": -364.62310791015625, + "logps/rejected": -830.571044921875, + "loss": 0.0866, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.813291549682617, + "rewards/margins": 25.79483413696289, + "rewards/rejected": -33.608123779296875, + "step": 2463 + }, + { + "epoch": 1.5328149300155522, + "grad_norm": 2.9299342713784426e-05, + "learning_rate": 2.7166897187644074e-06, + "logits/chosen": -0.37343019247055054, + "logits/rejected": 1.7793900966644287, + "logps/chosen": -414.75274658203125, + "logps/rejected": -872.8060913085938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.2207536697387695, + "rewards/margins": 34.36773681640625, + "rewards/rejected": -40.58848571777344, + "step": 2464 + }, + { + "epoch": 1.5334370139968896, + "grad_norm": 0.12796840071678162, + "learning_rate": 2.715537113877363e-06, + "logits/chosen": -0.22504746913909912, + "logits/rejected": 2.651764392852783, + "logps/chosen": -532.3455810546875, + "logps/rejected": -983.6930541992188, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.367204666137695, + "rewards/margins": 27.68272590637207, + "rewards/rejected": -37.049930572509766, + "step": 2465 + }, + { + "epoch": 1.534059097978227, + "grad_norm": 22.160669326782227, + "learning_rate": 2.7143845089903182e-06, + "logits/chosen": -0.8312405943870544, + "logits/rejected": 1.0451204776763916, + "logps/chosen": -529.1278076171875, + "logps/rejected": -887.6492919921875, + "loss": 0.2574, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.91517162322998, + "rewards/margins": 27.073076248168945, + "rewards/rejected": -39.988250732421875, + "step": 2466 + }, + { + "epoch": 1.5346811819595645, + "grad_norm": 0.0007610557368025184, + "learning_rate": 2.7132319041032735e-06, + "logits/chosen": -1.3976175785064697, + "logits/rejected": 3.0815062522888184, + "logps/chosen": -397.5416259765625, + "logps/rejected": -866.3143310546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.541437149047852, + "rewards/margins": 28.345874786376953, + "rewards/rejected": -37.88731384277344, + "step": 2467 + }, + { + "epoch": 1.535303265940902, + "grad_norm": 0.014254836365580559, + "learning_rate": 2.7120792992162287e-06, + "logits/chosen": -0.7080047130584717, + "logits/rejected": 2.4305598735809326, + "logps/chosen": -506.60711669921875, + "logps/rejected": -929.2200927734375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.860275268554688, + "rewards/margins": 21.704132080078125, + "rewards/rejected": -34.56440734863281, + "step": 2468 + }, + { + "epoch": 1.5359253499222394, + "grad_norm": 0.08221913874149323, + "learning_rate": 2.710926694329184e-06, + "logits/chosen": -0.03373962268233299, + "logits/rejected": 2.4926323890686035, + "logps/chosen": -512.6092529296875, + "logps/rejected": -846.0359497070312, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.146074295043945, + "rewards/margins": 20.820676803588867, + "rewards/rejected": -27.96674919128418, + "step": 2469 + }, + { + "epoch": 1.536547433903577, + "grad_norm": 0.9291184544563293, + "learning_rate": 2.709774089442139e-06, + "logits/chosen": 2.647587537765503, + "logits/rejected": 3.6963562965393066, + "logps/chosen": -655.6061401367188, + "logps/rejected": -922.4675903320312, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.961267471313477, + "rewards/margins": 19.87137222290039, + "rewards/rejected": -30.8326416015625, + "step": 2470 + }, + { + "epoch": 1.5371695178849145, + "grad_norm": 0.0023878749925643206, + "learning_rate": 2.7086214845550944e-06, + "logits/chosen": -0.08294537663459778, + "logits/rejected": 3.287048578262329, + "logps/chosen": -385.4952392578125, + "logps/rejected": -815.072021484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.807106018066406, + "rewards/margins": 21.306137084960938, + "rewards/rejected": -30.113243103027344, + "step": 2471 + }, + { + "epoch": 1.5377916018662519, + "grad_norm": 0.7342345714569092, + "learning_rate": 2.70746887966805e-06, + "logits/chosen": 0.5753480195999146, + "logits/rejected": 3.7676868438720703, + "logps/chosen": -441.8882751464844, + "logps/rejected": -854.6257934570312, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.644134521484375, + "rewards/margins": 24.062170028686523, + "rewards/rejected": -32.70630645751953, + "step": 2472 + }, + { + "epoch": 1.5384136858475894, + "grad_norm": 27.15131187438965, + "learning_rate": 2.7063162747810052e-06, + "logits/chosen": 1.2550212144851685, + "logits/rejected": 2.9827208518981934, + "logps/chosen": -442.3372802734375, + "logps/rejected": -729.8902587890625, + "loss": 0.1199, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.713069915771484, + "rewards/margins": 15.846502304077148, + "rewards/rejected": -26.5595703125, + "step": 2473 + }, + { + "epoch": 1.539035769828927, + "grad_norm": 0.022780759260058403, + "learning_rate": 2.7051636698939605e-06, + "logits/chosen": 0.1612262725830078, + "logits/rejected": 4.642159938812256, + "logps/chosen": -553.856689453125, + "logps/rejected": -1031.6358642578125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.07167911529541, + "rewards/margins": 21.338481903076172, + "rewards/rejected": -30.4101619720459, + "step": 2474 + }, + { + "epoch": 1.5396578538102643, + "grad_norm": 3.3649816266745347e-10, + "learning_rate": 2.7040110650069157e-06, + "logits/chosen": -3.152029275894165, + "logits/rejected": 1.914430856704712, + "logps/chosen": -467.9855651855469, + "logps/rejected": -1212.340576171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.553437232971191, + "rewards/margins": 41.90711975097656, + "rewards/rejected": -53.46055603027344, + "step": 2475 + }, + { + "epoch": 1.5402799377916019, + "grad_norm": 3.6554156395141035e-06, + "learning_rate": 2.702858460119871e-06, + "logits/chosen": 2.662741184234619, + "logits/rejected": 4.871638298034668, + "logps/chosen": -587.4205322265625, + "logps/rejected": -1073.63818359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.387510299682617, + "rewards/margins": 34.884681701660156, + "rewards/rejected": -44.272193908691406, + "step": 2476 + }, + { + "epoch": 1.5409020217729394, + "grad_norm": 2.009286880493164, + "learning_rate": 2.701705855232826e-06, + "logits/chosen": 2.2703564167022705, + "logits/rejected": 2.3993289470672607, + "logps/chosen": -774.0054931640625, + "logps/rejected": -1000.2762451171875, + "loss": 0.0075, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.877897262573242, + "rewards/margins": 19.677772521972656, + "rewards/rejected": -37.555667877197266, + "step": 2477 + }, + { + "epoch": 1.5415241057542768, + "grad_norm": 0.000742044416256249, + "learning_rate": 2.7005532503457814e-06, + "logits/chosen": 0.48302167654037476, + "logits/rejected": 4.2788166999816895, + "logps/chosen": -413.0840148925781, + "logps/rejected": -951.339111328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.892075538635254, + "rewards/margins": 32.75634765625, + "rewards/rejected": -41.64841842651367, + "step": 2478 + }, + { + "epoch": 1.5421461897356143, + "grad_norm": 1.013292738605287e-08, + "learning_rate": 2.699400645458737e-06, + "logits/chosen": 0.2683802843093872, + "logits/rejected": 2.4963221549987793, + "logps/chosen": -575.1116943359375, + "logps/rejected": -1050.36376953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.652692794799805, + "rewards/margins": 36.25019454956055, + "rewards/rejected": -45.90288543701172, + "step": 2479 + }, + { + "epoch": 1.542768273716952, + "grad_norm": 6.289964949246496e-05, + "learning_rate": 2.6982480405716922e-06, + "logits/chosen": -0.9888131618499756, + "logits/rejected": 3.092804193496704, + "logps/chosen": -430.79046630859375, + "logps/rejected": -926.0091552734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.098524570465088, + "rewards/margins": 26.761119842529297, + "rewards/rejected": -32.859642028808594, + "step": 2480 + }, + { + "epoch": 1.5433903576982893, + "grad_norm": 0.0013569953152909875, + "learning_rate": 2.6970954356846475e-06, + "logits/chosen": -0.7292362451553345, + "logits/rejected": 1.174302101135254, + "logps/chosen": -625.8265991210938, + "logps/rejected": -1065.358154296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.744695663452148, + "rewards/margins": 28.380962371826172, + "rewards/rejected": -39.12565994262695, + "step": 2481 + }, + { + "epoch": 1.5440124416796266, + "grad_norm": 0.002672493224963546, + "learning_rate": 2.6959428307976027e-06, + "logits/chosen": 1.855053186416626, + "logits/rejected": 3.0483992099761963, + "logps/chosen": -619.2630615234375, + "logps/rejected": -1022.6942138671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.304459571838379, + "rewards/margins": 29.886611938476562, + "rewards/rejected": -40.191070556640625, + "step": 2482 + }, + { + "epoch": 1.5446345256609644, + "grad_norm": 5.420322486315854e-05, + "learning_rate": 2.694790225910558e-06, + "logits/chosen": -1.8340150117874146, + "logits/rejected": 3.607595920562744, + "logps/chosen": -360.5854187011719, + "logps/rejected": -1015.0472412109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.2733869552612305, + "rewards/margins": 32.604515075683594, + "rewards/rejected": -36.87790298461914, + "step": 2483 + }, + { + "epoch": 1.5452566096423017, + "grad_norm": 0.0011562893632799387, + "learning_rate": 2.693637621023513e-06, + "logits/chosen": -2.7047109603881836, + "logits/rejected": 3.4061014652252197, + "logps/chosen": -437.25604248046875, + "logps/rejected": -1212.28857421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.983414649963379, + "rewards/margins": 38.062217712402344, + "rewards/rejected": -46.04563522338867, + "step": 2484 + }, + { + "epoch": 1.545878693623639, + "grad_norm": 1.156277176050935e-05, + "learning_rate": 2.6924850161364684e-06, + "logits/chosen": -1.4244234561920166, + "logits/rejected": 4.303253173828125, + "logps/chosen": -339.56890869140625, + "logps/rejected": -957.9353637695312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.711836814880371, + "rewards/margins": 32.08358383178711, + "rewards/rejected": -36.7954216003418, + "step": 2485 + }, + { + "epoch": 1.5465007776049766, + "grad_norm": 1.7454749468015507e-07, + "learning_rate": 2.691332411249424e-06, + "logits/chosen": 1.288722276687622, + "logits/rejected": 2.1820712089538574, + "logps/chosen": -599.7692260742188, + "logps/rejected": -972.705810546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.94448471069336, + "rewards/margins": 29.24241065979004, + "rewards/rejected": -40.18689727783203, + "step": 2486 + }, + { + "epoch": 1.5471228615863142, + "grad_norm": 23.994794845581055, + "learning_rate": 2.6901798063623792e-06, + "logits/chosen": 0.03151065111160278, + "logits/rejected": 2.8123786449432373, + "logps/chosen": -593.8890991210938, + "logps/rejected": -979.3603515625, + "loss": 0.1584, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.582526206970215, + "rewards/margins": 21.869956970214844, + "rewards/rejected": -32.452484130859375, + "step": 2487 + }, + { + "epoch": 1.5477449455676515, + "grad_norm": 9.391483146714563e-09, + "learning_rate": 2.6890272014753345e-06, + "logits/chosen": 0.1529102921485901, + "logits/rejected": 3.4798989295959473, + "logps/chosen": -538.7216186523438, + "logps/rejected": -1052.495361328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.785987854003906, + "rewards/margins": 36.01325988769531, + "rewards/rejected": -44.79924774169922, + "step": 2488 + }, + { + "epoch": 1.548367029548989, + "grad_norm": 12.872499465942383, + "learning_rate": 2.6878745965882897e-06, + "logits/chosen": -3.528820037841797, + "logits/rejected": 2.9935293197631836, + "logps/chosen": -178.8984832763672, + "logps/rejected": -872.1593017578125, + "loss": 0.1691, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.5141162872314453, + "rewards/margins": 30.633007049560547, + "rewards/rejected": -34.14712142944336, + "step": 2489 + }, + { + "epoch": 1.5489891135303266, + "grad_norm": 28.490171432495117, + "learning_rate": 2.686721991701245e-06, + "logits/chosen": -0.568493127822876, + "logits/rejected": 1.1433310508728027, + "logps/chosen": -645.1004638671875, + "logps/rejected": -984.1376342773438, + "loss": 0.1649, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.578904151916504, + "rewards/margins": 25.876937866210938, + "rewards/rejected": -32.45584487915039, + "step": 2490 + }, + { + "epoch": 1.549611197511664, + "grad_norm": 9.463408470153809, + "learning_rate": 2.6855693868142e-06, + "logits/chosen": -1.7124032974243164, + "logits/rejected": 1.8130053281784058, + "logps/chosen": -408.69232177734375, + "logps/rejected": -863.88818359375, + "loss": 0.037, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.570595741271973, + "rewards/margins": 27.266246795654297, + "rewards/rejected": -38.83684539794922, + "step": 2491 + }, + { + "epoch": 1.5502332814930015, + "grad_norm": 5.1917506738163866e-08, + "learning_rate": 2.6844167819271554e-06, + "logits/chosen": 0.7790787220001221, + "logits/rejected": 3.551978588104248, + "logps/chosen": -534.1652221679688, + "logps/rejected": -1087.25048828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.447636604309082, + "rewards/margins": 35.59351348876953, + "rewards/rejected": -41.0411491394043, + "step": 2492 + }, + { + "epoch": 1.550855365474339, + "grad_norm": 0.009952358901500702, + "learning_rate": 2.6832641770401106e-06, + "logits/chosen": -2.5790116786956787, + "logits/rejected": 0.9824144840240479, + "logps/chosen": -362.249267578125, + "logps/rejected": -817.4982299804688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.19536018371582, + "rewards/margins": 20.45709800720215, + "rewards/rejected": -25.65245819091797, + "step": 2493 + }, + { + "epoch": 1.5514774494556764, + "grad_norm": 0.006530395243316889, + "learning_rate": 2.6821115721530662e-06, + "logits/chosen": -1.5302128791809082, + "logits/rejected": 1.6881933212280273, + "logps/chosen": -605.1315307617188, + "logps/rejected": -1031.45849609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.122232437133789, + "rewards/margins": 30.1520938873291, + "rewards/rejected": -39.27432632446289, + "step": 2494 + }, + { + "epoch": 1.552099533437014, + "grad_norm": 0.45922571420669556, + "learning_rate": 2.6809589672660214e-06, + "logits/chosen": -0.26042747497558594, + "logits/rejected": 4.046167373657227, + "logps/chosen": -430.70587158203125, + "logps/rejected": -943.998779296875, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.969429016113281, + "rewards/margins": 24.93759536743164, + "rewards/rejected": -32.90702438354492, + "step": 2495 + }, + { + "epoch": 1.5527216174183516, + "grad_norm": 9.564102219883353e-05, + "learning_rate": 2.6798063623789767e-06, + "logits/chosen": 1.4396380186080933, + "logits/rejected": 2.8060150146484375, + "logps/chosen": -577.88134765625, + "logps/rejected": -895.8618774414062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.6483793258667, + "rewards/margins": 23.74332046508789, + "rewards/rejected": -33.391700744628906, + "step": 2496 + }, + { + "epoch": 1.553343701399689, + "grad_norm": 7.810917468376033e-10, + "learning_rate": 2.678653757491932e-06, + "logits/chosen": 2.06510066986084, + "logits/rejected": 4.934891700744629, + "logps/chosen": -564.6988525390625, + "logps/rejected": -1128.0897216796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.261550903320312, + "rewards/margins": 31.091615676879883, + "rewards/rejected": -41.35316848754883, + "step": 2497 + }, + { + "epoch": 1.5539657853810265, + "grad_norm": 0.0008484581485390663, + "learning_rate": 2.677501152604887e-06, + "logits/chosen": -2.6576812267303467, + "logits/rejected": 2.4716832637786865, + "logps/chosen": -291.2125244140625, + "logps/rejected": -789.5697631835938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.161551475524902, + "rewards/margins": 22.731517791748047, + "rewards/rejected": -26.893070220947266, + "step": 2498 + }, + { + "epoch": 1.554587869362364, + "grad_norm": 0.0002879021340049803, + "learning_rate": 2.6763485477178423e-06, + "logits/chosen": 0.3646266460418701, + "logits/rejected": 3.86550235748291, + "logps/chosen": -518.783935546875, + "logps/rejected": -969.0770263671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.659425258636475, + "rewards/margins": 29.202505111694336, + "rewards/rejected": -35.86193084716797, + "step": 2499 + }, + { + "epoch": 1.5552099533437014, + "grad_norm": 57.68034362792969, + "learning_rate": 2.6751959428307976e-06, + "logits/chosen": -0.09455686807632446, + "logits/rejected": 2.160449743270874, + "logps/chosen": -619.3577270507812, + "logps/rejected": -954.7877197265625, + "loss": 0.9767, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.064574241638184, + "rewards/margins": 24.755352020263672, + "rewards/rejected": -34.819923400878906, + "step": 2500 + }, + { + "epoch": 1.5558320373250387, + "grad_norm": 29.71711540222168, + "learning_rate": 2.6740433379437532e-06, + "logits/chosen": 0.9189615249633789, + "logits/rejected": 3.5247128009796143, + "logps/chosen": -403.5267639160156, + "logps/rejected": -714.1881713867188, + "loss": 0.6582, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.70235824584961, + "rewards/margins": 14.166807174682617, + "rewards/rejected": -23.869165420532227, + "step": 2501 + }, + { + "epoch": 1.5564541213063765, + "grad_norm": 2.8825539288845903e-07, + "learning_rate": 2.6728907330567084e-06, + "logits/chosen": -1.2510281801223755, + "logits/rejected": 4.012646675109863, + "logps/chosen": -478.6166687011719, + "logps/rejected": -1164.77587890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.776491165161133, + "rewards/margins": 35.69042205810547, + "rewards/rejected": -43.46691131591797, + "step": 2502 + }, + { + "epoch": 1.5570762052877138, + "grad_norm": 0.05821898207068443, + "learning_rate": 2.6717381281696637e-06, + "logits/chosen": 2.096285343170166, + "logits/rejected": 2.3310656547546387, + "logps/chosen": -583.2084350585938, + "logps/rejected": -805.5694580078125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.98892593383789, + "rewards/margins": 24.44229507446289, + "rewards/rejected": -34.43122100830078, + "step": 2503 + }, + { + "epoch": 1.5576982892690512, + "grad_norm": 82.08670806884766, + "learning_rate": 2.670585523282619e-06, + "logits/chosen": -0.4332718551158905, + "logits/rejected": 4.304185390472412, + "logps/chosen": -328.9471130371094, + "logps/rejected": -675.8569946289062, + "loss": 0.544, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.469512939453125, + "rewards/margins": 16.837844848632812, + "rewards/rejected": -24.307355880737305, + "step": 2504 + }, + { + "epoch": 1.558320373250389, + "grad_norm": 18.179847717285156, + "learning_rate": 2.669432918395574e-06, + "logits/chosen": 1.644677758216858, + "logits/rejected": 4.1040730476379395, + "logps/chosen": -453.4781799316406, + "logps/rejected": -896.974853515625, + "loss": 0.1027, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.000197410583496, + "rewards/margins": 27.992036819458008, + "rewards/rejected": -39.99223327636719, + "step": 2505 + }, + { + "epoch": 1.5589424572317263, + "grad_norm": 0.9634746313095093, + "learning_rate": 2.6682803135085293e-06, + "logits/chosen": -0.5393826961517334, + "logits/rejected": 3.3745839595794678, + "logps/chosen": -550.09814453125, + "logps/rejected": -988.7060546875, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.148344039916992, + "rewards/margins": 24.866161346435547, + "rewards/rejected": -34.01450729370117, + "step": 2506 + }, + { + "epoch": 1.5595645412130636, + "grad_norm": 36.72014617919922, + "learning_rate": 2.6671277086214846e-06, + "logits/chosen": 2.456897258758545, + "logits/rejected": 1.9290426969528198, + "logps/chosen": -667.2633056640625, + "logps/rejected": -770.6574096679688, + "loss": 0.2178, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.517938613891602, + "rewards/margins": 16.047889709472656, + "rewards/rejected": -25.565826416015625, + "step": 2507 + }, + { + "epoch": 1.5601866251944012, + "grad_norm": 0.004348399117588997, + "learning_rate": 2.6659751037344402e-06, + "logits/chosen": -1.0786043405532837, + "logits/rejected": 4.487873077392578, + "logps/chosen": -415.68646240234375, + "logps/rejected": -1152.6275634765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.8546223640441895, + "rewards/margins": 38.18126678466797, + "rewards/rejected": -43.035888671875, + "step": 2508 + }, + { + "epoch": 1.5608087091757388, + "grad_norm": 37.94123458862305, + "learning_rate": 2.6648224988473954e-06, + "logits/chosen": 0.8928492069244385, + "logits/rejected": 2.8876805305480957, + "logps/chosen": -651.49755859375, + "logps/rejected": -904.5684814453125, + "loss": 0.767, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.068072319030762, + "rewards/margins": 11.500648498535156, + "rewards/rejected": -21.5687198638916, + "step": 2509 + }, + { + "epoch": 1.5614307931570761, + "grad_norm": 0.11989522725343704, + "learning_rate": 2.6636698939603507e-06, + "logits/chosen": -1.96262526512146, + "logits/rejected": 2.2925920486450195, + "logps/chosen": -503.9068908691406, + "logps/rejected": -1025.4134521484375, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.269535064697266, + "rewards/margins": 29.875804901123047, + "rewards/rejected": -41.14533996582031, + "step": 2510 + }, + { + "epoch": 1.5620528771384137, + "grad_norm": 0.36885467171669006, + "learning_rate": 2.662517289073306e-06, + "logits/chosen": -0.9184327125549316, + "logits/rejected": 2.790034294128418, + "logps/chosen": -406.4637145996094, + "logps/rejected": -781.6912231445312, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.292177677154541, + "rewards/margins": 23.567413330078125, + "rewards/rejected": -27.859588623046875, + "step": 2511 + }, + { + "epoch": 1.5626749611197512, + "grad_norm": 0.2901647686958313, + "learning_rate": 2.661364684186261e-06, + "logits/chosen": -0.9580250978469849, + "logits/rejected": 2.781895160675049, + "logps/chosen": -444.0966796875, + "logps/rejected": -925.6182250976562, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.922597885131836, + "rewards/margins": 26.735271453857422, + "rewards/rejected": -35.657867431640625, + "step": 2512 + }, + { + "epoch": 1.5632970451010886, + "grad_norm": 0.2968682050704956, + "learning_rate": 2.6602120792992163e-06, + "logits/chosen": -0.31334781646728516, + "logits/rejected": 3.522827625274658, + "logps/chosen": -569.567138671875, + "logps/rejected": -1027.34765625, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.26767349243164, + "rewards/margins": 28.82740592956543, + "rewards/rejected": -39.09507751464844, + "step": 2513 + }, + { + "epoch": 1.5639191290824261, + "grad_norm": 1.1502807140350342, + "learning_rate": 2.6590594744121716e-06, + "logits/chosen": 0.24424254894256592, + "logits/rejected": 2.969736099243164, + "logps/chosen": -649.736328125, + "logps/rejected": -989.1493530273438, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.951969623565674, + "rewards/margins": 20.423797607421875, + "rewards/rejected": -26.37576675415039, + "step": 2514 + }, + { + "epoch": 1.5645412130637637, + "grad_norm": 20.15816879272461, + "learning_rate": 2.657906869525127e-06, + "logits/chosen": -1.0590736865997314, + "logits/rejected": 2.900858163833618, + "logps/chosen": -347.2210693359375, + "logps/rejected": -889.9716796875, + "loss": 0.1205, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.696262836456299, + "rewards/margins": 23.77630615234375, + "rewards/rejected": -31.47256851196289, + "step": 2515 + }, + { + "epoch": 1.565163297045101, + "grad_norm": 1.8690065145492554, + "learning_rate": 2.6567542646380824e-06, + "logits/chosen": -2.203958034515381, + "logits/rejected": 3.374868631362915, + "logps/chosen": -402.5797424316406, + "logps/rejected": -912.735107421875, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.828686237335205, + "rewards/margins": 24.617942810058594, + "rewards/rejected": -31.44662857055664, + "step": 2516 + }, + { + "epoch": 1.5657853810264386, + "grad_norm": 8.570445061195642e-06, + "learning_rate": 2.6556016597510377e-06, + "logits/chosen": 0.857057511806488, + "logits/rejected": 4.9921956062316895, + "logps/chosen": -502.5346374511719, + "logps/rejected": -1055.5001220703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.580801486968994, + "rewards/margins": 30.187623977661133, + "rewards/rejected": -35.76842498779297, + "step": 2517 + }, + { + "epoch": 1.5664074650077762, + "grad_norm": 1.9115601901376067e-07, + "learning_rate": 2.654449054863993e-06, + "logits/chosen": 1.3328273296356201, + "logits/rejected": 2.9461498260498047, + "logps/chosen": -550.345947265625, + "logps/rejected": -998.7139892578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.301694869995117, + "rewards/margins": 32.61174011230469, + "rewards/rejected": -41.91343688964844, + "step": 2518 + }, + { + "epoch": 1.5670295489891135, + "grad_norm": 0.005492346826940775, + "learning_rate": 2.653296449976948e-06, + "logits/chosen": -2.038207769393921, + "logits/rejected": 2.8221802711486816, + "logps/chosen": -496.6783447265625, + "logps/rejected": -1076.4049072265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.93287467956543, + "rewards/margins": 31.873998641967773, + "rewards/rejected": -41.80687713623047, + "step": 2519 + }, + { + "epoch": 1.5676516329704508, + "grad_norm": 31.12177848815918, + "learning_rate": 2.6521438450899033e-06, + "logits/chosen": 2.806741237640381, + "logits/rejected": 3.6385693550109863, + "logps/chosen": -586.6148681640625, + "logps/rejected": -871.7191162109375, + "loss": 0.9491, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.129293441772461, + "rewards/margins": 19.070999145507812, + "rewards/rejected": -30.200294494628906, + "step": 2520 + }, + { + "epoch": 1.5682737169517886, + "grad_norm": 39.04676055908203, + "learning_rate": 2.6509912402028586e-06, + "logits/chosen": 2.220421314239502, + "logits/rejected": 4.0747294425964355, + "logps/chosen": -629.5321655273438, + "logps/rejected": -880.70751953125, + "loss": 1.7587, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.306745529174805, + "rewards/margins": 16.847389221191406, + "rewards/rejected": -28.15413475036621, + "step": 2521 + }, + { + "epoch": 1.568895800933126, + "grad_norm": 0.3181990087032318, + "learning_rate": 2.6498386353158138e-06, + "logits/chosen": 3.5077221393585205, + "logits/rejected": 4.509550094604492, + "logps/chosen": -669.1798706054688, + "logps/rejected": -964.48974609375, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.942821502685547, + "rewards/margins": 23.189241409301758, + "rewards/rejected": -35.13206481933594, + "step": 2522 + }, + { + "epoch": 1.5695178849144633, + "grad_norm": 0.04694477468729019, + "learning_rate": 2.6486860304287694e-06, + "logits/chosen": -1.1715391874313354, + "logits/rejected": 2.6850781440734863, + "logps/chosen": -528.1036376953125, + "logps/rejected": -1065.68701171875, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.915897369384766, + "rewards/margins": 33.24100875854492, + "rewards/rejected": -40.15690612792969, + "step": 2523 + }, + { + "epoch": 1.570139968895801, + "grad_norm": 1.0720814458409222e-07, + "learning_rate": 2.6475334255417247e-06, + "logits/chosen": -1.3283451795578003, + "logits/rejected": 2.9149234294891357, + "logps/chosen": -557.9683837890625, + "logps/rejected": -1072.668701171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.794567108154297, + "rewards/margins": 29.346282958984375, + "rewards/rejected": -38.14085006713867, + "step": 2524 + }, + { + "epoch": 1.5707620528771384, + "grad_norm": 9.388706416757486e-07, + "learning_rate": 2.64638082065468e-06, + "logits/chosen": 0.9473726749420166, + "logits/rejected": 3.064026355743408, + "logps/chosen": -638.7322998046875, + "logps/rejected": -964.4024658203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.972917556762695, + "rewards/margins": 27.92718505859375, + "rewards/rejected": -33.90010070800781, + "step": 2525 + }, + { + "epoch": 1.5713841368584758, + "grad_norm": 0.046334926038980484, + "learning_rate": 2.645228215767635e-06, + "logits/chosen": 0.19112294912338257, + "logits/rejected": 2.3191397190093994, + "logps/chosen": -578.079833984375, + "logps/rejected": -947.2376098632812, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.907577037811279, + "rewards/margins": 25.607810974121094, + "rewards/rejected": -33.51538848876953, + "step": 2526 + }, + { + "epoch": 1.5720062208398133, + "grad_norm": 4.616542816162109, + "learning_rate": 2.6440756108805903e-06, + "logits/chosen": 2.2481207847595215, + "logits/rejected": 2.875807762145996, + "logps/chosen": -721.6137084960938, + "logps/rejected": -1072.4210205078125, + "loss": 0.0169, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.013412475585938, + "rewards/margins": 27.015226364135742, + "rewards/rejected": -38.02864074707031, + "step": 2527 + }, + { + "epoch": 1.572628304821151, + "grad_norm": 0.07997529953718185, + "learning_rate": 2.6429230059935456e-06, + "logits/chosen": 0.14837861061096191, + "logits/rejected": 3.7757630348205566, + "logps/chosen": -531.468994140625, + "logps/rejected": -1102.809326171875, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.239072799682617, + "rewards/margins": 31.316261291503906, + "rewards/rejected": -40.55533218383789, + "step": 2528 + }, + { + "epoch": 1.5732503888024882, + "grad_norm": 0.1496405154466629, + "learning_rate": 2.6417704011065008e-06, + "logits/chosen": -1.293025016784668, + "logits/rejected": 4.773388385772705, + "logps/chosen": -350.036376953125, + "logps/rejected": -869.2445068359375, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.083723545074463, + "rewards/margins": 17.739789962768555, + "rewards/rejected": -22.823514938354492, + "step": 2529 + }, + { + "epoch": 1.5738724727838258, + "grad_norm": 0.05532778799533844, + "learning_rate": 2.6406177962194564e-06, + "logits/chosen": 0.01579982042312622, + "logits/rejected": 2.789395332336426, + "logps/chosen": -499.70263671875, + "logps/rejected": -814.7089233398438, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.12386417388916, + "rewards/margins": 21.891815185546875, + "rewards/rejected": -27.015682220458984, + "step": 2530 + }, + { + "epoch": 1.5744945567651634, + "grad_norm": 8.699598402017727e-05, + "learning_rate": 2.6394651913324117e-06, + "logits/chosen": 2.4416699409484863, + "logits/rejected": 5.013975620269775, + "logps/chosen": -410.20770263671875, + "logps/rejected": -835.2908935546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.419612884521484, + "rewards/margins": 23.25516128540039, + "rewards/rejected": -29.67477798461914, + "step": 2531 + }, + { + "epoch": 1.5751166407465007, + "grad_norm": 0.0021628057584166527, + "learning_rate": 2.638312586445367e-06, + "logits/chosen": -0.6670438051223755, + "logits/rejected": 3.6675217151641846, + "logps/chosen": -384.84075927734375, + "logps/rejected": -905.62646484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.705150604248047, + "rewards/margins": 26.648361206054688, + "rewards/rejected": -35.353511810302734, + "step": 2532 + }, + { + "epoch": 1.5757387247278383, + "grad_norm": 0.00378401973284781, + "learning_rate": 2.637159981558322e-06, + "logits/chosen": 1.4846572875976562, + "logits/rejected": 2.9995572566986084, + "logps/chosen": -725.524658203125, + "logps/rejected": -958.9528198242188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.081862926483154, + "rewards/margins": 21.75735855102539, + "rewards/rejected": -28.839221954345703, + "step": 2533 + }, + { + "epoch": 1.5763608087091758, + "grad_norm": 5.04378604888916, + "learning_rate": 2.6360073766712773e-06, + "logits/chosen": 1.3019071817398071, + "logits/rejected": 4.482569217681885, + "logps/chosen": -533.6031494140625, + "logps/rejected": -985.0313110351562, + "loss": 0.0243, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.342979907989502, + "rewards/margins": 23.088260650634766, + "rewards/rejected": -30.431243896484375, + "step": 2534 + }, + { + "epoch": 1.5769828926905132, + "grad_norm": 8.794490895525087e-06, + "learning_rate": 2.6348547717842326e-06, + "logits/chosen": 1.8717117309570312, + "logits/rejected": 3.6646323204040527, + "logps/chosen": -630.3969116210938, + "logps/rejected": -1010.4031982421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.561976909637451, + "rewards/margins": 26.8465633392334, + "rewards/rejected": -33.408538818359375, + "step": 2535 + }, + { + "epoch": 1.5776049766718507, + "grad_norm": 2.2234980860957876e-05, + "learning_rate": 2.6337021668971878e-06, + "logits/chosen": -0.8583243489265442, + "logits/rejected": 3.619313955307007, + "logps/chosen": -408.3638610839844, + "logps/rejected": -999.8567504882812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.225172996520996, + "rewards/margins": 28.894039154052734, + "rewards/rejected": -37.11920928955078, + "step": 2536 + }, + { + "epoch": 1.5782270606531883, + "grad_norm": 0.012370456010103226, + "learning_rate": 2.6325495620101434e-06, + "logits/chosen": -1.1759496927261353, + "logits/rejected": 2.753152370452881, + "logps/chosen": -449.21856689453125, + "logps/rejected": -964.8666381835938, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.146978378295898, + "rewards/margins": 27.28769302368164, + "rewards/rejected": -34.43467330932617, + "step": 2537 + }, + { + "epoch": 1.5788491446345256, + "grad_norm": 0.048277225345373154, + "learning_rate": 2.6313969571230987e-06, + "logits/chosen": -1.496280550956726, + "logits/rejected": 2.0494561195373535, + "logps/chosen": -304.0895080566406, + "logps/rejected": -812.8955688476562, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.053216934204102, + "rewards/margins": 28.015701293945312, + "rewards/rejected": -34.06891632080078, + "step": 2538 + }, + { + "epoch": 1.579471228615863, + "grad_norm": 0.0003371371713001281, + "learning_rate": 2.630244352236054e-06, + "logits/chosen": -0.8229646682739258, + "logits/rejected": 0.5232251286506653, + "logps/chosen": -396.1567077636719, + "logps/rejected": -720.4967041015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.08257007598877, + "rewards/margins": 23.936092376708984, + "rewards/rejected": -32.01866149902344, + "step": 2539 + }, + { + "epoch": 1.5800933125972008, + "grad_norm": 0.001109063159674406, + "learning_rate": 2.629091747349009e-06, + "logits/chosen": 1.09111750125885, + "logits/rejected": 3.661069631576538, + "logps/chosen": -433.0865478515625, + "logps/rejected": -773.2035522460938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.496565818786621, + "rewards/margins": 25.523311614990234, + "rewards/rejected": -34.01987838745117, + "step": 2540 + }, + { + "epoch": 1.580715396578538, + "grad_norm": 2.954153751488775e-05, + "learning_rate": 2.6279391424619643e-06, + "logits/chosen": 1.3929088115692139, + "logits/rejected": 3.76798152923584, + "logps/chosen": -631.7744140625, + "logps/rejected": -977.82080078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.168238639831543, + "rewards/margins": 28.04422950744629, + "rewards/rejected": -34.21247100830078, + "step": 2541 + }, + { + "epoch": 1.5813374805598754, + "grad_norm": 3.4777238368988037, + "learning_rate": 2.6267865375749196e-06, + "logits/chosen": 0.6678463220596313, + "logits/rejected": 3.8288257122039795, + "logps/chosen": -564.5045166015625, + "logps/rejected": -951.9814453125, + "loss": 0.0122, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.784692764282227, + "rewards/margins": 19.069561004638672, + "rewards/rejected": -26.854251861572266, + "step": 2542 + }, + { + "epoch": 1.5819595645412132, + "grad_norm": 0.014230997301638126, + "learning_rate": 2.6256339326878748e-06, + "logits/chosen": 0.6849037408828735, + "logits/rejected": 3.4208898544311523, + "logps/chosen": -505.7391662597656, + "logps/rejected": -984.9599609375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.177289962768555, + "rewards/margins": 24.551013946533203, + "rewards/rejected": -32.728302001953125, + "step": 2543 + }, + { + "epoch": 1.5825816485225506, + "grad_norm": 0.00025358598213642836, + "learning_rate": 2.62448132780083e-06, + "logits/chosen": -2.680936574935913, + "logits/rejected": 0.6628610491752625, + "logps/chosen": -421.4416198730469, + "logps/rejected": -838.618408203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.892261028289795, + "rewards/margins": 24.422157287597656, + "rewards/rejected": -29.31441879272461, + "step": 2544 + }, + { + "epoch": 1.583203732503888, + "grad_norm": 0.2544800937175751, + "learning_rate": 2.6233287229137857e-06, + "logits/chosen": 1.9035508632659912, + "logits/rejected": 2.245786428451538, + "logps/chosen": -658.3436889648438, + "logps/rejected": -877.4723510742188, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.054585456848145, + "rewards/margins": 20.301048278808594, + "rewards/rejected": -30.355634689331055, + "step": 2545 + }, + { + "epoch": 1.5838258164852255, + "grad_norm": 0.07018294930458069, + "learning_rate": 2.622176118026741e-06, + "logits/chosen": 0.526107668876648, + "logits/rejected": 3.307961940765381, + "logps/chosen": -544.033203125, + "logps/rejected": -850.1934204101562, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.851484298706055, + "rewards/margins": 23.675559997558594, + "rewards/rejected": -29.52704620361328, + "step": 2546 + }, + { + "epoch": 1.584447900466563, + "grad_norm": 0.0002997084229718894, + "learning_rate": 2.621023513139696e-06, + "logits/chosen": -0.7003446221351624, + "logits/rejected": 3.4765868186950684, + "logps/chosen": -441.3431091308594, + "logps/rejected": -953.63671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.216616153717041, + "rewards/margins": 24.402572631835938, + "rewards/rejected": -31.61918830871582, + "step": 2547 + }, + { + "epoch": 1.5850699844479004, + "grad_norm": 0.0024575558491051197, + "learning_rate": 2.6198709082526513e-06, + "logits/chosen": 0.7880875468254089, + "logits/rejected": 4.125187397003174, + "logps/chosen": -468.6260070800781, + "logps/rejected": -896.840087890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.1893792152404785, + "rewards/margins": 22.992290496826172, + "rewards/rejected": -29.181671142578125, + "step": 2548 + }, + { + "epoch": 1.585692068429238, + "grad_norm": 0.0008942119893617928, + "learning_rate": 2.6187183033656065e-06, + "logits/chosen": 1.5010969638824463, + "logits/rejected": 3.093844175338745, + "logps/chosen": -624.7542724609375, + "logps/rejected": -999.3758544921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.038987159729004, + "rewards/margins": 24.880168914794922, + "rewards/rejected": -32.919158935546875, + "step": 2549 + }, + { + "epoch": 1.5863141524105755, + "grad_norm": 0.01182605978101492, + "learning_rate": 2.6175656984785618e-06, + "logits/chosen": -2.0643815994262695, + "logits/rejected": 2.822075366973877, + "logps/chosen": -418.46539306640625, + "logps/rejected": -1023.8269653320312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.2684173583984375, + "rewards/margins": 29.310277938842773, + "rewards/rejected": -35.578697204589844, + "step": 2550 + }, + { + "epoch": 1.5869362363919128, + "grad_norm": 0.20442859828472137, + "learning_rate": 2.616413093591517e-06, + "logits/chosen": 1.9439702033996582, + "logits/rejected": 3.6782374382019043, + "logps/chosen": -584.3154296875, + "logps/rejected": -1012.0755615234375, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.359824180603027, + "rewards/margins": 26.538776397705078, + "rewards/rejected": -32.89860153198242, + "step": 2551 + }, + { + "epoch": 1.5875583203732504, + "grad_norm": 4.9473346734885126e-05, + "learning_rate": 2.6152604887044726e-06, + "logits/chosen": -0.7652897834777832, + "logits/rejected": 3.000626802444458, + "logps/chosen": -460.4172058105469, + "logps/rejected": -990.1380615234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.667954921722412, + "rewards/margins": 30.455583572387695, + "rewards/rejected": -38.123538970947266, + "step": 2552 + }, + { + "epoch": 1.588180404354588, + "grad_norm": 0.3778168559074402, + "learning_rate": 2.614107883817428e-06, + "logits/chosen": 0.441825270652771, + "logits/rejected": 3.4026546478271484, + "logps/chosen": -420.4871520996094, + "logps/rejected": -755.599365234375, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.588260650634766, + "rewards/margins": 17.710298538208008, + "rewards/rejected": -24.298559188842773, + "step": 2553 + }, + { + "epoch": 1.5888024883359253, + "grad_norm": 0.08141383528709412, + "learning_rate": 2.612955278930383e-06, + "logits/chosen": 0.27586179971694946, + "logits/rejected": 3.2958290576934814, + "logps/chosen": -504.78094482421875, + "logps/rejected": -861.496337890625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.5833024978637695, + "rewards/margins": 24.973854064941406, + "rewards/rejected": -29.55715560913086, + "step": 2554 + }, + { + "epoch": 1.5894245723172629, + "grad_norm": 0.0002683989005163312, + "learning_rate": 2.6118026740433383e-06, + "logits/chosen": 0.30600738525390625, + "logits/rejected": 3.9830586910247803, + "logps/chosen": -449.1526794433594, + "logps/rejected": -920.802490234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.895366191864014, + "rewards/margins": 22.761844635009766, + "rewards/rejected": -30.657211303710938, + "step": 2555 + }, + { + "epoch": 1.5900466562986004, + "grad_norm": 0.4529314339160919, + "learning_rate": 2.6106500691562935e-06, + "logits/chosen": 0.1846950650215149, + "logits/rejected": 1.4357441663742065, + "logps/chosen": -549.7300415039062, + "logps/rejected": -770.741943359375, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.734827041625977, + "rewards/margins": 17.410337448120117, + "rewards/rejected": -25.14516830444336, + "step": 2556 + }, + { + "epoch": 1.5906687402799378, + "grad_norm": 0.0962265357375145, + "learning_rate": 2.6094974642692488e-06, + "logits/chosen": 1.6746344566345215, + "logits/rejected": 3.797276496887207, + "logps/chosen": -772.9541625976562, + "logps/rejected": -1070.3515625, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.528155326843262, + "rewards/margins": 22.276142120361328, + "rewards/rejected": -27.804298400878906, + "step": 2557 + }, + { + "epoch": 1.5912908242612753, + "grad_norm": 0.037062421441078186, + "learning_rate": 2.608344859382204e-06, + "logits/chosen": 1.5409696102142334, + "logits/rejected": 3.426913261413574, + "logps/chosen": -573.5651245117188, + "logps/rejected": -867.2743530273438, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.677513122558594, + "rewards/margins": 21.017486572265625, + "rewards/rejected": -28.69500160217285, + "step": 2558 + }, + { + "epoch": 1.591912908242613, + "grad_norm": 0.07313279807567596, + "learning_rate": 2.6071922544951596e-06, + "logits/chosen": 1.7334628105163574, + "logits/rejected": 2.109466075897217, + "logps/chosen": -693.92529296875, + "logps/rejected": -962.532470703125, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.267555236816406, + "rewards/margins": 23.251060485839844, + "rewards/rejected": -36.51861572265625, + "step": 2559 + }, + { + "epoch": 1.5925349922239502, + "grad_norm": 0.02839500457048416, + "learning_rate": 2.606039649608115e-06, + "logits/chosen": -3.9982075691223145, + "logits/rejected": 2.8284850120544434, + "logps/chosen": -334.660888671875, + "logps/rejected": -1039.389404296875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.840045690536499, + "rewards/margins": 29.952611923217773, + "rewards/rejected": -33.792659759521484, + "step": 2560 + }, + { + "epoch": 1.5931570762052876, + "grad_norm": 0.5425875782966614, + "learning_rate": 2.60488704472107e-06, + "logits/chosen": 2.1814193725585938, + "logits/rejected": 4.387759685516357, + "logps/chosen": -606.7564697265625, + "logps/rejected": -958.6481323242188, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.972704887390137, + "rewards/margins": 20.99521255493164, + "rewards/rejected": -30.96791648864746, + "step": 2561 + }, + { + "epoch": 1.5937791601866254, + "grad_norm": 0.09757669270038605, + "learning_rate": 2.6037344398340253e-06, + "logits/chosen": -1.089568018913269, + "logits/rejected": 3.5683395862579346, + "logps/chosen": -398.23394775390625, + "logps/rejected": -1009.205810546875, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4523613452911377, + "rewards/margins": 32.37748718261719, + "rewards/rejected": -35.82984924316406, + "step": 2562 + }, + { + "epoch": 1.5944012441679627, + "grad_norm": 0.0016663366695865989, + "learning_rate": 2.6025818349469805e-06, + "logits/chosen": 2.186467170715332, + "logits/rejected": 4.869716644287109, + "logps/chosen": -648.9400634765625, + "logps/rejected": -1087.5924072265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.644506454467773, + "rewards/margins": 29.386463165283203, + "rewards/rejected": -41.03097152709961, + "step": 2563 + }, + { + "epoch": 1.5950233281493, + "grad_norm": 41.84226989746094, + "learning_rate": 2.6014292300599358e-06, + "logits/chosen": 2.0851328372955322, + "logits/rejected": 2.8250632286071777, + "logps/chosen": -555.81494140625, + "logps/rejected": -855.7350463867188, + "loss": 0.2556, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.315744400024414, + "rewards/margins": 22.709354400634766, + "rewards/rejected": -32.02510070800781, + "step": 2564 + }, + { + "epoch": 1.5956454121306376, + "grad_norm": 3.323015334899537e-05, + "learning_rate": 2.600276625172891e-06, + "logits/chosen": 1.2171071767807007, + "logits/rejected": 4.171212196350098, + "logps/chosen": -536.38916015625, + "logps/rejected": -1118.4737548828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.4384260177612305, + "rewards/margins": 34.14727783203125, + "rewards/rejected": -41.58570861816406, + "step": 2565 + }, + { + "epoch": 1.5962674961119752, + "grad_norm": 7.136670112609863, + "learning_rate": 2.5991240202858462e-06, + "logits/chosen": -3.851240634918213, + "logits/rejected": 1.4777321815490723, + "logps/chosen": -300.756591796875, + "logps/rejected": -792.384765625, + "loss": 0.0283, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.0310492515563965, + "rewards/margins": 17.15809440612793, + "rewards/rejected": -24.189144134521484, + "step": 2566 + }, + { + "epoch": 1.5968895800933125, + "grad_norm": 0.03129857778549194, + "learning_rate": 2.597971415398802e-06, + "logits/chosen": -0.48873722553253174, + "logits/rejected": 4.124366283416748, + "logps/chosen": -465.2950439453125, + "logps/rejected": -994.5694580078125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.589336395263672, + "rewards/margins": 25.836570739746094, + "rewards/rejected": -36.425907135009766, + "step": 2567 + }, + { + "epoch": 1.59751166407465, + "grad_norm": 1.2106145732104778e-05, + "learning_rate": 2.596818810511757e-06, + "logits/chosen": 1.2782320976257324, + "logits/rejected": 2.5213370323181152, + "logps/chosen": -537.9226684570312, + "logps/rejected": -860.0325317382812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.05621337890625, + "rewards/margins": 26.37557601928711, + "rewards/rejected": -32.43178939819336, + "step": 2568 + }, + { + "epoch": 1.5981337480559876, + "grad_norm": 0.0014095701044425368, + "learning_rate": 2.5956662056247123e-06, + "logits/chosen": -1.6308845281600952, + "logits/rejected": 1.9259669780731201, + "logps/chosen": -571.4871826171875, + "logps/rejected": -1069.501708984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.04758358001709, + "rewards/margins": 34.30439758300781, + "rewards/rejected": -43.35198211669922, + "step": 2569 + }, + { + "epoch": 1.598755832037325, + "grad_norm": 0.000554277969058603, + "learning_rate": 2.5945136007376675e-06, + "logits/chosen": -0.8610889315605164, + "logits/rejected": 3.7429866790771484, + "logps/chosen": -375.9157409667969, + "logps/rejected": -974.5948486328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.605518341064453, + "rewards/margins": 28.76226043701172, + "rewards/rejected": -34.36778259277344, + "step": 2570 + }, + { + "epoch": 1.5993779160186625, + "grad_norm": 3.883296813000925e-05, + "learning_rate": 2.5933609958506228e-06, + "logits/chosen": -2.023681640625, + "logits/rejected": 4.468254566192627, + "logps/chosen": -376.07720947265625, + "logps/rejected": -1144.0804443359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.148895263671875, + "rewards/margins": 39.1790771484375, + "rewards/rejected": -44.327972412109375, + "step": 2571 + }, + { + "epoch": 1.6, + "grad_norm": 0.05322521552443504, + "learning_rate": 2.592208390963578e-06, + "logits/chosen": -2.8253278732299805, + "logits/rejected": 2.3830480575561523, + "logps/chosen": -418.5565185546875, + "logps/rejected": -1005.6823120117188, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.197831630706787, + "rewards/margins": 26.426393508911133, + "rewards/rejected": -29.624225616455078, + "step": 2572 + }, + { + "epoch": 1.6006220839813374, + "grad_norm": 11.513082504272461, + "learning_rate": 2.5910557860765332e-06, + "logits/chosen": -1.173416018486023, + "logits/rejected": 1.7636840343475342, + "logps/chosen": -539.8458862304688, + "logps/rejected": -1030.737060546875, + "loss": 0.0488, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.969292640686035, + "rewards/margins": 24.555206298828125, + "rewards/rejected": -30.524499893188477, + "step": 2573 + }, + { + "epoch": 1.601244167962675, + "grad_norm": 0.017018593847751617, + "learning_rate": 2.589903181189489e-06, + "logits/chosen": -2.8795456886291504, + "logits/rejected": 1.4251694679260254, + "logps/chosen": -391.2342224121094, + "logps/rejected": -964.24072265625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.388416290283203, + "rewards/margins": 28.631431579589844, + "rewards/rejected": -35.01984405517578, + "step": 2574 + }, + { + "epoch": 1.6018662519440126, + "grad_norm": 32.19961166381836, + "learning_rate": 2.588750576302444e-06, + "logits/chosen": -1.7024846076965332, + "logits/rejected": 3.0102920532226562, + "logps/chosen": -383.92584228515625, + "logps/rejected": -823.80712890625, + "loss": 0.7058, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.452936172485352, + "rewards/margins": 15.71203899383545, + "rewards/rejected": -22.164974212646484, + "step": 2575 + }, + { + "epoch": 1.60248833592535, + "grad_norm": 9.193151527142618e-06, + "learning_rate": 2.5875979714153993e-06, + "logits/chosen": -0.4175698757171631, + "logits/rejected": 4.685636520385742, + "logps/chosen": -467.2003173828125, + "logps/rejected": -1059.797607421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.230841159820557, + "rewards/margins": 32.583526611328125, + "rewards/rejected": -39.81436538696289, + "step": 2576 + }, + { + "epoch": 1.6031104199066875, + "grad_norm": 0.6161825656890869, + "learning_rate": 2.5864453665283545e-06, + "logits/chosen": -0.6858171224594116, + "logits/rejected": 2.824033737182617, + "logps/chosen": -561.7140502929688, + "logps/rejected": -998.8251342773438, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.853142738342285, + "rewards/margins": 32.505619049072266, + "rewards/rejected": -37.3587646484375, + "step": 2577 + }, + { + "epoch": 1.603732503888025, + "grad_norm": 26.230451583862305, + "learning_rate": 2.5852927616413098e-06, + "logits/chosen": 0.6289217472076416, + "logits/rejected": 3.2599964141845703, + "logps/chosen": -524.7599487304688, + "logps/rejected": -822.0818481445312, + "loss": 0.1951, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.978344917297363, + "rewards/margins": 17.70925521850586, + "rewards/rejected": -24.687599182128906, + "step": 2578 + }, + { + "epoch": 1.6043545878693624, + "grad_norm": 4.318963320315561e-08, + "learning_rate": 2.584140156754265e-06, + "logits/chosen": -2.402571439743042, + "logits/rejected": 3.2633743286132812, + "logps/chosen": -379.7547302246094, + "logps/rejected": -1089.806640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.327663421630859, + "rewards/margins": 37.135555267333984, + "rewards/rejected": -44.463218688964844, + "step": 2579 + }, + { + "epoch": 1.6049766718506997, + "grad_norm": 0.19019412994384766, + "learning_rate": 2.58298755186722e-06, + "logits/chosen": 2.633239984512329, + "logits/rejected": 4.513375282287598, + "logps/chosen": -677.1762084960938, + "logps/rejected": -1026.538330078125, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.649065017700195, + "rewards/margins": 23.47222328186035, + "rewards/rejected": -35.12129211425781, + "step": 2580 + }, + { + "epoch": 1.6055987558320375, + "grad_norm": 0.10817208886146545, + "learning_rate": 2.581834946980176e-06, + "logits/chosen": -1.4973026514053345, + "logits/rejected": 2.0379388332366943, + "logps/chosen": -446.043212890625, + "logps/rejected": -862.3677368164062, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.164411544799805, + "rewards/margins": 23.581241607666016, + "rewards/rejected": -30.74565315246582, + "step": 2581 + }, + { + "epoch": 1.6062208398133748, + "grad_norm": 0.1274665892124176, + "learning_rate": 2.580682342093131e-06, + "logits/chosen": -0.8409813642501831, + "logits/rejected": 2.7474489212036133, + "logps/chosen": -393.14324951171875, + "logps/rejected": -764.0924072265625, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.663969039916992, + "rewards/margins": 22.817779541015625, + "rewards/rejected": -31.48175048828125, + "step": 2582 + }, + { + "epoch": 1.6068429237947122, + "grad_norm": 0.03009512461721897, + "learning_rate": 2.5795297372060863e-06, + "logits/chosen": 0.9661927819252014, + "logits/rejected": 3.4408862590789795, + "logps/chosen": -397.1964416503906, + "logps/rejected": -772.956787109375, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.04042911529541, + "rewards/margins": 22.691429138183594, + "rewards/rejected": -28.731861114501953, + "step": 2583 + }, + { + "epoch": 1.6074650077760497, + "grad_norm": 0.24635007977485657, + "learning_rate": 2.5783771323190415e-06, + "logits/chosen": -1.9829556941986084, + "logits/rejected": 3.6001546382904053, + "logps/chosen": -356.6353759765625, + "logps/rejected": -788.8740234375, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.533819198608398, + "rewards/margins": 16.42125701904297, + "rewards/rejected": -21.955076217651367, + "step": 2584 + }, + { + "epoch": 1.6080870917573873, + "grad_norm": 29.279674530029297, + "learning_rate": 2.5772245274319963e-06, + "logits/chosen": 1.2748240232467651, + "logits/rejected": 2.8580141067504883, + "logps/chosen": -573.7198486328125, + "logps/rejected": -938.4467163085938, + "loss": 0.3834, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.002640724182129, + "rewards/margins": 25.643817901611328, + "rewards/rejected": -33.646461486816406, + "step": 2585 + }, + { + "epoch": 1.6087091757387246, + "grad_norm": 23.010272979736328, + "learning_rate": 2.5760719225449516e-06, + "logits/chosen": 1.743993878364563, + "logits/rejected": 3.330801248550415, + "logps/chosen": -648.9251098632812, + "logps/rejected": -1052.299560546875, + "loss": 0.2618, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.065129280090332, + "rewards/margins": 26.94137954711914, + "rewards/rejected": -35.00651168823242, + "step": 2586 + }, + { + "epoch": 1.6093312597200622, + "grad_norm": 1.394989226355392e-07, + "learning_rate": 2.5749193176579068e-06, + "logits/chosen": 2.410019636154175, + "logits/rejected": 4.149812698364258, + "logps/chosen": -608.6632690429688, + "logps/rejected": -938.4107666015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.392426490783691, + "rewards/margins": 26.489665985107422, + "rewards/rejected": -34.8820915222168, + "step": 2587 + }, + { + "epoch": 1.6099533437013998, + "grad_norm": 0.0016567111015319824, + "learning_rate": 2.573766712770862e-06, + "logits/chosen": -1.261190414428711, + "logits/rejected": 3.9460859298706055, + "logps/chosen": -293.0774841308594, + "logps/rejected": -949.5228271484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.228933334350586, + "rewards/margins": 29.283653259277344, + "rewards/rejected": -35.51258850097656, + "step": 2588 + }, + { + "epoch": 1.610575427682737, + "grad_norm": 0.02118654176592827, + "learning_rate": 2.5726141078838172e-06, + "logits/chosen": -0.7358344197273254, + "logits/rejected": 3.6361420154571533, + "logps/chosen": -437.2724609375, + "logps/rejected": -1011.6895751953125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.786621570587158, + "rewards/margins": 28.29082679748535, + "rewards/rejected": -36.077449798583984, + "step": 2589 + }, + { + "epoch": 1.6111975116640747, + "grad_norm": 34.734352111816406, + "learning_rate": 2.5714615029967725e-06, + "logits/chosen": 1.2995983362197876, + "logits/rejected": 3.4828128814697266, + "logps/chosen": -684.9298095703125, + "logps/rejected": -985.8946533203125, + "loss": 0.2266, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.939543724060059, + "rewards/margins": 19.27617073059082, + "rewards/rejected": -28.215713500976562, + "step": 2590 + }, + { + "epoch": 1.6118195956454122, + "grad_norm": 0.23033146560192108, + "learning_rate": 2.570308898109728e-06, + "logits/chosen": -2.0494282245635986, + "logits/rejected": 2.8577654361724854, + "logps/chosen": -346.23577880859375, + "logps/rejected": -907.2470092773438, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1096112728118896, + "rewards/margins": 27.401147842407227, + "rewards/rejected": -30.510757446289062, + "step": 2591 + }, + { + "epoch": 1.6124416796267496, + "grad_norm": 0.8053516745567322, + "learning_rate": 2.5691562932226833e-06, + "logits/chosen": -1.868981957435608, + "logits/rejected": 3.8824474811553955, + "logps/chosen": -460.58026123046875, + "logps/rejected": -1142.6533203125, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.4744157791137695, + "rewards/margins": 29.066810607910156, + "rewards/rejected": -36.541229248046875, + "step": 2592 + }, + { + "epoch": 1.6130637636080871, + "grad_norm": 4.806786060333252, + "learning_rate": 2.5680036883356386e-06, + "logits/chosen": 1.3408687114715576, + "logits/rejected": 0.943589448928833, + "logps/chosen": -657.0845336914062, + "logps/rejected": -760.260009765625, + "loss": 0.0131, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.541203498840332, + "rewards/margins": 18.67476463317871, + "rewards/rejected": -27.215970993041992, + "step": 2593 + }, + { + "epoch": 1.6136858475894247, + "grad_norm": 7.727682532276958e-05, + "learning_rate": 2.5668510834485938e-06, + "logits/chosen": -0.061612486839294434, + "logits/rejected": 3.8385701179504395, + "logps/chosen": -550.5499267578125, + "logps/rejected": -1054.3939208984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.10532808303833, + "rewards/margins": 26.229053497314453, + "rewards/rejected": -33.334381103515625, + "step": 2594 + }, + { + "epoch": 1.614307931570762, + "grad_norm": 0.4082733392715454, + "learning_rate": 2.565698478561549e-06, + "logits/chosen": -1.452492117881775, + "logits/rejected": 1.2722666263580322, + "logps/chosen": -538.8363647460938, + "logps/rejected": -978.8455200195312, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.732166290283203, + "rewards/margins": 27.385570526123047, + "rewards/rejected": -39.11773681640625, + "step": 2595 + }, + { + "epoch": 1.6149300155520996, + "grad_norm": 0.04512008652091026, + "learning_rate": 2.5645458736745042e-06, + "logits/chosen": 0.16232812404632568, + "logits/rejected": 2.1995935440063477, + "logps/chosen": -695.5032348632812, + "logps/rejected": -1052.70947265625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.921582221984863, + "rewards/margins": 25.841278076171875, + "rewards/rejected": -38.762855529785156, + "step": 2596 + }, + { + "epoch": 1.6155520995334371, + "grad_norm": 0.00561953941360116, + "learning_rate": 2.5633932687874595e-06, + "logits/chosen": 0.4279603064060211, + "logits/rejected": 3.640723943710327, + "logps/chosen": -489.49615478515625, + "logps/rejected": -974.6058349609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.134950160980225, + "rewards/margins": 29.59377098083496, + "rewards/rejected": -34.728721618652344, + "step": 2597 + }, + { + "epoch": 1.6161741835147745, + "grad_norm": 0.0004975904012098908, + "learning_rate": 2.562240663900415e-06, + "logits/chosen": 2.388624668121338, + "logits/rejected": 3.4154815673828125, + "logps/chosen": -653.7855224609375, + "logps/rejected": -907.9586181640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.451631546020508, + "rewards/margins": 22.841678619384766, + "rewards/rejected": -33.293312072753906, + "step": 2598 + }, + { + "epoch": 1.6167962674961118, + "grad_norm": 2.29254254469069e-11, + "learning_rate": 2.5610880590133703e-06, + "logits/chosen": -2.7268898487091064, + "logits/rejected": 3.9294936656951904, + "logps/chosen": -445.59332275390625, + "logps/rejected": -1288.5296630859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.320531845092773, + "rewards/margins": 41.719940185546875, + "rewards/rejected": -47.04046630859375, + "step": 2599 + }, + { + "epoch": 1.6174183514774496, + "grad_norm": 0.00903357844799757, + "learning_rate": 2.5599354541263256e-06, + "logits/chosen": -2.188351631164551, + "logits/rejected": 3.1762428283691406, + "logps/chosen": -226.08502197265625, + "logps/rejected": -817.2967529296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.105920791625977, + "rewards/margins": 27.778106689453125, + "rewards/rejected": -31.884029388427734, + "step": 2600 + }, + { + "epoch": 1.618040435458787, + "grad_norm": 0.022304048761725426, + "learning_rate": 2.5587828492392808e-06, + "logits/chosen": -1.54436457157135, + "logits/rejected": 3.401952028274536, + "logps/chosen": -393.1194152832031, + "logps/rejected": -939.4529418945312, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.2694783210754395, + "rewards/margins": 23.11556625366211, + "rewards/rejected": -27.38504409790039, + "step": 2601 + }, + { + "epoch": 1.6186625194401243, + "grad_norm": 0.47571098804473877, + "learning_rate": 2.557630244352236e-06, + "logits/chosen": 0.1825639009475708, + "logits/rejected": 3.720527172088623, + "logps/chosen": -609.8028564453125, + "logps/rejected": -1145.288818359375, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.5856475830078125, + "rewards/margins": 31.391204833984375, + "rewards/rejected": -38.97685241699219, + "step": 2602 + }, + { + "epoch": 1.6192846034214619, + "grad_norm": 0.07786351442337036, + "learning_rate": 2.5564776394651912e-06, + "logits/chosen": 0.14193564653396606, + "logits/rejected": 1.8101928234100342, + "logps/chosen": -526.883056640625, + "logps/rejected": -821.777587890625, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.243754863739014, + "rewards/margins": 23.589351654052734, + "rewards/rejected": -29.833106994628906, + "step": 2603 + }, + { + "epoch": 1.6199066874027994, + "grad_norm": 13.48662281036377, + "learning_rate": 2.5553250345781465e-06, + "logits/chosen": 0.29353073239326477, + "logits/rejected": 1.9300470352172852, + "logps/chosen": -537.5771484375, + "logps/rejected": -801.8447265625, + "loss": 0.0731, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.729201316833496, + "rewards/margins": 18.486591339111328, + "rewards/rejected": -27.21579360961914, + "step": 2604 + }, + { + "epoch": 1.6205287713841368, + "grad_norm": 14.752580642700195, + "learning_rate": 2.554172429691102e-06, + "logits/chosen": 0.7777003645896912, + "logits/rejected": 3.255272388458252, + "logps/chosen": -503.46063232421875, + "logps/rejected": -869.5552978515625, + "loss": 0.0813, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.107442378997803, + "rewards/margins": 21.075592041015625, + "rewards/rejected": -26.183034896850586, + "step": 2605 + }, + { + "epoch": 1.6211508553654743, + "grad_norm": 6.323744310066104e-05, + "learning_rate": 2.5530198248040573e-06, + "logits/chosen": 1.3028900623321533, + "logits/rejected": 3.118048667907715, + "logps/chosen": -682.3069458007812, + "logps/rejected": -1054.4993896484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.212747573852539, + "rewards/margins": 24.12259292602539, + "rewards/rejected": -35.3353385925293, + "step": 2606 + }, + { + "epoch": 1.6217729393468119, + "grad_norm": 3.8181777000427246, + "learning_rate": 2.5518672199170125e-06, + "logits/chosen": 2.5419797897338867, + "logits/rejected": 3.7514326572418213, + "logps/chosen": -671.3524780273438, + "logps/rejected": -910.8513793945312, + "loss": 0.0165, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.456694602966309, + "rewards/margins": 18.08182144165039, + "rewards/rejected": -26.538516998291016, + "step": 2607 + }, + { + "epoch": 1.6223950233281492, + "grad_norm": 0.0002522819268051535, + "learning_rate": 2.5507146150299678e-06, + "logits/chosen": 3.79829478263855, + "logits/rejected": 3.902897357940674, + "logps/chosen": -660.8634033203125, + "logps/rejected": -977.2813720703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.231990814208984, + "rewards/margins": 29.680944442749023, + "rewards/rejected": -37.91293716430664, + "step": 2608 + }, + { + "epoch": 1.6230171073094868, + "grad_norm": 0.04555211216211319, + "learning_rate": 2.549562010142923e-06, + "logits/chosen": -1.7312321662902832, + "logits/rejected": 0.7760089635848999, + "logps/chosen": -510.12109375, + "logps/rejected": -881.8781127929688, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.821007251739502, + "rewards/margins": 24.35039520263672, + "rewards/rejected": -29.171401977539062, + "step": 2609 + }, + { + "epoch": 1.6236391912908243, + "grad_norm": 4.4340089933037063e-10, + "learning_rate": 2.5484094052558782e-06, + "logits/chosen": 1.1520129442214966, + "logits/rejected": 5.368868350982666, + "logps/chosen": -535.7080078125, + "logps/rejected": -1051.622802734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.899171352386475, + "rewards/margins": 31.874183654785156, + "rewards/rejected": -38.77335739135742, + "step": 2610 + }, + { + "epoch": 1.6242612752721617, + "grad_norm": 0.0008103394648060203, + "learning_rate": 2.5472568003688334e-06, + "logits/chosen": -0.27327293157577515, + "logits/rejected": 3.8780975341796875, + "logps/chosen": -451.706787109375, + "logps/rejected": -1007.6486206054688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.71866226196289, + "rewards/margins": 31.797969818115234, + "rewards/rejected": -40.516632080078125, + "step": 2611 + }, + { + "epoch": 1.6248833592534992, + "grad_norm": 2.2073519229888916, + "learning_rate": 2.546104195481789e-06, + "logits/chosen": -1.2786414623260498, + "logits/rejected": 2.140981674194336, + "logps/chosen": -542.6721801757812, + "logps/rejected": -953.30615234375, + "loss": 0.0101, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.801692962646484, + "rewards/margins": 23.804019927978516, + "rewards/rejected": -33.605712890625, + "step": 2612 + }, + { + "epoch": 1.6255054432348368, + "grad_norm": 0.0158245787024498, + "learning_rate": 2.5449515905947443e-06, + "logits/chosen": -0.201128751039505, + "logits/rejected": 2.1382110118865967, + "logps/chosen": -388.56170654296875, + "logps/rejected": -689.6513061523438, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2910847663879395, + "rewards/margins": 19.573322296142578, + "rewards/rejected": -22.86440658569336, + "step": 2613 + }, + { + "epoch": 1.6261275272161742, + "grad_norm": 0.0037320577539503574, + "learning_rate": 2.5437989857076995e-06, + "logits/chosen": -1.5668559074401855, + "logits/rejected": 3.3568079471588135, + "logps/chosen": -420.1706848144531, + "logps/rejected": -986.5573120117188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.247982025146484, + "rewards/margins": 28.099079132080078, + "rewards/rejected": -33.34706115722656, + "step": 2614 + }, + { + "epoch": 1.6267496111975117, + "grad_norm": 0.9762111902236938, + "learning_rate": 2.5426463808206548e-06, + "logits/chosen": -0.7459107041358948, + "logits/rejected": 3.1922781467437744, + "logps/chosen": -568.686767578125, + "logps/rejected": -1133.5242919921875, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.407186508178711, + "rewards/margins": 23.96875, + "rewards/rejected": -33.375938415527344, + "step": 2615 + }, + { + "epoch": 1.6273716951788493, + "grad_norm": 0.19420433044433594, + "learning_rate": 2.54149377593361e-06, + "logits/chosen": 0.6126387119293213, + "logits/rejected": 2.9637625217437744, + "logps/chosen": -562.302978515625, + "logps/rejected": -1003.8385009765625, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.56039047241211, + "rewards/margins": 21.085357666015625, + "rewards/rejected": -31.6457462310791, + "step": 2616 + }, + { + "epoch": 1.6279937791601866, + "grad_norm": 0.011737792752683163, + "learning_rate": 2.5403411710465652e-06, + "logits/chosen": 2.8269572257995605, + "logits/rejected": 3.9817934036254883, + "logps/chosen": -689.129150390625, + "logps/rejected": -986.1431884765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.855367660522461, + "rewards/margins": 21.466367721557617, + "rewards/rejected": -31.321735382080078, + "step": 2617 + }, + { + "epoch": 1.628615863141524, + "grad_norm": 3.9934420585632324, + "learning_rate": 2.5391885661595204e-06, + "logits/chosen": 1.5089527368545532, + "logits/rejected": 2.406381130218506, + "logps/chosen": -523.5970458984375, + "logps/rejected": -897.9669799804688, + "loss": 0.0416, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.596930027008057, + "rewards/margins": 27.403316497802734, + "rewards/rejected": -35.000244140625, + "step": 2618 + }, + { + "epoch": 1.6292379471228617, + "grad_norm": 0.024353763088583946, + "learning_rate": 2.5380359612724757e-06, + "logits/chosen": 2.6592142581939697, + "logits/rejected": 4.7618513107299805, + "logps/chosen": -617.1805419921875, + "logps/rejected": -967.5352783203125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.64953327178955, + "rewards/margins": 27.00077247619629, + "rewards/rejected": -36.650306701660156, + "step": 2619 + }, + { + "epoch": 1.629860031104199, + "grad_norm": 0.06505515426397324, + "learning_rate": 2.5368833563854313e-06, + "logits/chosen": 2.7376463413238525, + "logits/rejected": 3.918849468231201, + "logps/chosen": -727.3867797851562, + "logps/rejected": -1086.908447265625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.635740280151367, + "rewards/margins": 27.26062774658203, + "rewards/rejected": -38.896366119384766, + "step": 2620 + }, + { + "epoch": 1.6304821150855364, + "grad_norm": 0.014469513669610023, + "learning_rate": 2.5357307514983865e-06, + "logits/chosen": -0.44161003828048706, + "logits/rejected": 2.54278564453125, + "logps/chosen": -645.5219116210938, + "logps/rejected": -1192.2489013671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.379627227783203, + "rewards/margins": 30.334142684936523, + "rewards/rejected": -39.71377182006836, + "step": 2621 + }, + { + "epoch": 1.631104199066874, + "grad_norm": 0.0001569920714246109, + "learning_rate": 2.5345781466113418e-06, + "logits/chosen": 0.8897266387939453, + "logits/rejected": 3.378631353378296, + "logps/chosen": -578.5534057617188, + "logps/rejected": -1006.815673828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.458053588867188, + "rewards/margins": 26.728559494018555, + "rewards/rejected": -35.186614990234375, + "step": 2622 + }, + { + "epoch": 1.6317262830482115, + "grad_norm": 2.3231365048559383e-05, + "learning_rate": 2.533425541724297e-06, + "logits/chosen": -1.3021610975265503, + "logits/rejected": 1.9647281169891357, + "logps/chosen": -413.54571533203125, + "logps/rejected": -846.652099609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.324384689331055, + "rewards/margins": 26.152280807495117, + "rewards/rejected": -34.47666549682617, + "step": 2623 + }, + { + "epoch": 1.6323483670295489, + "grad_norm": 0.0006543318158946931, + "learning_rate": 2.5322729368372522e-06, + "logits/chosen": 2.5640270709991455, + "logits/rejected": 4.1576032638549805, + "logps/chosen": -714.5624389648438, + "logps/rejected": -1073.593017578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.278242111206055, + "rewards/margins": 27.065113067626953, + "rewards/rejected": -36.343353271484375, + "step": 2624 + }, + { + "epoch": 1.6329704510108864, + "grad_norm": 0.13595524430274963, + "learning_rate": 2.5311203319502074e-06, + "logits/chosen": 1.7532628774642944, + "logits/rejected": 3.9578118324279785, + "logps/chosen": -714.988037109375, + "logps/rejected": -1059.7442626953125, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.232544898986816, + "rewards/margins": 24.83914566040039, + "rewards/rejected": -37.07168960571289, + "step": 2625 + }, + { + "epoch": 1.633592534992224, + "grad_norm": 0.005432057660073042, + "learning_rate": 2.5299677270631627e-06, + "logits/chosen": -1.5166964530944824, + "logits/rejected": 2.6830894947052, + "logps/chosen": -349.53704833984375, + "logps/rejected": -1010.3942260742188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.998991966247559, + "rewards/margins": 37.300437927246094, + "rewards/rejected": -43.29943084716797, + "step": 2626 + }, + { + "epoch": 1.6342146189735614, + "grad_norm": 0.2473265379667282, + "learning_rate": 2.5288151221761183e-06, + "logits/chosen": -2.057335615158081, + "logits/rejected": 3.281980514526367, + "logps/chosen": -425.85528564453125, + "logps/rejected": -965.0843505859375, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.48233699798584, + "rewards/margins": 27.600360870361328, + "rewards/rejected": -38.082698822021484, + "step": 2627 + }, + { + "epoch": 1.634836702954899, + "grad_norm": 0.0459589809179306, + "learning_rate": 2.5276625172890735e-06, + "logits/chosen": 1.5313283205032349, + "logits/rejected": 3.6381654739379883, + "logps/chosen": -675.596923828125, + "logps/rejected": -1058.517578125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.262785911560059, + "rewards/margins": 25.024362564086914, + "rewards/rejected": -34.287147521972656, + "step": 2628 + }, + { + "epoch": 1.6354587869362365, + "grad_norm": 15.045350074768066, + "learning_rate": 2.5265099124020288e-06, + "logits/chosen": -1.2002232074737549, + "logits/rejected": 1.7366091012954712, + "logps/chosen": -623.9976806640625, + "logps/rejected": -1112.90966796875, + "loss": 0.0817, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.058524131774902, + "rewards/margins": 33.36068344116211, + "rewards/rejected": -41.419212341308594, + "step": 2629 + }, + { + "epoch": 1.6360808709175738, + "grad_norm": 0.17426888644695282, + "learning_rate": 2.525357307514984e-06, + "logits/chosen": -0.4459741711616516, + "logits/rejected": 1.9978004693984985, + "logps/chosen": -593.0241088867188, + "logps/rejected": -930.87646484375, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.513236999511719, + "rewards/margins": 23.07077407836914, + "rewards/rejected": -32.58401107788086, + "step": 2630 + }, + { + "epoch": 1.6367029548989114, + "grad_norm": 0.032606858760118484, + "learning_rate": 2.5242047026279392e-06, + "logits/chosen": 2.1993825435638428, + "logits/rejected": 2.9777932167053223, + "logps/chosen": -734.6822509765625, + "logps/rejected": -1108.458984375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.36622142791748, + "rewards/margins": 29.997440338134766, + "rewards/rejected": -42.36366271972656, + "step": 2631 + }, + { + "epoch": 1.637325038880249, + "grad_norm": 0.2596385180950165, + "learning_rate": 2.5230520977408944e-06, + "logits/chosen": -0.051331907510757446, + "logits/rejected": 4.525763988494873, + "logps/chosen": -351.0687561035156, + "logps/rejected": -884.1585693359375, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.428440570831299, + "rewards/margins": 27.59353256225586, + "rewards/rejected": -31.02197265625, + "step": 2632 + }, + { + "epoch": 1.6379471228615863, + "grad_norm": 2.970715045928955, + "learning_rate": 2.5218994928538497e-06, + "logits/chosen": -2.0356454849243164, + "logits/rejected": 3.7598862648010254, + "logps/chosen": -416.25897216796875, + "logps/rejected": -1123.38330078125, + "loss": 0.0107, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.475407600402832, + "rewards/margins": 33.963050842285156, + "rewards/rejected": -40.43846130371094, + "step": 2633 + }, + { + "epoch": 1.6385692068429238, + "grad_norm": 0.053677089512348175, + "learning_rate": 2.5207468879668053e-06, + "logits/chosen": 3.2241604328155518, + "logits/rejected": 5.152410507202148, + "logps/chosen": -655.5747680664062, + "logps/rejected": -1016.7356567382812, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.689812660217285, + "rewards/margins": 28.69457244873047, + "rewards/rejected": -35.38438415527344, + "step": 2634 + }, + { + "epoch": 1.6391912908242614, + "grad_norm": 29.179723739624023, + "learning_rate": 2.5195942830797605e-06, + "logits/chosen": 3.355369806289673, + "logits/rejected": 4.142183303833008, + "logps/chosen": -637.4825439453125, + "logps/rejected": -853.29833984375, + "loss": 0.2064, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.102502822875977, + "rewards/margins": 18.019689559936523, + "rewards/rejected": -27.1221923828125, + "step": 2635 + }, + { + "epoch": 1.6398133748055987, + "grad_norm": 2.7533092498779297, + "learning_rate": 2.5184416781927158e-06, + "logits/chosen": 0.7892777919769287, + "logits/rejected": 2.5396130084991455, + "logps/chosen": -529.934326171875, + "logps/rejected": -895.4093017578125, + "loss": 0.0111, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.260340690612793, + "rewards/margins": 23.903886795043945, + "rewards/rejected": -33.16422653198242, + "step": 2636 + }, + { + "epoch": 1.640435458786936, + "grad_norm": 0.0008083415450528264, + "learning_rate": 2.517289073305671e-06, + "logits/chosen": -1.9454562664031982, + "logits/rejected": 4.325829982757568, + "logps/chosen": -366.5877990722656, + "logps/rejected": -1101.1285400390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.116681098937988, + "rewards/margins": 35.933876037597656, + "rewards/rejected": -41.050559997558594, + "step": 2637 + }, + { + "epoch": 1.6410575427682739, + "grad_norm": 0.020745502784848213, + "learning_rate": 2.516136468418626e-06, + "logits/chosen": -0.43713444471359253, + "logits/rejected": 4.419112205505371, + "logps/chosen": -330.8953857421875, + "logps/rejected": -810.3660888671875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.58512020111084, + "rewards/margins": 23.428924560546875, + "rewards/rejected": -28.0140438079834, + "step": 2638 + }, + { + "epoch": 1.6416796267496112, + "grad_norm": 1.2889726349385455e-05, + "learning_rate": 2.5149838635315814e-06, + "logits/chosen": -0.5015338659286499, + "logits/rejected": 2.5819554328918457, + "logps/chosen": -532.0968017578125, + "logps/rejected": -1029.7630615234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.803220748901367, + "rewards/margins": 35.672523498535156, + "rewards/rejected": -42.475738525390625, + "step": 2639 + }, + { + "epoch": 1.6423017107309485, + "grad_norm": 0.001544899307191372, + "learning_rate": 2.5138312586445367e-06, + "logits/chosen": 0.0655108094215393, + "logits/rejected": 3.8382391929626465, + "logps/chosen": -442.2047119140625, + "logps/rejected": -894.7894287109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.998608589172363, + "rewards/margins": 24.21469497680664, + "rewards/rejected": -32.21330261230469, + "step": 2640 + }, + { + "epoch": 1.6429237947122861, + "grad_norm": 0.00020783714717254043, + "learning_rate": 2.512678653757492e-06, + "logits/chosen": -3.7620840072631836, + "logits/rejected": 3.1680679321289062, + "logps/chosen": -407.64263916015625, + "logps/rejected": -1092.920654296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.393203735351562, + "rewards/margins": 35.05226135253906, + "rewards/rejected": -43.445465087890625, + "step": 2641 + }, + { + "epoch": 1.6435458786936237, + "grad_norm": 15.702348709106445, + "learning_rate": 2.5115260488704475e-06, + "logits/chosen": 1.081176519393921, + "logits/rejected": 3.1471967697143555, + "logps/chosen": -568.99365234375, + "logps/rejected": -961.87890625, + "loss": 0.085, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.569413185119629, + "rewards/margins": 24.506071090698242, + "rewards/rejected": -34.07548522949219, + "step": 2642 + }, + { + "epoch": 1.644167962674961, + "grad_norm": 1.2208596672280692e-05, + "learning_rate": 2.5103734439834028e-06, + "logits/chosen": -0.700672447681427, + "logits/rejected": 1.970914602279663, + "logps/chosen": -495.0232849121094, + "logps/rejected": -855.750244140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.2229461669921875, + "rewards/margins": 29.222091674804688, + "rewards/rejected": -35.445037841796875, + "step": 2643 + }, + { + "epoch": 1.6447900466562986, + "grad_norm": 0.570807695388794, + "learning_rate": 2.509220839096358e-06, + "logits/chosen": 1.094158411026001, + "logits/rejected": 1.5800312757492065, + "logps/chosen": -552.4247436523438, + "logps/rejected": -832.7394409179688, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.968006134033203, + "rewards/margins": 25.20488166809082, + "rewards/rejected": -31.172887802124023, + "step": 2644 + }, + { + "epoch": 1.6454121306376361, + "grad_norm": 5.437176878331229e-05, + "learning_rate": 2.508068234209313e-06, + "logits/chosen": -0.7897230982780457, + "logits/rejected": 3.430896282196045, + "logps/chosen": -423.3076171875, + "logps/rejected": -1002.4334716796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.253915309906006, + "rewards/margins": 30.8887882232666, + "rewards/rejected": -36.1427001953125, + "step": 2645 + }, + { + "epoch": 1.6460342146189735, + "grad_norm": 22.25408172607422, + "learning_rate": 2.5069156293222684e-06, + "logits/chosen": -0.08154647052288055, + "logits/rejected": 2.993760585784912, + "logps/chosen": -472.1348876953125, + "logps/rejected": -845.7042846679688, + "loss": 0.1614, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.82140064239502, + "rewards/margins": 18.36347770690918, + "rewards/rejected": -27.18487548828125, + "step": 2646 + }, + { + "epoch": 1.646656298600311, + "grad_norm": 26.578590393066406, + "learning_rate": 2.5057630244352237e-06, + "logits/chosen": 0.6960867643356323, + "logits/rejected": 4.30970573425293, + "logps/chosen": -570.57421875, + "logps/rejected": -988.6622924804688, + "loss": 0.1848, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.502416610717773, + "rewards/margins": 23.228342056274414, + "rewards/rejected": -35.73075866699219, + "step": 2647 + }, + { + "epoch": 1.6472783825816486, + "grad_norm": 2.429492235183716, + "learning_rate": 2.504610419548179e-06, + "logits/chosen": 0.29884105920791626, + "logits/rejected": 3.7831315994262695, + "logps/chosen": -520.1424560546875, + "logps/rejected": -938.1561279296875, + "loss": 0.0098, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.274698257446289, + "rewards/margins": 23.346614837646484, + "rewards/rejected": -30.621315002441406, + "step": 2648 + }, + { + "epoch": 1.647900466562986, + "grad_norm": 0.22774247825145721, + "learning_rate": 2.5034578146611345e-06, + "logits/chosen": -0.22361783683300018, + "logits/rejected": 4.840500354766846, + "logps/chosen": -470.9745178222656, + "logps/rejected": -1212.837158203125, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.539072036743164, + "rewards/margins": 39.26922607421875, + "rewards/rejected": -47.80830383300781, + "step": 2649 + }, + { + "epoch": 1.6485225505443235, + "grad_norm": 0.0011797187617048621, + "learning_rate": 2.5023052097740898e-06, + "logits/chosen": 0.7142741680145264, + "logits/rejected": 3.0429601669311523, + "logps/chosen": -618.66357421875, + "logps/rejected": -1096.073974609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.319652557373047, + "rewards/margins": 34.73102569580078, + "rewards/rejected": -44.05067825317383, + "step": 2650 + }, + { + "epoch": 1.649144634525661, + "grad_norm": 26.476112365722656, + "learning_rate": 2.501152604887045e-06, + "logits/chosen": 2.0527851581573486, + "logits/rejected": 4.556172847747803, + "logps/chosen": -514.275146484375, + "logps/rejected": -912.59033203125, + "loss": 0.5968, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.415748596191406, + "rewards/margins": 24.877079010009766, + "rewards/rejected": -33.292823791503906, + "step": 2651 + }, + { + "epoch": 1.6497667185069984, + "grad_norm": 2.9838106632232666, + "learning_rate": 2.5e-06, + "logits/chosen": 1.1987992525100708, + "logits/rejected": 2.6071865558624268, + "logps/chosen": -553.8780517578125, + "logps/rejected": -1002.5031127929688, + "loss": 0.0316, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.879100799560547, + "rewards/margins": 28.60495376586914, + "rewards/rejected": -39.48405456542969, + "step": 2652 + }, + { + "epoch": 1.650388802488336, + "grad_norm": 4.720027936855331e-05, + "learning_rate": 2.4988473951129554e-06, + "logits/chosen": 0.23911744356155396, + "logits/rejected": 4.47409725189209, + "logps/chosen": -397.372802734375, + "logps/rejected": -932.7152099609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.55997085571289, + "rewards/margins": 27.874908447265625, + "rewards/rejected": -37.43488311767578, + "step": 2653 + }, + { + "epoch": 1.6510108864696735, + "grad_norm": 9.448405265808105, + "learning_rate": 2.4976947902259107e-06, + "logits/chosen": 1.5364556312561035, + "logits/rejected": 3.421743392944336, + "logps/chosen": -592.2041015625, + "logps/rejected": -891.9800415039062, + "loss": 0.0422, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.87186861038208, + "rewards/margins": 19.73807716369629, + "rewards/rejected": -27.60994529724121, + "step": 2654 + }, + { + "epoch": 1.6516329704510109, + "grad_norm": 0.0007873836439102888, + "learning_rate": 2.496542185338866e-06, + "logits/chosen": 2.8756916522979736, + "logits/rejected": 3.758998394012451, + "logps/chosen": -642.5400390625, + "logps/rejected": -866.63671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.303086280822754, + "rewards/margins": 21.229276657104492, + "rewards/rejected": -26.532363891601562, + "step": 2655 + }, + { + "epoch": 1.6522550544323482, + "grad_norm": 1.6545012613278232e-06, + "learning_rate": 2.4953895804518215e-06, + "logits/chosen": -3.027890682220459, + "logits/rejected": 2.6755690574645996, + "logps/chosen": -334.79962158203125, + "logps/rejected": -1057.5010986328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5030996799468994, + "rewards/margins": 34.03456115722656, + "rewards/rejected": -37.537662506103516, + "step": 2656 + }, + { + "epoch": 1.652877138413686, + "grad_norm": 0.04323378950357437, + "learning_rate": 2.4942369755647768e-06, + "logits/chosen": 0.9792221784591675, + "logits/rejected": 4.009556770324707, + "logps/chosen": -543.080322265625, + "logps/rejected": -979.8177490234375, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.486639022827148, + "rewards/margins": 25.986400604248047, + "rewards/rejected": -33.47304153442383, + "step": 2657 + }, + { + "epoch": 1.6534992223950233, + "grad_norm": 23.02494239807129, + "learning_rate": 2.493084370677732e-06, + "logits/chosen": -3.052624464035034, + "logits/rejected": 1.8419592380523682, + "logps/chosen": -465.3800354003906, + "logps/rejected": -1080.67822265625, + "loss": 0.5405, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.965513229370117, + "rewards/margins": 28.879623413085938, + "rewards/rejected": -38.84513854980469, + "step": 2658 + }, + { + "epoch": 1.6541213063763607, + "grad_norm": 0.21242384612560272, + "learning_rate": 2.491931765790687e-06, + "logits/chosen": -0.4600183367729187, + "logits/rejected": 3.5004982948303223, + "logps/chosen": -311.2583312988281, + "logps/rejected": -733.16455078125, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.480689525604248, + "rewards/margins": 22.707656860351562, + "rewards/rejected": -28.188343048095703, + "step": 2659 + }, + { + "epoch": 1.6547433903576982, + "grad_norm": 17.66877555847168, + "learning_rate": 2.4907791609036424e-06, + "logits/chosen": -1.8398863077163696, + "logits/rejected": 2.277012348175049, + "logps/chosen": -525.8780517578125, + "logps/rejected": -1049.6021728515625, + "loss": 0.0864, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.22800350189209, + "rewards/margins": 26.594133377075195, + "rewards/rejected": -36.82213592529297, + "step": 2660 + }, + { + "epoch": 1.6553654743390358, + "grad_norm": 0.014850149862468243, + "learning_rate": 2.4896265560165977e-06, + "logits/chosen": 1.2716033458709717, + "logits/rejected": 4.249362945556641, + "logps/chosen": -592.276611328125, + "logps/rejected": -952.0533447265625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.303014755249023, + "rewards/margins": 26.30194091796875, + "rewards/rejected": -35.604957580566406, + "step": 2661 + }, + { + "epoch": 1.6559875583203731, + "grad_norm": 2.063870670099277e-05, + "learning_rate": 2.488473951129553e-06, + "logits/chosen": 1.3646904230117798, + "logits/rejected": 3.710188388824463, + "logps/chosen": -458.9728088378906, + "logps/rejected": -885.9873046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.571175575256348, + "rewards/margins": 28.973085403442383, + "rewards/rejected": -37.54425811767578, + "step": 2662 + }, + { + "epoch": 1.6566096423017107, + "grad_norm": 3.696420431137085, + "learning_rate": 2.4873213462425085e-06, + "logits/chosen": -0.5779656767845154, + "logits/rejected": 2.220045328140259, + "logps/chosen": -499.6197814941406, + "logps/rejected": -768.114013671875, + "loss": 0.0159, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.123802185058594, + "rewards/margins": 11.481657028198242, + "rewards/rejected": -21.60546112060547, + "step": 2663 + }, + { + "epoch": 1.6572317262830483, + "grad_norm": 13.726298332214355, + "learning_rate": 2.4861687413554637e-06, + "logits/chosen": 2.0920047760009766, + "logits/rejected": 2.3564300537109375, + "logps/chosen": -493.8480224609375, + "logps/rejected": -653.603271484375, + "loss": 0.0679, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.378700017929077, + "rewards/margins": 18.358959197998047, + "rewards/rejected": -21.737659454345703, + "step": 2664 + }, + { + "epoch": 1.6578538102643856, + "grad_norm": 11.536592483520508, + "learning_rate": 2.485016136468419e-06, + "logits/chosen": 1.7251183986663818, + "logits/rejected": 3.7293882369995117, + "logps/chosen": -574.5499267578125, + "logps/rejected": -829.0257568359375, + "loss": 0.0732, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.837594985961914, + "rewards/margins": 15.651803970336914, + "rewards/rejected": -22.489398956298828, + "step": 2665 + }, + { + "epoch": 1.6584758942457232, + "grad_norm": 9.621564865112305, + "learning_rate": 2.483863531581374e-06, + "logits/chosen": -1.3720073699951172, + "logits/rejected": 2.4826555252075195, + "logps/chosen": -432.24261474609375, + "logps/rejected": -852.5631103515625, + "loss": 0.0731, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.460178375244141, + "rewards/margins": 20.206809997558594, + "rewards/rejected": -27.666988372802734, + "step": 2666 + }, + { + "epoch": 1.6590979782270607, + "grad_norm": 12.04741096496582, + "learning_rate": 2.4827109266943294e-06, + "logits/chosen": -0.24524717032909393, + "logits/rejected": 4.3620710372924805, + "logps/chosen": -528.10302734375, + "logps/rejected": -1026.803466796875, + "loss": 0.101, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.096864700317383, + "rewards/margins": 21.553415298461914, + "rewards/rejected": -30.650278091430664, + "step": 2667 + }, + { + "epoch": 1.659720062208398, + "grad_norm": 0.01621832698583603, + "learning_rate": 2.4815583218072846e-06, + "logits/chosen": 0.9092066287994385, + "logits/rejected": 4.583636283874512, + "logps/chosen": -399.865966796875, + "logps/rejected": -795.4120483398438, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.871487617492676, + "rewards/margins": 22.655391693115234, + "rewards/rejected": -28.526880264282227, + "step": 2668 + }, + { + "epoch": 1.6603421461897356, + "grad_norm": 0.0032031771261245012, + "learning_rate": 2.48040571692024e-06, + "logits/chosen": 1.1127208471298218, + "logits/rejected": 2.875021457672119, + "logps/chosen": -594.210693359375, + "logps/rejected": -1003.0892944335938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.634833335876465, + "rewards/margins": 27.350683212280273, + "rewards/rejected": -37.98551559448242, + "step": 2669 + }, + { + "epoch": 1.6609642301710732, + "grad_norm": 2.2323959569803264e-07, + "learning_rate": 2.479253112033195e-06, + "logits/chosen": -0.4398040771484375, + "logits/rejected": 2.9848246574401855, + "logps/chosen": -536.8930053710938, + "logps/rejected": -1051.15185546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.939126968383789, + "rewards/margins": 30.918827056884766, + "rewards/rejected": -37.85795593261719, + "step": 2670 + }, + { + "epoch": 1.6615863141524105, + "grad_norm": 1.4484525918960571, + "learning_rate": 2.4781005071461507e-06, + "logits/chosen": -0.9866859316825867, + "logits/rejected": 3.3200957775115967, + "logps/chosen": -473.4273681640625, + "logps/rejected": -996.3983154296875, + "loss": 0.0072, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.616700172424316, + "rewards/margins": 25.130720138549805, + "rewards/rejected": -31.747419357299805, + "step": 2671 + }, + { + "epoch": 1.662208398133748, + "grad_norm": 0.07499787956476212, + "learning_rate": 2.476947902259106e-06, + "logits/chosen": 0.25226056575775146, + "logits/rejected": 4.216619968414307, + "logps/chosen": -366.23028564453125, + "logps/rejected": -829.1866455078125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.613929748535156, + "rewards/margins": 22.871925354003906, + "rewards/rejected": -27.485855102539062, + "step": 2672 + }, + { + "epoch": 1.6628304821150857, + "grad_norm": 8.993841038318351e-05, + "learning_rate": 2.475795297372061e-06, + "logits/chosen": -1.5489999055862427, + "logits/rejected": 2.7130203247070312, + "logps/chosen": -463.1144714355469, + "logps/rejected": -911.884765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4509150981903076, + "rewards/margins": 21.476350784301758, + "rewards/rejected": -24.927265167236328, + "step": 2673 + }, + { + "epoch": 1.663452566096423, + "grad_norm": 18.458757400512695, + "learning_rate": 2.4746426924850164e-06, + "logits/chosen": 1.1556620597839355, + "logits/rejected": 2.756657838821411, + "logps/chosen": -557.0864868164062, + "logps/rejected": -773.4407348632812, + "loss": 0.1201, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.7018070220947266, + "rewards/margins": 15.727760314941406, + "rewards/rejected": -19.4295654296875, + "step": 2674 + }, + { + "epoch": 1.6640746500777603, + "grad_norm": 14.458096504211426, + "learning_rate": 2.4734900875979716e-06, + "logits/chosen": 2.844008684158325, + "logits/rejected": 2.5082483291625977, + "logps/chosen": -716.7899169921875, + "logps/rejected": -886.850341796875, + "loss": 0.0698, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.73927116394043, + "rewards/margins": 21.637420654296875, + "rewards/rejected": -30.376689910888672, + "step": 2675 + }, + { + "epoch": 1.6646967340590981, + "grad_norm": 9.570556640625, + "learning_rate": 2.472337482710927e-06, + "logits/chosen": 2.666499614715576, + "logits/rejected": 5.031964302062988, + "logps/chosen": -681.7529907226562, + "logps/rejected": -1033.83203125, + "loss": 0.0483, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.99263858795166, + "rewards/margins": 21.586830139160156, + "rewards/rejected": -31.579469680786133, + "step": 2676 + }, + { + "epoch": 1.6653188180404355, + "grad_norm": 0.020050466060638428, + "learning_rate": 2.471184877823882e-06, + "logits/chosen": -0.3638615012168884, + "logits/rejected": 3.2650585174560547, + "logps/chosen": -377.27508544921875, + "logps/rejected": -859.4393920898438, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.265905380249023, + "rewards/margins": 25.773866653442383, + "rewards/rejected": -31.039772033691406, + "step": 2677 + }, + { + "epoch": 1.6659409020217728, + "grad_norm": 1.8163225945500017e-07, + "learning_rate": 2.4700322729368377e-06, + "logits/chosen": -0.49591100215911865, + "logits/rejected": 3.8754587173461914, + "logps/chosen": -543.1971435546875, + "logps/rejected": -1206.58251953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.626020431518555, + "rewards/margins": 37.646209716796875, + "rewards/rejected": -50.27223205566406, + "step": 2678 + }, + { + "epoch": 1.6665629860031104, + "grad_norm": 0.0008349265553988516, + "learning_rate": 2.468879668049793e-06, + "logits/chosen": 0.5656048059463501, + "logits/rejected": 0.990628719329834, + "logps/chosen": -471.417236328125, + "logps/rejected": -761.2012329101562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.046195983886719, + "rewards/margins": 21.776756286621094, + "rewards/rejected": -29.822952270507812, + "step": 2679 + }, + { + "epoch": 1.667185069984448, + "grad_norm": 0.4058059751987457, + "learning_rate": 2.467727063162748e-06, + "logits/chosen": -0.5524758696556091, + "logits/rejected": 2.338270664215088, + "logps/chosen": -484.1187744140625, + "logps/rejected": -918.57861328125, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.968391418457031, + "rewards/margins": 26.87673568725586, + "rewards/rejected": -31.84512710571289, + "step": 2680 + }, + { + "epoch": 1.6678071539657853, + "grad_norm": 0.0031695840880274773, + "learning_rate": 2.4665744582757034e-06, + "logits/chosen": 0.9984191656112671, + "logits/rejected": 3.44390606880188, + "logps/chosen": -575.0689086914062, + "logps/rejected": -1073.3946533203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.461224555969238, + "rewards/margins": 33.61181640625, + "rewards/rejected": -38.07304000854492, + "step": 2681 + }, + { + "epoch": 1.6684292379471228, + "grad_norm": 1.2835079132855753e-07, + "learning_rate": 2.4654218533886586e-06, + "logits/chosen": 0.9636375904083252, + "logits/rejected": 3.6253345012664795, + "logps/chosen": -727.8519287109375, + "logps/rejected": -1174.5240478515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.085047721862793, + "rewards/margins": 32.849456787109375, + "rewards/rejected": -41.93450164794922, + "step": 2682 + }, + { + "epoch": 1.6690513219284604, + "grad_norm": 0.014250471256673336, + "learning_rate": 2.464269248501614e-06, + "logits/chosen": -0.09940612316131592, + "logits/rejected": 3.5841808319091797, + "logps/chosen": -389.5401611328125, + "logps/rejected": -736.5833740234375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.445569038391113, + "rewards/margins": 18.353742599487305, + "rewards/rejected": -22.799312591552734, + "step": 2683 + }, + { + "epoch": 1.6696734059097977, + "grad_norm": 0.008063657209277153, + "learning_rate": 2.463116643614569e-06, + "logits/chosen": 0.6216780543327332, + "logits/rejected": 0.8254268765449524, + "logps/chosen": -548.0035400390625, + "logps/rejected": -790.623046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.906831741333008, + "rewards/margins": 23.314350128173828, + "rewards/rejected": -32.22118377685547, + "step": 2684 + }, + { + "epoch": 1.6702954898911353, + "grad_norm": 0.0015282687963917851, + "learning_rate": 2.4619640387275247e-06, + "logits/chosen": 0.9856696128845215, + "logits/rejected": 4.668768882751465, + "logps/chosen": -539.461669921875, + "logps/rejected": -1043.248046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.886101722717285, + "rewards/margins": 27.577598571777344, + "rewards/rejected": -36.46370315551758, + "step": 2685 + }, + { + "epoch": 1.6709175738724729, + "grad_norm": 4.341231822967529, + "learning_rate": 2.4608114338404795e-06, + "logits/chosen": 1.3563194274902344, + "logits/rejected": 3.6629714965820312, + "logps/chosen": -544.7206420898438, + "logps/rejected": -966.00830078125, + "loss": 0.0153, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.258726119995117, + "rewards/margins": 26.233715057373047, + "rewards/rejected": -38.4924430847168, + "step": 2686 + }, + { + "epoch": 1.6715396578538102, + "grad_norm": 0.00014986857422627509, + "learning_rate": 2.4596588289534348e-06, + "logits/chosen": -1.0385351181030273, + "logits/rejected": 3.6127541065216064, + "logps/chosen": -399.8589172363281, + "logps/rejected": -1015.7762451171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.090690612792969, + "rewards/margins": 34.113670349121094, + "rewards/rejected": -40.20436096191406, + "step": 2687 + }, + { + "epoch": 1.6721617418351478, + "grad_norm": 0.012877055443823338, + "learning_rate": 2.45850622406639e-06, + "logits/chosen": 0.6718133091926575, + "logits/rejected": 2.5280394554138184, + "logps/chosen": -479.4302978515625, + "logps/rejected": -812.8926391601562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.698501586914062, + "rewards/margins": 20.51024055480957, + "rewards/rejected": -29.208742141723633, + "step": 2688 + }, + { + "epoch": 1.6727838258164853, + "grad_norm": 0.006198279093950987, + "learning_rate": 2.4573536191793452e-06, + "logits/chosen": 0.344510555267334, + "logits/rejected": 3.332859992980957, + "logps/chosen": -612.3021240234375, + "logps/rejected": -1001.0293579101562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.828897476196289, + "rewards/margins": 25.03092384338379, + "rewards/rejected": -34.85982131958008, + "step": 2689 + }, + { + "epoch": 1.6734059097978227, + "grad_norm": 10.438737869262695, + "learning_rate": 2.456201014292301e-06, + "logits/chosen": -2.088998794555664, + "logits/rejected": 4.074522972106934, + "logps/chosen": -356.7595520019531, + "logps/rejected": -990.0142211914062, + "loss": 0.06, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.425658226013184, + "rewards/margins": 36.38076400756836, + "rewards/rejected": -40.80641555786133, + "step": 2690 + }, + { + "epoch": 1.6740279937791602, + "grad_norm": 0.02665124461054802, + "learning_rate": 2.455048409405256e-06, + "logits/chosen": 0.4028273820877075, + "logits/rejected": 1.4046893119812012, + "logps/chosen": -633.2741088867188, + "logps/rejected": -923.4190673828125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.680094242095947, + "rewards/margins": 26.50547981262207, + "rewards/rejected": -34.18557357788086, + "step": 2691 + }, + { + "epoch": 1.6746500777604978, + "grad_norm": 0.074191614985466, + "learning_rate": 2.4538958045182113e-06, + "logits/chosen": -1.7957860231399536, + "logits/rejected": 2.5487489700317383, + "logps/chosen": -488.90869140625, + "logps/rejected": -1007.2379760742188, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.99670696258545, + "rewards/margins": 27.577075958251953, + "rewards/rejected": -36.57378387451172, + "step": 2692 + }, + { + "epoch": 1.6752721617418351, + "grad_norm": 45.34721374511719, + "learning_rate": 2.4527431996311665e-06, + "logits/chosen": 2.733384132385254, + "logits/rejected": 4.8411126136779785, + "logps/chosen": -693.03857421875, + "logps/rejected": -1121.6904296875, + "loss": 0.3278, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.350863456726074, + "rewards/margins": 31.43549346923828, + "rewards/rejected": -38.78635787963867, + "step": 2693 + }, + { + "epoch": 1.6758942457231725, + "grad_norm": 0.1667236089706421, + "learning_rate": 2.4515905947441218e-06, + "logits/chosen": -1.2241672277450562, + "logits/rejected": 3.4006261825561523, + "logps/chosen": -472.51177978515625, + "logps/rejected": -1073.1248779296875, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.828851222991943, + "rewards/margins": 29.12144660949707, + "rewards/rejected": -36.950294494628906, + "step": 2694 + }, + { + "epoch": 1.6765163297045103, + "grad_norm": 4.75455162813887e-05, + "learning_rate": 2.450437989857077e-06, + "logits/chosen": -0.4841141402721405, + "logits/rejected": 1.1751519441604614, + "logps/chosen": -469.4749755859375, + "logps/rejected": -779.6455688476562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.378645896911621, + "rewards/margins": 25.129650115966797, + "rewards/rejected": -31.5082950592041, + "step": 2695 + }, + { + "epoch": 1.6771384136858476, + "grad_norm": 0.00032651741639710963, + "learning_rate": 2.449285384970032e-06, + "logits/chosen": -0.706736147403717, + "logits/rejected": 3.2579140663146973, + "logps/chosen": -420.28155517578125, + "logps/rejected": -1104.028564453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8741631507873535, + "rewards/margins": 39.49601364135742, + "rewards/rejected": -42.37017822265625, + "step": 2696 + }, + { + "epoch": 1.677760497667185, + "grad_norm": 0.0009474342223256826, + "learning_rate": 2.448132780082988e-06, + "logits/chosen": 0.9678953289985657, + "logits/rejected": 3.6299071311950684, + "logps/chosen": -473.0794372558594, + "logps/rejected": -827.8023681640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.318397521972656, + "rewards/margins": 26.759435653686523, + "rewards/rejected": -35.07783508300781, + "step": 2697 + }, + { + "epoch": 1.6783825816485225, + "grad_norm": 1.3759820376435528e-06, + "learning_rate": 2.446980175195943e-06, + "logits/chosen": -0.22475984692573547, + "logits/rejected": 4.250744819641113, + "logps/chosen": -539.4481201171875, + "logps/rejected": -1178.7088623046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.235435485839844, + "rewards/margins": 36.99909210205078, + "rewards/rejected": -46.234527587890625, + "step": 2698 + }, + { + "epoch": 1.67900466562986, + "grad_norm": 0.0020632497034966946, + "learning_rate": 2.4458275703088983e-06, + "logits/chosen": -0.722686767578125, + "logits/rejected": 1.4689903259277344, + "logps/chosen": -516.1554565429688, + "logps/rejected": -869.0540771484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.267215728759766, + "rewards/margins": 19.979875564575195, + "rewards/rejected": -28.247089385986328, + "step": 2699 + }, + { + "epoch": 1.6796267496111974, + "grad_norm": 0.0010660639964044094, + "learning_rate": 2.4446749654218535e-06, + "logits/chosen": -1.3368785381317139, + "logits/rejected": 2.3945975303649902, + "logps/chosen": -339.3318176269531, + "logps/rejected": -753.3385009765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.133460998535156, + "rewards/margins": 23.696168899536133, + "rewards/rejected": -30.829631805419922, + "step": 2700 + }, + { + "epoch": 1.680248833592535, + "grad_norm": 2.6615917682647705, + "learning_rate": 2.4435223605348088e-06, + "logits/chosen": -0.7420621514320374, + "logits/rejected": 3.047687530517578, + "logps/chosen": -508.665283203125, + "logps/rejected": -948.4867553710938, + "loss": 0.0246, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.16337776184082, + "rewards/margins": 25.176620483398438, + "rewards/rejected": -33.339996337890625, + "step": 2701 + }, + { + "epoch": 1.6808709175738725, + "grad_norm": 0.4478207230567932, + "learning_rate": 2.442369755647764e-06, + "logits/chosen": -2.0094776153564453, + "logits/rejected": 4.143111228942871, + "logps/chosen": -497.0942077636719, + "logps/rejected": -1188.542724609375, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.448010444641113, + "rewards/margins": 30.105907440185547, + "rewards/rejected": -38.553916931152344, + "step": 2702 + }, + { + "epoch": 1.6814930015552099, + "grad_norm": 21.752195358276367, + "learning_rate": 2.441217150760719e-06, + "logits/chosen": 0.8991308212280273, + "logits/rejected": 4.451169013977051, + "logps/chosen": -482.6705627441406, + "logps/rejected": -925.1748046875, + "loss": 0.253, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.439286231994629, + "rewards/margins": 27.2860050201416, + "rewards/rejected": -32.72529220581055, + "step": 2703 + }, + { + "epoch": 1.6821150855365474, + "grad_norm": 0.10334199666976929, + "learning_rate": 2.4400645458736744e-06, + "logits/chosen": -1.01015305519104, + "logits/rejected": 2.3958823680877686, + "logps/chosen": -373.493896484375, + "logps/rejected": -759.80517578125, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.089910507202148, + "rewards/margins": 15.053412437438965, + "rewards/rejected": -21.14332389831543, + "step": 2704 + }, + { + "epoch": 1.682737169517885, + "grad_norm": 3.677429676055908, + "learning_rate": 2.43891194098663e-06, + "logits/chosen": 2.817261219024658, + "logits/rejected": 5.3078293800354, + "logps/chosen": -631.369140625, + "logps/rejected": -1031.2825927734375, + "loss": 0.0139, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.647318840026855, + "rewards/margins": 25.45352554321289, + "rewards/rejected": -35.10084533691406, + "step": 2705 + }, + { + "epoch": 1.6833592534992223, + "grad_norm": 0.00020113641221541911, + "learning_rate": 2.4377593360995853e-06, + "logits/chosen": -1.9404137134552002, + "logits/rejected": 1.4332096576690674, + "logps/chosen": -344.89678955078125, + "logps/rejected": -818.8267822265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.624368190765381, + "rewards/margins": 24.991334915161133, + "rewards/rejected": -32.61570358276367, + "step": 2706 + }, + { + "epoch": 1.68398133748056, + "grad_norm": 33.51328659057617, + "learning_rate": 2.4366067312125405e-06, + "logits/chosen": 0.8392902612686157, + "logits/rejected": 3.4847161769866943, + "logps/chosen": -521.1593017578125, + "logps/rejected": -924.17236328125, + "loss": 0.5981, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.168278694152832, + "rewards/margins": 27.528364181518555, + "rewards/rejected": -34.6966438293457, + "step": 2707 + }, + { + "epoch": 1.6846034214618975, + "grad_norm": 9.50869591398451e-12, + "learning_rate": 2.4354541263254958e-06, + "logits/chosen": -1.568687915802002, + "logits/rejected": 4.22953462600708, + "logps/chosen": -385.8027038574219, + "logps/rejected": -1056.0511474609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.476365566253662, + "rewards/margins": 36.176177978515625, + "rewards/rejected": -40.65254211425781, + "step": 2708 + }, + { + "epoch": 1.6852255054432348, + "grad_norm": 4.177229881286621, + "learning_rate": 2.434301521438451e-06, + "logits/chosen": -2.9155402183532715, + "logits/rejected": -0.22680601477622986, + "logps/chosen": -460.20953369140625, + "logps/rejected": -860.6627807617188, + "loss": 0.0443, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.720498085021973, + "rewards/margins": 23.418338775634766, + "rewards/rejected": -32.13883972167969, + "step": 2709 + }, + { + "epoch": 1.6858475894245724, + "grad_norm": 0.007003345992416143, + "learning_rate": 2.433148916551406e-06, + "logits/chosen": -2.622126579284668, + "logits/rejected": 2.777906656265259, + "logps/chosen": -300.8578186035156, + "logps/rejected": -924.3973388671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.914675712585449, + "rewards/margins": 33.299461364746094, + "rewards/rejected": -41.214141845703125, + "step": 2710 + }, + { + "epoch": 1.68646967340591, + "grad_norm": 8.189059257507324, + "learning_rate": 2.4319963116643614e-06, + "logits/chosen": -1.0699472427368164, + "logits/rejected": 2.6137197017669678, + "logps/chosen": -341.54754638671875, + "logps/rejected": -804.4307861328125, + "loss": 0.03, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.407834529876709, + "rewards/margins": 24.09420394897461, + "rewards/rejected": -30.502038955688477, + "step": 2711 + }, + { + "epoch": 1.6870917573872473, + "grad_norm": 1.527038335800171, + "learning_rate": 2.430843706777317e-06, + "logits/chosen": 0.6405558586120605, + "logits/rejected": 3.023784875869751, + "logps/chosen": -500.26556396484375, + "logps/rejected": -883.9600219726562, + "loss": 0.0077, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.72463607788086, + "rewards/margins": 21.619857788085938, + "rewards/rejected": -30.344491958618164, + "step": 2712 + }, + { + "epoch": 1.6877138413685846, + "grad_norm": 4.083947715116665e-05, + "learning_rate": 2.4296911018902723e-06, + "logits/chosen": -1.9219568967819214, + "logits/rejected": 4.124215126037598, + "logps/chosen": -245.7783660888672, + "logps/rejected": -898.5387573242188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.52880859375, + "rewards/margins": 25.843339920043945, + "rewards/rejected": -28.372146606445312, + "step": 2713 + }, + { + "epoch": 1.6883359253499224, + "grad_norm": 0.034476667642593384, + "learning_rate": 2.4285384970032275e-06, + "logits/chosen": -0.8637582659721375, + "logits/rejected": 3.500040054321289, + "logps/chosen": -500.3390197753906, + "logps/rejected": -1043.5906982421875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.800796508789062, + "rewards/margins": 25.491060256958008, + "rewards/rejected": -34.29185485839844, + "step": 2714 + }, + { + "epoch": 1.6889580093312597, + "grad_norm": 0.021597065031528473, + "learning_rate": 2.4273858921161828e-06, + "logits/chosen": 0.38649308681488037, + "logits/rejected": 3.550788402557373, + "logps/chosen": -556.5277099609375, + "logps/rejected": -993.5703125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.913545608520508, + "rewards/margins": 27.199634552001953, + "rewards/rejected": -33.113182067871094, + "step": 2715 + }, + { + "epoch": 1.689580093312597, + "grad_norm": 1.3178468179830816e-05, + "learning_rate": 2.426233287229138e-06, + "logits/chosen": 0.597633421421051, + "logits/rejected": 3.9349923133850098, + "logps/chosen": -514.322265625, + "logps/rejected": -1022.6780395507812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.5213823318481445, + "rewards/margins": 34.16990280151367, + "rewards/rejected": -41.691287994384766, + "step": 2716 + }, + { + "epoch": 1.6902021772939346, + "grad_norm": 0.0011442669201642275, + "learning_rate": 2.425080682342093e-06, + "logits/chosen": -2.8341634273529053, + "logits/rejected": 0.4455097019672394, + "logps/chosen": -428.048828125, + "logps/rejected": -1065.8172607421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.660099983215332, + "rewards/margins": 35.430763244628906, + "rewards/rejected": -46.09086227416992, + "step": 2717 + }, + { + "epoch": 1.6908242612752722, + "grad_norm": 33.787418365478516, + "learning_rate": 2.4239280774550484e-06, + "logits/chosen": -2.904046058654785, + "logits/rejected": 4.311407566070557, + "logps/chosen": -310.52142333984375, + "logps/rejected": -1213.693115234375, + "loss": 0.4385, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.004347801208496, + "rewards/margins": 42.72428512573242, + "rewards/rejected": -46.728633880615234, + "step": 2718 + }, + { + "epoch": 1.6914463452566095, + "grad_norm": 0.011313035152852535, + "learning_rate": 2.422775472568004e-06, + "logits/chosen": 1.6661875247955322, + "logits/rejected": 4.304853439331055, + "logps/chosen": -686.5845947265625, + "logps/rejected": -1155.77490234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.166810989379883, + "rewards/margins": 20.23712921142578, + "rewards/rejected": -28.40393829345703, + "step": 2719 + }, + { + "epoch": 1.692068429237947, + "grad_norm": 0.2662597894668579, + "learning_rate": 2.4216228676809593e-06, + "logits/chosen": -1.647039532661438, + "logits/rejected": 3.3279147148132324, + "logps/chosen": -423.85504150390625, + "logps/rejected": -987.908447265625, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.690751075744629, + "rewards/margins": 33.81493377685547, + "rewards/rejected": -40.50568389892578, + "step": 2720 + }, + { + "epoch": 1.6926905132192847, + "grad_norm": 32.57624816894531, + "learning_rate": 2.4204702627939145e-06, + "logits/chosen": 2.2442877292633057, + "logits/rejected": 4.182276248931885, + "logps/chosen": -627.66650390625, + "logps/rejected": -963.42333984375, + "loss": 0.571, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.049556732177734, + "rewards/margins": 19.074588775634766, + "rewards/rejected": -29.1241455078125, + "step": 2721 + }, + { + "epoch": 1.693312597200622, + "grad_norm": 7.951205770950764e-05, + "learning_rate": 2.4193176579068697e-06, + "logits/chosen": -0.030267775058746338, + "logits/rejected": 3.133117914199829, + "logps/chosen": -561.3271484375, + "logps/rejected": -1074.7281494140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.656089782714844, + "rewards/margins": 34.97183609008789, + "rewards/rejected": -44.627925872802734, + "step": 2722 + }, + { + "epoch": 1.6939346811819596, + "grad_norm": 2.12374210357666, + "learning_rate": 2.418165053019825e-06, + "logits/chosen": -1.011715054512024, + "logits/rejected": 4.0569586753845215, + "logps/chosen": -334.70367431640625, + "logps/rejected": -818.4017333984375, + "loss": 0.0095, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.086916923522949, + "rewards/margins": 20.90202522277832, + "rewards/rejected": -25.988941192626953, + "step": 2723 + }, + { + "epoch": 1.6945567651632971, + "grad_norm": 0.5422797799110413, + "learning_rate": 2.41701244813278e-06, + "logits/chosen": 2.399137020111084, + "logits/rejected": 3.993281602859497, + "logps/chosen": -638.6915283203125, + "logps/rejected": -1012.2086181640625, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.717671394348145, + "rewards/margins": 25.65497398376465, + "rewards/rejected": -34.372642517089844, + "step": 2724 + }, + { + "epoch": 1.6951788491446345, + "grad_norm": 0.0035224133171141148, + "learning_rate": 2.4158598432457354e-06, + "logits/chosen": 2.443415641784668, + "logits/rejected": 3.793722629547119, + "logps/chosen": -642.5726318359375, + "logps/rejected": -1068.98974609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.700204849243164, + "rewards/margins": 30.676729202270508, + "rewards/rejected": -38.37693405151367, + "step": 2725 + }, + { + "epoch": 1.695800933125972, + "grad_norm": 0.0012602354399859905, + "learning_rate": 2.414707238358691e-06, + "logits/chosen": -2.665707588195801, + "logits/rejected": 3.0817317962646484, + "logps/chosen": -313.6740417480469, + "logps/rejected": -992.2149658203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.802724838256836, + "rewards/margins": 29.130760192871094, + "rewards/rejected": -36.9334831237793, + "step": 2726 + }, + { + "epoch": 1.6964230171073096, + "grad_norm": 0.08769652247428894, + "learning_rate": 2.4135546334716463e-06, + "logits/chosen": 2.0320892333984375, + "logits/rejected": 4.716381549835205, + "logps/chosen": -652.5599975585938, + "logps/rejected": -1002.2882080078125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.021942138671875, + "rewards/margins": 24.541967391967773, + "rewards/rejected": -35.56391143798828, + "step": 2727 + }, + { + "epoch": 1.697045101088647, + "grad_norm": 0.00015202997019514441, + "learning_rate": 2.4124020285846015e-06, + "logits/chosen": -0.2173956036567688, + "logits/rejected": 3.3320794105529785, + "logps/chosen": -304.7808532714844, + "logps/rejected": -714.9826049804688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9645328521728516, + "rewards/margins": 22.323930740356445, + "rewards/rejected": -26.288463592529297, + "step": 2728 + }, + { + "epoch": 1.6976671850699845, + "grad_norm": 5.48606014251709, + "learning_rate": 2.4112494236975567e-06, + "logits/chosen": 2.7743520736694336, + "logits/rejected": 2.2740674018859863, + "logps/chosen": -813.4549560546875, + "logps/rejected": -881.2877197265625, + "loss": 0.017, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.876110076904297, + "rewards/margins": 16.42871856689453, + "rewards/rejected": -31.304828643798828, + "step": 2729 + }, + { + "epoch": 1.698289269051322, + "grad_norm": 0.05634181573987007, + "learning_rate": 2.410096818810512e-06, + "logits/chosen": 0.47589847445487976, + "logits/rejected": 4.360468864440918, + "logps/chosen": -400.2265625, + "logps/rejected": -927.064453125, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.384169578552246, + "rewards/margins": 31.364051818847656, + "rewards/rejected": -40.74821853637695, + "step": 2730 + }, + { + "epoch": 1.6989113530326594, + "grad_norm": 38.13288497924805, + "learning_rate": 2.408944213923467e-06, + "logits/chosen": -0.2454141080379486, + "logits/rejected": 1.3477165699005127, + "logps/chosen": -612.4234619140625, + "logps/rejected": -934.793212890625, + "loss": 0.6544, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.737934112548828, + "rewards/margins": 18.501102447509766, + "rewards/rejected": -32.239036560058594, + "step": 2731 + }, + { + "epoch": 1.6995334370139967, + "grad_norm": 0.011911272071301937, + "learning_rate": 2.4077916090364224e-06, + "logits/chosen": 2.55433988571167, + "logits/rejected": 4.622102737426758, + "logps/chosen": -639.3505249023438, + "logps/rejected": -1007.958984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.38956069946289, + "rewards/margins": 23.871074676513672, + "rewards/rejected": -33.26063537597656, + "step": 2732 + }, + { + "epoch": 1.7001555209953345, + "grad_norm": 4.579018059303053e-05, + "learning_rate": 2.4066390041493776e-06, + "logits/chosen": 1.351335883140564, + "logits/rejected": 4.477890968322754, + "logps/chosen": -523.9299926757812, + "logps/rejected": -966.3905029296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.234797477722168, + "rewards/margins": 29.752262115478516, + "rewards/rejected": -38.987060546875, + "step": 2733 + }, + { + "epoch": 1.7007776049766719, + "grad_norm": 13.588505744934082, + "learning_rate": 2.4054863992623333e-06, + "logits/chosen": 1.3708151578903198, + "logits/rejected": 2.2401483058929443, + "logps/chosen": -545.5647583007812, + "logps/rejected": -833.3558349609375, + "loss": 0.0453, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.524633407592773, + "rewards/margins": 26.133468627929688, + "rewards/rejected": -36.658103942871094, + "step": 2734 + }, + { + "epoch": 1.7013996889580092, + "grad_norm": 42.44413757324219, + "learning_rate": 2.4043337943752885e-06, + "logits/chosen": -1.0746033191680908, + "logits/rejected": 2.049215793609619, + "logps/chosen": -386.47906494140625, + "logps/rejected": -853.283203125, + "loss": 0.5919, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.945468902587891, + "rewards/margins": 23.645946502685547, + "rewards/rejected": -31.591413497924805, + "step": 2735 + }, + { + "epoch": 1.702021772939347, + "grad_norm": 0.10011202841997147, + "learning_rate": 2.4031811894882437e-06, + "logits/chosen": -0.11232280731201172, + "logits/rejected": 2.443506956100464, + "logps/chosen": -652.34033203125, + "logps/rejected": -1059.8670654296875, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.735544204711914, + "rewards/margins": 28.015451431274414, + "rewards/rejected": -41.75099563598633, + "step": 2736 + }, + { + "epoch": 1.7026438569206843, + "grad_norm": 6.48001766204834, + "learning_rate": 2.402028584601199e-06, + "logits/chosen": 0.16253745555877686, + "logits/rejected": 3.0159144401550293, + "logps/chosen": -469.88543701171875, + "logps/rejected": -903.1436767578125, + "loss": 0.0296, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.632349014282227, + "rewards/margins": 28.997150421142578, + "rewards/rejected": -36.62950134277344, + "step": 2737 + }, + { + "epoch": 1.7032659409020217, + "grad_norm": 0.21324293315410614, + "learning_rate": 2.400875979714154e-06, + "logits/chosen": 0.3878498673439026, + "logits/rejected": 3.4972376823425293, + "logps/chosen": -622.3283081054688, + "logps/rejected": -1015.4915161132812, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.050553321838379, + "rewards/margins": 21.196529388427734, + "rewards/rejected": -34.2470817565918, + "step": 2738 + }, + { + "epoch": 1.7038880248833592, + "grad_norm": 35.615596771240234, + "learning_rate": 2.3997233748271094e-06, + "logits/chosen": -1.6907641887664795, + "logits/rejected": 1.8408029079437256, + "logps/chosen": -509.0423583984375, + "logps/rejected": -1010.310546875, + "loss": 0.356, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.704755783081055, + "rewards/margins": 23.956729888916016, + "rewards/rejected": -32.66148376464844, + "step": 2739 + }, + { + "epoch": 1.7045101088646968, + "grad_norm": 0.3253224790096283, + "learning_rate": 2.3985707699400646e-06, + "logits/chosen": -0.516049325466156, + "logits/rejected": 2.495758533477783, + "logps/chosen": -390.4578857421875, + "logps/rejected": -747.568115234375, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.596771240234375, + "rewards/margins": 21.8277587890625, + "rewards/rejected": -31.424530029296875, + "step": 2740 + }, + { + "epoch": 1.7051321928460341, + "grad_norm": 0.36244937777519226, + "learning_rate": 2.3974181650530203e-06, + "logits/chosen": -1.3871111869812012, + "logits/rejected": 4.006772994995117, + "logps/chosen": -278.1317443847656, + "logps/rejected": -832.1084594726562, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.295748710632324, + "rewards/margins": 28.429582595825195, + "rewards/rejected": -34.72533416748047, + "step": 2741 + }, + { + "epoch": 1.7057542768273717, + "grad_norm": 0.0001849048276199028, + "learning_rate": 2.3962655601659755e-06, + "logits/chosen": -1.333700180053711, + "logits/rejected": 2.9659676551818848, + "logps/chosen": -408.71209716796875, + "logps/rejected": -999.737060546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.499065399169922, + "rewards/margins": 31.57623863220215, + "rewards/rejected": -37.0753059387207, + "step": 2742 + }, + { + "epoch": 1.7063763608087092, + "grad_norm": 37.42914962768555, + "learning_rate": 2.3951129552789307e-06, + "logits/chosen": 0.06055879592895508, + "logits/rejected": 0.357525110244751, + "logps/chosen": -564.270751953125, + "logps/rejected": -872.297119140625, + "loss": 1.0488, + "rewards/accuracies": 0.875, + "rewards/chosen": -14.182369232177734, + "rewards/margins": 22.586198806762695, + "rewards/rejected": -36.76856994628906, + "step": 2743 + }, + { + "epoch": 1.7069984447900466, + "grad_norm": 0.03632102906703949, + "learning_rate": 2.393960350391886e-06, + "logits/chosen": -2.4644315242767334, + "logits/rejected": 4.031020164489746, + "logps/chosen": -366.7651062011719, + "logps/rejected": -1044.26123046875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.470329284667969, + "rewards/margins": 32.28361511230469, + "rewards/rejected": -39.753944396972656, + "step": 2744 + }, + { + "epoch": 1.7076205287713841, + "grad_norm": 2.2521191567648202e-05, + "learning_rate": 2.392807745504841e-06, + "logits/chosen": -0.43236926198005676, + "logits/rejected": 2.319559335708618, + "logps/chosen": -581.4365234375, + "logps/rejected": -1093.714111328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.591827392578125, + "rewards/margins": 35.70243835449219, + "rewards/rejected": -53.29426193237305, + "step": 2745 + }, + { + "epoch": 1.7082426127527217, + "grad_norm": 23.961801528930664, + "learning_rate": 2.3916551406177964e-06, + "logits/chosen": -2.98201847076416, + "logits/rejected": 3.573429584503174, + "logps/chosen": -358.71954345703125, + "logps/rejected": -1015.135009765625, + "loss": 0.1875, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.754096508026123, + "rewards/margins": 30.13132667541504, + "rewards/rejected": -36.88542175292969, + "step": 2746 + }, + { + "epoch": 1.708864696734059, + "grad_norm": 0.001443828223273158, + "learning_rate": 2.3905025357307516e-06, + "logits/chosen": 1.7244963645935059, + "logits/rejected": 4.353095531463623, + "logps/chosen": -530.1221313476562, + "logps/rejected": -912.9786376953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.947088241577148, + "rewards/margins": 25.049970626831055, + "rewards/rejected": -31.997058868408203, + "step": 2747 + }, + { + "epoch": 1.7094867807153966, + "grad_norm": 0.13827162981033325, + "learning_rate": 2.3893499308437073e-06, + "logits/chosen": 1.7216299772262573, + "logits/rejected": 4.234375, + "logps/chosen": -632.4540405273438, + "logps/rejected": -1099.68310546875, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.171640396118164, + "rewards/margins": 25.345455169677734, + "rewards/rejected": -36.517093658447266, + "step": 2748 + }, + { + "epoch": 1.7101088646967342, + "grad_norm": 6.640329360961914, + "learning_rate": 2.3881973259566625e-06, + "logits/chosen": -0.3900489807128906, + "logits/rejected": 4.642871379852295, + "logps/chosen": -533.6476440429688, + "logps/rejected": -1143.8360595703125, + "loss": 0.0262, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.386316299438477, + "rewards/margins": 27.875993728637695, + "rewards/rejected": -36.262306213378906, + "step": 2749 + }, + { + "epoch": 1.7107309486780715, + "grad_norm": 0.018535036593675613, + "learning_rate": 2.3870447210696177e-06, + "logits/chosen": -0.20886629819869995, + "logits/rejected": 3.061354160308838, + "logps/chosen": -448.07391357421875, + "logps/rejected": -983.8624877929688, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.103137016296387, + "rewards/margins": 32.060157775878906, + "rewards/rejected": -41.16329574584961, + "step": 2750 + }, + { + "epoch": 1.7113530326594089, + "grad_norm": 0.004795982502400875, + "learning_rate": 2.385892116182573e-06, + "logits/chosen": 2.007289171218872, + "logits/rejected": 3.2250893115997314, + "logps/chosen": -746.975341796875, + "logps/rejected": -1015.2783203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.418895721435547, + "rewards/margins": 21.668270111083984, + "rewards/rejected": -34.08716583251953, + "step": 2751 + }, + { + "epoch": 1.7119751166407466, + "grad_norm": 56.460609436035156, + "learning_rate": 2.384739511295528e-06, + "logits/chosen": 1.8035542964935303, + "logits/rejected": 3.231412172317505, + "logps/chosen": -719.7908935546875, + "logps/rejected": -941.8114624023438, + "loss": 0.6946, + "rewards/accuracies": 0.875, + "rewards/chosen": -14.651267051696777, + "rewards/margins": 18.337669372558594, + "rewards/rejected": -32.98893737792969, + "step": 2752 + }, + { + "epoch": 1.712597200622084, + "grad_norm": 1.7252594261663035e-05, + "learning_rate": 2.3835869064084834e-06, + "logits/chosen": 2.040581703186035, + "logits/rejected": 3.0739097595214844, + "logps/chosen": -595.2687377929688, + "logps/rejected": -847.3399658203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.129035949707031, + "rewards/margins": 23.222198486328125, + "rewards/rejected": -32.35123062133789, + "step": 2753 + }, + { + "epoch": 1.7132192846034213, + "grad_norm": 0.00926581397652626, + "learning_rate": 2.3824343015214386e-06, + "logits/chosen": 0.37493985891342163, + "logits/rejected": 3.2370142936706543, + "logps/chosen": -436.7816162109375, + "logps/rejected": -841.2584838867188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.670589447021484, + "rewards/margins": 23.545188903808594, + "rewards/rejected": -34.21577835083008, + "step": 2754 + }, + { + "epoch": 1.713841368584759, + "grad_norm": 0.05381428450345993, + "learning_rate": 2.381281696634394e-06, + "logits/chosen": 1.032725214958191, + "logits/rejected": 3.626753091812134, + "logps/chosen": -596.6863403320312, + "logps/rejected": -1086.4066162109375, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.142266273498535, + "rewards/margins": 33.237281799316406, + "rewards/rejected": -43.379547119140625, + "step": 2755 + }, + { + "epoch": 1.7144634525660964, + "grad_norm": 43.99714279174805, + "learning_rate": 2.380129091747349e-06, + "logits/chosen": 2.4638986587524414, + "logits/rejected": 1.1975902318954468, + "logps/chosen": -824.5519409179688, + "logps/rejected": -935.77685546875, + "loss": 0.5373, + "rewards/accuracies": 0.875, + "rewards/chosen": -18.057632446289062, + "rewards/margins": 17.896528244018555, + "rewards/rejected": -35.95416259765625, + "step": 2756 + }, + { + "epoch": 1.7150855365474338, + "grad_norm": 2.8142278097220697e-05, + "learning_rate": 2.3789764868603043e-06, + "logits/chosen": -0.084682896733284, + "logits/rejected": 4.622895240783691, + "logps/chosen": -513.091552734375, + "logps/rejected": -1204.34423828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.010409355163574, + "rewards/margins": 35.475242614746094, + "rewards/rejected": -46.48564910888672, + "step": 2757 + }, + { + "epoch": 1.7157076205287713, + "grad_norm": 5.099128657093388e-07, + "learning_rate": 2.3778238819732595e-06, + "logits/chosen": 1.8338658809661865, + "logits/rejected": 3.0492074489593506, + "logps/chosen": -530.6513671875, + "logps/rejected": -860.2197265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.946867942810059, + "rewards/margins": 25.688880920410156, + "rewards/rejected": -35.63574981689453, + "step": 2758 + }, + { + "epoch": 1.716329704510109, + "grad_norm": 0.05735553056001663, + "learning_rate": 2.3766712770862148e-06, + "logits/chosen": 0.5481299161911011, + "logits/rejected": 1.9903779029846191, + "logps/chosen": -658.283935546875, + "logps/rejected": -961.1427001953125, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.964715003967285, + "rewards/margins": 20.74061393737793, + "rewards/rejected": -31.70532989501953, + "step": 2759 + }, + { + "epoch": 1.7169517884914463, + "grad_norm": 1.0057343757807757e-07, + "learning_rate": 2.3755186721991704e-06, + "logits/chosen": -2.410283088684082, + "logits/rejected": 1.853485345840454, + "logps/chosen": -428.63189697265625, + "logps/rejected": -974.8028564453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.190696716308594, + "rewards/margins": 34.52707290649414, + "rewards/rejected": -40.7177734375, + "step": 2760 + }, + { + "epoch": 1.7175738724727838, + "grad_norm": 0.04589163884520531, + "learning_rate": 2.3743660673121256e-06, + "logits/chosen": 0.30356281995773315, + "logits/rejected": 2.435214042663574, + "logps/chosen": -465.5194091796875, + "logps/rejected": -819.4593505859375, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.391858100891113, + "rewards/margins": 19.34056282043457, + "rewards/rejected": -26.732418060302734, + "step": 2761 + }, + { + "epoch": 1.7181959564541214, + "grad_norm": 0.19371838867664337, + "learning_rate": 2.373213462425081e-06, + "logits/chosen": 1.2789655923843384, + "logits/rejected": 3.6189393997192383, + "logps/chosen": -504.8437805175781, + "logps/rejected": -906.9939575195312, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.996271133422852, + "rewards/margins": 23.702014923095703, + "rewards/rejected": -35.69828796386719, + "step": 2762 + }, + { + "epoch": 1.7188180404354587, + "grad_norm": 0.0007012194837443531, + "learning_rate": 2.372060857538036e-06, + "logits/chosen": -2.735713481903076, + "logits/rejected": 1.4813268184661865, + "logps/chosen": -385.73150634765625, + "logps/rejected": -942.9842529296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.717445373535156, + "rewards/margins": 34.88406753540039, + "rewards/rejected": -42.60151290893555, + "step": 2763 + }, + { + "epoch": 1.7194401244167963, + "grad_norm": 0.005390803795307875, + "learning_rate": 2.3709082526509913e-06, + "logits/chosen": 0.9662128686904907, + "logits/rejected": 1.7179337739944458, + "logps/chosen": -537.8453369140625, + "logps/rejected": -877.6395263671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.155290603637695, + "rewards/margins": 24.83075714111328, + "rewards/rejected": -35.98604965209961, + "step": 2764 + }, + { + "epoch": 1.7200622083981338, + "grad_norm": 0.04520804435014725, + "learning_rate": 2.3697556477639465e-06, + "logits/chosen": 0.6427794694900513, + "logits/rejected": 3.6675214767456055, + "logps/chosen": -479.03680419921875, + "logps/rejected": -872.9364013671875, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.6263628005981445, + "rewards/margins": 19.985591888427734, + "rewards/rejected": -26.611953735351562, + "step": 2765 + }, + { + "epoch": 1.7206842923794712, + "grad_norm": 0.061935242265462875, + "learning_rate": 2.3686030428769018e-06, + "logits/chosen": -0.41115838289260864, + "logits/rejected": 4.158937454223633, + "logps/chosen": -559.28076171875, + "logps/rejected": -1320.9888916015625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.832966804504395, + "rewards/margins": 41.016700744628906, + "rewards/rejected": -55.84966278076172, + "step": 2766 + }, + { + "epoch": 1.7213063763608087, + "grad_norm": 0.0008467906154692173, + "learning_rate": 2.367450437989857e-06, + "logits/chosen": 2.045358180999756, + "logits/rejected": 2.9397809505462646, + "logps/chosen": -575.7149658203125, + "logps/rejected": -990.0404052734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.39871883392334, + "rewards/margins": 32.345008850097656, + "rewards/rejected": -42.74372863769531, + "step": 2767 + }, + { + "epoch": 1.7219284603421463, + "grad_norm": 0.035182323306798935, + "learning_rate": 2.3662978331028126e-06, + "logits/chosen": -1.1947062015533447, + "logits/rejected": 3.6915230751037598, + "logps/chosen": -364.73077392578125, + "logps/rejected": -939.2349853515625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.385026693344116, + "rewards/margins": 26.972509384155273, + "rewards/rejected": -30.357534408569336, + "step": 2768 + }, + { + "epoch": 1.7225505443234836, + "grad_norm": 0.0013195689534768462, + "learning_rate": 2.365145228215768e-06, + "logits/chosen": 0.15518411993980408, + "logits/rejected": 2.3073744773864746, + "logps/chosen": -576.5823364257812, + "logps/rejected": -856.1112060546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.871328353881836, + "rewards/margins": 18.359949111938477, + "rewards/rejected": -27.231277465820312, + "step": 2769 + }, + { + "epoch": 1.723172628304821, + "grad_norm": 4.51564359664917, + "learning_rate": 2.363992623328723e-06, + "logits/chosen": 0.6936973929405212, + "logits/rejected": 3.6317267417907715, + "logps/chosen": -504.4945983886719, + "logps/rejected": -747.7782592773438, + "loss": 0.0233, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.503636360168457, + "rewards/margins": 18.5909423828125, + "rewards/rejected": -27.09457778930664, + "step": 2770 + }, + { + "epoch": 1.7237947122861588, + "grad_norm": 0.011435436084866524, + "learning_rate": 2.3628400184416783e-06, + "logits/chosen": -2.0588650703430176, + "logits/rejected": 1.1596896648406982, + "logps/chosen": -426.8011474609375, + "logps/rejected": -997.5834350585938, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.577409267425537, + "rewards/margins": 33.40705871582031, + "rewards/rejected": -40.984466552734375, + "step": 2771 + }, + { + "epoch": 1.724416796267496, + "grad_norm": 0.796137273311615, + "learning_rate": 2.3616874135546335e-06, + "logits/chosen": -0.6903742551803589, + "logits/rejected": 3.3889074325561523, + "logps/chosen": -453.154541015625, + "logps/rejected": -941.4500732421875, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.413330078125, + "rewards/margins": 27.53225326538086, + "rewards/rejected": -36.94558334350586, + "step": 2772 + }, + { + "epoch": 1.7250388802488335, + "grad_norm": 0.00024519202997907996, + "learning_rate": 2.3605348086675888e-06, + "logits/chosen": 2.3214950561523438, + "logits/rejected": 3.429938316345215, + "logps/chosen": -568.1884765625, + "logps/rejected": -873.7118530273438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.79106330871582, + "rewards/margins": 25.14430046081543, + "rewards/rejected": -32.93536376953125, + "step": 2773 + }, + { + "epoch": 1.7256609642301712, + "grad_norm": 9.089024388231337e-06, + "learning_rate": 2.359382203780544e-06, + "logits/chosen": 3.637298822402954, + "logits/rejected": 3.8487226963043213, + "logps/chosen": -681.5171508789062, + "logps/rejected": -1018.5064086914062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.541778564453125, + "rewards/margins": 26.06352996826172, + "rewards/rejected": -33.605308532714844, + "step": 2774 + }, + { + "epoch": 1.7262830482115086, + "grad_norm": 2.3466477394104004, + "learning_rate": 2.3582295988934996e-06, + "logits/chosen": 1.3259943723678589, + "logits/rejected": 4.5670976638793945, + "logps/chosen": -652.5491943359375, + "logps/rejected": -1064.20263671875, + "loss": 0.0095, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.579160690307617, + "rewards/margins": 26.290414810180664, + "rewards/rejected": -37.86957550048828, + "step": 2775 + }, + { + "epoch": 1.726905132192846, + "grad_norm": 0.16569431126117706, + "learning_rate": 2.357076994006455e-06, + "logits/chosen": -0.6097656488418579, + "logits/rejected": 3.4430322647094727, + "logps/chosen": -518.7841796875, + "logps/rejected": -1055.17041015625, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.744987487792969, + "rewards/margins": 32.98575973510742, + "rewards/rejected": -42.73074722290039, + "step": 2776 + }, + { + "epoch": 1.7275272161741835, + "grad_norm": 0.09507567435503006, + "learning_rate": 2.35592438911941e-06, + "logits/chosen": -0.14290493726730347, + "logits/rejected": 3.131923198699951, + "logps/chosen": -484.4637756347656, + "logps/rejected": -944.8180541992188, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.113192558288574, + "rewards/margins": 27.75663948059082, + "rewards/rejected": -36.869834899902344, + "step": 2777 + }, + { + "epoch": 1.728149300155521, + "grad_norm": 0.20066502690315247, + "learning_rate": 2.3547717842323653e-06, + "logits/chosen": 2.386807441711426, + "logits/rejected": 3.2670209407806396, + "logps/chosen": -634.5061645507812, + "logps/rejected": -960.57421875, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.417230606079102, + "rewards/margins": 28.866121292114258, + "rewards/rejected": -42.28335189819336, + "step": 2778 + }, + { + "epoch": 1.7287713841368584, + "grad_norm": 27.666940689086914, + "learning_rate": 2.3536191793453205e-06, + "logits/chosen": 0.8378705978393555, + "logits/rejected": 2.626453161239624, + "logps/chosen": -686.529541015625, + "logps/rejected": -1111.91162109375, + "loss": 0.3747, + "rewards/accuracies": 0.875, + "rewards/chosen": -14.910959243774414, + "rewards/margins": 30.519216537475586, + "rewards/rejected": -45.430179595947266, + "step": 2779 + }, + { + "epoch": 1.729393468118196, + "grad_norm": 4.822820663452148, + "learning_rate": 2.3524665744582757e-06, + "logits/chosen": 1.7815725803375244, + "logits/rejected": 3.440255880355835, + "logps/chosen": -554.303955078125, + "logps/rejected": -881.9509887695312, + "loss": 0.016, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.29763412475586, + "rewards/margins": 22.683753967285156, + "rewards/rejected": -31.981388092041016, + "step": 2780 + }, + { + "epoch": 1.7300155520995335, + "grad_norm": 0.04911397397518158, + "learning_rate": 2.351313969571231e-06, + "logits/chosen": -0.22427219152450562, + "logits/rejected": 2.9939990043640137, + "logps/chosen": -644.0833740234375, + "logps/rejected": -1102.349365234375, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.63404369354248, + "rewards/margins": 25.59818458557129, + "rewards/rejected": -39.23223114013672, + "step": 2781 + }, + { + "epoch": 1.7306376360808708, + "grad_norm": 2.280266046524048, + "learning_rate": 2.3501613646841866e-06, + "logits/chosen": -3.042137384414673, + "logits/rejected": 2.958921432495117, + "logps/chosen": -434.674072265625, + "logps/rejected": -1080.346923828125, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.471439361572266, + "rewards/margins": 34.801605224609375, + "rewards/rejected": -46.27304458618164, + "step": 2782 + }, + { + "epoch": 1.7312597200622084, + "grad_norm": 34.36720275878906, + "learning_rate": 2.349008759797142e-06, + "logits/chosen": -0.14768370985984802, + "logits/rejected": 2.648454427719116, + "logps/chosen": -579.7083129882812, + "logps/rejected": -1024.5987548828125, + "loss": 0.2468, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.255849838256836, + "rewards/margins": 27.46382713317871, + "rewards/rejected": -37.71967697143555, + "step": 2783 + }, + { + "epoch": 1.731881804043546, + "grad_norm": 1.1667402759485412e-06, + "learning_rate": 2.347856154910097e-06, + "logits/chosen": -3.273422956466675, + "logits/rejected": 3.320072650909424, + "logps/chosen": -313.364990234375, + "logps/rejected": -1038.2227783203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.1106157302856445, + "rewards/margins": 32.74595642089844, + "rewards/rejected": -38.856571197509766, + "step": 2784 + }, + { + "epoch": 1.7325038880248833, + "grad_norm": 31.996156692504883, + "learning_rate": 2.3467035500230523e-06, + "logits/chosen": -1.5053670406341553, + "logits/rejected": 1.9649426937103271, + "logps/chosen": -432.020751953125, + "logps/rejected": -886.978271484375, + "loss": 0.4382, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.486809253692627, + "rewards/margins": 24.155384063720703, + "rewards/rejected": -30.642192840576172, + "step": 2785 + }, + { + "epoch": 1.7331259720062209, + "grad_norm": 2.6820787752512842e-05, + "learning_rate": 2.3455509451360075e-06, + "logits/chosen": -1.8410687446594238, + "logits/rejected": 4.440993309020996, + "logps/chosen": -359.7074279785156, + "logps/rejected": -1087.0943603515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.158833980560303, + "rewards/margins": 34.07766342163086, + "rewards/rejected": -40.23649978637695, + "step": 2786 + }, + { + "epoch": 1.7337480559875584, + "grad_norm": 0.758760929107666, + "learning_rate": 2.3443983402489627e-06, + "logits/chosen": -1.3878371715545654, + "logits/rejected": 2.5684714317321777, + "logps/chosen": -493.8824768066406, + "logps/rejected": -898.0422973632812, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.0507612228393555, + "rewards/margins": 23.280902862548828, + "rewards/rejected": -30.3316650390625, + "step": 2787 + }, + { + "epoch": 1.7343701399688958, + "grad_norm": 1.4930262565612793, + "learning_rate": 2.343245735361918e-06, + "logits/chosen": -0.7004250884056091, + "logits/rejected": 3.0102808475494385, + "logps/chosen": -538.3475341796875, + "logps/rejected": -971.8416748046875, + "loss": 0.0064, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.780963897705078, + "rewards/margins": 19.83646583557129, + "rewards/rejected": -28.617431640625, + "step": 2788 + }, + { + "epoch": 1.7349922239502333, + "grad_norm": 0.019264360889792442, + "learning_rate": 2.3420931304748736e-06, + "logits/chosen": -0.9465036392211914, + "logits/rejected": 4.264335632324219, + "logps/chosen": -327.2490234375, + "logps/rejected": -816.83544921875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.298092842102051, + "rewards/margins": 21.538646697998047, + "rewards/rejected": -25.836740493774414, + "step": 2789 + }, + { + "epoch": 1.735614307931571, + "grad_norm": 0.00041238003177568316, + "learning_rate": 2.340940525587829e-06, + "logits/chosen": -0.6670334339141846, + "logits/rejected": 1.9680249691009521, + "logps/chosen": -429.46240234375, + "logps/rejected": -969.256103515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.832972049713135, + "rewards/margins": 31.79166030883789, + "rewards/rejected": -39.6246337890625, + "step": 2790 + }, + { + "epoch": 1.7362363919129082, + "grad_norm": 6.324278831481934, + "learning_rate": 2.339787920700784e-06, + "logits/chosen": 2.0768911838531494, + "logits/rejected": 2.6396563053131104, + "logps/chosen": -616.1771240234375, + "logps/rejected": -860.7713012695312, + "loss": 0.0318, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.11803913116455, + "rewards/margins": 17.913665771484375, + "rewards/rejected": -26.031702041625977, + "step": 2791 + }, + { + "epoch": 1.7368584758942456, + "grad_norm": 0.00336162350140512, + "learning_rate": 2.3386353158137393e-06, + "logits/chosen": 0.6120978593826294, + "logits/rejected": 2.5396242141723633, + "logps/chosen": -566.498046875, + "logps/rejected": -1051.150390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.51701831817627, + "rewards/margins": 32.585655212402344, + "rewards/rejected": -42.10266876220703, + "step": 2792 + }, + { + "epoch": 1.7374805598755834, + "grad_norm": 0.03778179734945297, + "learning_rate": 2.3374827109266945e-06, + "logits/chosen": -1.0540292263031006, + "logits/rejected": 2.4589219093322754, + "logps/chosen": -366.5378723144531, + "logps/rejected": -811.3502197265625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.14518404006958, + "rewards/margins": 30.593111038208008, + "rewards/rejected": -37.73829650878906, + "step": 2793 + }, + { + "epoch": 1.7381026438569207, + "grad_norm": 2.461298942565918, + "learning_rate": 2.3363301060396497e-06, + "logits/chosen": 2.8499398231506348, + "logits/rejected": 4.111413478851318, + "logps/chosen": -592.90869140625, + "logps/rejected": -959.670166015625, + "loss": 0.0108, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.37989616394043, + "rewards/margins": 23.281414031982422, + "rewards/rejected": -33.661312103271484, + "step": 2794 + }, + { + "epoch": 1.738724727838258, + "grad_norm": 0.03133802488446236, + "learning_rate": 2.335177501152605e-06, + "logits/chosen": -0.18570584058761597, + "logits/rejected": 2.8326315879821777, + "logps/chosen": -496.2568664550781, + "logps/rejected": -912.7903442382812, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.508269309997559, + "rewards/margins": 27.135391235351562, + "rewards/rejected": -35.64365768432617, + "step": 2795 + }, + { + "epoch": 1.7393468118195956, + "grad_norm": 0.04039645567536354, + "learning_rate": 2.33402489626556e-06, + "logits/chosen": -1.090985894203186, + "logits/rejected": 3.0338008403778076, + "logps/chosen": -468.44683837890625, + "logps/rejected": -1013.5341186523438, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.679754257202148, + "rewards/margins": 33.22745132446289, + "rewards/rejected": -40.907203674316406, + "step": 2796 + }, + { + "epoch": 1.7399688958009332, + "grad_norm": 0.003894314868375659, + "learning_rate": 2.332872291378516e-06, + "logits/chosen": 0.48016834259033203, + "logits/rejected": 0.8828748464584351, + "logps/chosen": -588.974853515625, + "logps/rejected": -899.0206298828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.459451675415039, + "rewards/margins": 24.44631576538086, + "rewards/rejected": -32.90576934814453, + "step": 2797 + }, + { + "epoch": 1.7405909797822705, + "grad_norm": 4.184944191365503e-05, + "learning_rate": 2.331719686491471e-06, + "logits/chosen": -2.460817337036133, + "logits/rejected": 2.706566333770752, + "logps/chosen": -324.81427001953125, + "logps/rejected": -978.3182373046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.0268778800964355, + "rewards/margins": 31.31508445739746, + "rewards/rejected": -37.34196090698242, + "step": 2798 + }, + { + "epoch": 1.741213063763608, + "grad_norm": 0.0037377155385911465, + "learning_rate": 2.3305670816044263e-06, + "logits/chosen": -2.2354516983032227, + "logits/rejected": 3.795807123184204, + "logps/chosen": -327.38311767578125, + "logps/rejected": -927.3829956054688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.096700191497803, + "rewards/margins": 28.419921875, + "rewards/rejected": -33.516624450683594, + "step": 2799 + }, + { + "epoch": 1.7418351477449456, + "grad_norm": 0.01131622213870287, + "learning_rate": 2.3294144767173815e-06, + "logits/chosen": 0.8565537929534912, + "logits/rejected": 3.1421103477478027, + "logps/chosen": -564.3128051757812, + "logps/rejected": -958.5903930664062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.258453369140625, + "rewards/margins": 24.025192260742188, + "rewards/rejected": -33.28364562988281, + "step": 2800 + }, + { + "epoch": 1.742457231726283, + "grad_norm": 0.9999305605888367, + "learning_rate": 2.3282618718303367e-06, + "logits/chosen": 0.04175460338592529, + "logits/rejected": 3.4293456077575684, + "logps/chosen": -473.6128845214844, + "logps/rejected": -915.9506225585938, + "loss": 0.0077, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.805809020996094, + "rewards/margins": 23.354106903076172, + "rewards/rejected": -34.15991973876953, + "step": 2801 + }, + { + "epoch": 1.7430793157076205, + "grad_norm": 0.37862786650657654, + "learning_rate": 2.327109266943292e-06, + "logits/chosen": 2.9525222778320312, + "logits/rejected": 5.147446155548096, + "logps/chosen": -635.0414428710938, + "logps/rejected": -1057.5360107421875, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.820849418640137, + "rewards/margins": 24.550661087036133, + "rewards/rejected": -33.37151336669922, + "step": 2802 + }, + { + "epoch": 1.743701399688958, + "grad_norm": 37.64432907104492, + "learning_rate": 2.325956662056247e-06, + "logits/chosen": 1.7595758438110352, + "logits/rejected": 3.8820478916168213, + "logps/chosen": -615.9675903320312, + "logps/rejected": -1010.1990966796875, + "loss": 0.4328, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.145980834960938, + "rewards/margins": 27.22187042236328, + "rewards/rejected": -38.36784744262695, + "step": 2803 + }, + { + "epoch": 1.7443234836702954, + "grad_norm": 0.122687928378582, + "learning_rate": 2.324804057169203e-06, + "logits/chosen": -1.2267993688583374, + "logits/rejected": 4.899316310882568, + "logps/chosen": -335.3673095703125, + "logps/rejected": -985.9779052734375, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.552623748779297, + "rewards/margins": 32.96437072753906, + "rewards/rejected": -38.51699447631836, + "step": 2804 + }, + { + "epoch": 1.744945567651633, + "grad_norm": 0.007713071536272764, + "learning_rate": 2.323651452282158e-06, + "logits/chosen": -1.778456449508667, + "logits/rejected": 0.725517749786377, + "logps/chosen": -432.5553894042969, + "logps/rejected": -830.3568115234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.305593490600586, + "rewards/margins": 23.346324920654297, + "rewards/rejected": -31.651918411254883, + "step": 2805 + }, + { + "epoch": 1.7455676516329706, + "grad_norm": 4.1068889800044417e-07, + "learning_rate": 2.3224988473951133e-06, + "logits/chosen": -0.16772279143333435, + "logits/rejected": 2.112006425857544, + "logps/chosen": -525.9559326171875, + "logps/rejected": -964.70751953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.622228622436523, + "rewards/margins": 28.970535278320312, + "rewards/rejected": -38.5927619934082, + "step": 2806 + }, + { + "epoch": 1.746189735614308, + "grad_norm": 1.9309496565256268e-05, + "learning_rate": 2.3213462425080685e-06, + "logits/chosen": -0.6594813466072083, + "logits/rejected": 3.5559253692626953, + "logps/chosen": -404.66748046875, + "logps/rejected": -869.7897338867188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.164316177368164, + "rewards/margins": 29.088703155517578, + "rewards/rejected": -36.253021240234375, + "step": 2807 + }, + { + "epoch": 1.7468118195956455, + "grad_norm": 34.09669494628906, + "learning_rate": 2.3201936376210237e-06, + "logits/chosen": 3.4891278743743896, + "logits/rejected": 4.456305503845215, + "logps/chosen": -647.476318359375, + "logps/rejected": -957.98681640625, + "loss": 0.5016, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.324559211730957, + "rewards/margins": 23.308271408081055, + "rewards/rejected": -35.63282775878906, + "step": 2808 + }, + { + "epoch": 1.747433903576983, + "grad_norm": 1.8752028942108154, + "learning_rate": 2.319041032733979e-06, + "logits/chosen": -1.88791024684906, + "logits/rejected": 2.6826634407043457, + "logps/chosen": -335.170654296875, + "logps/rejected": -904.3746337890625, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.0012407302856445, + "rewards/margins": 25.391626358032227, + "rewards/rejected": -30.392868041992188, + "step": 2809 + }, + { + "epoch": 1.7480559875583204, + "grad_norm": 0.08498809486627579, + "learning_rate": 2.317888427846934e-06, + "logits/chosen": 2.0526623725891113, + "logits/rejected": 3.381930351257324, + "logps/chosen": -548.2327880859375, + "logps/rejected": -890.7827758789062, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.699272155761719, + "rewards/margins": 24.447311401367188, + "rewards/rejected": -35.146583557128906, + "step": 2810 + }, + { + "epoch": 1.7486780715396577, + "grad_norm": 0.1443328857421875, + "learning_rate": 2.31673582295989e-06, + "logits/chosen": 2.4920990467071533, + "logits/rejected": 4.312612533569336, + "logps/chosen": -722.9078979492188, + "logps/rejected": -1082.167724609375, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.12423324584961, + "rewards/margins": 30.722488403320312, + "rewards/rejected": -44.84672546386719, + "step": 2811 + }, + { + "epoch": 1.7493001555209955, + "grad_norm": 0.00010770368680823594, + "learning_rate": 2.315583218072845e-06, + "logits/chosen": -0.9948385953903198, + "logits/rejected": 1.3194243907928467, + "logps/chosen": -566.56201171875, + "logps/rejected": -934.32080078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.929027080535889, + "rewards/margins": 26.289602279663086, + "rewards/rejected": -33.2186279296875, + "step": 2812 + }, + { + "epoch": 1.7499222395023328, + "grad_norm": 46.462738037109375, + "learning_rate": 2.3144306131858003e-06, + "logits/chosen": 3.6122045516967773, + "logits/rejected": 3.540928840637207, + "logps/chosen": -630.3472900390625, + "logps/rejected": -921.0729370117188, + "loss": 0.8578, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.057168960571289, + "rewards/margins": 23.821197509765625, + "rewards/rejected": -36.87836456298828, + "step": 2813 + }, + { + "epoch": 1.7505443234836702, + "grad_norm": 9.17495059967041, + "learning_rate": 2.3132780082987555e-06, + "logits/chosen": -0.8413869142532349, + "logits/rejected": 2.3990821838378906, + "logps/chosen": -383.0685729980469, + "logps/rejected": -798.2875366210938, + "loss": 0.0812, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.45396900177002, + "rewards/margins": 21.561416625976562, + "rewards/rejected": -33.015384674072266, + "step": 2814 + }, + { + "epoch": 1.7511664074650077, + "grad_norm": 4.03289270401001, + "learning_rate": 2.3121254034117107e-06, + "logits/chosen": 2.9473519325256348, + "logits/rejected": 3.943220853805542, + "logps/chosen": -617.802490234375, + "logps/rejected": -904.656982421875, + "loss": 0.0196, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.362736701965332, + "rewards/margins": 22.11973762512207, + "rewards/rejected": -27.48247718811035, + "step": 2815 + }, + { + "epoch": 1.7517884914463453, + "grad_norm": 22.0452938079834, + "learning_rate": 2.310972798524666e-06, + "logits/chosen": 1.4050730466842651, + "logits/rejected": 4.609858512878418, + "logps/chosen": -589.161376953125, + "logps/rejected": -979.752685546875, + "loss": 0.1047, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.366984367370605, + "rewards/margins": 24.246492385864258, + "rewards/rejected": -32.61347579956055, + "step": 2816 + }, + { + "epoch": 1.7524105754276826, + "grad_norm": 3.8451528549194336, + "learning_rate": 2.309820193637621e-06, + "logits/chosen": 0.7891393303871155, + "logits/rejected": 4.272323131561279, + "logps/chosen": -428.27484130859375, + "logps/rejected": -965.9637451171875, + "loss": 0.0268, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.005939483642578, + "rewards/margins": 30.316749572753906, + "rewards/rejected": -35.32268524169922, + "step": 2817 + }, + { + "epoch": 1.7530326594090202, + "grad_norm": 0.0004849430697504431, + "learning_rate": 2.3086675887505764e-06, + "logits/chosen": 2.0776607990264893, + "logits/rejected": 4.065606117248535, + "logps/chosen": -484.43988037109375, + "logps/rejected": -874.045654296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.818570137023926, + "rewards/margins": 25.894359588623047, + "rewards/rejected": -33.712928771972656, + "step": 2818 + }, + { + "epoch": 1.7536547433903578, + "grad_norm": 0.0005741657223552465, + "learning_rate": 2.307514983863532e-06, + "logits/chosen": 0.3438476622104645, + "logits/rejected": 3.1698288917541504, + "logps/chosen": -536.3983764648438, + "logps/rejected": -1008.376953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.154132843017578, + "rewards/margins": 29.072219848632812, + "rewards/rejected": -36.22635269165039, + "step": 2819 + }, + { + "epoch": 1.754276827371695, + "grad_norm": 0.0035577313974499702, + "learning_rate": 2.3063623789764873e-06, + "logits/chosen": 0.8933321237564087, + "logits/rejected": 3.381267547607422, + "logps/chosen": -666.1956787109375, + "logps/rejected": -1114.95361328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.84673547744751, + "rewards/margins": 30.009090423583984, + "rewards/rejected": -37.8558235168457, + "step": 2820 + }, + { + "epoch": 1.7548989113530327, + "grad_norm": 34.45783615112305, + "learning_rate": 2.3052097740894425e-06, + "logits/chosen": 0.24750453233718872, + "logits/rejected": 2.9766762256622314, + "logps/chosen": -476.5849304199219, + "logps/rejected": -839.9825439453125, + "loss": 0.7048, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.244043350219727, + "rewards/margins": 19.534896850585938, + "rewards/rejected": -28.778940200805664, + "step": 2821 + }, + { + "epoch": 1.7555209953343702, + "grad_norm": 0.3499009609222412, + "learning_rate": 2.3040571692023973e-06, + "logits/chosen": 0.20748788118362427, + "logits/rejected": 1.776287317276001, + "logps/chosen": -447.55999755859375, + "logps/rejected": -747.993408203125, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.3524370193481445, + "rewards/margins": 24.67609214782715, + "rewards/rejected": -29.028528213500977, + "step": 2822 + }, + { + "epoch": 1.7561430793157076, + "grad_norm": 0.004271762445569038, + "learning_rate": 2.302904564315353e-06, + "logits/chosen": 0.4916335642337799, + "logits/rejected": 1.6427289247512817, + "logps/chosen": -535.5093994140625, + "logps/rejected": -822.4710693359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.049504280090332, + "rewards/margins": 23.968944549560547, + "rewards/rejected": -32.01844787597656, + "step": 2823 + }, + { + "epoch": 1.7567651632970451, + "grad_norm": 0.1598803550004959, + "learning_rate": 2.301751959428308e-06, + "logits/chosen": -0.5362465381622314, + "logits/rejected": 1.9070513248443604, + "logps/chosen": -549.949462890625, + "logps/rejected": -816.508544921875, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.776954174041748, + "rewards/margins": 20.22863006591797, + "rewards/rejected": -28.005582809448242, + "step": 2824 + }, + { + "epoch": 1.7573872472783827, + "grad_norm": 3.6965129375457764, + "learning_rate": 2.3005993545412634e-06, + "logits/chosen": -0.49294036626815796, + "logits/rejected": 2.3970351219177246, + "logps/chosen": -467.9210205078125, + "logps/rejected": -754.7039184570312, + "loss": 0.0174, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.430497169494629, + "rewards/margins": 14.597742080688477, + "rewards/rejected": -24.028240203857422, + "step": 2825 + }, + { + "epoch": 1.75800933125972, + "grad_norm": 0.037016257643699646, + "learning_rate": 2.2994467496542186e-06, + "logits/chosen": 1.0786442756652832, + "logits/rejected": 3.0117878913879395, + "logps/chosen": -633.4241943359375, + "logps/rejected": -975.4942626953125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.774829864501953, + "rewards/margins": 25.39168930053711, + "rewards/rejected": -34.16651916503906, + "step": 2826 + }, + { + "epoch": 1.7586314152410576, + "grad_norm": 0.03342762216925621, + "learning_rate": 2.298294144767174e-06, + "logits/chosen": -1.3810304403305054, + "logits/rejected": 2.7755300998687744, + "logps/chosen": -399.886474609375, + "logps/rejected": -855.031494140625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.9426751136779785, + "rewards/margins": 23.406360626220703, + "rewards/rejected": -30.349037170410156, + "step": 2827 + }, + { + "epoch": 1.7592534992223952, + "grad_norm": 0.0004126617859583348, + "learning_rate": 2.297141539880129e-06, + "logits/chosen": -3.138012409210205, + "logits/rejected": 0.9484289884567261, + "logps/chosen": -409.16693115234375, + "logps/rejected": -1011.06591796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.72064208984375, + "rewards/margins": 36.936763763427734, + "rewards/rejected": -40.65740203857422, + "step": 2828 + }, + { + "epoch": 1.7598755832037325, + "grad_norm": 2.3251948732649907e-05, + "learning_rate": 2.2959889349930843e-06, + "logits/chosen": 1.5177479982376099, + "logits/rejected": 4.266093730926514, + "logps/chosen": -490.1104736328125, + "logps/rejected": -894.9434814453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.51591968536377, + "rewards/margins": 26.004505157470703, + "rewards/rejected": -34.520423889160156, + "step": 2829 + }, + { + "epoch": 1.7604976671850698, + "grad_norm": 0.47916364669799805, + "learning_rate": 2.2948363301060395e-06, + "logits/chosen": 0.8490583300590515, + "logits/rejected": 3.6363015174865723, + "logps/chosen": -456.613525390625, + "logps/rejected": -754.70166015625, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.169865608215332, + "rewards/margins": 18.686918258666992, + "rewards/rejected": -24.85678482055664, + "step": 2830 + }, + { + "epoch": 1.7611197511664076, + "grad_norm": 0.026581356301903725, + "learning_rate": 2.293683725218995e-06, + "logits/chosen": 1.8229038715362549, + "logits/rejected": 3.4620201587677, + "logps/chosen": -599.924560546875, + "logps/rejected": -994.2450561523438, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.186615943908691, + "rewards/margins": 31.1335506439209, + "rewards/rejected": -39.320167541503906, + "step": 2831 + }, + { + "epoch": 1.761741835147745, + "grad_norm": 17.492971420288086, + "learning_rate": 2.2925311203319504e-06, + "logits/chosen": 1.6246124505996704, + "logits/rejected": 4.336266040802002, + "logps/chosen": -587.4801635742188, + "logps/rejected": -970.076904296875, + "loss": 0.0843, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.888565063476562, + "rewards/margins": 20.833152770996094, + "rewards/rejected": -33.721717834472656, + "step": 2832 + }, + { + "epoch": 1.7623639191290823, + "grad_norm": 0.0129725756123662, + "learning_rate": 2.2913785154449056e-06, + "logits/chosen": 0.6306408643722534, + "logits/rejected": 3.076450824737549, + "logps/chosen": -573.4195556640625, + "logps/rejected": -956.2833251953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.969671726226807, + "rewards/margins": 26.17241859436035, + "rewards/rejected": -34.14208984375, + "step": 2833 + }, + { + "epoch": 1.7629860031104199, + "grad_norm": 31.76062774658203, + "learning_rate": 2.290225910557861e-06, + "logits/chosen": 2.5573315620422363, + "logits/rejected": 1.8043968677520752, + "logps/chosen": -538.5514526367188, + "logps/rejected": -797.49755859375, + "loss": 0.3282, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.024632453918457, + "rewards/margins": 24.634727478027344, + "rewards/rejected": -30.659358978271484, + "step": 2834 + }, + { + "epoch": 1.7636080870917574, + "grad_norm": 35.535221099853516, + "learning_rate": 2.289073305670816e-06, + "logits/chosen": 1.011389970779419, + "logits/rejected": 3.940976619720459, + "logps/chosen": -593.7369384765625, + "logps/rejected": -1046.2943115234375, + "loss": 0.4384, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.93221664428711, + "rewards/margins": 26.355077743530273, + "rewards/rejected": -36.287296295166016, + "step": 2835 + }, + { + "epoch": 1.7642301710730948, + "grad_norm": 0.001468529924750328, + "learning_rate": 2.2879207007837713e-06, + "logits/chosen": 2.8411571979522705, + "logits/rejected": 3.874950647354126, + "logps/chosen": -710.3514404296875, + "logps/rejected": -1009.4497680664062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.734901428222656, + "rewards/margins": 25.648292541503906, + "rewards/rejected": -36.38319396972656, + "step": 2836 + }, + { + "epoch": 1.7648522550544323, + "grad_norm": 0.00015201901260297745, + "learning_rate": 2.2867680958967265e-06, + "logits/chosen": -0.3207424283027649, + "logits/rejected": 1.9411276578903198, + "logps/chosen": -592.6459350585938, + "logps/rejected": -1078.9656982421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.434858322143555, + "rewards/margins": 30.316349029541016, + "rewards/rejected": -39.75120544433594, + "step": 2837 + }, + { + "epoch": 1.76547433903577, + "grad_norm": 2.018131971359253, + "learning_rate": 2.285615491009682e-06, + "logits/chosen": 0.6795270442962646, + "logits/rejected": 2.601144790649414, + "logps/chosen": -646.558837890625, + "logps/rejected": -951.0234375, + "loss": 0.0092, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.77408218383789, + "rewards/margins": 15.429933547973633, + "rewards/rejected": -26.204017639160156, + "step": 2838 + }, + { + "epoch": 1.7660964230171072, + "grad_norm": 0.20884829759597778, + "learning_rate": 2.2844628861226374e-06, + "logits/chosen": -1.0158138275146484, + "logits/rejected": 2.9894943237304688, + "logps/chosen": -364.9871826171875, + "logps/rejected": -824.8662109375, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.625598430633545, + "rewards/margins": 23.467449188232422, + "rewards/rejected": -28.093048095703125, + "step": 2839 + }, + { + "epoch": 1.7667185069984448, + "grad_norm": 15.738641738891602, + "learning_rate": 2.2833102812355926e-06, + "logits/chosen": -1.3841941356658936, + "logits/rejected": 2.4583823680877686, + "logps/chosen": -478.6824951171875, + "logps/rejected": -981.4010009765625, + "loss": 0.0995, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.98832368850708, + "rewards/margins": 32.27064895629883, + "rewards/rejected": -38.25897216796875, + "step": 2840 + }, + { + "epoch": 1.7673405909797824, + "grad_norm": 36.28713607788086, + "learning_rate": 2.282157676348548e-06, + "logits/chosen": 2.22945237159729, + "logits/rejected": 3.3262252807617188, + "logps/chosen": -731.6435546875, + "logps/rejected": -985.1890869140625, + "loss": 0.512, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.859464645385742, + "rewards/margins": 26.305404663085938, + "rewards/rejected": -36.16486358642578, + "step": 2841 + }, + { + "epoch": 1.7679626749611197, + "grad_norm": 0.7895492911338806, + "learning_rate": 2.281005071461503e-06, + "logits/chosen": 0.8218408226966858, + "logits/rejected": 3.196619987487793, + "logps/chosen": -571.4409790039062, + "logps/rejected": -920.848388671875, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.964709281921387, + "rewards/margins": 18.337387084960938, + "rewards/rejected": -27.30209732055664, + "step": 2842 + }, + { + "epoch": 1.7685847589424573, + "grad_norm": 7.703816890716553, + "learning_rate": 2.2798524665744583e-06, + "logits/chosen": -1.191411018371582, + "logits/rejected": 2.2585504055023193, + "logps/chosen": -438.60821533203125, + "logps/rejected": -823.6568603515625, + "loss": 0.0312, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.269340515136719, + "rewards/margins": 19.628704071044922, + "rewards/rejected": -26.89804458618164, + "step": 2843 + }, + { + "epoch": 1.7692068429237948, + "grad_norm": 0.982222318649292, + "learning_rate": 2.2786998616874135e-06, + "logits/chosen": 1.8104430437088013, + "logits/rejected": 5.031914234161377, + "logps/chosen": -563.5733032226562, + "logps/rejected": -995.634765625, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.055357456207275, + "rewards/margins": 30.996875762939453, + "rewards/rejected": -38.05223083496094, + "step": 2844 + }, + { + "epoch": 1.7698289269051322, + "grad_norm": 13.193146705627441, + "learning_rate": 2.277547256800369e-06, + "logits/chosen": -0.04513192176818848, + "logits/rejected": 5.120041370391846, + "logps/chosen": -335.22283935546875, + "logps/rejected": -888.48193359375, + "loss": 0.05, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7368974685668945, + "rewards/margins": 23.650060653686523, + "rewards/rejected": -27.386959075927734, + "step": 2845 + }, + { + "epoch": 1.7704510108864697, + "grad_norm": 0.0032247763592749834, + "learning_rate": 2.2763946519133244e-06, + "logits/chosen": -0.46913138031959534, + "logits/rejected": 2.5801992416381836, + "logps/chosen": -450.40704345703125, + "logps/rejected": -831.6661987304688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.872275352478027, + "rewards/margins": 24.720630645751953, + "rewards/rejected": -33.5929069519043, + "step": 2846 + }, + { + "epoch": 1.7710730948678073, + "grad_norm": 0.027071982622146606, + "learning_rate": 2.2752420470262796e-06, + "logits/chosen": 0.2968297600746155, + "logits/rejected": 4.050349235534668, + "logps/chosen": -465.64208984375, + "logps/rejected": -938.214111328125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.954277038574219, + "rewards/margins": 26.31043243408203, + "rewards/rejected": -33.26470947265625, + "step": 2847 + }, + { + "epoch": 1.7716951788491446, + "grad_norm": 4.982742662917872e-09, + "learning_rate": 2.274089442139235e-06, + "logits/chosen": 0.42632216215133667, + "logits/rejected": 2.5242867469787598, + "logps/chosen": -637.7935180664062, + "logps/rejected": -1124.866455078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.32619857788086, + "rewards/margins": 33.32003402709961, + "rewards/rejected": -46.64623260498047, + "step": 2848 + }, + { + "epoch": 1.772317262830482, + "grad_norm": 0.4207907021045685, + "learning_rate": 2.27293683725219e-06, + "logits/chosen": -1.1957554817199707, + "logits/rejected": 3.567960262298584, + "logps/chosen": -490.2234191894531, + "logps/rejected": -1094.930419921875, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.037872314453125, + "rewards/margins": 35.78264617919922, + "rewards/rejected": -44.820518493652344, + "step": 2849 + }, + { + "epoch": 1.7729393468118197, + "grad_norm": 2.5869648456573486, + "learning_rate": 2.2717842323651453e-06, + "logits/chosen": 1.9972779750823975, + "logits/rejected": 4.186806678771973, + "logps/chosen": -749.8709716796875, + "logps/rejected": -1108.9453125, + "loss": 0.0094, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.551399230957031, + "rewards/margins": 24.46184539794922, + "rewards/rejected": -37.01324462890625, + "step": 2850 + }, + { + "epoch": 1.773561430793157, + "grad_norm": 34.92661666870117, + "learning_rate": 2.2706316274781005e-06, + "logits/chosen": 0.05643177032470703, + "logits/rejected": 1.2682291269302368, + "logps/chosen": -476.22760009765625, + "logps/rejected": -719.2870483398438, + "loss": 0.6597, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.579065322875977, + "rewards/margins": 18.019031524658203, + "rewards/rejected": -27.598098754882812, + "step": 2851 + }, + { + "epoch": 1.7741835147744944, + "grad_norm": 0.0038406543899327517, + "learning_rate": 2.269479022591056e-06, + "logits/chosen": 0.727659285068512, + "logits/rejected": 3.0023317337036133, + "logps/chosen": -624.4636840820312, + "logps/rejected": -1009.10302734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.819724082946777, + "rewards/margins": 27.708839416503906, + "rewards/rejected": -34.528564453125, + "step": 2852 + }, + { + "epoch": 1.774805598755832, + "grad_norm": 0.0031099789775907993, + "learning_rate": 2.2683264177040114e-06, + "logits/chosen": -1.9862315654754639, + "logits/rejected": 2.229929208755493, + "logps/chosen": -434.7628479003906, + "logps/rejected": -976.1802978515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.795281410217285, + "rewards/margins": 25.33068084716797, + "rewards/rejected": -30.12596321105957, + "step": 2853 + }, + { + "epoch": 1.7754276827371696, + "grad_norm": 0.014582900330424309, + "learning_rate": 2.2671738128169666e-06, + "logits/chosen": -1.2985285520553589, + "logits/rejected": 3.906200885772705, + "logps/chosen": -451.67852783203125, + "logps/rejected": -1184.2401123046875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.578330039978027, + "rewards/margins": 40.39623260498047, + "rewards/rejected": -46.97456359863281, + "step": 2854 + }, + { + "epoch": 1.776049766718507, + "grad_norm": 0.0002128417690983042, + "learning_rate": 2.266021207929922e-06, + "logits/chosen": 1.6485921144485474, + "logits/rejected": 2.5614266395568848, + "logps/chosen": -751.6290893554688, + "logps/rejected": -1172.1341552734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.660265922546387, + "rewards/margins": 33.579036712646484, + "rewards/rejected": -49.23930358886719, + "step": 2855 + }, + { + "epoch": 1.7766718506998445, + "grad_norm": 0.3063058853149414, + "learning_rate": 2.264868603042877e-06, + "logits/chosen": 0.09333009272813797, + "logits/rejected": 3.864189624786377, + "logps/chosen": -471.91925048828125, + "logps/rejected": -956.8831176757812, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.546854972839355, + "rewards/margins": 28.44036865234375, + "rewards/rejected": -37.987220764160156, + "step": 2856 + }, + { + "epoch": 1.777293934681182, + "grad_norm": 0.1482161283493042, + "learning_rate": 2.2637159981558323e-06, + "logits/chosen": 1.1796531677246094, + "logits/rejected": 3.714764356613159, + "logps/chosen": -590.8246459960938, + "logps/rejected": -972.3810424804688, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.212776184082031, + "rewards/margins": 24.3189697265625, + "rewards/rejected": -32.53174591064453, + "step": 2857 + }, + { + "epoch": 1.7779160186625194, + "grad_norm": 43.31864547729492, + "learning_rate": 2.2625633932687875e-06, + "logits/chosen": 1.916248083114624, + "logits/rejected": 3.472468376159668, + "logps/chosen": -631.3201293945312, + "logps/rejected": -899.087890625, + "loss": 0.7806, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.413410186767578, + "rewards/margins": 16.619535446166992, + "rewards/rejected": -28.03294563293457, + "step": 2858 + }, + { + "epoch": 1.778538102643857, + "grad_norm": 1.5456307664862834e-05, + "learning_rate": 2.2614107883817427e-06, + "logits/chosen": -1.0370193719863892, + "logits/rejected": 3.6356005668640137, + "logps/chosen": -546.611572265625, + "logps/rejected": -1105.85595703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.225702285766602, + "rewards/margins": 32.230499267578125, + "rewards/rejected": -43.456199645996094, + "step": 2859 + }, + { + "epoch": 1.7791601866251945, + "grad_norm": 0.20345354080200195, + "learning_rate": 2.2602581834946984e-06, + "logits/chosen": 1.2066192626953125, + "logits/rejected": 4.191374778747559, + "logps/chosen": -637.443359375, + "logps/rejected": -1108.2515869140625, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.954766273498535, + "rewards/margins": 28.498451232910156, + "rewards/rejected": -38.453216552734375, + "step": 2860 + }, + { + "epoch": 1.7797822706065318, + "grad_norm": 1.7547857761383057, + "learning_rate": 2.2591055786076536e-06, + "logits/chosen": -0.753386378288269, + "logits/rejected": 3.332653045654297, + "logps/chosen": -511.5549011230469, + "logps/rejected": -918.64013671875, + "loss": 0.0097, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.331133842468262, + "rewards/margins": 20.307628631591797, + "rewards/rejected": -25.638763427734375, + "step": 2861 + }, + { + "epoch": 1.7804043545878694, + "grad_norm": 0.1727510541677475, + "learning_rate": 2.257952973720609e-06, + "logits/chosen": -1.888128399848938, + "logits/rejected": 2.080550193786621, + "logps/chosen": -435.1989440917969, + "logps/rejected": -920.7476806640625, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.537738800048828, + "rewards/margins": 24.849666595458984, + "rewards/rejected": -32.38740539550781, + "step": 2862 + }, + { + "epoch": 1.781026438569207, + "grad_norm": 1.2717350728053134e-07, + "learning_rate": 2.256800368833564e-06, + "logits/chosen": 0.18511630594730377, + "logits/rejected": 3.744037628173828, + "logps/chosen": -495.83209228515625, + "logps/rejected": -1016.9049072265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.336804389953613, + "rewards/margins": 28.20050621032715, + "rewards/rejected": -37.53730773925781, + "step": 2863 + }, + { + "epoch": 1.7816485225505443, + "grad_norm": 10.340002059936523, + "learning_rate": 2.2556477639465193e-06, + "logits/chosen": 2.4061026573181152, + "logits/rejected": 3.0818214416503906, + "logps/chosen": -627.86572265625, + "logps/rejected": -845.392333984375, + "loss": 0.0531, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.683109283447266, + "rewards/margins": 19.959367752075195, + "rewards/rejected": -30.642478942871094, + "step": 2864 + }, + { + "epoch": 1.7822706065318819, + "grad_norm": 1.0460494756698608, + "learning_rate": 2.2544951590594745e-06, + "logits/chosen": 1.9467957019805908, + "logits/rejected": 3.067915439605713, + "logps/chosen": -678.021484375, + "logps/rejected": -1033.32421875, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.412694931030273, + "rewards/margins": 28.45948600769043, + "rewards/rejected": -35.87217712402344, + "step": 2865 + }, + { + "epoch": 1.7828926905132194, + "grad_norm": 0.0009686941630207002, + "learning_rate": 2.2533425541724297e-06, + "logits/chosen": -0.48395222425460815, + "logits/rejected": 2.3878097534179688, + "logps/chosen": -477.0278015136719, + "logps/rejected": -884.94580078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.3412346839904785, + "rewards/margins": 25.06332015991211, + "rewards/rejected": -32.40455627441406, + "step": 2866 + }, + { + "epoch": 1.7835147744945568, + "grad_norm": 1.9869872331619263, + "learning_rate": 2.2521899492853854e-06, + "logits/chosen": -0.11884996294975281, + "logits/rejected": 4.540460586547852, + "logps/chosen": -442.8230895996094, + "logps/rejected": -1045.760986328125, + "loss": 0.008, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.427252769470215, + "rewards/margins": 27.349628448486328, + "rewards/rejected": -34.776878356933594, + "step": 2867 + }, + { + "epoch": 1.784136858475894, + "grad_norm": 5.745584964752197, + "learning_rate": 2.2510373443983406e-06, + "logits/chosen": -0.665834903717041, + "logits/rejected": 2.0192759037017822, + "logps/chosen": -589.6995849609375, + "logps/rejected": -1074.5523681640625, + "loss": 0.0391, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.3974528312683105, + "rewards/margins": 25.565839767456055, + "rewards/rejected": -32.96329116821289, + "step": 2868 + }, + { + "epoch": 1.7847589424572319, + "grad_norm": 0.004010713193565607, + "learning_rate": 2.249884739511296e-06, + "logits/chosen": 0.010303676128387451, + "logits/rejected": 4.516445159912109, + "logps/chosen": -394.00146484375, + "logps/rejected": -921.01611328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.836312770843506, + "rewards/margins": 24.702938079833984, + "rewards/rejected": -28.539249420166016, + "step": 2869 + }, + { + "epoch": 1.7853810264385692, + "grad_norm": 0.0002325717214262113, + "learning_rate": 2.248732134624251e-06, + "logits/chosen": 0.8579131364822388, + "logits/rejected": 4.581618785858154, + "logps/chosen": -591.1415405273438, + "logps/rejected": -1070.2159423828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.982126235961914, + "rewards/margins": 30.17152214050293, + "rewards/rejected": -42.153648376464844, + "step": 2870 + }, + { + "epoch": 1.7860031104199066, + "grad_norm": 0.08633695542812347, + "learning_rate": 2.2475795297372063e-06, + "logits/chosen": 2.029080390930176, + "logits/rejected": 4.249919414520264, + "logps/chosen": -643.3477172851562, + "logps/rejected": -1013.6303100585938, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.15836238861084, + "rewards/margins": 28.098350524902344, + "rewards/rejected": -37.256710052490234, + "step": 2871 + }, + { + "epoch": 1.7866251944012441, + "grad_norm": 0.13996653258800507, + "learning_rate": 2.2464269248501615e-06, + "logits/chosen": -0.4132803678512573, + "logits/rejected": 3.320643424987793, + "logps/chosen": -524.7666015625, + "logps/rejected": -1031.8675537109375, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.282973289489746, + "rewards/margins": 30.53451156616211, + "rewards/rejected": -36.81748580932617, + "step": 2872 + }, + { + "epoch": 1.7872472783825817, + "grad_norm": 0.4101029932498932, + "learning_rate": 2.2452743199631167e-06, + "logits/chosen": 1.6613099575042725, + "logits/rejected": 4.061172008514404, + "logps/chosen": -518.1925048828125, + "logps/rejected": -979.4036865234375, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.9535813331604, + "rewards/margins": 31.052183151245117, + "rewards/rejected": -38.005767822265625, + "step": 2873 + }, + { + "epoch": 1.787869362363919, + "grad_norm": 0.15557296574115753, + "learning_rate": 2.2441217150760724e-06, + "logits/chosen": 1.591304063796997, + "logits/rejected": 0.4381207227706909, + "logps/chosen": -690.9009399414062, + "logps/rejected": -943.1353149414062, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.526205062866211, + "rewards/margins": 27.176612854003906, + "rewards/rejected": -38.70281982421875, + "step": 2874 + }, + { + "epoch": 1.7884914463452566, + "grad_norm": 16.046310424804688, + "learning_rate": 2.2429691101890276e-06, + "logits/chosen": 2.2174692153930664, + "logits/rejected": 4.0461297035217285, + "logps/chosen": -761.5391845703125, + "logps/rejected": -1035.3385009765625, + "loss": 0.063, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.166966438293457, + "rewards/margins": 18.014434814453125, + "rewards/rejected": -28.181400299072266, + "step": 2875 + }, + { + "epoch": 1.7891135303265941, + "grad_norm": 1.2151827812194824, + "learning_rate": 2.241816505301983e-06, + "logits/chosen": 0.9016231298446655, + "logits/rejected": 3.3903098106384277, + "logps/chosen": -545.459716796875, + "logps/rejected": -949.28466796875, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.686226844787598, + "rewards/margins": 25.988252639770508, + "rewards/rejected": -33.67448043823242, + "step": 2876 + }, + { + "epoch": 1.7897356143079315, + "grad_norm": 0.03736743703484535, + "learning_rate": 2.240663900414938e-06, + "logits/chosen": 0.6556278467178345, + "logits/rejected": 2.400874137878418, + "logps/chosen": -350.68231201171875, + "logps/rejected": -622.5861206054688, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.515726089477539, + "rewards/margins": 19.54558753967285, + "rewards/rejected": -25.06131362915039, + "step": 2877 + }, + { + "epoch": 1.790357698289269, + "grad_norm": 7.153614569688216e-05, + "learning_rate": 2.2395112955278933e-06, + "logits/chosen": -0.9888482093811035, + "logits/rejected": 3.460984230041504, + "logps/chosen": -356.6788330078125, + "logps/rejected": -922.9345703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.053175449371338, + "rewards/margins": 25.149484634399414, + "rewards/rejected": -30.202659606933594, + "step": 2878 + }, + { + "epoch": 1.7909797822706066, + "grad_norm": 4.209404869470745e-05, + "learning_rate": 2.2383586906408485e-06, + "logits/chosen": 0.8455231189727783, + "logits/rejected": 4.013948440551758, + "logps/chosen": -500.7359619140625, + "logps/rejected": -1064.920166015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.406131744384766, + "rewards/margins": 30.79470443725586, + "rewards/rejected": -38.200836181640625, + "step": 2879 + }, + { + "epoch": 1.791601866251944, + "grad_norm": 0.6016518473625183, + "learning_rate": 2.2372060857538037e-06, + "logits/chosen": -1.218041181564331, + "logits/rejected": 1.918828010559082, + "logps/chosen": -378.580810546875, + "logps/rejected": -893.1238403320312, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.370937824249268, + "rewards/margins": 26.3651065826416, + "rewards/rejected": -31.736042022705078, + "step": 2880 + }, + { + "epoch": 1.7922239502332815, + "grad_norm": 0.06199437752366066, + "learning_rate": 2.236053480866759e-06, + "logits/chosen": 1.2523444890975952, + "logits/rejected": 3.574066162109375, + "logps/chosen": -564.341064453125, + "logps/rejected": -926.0437622070312, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.147315502166748, + "rewards/margins": 22.847570419311523, + "rewards/rejected": -27.994884490966797, + "step": 2881 + }, + { + "epoch": 1.792846034214619, + "grad_norm": 0.20693425834178925, + "learning_rate": 2.2349008759797146e-06, + "logits/chosen": 0.8386412262916565, + "logits/rejected": 2.8027706146240234, + "logps/chosen": -561.4254150390625, + "logps/rejected": -894.841796875, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.587484359741211, + "rewards/margins": 16.834026336669922, + "rewards/rejected": -24.421510696411133, + "step": 2882 + }, + { + "epoch": 1.7934681181959564, + "grad_norm": 0.0012074284022673965, + "learning_rate": 2.23374827109267e-06, + "logits/chosen": -1.2442289590835571, + "logits/rejected": 3.0291409492492676, + "logps/chosen": -401.638916015625, + "logps/rejected": -942.6603393554688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.9297685623168945, + "rewards/margins": 30.98455810546875, + "rewards/rejected": -38.91432189941406, + "step": 2883 + }, + { + "epoch": 1.794090202177294, + "grad_norm": 0.03824201226234436, + "learning_rate": 2.232595666205625e-06, + "logits/chosen": 0.026376813650131226, + "logits/rejected": 4.139131546020508, + "logps/chosen": -498.52325439453125, + "logps/rejected": -1008.8857421875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.575507164001465, + "rewards/margins": 26.977291107177734, + "rewards/rejected": -34.552799224853516, + "step": 2884 + }, + { + "epoch": 1.7947122861586315, + "grad_norm": 0.002526791300624609, + "learning_rate": 2.2314430613185803e-06, + "logits/chosen": 0.8544110655784607, + "logits/rejected": 3.109708070755005, + "logps/chosen": -500.810791015625, + "logps/rejected": -909.2841796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.763897895812988, + "rewards/margins": 28.4268798828125, + "rewards/rejected": -33.19077682495117, + "step": 2885 + }, + { + "epoch": 1.7953343701399689, + "grad_norm": 0.004643771797418594, + "learning_rate": 2.2302904564315355e-06, + "logits/chosen": -0.9598997831344604, + "logits/rejected": 4.1503777503967285, + "logps/chosen": -420.0594482421875, + "logps/rejected": -983.3630981445312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.483625888824463, + "rewards/margins": 30.00240707397461, + "rewards/rejected": -35.48603439331055, + "step": 2886 + }, + { + "epoch": 1.7959564541213062, + "grad_norm": 0.023572798818349838, + "learning_rate": 2.2291378515444907e-06, + "logits/chosen": 0.6749930381774902, + "logits/rejected": 3.5571882724761963, + "logps/chosen": -455.864501953125, + "logps/rejected": -821.3405151367188, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.834256172180176, + "rewards/margins": 26.151960372924805, + "rewards/rejected": -31.986217498779297, + "step": 2887 + }, + { + "epoch": 1.796578538102644, + "grad_norm": 17.47933578491211, + "learning_rate": 2.227985246657446e-06, + "logits/chosen": 1.6231681108474731, + "logits/rejected": 3.904000997543335, + "logps/chosen": -675.1061401367188, + "logps/rejected": -1025.37939453125, + "loss": 0.1002, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.872825860977173, + "rewards/margins": 26.913835525512695, + "rewards/rejected": -30.786663055419922, + "step": 2888 + }, + { + "epoch": 1.7972006220839813, + "grad_norm": 7.277772510860814e-06, + "learning_rate": 2.2268326417704016e-06, + "logits/chosen": 0.35814934968948364, + "logits/rejected": 2.3936288356781006, + "logps/chosen": -615.1605224609375, + "logps/rejected": -1072.79541015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.465442657470703, + "rewards/margins": 29.792701721191406, + "rewards/rejected": -39.25814437866211, + "step": 2889 + }, + { + "epoch": 1.7978227060653187, + "grad_norm": 0.00016307276382576674, + "learning_rate": 2.2256800368833564e-06, + "logits/chosen": 0.3945973515510559, + "logits/rejected": 3.6338953971862793, + "logps/chosen": -555.1265869140625, + "logps/rejected": -1015.112548828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.311851501464844, + "rewards/margins": 23.719341278076172, + "rewards/rejected": -32.03119659423828, + "step": 2890 + }, + { + "epoch": 1.7984447900466562, + "grad_norm": 0.0038027490954846144, + "learning_rate": 2.2245274319963116e-06, + "logits/chosen": 0.6874729990959167, + "logits/rejected": 2.82660174369812, + "logps/chosen": -535.7744140625, + "logps/rejected": -943.1077880859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.335187911987305, + "rewards/margins": 28.816408157348633, + "rewards/rejected": -38.15159606933594, + "step": 2891 + }, + { + "epoch": 1.7990668740279938, + "grad_norm": 1.2254604371264577e-06, + "learning_rate": 2.223374827109267e-06, + "logits/chosen": 2.0987632274627686, + "logits/rejected": 4.418678283691406, + "logps/chosen": -547.7008056640625, + "logps/rejected": -925.8990478515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.061464309692383, + "rewards/margins": 27.80548667907715, + "rewards/rejected": -35.86695098876953, + "step": 2892 + }, + { + "epoch": 1.7996889580093312, + "grad_norm": 0.05550967901945114, + "learning_rate": 2.222222222222222e-06, + "logits/chosen": -0.3484824299812317, + "logits/rejected": 2.3431625366210938, + "logps/chosen": -382.5537109375, + "logps/rejected": -790.096923828125, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.595591068267822, + "rewards/margins": 22.2501220703125, + "rewards/rejected": -26.845712661743164, + "step": 2893 + }, + { + "epoch": 1.8003110419906687, + "grad_norm": 1.9200422229914693e-07, + "learning_rate": 2.2210696173351777e-06, + "logits/chosen": 1.3279170989990234, + "logits/rejected": 2.890540838241577, + "logps/chosen": -693.2498779296875, + "logps/rejected": -1096.75, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.892668724060059, + "rewards/margins": 33.936702728271484, + "rewards/rejected": -45.82937240600586, + "step": 2894 + }, + { + "epoch": 1.8009331259720063, + "grad_norm": 0.023555485531687737, + "learning_rate": 2.219917012448133e-06, + "logits/chosen": 0.3982091546058655, + "logits/rejected": 2.7005228996276855, + "logps/chosen": -511.42529296875, + "logps/rejected": -1003.4261474609375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.058162689208984, + "rewards/margins": 32.33118438720703, + "rewards/rejected": -40.389347076416016, + "step": 2895 + }, + { + "epoch": 1.8015552099533436, + "grad_norm": 0.32222017645835876, + "learning_rate": 2.218764407561088e-06, + "logits/chosen": 2.824542999267578, + "logits/rejected": 3.9358267784118652, + "logps/chosen": -637.0455322265625, + "logps/rejected": -999.7955932617188, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.393836975097656, + "rewards/margins": 29.571090698242188, + "rewards/rejected": -37.96493148803711, + "step": 2896 + }, + { + "epoch": 1.8021772939346812, + "grad_norm": 5.3932286391500384e-05, + "learning_rate": 2.2176118026740434e-06, + "logits/chosen": 0.723086416721344, + "logits/rejected": 3.5180208683013916, + "logps/chosen": -524.3895263671875, + "logps/rejected": -977.4048461914062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.150003433227539, + "rewards/margins": 27.52701187133789, + "rewards/rejected": -36.6770133972168, + "step": 2897 + }, + { + "epoch": 1.8027993779160187, + "grad_norm": 1.2058066129684448, + "learning_rate": 2.2164591977869986e-06, + "logits/chosen": 3.3061118125915527, + "logits/rejected": 4.3138508796691895, + "logps/chosen": -613.2428588867188, + "logps/rejected": -913.3850708007812, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.50782585144043, + "rewards/margins": 20.202499389648438, + "rewards/rejected": -32.7103271484375, + "step": 2898 + }, + { + "epoch": 1.803421461897356, + "grad_norm": 0.019831910729408264, + "learning_rate": 2.215306592899954e-06, + "logits/chosen": -0.23215629160404205, + "logits/rejected": 0.6817746162414551, + "logps/chosen": -534.7474365234375, + "logps/rejected": -899.2763061523438, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.072684288024902, + "rewards/margins": 29.39617156982422, + "rewards/rejected": -36.46885681152344, + "step": 2899 + }, + { + "epoch": 1.8040435458786936, + "grad_norm": 0.02706284634768963, + "learning_rate": 2.214153988012909e-06, + "logits/chosen": 0.3160700798034668, + "logits/rejected": 4.163125991821289, + "logps/chosen": -294.6250915527344, + "logps/rejected": -732.25732421875, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.0198540687561035, + "rewards/margins": 23.535429000854492, + "rewards/rejected": -28.55528450012207, + "step": 2900 + }, + { + "epoch": 1.8046656298600312, + "grad_norm": 0.303785502910614, + "learning_rate": 2.2130013831258647e-06, + "logits/chosen": 0.5170513391494751, + "logits/rejected": 2.3560218811035156, + "logps/chosen": -593.2955322265625, + "logps/rejected": -927.296142578125, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.172282218933105, + "rewards/margins": 24.161550521850586, + "rewards/rejected": -32.333831787109375, + "step": 2901 + }, + { + "epoch": 1.8052877138413685, + "grad_norm": 39.95273208618164, + "learning_rate": 2.21184877823882e-06, + "logits/chosen": 0.7138420343399048, + "logits/rejected": 3.270815372467041, + "logps/chosen": -482.2649841308594, + "logps/rejected": -907.3980102539062, + "loss": 1.3448, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.218573570251465, + "rewards/margins": 23.526567459106445, + "rewards/rejected": -31.745140075683594, + "step": 2902 + }, + { + "epoch": 1.805909797822706, + "grad_norm": 0.3934561610221863, + "learning_rate": 2.210696173351775e-06, + "logits/chosen": -2.243908405303955, + "logits/rejected": 1.5933899879455566, + "logps/chosen": -375.66131591796875, + "logps/rejected": -851.6378784179688, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.891318321228027, + "rewards/margins": 26.80710792541504, + "rewards/rejected": -31.69842529296875, + "step": 2903 + }, + { + "epoch": 1.8065318818040437, + "grad_norm": 0.023749463260173798, + "learning_rate": 2.2095435684647304e-06, + "logits/chosen": 1.9159235954284668, + "logits/rejected": 3.28167724609375, + "logps/chosen": -548.1072387695312, + "logps/rejected": -928.1199951171875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.026692390441895, + "rewards/margins": 27.322006225585938, + "rewards/rejected": -35.348697662353516, + "step": 2904 + }, + { + "epoch": 1.807153965785381, + "grad_norm": 2.144883394241333, + "learning_rate": 2.2083909635776856e-06, + "logits/chosen": -0.10407549142837524, + "logits/rejected": 4.395783424377441, + "logps/chosen": -437.296630859375, + "logps/rejected": -834.9159545898438, + "loss": 0.0111, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.79471492767334, + "rewards/margins": 17.42055892944336, + "rewards/rejected": -27.215274810791016, + "step": 2905 + }, + { + "epoch": 1.8077760497667184, + "grad_norm": 5.723523432976663e-09, + "learning_rate": 2.207238358690641e-06, + "logits/chosen": -3.5189616680145264, + "logits/rejected": 4.164684295654297, + "logps/chosen": -288.29217529296875, + "logps/rejected": -1121.138427734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.8310956954956055, + "rewards/margins": 34.23997497558594, + "rewards/rejected": -42.071067810058594, + "step": 2906 + }, + { + "epoch": 1.8083981337480561, + "grad_norm": 5.7696780686455895e-08, + "learning_rate": 2.206085753803596e-06, + "logits/chosen": -1.8566597700119019, + "logits/rejected": 3.9218180179595947, + "logps/chosen": -331.6076354980469, + "logps/rejected": -1117.03857421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.040053844451904, + "rewards/margins": 40.208106994628906, + "rewards/rejected": -46.24816131591797, + "step": 2907 + }, + { + "epoch": 1.8090202177293935, + "grad_norm": 0.02445216104388237, + "learning_rate": 2.2049331489165517e-06, + "logits/chosen": -3.414188861846924, + "logits/rejected": 3.4807980060577393, + "logps/chosen": -381.92279052734375, + "logps/rejected": -999.1416625976562, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.60594367980957, + "rewards/margins": 25.19023895263672, + "rewards/rejected": -29.796180725097656, + "step": 2908 + }, + { + "epoch": 1.8096423017107308, + "grad_norm": 5.386016845703125, + "learning_rate": 2.203780544029507e-06, + "logits/chosen": -0.25347068905830383, + "logits/rejected": 2.615142345428467, + "logps/chosen": -591.55712890625, + "logps/rejected": -973.9298095703125, + "loss": 0.0213, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.074479103088379, + "rewards/margins": 24.832969665527344, + "rewards/rejected": -31.90744972229004, + "step": 2909 + }, + { + "epoch": 1.8102643856920684, + "grad_norm": 0.0004156986833550036, + "learning_rate": 2.202627939142462e-06, + "logits/chosen": 2.4946722984313965, + "logits/rejected": 3.7645764350891113, + "logps/chosen": -769.0728759765625, + "logps/rejected": -982.890869140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.196980476379395, + "rewards/margins": 22.227109909057617, + "rewards/rejected": -34.42408752441406, + "step": 2910 + }, + { + "epoch": 1.810886469673406, + "grad_norm": 0.0488068051636219, + "learning_rate": 2.2014753342554174e-06, + "logits/chosen": -0.22399890422821045, + "logits/rejected": 3.620121479034424, + "logps/chosen": -438.82940673828125, + "logps/rejected": -879.4267578125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.7626495361328125, + "rewards/margins": 23.414827346801758, + "rewards/rejected": -30.17747688293457, + "step": 2911 + }, + { + "epoch": 1.8115085536547433, + "grad_norm": 0.025857780128717422, + "learning_rate": 2.2003227293683726e-06, + "logits/chosen": -0.0801013708114624, + "logits/rejected": 3.76338791847229, + "logps/chosen": -366.40399169921875, + "logps/rejected": -822.9530029296875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.393184661865234, + "rewards/margins": 23.762935638427734, + "rewards/rejected": -31.15612030029297, + "step": 2912 + }, + { + "epoch": 1.8121306376360808, + "grad_norm": 2.441636004490988e-09, + "learning_rate": 2.199170124481328e-06, + "logits/chosen": 1.1459236145019531, + "logits/rejected": 4.223677158355713, + "logps/chosen": -656.9322509765625, + "logps/rejected": -1171.351318359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.336624145507812, + "rewards/margins": 33.33731460571289, + "rewards/rejected": -45.6739387512207, + "step": 2913 + }, + { + "epoch": 1.8127527216174184, + "grad_norm": 9.004888852359727e-06, + "learning_rate": 2.198017519594283e-06, + "logits/chosen": -1.1242594718933105, + "logits/rejected": 3.08487606048584, + "logps/chosen": -347.3804016113281, + "logps/rejected": -911.5828247070312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.066286563873291, + "rewards/margins": 29.527799606323242, + "rewards/rejected": -34.594085693359375, + "step": 2914 + }, + { + "epoch": 1.8133748055987557, + "grad_norm": 9.333149137091823e-06, + "learning_rate": 2.1968649147072387e-06, + "logits/chosen": 0.8879589438438416, + "logits/rejected": 2.641096353530884, + "logps/chosen": -611.4288330078125, + "logps/rejected": -1026.958251953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.871919631958008, + "rewards/margins": 27.208112716674805, + "rewards/rejected": -41.08003234863281, + "step": 2915 + }, + { + "epoch": 1.8139968895800933, + "grad_norm": 5.846538066864014, + "learning_rate": 2.195712309820194e-06, + "logits/chosen": 0.099272221326828, + "logits/rejected": 3.766204833984375, + "logps/chosen": -506.4801940917969, + "logps/rejected": -1010.7666015625, + "loss": 0.0213, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.206683158874512, + "rewards/margins": 27.946022033691406, + "rewards/rejected": -38.15270233154297, + "step": 2916 + }, + { + "epoch": 1.8146189735614309, + "grad_norm": 0.007777743507176638, + "learning_rate": 2.194559704933149e-06, + "logits/chosen": -0.36318880319595337, + "logits/rejected": 1.6212220191955566, + "logps/chosen": -473.5294189453125, + "logps/rejected": -822.025634765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.75701904296875, + "rewards/margins": 25.177345275878906, + "rewards/rejected": -31.934364318847656, + "step": 2917 + }, + { + "epoch": 1.8152410575427682, + "grad_norm": 3.3365886338287964e-05, + "learning_rate": 2.1934071000461044e-06, + "logits/chosen": 1.2993437051773071, + "logits/rejected": 3.2163896560668945, + "logps/chosen": -691.1989135742188, + "logps/rejected": -1104.626953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.866948127746582, + "rewards/margins": 35.912864685058594, + "rewards/rejected": -44.77981185913086, + "step": 2918 + }, + { + "epoch": 1.8158631415241058, + "grad_norm": 0.006478574126958847, + "learning_rate": 2.1922544951590596e-06, + "logits/chosen": -2.5405850410461426, + "logits/rejected": 1.3741693496704102, + "logps/chosen": -430.7892150878906, + "logps/rejected": -984.012451171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.790828704833984, + "rewards/margins": 29.503963470458984, + "rewards/rejected": -37.29479217529297, + "step": 2919 + }, + { + "epoch": 1.8164852255054433, + "grad_norm": 0.00021122633188497275, + "learning_rate": 2.191101890272015e-06, + "logits/chosen": -0.9110089540481567, + "logits/rejected": 3.0883960723876953, + "logps/chosen": -460.5048522949219, + "logps/rejected": -1125.7283935546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.782463073730469, + "rewards/margins": 36.10568618774414, + "rewards/rejected": -41.88814926147461, + "step": 2920 + }, + { + "epoch": 1.8171073094867807, + "grad_norm": 0.003954702522605658, + "learning_rate": 2.18994928538497e-06, + "logits/chosen": -2.058516025543213, + "logits/rejected": 2.3706719875335693, + "logps/chosen": -356.7776794433594, + "logps/rejected": -962.4862670898438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8709609508514404, + "rewards/margins": 34.379554748535156, + "rewards/rejected": -38.25051498413086, + "step": 2921 + }, + { + "epoch": 1.8177293934681182, + "grad_norm": 1.9064320440520532e-05, + "learning_rate": 2.1887966804979253e-06, + "logits/chosen": 0.29505228996276855, + "logits/rejected": 3.59993314743042, + "logps/chosen": -487.88775634765625, + "logps/rejected": -985.3267822265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.258945465087891, + "rewards/margins": 32.4913444519043, + "rewards/rejected": -38.75028991699219, + "step": 2922 + }, + { + "epoch": 1.8183514774494558, + "grad_norm": 28.823314666748047, + "learning_rate": 2.187644075610881e-06, + "logits/chosen": -0.5781339406967163, + "logits/rejected": 2.4247617721557617, + "logps/chosen": -513.3623657226562, + "logps/rejected": -1042.141845703125, + "loss": 0.4131, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.648505687713623, + "rewards/margins": 31.26897430419922, + "rewards/rejected": -38.917476654052734, + "step": 2923 + }, + { + "epoch": 1.8189735614307931, + "grad_norm": 0.005049367900937796, + "learning_rate": 2.186491470723836e-06, + "logits/chosen": 0.6467719078063965, + "logits/rejected": 4.127745151519775, + "logps/chosen": -454.96112060546875, + "logps/rejected": -907.7899169921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.762445449829102, + "rewards/margins": 27.605257034301758, + "rewards/rejected": -32.36770248413086, + "step": 2924 + }, + { + "epoch": 1.8195956454121305, + "grad_norm": 0.5330542325973511, + "learning_rate": 2.1853388658367914e-06, + "logits/chosen": -1.1520477533340454, + "logits/rejected": 3.201488971710205, + "logps/chosen": -368.5358581542969, + "logps/rejected": -902.8729248046875, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.463862419128418, + "rewards/margins": 32.56816864013672, + "rewards/rejected": -36.03202819824219, + "step": 2925 + }, + { + "epoch": 1.8202177293934683, + "grad_norm": 0.12580925226211548, + "learning_rate": 2.1841862609497466e-06, + "logits/chosen": 1.035218358039856, + "logits/rejected": 2.849552631378174, + "logps/chosen": -667.6565551757812, + "logps/rejected": -1076.946044921875, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.761651039123535, + "rewards/margins": 23.467363357543945, + "rewards/rejected": -34.22901153564453, + "step": 2926 + }, + { + "epoch": 1.8208398133748056, + "grad_norm": 0.020777558907866478, + "learning_rate": 2.183033656062702e-06, + "logits/chosen": 0.5492273569107056, + "logits/rejected": 3.572721481323242, + "logps/chosen": -448.18756103515625, + "logps/rejected": -810.0396728515625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.7897562980651855, + "rewards/margins": 19.51482582092285, + "rewards/rejected": -26.304582595825195, + "step": 2927 + }, + { + "epoch": 1.821461897356143, + "grad_norm": 5.220741149969399e-05, + "learning_rate": 2.181881051175657e-06, + "logits/chosen": 1.1220362186431885, + "logits/rejected": 3.1610453128814697, + "logps/chosen": -531.509521484375, + "logps/rejected": -936.9639892578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.363021373748779, + "rewards/margins": 30.606876373291016, + "rewards/rejected": -36.96989822387695, + "step": 2928 + }, + { + "epoch": 1.8220839813374805, + "grad_norm": 0.004893193952739239, + "learning_rate": 2.1807284462886123e-06, + "logits/chosen": 0.5991615653038025, + "logits/rejected": 4.349677085876465, + "logps/chosen": -508.3122253417969, + "logps/rejected": -951.9404907226562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.923589706420898, + "rewards/margins": 27.25507164001465, + "rewards/rejected": -35.17866134643555, + "step": 2929 + }, + { + "epoch": 1.822706065318818, + "grad_norm": 0.015286240726709366, + "learning_rate": 2.179575841401568e-06, + "logits/chosen": 0.10364526510238647, + "logits/rejected": 3.790127992630005, + "logps/chosen": -611.4168701171875, + "logps/rejected": -1101.678955078125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.22943115234375, + "rewards/margins": 28.932636260986328, + "rewards/rejected": -37.16206741333008, + "step": 2930 + }, + { + "epoch": 1.8233281493001554, + "grad_norm": 2.1218961876456888e-07, + "learning_rate": 2.178423236514523e-06, + "logits/chosen": -2.1993157863616943, + "logits/rejected": 3.7309844493865967, + "logps/chosen": -365.35125732421875, + "logps/rejected": -1016.9790649414062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.300674438476562, + "rewards/margins": 31.86829376220703, + "rewards/rejected": -40.168968200683594, + "step": 2931 + }, + { + "epoch": 1.823950233281493, + "grad_norm": 0.011282279156148434, + "learning_rate": 2.1772706316274784e-06, + "logits/chosen": -1.7666422128677368, + "logits/rejected": 1.8686922788619995, + "logps/chosen": -422.510498046875, + "logps/rejected": -993.54248046875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.541168212890625, + "rewards/margins": 30.807161331176758, + "rewards/rejected": -40.34832763671875, + "step": 2932 + }, + { + "epoch": 1.8245723172628305, + "grad_norm": 0.0015131094260141253, + "learning_rate": 2.1761180267404336e-06, + "logits/chosen": -1.056628942489624, + "logits/rejected": 3.1450870037078857, + "logps/chosen": -361.94989013671875, + "logps/rejected": -995.957275390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.591294765472412, + "rewards/margins": 30.778053283691406, + "rewards/rejected": -36.36935043334961, + "step": 2933 + }, + { + "epoch": 1.8251944012441679, + "grad_norm": 1.8307373523712158, + "learning_rate": 2.174965421853389e-06, + "logits/chosen": -0.6615819931030273, + "logits/rejected": 2.4734442234039307, + "logps/chosen": -519.839599609375, + "logps/rejected": -949.9840087890625, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.32272720336914, + "rewards/margins": 23.967910766601562, + "rewards/rejected": -33.29063415527344, + "step": 2934 + }, + { + "epoch": 1.8258164852255054, + "grad_norm": 0.0006201690994203091, + "learning_rate": 2.173812816966344e-06, + "logits/chosen": 4.023318290710449, + "logits/rejected": 3.745248317718506, + "logps/chosen": -664.39013671875, + "logps/rejected": -877.3580322265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.520995140075684, + "rewards/margins": 26.294607162475586, + "rewards/rejected": -31.815603256225586, + "step": 2935 + }, + { + "epoch": 1.826438569206843, + "grad_norm": 0.00011524202272994444, + "learning_rate": 2.1726602120792993e-06, + "logits/chosen": 0.4154716730117798, + "logits/rejected": 4.476164817810059, + "logps/chosen": -483.99884033203125, + "logps/rejected": -1042.3800048828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.124652862548828, + "rewards/margins": 30.8934326171875, + "rewards/rejected": -39.01808166503906, + "step": 2936 + }, + { + "epoch": 1.8270606531881803, + "grad_norm": 6.123228073120117, + "learning_rate": 2.171507607192255e-06, + "logits/chosen": 2.2037289142608643, + "logits/rejected": 2.833683490753174, + "logps/chosen": -729.1688842773438, + "logps/rejected": -1059.06494140625, + "loss": 0.0275, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.01978588104248, + "rewards/margins": 24.942218780517578, + "rewards/rejected": -34.962005615234375, + "step": 2937 + }, + { + "epoch": 1.827682737169518, + "grad_norm": 1.062633714354888e-06, + "learning_rate": 2.17035500230521e-06, + "logits/chosen": 0.33711880445480347, + "logits/rejected": 3.4437479972839355, + "logps/chosen": -421.8076171875, + "logps/rejected": -1000.681640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.936567306518555, + "rewards/margins": 35.271427154541016, + "rewards/rejected": -44.2079963684082, + "step": 2938 + }, + { + "epoch": 1.8283048211508555, + "grad_norm": 1.1224675178527832, + "learning_rate": 2.1692023974181654e-06, + "logits/chosen": -2.050677537918091, + "logits/rejected": 2.308436870574951, + "logps/chosen": -409.9349060058594, + "logps/rejected": -905.05419921875, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.486188888549805, + "rewards/margins": 25.246501922607422, + "rewards/rejected": -32.732688903808594, + "step": 2939 + }, + { + "epoch": 1.8289269051321928, + "grad_norm": 6.059678554534912, + "learning_rate": 2.1680497925311206e-06, + "logits/chosen": 1.1956887245178223, + "logits/rejected": 3.2358450889587402, + "logps/chosen": -490.0911865234375, + "logps/rejected": -807.5560913085938, + "loss": 0.0499, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.075892448425293, + "rewards/margins": 21.591156005859375, + "rewards/rejected": -27.667049407958984, + "step": 2940 + }, + { + "epoch": 1.8295489891135304, + "grad_norm": 0.0003045987687073648, + "learning_rate": 2.166897187644076e-06, + "logits/chosen": 1.3800091743469238, + "logits/rejected": 3.362034797668457, + "logps/chosen": -597.3182373046875, + "logps/rejected": -1002.1866455078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.463356018066406, + "rewards/margins": 30.19609832763672, + "rewards/rejected": -43.659454345703125, + "step": 2941 + }, + { + "epoch": 1.830171073094868, + "grad_norm": 0.0026972335763275623, + "learning_rate": 2.165744582757031e-06, + "logits/chosen": -1.7000247240066528, + "logits/rejected": 3.2369625568389893, + "logps/chosen": -315.9631652832031, + "logps/rejected": -950.5713500976562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.224560737609863, + "rewards/margins": 28.419212341308594, + "rewards/rejected": -33.64377212524414, + "step": 2942 + }, + { + "epoch": 1.8307931570762053, + "grad_norm": 0.0006514900014735758, + "learning_rate": 2.1645919778699863e-06, + "logits/chosen": 1.6406525373458862, + "logits/rejected": 2.945081949234009, + "logps/chosen": -550.1603393554688, + "logps/rejected": -932.3032836914062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.099654197692871, + "rewards/margins": 28.81879425048828, + "rewards/rejected": -37.91844940185547, + "step": 2943 + }, + { + "epoch": 1.8314152410575426, + "grad_norm": 3.3904993534088135, + "learning_rate": 2.1634393729829415e-06, + "logits/chosen": 1.3965401649475098, + "logits/rejected": 2.7653379440307617, + "logps/chosen": -546.5840454101562, + "logps/rejected": -861.8883666992188, + "loss": 0.0332, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.872170925140381, + "rewards/margins": 23.634138107299805, + "rewards/rejected": -30.506309509277344, + "step": 2944 + }, + { + "epoch": 1.8320373250388804, + "grad_norm": 0.0024480209685862064, + "learning_rate": 2.162286768095897e-06, + "logits/chosen": -0.25313782691955566, + "logits/rejected": 2.7993435859680176, + "logps/chosen": -516.3167724609375, + "logps/rejected": -857.3118286132812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.381752967834473, + "rewards/margins": 21.019704818725586, + "rewards/rejected": -29.401458740234375, + "step": 2945 + }, + { + "epoch": 1.8326594090202177, + "grad_norm": 2.8514961503134373e-09, + "learning_rate": 2.1611341632088524e-06, + "logits/chosen": -1.2172906398773193, + "logits/rejected": 2.9055662155151367, + "logps/chosen": -480.29754638671875, + "logps/rejected": -1059.2327880859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.863322734832764, + "rewards/margins": 32.67042922973633, + "rewards/rejected": -40.53375244140625, + "step": 2946 + }, + { + "epoch": 1.833281493001555, + "grad_norm": 3.847206971840933e-05, + "learning_rate": 2.1599815583218076e-06, + "logits/chosen": 0.5809682011604309, + "logits/rejected": 3.172386646270752, + "logps/chosen": -445.71368408203125, + "logps/rejected": -870.1534423828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.905163764953613, + "rewards/margins": 29.116558074951172, + "rewards/rejected": -36.02172088623047, + "step": 2947 + }, + { + "epoch": 1.8339035769828926, + "grad_norm": 0.11857543140649796, + "learning_rate": 2.158828953434763e-06, + "logits/chosen": -0.6377525329589844, + "logits/rejected": 3.9645566940307617, + "logps/chosen": -480.59234619140625, + "logps/rejected": -1024.75390625, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.692419528961182, + "rewards/margins": 25.763336181640625, + "rewards/rejected": -31.45575523376465, + "step": 2948 + }, + { + "epoch": 1.8345256609642302, + "grad_norm": 5.8594279289245605, + "learning_rate": 2.157676348547718e-06, + "logits/chosen": -2.2111575603485107, + "logits/rejected": 3.7058584690093994, + "logps/chosen": -376.0509338378906, + "logps/rejected": -938.4148559570312, + "loss": 0.0523, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.637496948242188, + "rewards/margins": 27.1528377532959, + "rewards/rejected": -37.79033279418945, + "step": 2949 + }, + { + "epoch": 1.8351477449455675, + "grad_norm": 0.3535236716270447, + "learning_rate": 2.1565237436606733e-06, + "logits/chosen": 0.1966608762741089, + "logits/rejected": 2.6298699378967285, + "logps/chosen": -603.3723754882812, + "logps/rejected": -920.42138671875, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.434402465820312, + "rewards/margins": 24.166688919067383, + "rewards/rejected": -34.60109329223633, + "step": 2950 + }, + { + "epoch": 1.835769828926905, + "grad_norm": 1.51795320562087e-05, + "learning_rate": 2.1553711387736285e-06, + "logits/chosen": -1.9474167823791504, + "logits/rejected": 1.8401274681091309, + "logps/chosen": -547.075927734375, + "logps/rejected": -1213.5771484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.760676383972168, + "rewards/margins": 37.2432746887207, + "rewards/rejected": -52.00395202636719, + "step": 2951 + }, + { + "epoch": 1.8363919129082427, + "grad_norm": 0.005809712689369917, + "learning_rate": 2.154218533886584e-06, + "logits/chosen": -1.4853514432907104, + "logits/rejected": 3.4599609375, + "logps/chosen": -325.3935852050781, + "logps/rejected": -844.361572265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.286863327026367, + "rewards/margins": 23.762916564941406, + "rewards/rejected": -30.049781799316406, + "step": 2952 + }, + { + "epoch": 1.83701399688958, + "grad_norm": 1.4774353076063562e-05, + "learning_rate": 2.1530659289995394e-06, + "logits/chosen": 0.5670173168182373, + "logits/rejected": 3.9193413257598877, + "logps/chosen": -564.1218872070312, + "logps/rejected": -1089.5821533203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.208772659301758, + "rewards/margins": 39.558135986328125, + "rewards/rejected": -50.766910552978516, + "step": 2953 + }, + { + "epoch": 1.8376360808709176, + "grad_norm": 0.0008804783574305475, + "learning_rate": 2.1519133241124946e-06, + "logits/chosen": -1.3862698078155518, + "logits/rejected": 4.039997577667236, + "logps/chosen": -367.4657287597656, + "logps/rejected": -995.6171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9530906677246094, + "rewards/margins": 34.095638275146484, + "rewards/rejected": -38.04873275756836, + "step": 2954 + }, + { + "epoch": 1.8382581648522551, + "grad_norm": 0.0034357395488768816, + "learning_rate": 2.15076071922545e-06, + "logits/chosen": 0.04776197671890259, + "logits/rejected": 2.4046387672424316, + "logps/chosen": -530.6160888671875, + "logps/rejected": -937.1896362304688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.533966064453125, + "rewards/margins": 22.469799041748047, + "rewards/rejected": -36.003761291503906, + "step": 2955 + }, + { + "epoch": 1.8388802488335925, + "grad_norm": 1.5653957234462723e-05, + "learning_rate": 2.149608114338405e-06, + "logits/chosen": -0.19805996119976044, + "logits/rejected": 3.1254231929779053, + "logps/chosen": -556.4609375, + "logps/rejected": -1079.94775390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.580520629882812, + "rewards/margins": 30.80333137512207, + "rewards/rejected": -43.38385009765625, + "step": 2956 + }, + { + "epoch": 1.83950233281493, + "grad_norm": 27.43037986755371, + "learning_rate": 2.1484555094513603e-06, + "logits/chosen": 2.576160430908203, + "logits/rejected": 3.2386300563812256, + "logps/chosen": -607.7432861328125, + "logps/rejected": -908.0758056640625, + "loss": 0.374, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.76134967803955, + "rewards/margins": 23.454696655273438, + "rewards/rejected": -34.21604919433594, + "step": 2957 + }, + { + "epoch": 1.8401244167962676, + "grad_norm": 6.398452281951904, + "learning_rate": 2.1473029045643155e-06, + "logits/chosen": -0.6255086064338684, + "logits/rejected": 3.2505578994750977, + "logps/chosen": -477.28216552734375, + "logps/rejected": -866.9014892578125, + "loss": 0.0419, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.571666717529297, + "rewards/margins": 24.20078468322754, + "rewards/rejected": -32.7724494934082, + "step": 2958 + }, + { + "epoch": 1.840746500777605, + "grad_norm": 7.195072157628601e-06, + "learning_rate": 2.1461502996772707e-06, + "logits/chosen": -1.8814363479614258, + "logits/rejected": 3.9996814727783203, + "logps/chosen": -305.8305969238281, + "logps/rejected": -1075.4239501953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.674073219299316, + "rewards/margins": 39.340084075927734, + "rewards/rejected": -44.01416015625, + "step": 2959 + }, + { + "epoch": 1.8413685847589425, + "grad_norm": 3.382543127372628e-06, + "learning_rate": 2.144997694790226e-06, + "logits/chosen": -0.4814430773258209, + "logits/rejected": 3.463700771331787, + "logps/chosen": -494.40863037109375, + "logps/rejected": -1069.7625732421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.398022651672363, + "rewards/margins": 35.85649108886719, + "rewards/rejected": -44.2545166015625, + "step": 2960 + }, + { + "epoch": 1.84199066874028, + "grad_norm": 0.47370555996894836, + "learning_rate": 2.143845089903181e-06, + "logits/chosen": -2.028714895248413, + "logits/rejected": 3.4398715496063232, + "logps/chosen": -326.89898681640625, + "logps/rejected": -946.994873046875, + "loss": 0.0052, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.010538101196289, + "rewards/margins": 31.887008666992188, + "rewards/rejected": -36.897544860839844, + "step": 2961 + }, + { + "epoch": 1.8426127527216174, + "grad_norm": 0.9828805923461914, + "learning_rate": 2.1426924850161364e-06, + "logits/chosen": 2.851536750793457, + "logits/rejected": 3.195272445678711, + "logps/chosen": -591.3557739257812, + "logps/rejected": -907.9046630859375, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.238224029541016, + "rewards/margins": 25.548540115356445, + "rewards/rejected": -37.78676223754883, + "step": 2962 + }, + { + "epoch": 1.8432348367029547, + "grad_norm": 2.494321194035365e-08, + "learning_rate": 2.1415398801290916e-06, + "logits/chosen": 0.753627359867096, + "logits/rejected": 3.689297914505005, + "logps/chosen": -475.31829833984375, + "logps/rejected": -1157.6439208984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.471179008483887, + "rewards/margins": 39.312171936035156, + "rewards/rejected": -47.783355712890625, + "step": 2963 + }, + { + "epoch": 1.8438569206842925, + "grad_norm": 0.002211152808740735, + "learning_rate": 2.1403872752420473e-06, + "logits/chosen": 0.5270100831985474, + "logits/rejected": 2.783477306365967, + "logps/chosen": -550.474609375, + "logps/rejected": -890.4298706054688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.78561782836914, + "rewards/margins": 24.508562088012695, + "rewards/rejected": -36.29418182373047, + "step": 2964 + }, + { + "epoch": 1.8444790046656299, + "grad_norm": 21.535110473632812, + "learning_rate": 2.1392346703550025e-06, + "logits/chosen": -2.338355302810669, + "logits/rejected": 0.6437476277351379, + "logps/chosen": -490.95294189453125, + "logps/rejected": -1015.9381103515625, + "loss": 0.1576, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.412555694580078, + "rewards/margins": 26.055999755859375, + "rewards/rejected": -38.46855545043945, + "step": 2965 + }, + { + "epoch": 1.8451010886469672, + "grad_norm": 1.2399395018292125e-05, + "learning_rate": 2.1380820654679577e-06, + "logits/chosen": 1.6357576847076416, + "logits/rejected": 3.4656600952148438, + "logps/chosen": -639.3941040039062, + "logps/rejected": -1004.3380737304688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.001319885253906, + "rewards/margins": 29.696578979492188, + "rewards/rejected": -37.697898864746094, + "step": 2966 + }, + { + "epoch": 1.845723172628305, + "grad_norm": 48.81344985961914, + "learning_rate": 2.136929460580913e-06, + "logits/chosen": 2.8996856212615967, + "logits/rejected": 3.9725353717803955, + "logps/chosen": -725.06103515625, + "logps/rejected": -951.90576171875, + "loss": 2.1172, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.822303771972656, + "rewards/margins": 21.377300262451172, + "rewards/rejected": -32.19960403442383, + "step": 2967 + }, + { + "epoch": 1.8463452566096423, + "grad_norm": 0.06352223455905914, + "learning_rate": 2.135776855693868e-06, + "logits/chosen": -1.567836046218872, + "logits/rejected": 3.9059090614318848, + "logps/chosen": -474.1429443359375, + "logps/rejected": -1246.232666015625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.337193489074707, + "rewards/margins": 42.75863265991211, + "rewards/rejected": -49.0958251953125, + "step": 2968 + }, + { + "epoch": 1.8469673405909797, + "grad_norm": 0.023700682446360588, + "learning_rate": 2.1346242508068234e-06, + "logits/chosen": 2.8687596321105957, + "logits/rejected": 2.6395351886749268, + "logps/chosen": -682.3429565429688, + "logps/rejected": -916.9752807617188, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.061613082885742, + "rewards/margins": 25.777587890625, + "rewards/rejected": -34.839202880859375, + "step": 2969 + }, + { + "epoch": 1.8475894245723172, + "grad_norm": 0.2039632946252823, + "learning_rate": 2.1334716459197786e-06, + "logits/chosen": 1.3555502891540527, + "logits/rejected": 4.361049175262451, + "logps/chosen": -466.4350891113281, + "logps/rejected": -843.00341796875, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.455344200134277, + "rewards/margins": 20.132835388183594, + "rewards/rejected": -31.588180541992188, + "step": 2970 + }, + { + "epoch": 1.8482115085536548, + "grad_norm": 2.145111999141136e-08, + "learning_rate": 2.1323190410327343e-06, + "logits/chosen": -0.839304506778717, + "logits/rejected": 2.2905335426330566, + "logps/chosen": -520.3160400390625, + "logps/rejected": -1045.048095703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.831880569458008, + "rewards/margins": 34.59035873413086, + "rewards/rejected": -42.4222412109375, + "step": 2971 + }, + { + "epoch": 1.8488335925349921, + "grad_norm": 15.03640365600586, + "learning_rate": 2.1311664361456895e-06, + "logits/chosen": 0.5526972413063049, + "logits/rejected": 2.9396677017211914, + "logps/chosen": -543.9615478515625, + "logps/rejected": -1039.202392578125, + "loss": 0.0443, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.499490737915039, + "rewards/margins": 31.284423828125, + "rewards/rejected": -43.783912658691406, + "step": 2972 + }, + { + "epoch": 1.8494556765163297, + "grad_norm": 6.252834339193214e-08, + "learning_rate": 2.1300138312586447e-06, + "logits/chosen": -1.3490384817123413, + "logits/rejected": 1.9333854913711548, + "logps/chosen": -523.3990478515625, + "logps/rejected": -1060.8382568359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.62447738647461, + "rewards/margins": 31.01898765563965, + "rewards/rejected": -40.643463134765625, + "step": 2973 + }, + { + "epoch": 1.8500777604976673, + "grad_norm": 0.47883057594299316, + "learning_rate": 2.1288612263716e-06, + "logits/chosen": 0.83540278673172, + "logits/rejected": 3.4483134746551514, + "logps/chosen": -649.0042724609375, + "logps/rejected": -1084.9862060546875, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.918928146362305, + "rewards/margins": 27.286611557006836, + "rewards/rejected": -37.20553970336914, + "step": 2974 + }, + { + "epoch": 1.8506998444790046, + "grad_norm": 0.08642778545618057, + "learning_rate": 2.127708621484555e-06, + "logits/chosen": 1.8379980325698853, + "logits/rejected": 2.3691537380218506, + "logps/chosen": -613.3511962890625, + "logps/rejected": -848.9517211914062, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.631416320800781, + "rewards/margins": 22.10049819946289, + "rewards/rejected": -31.731916427612305, + "step": 2975 + }, + { + "epoch": 1.8513219284603422, + "grad_norm": 37.839088439941406, + "learning_rate": 2.1265560165975104e-06, + "logits/chosen": 1.2364885807037354, + "logits/rejected": 2.4372706413269043, + "logps/chosen": -585.241455078125, + "logps/rejected": -831.3947143554688, + "loss": 0.5229, + "rewards/accuracies": 0.75, + "rewards/chosen": -10.798276901245117, + "rewards/margins": 21.3815860748291, + "rewards/rejected": -32.17986297607422, + "step": 2976 + }, + { + "epoch": 1.8519440124416797, + "grad_norm": 0.09387121349573135, + "learning_rate": 2.1254034117104656e-06, + "logits/chosen": 0.7087660431861877, + "logits/rejected": 3.379647970199585, + "logps/chosen": -636.924072265625, + "logps/rejected": -1199.20947265625, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.0158891677856445, + "rewards/margins": 37.23964309692383, + "rewards/rejected": -43.255531311035156, + "step": 2977 + }, + { + "epoch": 1.852566096423017, + "grad_norm": 11.74792766571045, + "learning_rate": 2.1242508068234213e-06, + "logits/chosen": 0.6603624820709229, + "logits/rejected": 3.59389591217041, + "logps/chosen": -573.1044311523438, + "logps/rejected": -1002.54296875, + "loss": 0.0559, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.622593879699707, + "rewards/margins": 24.730194091796875, + "rewards/rejected": -37.352787017822266, + "step": 2978 + }, + { + "epoch": 1.8531881804043546, + "grad_norm": 5.23779344803188e-05, + "learning_rate": 2.1230982019363765e-06, + "logits/chosen": 0.8842979669570923, + "logits/rejected": 3.95865535736084, + "logps/chosen": -411.5668640136719, + "logps/rejected": -885.6905517578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.89336109161377, + "rewards/margins": 30.12948226928711, + "rewards/rejected": -39.02284240722656, + "step": 2979 + }, + { + "epoch": 1.8538102643856922, + "grad_norm": 9.278995513916016, + "learning_rate": 2.1219455970493317e-06, + "logits/chosen": 2.5439319610595703, + "logits/rejected": 4.2560038566589355, + "logps/chosen": -725.4393310546875, + "logps/rejected": -1119.859375, + "loss": 0.0416, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.018448829650879, + "rewards/margins": 28.870258331298828, + "rewards/rejected": -38.888702392578125, + "step": 2980 + }, + { + "epoch": 1.8544323483670295, + "grad_norm": 5.287654314400925e-09, + "learning_rate": 2.120792992162287e-06, + "logits/chosen": -1.8039573431015015, + "logits/rejected": 2.1132235527038574, + "logps/chosen": -485.0125427246094, + "logps/rejected": -1005.99072265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.288414001464844, + "rewards/margins": 36.67291259765625, + "rewards/rejected": -46.961326599121094, + "step": 2981 + }, + { + "epoch": 1.8550544323483669, + "grad_norm": 0.4025889039039612, + "learning_rate": 2.119640387275242e-06, + "logits/chosen": 2.9755101203918457, + "logits/rejected": 4.695067405700684, + "logps/chosen": -648.7351684570312, + "logps/rejected": -1055.5948486328125, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.618903160095215, + "rewards/margins": 28.467288970947266, + "rewards/rejected": -40.08618927001953, + "step": 2982 + }, + { + "epoch": 1.8556765163297047, + "grad_norm": 36.6614875793457, + "learning_rate": 2.1184877823881974e-06, + "logits/chosen": 2.5858490467071533, + "logits/rejected": 3.5514469146728516, + "logps/chosen": -845.2623901367188, + "logps/rejected": -1315.9478759765625, + "loss": 0.3482, + "rewards/accuracies": 0.875, + "rewards/chosen": -15.051424026489258, + "rewards/margins": 32.73891830444336, + "rewards/rejected": -47.790340423583984, + "step": 2983 + }, + { + "epoch": 1.856298600311042, + "grad_norm": 29.610349655151367, + "learning_rate": 2.1173351775011526e-06, + "logits/chosen": -0.57902592420578, + "logits/rejected": 3.0780577659606934, + "logps/chosen": -524.6580810546875, + "logps/rejected": -986.159423828125, + "loss": 0.3359, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.902968406677246, + "rewards/margins": 26.206405639648438, + "rewards/rejected": -39.109375, + "step": 2984 + }, + { + "epoch": 1.8569206842923793, + "grad_norm": 0.00036348786670714617, + "learning_rate": 2.116182572614108e-06, + "logits/chosen": 1.5172300338745117, + "logits/rejected": 3.862931489944458, + "logps/chosen": -601.9778442382812, + "logps/rejected": -1014.2064208984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.697236061096191, + "rewards/margins": 31.744977951049805, + "rewards/rejected": -41.44221115112305, + "step": 2985 + }, + { + "epoch": 1.8575427682737171, + "grad_norm": 0.6304136514663696, + "learning_rate": 2.1150299677270635e-06, + "logits/chosen": 1.1943728923797607, + "logits/rejected": 3.9687066078186035, + "logps/chosen": -582.301513671875, + "logps/rejected": -1053.023681640625, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.7523956298828125, + "rewards/margins": 34.65993881225586, + "rewards/rejected": -42.41233444213867, + "step": 2986 + }, + { + "epoch": 1.8581648522550545, + "grad_norm": 0.0008032044279389083, + "learning_rate": 2.1138773628400187e-06, + "logits/chosen": -2.6957528591156006, + "logits/rejected": 1.5795737504959106, + "logps/chosen": -363.6121826171875, + "logps/rejected": -886.4146728515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.061027526855469, + "rewards/margins": 30.693744659423828, + "rewards/rejected": -36.7547721862793, + "step": 2987 + }, + { + "epoch": 1.8587869362363918, + "grad_norm": 0.0013717946130782366, + "learning_rate": 2.112724757952974e-06, + "logits/chosen": -0.3157472610473633, + "logits/rejected": 2.1115522384643555, + "logps/chosen": -401.7276916503906, + "logps/rejected": -775.2745361328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.6762285232543945, + "rewards/margins": 28.42954444885254, + "rewards/rejected": -34.10577392578125, + "step": 2988 + }, + { + "epoch": 1.8594090202177294, + "grad_norm": 8.54127677030192e-07, + "learning_rate": 2.111572153065929e-06, + "logits/chosen": 1.5187461376190186, + "logits/rejected": 3.9361824989318848, + "logps/chosen": -424.12322998046875, + "logps/rejected": -852.22802734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.521132946014404, + "rewards/margins": 30.52448272705078, + "rewards/rejected": -38.045616149902344, + "step": 2989 + }, + { + "epoch": 1.860031104199067, + "grad_norm": 0.00021709220891352743, + "learning_rate": 2.1104195481788844e-06, + "logits/chosen": 0.541998028755188, + "logits/rejected": 2.679088592529297, + "logps/chosen": -595.886962890625, + "logps/rejected": -1007.0936279296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.939725875854492, + "rewards/margins": 26.004108428955078, + "rewards/rejected": -38.9438362121582, + "step": 2990 + }, + { + "epoch": 1.8606531881804043, + "grad_norm": 0.042757876217365265, + "learning_rate": 2.1092669432918396e-06, + "logits/chosen": 0.20492124557495117, + "logits/rejected": 2.4277796745300293, + "logps/chosen": -542.8641357421875, + "logps/rejected": -1056.92333984375, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.894628524780273, + "rewards/margins": 35.557220458984375, + "rewards/rejected": -45.45185089111328, + "step": 2991 + }, + { + "epoch": 1.8612752721617418, + "grad_norm": 3.564086675643921, + "learning_rate": 2.108114338404795e-06, + "logits/chosen": 0.8177498579025269, + "logits/rejected": 2.0094380378723145, + "logps/chosen": -678.0740966796875, + "logps/rejected": -905.52587890625, + "loss": 0.0128, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.322629928588867, + "rewards/margins": 17.72020149230957, + "rewards/rejected": -30.042829513549805, + "step": 2992 + }, + { + "epoch": 1.8618973561430794, + "grad_norm": 27.561216354370117, + "learning_rate": 2.1069617335177505e-06, + "logits/chosen": -2.124131202697754, + "logits/rejected": 2.3365206718444824, + "logps/chosen": -346.3985290527344, + "logps/rejected": -939.9415283203125, + "loss": 0.9759, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.955907821655273, + "rewards/margins": 25.818593978881836, + "rewards/rejected": -33.77450180053711, + "step": 2993 + }, + { + "epoch": 1.8625194401244167, + "grad_norm": 2.435544013977051, + "learning_rate": 2.1058091286307057e-06, + "logits/chosen": -1.4700162410736084, + "logits/rejected": 1.1362886428833008, + "logps/chosen": -505.6947326660156, + "logps/rejected": -871.2249755859375, + "loss": 0.0405, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.700376510620117, + "rewards/margins": 23.613595962524414, + "rewards/rejected": -35.31397247314453, + "step": 2994 + }, + { + "epoch": 1.8631415241057543, + "grad_norm": 7.134440898895264, + "learning_rate": 2.104656523743661e-06, + "logits/chosen": -0.5249059796333313, + "logits/rejected": 2.251185417175293, + "logps/chosen": -566.6573486328125, + "logps/rejected": -883.6015625, + "loss": 0.0389, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.398397445678711, + "rewards/margins": 25.780942916870117, + "rewards/rejected": -32.17934036254883, + "step": 2995 + }, + { + "epoch": 1.8637636080870918, + "grad_norm": 6.2458311731461436e-06, + "learning_rate": 2.103503918856616e-06, + "logits/chosen": -1.0155529975891113, + "logits/rejected": 3.2504520416259766, + "logps/chosen": -421.27655029296875, + "logps/rejected": -1078.7958984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.56519889831543, + "rewards/margins": 31.937530517578125, + "rewards/rejected": -40.50273132324219, + "step": 2996 + }, + { + "epoch": 1.8643856920684292, + "grad_norm": 0.0005573639646172523, + "learning_rate": 2.1023513139695714e-06, + "logits/chosen": 1.2448923587799072, + "logits/rejected": 2.8676905632019043, + "logps/chosen": -564.3466186523438, + "logps/rejected": -998.9130859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.508292198181152, + "rewards/margins": 28.037822723388672, + "rewards/rejected": -38.546112060546875, + "step": 2997 + }, + { + "epoch": 1.8650077760497668, + "grad_norm": 9.827252824834432e-07, + "learning_rate": 2.1011987090825266e-06, + "logits/chosen": 1.301634669303894, + "logits/rejected": 2.054259777069092, + "logps/chosen": -562.0250244140625, + "logps/rejected": -966.8475341796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.389965057373047, + "rewards/margins": 34.89493179321289, + "rewards/rejected": -44.28489685058594, + "step": 2998 + }, + { + "epoch": 1.8656298600311043, + "grad_norm": 36.70895004272461, + "learning_rate": 2.100046104195482e-06, + "logits/chosen": 1.1578892469406128, + "logits/rejected": 4.549923896789551, + "logps/chosen": -650.369384765625, + "logps/rejected": -1236.646728515625, + "loss": 0.7256, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.710254669189453, + "rewards/margins": 37.79402160644531, + "rewards/rejected": -48.5042724609375, + "step": 2999 + }, + { + "epoch": 1.8662519440124417, + "grad_norm": 6.277427466727659e-09, + "learning_rate": 2.0988934993084375e-06, + "logits/chosen": -1.7158828973770142, + "logits/rejected": 2.6323788166046143, + "logps/chosen": -478.2867431640625, + "logps/rejected": -1147.33447265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.997095108032227, + "rewards/margins": 36.653255462646484, + "rewards/rejected": -45.65034484863281, + "step": 3000 + }, + { + "epoch": 1.866874027993779, + "grad_norm": 0.0013673205394297838, + "learning_rate": 2.0977408944213927e-06, + "logits/chosen": 0.34744882583618164, + "logits/rejected": 2.0286967754364014, + "logps/chosen": -579.9356079101562, + "logps/rejected": -969.3897705078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.04820728302002, + "rewards/margins": 29.78260040283203, + "rewards/rejected": -42.830806732177734, + "step": 3001 + }, + { + "epoch": 1.8674961119751168, + "grad_norm": 32.87213134765625, + "learning_rate": 2.096588289534348e-06, + "logits/chosen": 1.6131173372268677, + "logits/rejected": 4.298799991607666, + "logps/chosen": -596.7760620117188, + "logps/rejected": -1074.724853515625, + "loss": 0.4465, + "rewards/accuracies": 0.875, + "rewards/chosen": -14.154102325439453, + "rewards/margins": 29.750553131103516, + "rewards/rejected": -43.904659271240234, + "step": 3002 + }, + { + "epoch": 1.8681181959564541, + "grad_norm": 0.20293423533439636, + "learning_rate": 2.095435684647303e-06, + "logits/chosen": -1.015268325805664, + "logits/rejected": 4.6616339683532715, + "logps/chosen": -418.6412353515625, + "logps/rejected": -1064.1949462890625, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.623247146606445, + "rewards/margins": 35.75114440917969, + "rewards/rejected": -44.3743896484375, + "step": 3003 + }, + { + "epoch": 1.8687402799377915, + "grad_norm": 0.10534223169088364, + "learning_rate": 2.0942830797602584e-06, + "logits/chosen": -0.0824170708656311, + "logits/rejected": 2.3265902996063232, + "logps/chosen": -361.28765869140625, + "logps/rejected": -665.5305786132812, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.109169006347656, + "rewards/margins": 13.994619369506836, + "rewards/rejected": -23.103790283203125, + "step": 3004 + }, + { + "epoch": 1.8693623639191292, + "grad_norm": 0.5241131782531738, + "learning_rate": 2.0931304748732136e-06, + "logits/chosen": 1.0985878705978394, + "logits/rejected": 3.5785131454467773, + "logps/chosen": -479.8888244628906, + "logps/rejected": -926.8046264648438, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.3170166015625, + "rewards/margins": 29.623743057250977, + "rewards/rejected": -37.940765380859375, + "step": 3005 + }, + { + "epoch": 1.8699844479004666, + "grad_norm": 4.3681578972609714e-05, + "learning_rate": 2.091977869986169e-06, + "logits/chosen": 1.0532994270324707, + "logits/rejected": 4.94828987121582, + "logps/chosen": -617.1984252929688, + "logps/rejected": -1163.953857421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.137100219726562, + "rewards/margins": 31.24260139465332, + "rewards/rejected": -40.37969970703125, + "step": 3006 + }, + { + "epoch": 1.870606531881804, + "grad_norm": 4.708970300271176e-05, + "learning_rate": 2.090825265099124e-06, + "logits/chosen": 1.2389825582504272, + "logits/rejected": 3.942110776901245, + "logps/chosen": -384.44927978515625, + "logps/rejected": -990.743896484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.256569862365723, + "rewards/margins": 43.04841613769531, + "rewards/rejected": -51.30498504638672, + "step": 3007 + }, + { + "epoch": 1.8712286158631415, + "grad_norm": 0.00026734487619251013, + "learning_rate": 2.0896726602120797e-06, + "logits/chosen": -0.2838389277458191, + "logits/rejected": 2.5366568565368652, + "logps/chosen": -485.586181640625, + "logps/rejected": -973.17626953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.219608306884766, + "rewards/margins": 31.877403259277344, + "rewards/rejected": -41.09701156616211, + "step": 3008 + }, + { + "epoch": 1.871850699844479, + "grad_norm": 0.7158253192901611, + "learning_rate": 2.088520055325035e-06, + "logits/chosen": 0.17250674962997437, + "logits/rejected": 1.608120322227478, + "logps/chosen": -633.865966796875, + "logps/rejected": -921.3824462890625, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.764053344726562, + "rewards/margins": 23.544639587402344, + "rewards/rejected": -34.308692932128906, + "step": 3009 + }, + { + "epoch": 1.8724727838258164, + "grad_norm": 5.755442543886602e-05, + "learning_rate": 2.08736745043799e-06, + "logits/chosen": -2.124953031539917, + "logits/rejected": 2.750650644302368, + "logps/chosen": -467.41168212890625, + "logps/rejected": -1126.162841796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.77512264251709, + "rewards/margins": 39.48204803466797, + "rewards/rejected": -51.257171630859375, + "step": 3010 + }, + { + "epoch": 1.873094867807154, + "grad_norm": 37.73600387573242, + "learning_rate": 2.0862148455509454e-06, + "logits/chosen": 2.946049451828003, + "logits/rejected": 2.970284938812256, + "logps/chosen": -768.961181640625, + "logps/rejected": -1059.467529296875, + "loss": 0.3578, + "rewards/accuracies": 0.875, + "rewards/chosen": -16.543516159057617, + "rewards/margins": 29.980506896972656, + "rewards/rejected": -46.52402114868164, + "step": 3011 + }, + { + "epoch": 1.8737169517884915, + "grad_norm": 9.90644645690918, + "learning_rate": 2.0850622406639006e-06, + "logits/chosen": -1.2075892686843872, + "logits/rejected": 3.742737054824829, + "logps/chosen": -424.56170654296875, + "logps/rejected": -896.1334838867188, + "loss": 0.0582, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.003117561340332, + "rewards/margins": 22.584789276123047, + "rewards/rejected": -26.587905883789062, + "step": 3012 + }, + { + "epoch": 1.8743390357698289, + "grad_norm": 1.2684682815233828e-07, + "learning_rate": 2.083909635776856e-06, + "logits/chosen": 0.23507288098335266, + "logits/rejected": 3.0083565711975098, + "logps/chosen": -632.8001708984375, + "logps/rejected": -1120.076416015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.256636619567871, + "rewards/margins": 25.77067756652832, + "rewards/rejected": -38.02731704711914, + "step": 3013 + }, + { + "epoch": 1.8749611197511664, + "grad_norm": 1.4789420366287231, + "learning_rate": 2.082757030889811e-06, + "logits/chosen": 1.3540959358215332, + "logits/rejected": 2.8773014545440674, + "logps/chosen": -539.6065673828125, + "logps/rejected": -920.03271484375, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.775294303894043, + "rewards/margins": 28.458703994750977, + "rewards/rejected": -37.23400115966797, + "step": 3014 + }, + { + "epoch": 1.875583203732504, + "grad_norm": 5.803546088145595e-08, + "learning_rate": 2.0816044260027667e-06, + "logits/chosen": 1.6014816761016846, + "logits/rejected": 2.7704672813415527, + "logps/chosen": -544.0638427734375, + "logps/rejected": -985.9021606445312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.202249526977539, + "rewards/margins": 32.040164947509766, + "rewards/rejected": -42.24241638183594, + "step": 3015 + }, + { + "epoch": 1.8762052877138413, + "grad_norm": 0.0022250423207879066, + "learning_rate": 2.080451821115722e-06, + "logits/chosen": -0.5401831269264221, + "logits/rejected": 2.8969578742980957, + "logps/chosen": -490.593994140625, + "logps/rejected": -882.6023559570312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.626242637634277, + "rewards/margins": 24.396589279174805, + "rewards/rejected": -32.022830963134766, + "step": 3016 + }, + { + "epoch": 1.8768273716951789, + "grad_norm": 3.121169356745668e-05, + "learning_rate": 2.079299216228677e-06, + "logits/chosen": -1.2195719480514526, + "logits/rejected": 3.7740817070007324, + "logps/chosen": -373.0020446777344, + "logps/rejected": -971.7380981445312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.059677124023438, + "rewards/margins": 32.931053161621094, + "rewards/rejected": -40.99073028564453, + "step": 3017 + }, + { + "epoch": 1.8774494556765164, + "grad_norm": 7.038599014282227, + "learning_rate": 2.0781466113416324e-06, + "logits/chosen": 2.310530185699463, + "logits/rejected": 3.1102864742279053, + "logps/chosen": -638.824951171875, + "logps/rejected": -905.7316284179688, + "loss": 0.0928, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.308221817016602, + "rewards/margins": 23.656431198120117, + "rewards/rejected": -33.96465301513672, + "step": 3018 + }, + { + "epoch": 1.8780715396578538, + "grad_norm": 0.0008783753728494048, + "learning_rate": 2.0769940064545876e-06, + "logits/chosen": 2.319756269454956, + "logits/rejected": 4.028102874755859, + "logps/chosen": -705.4632568359375, + "logps/rejected": -1112.9202880859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.343297958374023, + "rewards/margins": 29.65229606628418, + "rewards/rejected": -42.9955940246582, + "step": 3019 + }, + { + "epoch": 1.8786936236391913, + "grad_norm": 0.14819443225860596, + "learning_rate": 2.075841401567543e-06, + "logits/chosen": 1.2726922035217285, + "logits/rejected": 2.144050121307373, + "logps/chosen": -630.6361083984375, + "logps/rejected": -954.7571411132812, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.983515739440918, + "rewards/margins": 23.956439971923828, + "rewards/rejected": -36.9399528503418, + "step": 3020 + }, + { + "epoch": 1.879315707620529, + "grad_norm": 0.07809139043092728, + "learning_rate": 2.074688796680498e-06, + "logits/chosen": 2.2689342498779297, + "logits/rejected": 4.868353843688965, + "logps/chosen": -604.720458984375, + "logps/rejected": -1018.6021118164062, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.717222690582275, + "rewards/margins": 28.454551696777344, + "rewards/rejected": -35.171775817871094, + "step": 3021 + }, + { + "epoch": 1.8799377916018662, + "grad_norm": 9.459550346946344e-05, + "learning_rate": 2.0735361917934537e-06, + "logits/chosen": 1.7424232959747314, + "logits/rejected": 3.0601165294647217, + "logps/chosen": -488.897216796875, + "logps/rejected": -926.98291015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.733386993408203, + "rewards/margins": 33.84270477294922, + "rewards/rejected": -41.57609558105469, + "step": 3022 + }, + { + "epoch": 1.8805598755832036, + "grad_norm": 0.04000190272927284, + "learning_rate": 2.072383586906409e-06, + "logits/chosen": 1.3352985382080078, + "logits/rejected": 3.4352452754974365, + "logps/chosen": -559.6813354492188, + "logps/rejected": -924.271728515625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.983686447143555, + "rewards/margins": 25.625911712646484, + "rewards/rejected": -37.609596252441406, + "step": 3023 + }, + { + "epoch": 1.8811819595645414, + "grad_norm": 0.10560665279626846, + "learning_rate": 2.071230982019364e-06, + "logits/chosen": 1.071738600730896, + "logits/rejected": 2.951690196990967, + "logps/chosen": -527.6966552734375, + "logps/rejected": -841.054443359375, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.558955192565918, + "rewards/margins": 20.486169815063477, + "rewards/rejected": -31.04512596130371, + "step": 3024 + }, + { + "epoch": 1.8818040435458787, + "grad_norm": 0.0009212340810336173, + "learning_rate": 2.070078377132319e-06, + "logits/chosen": -1.7650898694992065, + "logits/rejected": 1.4234349727630615, + "logps/chosen": -414.21185302734375, + "logps/rejected": -1025.5460205078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.888592720031738, + "rewards/margins": 40.60430145263672, + "rewards/rejected": -50.49289321899414, + "step": 3025 + }, + { + "epoch": 1.882426127527216, + "grad_norm": 0.12818951904773712, + "learning_rate": 2.068925772245274e-06, + "logits/chosen": 2.31978178024292, + "logits/rejected": 3.1467909812927246, + "logps/chosen": -750.0935668945312, + "logps/rejected": -1013.3674926757812, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.906643867492676, + "rewards/margins": 22.728199005126953, + "rewards/rejected": -37.63484191894531, + "step": 3026 + }, + { + "epoch": 1.8830482115085536, + "grad_norm": 0.23764050006866455, + "learning_rate": 2.06777316735823e-06, + "logits/chosen": -1.2640528678894043, + "logits/rejected": 2.4256398677825928, + "logps/chosen": -436.44805908203125, + "logps/rejected": -976.5224609375, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.2282075881958, + "rewards/margins": 34.364280700683594, + "rewards/rejected": -42.59248733520508, + "step": 3027 + }, + { + "epoch": 1.8836702954898912, + "grad_norm": 0.12176292389631271, + "learning_rate": 2.066620562471185e-06, + "logits/chosen": 0.8552002906799316, + "logits/rejected": 4.851562023162842, + "logps/chosen": -543.6951904296875, + "logps/rejected": -1165.0093994140625, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.13092041015625, + "rewards/margins": 32.57614517211914, + "rewards/rejected": -43.707069396972656, + "step": 3028 + }, + { + "epoch": 1.8842923794712285, + "grad_norm": 0.0009258187492378056, + "learning_rate": 2.0654679575841403e-06, + "logits/chosen": -1.5283629894256592, + "logits/rejected": 2.731228828430176, + "logps/chosen": -348.7073669433594, + "logps/rejected": -837.8958740234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.846237659454346, + "rewards/margins": 25.294355392456055, + "rewards/rejected": -30.140592575073242, + "step": 3029 + }, + { + "epoch": 1.884914463452566, + "grad_norm": 0.0059629688039422035, + "learning_rate": 2.0643153526970955e-06, + "logits/chosen": 0.30039146542549133, + "logits/rejected": 3.9497108459472656, + "logps/chosen": -519.6970825195312, + "logps/rejected": -1043.4434814453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.718129634857178, + "rewards/margins": 31.629749298095703, + "rewards/rejected": -39.347877502441406, + "step": 3030 + }, + { + "epoch": 1.8855365474339036, + "grad_norm": 8.508398605044931e-05, + "learning_rate": 2.0631627478100507e-06, + "logits/chosen": -2.2366878986358643, + "logits/rejected": 0.8939638733863831, + "logps/chosen": -473.6304931640625, + "logps/rejected": -957.8397827148438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.143477439880371, + "rewards/margins": 27.332979202270508, + "rewards/rejected": -38.47645568847656, + "step": 3031 + }, + { + "epoch": 1.886158631415241, + "grad_norm": 0.000677596777677536, + "learning_rate": 2.062010142923006e-06, + "logits/chosen": 2.1652817726135254, + "logits/rejected": 3.804422616958618, + "logps/chosen": -669.2681884765625, + "logps/rejected": -1044.4951171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.551870346069336, + "rewards/margins": 30.823772430419922, + "rewards/rejected": -42.375640869140625, + "step": 3032 + }, + { + "epoch": 1.8867807153965785, + "grad_norm": 2.0379013221827336e-05, + "learning_rate": 2.060857538035961e-06, + "logits/chosen": 2.494349479675293, + "logits/rejected": 3.617753505706787, + "logps/chosen": -645.7994384765625, + "logps/rejected": -1071.411376953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.979326248168945, + "rewards/margins": 29.537216186523438, + "rewards/rejected": -43.516544342041016, + "step": 3033 + }, + { + "epoch": 1.887402799377916, + "grad_norm": 4.401593969305395e-07, + "learning_rate": 2.059704933148917e-06, + "logits/chosen": 1.1647454500198364, + "logits/rejected": 4.53623628616333, + "logps/chosen": -665.0794067382812, + "logps/rejected": -1135.864501953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.783852577209473, + "rewards/margins": 31.66611099243164, + "rewards/rejected": -46.44996643066406, + "step": 3034 + }, + { + "epoch": 1.8880248833592534, + "grad_norm": 2.0084574222564697, + "learning_rate": 2.058552328261872e-06, + "logits/chosen": 1.410960078239441, + "logits/rejected": 1.4644067287445068, + "logps/chosen": -776.1810302734375, + "logps/rejected": -915.4017333984375, + "loss": 0.0077, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.6139497756958, + "rewards/margins": 21.933719635009766, + "rewards/rejected": -31.547666549682617, + "step": 3035 + }, + { + "epoch": 1.888646967340591, + "grad_norm": 0.4876706600189209, + "learning_rate": 2.0573997233748273e-06, + "logits/chosen": -1.973346471786499, + "logits/rejected": 3.7605323791503906, + "logps/chosen": -389.125244140625, + "logps/rejected": -962.2559814453125, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.528801918029785, + "rewards/margins": 30.331979751586914, + "rewards/rejected": -40.860782623291016, + "step": 3036 + }, + { + "epoch": 1.8892690513219286, + "grad_norm": 0.0003601381031330675, + "learning_rate": 2.0562471184877825e-06, + "logits/chosen": 2.1486668586730957, + "logits/rejected": 3.886343002319336, + "logps/chosen": -646.3818359375, + "logps/rejected": -1020.0104370117188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.100616455078125, + "rewards/margins": 27.22791290283203, + "rewards/rejected": -37.328529357910156, + "step": 3037 + }, + { + "epoch": 1.889891135303266, + "grad_norm": 0.005591857247054577, + "learning_rate": 2.0550945136007377e-06, + "logits/chosen": -1.573172926902771, + "logits/rejected": 1.094953179359436, + "logps/chosen": -499.50726318359375, + "logps/rejected": -951.7015380859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.768106937408447, + "rewards/margins": 31.28192901611328, + "rewards/rejected": -39.05003356933594, + "step": 3038 + }, + { + "epoch": 1.8905132192846035, + "grad_norm": 2.937760591506958, + "learning_rate": 2.053941908713693e-06, + "logits/chosen": 1.1670812368392944, + "logits/rejected": 2.585752010345459, + "logps/chosen": -494.4090576171875, + "logps/rejected": -959.474853515625, + "loss": 0.0114, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.01852798461914, + "rewards/margins": 28.86224365234375, + "rewards/rejected": -39.880775451660156, + "step": 3039 + }, + { + "epoch": 1.891135303265941, + "grad_norm": 0.011783706955611706, + "learning_rate": 2.052789303826648e-06, + "logits/chosen": -1.6418266296386719, + "logits/rejected": 3.052611827850342, + "logps/chosen": -261.74237060546875, + "logps/rejected": -709.05126953125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9691755771636963, + "rewards/margins": 29.911666870117188, + "rewards/rejected": -33.88084411621094, + "step": 3040 + }, + { + "epoch": 1.8917573872472784, + "grad_norm": 8.802350748737808e-06, + "learning_rate": 2.051636698939604e-06, + "logits/chosen": -1.3013746738433838, + "logits/rejected": 3.662790060043335, + "logps/chosen": -374.2080993652344, + "logps/rejected": -957.27783203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.102154731750488, + "rewards/margins": 32.80384063720703, + "rewards/rejected": -37.9059944152832, + "step": 3041 + }, + { + "epoch": 1.8923794712286157, + "grad_norm": 0.0001305158803006634, + "learning_rate": 2.050484094052559e-06, + "logits/chosen": 0.9714987874031067, + "logits/rejected": 3.3573224544525146, + "logps/chosen": -463.06201171875, + "logps/rejected": -1027.607177734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.532149314880371, + "rewards/margins": 34.60115051269531, + "rewards/rejected": -45.13330078125, + "step": 3042 + }, + { + "epoch": 1.8930015552099535, + "grad_norm": 3.116247171419673e-05, + "learning_rate": 2.0493314891655143e-06, + "logits/chosen": -4.505119323730469, + "logits/rejected": 1.4350619316101074, + "logps/chosen": -233.6446075439453, + "logps/rejected": -951.873291015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.3743133544921875, + "rewards/margins": 34.8604850769043, + "rewards/rejected": -41.23480224609375, + "step": 3043 + }, + { + "epoch": 1.8936236391912908, + "grad_norm": 6.368558729263896e-07, + "learning_rate": 2.0481788842784695e-06, + "logits/chosen": -0.6455403566360474, + "logits/rejected": 2.493967056274414, + "logps/chosen": -473.7528991699219, + "logps/rejected": -1045.281982421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.817072868347168, + "rewards/margins": 33.52661895751953, + "rewards/rejected": -43.343692779541016, + "step": 3044 + }, + { + "epoch": 1.8942457231726282, + "grad_norm": 35.97359085083008, + "learning_rate": 2.0470262793914247e-06, + "logits/chosen": 0.24772866070270538, + "logits/rejected": 3.34269642829895, + "logps/chosen": -497.4903259277344, + "logps/rejected": -1017.6675415039062, + "loss": 0.5899, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.941030025482178, + "rewards/margins": 32.35401916503906, + "rewards/rejected": -40.2950439453125, + "step": 3045 + }, + { + "epoch": 1.8948678071539657, + "grad_norm": 0.0004061897052451968, + "learning_rate": 2.04587367450438e-06, + "logits/chosen": 0.5643148422241211, + "logits/rejected": 1.8211342096328735, + "logps/chosen": -652.9474487304688, + "logps/rejected": -934.273193359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.59850788116455, + "rewards/margins": 28.603839874267578, + "rewards/rejected": -37.20234680175781, + "step": 3046 + }, + { + "epoch": 1.8954898911353033, + "grad_norm": 4.04508637075196e-06, + "learning_rate": 2.044721069617335e-06, + "logits/chosen": 0.7040514945983887, + "logits/rejected": 3.3264479637145996, + "logps/chosen": -418.761474609375, + "logps/rejected": -1029.391845703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.296758651733398, + "rewards/margins": 40.41428756713867, + "rewards/rejected": -49.71105194091797, + "step": 3047 + }, + { + "epoch": 1.8961119751166406, + "grad_norm": 0.11154375970363617, + "learning_rate": 2.0435684647302904e-06, + "logits/chosen": 1.0263574123382568, + "logits/rejected": 3.8444151878356934, + "logps/chosen": -580.5594482421875, + "logps/rejected": -1068.85302734375, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.889599800109863, + "rewards/margins": 32.126556396484375, + "rewards/rejected": -44.01615905761719, + "step": 3048 + }, + { + "epoch": 1.8967340590979782, + "grad_norm": 0.3560849130153656, + "learning_rate": 2.042415859843246e-06, + "logits/chosen": -0.8153203725814819, + "logits/rejected": 3.3516063690185547, + "logps/chosen": -435.34503173828125, + "logps/rejected": -901.9808959960938, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.1541008949279785, + "rewards/margins": 23.67719078063965, + "rewards/rejected": -30.83129119873047, + "step": 3049 + }, + { + "epoch": 1.8973561430793158, + "grad_norm": 0.004634470213204622, + "learning_rate": 2.0412632549562013e-06, + "logits/chosen": 2.4827990531921387, + "logits/rejected": 3.3610711097717285, + "logps/chosen": -683.9464111328125, + "logps/rejected": -1042.1004638671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.414421081542969, + "rewards/margins": 26.508174896240234, + "rewards/rejected": -36.9225959777832, + "step": 3050 + }, + { + "epoch": 1.8979782270606531, + "grad_norm": 9.769371899892576e-06, + "learning_rate": 2.0401106500691565e-06, + "logits/chosen": -0.8136694431304932, + "logits/rejected": 3.5954878330230713, + "logps/chosen": -504.87060546875, + "logps/rejected": -1096.439208984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.774595260620117, + "rewards/margins": 35.71419143676758, + "rewards/rejected": -44.488792419433594, + "step": 3051 + }, + { + "epoch": 1.8986003110419907, + "grad_norm": 0.7546086311340332, + "learning_rate": 2.0389580451821117e-06, + "logits/chosen": 0.20620578527450562, + "logits/rejected": 2.4939072132110596, + "logps/chosen": -550.8204345703125, + "logps/rejected": -966.3890380859375, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.345878601074219, + "rewards/margins": 29.434968948364258, + "rewards/rejected": -38.780845642089844, + "step": 3052 + }, + { + "epoch": 1.8992223950233282, + "grad_norm": 8.60761547088623, + "learning_rate": 2.037805440295067e-06, + "logits/chosen": 0.5855912566184998, + "logits/rejected": 3.697741746902466, + "logps/chosen": -476.2772216796875, + "logps/rejected": -933.6175537109375, + "loss": 0.05, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.937131881713867, + "rewards/margins": 27.20270538330078, + "rewards/rejected": -38.139835357666016, + "step": 3053 + }, + { + "epoch": 1.8998444790046656, + "grad_norm": 1.1274566531938035e-05, + "learning_rate": 2.036652835408022e-06, + "logits/chosen": 0.19531968235969543, + "logits/rejected": 2.492250919342041, + "logps/chosen": -621.8941650390625, + "logps/rejected": -1169.3470458984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.341142654418945, + "rewards/margins": 36.63648223876953, + "rewards/rejected": -48.977622985839844, + "step": 3054 + }, + { + "epoch": 1.9004665629860031, + "grad_norm": 0.10121874511241913, + "learning_rate": 2.0355002305209774e-06, + "logits/chosen": -0.518656849861145, + "logits/rejected": 2.535217523574829, + "logps/chosen": -494.6761779785156, + "logps/rejected": -863.6760864257812, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.946184158325195, + "rewards/margins": 22.766103744506836, + "rewards/rejected": -29.71228790283203, + "step": 3055 + }, + { + "epoch": 1.9010886469673407, + "grad_norm": 0.0062729050405323505, + "learning_rate": 2.034347625633933e-06, + "logits/chosen": -0.871635377407074, + "logits/rejected": 4.4975104331970215, + "logps/chosen": -346.7241516113281, + "logps/rejected": -954.4545288085938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.209202766418457, + "rewards/margins": 28.649002075195312, + "rewards/rejected": -35.85820388793945, + "step": 3056 + }, + { + "epoch": 1.901710730948678, + "grad_norm": 2.5780183321444383e-08, + "learning_rate": 2.0331950207468883e-06, + "logits/chosen": 0.47149020433425903, + "logits/rejected": 4.374908447265625, + "logps/chosen": -649.23583984375, + "logps/rejected": -1248.1553955078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.42192554473877, + "rewards/margins": 35.7809944152832, + "rewards/rejected": -48.202919006347656, + "step": 3057 + }, + { + "epoch": 1.9023328149300156, + "grad_norm": 3.1426994340222336e-09, + "learning_rate": 2.0320424158598435e-06, + "logits/chosen": -1.6505568027496338, + "logits/rejected": 1.8247491121292114, + "logps/chosen": -557.5438232421875, + "logps/rejected": -1146.343505859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.328878402709961, + "rewards/margins": 38.459896087646484, + "rewards/rejected": -48.78877258300781, + "step": 3058 + }, + { + "epoch": 1.9029548989113532, + "grad_norm": 6.946622477244091e-08, + "learning_rate": 2.0308898109727987e-06, + "logits/chosen": -1.7206653356552124, + "logits/rejected": 3.2438573837280273, + "logps/chosen": -463.35784912109375, + "logps/rejected": -1229.4114990234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.979928970336914, + "rewards/margins": 44.707908630371094, + "rewards/rejected": -56.687835693359375, + "step": 3059 + }, + { + "epoch": 1.9035769828926905, + "grad_norm": 0.12513019144535065, + "learning_rate": 2.029737206085754e-06, + "logits/chosen": -1.8191311359405518, + "logits/rejected": 2.4682679176330566, + "logps/chosen": -370.4996643066406, + "logps/rejected": -905.6737060546875, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.9366374015808105, + "rewards/margins": 27.555641174316406, + "rewards/rejected": -34.492279052734375, + "step": 3060 + }, + { + "epoch": 1.9041990668740278, + "grad_norm": 2.9927703508292325e-05, + "learning_rate": 2.028584601198709e-06, + "logits/chosen": -1.5963267087936401, + "logits/rejected": 3.524061679840088, + "logps/chosen": -300.2033386230469, + "logps/rejected": -916.1325073242188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.946621894836426, + "rewards/margins": 32.93095016479492, + "rewards/rejected": -38.87757110595703, + "step": 3061 + }, + { + "epoch": 1.9048211508553656, + "grad_norm": 0.006748128682374954, + "learning_rate": 2.0274319963116644e-06, + "logits/chosen": 1.2222062349319458, + "logits/rejected": 3.575316905975342, + "logps/chosen": -649.7758178710938, + "logps/rejected": -1057.25732421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.95299243927002, + "rewards/margins": 27.27806282043457, + "rewards/rejected": -42.231056213378906, + "step": 3062 + }, + { + "epoch": 1.905443234836703, + "grad_norm": 20.295888900756836, + "learning_rate": 2.02627939142462e-06, + "logits/chosen": 2.645158052444458, + "logits/rejected": 4.364293575286865, + "logps/chosen": -514.44384765625, + "logps/rejected": -851.8877563476562, + "loss": 0.7349, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.85225772857666, + "rewards/margins": 20.194347381591797, + "rewards/rejected": -28.04660415649414, + "step": 3063 + }, + { + "epoch": 1.9060653188180403, + "grad_norm": 0.026905400678515434, + "learning_rate": 2.0251267865375752e-06, + "logits/chosen": -2.920731544494629, + "logits/rejected": 3.2363901138305664, + "logps/chosen": -209.95999145507812, + "logps/rejected": -882.4180297851562, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.916708946228027, + "rewards/margins": 32.43791580200195, + "rewards/rejected": -37.35462951660156, + "step": 3064 + }, + { + "epoch": 1.9066874027993779, + "grad_norm": 1.3152927749615628e-05, + "learning_rate": 2.0239741816505305e-06, + "logits/chosen": -0.07215797901153564, + "logits/rejected": 3.6048831939697266, + "logps/chosen": -531.9854736328125, + "logps/rejected": -1120.064453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.932154655456543, + "rewards/margins": 31.065196990966797, + "rewards/rejected": -39.99734878540039, + "step": 3065 + }, + { + "epoch": 1.9073094867807154, + "grad_norm": 22.387847900390625, + "learning_rate": 2.0228215767634857e-06, + "logits/chosen": 1.1178255081176758, + "logits/rejected": 2.6430320739746094, + "logps/chosen": -586.69921875, + "logps/rejected": -957.328125, + "loss": 0.1829, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.733922958374023, + "rewards/margins": 27.749778747558594, + "rewards/rejected": -37.48370361328125, + "step": 3066 + }, + { + "epoch": 1.9079315707620528, + "grad_norm": 7.578842442512723e-09, + "learning_rate": 2.021668971876441e-06, + "logits/chosen": 3.0557453632354736, + "logits/rejected": 3.5825586318969727, + "logps/chosen": -559.4607543945312, + "logps/rejected": -944.7230224609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.9096903800964355, + "rewards/margins": 32.48735809326172, + "rewards/rejected": -39.39704895019531, + "step": 3067 + }, + { + "epoch": 1.9085536547433903, + "grad_norm": 18.07485580444336, + "learning_rate": 2.020516366989396e-06, + "logits/chosen": 3.178652763366699, + "logits/rejected": 2.9502663612365723, + "logps/chosen": -688.4437255859375, + "logps/rejected": -898.4486694335938, + "loss": 0.0585, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.985757827758789, + "rewards/margins": 20.856191635131836, + "rewards/rejected": -32.841949462890625, + "step": 3068 + }, + { + "epoch": 1.909175738724728, + "grad_norm": 0.6955883502960205, + "learning_rate": 2.0193637621023514e-06, + "logits/chosen": 2.94408917427063, + "logits/rejected": 2.766674041748047, + "logps/chosen": -792.5679931640625, + "logps/rejected": -956.5133056640625, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.01635456085205, + "rewards/margins": 19.517358779907227, + "rewards/rejected": -34.533714294433594, + "step": 3069 + }, + { + "epoch": 1.9097978227060652, + "grad_norm": 15.6476469039917, + "learning_rate": 2.0182111572153066e-06, + "logits/chosen": 1.1466944217681885, + "logits/rejected": 3.3508975505828857, + "logps/chosen": -475.7377014160156, + "logps/rejected": -933.6702270507812, + "loss": 0.0944, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.101260185241699, + "rewards/margins": 30.59676742553711, + "rewards/rejected": -37.698028564453125, + "step": 3070 + }, + { + "epoch": 1.9104199066874028, + "grad_norm": 0.03882289305329323, + "learning_rate": 2.0170585523282622e-06, + "logits/chosen": 0.9645749926567078, + "logits/rejected": 3.7685093879699707, + "logps/chosen": -419.4703674316406, + "logps/rejected": -873.78564453125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.286841869354248, + "rewards/margins": 28.846656799316406, + "rewards/rejected": -34.13349914550781, + "step": 3071 + }, + { + "epoch": 1.9110419906687404, + "grad_norm": 23.7106990814209, + "learning_rate": 2.0159059474412175e-06, + "logits/chosen": 4.3552751541137695, + "logits/rejected": 5.19343376159668, + "logps/chosen": -798.3819580078125, + "logps/rejected": -1081.6822509765625, + "loss": 0.3422, + "rewards/accuracies": 0.875, + "rewards/chosen": -15.164212226867676, + "rewards/margins": 20.923892974853516, + "rewards/rejected": -36.088104248046875, + "step": 3072 + }, + { + "epoch": 1.9116640746500777, + "grad_norm": 0.0004619990650098771, + "learning_rate": 2.0147533425541727e-06, + "logits/chosen": 2.130751848220825, + "logits/rejected": 4.413289546966553, + "logps/chosen": -547.0361328125, + "logps/rejected": -903.9354248046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.0716118812561035, + "rewards/margins": 24.975723266601562, + "rewards/rejected": -32.047332763671875, + "step": 3073 + }, + { + "epoch": 1.9122861586314153, + "grad_norm": 2.6988864476606977e-08, + "learning_rate": 2.013600737667128e-06, + "logits/chosen": 1.7085301876068115, + "logits/rejected": 4.056130409240723, + "logps/chosen": -657.5714111328125, + "logps/rejected": -1185.6339111328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.055585861206055, + "rewards/margins": 36.13258743286133, + "rewards/rejected": -46.18817138671875, + "step": 3074 + }, + { + "epoch": 1.9129082426127528, + "grad_norm": 1.0772266705316724e-06, + "learning_rate": 2.012448132780083e-06, + "logits/chosen": 1.6624712944030762, + "logits/rejected": 4.163967609405518, + "logps/chosen": -429.5469970703125, + "logps/rejected": -896.1236572265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.713283061981201, + "rewards/margins": 30.268901824951172, + "rewards/rejected": -37.98218536376953, + "step": 3075 + }, + { + "epoch": 1.9135303265940902, + "grad_norm": 0.003041618037968874, + "learning_rate": 2.0112955278930384e-06, + "logits/chosen": 2.202765703201294, + "logits/rejected": 4.012975215911865, + "logps/chosen": -629.578857421875, + "logps/rejected": -1021.6544189453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.357513427734375, + "rewards/margins": 25.704200744628906, + "rewards/rejected": -35.06171417236328, + "step": 3076 + }, + { + "epoch": 1.9141524105754277, + "grad_norm": 3.149693727493286, + "learning_rate": 2.0101429230059936e-06, + "logits/chosen": 2.2744429111480713, + "logits/rejected": 2.2045953273773193, + "logps/chosen": -632.7249145507812, + "logps/rejected": -869.91650390625, + "loss": 0.0107, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.000911712646484, + "rewards/margins": 22.652973175048828, + "rewards/rejected": -32.65388488769531, + "step": 3077 + }, + { + "epoch": 1.9147744945567653, + "grad_norm": 15.868894577026367, + "learning_rate": 2.0089903181189492e-06, + "logits/chosen": 1.046784520149231, + "logits/rejected": 3.364591598510742, + "logps/chosen": -626.5619506835938, + "logps/rejected": -1132.443115234375, + "loss": 0.0613, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.214308738708496, + "rewards/margins": 35.30830383300781, + "rewards/rejected": -45.522613525390625, + "step": 3078 + }, + { + "epoch": 1.9153965785381026, + "grad_norm": 15.966095924377441, + "learning_rate": 2.0078377132319045e-06, + "logits/chosen": -0.5236443281173706, + "logits/rejected": 2.8816840648651123, + "logps/chosen": -622.166259765625, + "logps/rejected": -1035.40380859375, + "loss": 0.2252, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.653708457946777, + "rewards/margins": 31.47915267944336, + "rewards/rejected": -38.13285827636719, + "step": 3079 + }, + { + "epoch": 1.91601866251944, + "grad_norm": 5.1122162403771654e-05, + "learning_rate": 2.0066851083448597e-06, + "logits/chosen": -0.1377563625574112, + "logits/rejected": 1.8764601945877075, + "logps/chosen": -588.8012084960938, + "logps/rejected": -952.9384765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.899794578552246, + "rewards/margins": 25.850767135620117, + "rewards/rejected": -38.75056076049805, + "step": 3080 + }, + { + "epoch": 1.9166407465007778, + "grad_norm": 2.693913698196411, + "learning_rate": 2.005532503457815e-06, + "logits/chosen": -2.1489388942718506, + "logits/rejected": 1.2460088729858398, + "logps/chosen": -409.9424743652344, + "logps/rejected": -908.951416015625, + "loss": 0.0198, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.152788162231445, + "rewards/margins": 23.960559844970703, + "rewards/rejected": -34.113346099853516, + "step": 3081 + }, + { + "epoch": 1.917262830482115, + "grad_norm": 0.2958039343357086, + "learning_rate": 2.00437989857077e-06, + "logits/chosen": -1.046022891998291, + "logits/rejected": 3.3509743213653564, + "logps/chosen": -481.7524719238281, + "logps/rejected": -1104.011962890625, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.942147254943848, + "rewards/margins": 38.9442253112793, + "rewards/rejected": -47.88637161254883, + "step": 3082 + }, + { + "epoch": 1.9178849144634524, + "grad_norm": 9.022804988489952e-06, + "learning_rate": 2.0032272936837254e-06, + "logits/chosen": 0.3423982858657837, + "logits/rejected": 3.9999098777770996, + "logps/chosen": -417.267822265625, + "logps/rejected": -853.1267700195312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.099996566772461, + "rewards/margins": 27.42827796936035, + "rewards/rejected": -35.52827453613281, + "step": 3083 + }, + { + "epoch": 1.91850699844479, + "grad_norm": 8.890567779541016, + "learning_rate": 2.0020746887966806e-06, + "logits/chosen": 3.4113690853118896, + "logits/rejected": 4.016191482543945, + "logps/chosen": -667.6809692382812, + "logps/rejected": -883.475830078125, + "loss": 0.0523, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.454672813415527, + "rewards/margins": 20.978595733642578, + "rewards/rejected": -32.43326950073242, + "step": 3084 + }, + { + "epoch": 1.9191290824261276, + "grad_norm": 2.1542064132518135e-07, + "learning_rate": 2.0009220839096362e-06, + "logits/chosen": 0.25448083877563477, + "logits/rejected": 4.301416397094727, + "logps/chosen": -445.3734436035156, + "logps/rejected": -1053.107666015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.527276992797852, + "rewards/margins": 33.589111328125, + "rewards/rejected": -42.11638641357422, + "step": 3085 + }, + { + "epoch": 1.919751166407465, + "grad_norm": 0.019370652735233307, + "learning_rate": 1.9997694790225915e-06, + "logits/chosen": -0.40544599294662476, + "logits/rejected": 3.474851131439209, + "logps/chosen": -469.44781494140625, + "logps/rejected": -912.0271606445312, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.017045974731445, + "rewards/margins": 25.133575439453125, + "rewards/rejected": -34.15061950683594, + "step": 3086 + }, + { + "epoch": 1.9203732503888025, + "grad_norm": 2.7604433853412047e-05, + "learning_rate": 1.9986168741355467e-06, + "logits/chosen": -2.633376121520996, + "logits/rejected": 2.72335147857666, + "logps/chosen": -397.2959289550781, + "logps/rejected": -1001.3154907226562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.045672416687012, + "rewards/margins": 33.706565856933594, + "rewards/rejected": -41.75223922729492, + "step": 3087 + }, + { + "epoch": 1.92099533437014, + "grad_norm": 0.5521590709686279, + "learning_rate": 1.997464269248502e-06, + "logits/chosen": -2.533043384552002, + "logits/rejected": 2.634922981262207, + "logps/chosen": -496.95086669921875, + "logps/rejected": -1041.037353515625, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.341783046722412, + "rewards/margins": 29.893619537353516, + "rewards/rejected": -36.23540496826172, + "step": 3088 + }, + { + "epoch": 1.9216174183514774, + "grad_norm": 33.23648452758789, + "learning_rate": 1.996311664361457e-06, + "logits/chosen": 0.3715195059776306, + "logits/rejected": 3.6420249938964844, + "logps/chosen": -478.12640380859375, + "logps/rejected": -843.5733032226562, + "loss": 0.4117, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.215129852294922, + "rewards/margins": 21.32658576965332, + "rewards/rejected": -27.541715621948242, + "step": 3089 + }, + { + "epoch": 1.922239502332815, + "grad_norm": 40.58311462402344, + "learning_rate": 1.9951590594744124e-06, + "logits/chosen": 1.1373742818832397, + "logits/rejected": 4.414701461791992, + "logps/chosen": -538.0445556640625, + "logps/rejected": -901.5621337890625, + "loss": 0.9431, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.4848051071167, + "rewards/margins": 15.416234970092773, + "rewards/rejected": -25.901039123535156, + "step": 3090 + }, + { + "epoch": 1.9228615863141525, + "grad_norm": 0.08620551228523254, + "learning_rate": 1.9940064545873676e-06, + "logits/chosen": -0.5909771919250488, + "logits/rejected": 2.6995272636413574, + "logps/chosen": -428.00830078125, + "logps/rejected": -923.3511962890625, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.33791732788086, + "rewards/margins": 27.793746948242188, + "rewards/rejected": -36.13166046142578, + "step": 3091 + }, + { + "epoch": 1.9234836702954898, + "grad_norm": 0.00960856769233942, + "learning_rate": 1.9928538497003232e-06, + "logits/chosen": 0.8783894181251526, + "logits/rejected": 3.604226589202881, + "logps/chosen": -500.4090270996094, + "logps/rejected": -950.4783325195312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.384622573852539, + "rewards/margins": 30.068050384521484, + "rewards/rejected": -34.452674865722656, + "step": 3092 + }, + { + "epoch": 1.9241057542768274, + "grad_norm": 1.3824650049209595, + "learning_rate": 1.991701244813278e-06, + "logits/chosen": 0.8421534299850464, + "logits/rejected": 3.5944361686706543, + "logps/chosen": -551.550048828125, + "logps/rejected": -1028.826416015625, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.340576648712158, + "rewards/margins": 25.5235595703125, + "rewards/rejected": -30.864137649536133, + "step": 3093 + }, + { + "epoch": 1.924727838258165, + "grad_norm": 1.3520052561943885e-05, + "learning_rate": 1.9905486399262333e-06, + "logits/chosen": 1.369136095046997, + "logits/rejected": 3.906341075897217, + "logps/chosen": -490.1999206542969, + "logps/rejected": -1115.05517578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.051058769226074, + "rewards/margins": 40.5749397277832, + "rewards/rejected": -50.62599563598633, + "step": 3094 + }, + { + "epoch": 1.9253499222395023, + "grad_norm": 2.0884041786193848, + "learning_rate": 1.9893960350391885e-06, + "logits/chosen": -0.2868638336658478, + "logits/rejected": 3.4033193588256836, + "logps/chosen": -459.6829528808594, + "logps/rejected": -1025.7183837890625, + "loss": 0.0092, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.993990421295166, + "rewards/margins": 31.40522575378418, + "rewards/rejected": -38.39921569824219, + "step": 3095 + }, + { + "epoch": 1.9259720062208399, + "grad_norm": 0.04092787951231003, + "learning_rate": 1.9882434301521437e-06, + "logits/chosen": -0.23980551958084106, + "logits/rejected": 3.9187815189361572, + "logps/chosen": -501.53662109375, + "logps/rejected": -1109.9281005859375, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.170280456542969, + "rewards/margins": 35.897464752197266, + "rewards/rejected": -45.06774139404297, + "step": 3096 + }, + { + "epoch": 1.9265940902021774, + "grad_norm": 0.00010406249202787876, + "learning_rate": 1.9870908252650994e-06, + "logits/chosen": 3.265599250793457, + "logits/rejected": 3.0254709720611572, + "logps/chosen": -635.1368408203125, + "logps/rejected": -902.3892822265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.115925312042236, + "rewards/margins": 24.14023780822754, + "rewards/rejected": -29.256162643432617, + "step": 3097 + }, + { + "epoch": 1.9272161741835148, + "grad_norm": 0.0016180593520402908, + "learning_rate": 1.9859382203780546e-06, + "logits/chosen": 0.5095869302749634, + "logits/rejected": 2.9197235107421875, + "logps/chosen": -520.6793212890625, + "logps/rejected": -1036.481201171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.886094093322754, + "rewards/margins": 36.49535369873047, + "rewards/rejected": -43.38145065307617, + "step": 3098 + }, + { + "epoch": 1.927838258164852, + "grad_norm": 0.0008051490876823664, + "learning_rate": 1.98478561549101e-06, + "logits/chosen": -3.078697919845581, + "logits/rejected": 3.5038082599639893, + "logps/chosen": -302.9356689453125, + "logps/rejected": -906.822265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.008868217468262, + "rewards/margins": 21.944965362548828, + "rewards/rejected": -27.953832626342773, + "step": 3099 + }, + { + "epoch": 1.9284603421461899, + "grad_norm": 1.0987540690621245e-06, + "learning_rate": 1.983633010603965e-06, + "logits/chosen": -1.677296757698059, + "logits/rejected": 2.7449183464050293, + "logps/chosen": -345.772705078125, + "logps/rejected": -854.6087036132812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.282731533050537, + "rewards/margins": 30.456693649291992, + "rewards/rejected": -36.73942565917969, + "step": 3100 + }, + { + "epoch": 1.9290824261275272, + "grad_norm": 2.140411853790283, + "learning_rate": 1.9824804057169203e-06, + "logits/chosen": -0.08920153230428696, + "logits/rejected": 3.9136552810668945, + "logps/chosen": -519.4923706054688, + "logps/rejected": -951.7150268554688, + "loss": 0.0072, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.870290756225586, + "rewards/margins": 23.316768646240234, + "rewards/rejected": -32.18705749511719, + "step": 3101 + }, + { + "epoch": 1.9297045101088646, + "grad_norm": 4.3668413162231445, + "learning_rate": 1.9813278008298755e-06, + "logits/chosen": 2.294924020767212, + "logits/rejected": 3.8346686363220215, + "logps/chosen": -517.449462890625, + "logps/rejected": -826.955810546875, + "loss": 0.0209, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.303144454956055, + "rewards/margins": 21.79949188232422, + "rewards/rejected": -35.10263442993164, + "step": 3102 + }, + { + "epoch": 1.9303265940902021, + "grad_norm": 35.314605712890625, + "learning_rate": 1.9801751959428307e-06, + "logits/chosen": 1.7527735233306885, + "logits/rejected": 2.813300371170044, + "logps/chosen": -544.0867919921875, + "logps/rejected": -810.4888916015625, + "loss": 0.6331, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.789369583129883, + "rewards/margins": 20.098064422607422, + "rewards/rejected": -25.887434005737305, + "step": 3103 + }, + { + "epoch": 1.9309486780715397, + "grad_norm": 0.37775611877441406, + "learning_rate": 1.9790225910557864e-06, + "logits/chosen": 2.292999505996704, + "logits/rejected": 4.234982490539551, + "logps/chosen": -575.2119140625, + "logps/rejected": -962.66455078125, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.68335247039795, + "rewards/margins": 23.792911529541016, + "rewards/rejected": -34.47626876831055, + "step": 3104 + }, + { + "epoch": 1.931570762052877, + "grad_norm": 38.52018356323242, + "learning_rate": 1.9778699861687416e-06, + "logits/chosen": -0.1604076474905014, + "logits/rejected": 0.9690057039260864, + "logps/chosen": -589.8624267578125, + "logps/rejected": -806.6043701171875, + "loss": 0.6047, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.873226165771484, + "rewards/margins": 14.048189163208008, + "rewards/rejected": -20.921417236328125, + "step": 3105 + }, + { + "epoch": 1.9321928460342146, + "grad_norm": 0.0012302246177569032, + "learning_rate": 1.976717381281697e-06, + "logits/chosen": -2.3391997814178467, + "logits/rejected": 3.5974345207214355, + "logps/chosen": -271.0357360839844, + "logps/rejected": -858.4124755859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.7468342781066895, + "rewards/margins": 24.664321899414062, + "rewards/rejected": -30.41115951538086, + "step": 3106 + }, + { + "epoch": 1.9328149300155522, + "grad_norm": 0.13548584282398224, + "learning_rate": 1.975564776394652e-06, + "logits/chosen": -1.131691575050354, + "logits/rejected": 2.9434633255004883, + "logps/chosen": -432.55963134765625, + "logps/rejected": -941.6983032226562, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.376041412353516, + "rewards/margins": 30.401823043823242, + "rewards/rejected": -38.777862548828125, + "step": 3107 + }, + { + "epoch": 1.9334370139968895, + "grad_norm": 0.0010253810323774815, + "learning_rate": 1.9744121715076073e-06, + "logits/chosen": 1.5464913845062256, + "logits/rejected": 1.995990514755249, + "logps/chosen": -640.5809936523438, + "logps/rejected": -906.3209228515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.729105472564697, + "rewards/margins": 24.215728759765625, + "rewards/rejected": -31.944835662841797, + "step": 3108 + }, + { + "epoch": 1.934059097978227, + "grad_norm": 1.0012019872665405, + "learning_rate": 1.9732595666205625e-06, + "logits/chosen": 0.8782744407653809, + "logits/rejected": 3.3917250633239746, + "logps/chosen": -675.33544921875, + "logps/rejected": -1131.438720703125, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.280045509338379, + "rewards/margins": 30.720184326171875, + "rewards/rejected": -44.00022888183594, + "step": 3109 + }, + { + "epoch": 1.9346811819595646, + "grad_norm": 0.0008394105243496597, + "learning_rate": 1.9721069617335177e-06, + "logits/chosen": 1.2346922159194946, + "logits/rejected": 1.7204153537750244, + "logps/chosen": -612.9161376953125, + "logps/rejected": -899.165283203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.591879844665527, + "rewards/margins": 25.975309371948242, + "rewards/rejected": -36.56719207763672, + "step": 3110 + }, + { + "epoch": 1.935303265940902, + "grad_norm": 0.01554886344820261, + "learning_rate": 1.970954356846473e-06, + "logits/chosen": -1.2211658954620361, + "logits/rejected": 3.6318647861480713, + "logps/chosen": -531.0762329101562, + "logps/rejected": -1141.5849609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.235472202301025, + "rewards/margins": 30.65101432800293, + "rewards/rejected": -35.8864860534668, + "step": 3111 + }, + { + "epoch": 1.9359253499222395, + "grad_norm": 4.545105934143066, + "learning_rate": 1.9698017519594286e-06, + "logits/chosen": -2.037519693374634, + "logits/rejected": 2.981616497039795, + "logps/chosen": -454.7665710449219, + "logps/rejected": -1027.3382568359375, + "loss": 0.0164, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.1797332763671875, + "rewards/margins": 27.61724281311035, + "rewards/rejected": -34.79697799682617, + "step": 3112 + }, + { + "epoch": 1.936547433903577, + "grad_norm": 22.48046112060547, + "learning_rate": 1.968649147072384e-06, + "logits/chosen": -3.1317570209503174, + "logits/rejected": 2.2430827617645264, + "logps/chosen": -233.67686462402344, + "logps/rejected": -833.810546875, + "loss": 0.3444, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.681829452514648, + "rewards/margins": 26.82794189453125, + "rewards/rejected": -32.509769439697266, + "step": 3113 + }, + { + "epoch": 1.9371695178849144, + "grad_norm": 1.1949235158681404e-05, + "learning_rate": 1.967496542185339e-06, + "logits/chosen": 2.1707959175109863, + "logits/rejected": 2.928690195083618, + "logps/chosen": -630.8195190429688, + "logps/rejected": -919.963623046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.479073524475098, + "rewards/margins": 28.746137619018555, + "rewards/rejected": -35.22521209716797, + "step": 3114 + }, + { + "epoch": 1.937791601866252, + "grad_norm": 0.00946116354316473, + "learning_rate": 1.9663439372982943e-06, + "logits/chosen": -0.10625946521759033, + "logits/rejected": 3.2997217178344727, + "logps/chosen": -392.2223815917969, + "logps/rejected": -870.4689331054688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.0400776863098145, + "rewards/margins": 26.645164489746094, + "rewards/rejected": -31.68524169921875, + "step": 3115 + }, + { + "epoch": 1.9384136858475896, + "grad_norm": 0.0013832555850967765, + "learning_rate": 1.9651913324112495e-06, + "logits/chosen": 2.235701084136963, + "logits/rejected": 5.511145114898682, + "logps/chosen": -671.1697998046875, + "logps/rejected": -1116.8763427734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.155223846435547, + "rewards/margins": 26.336801528930664, + "rewards/rejected": -36.492027282714844, + "step": 3116 + }, + { + "epoch": 1.939035769828927, + "grad_norm": 0.0005869021988473833, + "learning_rate": 1.9640387275242047e-06, + "logits/chosen": -2.2219884395599365, + "logits/rejected": 2.0224733352661133, + "logps/chosen": -325.9281921386719, + "logps/rejected": -870.1353759765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.242877006530762, + "rewards/margins": 30.151485443115234, + "rewards/rejected": -34.39436340332031, + "step": 3117 + }, + { + "epoch": 1.9396578538102642, + "grad_norm": 30.43111801147461, + "learning_rate": 1.96288612263716e-06, + "logits/chosen": 0.6214398145675659, + "logits/rejected": 3.964189291000366, + "logps/chosen": -570.104736328125, + "logps/rejected": -1154.1322021484375, + "loss": 0.1592, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.116994857788086, + "rewards/margins": 30.15409278869629, + "rewards/rejected": -41.27109146118164, + "step": 3118 + }, + { + "epoch": 1.940279937791602, + "grad_norm": 27.62190055847168, + "learning_rate": 1.9617335177501156e-06, + "logits/chosen": 2.633063793182373, + "logits/rejected": 5.510431289672852, + "logps/chosen": -557.0858154296875, + "logps/rejected": -1135.85205078125, + "loss": 0.2993, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.610867977142334, + "rewards/margins": 38.47575378417969, + "rewards/rejected": -45.08662033081055, + "step": 3119 + }, + { + "epoch": 1.9409020217729394, + "grad_norm": 28.94077491760254, + "learning_rate": 1.960580912863071e-06, + "logits/chosen": -1.3939131498336792, + "logits/rejected": 3.7747256755828857, + "logps/chosen": -485.427001953125, + "logps/rejected": -1010.84375, + "loss": 0.2913, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.9063720703125, + "rewards/margins": 21.018043518066406, + "rewards/rejected": -26.924415588378906, + "step": 3120 + }, + { + "epoch": 1.9415241057542767, + "grad_norm": 40.993804931640625, + "learning_rate": 1.959428307976026e-06, + "logits/chosen": 2.295240640640259, + "logits/rejected": 4.34235143661499, + "logps/chosen": -642.120361328125, + "logps/rejected": -1075.3221435546875, + "loss": 0.3546, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.789778709411621, + "rewards/margins": 22.575544357299805, + "rewards/rejected": -31.36532211303711, + "step": 3121 + }, + { + "epoch": 1.9421461897356143, + "grad_norm": 0.021397702395915985, + "learning_rate": 1.9582757030889812e-06, + "logits/chosen": -3.1025195121765137, + "logits/rejected": 3.167198419570923, + "logps/chosen": -357.3218994140625, + "logps/rejected": -972.1842041015625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.450490951538086, + "rewards/margins": 24.834686279296875, + "rewards/rejected": -31.285175323486328, + "step": 3122 + }, + { + "epoch": 1.9427682737169518, + "grad_norm": 0.4374660849571228, + "learning_rate": 1.9571230982019365e-06, + "logits/chosen": 1.4612170457839966, + "logits/rejected": 4.9650397300720215, + "logps/chosen": -562.9573364257812, + "logps/rejected": -1090.174560546875, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.2041778564453125, + "rewards/margins": 26.764659881591797, + "rewards/rejected": -33.96883773803711, + "step": 3123 + }, + { + "epoch": 1.9433903576982892, + "grad_norm": 0.0014534497167915106, + "learning_rate": 1.9559704933148917e-06, + "logits/chosen": -1.5175460577011108, + "logits/rejected": 3.670214891433716, + "logps/chosen": -357.6202392578125, + "logps/rejected": -976.2584838867188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.8302741050720215, + "rewards/margins": 30.991336822509766, + "rewards/rejected": -35.82160949707031, + "step": 3124 + }, + { + "epoch": 1.9440124416796267, + "grad_norm": 40.93093490600586, + "learning_rate": 1.954817888427847e-06, + "logits/chosen": 0.8629130721092224, + "logits/rejected": 3.1365485191345215, + "logps/chosen": -601.42626953125, + "logps/rejected": -1025.9072265625, + "loss": 0.9715, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.362449645996094, + "rewards/margins": 26.06359100341797, + "rewards/rejected": -34.42604064941406, + "step": 3125 + }, + { + "epoch": 1.9446345256609643, + "grad_norm": 12.326611518859863, + "learning_rate": 1.9536652835408026e-06, + "logits/chosen": 2.843836784362793, + "logits/rejected": 4.174145698547363, + "logps/chosen": -592.7425537109375, + "logps/rejected": -801.8739013671875, + "loss": 0.071, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.317679405212402, + "rewards/margins": 20.747962951660156, + "rewards/rejected": -30.065643310546875, + "step": 3126 + }, + { + "epoch": 1.9452566096423016, + "grad_norm": 0.009106865152716637, + "learning_rate": 1.952512678653758e-06, + "logits/chosen": 0.664067804813385, + "logits/rejected": 2.8401622772216797, + "logps/chosen": -483.0638732910156, + "logps/rejected": -1060.47265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.1976847648620605, + "rewards/margins": 36.17652893066406, + "rewards/rejected": -43.37421417236328, + "step": 3127 + }, + { + "epoch": 1.9458786936236392, + "grad_norm": 0.00035235649556852877, + "learning_rate": 1.951360073766713e-06, + "logits/chosen": 1.572568416595459, + "logits/rejected": 3.067289352416992, + "logps/chosen": -589.0814208984375, + "logps/rejected": -878.741943359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.191118240356445, + "rewards/margins": 20.378524780273438, + "rewards/rejected": -34.56964111328125, + "step": 3128 + }, + { + "epoch": 1.9465007776049768, + "grad_norm": 2.083462823065929e-05, + "learning_rate": 1.9502074688796682e-06, + "logits/chosen": -1.6689331531524658, + "logits/rejected": 3.7483890056610107, + "logps/chosen": -418.681640625, + "logps/rejected": -1105.7725830078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.608135223388672, + "rewards/margins": 34.870338439941406, + "rewards/rejected": -41.47846984863281, + "step": 3129 + }, + { + "epoch": 1.947122861586314, + "grad_norm": 1.515984296798706, + "learning_rate": 1.9490548639926235e-06, + "logits/chosen": -1.502424716949463, + "logits/rejected": 3.913059711456299, + "logps/chosen": -441.2313232421875, + "logps/rejected": -1136.94677734375, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.84985065460205, + "rewards/margins": 34.735313415527344, + "rewards/rejected": -43.585166931152344, + "step": 3130 + }, + { + "epoch": 1.9477449455676517, + "grad_norm": 0.0010944779496639967, + "learning_rate": 1.9479022591055787e-06, + "logits/chosen": -1.6574513912200928, + "logits/rejected": 2.5761234760284424, + "logps/chosen": -434.24237060546875, + "logps/rejected": -995.3330688476562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.173585891723633, + "rewards/margins": 29.175065994262695, + "rewards/rejected": -37.34865188598633, + "step": 3131 + }, + { + "epoch": 1.9483670295489892, + "grad_norm": 0.08957312256097794, + "learning_rate": 1.946749654218534e-06, + "logits/chosen": 2.9277377128601074, + "logits/rejected": 3.3973681926727295, + "logps/chosen": -821.0192260742188, + "logps/rejected": -1138.6099853515625, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.827193260192871, + "rewards/margins": 28.56563949584961, + "rewards/rejected": -40.3928337097168, + "step": 3132 + }, + { + "epoch": 1.9489891135303266, + "grad_norm": 5.443854433906381e-07, + "learning_rate": 1.945597049331489e-06, + "logits/chosen": 1.8225369453430176, + "logits/rejected": 5.566961765289307, + "logps/chosen": -481.93804931640625, + "logps/rejected": -1042.929443359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.911165714263916, + "rewards/margins": 30.441822052001953, + "rewards/rejected": -36.35298538208008, + "step": 3133 + }, + { + "epoch": 1.9496111975116641, + "grad_norm": 3.7663533021259354e-06, + "learning_rate": 1.944444444444445e-06, + "logits/chosen": 0.8368363380432129, + "logits/rejected": 3.7206904888153076, + "logps/chosen": -463.287109375, + "logps/rejected": -858.33349609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.503768444061279, + "rewards/margins": 25.793119430541992, + "rewards/rejected": -33.29689025878906, + "step": 3134 + }, + { + "epoch": 1.9502332814930017, + "grad_norm": 1.1044361514223056e-07, + "learning_rate": 1.9432918395574e-06, + "logits/chosen": -2.337228298187256, + "logits/rejected": 3.89192795753479, + "logps/chosen": -342.0597229003906, + "logps/rejected": -1077.766845703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.909173965454102, + "rewards/margins": 34.26679229736328, + "rewards/rejected": -42.175968170166016, + "step": 3135 + }, + { + "epoch": 1.950855365474339, + "grad_norm": 25.050539016723633, + "learning_rate": 1.9421392346703552e-06, + "logits/chosen": 2.8053271770477295, + "logits/rejected": 2.157046318054199, + "logps/chosen": -723.6729736328125, + "logps/rejected": -894.4259033203125, + "loss": 0.2076, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.69310474395752, + "rewards/margins": 18.876235961914062, + "rewards/rejected": -30.569339752197266, + "step": 3136 + }, + { + "epoch": 1.9514774494556764, + "grad_norm": 0.028986474499106407, + "learning_rate": 1.9409866297833105e-06, + "logits/chosen": -3.2278780937194824, + "logits/rejected": 0.645659327507019, + "logps/chosen": -316.598876953125, + "logps/rejected": -778.5980224609375, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.194002151489258, + "rewards/margins": 28.001659393310547, + "rewards/rejected": -32.19565963745117, + "step": 3137 + }, + { + "epoch": 1.9520995334370141, + "grad_norm": 0.00019788251665886492, + "learning_rate": 1.9398340248962657e-06, + "logits/chosen": 1.0286705493927002, + "logits/rejected": 2.940277338027954, + "logps/chosen": -495.791748046875, + "logps/rejected": -904.9774169921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.718324661254883, + "rewards/margins": 27.192028045654297, + "rewards/rejected": -39.91035461425781, + "step": 3138 + }, + { + "epoch": 1.9527216174183515, + "grad_norm": 4.4513002649182454e-05, + "learning_rate": 1.938681420009221e-06, + "logits/chosen": -0.6334162950515747, + "logits/rejected": 2.7107186317443848, + "logps/chosen": -450.05792236328125, + "logps/rejected": -958.1580810546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.791568279266357, + "rewards/margins": 30.65761947631836, + "rewards/rejected": -38.449188232421875, + "step": 3139 + }, + { + "epoch": 1.9533437013996888, + "grad_norm": 0.0005697354790754616, + "learning_rate": 1.937528815122176e-06, + "logits/chosen": -1.5794157981872559, + "logits/rejected": 3.3874337673187256, + "logps/chosen": -283.36279296875, + "logps/rejected": -919.2586669921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.818148136138916, + "rewards/margins": 35.102237701416016, + "rewards/rejected": -38.92038345336914, + "step": 3140 + }, + { + "epoch": 1.9539657853810264, + "grad_norm": 3.034701347351074, + "learning_rate": 1.936376210235132e-06, + "logits/chosen": 0.9317562580108643, + "logits/rejected": 2.288961410522461, + "logps/chosen": -659.7003173828125, + "logps/rejected": -990.6065063476562, + "loss": 0.0142, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.969992160797119, + "rewards/margins": 22.054443359375, + "rewards/rejected": -30.024436950683594, + "step": 3141 + }, + { + "epoch": 1.954587869362364, + "grad_norm": 0.00017606717301532626, + "learning_rate": 1.935223605348087e-06, + "logits/chosen": 2.3919968605041504, + "logits/rejected": 3.5712153911590576, + "logps/chosen": -615.6132202148438, + "logps/rejected": -1001.5966186523438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.834830284118652, + "rewards/margins": 30.781444549560547, + "rewards/rejected": -41.61627197265625, + "step": 3142 + }, + { + "epoch": 1.9552099533437013, + "grad_norm": 1.5589220083711552e-06, + "learning_rate": 1.9340710004610422e-06, + "logits/chosen": -0.3756348490715027, + "logits/rejected": 3.8530774116516113, + "logps/chosen": -409.1357116699219, + "logps/rejected": -897.506591796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.981670379638672, + "rewards/margins": 29.417194366455078, + "rewards/rejected": -38.39886474609375, + "step": 3143 + }, + { + "epoch": 1.9558320373250389, + "grad_norm": 33.71767044067383, + "learning_rate": 1.9329183955739975e-06, + "logits/chosen": -3.283298969268799, + "logits/rejected": -0.33032527565956116, + "logps/chosen": -383.6517333984375, + "logps/rejected": -775.3797607421875, + "loss": 0.8141, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.489877700805664, + "rewards/margins": 20.71088409423828, + "rewards/rejected": -27.200763702392578, + "step": 3144 + }, + { + "epoch": 1.9564541213063764, + "grad_norm": 6.639469862790293e-09, + "learning_rate": 1.9317657906869527e-06, + "logits/chosen": -0.8790050745010376, + "logits/rejected": 1.0028407573699951, + "logps/chosen": -523.29443359375, + "logps/rejected": -927.624755859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.112761497497559, + "rewards/margins": 33.71839141845703, + "rewards/rejected": -38.831153869628906, + "step": 3145 + }, + { + "epoch": 1.9570762052877138, + "grad_norm": 0.0003128540702164173, + "learning_rate": 1.930613185799908e-06, + "logits/chosen": -1.3517966270446777, + "logits/rejected": 1.9820481538772583, + "logps/chosen": -559.878173828125, + "logps/rejected": -969.4806518554688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.006365776062012, + "rewards/margins": 28.887584686279297, + "rewards/rejected": -40.893951416015625, + "step": 3146 + }, + { + "epoch": 1.9576982892690513, + "grad_norm": 9.801250416785479e-05, + "learning_rate": 1.929460580912863e-06, + "logits/chosen": 0.4533102512359619, + "logits/rejected": 2.669489860534668, + "logps/chosen": -600.5856323242188, + "logps/rejected": -1008.6813354492188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.19776439666748, + "rewards/margins": 30.614797592163086, + "rewards/rejected": -39.812564849853516, + "step": 3147 + }, + { + "epoch": 1.9583203732503889, + "grad_norm": 0.0008385455585084856, + "learning_rate": 1.9283079760258188e-06, + "logits/chosen": -0.3274199366569519, + "logits/rejected": 1.4190573692321777, + "logps/chosen": -520.9417724609375, + "logps/rejected": -1010.8619995117188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.59363079071045, + "rewards/margins": 29.788921356201172, + "rewards/rejected": -40.38255310058594, + "step": 3148 + }, + { + "epoch": 1.9589424572317262, + "grad_norm": 0.3622640371322632, + "learning_rate": 1.927155371138774e-06, + "logits/chosen": -0.10235399007797241, + "logits/rejected": 3.619846820831299, + "logps/chosen": -502.5452880859375, + "logps/rejected": -1006.097412109375, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.385444164276123, + "rewards/margins": 32.80233383178711, + "rewards/rejected": -39.18777847290039, + "step": 3149 + }, + { + "epoch": 1.9595645412130638, + "grad_norm": 2.1730106709583197e-06, + "learning_rate": 1.9260027662517292e-06, + "logits/chosen": -0.025127731263637543, + "logits/rejected": 2.28237247467041, + "logps/chosen": -590.5953369140625, + "logps/rejected": -1034.8394775390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.482217788696289, + "rewards/margins": 31.997419357299805, + "rewards/rejected": -40.479637145996094, + "step": 3150 + }, + { + "epoch": 1.9601866251944013, + "grad_norm": 8.769025802612305, + "learning_rate": 1.9248501613646845e-06, + "logits/chosen": 0.180405855178833, + "logits/rejected": 1.3878958225250244, + "logps/chosen": -582.6483154296875, + "logps/rejected": -890.475341796875, + "loss": 0.0525, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.298641204833984, + "rewards/margins": 20.171621322631836, + "rewards/rejected": -33.47026443481445, + "step": 3151 + }, + { + "epoch": 1.9608087091757387, + "grad_norm": 0.0005307064857333899, + "learning_rate": 1.9236975564776397e-06, + "logits/chosen": 0.7239381670951843, + "logits/rejected": 3.4129738807678223, + "logps/chosen": -372.01153564453125, + "logps/rejected": -790.05712890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.203645706176758, + "rewards/margins": 25.585803985595703, + "rewards/rejected": -32.789451599121094, + "step": 3152 + }, + { + "epoch": 1.9614307931570762, + "grad_norm": 8.47744083404541, + "learning_rate": 1.922544951590595e-06, + "logits/chosen": -0.6052994132041931, + "logits/rejected": 1.3863511085510254, + "logps/chosen": -600.923095703125, + "logps/rejected": -1011.9559936523438, + "loss": 0.0317, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.772683143615723, + "rewards/margins": 23.889484405517578, + "rewards/rejected": -33.66217041015625, + "step": 3153 + }, + { + "epoch": 1.9620528771384138, + "grad_norm": 2.698514967036658e-09, + "learning_rate": 1.92139234670355e-06, + "logits/chosen": -3.1321492195129395, + "logits/rejected": 3.270890951156616, + "logps/chosen": -285.3479309082031, + "logps/rejected": -935.8294677734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.258288860321045, + "rewards/margins": 38.576446533203125, + "rewards/rejected": -43.83473587036133, + "step": 3154 + }, + { + "epoch": 1.9626749611197511, + "grad_norm": 1.4546620832334156e-06, + "learning_rate": 1.9202397418165058e-06, + "logits/chosen": -1.279468059539795, + "logits/rejected": 2.263674736022949, + "logps/chosen": -536.9735717773438, + "logps/rejected": -1118.637939453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.917298316955566, + "rewards/margins": 36.86639404296875, + "rewards/rejected": -47.78369140625, + "step": 3155 + }, + { + "epoch": 1.9632970451010885, + "grad_norm": 0.13870863616466522, + "learning_rate": 1.919087136929461e-06, + "logits/chosen": 2.533172130584717, + "logits/rejected": 3.0230398178100586, + "logps/chosen": -699.2755737304688, + "logps/rejected": -861.6014404296875, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.90634536743164, + "rewards/margins": 22.009307861328125, + "rewards/rejected": -32.915653228759766, + "step": 3156 + }, + { + "epoch": 1.9639191290824263, + "grad_norm": 0.0025135388132184744, + "learning_rate": 1.9179345320424162e-06, + "logits/chosen": -0.8763086199760437, + "logits/rejected": 3.799420118331909, + "logps/chosen": -385.1059265136719, + "logps/rejected": -963.9544677734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.895961761474609, + "rewards/margins": 32.50831985473633, + "rewards/rejected": -40.40427780151367, + "step": 3157 + }, + { + "epoch": 1.9645412130637636, + "grad_norm": 1.0180851859331597e-05, + "learning_rate": 1.9167819271553715e-06, + "logits/chosen": 2.6572232246398926, + "logits/rejected": 4.152245998382568, + "logps/chosen": -740.9976806640625, + "logps/rejected": -1174.274169921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.86086654663086, + "rewards/margins": 31.476028442382812, + "rewards/rejected": -44.336891174316406, + "step": 3158 + }, + { + "epoch": 1.965163297045101, + "grad_norm": 0.10397597402334213, + "learning_rate": 1.9156293222683267e-06, + "logits/chosen": 2.5373735427856445, + "logits/rejected": 2.5043015480041504, + "logps/chosen": -691.72265625, + "logps/rejected": -875.3533325195312, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.654552459716797, + "rewards/margins": 25.71814727783203, + "rewards/rejected": -35.37269973754883, + "step": 3159 + }, + { + "epoch": 1.9657853810264385, + "grad_norm": 1.2996370060136542e-05, + "learning_rate": 1.914476717381282e-06, + "logits/chosen": -0.1440131962299347, + "logits/rejected": 3.557115316390991, + "logps/chosen": -502.76531982421875, + "logps/rejected": -1063.85302734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.445473670959473, + "rewards/margins": 33.03824234008789, + "rewards/rejected": -44.48371505737305, + "step": 3160 + }, + { + "epoch": 1.966407465007776, + "grad_norm": 0.0939781591296196, + "learning_rate": 1.913324112494237e-06, + "logits/chosen": 1.9109691381454468, + "logits/rejected": 4.250695705413818, + "logps/chosen": -513.4877319335938, + "logps/rejected": -964.0062866210938, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.212984085083008, + "rewards/margins": 28.262313842773438, + "rewards/rejected": -38.47529602050781, + "step": 3161 + }, + { + "epoch": 1.9670295489891134, + "grad_norm": 16.613849639892578, + "learning_rate": 1.9121715076071924e-06, + "logits/chosen": -2.215022563934326, + "logits/rejected": 3.2000844478607178, + "logps/chosen": -508.90728759765625, + "logps/rejected": -1212.5, + "loss": 0.0634, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.177558898925781, + "rewards/margins": 36.37632751464844, + "rewards/rejected": -48.55388641357422, + "step": 3162 + }, + { + "epoch": 1.967651632970451, + "grad_norm": 29.307523727416992, + "learning_rate": 1.9110189027201476e-06, + "logits/chosen": 0.535607099533081, + "logits/rejected": 1.110487699508667, + "logps/chosen": -581.4873657226562, + "logps/rejected": -877.1924438476562, + "loss": 0.1411, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.076355934143066, + "rewards/margins": 24.285526275634766, + "rewards/rejected": -34.361881256103516, + "step": 3163 + }, + { + "epoch": 1.9682737169517885, + "grad_norm": 12.159449577331543, + "learning_rate": 1.909866297833103e-06, + "logits/chosen": 2.109978199005127, + "logits/rejected": 2.998699903488159, + "logps/chosen": -659.17919921875, + "logps/rejected": -887.56396484375, + "loss": 0.1495, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.49515151977539, + "rewards/margins": 19.975492477416992, + "rewards/rejected": -29.470645904541016, + "step": 3164 + }, + { + "epoch": 1.9688958009331259, + "grad_norm": 3.3401072869310156e-05, + "learning_rate": 1.908713692946058e-06, + "logits/chosen": -0.23816433548927307, + "logits/rejected": 4.679553985595703, + "logps/chosen": -519.406982421875, + "logps/rejected": -1159.0615234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.095431327819824, + "rewards/margins": 38.48530578613281, + "rewards/rejected": -49.58073425292969, + "step": 3165 + }, + { + "epoch": 1.9695178849144634, + "grad_norm": 3.759016564686135e-08, + "learning_rate": 1.9075610880590133e-06, + "logits/chosen": 0.09053629636764526, + "logits/rejected": 3.3123433589935303, + "logps/chosen": -542.893310546875, + "logps/rejected": -1076.65576171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.824052810668945, + "rewards/margins": 34.43450927734375, + "rewards/rejected": -46.25856018066406, + "step": 3166 + }, + { + "epoch": 1.970139968895801, + "grad_norm": 9.492687968304381e-05, + "learning_rate": 1.9064084831719687e-06, + "logits/chosen": -2.267563819885254, + "logits/rejected": 1.2888798713684082, + "logps/chosen": -436.4515380859375, + "logps/rejected": -974.1395263671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.326604843139648, + "rewards/margins": 34.898399353027344, + "rewards/rejected": -41.225006103515625, + "step": 3167 + }, + { + "epoch": 1.9707620528771383, + "grad_norm": 0.00013881105405744165, + "learning_rate": 1.905255878284924e-06, + "logits/chosen": -1.0494887828826904, + "logits/rejected": 1.077904224395752, + "logps/chosen": -445.4752197265625, + "logps/rejected": -931.4921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.989595413208008, + "rewards/margins": 30.04443359375, + "rewards/rejected": -39.034027099609375, + "step": 3168 + }, + { + "epoch": 1.971384136858476, + "grad_norm": 0.00014189987268764526, + "learning_rate": 1.9041032733978794e-06, + "logits/chosen": 0.719484806060791, + "logits/rejected": 1.8682856559753418, + "logps/chosen": -647.4837036132812, + "logps/rejected": -1057.12109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.586885452270508, + "rewards/margins": 31.471981048583984, + "rewards/rejected": -47.058868408203125, + "step": 3169 + }, + { + "epoch": 1.9720062208398135, + "grad_norm": 11.673099517822266, + "learning_rate": 1.9029506685108346e-06, + "logits/chosen": -1.9770435094833374, + "logits/rejected": 2.3703324794769287, + "logps/chosen": -462.7710876464844, + "logps/rejected": -1014.6849975585938, + "loss": 0.0698, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.472570419311523, + "rewards/margins": 35.858558654785156, + "rewards/rejected": -44.33112716674805, + "step": 3170 + }, + { + "epoch": 1.9726283048211508, + "grad_norm": 3.072976184625986e-08, + "learning_rate": 1.9017980636237898e-06, + "logits/chosen": 1.2289338111877441, + "logits/rejected": 3.1060218811035156, + "logps/chosen": -621.6762084960938, + "logps/rejected": -1122.5240478515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.191411018371582, + "rewards/margins": 35.472572326660156, + "rewards/rejected": -45.663978576660156, + "step": 3171 + }, + { + "epoch": 1.9732503888024884, + "grad_norm": 8.41129894979531e-06, + "learning_rate": 1.900645458736745e-06, + "logits/chosen": -0.29541754722595215, + "logits/rejected": 2.464428424835205, + "logps/chosen": -483.1417541503906, + "logps/rejected": -993.467529296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.057136058807373, + "rewards/margins": 33.919532775878906, + "rewards/rejected": -39.97666931152344, + "step": 3172 + }, + { + "epoch": 1.973872472783826, + "grad_norm": 0.007365781348198652, + "learning_rate": 1.8994928538497005e-06, + "logits/chosen": -0.018617630004882812, + "logits/rejected": 3.561922550201416, + "logps/chosen": -358.08642578125, + "logps/rejected": -819.01806640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.528288841247559, + "rewards/margins": 30.811264038085938, + "rewards/rejected": -36.33955383300781, + "step": 3173 + }, + { + "epoch": 1.9744945567651633, + "grad_norm": 0.005658108275383711, + "learning_rate": 1.8983402489626557e-06, + "logits/chosen": -1.1695606708526611, + "logits/rejected": 2.8908729553222656, + "logps/chosen": -511.5259094238281, + "logps/rejected": -1040.5850830078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.388712882995605, + "rewards/margins": 33.64862823486328, + "rewards/rejected": -42.03733825683594, + "step": 3174 + }, + { + "epoch": 1.9751166407465006, + "grad_norm": 0.0003242559905629605, + "learning_rate": 1.897187644075611e-06, + "logits/chosen": 0.11923173069953918, + "logits/rejected": 3.546078681945801, + "logps/chosen": -502.07183837890625, + "logps/rejected": -1198.700927734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.976399898529053, + "rewards/margins": 43.29308319091797, + "rewards/rejected": -51.26948547363281, + "step": 3175 + }, + { + "epoch": 1.9757387247278384, + "grad_norm": 62.495479583740234, + "learning_rate": 1.8960350391885661e-06, + "logits/chosen": 2.8545210361480713, + "logits/rejected": 3.2460947036743164, + "logps/chosen": -766.4078979492188, + "logps/rejected": -1039.130859375, + "loss": 0.9158, + "rewards/accuracies": 0.875, + "rewards/chosen": -19.115346908569336, + "rewards/margins": 22.217802047729492, + "rewards/rejected": -41.33314895629883, + "step": 3176 + }, + { + "epoch": 1.9763608087091757, + "grad_norm": 8.45496015244862e-06, + "learning_rate": 1.8948824343015216e-06, + "logits/chosen": -3.7129292488098145, + "logits/rejected": 1.566472053527832, + "logps/chosen": -306.4184265136719, + "logps/rejected": -932.2579345703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.600948333740234, + "rewards/margins": 31.574434280395508, + "rewards/rejected": -40.17538070678711, + "step": 3177 + }, + { + "epoch": 1.976982892690513, + "grad_norm": 0.7843475937843323, + "learning_rate": 1.8937298294144768e-06, + "logits/chosen": 1.6225887537002563, + "logits/rejected": 4.008382797241211, + "logps/chosen": -745.7958984375, + "logps/rejected": -1071.2774658203125, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.640724182128906, + "rewards/margins": 25.535783767700195, + "rewards/rejected": -40.17650604248047, + "step": 3178 + }, + { + "epoch": 1.9776049766718506, + "grad_norm": 2.848709357294865e-07, + "learning_rate": 1.892577224527432e-06, + "logits/chosen": -1.1793930530548096, + "logits/rejected": 1.1641390323638916, + "logps/chosen": -529.8934326171875, + "logps/rejected": -1047.27001953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.412664413452148, + "rewards/margins": 36.38139343261719, + "rewards/rejected": -47.794063568115234, + "step": 3179 + }, + { + "epoch": 1.9782270606531882, + "grad_norm": 0.10748720169067383, + "learning_rate": 1.8914246196403875e-06, + "logits/chosen": 0.003985404968261719, + "logits/rejected": 3.4207515716552734, + "logps/chosen": -571.9930419921875, + "logps/rejected": -1027.877197265625, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.69100570678711, + "rewards/margins": 28.41872787475586, + "rewards/rejected": -37.10973358154297, + "step": 3180 + }, + { + "epoch": 1.9788491446345255, + "grad_norm": 0.0021116528660058975, + "learning_rate": 1.8902720147533427e-06, + "logits/chosen": 0.023240089416503906, + "logits/rejected": 4.275704383850098, + "logps/chosen": -391.3278503417969, + "logps/rejected": -927.5806884765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.604829788208008, + "rewards/margins": 29.584720611572266, + "rewards/rejected": -38.18954849243164, + "step": 3181 + }, + { + "epoch": 1.979471228615863, + "grad_norm": 2.9487199348920967e-09, + "learning_rate": 1.889119409866298e-06, + "logits/chosen": -0.26054155826568604, + "logits/rejected": 3.042192220687866, + "logps/chosen": -555.6188354492188, + "logps/rejected": -1068.11572265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.144253730773926, + "rewards/margins": 34.46765899658203, + "rewards/rejected": -43.61191177368164, + "step": 3182 + }, + { + "epoch": 1.9800933125972007, + "grad_norm": 0.008355970494449139, + "learning_rate": 1.8879668049792531e-06, + "logits/chosen": 2.2057981491088867, + "logits/rejected": 2.4470415115356445, + "logps/chosen": -720.2744750976562, + "logps/rejected": -873.899658203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.665620803833008, + "rewards/margins": 22.65416717529297, + "rewards/rejected": -34.31978988647461, + "step": 3183 + }, + { + "epoch": 1.980715396578538, + "grad_norm": 2.728590488433838, + "learning_rate": 1.8868142000922086e-06, + "logits/chosen": -1.2559614181518555, + "logits/rejected": 3.207569122314453, + "logps/chosen": -475.935302734375, + "logps/rejected": -1004.422607421875, + "loss": 0.0085, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.180890083312988, + "rewards/margins": 29.82106590270996, + "rewards/rejected": -37.001953125, + "step": 3184 + }, + { + "epoch": 1.9813374805598756, + "grad_norm": 1.7883297687149025e-06, + "learning_rate": 1.8856615952051638e-06, + "logits/chosen": -1.5023488998413086, + "logits/rejected": 4.09722375869751, + "logps/chosen": -365.15325927734375, + "logps/rejected": -1063.2884521484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.08966064453125, + "rewards/margins": 36.919166564941406, + "rewards/rejected": -49.00883102416992, + "step": 3185 + }, + { + "epoch": 1.9819595645412131, + "grad_norm": 21.962324142456055, + "learning_rate": 1.884508990318119e-06, + "logits/chosen": -1.3031964302062988, + "logits/rejected": 2.3771743774414062, + "logps/chosen": -550.832275390625, + "logps/rejected": -880.1903686523438, + "loss": 0.1747, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.590015411376953, + "rewards/margins": 20.501266479492188, + "rewards/rejected": -32.091285705566406, + "step": 3186 + }, + { + "epoch": 1.9825816485225505, + "grad_norm": 12.981456756591797, + "learning_rate": 1.8833563854310745e-06, + "logits/chosen": 2.1112420558929443, + "logits/rejected": 2.808957576751709, + "logps/chosen": -717.0997314453125, + "logps/rejected": -899.928955078125, + "loss": 0.0672, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.56185531616211, + "rewards/margins": 15.117238998413086, + "rewards/rejected": -27.679094314575195, + "step": 3187 + }, + { + "epoch": 1.983203732503888, + "grad_norm": 0.0059814429841935635, + "learning_rate": 1.8822037805440297e-06, + "logits/chosen": -1.7570008039474487, + "logits/rejected": 1.8395037651062012, + "logps/chosen": -407.08367919921875, + "logps/rejected": -1051.62255859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.58604621887207, + "rewards/margins": 34.558738708496094, + "rewards/rejected": -47.14478302001953, + "step": 3188 + }, + { + "epoch": 1.9838258164852256, + "grad_norm": 0.15041697025299072, + "learning_rate": 1.881051175656985e-06, + "logits/chosen": 1.5227668285369873, + "logits/rejected": 3.454390287399292, + "logps/chosen": -638.9949951171875, + "logps/rejected": -938.1869506835938, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.065284729003906, + "rewards/margins": 27.189281463623047, + "rewards/rejected": -36.25456619262695, + "step": 3189 + }, + { + "epoch": 1.984447900466563, + "grad_norm": 0.41454270482063293, + "learning_rate": 1.8798985707699401e-06, + "logits/chosen": 0.650727391242981, + "logits/rejected": 3.691972494125366, + "logps/chosen": -396.9961853027344, + "logps/rejected": -893.4281616210938, + "loss": 0.0091, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.56352424621582, + "rewards/margins": 25.493627548217773, + "rewards/rejected": -34.057151794433594, + "step": 3190 + }, + { + "epoch": 1.9850699844479005, + "grad_norm": 0.7126598358154297, + "learning_rate": 1.8787459658828956e-06, + "logits/chosen": 1.2246778011322021, + "logits/rejected": 2.014221668243408, + "logps/chosen": -533.6785888671875, + "logps/rejected": -979.2806396484375, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.698402404785156, + "rewards/margins": 32.90081787109375, + "rewards/rejected": -44.599220275878906, + "step": 3191 + }, + { + "epoch": 1.985692068429238, + "grad_norm": 0.03736700490117073, + "learning_rate": 1.8775933609958508e-06, + "logits/chosen": -0.6881626844406128, + "logits/rejected": 3.6436805725097656, + "logps/chosen": -356.96368408203125, + "logps/rejected": -787.3362426757812, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.223313331604004, + "rewards/margins": 28.010154724121094, + "rewards/rejected": -36.23346710205078, + "step": 3192 + }, + { + "epoch": 1.9863141524105754, + "grad_norm": 30.400585174560547, + "learning_rate": 1.876440756108806e-06, + "logits/chosen": 3.5226330757141113, + "logits/rejected": 4.738844394683838, + "logps/chosen": -752.00146484375, + "logps/rejected": -1067.257080078125, + "loss": 0.3187, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.198256492614746, + "rewards/margins": 26.144824981689453, + "rewards/rejected": -36.34307861328125, + "step": 3193 + }, + { + "epoch": 1.9869362363919127, + "grad_norm": 1.7005794048309326, + "learning_rate": 1.8752881512217612e-06, + "logits/chosen": -0.09759989380836487, + "logits/rejected": 1.11489999294281, + "logps/chosen": -441.220703125, + "logps/rejected": -719.19287109375, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.71569538116455, + "rewards/margins": 21.517513275146484, + "rewards/rejected": -31.23320960998535, + "step": 3194 + }, + { + "epoch": 1.9875583203732505, + "grad_norm": 0.014652427285909653, + "learning_rate": 1.8741355463347167e-06, + "logits/chosen": 0.6133676767349243, + "logits/rejected": 2.9346764087677, + "logps/chosen": -650.8162841796875, + "logps/rejected": -1050.29736328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.254842758178711, + "rewards/margins": 30.56330108642578, + "rewards/rejected": -43.818145751953125, + "step": 3195 + }, + { + "epoch": 1.9881804043545879, + "grad_norm": 0.00039215991273522377, + "learning_rate": 1.872982941447672e-06, + "logits/chosen": -1.609758973121643, + "logits/rejected": 2.8189010620117188, + "logps/chosen": -491.3603210449219, + "logps/rejected": -1019.6839599609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.190193176269531, + "rewards/margins": 33.10917663574219, + "rewards/rejected": -44.29936599731445, + "step": 3196 + }, + { + "epoch": 1.9888024883359252, + "grad_norm": 54.50910568237305, + "learning_rate": 1.8718303365606271e-06, + "logits/chosen": 1.285512924194336, + "logits/rejected": 2.98002290725708, + "logps/chosen": -510.360595703125, + "logps/rejected": -891.5095825195312, + "loss": 0.8237, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.826030731201172, + "rewards/margins": 25.11355209350586, + "rewards/rejected": -36.93958282470703, + "step": 3197 + }, + { + "epoch": 1.989424572317263, + "grad_norm": 3.051857493119314e-05, + "learning_rate": 1.8706777316735826e-06, + "logits/chosen": -0.6766510009765625, + "logits/rejected": 2.1143531799316406, + "logps/chosen": -480.74951171875, + "logps/rejected": -1022.847900390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.219396591186523, + "rewards/margins": 33.2050666809082, + "rewards/rejected": -43.42446517944336, + "step": 3198 + }, + { + "epoch": 1.9900466562986003, + "grad_norm": 0.0019154062028974295, + "learning_rate": 1.8695251267865378e-06, + "logits/chosen": -0.9002442955970764, + "logits/rejected": 3.898540496826172, + "logps/chosen": -527.63720703125, + "logps/rejected": -1305.6282958984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.581677436828613, + "rewards/margins": 49.44828796386719, + "rewards/rejected": -61.02996826171875, + "step": 3199 + }, + { + "epoch": 1.9906687402799377, + "grad_norm": 0.3469063639640808, + "learning_rate": 1.868372521899493e-06, + "logits/chosen": -0.160893514752388, + "logits/rejected": 2.917184829711914, + "logps/chosen": -553.887939453125, + "logps/rejected": -994.0776977539062, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.018745422363281, + "rewards/margins": 31.039907455444336, + "rewards/rejected": -40.05865478515625, + "step": 3200 + }, + { + "epoch": 1.9912908242612752, + "grad_norm": 9.508553375781048e-06, + "learning_rate": 1.8672199170124482e-06, + "logits/chosen": 0.7386203408241272, + "logits/rejected": 3.1600570678710938, + "logps/chosen": -444.47576904296875, + "logps/rejected": -991.1026611328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.113765716552734, + "rewards/margins": 33.536102294921875, + "rewards/rejected": -42.64986801147461, + "step": 3201 + }, + { + "epoch": 1.9919129082426128, + "grad_norm": 36.303627014160156, + "learning_rate": 1.8660673121254037e-06, + "logits/chosen": 1.1433557271957397, + "logits/rejected": 3.8839259147644043, + "logps/chosen": -570.5936889648438, + "logps/rejected": -945.875732421875, + "loss": 0.8348, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.896246910095215, + "rewards/margins": 20.941421508789062, + "rewards/rejected": -32.837669372558594, + "step": 3202 + }, + { + "epoch": 1.9925349922239501, + "grad_norm": 0.05175577849149704, + "learning_rate": 1.864914707238359e-06, + "logits/chosen": -0.03805255889892578, + "logits/rejected": 1.907156229019165, + "logps/chosen": -590.1661376953125, + "logps/rejected": -947.9773559570312, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.847996711730957, + "rewards/margins": 21.76512336730957, + "rewards/rejected": -36.613121032714844, + "step": 3203 + }, + { + "epoch": 1.9931570762052877, + "grad_norm": 4.170333340880461e-05, + "learning_rate": 1.8637621023513141e-06, + "logits/chosen": -1.174929141998291, + "logits/rejected": 1.828955054283142, + "logps/chosen": -365.89617919921875, + "logps/rejected": -905.7449340820312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.122086524963379, + "rewards/margins": 31.113018035888672, + "rewards/rejected": -38.235107421875, + "step": 3204 + }, + { + "epoch": 1.9937791601866253, + "grad_norm": 1.7264932539173827e-11, + "learning_rate": 1.8626094974642693e-06, + "logits/chosen": 2.5377910137176514, + "logits/rejected": 4.4260687828063965, + "logps/chosen": -697.2733154296875, + "logps/rejected": -1260.3101806640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.474903106689453, + "rewards/margins": 38.30769729614258, + "rewards/rejected": -49.78260040283203, + "step": 3205 + }, + { + "epoch": 1.9944012441679626, + "grad_norm": 1.0426287651062012, + "learning_rate": 1.8614568925772248e-06, + "logits/chosen": -0.8568588495254517, + "logits/rejected": 0.989220380783081, + "logps/chosen": -438.51177978515625, + "logps/rejected": -781.666259765625, + "loss": 0.0125, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.024365425109863, + "rewards/margins": 24.262752532958984, + "rewards/rejected": -34.28711700439453, + "step": 3206 + }, + { + "epoch": 1.9950233281493002, + "grad_norm": 17.793310165405273, + "learning_rate": 1.86030428769018e-06, + "logits/chosen": 0.980941891670227, + "logits/rejected": 1.8689048290252686, + "logps/chosen": -675.7926025390625, + "logps/rejected": -916.1185913085938, + "loss": 0.0549, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.662540435791016, + "rewards/margins": 18.985729217529297, + "rewards/rejected": -36.64826965332031, + "step": 3207 + }, + { + "epoch": 1.9956454121306377, + "grad_norm": 0.10001546144485474, + "learning_rate": 1.8591516828031352e-06, + "logits/chosen": 1.4320085048675537, + "logits/rejected": 3.508606433868408, + "logps/chosen": -679.5264282226562, + "logps/rejected": -1101.302978515625, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.195636749267578, + "rewards/margins": 26.399478912353516, + "rewards/rejected": -41.595115661621094, + "step": 3208 + }, + { + "epoch": 1.996267496111975, + "grad_norm": 3.449564610491507e-05, + "learning_rate": 1.8579990779160907e-06, + "logits/chosen": -0.9296298027038574, + "logits/rejected": 3.058635950088501, + "logps/chosen": -398.24456787109375, + "logps/rejected": -1040.71044921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.891507148742676, + "rewards/margins": 38.04267120361328, + "rewards/rejected": -46.934181213378906, + "step": 3209 + }, + { + "epoch": 1.9968895800933126, + "grad_norm": 9.294652409153059e-05, + "learning_rate": 1.856846473029046e-06, + "logits/chosen": 1.0654292106628418, + "logits/rejected": 1.9909114837646484, + "logps/chosen": -664.6588745117188, + "logps/rejected": -1054.145751953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.01765251159668, + "rewards/margins": 31.69105339050293, + "rewards/rejected": -45.70870590209961, + "step": 3210 + }, + { + "epoch": 1.9975116640746502, + "grad_norm": 0.5575461387634277, + "learning_rate": 1.8556938681420011e-06, + "logits/chosen": 3.3123221397399902, + "logits/rejected": 2.042398452758789, + "logps/chosen": -661.1849365234375, + "logps/rejected": -802.16162109375, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.65969181060791, + "rewards/margins": 23.294269561767578, + "rewards/rejected": -32.95396423339844, + "step": 3211 + }, + { + "epoch": 1.9981337480559875, + "grad_norm": 5.9800062444992363e-05, + "learning_rate": 1.8545412632549563e-06, + "logits/chosen": 0.1124124526977539, + "logits/rejected": -0.4729093611240387, + "logps/chosen": -529.6085205078125, + "logps/rejected": -770.8993530273438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.632349014282227, + "rewards/margins": 20.30960464477539, + "rewards/rejected": -32.941951751708984, + "step": 3212 + }, + { + "epoch": 1.9987558320373249, + "grad_norm": 0.42236706614494324, + "learning_rate": 1.8533886583679118e-06, + "logits/chosen": 1.4251984357833862, + "logits/rejected": 3.264601469039917, + "logps/chosen": -543.8138427734375, + "logps/rejected": -847.8862915039062, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.262736320495605, + "rewards/margins": 20.891653060913086, + "rewards/rejected": -29.154390335083008, + "step": 3213 + }, + { + "epoch": 1.9993779160186627, + "grad_norm": 36.19878387451172, + "learning_rate": 1.852236053480867e-06, + "logits/chosen": -1.031221866607666, + "logits/rejected": 2.1314122676849365, + "logps/chosen": -418.09295654296875, + "logps/rejected": -987.6477661132812, + "loss": 0.1566, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.54425048828125, + "rewards/margins": 33.829002380371094, + "rewards/rejected": -39.373252868652344, + "step": 3214 + }, + { + "epoch": 2.0, + "grad_norm": 2.246525632187968e-08, + "learning_rate": 1.8510834485938222e-06, + "logits/chosen": 1.2374851703643799, + "logits/rejected": 3.5844645500183105, + "logps/chosen": -616.0235595703125, + "logps/rejected": -1126.6861572265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.12330436706543, + "rewards/margins": 36.971405029296875, + "rewards/rejected": -52.09471130371094, + "step": 3215 + }, + { + "epoch": 2.0006220839813373, + "grad_norm": 1.0097607472392411e-10, + "learning_rate": 1.8499308437067775e-06, + "logits/chosen": -0.8583577871322632, + "logits/rejected": 4.101155757904053, + "logps/chosen": -515.8357543945312, + "logps/rejected": -1275.03466796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.491175651550293, + "rewards/margins": 47.675228118896484, + "rewards/rejected": -58.166404724121094, + "step": 3216 + }, + { + "epoch": 2.001244167962675, + "grad_norm": 4.448748586582951e-05, + "learning_rate": 1.8487782388197329e-06, + "logits/chosen": 0.23793944716453552, + "logits/rejected": 3.973947048187256, + "logps/chosen": -618.2573852539062, + "logps/rejected": -1212.6055908203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.788324356079102, + "rewards/margins": 40.03203582763672, + "rewards/rejected": -50.82035827636719, + "step": 3217 + }, + { + "epoch": 2.0018662519440125, + "grad_norm": 7.491581345675513e-05, + "learning_rate": 1.8476256339326881e-06, + "logits/chosen": 1.447487473487854, + "logits/rejected": 4.082512378692627, + "logps/chosen": -616.44384765625, + "logps/rejected": -1120.1319580078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.434648513793945, + "rewards/margins": 38.114646911621094, + "rewards/rejected": -49.54929733276367, + "step": 3218 + }, + { + "epoch": 2.00248833592535, + "grad_norm": 3.84054183086846e-05, + "learning_rate": 1.8464730290456433e-06, + "logits/chosen": 1.8149409294128418, + "logits/rejected": 4.784121513366699, + "logps/chosen": -467.8046875, + "logps/rejected": -872.73779296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.681539535522461, + "rewards/margins": 28.198171615600586, + "rewards/rejected": -36.87971496582031, + "step": 3219 + }, + { + "epoch": 2.0031104199066876, + "grad_norm": 0.043713755905628204, + "learning_rate": 1.8453204241585988e-06, + "logits/chosen": 0.8241747617721558, + "logits/rejected": 3.4915554523468018, + "logps/chosen": -591.9862670898438, + "logps/rejected": -1083.1734619140625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.339737892150879, + "rewards/margins": 31.408842086791992, + "rewards/rejected": -41.74858093261719, + "step": 3220 + }, + { + "epoch": 2.003732503888025, + "grad_norm": 1.5662541272831731e-06, + "learning_rate": 1.844167819271554e-06, + "logits/chosen": -3.3587021827697754, + "logits/rejected": 3.4004125595092773, + "logps/chosen": -293.3941650390625, + "logps/rejected": -941.4876708984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.155046463012695, + "rewards/margins": 30.374483108520508, + "rewards/rejected": -37.5295295715332, + "step": 3221 + }, + { + "epoch": 2.0043545878693623, + "grad_norm": 7.130699634552002, + "learning_rate": 1.8430152143845092e-06, + "logits/chosen": -1.5600241422653198, + "logits/rejected": 2.922963857650757, + "logps/chosen": -547.20849609375, + "logps/rejected": -1075.0660400390625, + "loss": 0.0517, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.9271821975708, + "rewards/margins": 28.489486694335938, + "rewards/rejected": -43.41667175292969, + "step": 3222 + }, + { + "epoch": 2.0049766718507, + "grad_norm": 1.1248069142766326e-09, + "learning_rate": 1.8418626094974645e-06, + "logits/chosen": -2.2901358604431152, + "logits/rejected": 4.997872829437256, + "logps/chosen": -412.37506103515625, + "logps/rejected": -1133.8994140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.321676254272461, + "rewards/margins": 33.20163345336914, + "rewards/rejected": -43.52330780029297, + "step": 3223 + }, + { + "epoch": 2.0055987558320374, + "grad_norm": 1.6254787169600604e-06, + "learning_rate": 1.8407100046104199e-06, + "logits/chosen": 0.942064642906189, + "logits/rejected": 4.401899814605713, + "logps/chosen": -590.219970703125, + "logps/rejected": -1120.31787109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.195745468139648, + "rewards/margins": 31.794509887695312, + "rewards/rejected": -42.990257263183594, + "step": 3224 + }, + { + "epoch": 2.0062208398133747, + "grad_norm": 2.278909960296005e-05, + "learning_rate": 1.8395573997233751e-06, + "logits/chosen": -2.379765033721924, + "logits/rejected": 3.167349338531494, + "logps/chosen": -284.0023193359375, + "logps/rejected": -844.4675903320312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.441474437713623, + "rewards/margins": 31.908851623535156, + "rewards/rejected": -35.35032653808594, + "step": 3225 + }, + { + "epoch": 2.006842923794712, + "grad_norm": 1.1007481813430786, + "learning_rate": 1.8384047948363303e-06, + "logits/chosen": -0.15077215433120728, + "logits/rejected": 2.8373799324035645, + "logps/chosen": -441.18719482421875, + "logps/rejected": -808.1421508789062, + "loss": 0.0091, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.933378219604492, + "rewards/margins": 23.44791030883789, + "rewards/rejected": -35.381290435791016, + "step": 3226 + }, + { + "epoch": 2.00746500777605, + "grad_norm": 6.995454100433562e-07, + "learning_rate": 1.8372521899492856e-06, + "logits/chosen": -0.511811375617981, + "logits/rejected": 3.8422083854675293, + "logps/chosen": -386.70245361328125, + "logps/rejected": -960.3453369140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.07375955581665, + "rewards/margins": 31.684070587158203, + "rewards/rejected": -37.75782775878906, + "step": 3227 + }, + { + "epoch": 2.008087091757387, + "grad_norm": 1.6627997589946375e-11, + "learning_rate": 1.836099585062241e-06, + "logits/chosen": 0.5001524686813354, + "logits/rejected": 1.829136848449707, + "logps/chosen": -693.051025390625, + "logps/rejected": -1198.63330078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.901176452636719, + "rewards/margins": 39.069000244140625, + "rewards/rejected": -52.97017288208008, + "step": 3228 + }, + { + "epoch": 2.0087091757387245, + "grad_norm": 0.2621017396450043, + "learning_rate": 1.834946980175196e-06, + "logits/chosen": -0.9760973453521729, + "logits/rejected": 3.2193968296051025, + "logps/chosen": -499.0474853515625, + "logps/rejected": -1006.688232421875, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.02197265625, + "rewards/margins": 30.229379653930664, + "rewards/rejected": -40.25135040283203, + "step": 3229 + }, + { + "epoch": 2.0093312597200623, + "grad_norm": 0.003114903811365366, + "learning_rate": 1.8337943752881512e-06, + "logits/chosen": -2.958038568496704, + "logits/rejected": 2.047264575958252, + "logps/chosen": -356.1661071777344, + "logps/rejected": -884.7626953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.416285991668701, + "rewards/margins": 27.496191024780273, + "rewards/rejected": -34.912479400634766, + "step": 3230 + }, + { + "epoch": 2.0099533437013997, + "grad_norm": 0.003051260020583868, + "learning_rate": 1.8326417704011065e-06, + "logits/chosen": -0.30247652530670166, + "logits/rejected": 2.661752462387085, + "logps/chosen": -552.0884399414062, + "logps/rejected": -1117.947021484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.436690330505371, + "rewards/margins": 34.702999114990234, + "rewards/rejected": -47.13969039916992, + "step": 3231 + }, + { + "epoch": 2.010575427682737, + "grad_norm": 0.03463459387421608, + "learning_rate": 1.831489165514062e-06, + "logits/chosen": 0.2040191888809204, + "logits/rejected": 3.293041706085205, + "logps/chosen": -549.462890625, + "logps/rejected": -954.4931640625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.499516487121582, + "rewards/margins": 24.763486862182617, + "rewards/rejected": -35.26300811767578, + "step": 3232 + }, + { + "epoch": 2.011197511664075, + "grad_norm": 5.8191501011606306e-05, + "learning_rate": 1.8303365606270171e-06, + "logits/chosen": 0.7268747091293335, + "logits/rejected": 0.9204151034355164, + "logps/chosen": -638.6798095703125, + "logps/rejected": -936.39453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.820808410644531, + "rewards/margins": 26.461711883544922, + "rewards/rejected": -39.28252029418945, + "step": 3233 + }, + { + "epoch": 2.011819595645412, + "grad_norm": 1.3442055246670748e-09, + "learning_rate": 1.8291839557399723e-06, + "logits/chosen": -2.53243088722229, + "logits/rejected": 2.388972282409668, + "logps/chosen": -442.279541015625, + "logps/rejected": -1245.7723388671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.95213508605957, + "rewards/margins": 50.84278106689453, + "rewards/rejected": -62.794918060302734, + "step": 3234 + }, + { + "epoch": 2.0124416796267495, + "grad_norm": 20.80923080444336, + "learning_rate": 1.8280313508529276e-06, + "logits/chosen": 1.5385990142822266, + "logits/rejected": 4.438154220581055, + "logps/chosen": -615.07470703125, + "logps/rejected": -1099.8695068359375, + "loss": 0.1284, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.9859466552734375, + "rewards/margins": 32.458778381347656, + "rewards/rejected": -40.44472122192383, + "step": 3235 + }, + { + "epoch": 2.0130637636080873, + "grad_norm": 0.001437014085240662, + "learning_rate": 1.826878745965883e-06, + "logits/chosen": 1.5975340604782104, + "logits/rejected": 2.7934255599975586, + "logps/chosen": -651.9356079101562, + "logps/rejected": -1084.415771484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.168201446533203, + "rewards/margins": 37.92890167236328, + "rewards/rejected": -48.097103118896484, + "step": 3236 + }, + { + "epoch": 2.0136858475894246, + "grad_norm": 0.022464267909526825, + "learning_rate": 1.8257261410788382e-06, + "logits/chosen": -0.4906814992427826, + "logits/rejected": 4.151356220245361, + "logps/chosen": -384.986083984375, + "logps/rejected": -939.2615966796875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.647461891174316, + "rewards/margins": 29.363422393798828, + "rewards/rejected": -40.01088333129883, + "step": 3237 + }, + { + "epoch": 2.014307931570762, + "grad_norm": 0.0007543100509792566, + "learning_rate": 1.8245735361917935e-06, + "logits/chosen": 0.009873226284980774, + "logits/rejected": 2.72328519821167, + "logps/chosen": -531.1657104492188, + "logps/rejected": -1049.25439453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.855783462524414, + "rewards/margins": 33.60731887817383, + "rewards/rejected": -45.46310043334961, + "step": 3238 + }, + { + "epoch": 2.0149300155520997, + "grad_norm": 0.1094256192445755, + "learning_rate": 1.8234209313047487e-06, + "logits/chosen": 0.3360966444015503, + "logits/rejected": 4.504080772399902, + "logps/chosen": -476.1607666015625, + "logps/rejected": -967.1328125, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.062921047210693, + "rewards/margins": 27.761829376220703, + "rewards/rejected": -34.82475280761719, + "step": 3239 + }, + { + "epoch": 2.015552099533437, + "grad_norm": 0.0464254654943943, + "learning_rate": 1.8222683264177041e-06, + "logits/chosen": 1.2491750717163086, + "logits/rejected": 3.8947272300720215, + "logps/chosen": -432.4212646484375, + "logps/rejected": -855.244873046875, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.43021011352539, + "rewards/margins": 22.83322525024414, + "rewards/rejected": -33.26343536376953, + "step": 3240 + }, + { + "epoch": 2.0161741835147744, + "grad_norm": 21.57063865661621, + "learning_rate": 1.8211157215306593e-06, + "logits/chosen": 1.8107213973999023, + "logits/rejected": 4.095378398895264, + "logps/chosen": -570.1275634765625, + "logps/rejected": -953.5574951171875, + "loss": 0.1983, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.565531730651855, + "rewards/margins": 23.652999877929688, + "rewards/rejected": -33.21853256225586, + "step": 3241 + }, + { + "epoch": 2.016796267496112, + "grad_norm": 8.953101314546075e-06, + "learning_rate": 1.8199631166436146e-06, + "logits/chosen": 3.1453516483306885, + "logits/rejected": 3.4556429386138916, + "logps/chosen": -581.106689453125, + "logps/rejected": -922.162841796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.440402030944824, + "rewards/margins": 27.36144256591797, + "rewards/rejected": -39.80184555053711, + "step": 3242 + }, + { + "epoch": 2.0174183514774495, + "grad_norm": 0.07161377370357513, + "learning_rate": 1.81881051175657e-06, + "logits/chosen": 1.7500510215759277, + "logits/rejected": 3.6969451904296875, + "logps/chosen": -632.09765625, + "logps/rejected": -1028.01953125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.109907150268555, + "rewards/margins": 28.601673126220703, + "rewards/rejected": -40.71158218383789, + "step": 3243 + }, + { + "epoch": 2.018040435458787, + "grad_norm": 9.346132173959631e-06, + "learning_rate": 1.8176579068695252e-06, + "logits/chosen": -0.8983386754989624, + "logits/rejected": 2.404198169708252, + "logps/chosen": -537.7662353515625, + "logps/rejected": -1138.3782958984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.997988700866699, + "rewards/margins": 38.091087341308594, + "rewards/rejected": -46.089073181152344, + "step": 3244 + }, + { + "epoch": 2.018662519440124, + "grad_norm": 0.00020862782548647374, + "learning_rate": 1.8165053019824805e-06, + "logits/chosen": 0.0696406364440918, + "logits/rejected": 3.6975560188293457, + "logps/chosen": -413.0459289550781, + "logps/rejected": -961.5831909179688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.714500904083252, + "rewards/margins": 35.98335266113281, + "rewards/rejected": -43.69785690307617, + "step": 3245 + }, + { + "epoch": 2.019284603421462, + "grad_norm": 0.004104145802557468, + "learning_rate": 1.8153526970954357e-06, + "logits/chosen": -0.13870498538017273, + "logits/rejected": 4.5534257888793945, + "logps/chosen": -294.36688232421875, + "logps/rejected": -854.3531494140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.45639181137085, + "rewards/margins": 26.36554718017578, + "rewards/rejected": -32.821937561035156, + "step": 3246 + }, + { + "epoch": 2.0199066874027993, + "grad_norm": 3.7008983326813905e-06, + "learning_rate": 1.8142000922083911e-06, + "logits/chosen": 0.7690372467041016, + "logits/rejected": 2.658047914505005, + "logps/chosen": -630.7483520507812, + "logps/rejected": -1073.966064453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.671428680419922, + "rewards/margins": 31.08428955078125, + "rewards/rejected": -42.75571823120117, + "step": 3247 + }, + { + "epoch": 2.0205287713841367, + "grad_norm": 0.07437684386968613, + "learning_rate": 1.8130474873213463e-06, + "logits/chosen": 0.5262776613235474, + "logits/rejected": 2.2410149574279785, + "logps/chosen": -577.2639770507812, + "logps/rejected": -973.701416015625, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.476058959960938, + "rewards/margins": 30.7393798828125, + "rewards/rejected": -41.21543884277344, + "step": 3248 + }, + { + "epoch": 2.0211508553654745, + "grad_norm": 1.5357197523117065, + "learning_rate": 1.8118948824343016e-06, + "logits/chosen": 1.70393705368042, + "logits/rejected": 3.8450446128845215, + "logps/chosen": -481.91070556640625, + "logps/rejected": -1086.33935546875, + "loss": 0.0091, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.68971061706543, + "rewards/margins": 40.52362060546875, + "rewards/rejected": -49.21332931518555, + "step": 3249 + }, + { + "epoch": 2.021772939346812, + "grad_norm": 5.022254834230466e-10, + "learning_rate": 1.810742277547257e-06, + "logits/chosen": -2.6166677474975586, + "logits/rejected": 2.7093167304992676, + "logps/chosen": -303.3418273925781, + "logps/rejected": -1076.861083984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.309881210327148, + "rewards/margins": 42.87946319580078, + "rewards/rejected": -49.1893424987793, + "step": 3250 + }, + { + "epoch": 2.022395023328149, + "grad_norm": 5.978255271911621, + "learning_rate": 1.8095896726602122e-06, + "logits/chosen": 0.09582383185625076, + "logits/rejected": 1.5433762073516846, + "logps/chosen": -550.7335205078125, + "logps/rejected": -959.3916625976562, + "loss": 0.02, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.984166145324707, + "rewards/margins": 23.924039840698242, + "rewards/rejected": -34.908203125, + "step": 3251 + }, + { + "epoch": 2.023017107309487, + "grad_norm": 0.005254325456917286, + "learning_rate": 1.8084370677731675e-06, + "logits/chosen": -0.8678733706474304, + "logits/rejected": 2.8479673862457275, + "logps/chosen": -551.5311279296875, + "logps/rejected": -1241.876708984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.364095687866211, + "rewards/margins": 42.996192932128906, + "rewards/rejected": -56.360286712646484, + "step": 3252 + }, + { + "epoch": 2.0236391912908243, + "grad_norm": 0.026929447427392006, + "learning_rate": 1.8072844628861227e-06, + "logits/chosen": -1.9292072057724, + "logits/rejected": 2.830716848373413, + "logps/chosen": -461.7641906738281, + "logps/rejected": -977.4414672851562, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.050446510314941, + "rewards/margins": 23.504653930664062, + "rewards/rejected": -30.55510139465332, + "step": 3253 + }, + { + "epoch": 2.0242612752721616, + "grad_norm": 35.80807876586914, + "learning_rate": 1.8061318579990781e-06, + "logits/chosen": 0.03676527738571167, + "logits/rejected": 2.638430595397949, + "logps/chosen": -380.2243347167969, + "logps/rejected": -805.8291015625, + "loss": 0.3045, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.26235580444336, + "rewards/margins": 25.53285026550293, + "rewards/rejected": -38.795204162597656, + "step": 3254 + }, + { + "epoch": 2.0248833592534994, + "grad_norm": 0.05008547008037567, + "learning_rate": 1.8049792531120333e-06, + "logits/chosen": -0.5707646608352661, + "logits/rejected": 3.5213394165039062, + "logps/chosen": -453.5271301269531, + "logps/rejected": -1093.7734375, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.335705757141113, + "rewards/margins": 42.951393127441406, + "rewards/rejected": -48.2870979309082, + "step": 3255 + }, + { + "epoch": 2.0255054432348367, + "grad_norm": 16.085357666015625, + "learning_rate": 1.8038266482249886e-06, + "logits/chosen": 2.096897602081299, + "logits/rejected": 2.4062466621398926, + "logps/chosen": -702.8720092773438, + "logps/rejected": -930.53564453125, + "loss": 0.091, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.675012588500977, + "rewards/margins": 20.07527732849121, + "rewards/rejected": -32.75028991699219, + "step": 3256 + }, + { + "epoch": 2.026127527216174, + "grad_norm": 0.00011278959573246539, + "learning_rate": 1.8026740433379438e-06, + "logits/chosen": 2.4136128425598145, + "logits/rejected": 3.5406317710876465, + "logps/chosen": -606.5416259765625, + "logps/rejected": -968.1136474609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.13314437866211, + "rewards/margins": 29.38345718383789, + "rewards/rejected": -42.5166015625, + "step": 3257 + }, + { + "epoch": 2.026749611197512, + "grad_norm": 0.0005387061974033713, + "learning_rate": 1.8015214384508992e-06, + "logits/chosen": -0.6184181571006775, + "logits/rejected": 2.548027992248535, + "logps/chosen": -573.9535522460938, + "logps/rejected": -1113.246337890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.945446014404297, + "rewards/margins": 35.821556091308594, + "rewards/rejected": -50.76700210571289, + "step": 3258 + }, + { + "epoch": 2.027371695178849, + "grad_norm": 0.00011549627379281446, + "learning_rate": 1.8003688335638544e-06, + "logits/chosen": 3.146450996398926, + "logits/rejected": 3.83984375, + "logps/chosen": -783.1937255859375, + "logps/rejected": -1222.3145751953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.651394844055176, + "rewards/margins": 37.6688232421875, + "rewards/rejected": -53.320220947265625, + "step": 3259 + }, + { + "epoch": 2.0279937791601865, + "grad_norm": 0.006031322292983532, + "learning_rate": 1.7992162286768097e-06, + "logits/chosen": -1.1792547702789307, + "logits/rejected": 0.5242654085159302, + "logps/chosen": -531.763916015625, + "logps/rejected": -996.9135131835938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.301654815673828, + "rewards/margins": 33.844364166259766, + "rewards/rejected": -44.146018981933594, + "step": 3260 + }, + { + "epoch": 2.0286158631415243, + "grad_norm": 0.005457498598843813, + "learning_rate": 1.7980636237897651e-06, + "logits/chosen": -0.2665392756462097, + "logits/rejected": 4.692824363708496, + "logps/chosen": -368.7724914550781, + "logps/rejected": -965.8126220703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.76845645904541, + "rewards/margins": 28.322364807128906, + "rewards/rejected": -36.0908203125, + "step": 3261 + }, + { + "epoch": 2.0292379471228617, + "grad_norm": 1.9818975488306023e-05, + "learning_rate": 1.7969110189027203e-06, + "logits/chosen": 0.24156644940376282, + "logits/rejected": 2.885503053665161, + "logps/chosen": -686.6699829101562, + "logps/rejected": -1015.3580322265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.752955436706543, + "rewards/margins": 30.202939987182617, + "rewards/rejected": -42.95589828491211, + "step": 3262 + }, + { + "epoch": 2.029860031104199, + "grad_norm": 4.516140563737281e-07, + "learning_rate": 1.7957584140156756e-06, + "logits/chosen": 1.4977079629898071, + "logits/rejected": 4.380037307739258, + "logps/chosen": -616.4777221679688, + "logps/rejected": -1208.46826171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.326394081115723, + "rewards/margins": 39.57026672363281, + "rewards/rejected": -52.89665603637695, + "step": 3263 + }, + { + "epoch": 2.0304821150855363, + "grad_norm": 7.894611826486653e-07, + "learning_rate": 1.7946058091286308e-06, + "logits/chosen": 1.232468843460083, + "logits/rejected": 3.525763511657715, + "logps/chosen": -512.837646484375, + "logps/rejected": -1061.3662109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.058338165283203, + "rewards/margins": 38.82855224609375, + "rewards/rejected": -47.88689422607422, + "step": 3264 + }, + { + "epoch": 2.031104199066874, + "grad_norm": 0.46222686767578125, + "learning_rate": 1.7934532042415862e-06, + "logits/chosen": 0.8871013522148132, + "logits/rejected": 3.695953607559204, + "logps/chosen": -575.14892578125, + "logps/rejected": -972.8055419921875, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.340843200683594, + "rewards/margins": 26.107912063598633, + "rewards/rejected": -36.448753356933594, + "step": 3265 + }, + { + "epoch": 2.0317262830482115, + "grad_norm": 0.00018285130499862134, + "learning_rate": 1.7923005993545414e-06, + "logits/chosen": 0.45569974184036255, + "logits/rejected": 2.566760301589966, + "logps/chosen": -628.85400390625, + "logps/rejected": -1045.263671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.879467010498047, + "rewards/margins": 29.43852996826172, + "rewards/rejected": -41.317996978759766, + "step": 3266 + }, + { + "epoch": 2.032348367029549, + "grad_norm": 4.834855644730851e-05, + "learning_rate": 1.7911479944674967e-06, + "logits/chosen": 1.829100489616394, + "logits/rejected": 3.4712557792663574, + "logps/chosen": -811.4913330078125, + "logps/rejected": -1266.06103515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.349407196044922, + "rewards/margins": 36.72393798828125, + "rewards/rejected": -52.07334518432617, + "step": 3267 + }, + { + "epoch": 2.0329704510108866, + "grad_norm": 11.432663917541504, + "learning_rate": 1.789995389580452e-06, + "logits/chosen": -2.6283178329467773, + "logits/rejected": 2.5972487926483154, + "logps/chosen": -460.37530517578125, + "logps/rejected": -983.4398803710938, + "loss": 0.0675, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.121039390563965, + "rewards/margins": 27.89306640625, + "rewards/rejected": -38.01410675048828, + "step": 3268 + }, + { + "epoch": 2.033592534992224, + "grad_norm": 5.5460273870266974e-05, + "learning_rate": 1.7888427846934073e-06, + "logits/chosen": 0.5614759922027588, + "logits/rejected": 2.7525126934051514, + "logps/chosen": -524.7399291992188, + "logps/rejected": -1027.02294921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.975117683410645, + "rewards/margins": 32.141090393066406, + "rewards/rejected": -41.1162109375, + "step": 3269 + }, + { + "epoch": 2.0342146189735613, + "grad_norm": 0.0012390101328492165, + "learning_rate": 1.7876901798063626e-06, + "logits/chosen": 0.3229471743106842, + "logits/rejected": 2.942484140396118, + "logps/chosen": -524.5540771484375, + "logps/rejected": -1004.2750244140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.311838626861572, + "rewards/margins": 36.90262985229492, + "rewards/rejected": -42.21446990966797, + "step": 3270 + }, + { + "epoch": 2.034836702954899, + "grad_norm": 0.002542484551668167, + "learning_rate": 1.7865375749193178e-06, + "logits/chosen": 0.5060664415359497, + "logits/rejected": 4.053060531616211, + "logps/chosen": -528.406005859375, + "logps/rejected": -1041.412109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.696809768676758, + "rewards/margins": 26.39920425415039, + "rewards/rejected": -35.09601593017578, + "step": 3271 + }, + { + "epoch": 2.0354587869362364, + "grad_norm": 0.0006695652264170349, + "learning_rate": 1.7853849700322732e-06, + "logits/chosen": 1.4988611936569214, + "logits/rejected": 1.3970237970352173, + "logps/chosen": -627.5516967773438, + "logps/rejected": -843.8731689453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.545269012451172, + "rewards/margins": 24.648292541503906, + "rewards/rejected": -37.19355773925781, + "step": 3272 + }, + { + "epoch": 2.0360808709175737, + "grad_norm": 0.011035816743969917, + "learning_rate": 1.7842323651452284e-06, + "logits/chosen": -2.0264055728912354, + "logits/rejected": 2.839123487472534, + "logps/chosen": -318.1818542480469, + "logps/rejected": -856.6058959960938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.792394161224365, + "rewards/margins": 23.563783645629883, + "rewards/rejected": -30.356178283691406, + "step": 3273 + }, + { + "epoch": 2.0367029548989115, + "grad_norm": 0.00011552633077371866, + "learning_rate": 1.7830797602581837e-06, + "logits/chosen": 0.0791429877281189, + "logits/rejected": 3.939262866973877, + "logps/chosen": -413.80029296875, + "logps/rejected": -987.649169921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.731544017791748, + "rewards/margins": 38.70989990234375, + "rewards/rejected": -44.441444396972656, + "step": 3274 + }, + { + "epoch": 2.037325038880249, + "grad_norm": 0.19351507723331451, + "learning_rate": 1.7819271553711389e-06, + "logits/chosen": 0.4769981801509857, + "logits/rejected": 2.4208312034606934, + "logps/chosen": -651.798095703125, + "logps/rejected": -1105.50830078125, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.518138885498047, + "rewards/margins": 32.211429595947266, + "rewards/rejected": -44.72956848144531, + "step": 3275 + }, + { + "epoch": 2.037947122861586, + "grad_norm": 1.6189000362487604e-08, + "learning_rate": 1.7807745504840943e-06, + "logits/chosen": 0.6967576146125793, + "logits/rejected": 4.596070289611816, + "logps/chosen": -495.2347412109375, + "logps/rejected": -1113.9619140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.830886840820312, + "rewards/margins": 35.10674285888672, + "rewards/rejected": -43.93762969970703, + "step": 3276 + }, + { + "epoch": 2.038569206842924, + "grad_norm": 3.4301083360332996e-05, + "learning_rate": 1.7796219455970496e-06, + "logits/chosen": 0.39109957218170166, + "logits/rejected": 3.4164555072784424, + "logps/chosen": -509.5931091308594, + "logps/rejected": -1002.6499633789062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.291837692260742, + "rewards/margins": 33.81525421142578, + "rewards/rejected": -46.107093811035156, + "step": 3277 + }, + { + "epoch": 2.0391912908242613, + "grad_norm": 0.714177131652832, + "learning_rate": 1.7784693407100048e-06, + "logits/chosen": 2.7856783866882324, + "logits/rejected": 3.950831413269043, + "logps/chosen": -630.0526123046875, + "logps/rejected": -963.7937622070312, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.394908905029297, + "rewards/margins": 25.152563095092773, + "rewards/rejected": -34.5474739074707, + "step": 3278 + }, + { + "epoch": 2.0398133748055987, + "grad_norm": 0.05921145901083946, + "learning_rate": 1.77731673582296e-06, + "logits/chosen": -0.07561540603637695, + "logits/rejected": 1.551999807357788, + "logps/chosen": -601.9862060546875, + "logps/rejected": -1023.5673828125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.382827758789062, + "rewards/margins": 30.679702758789062, + "rewards/rejected": -45.06253433227539, + "step": 3279 + }, + { + "epoch": 2.0404354587869364, + "grad_norm": 0.019323257729411125, + "learning_rate": 1.7761641309359154e-06, + "logits/chosen": -1.1875865459442139, + "logits/rejected": 2.3914713859558105, + "logps/chosen": -543.6219482421875, + "logps/rejected": -960.1560668945312, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.5393648147583, + "rewards/margins": 24.20549774169922, + "rewards/rejected": -32.74486541748047, + "step": 3280 + }, + { + "epoch": 2.041057542768274, + "grad_norm": 0.0016454965807497501, + "learning_rate": 1.7750115260488707e-06, + "logits/chosen": 3.365413188934326, + "logits/rejected": 4.608278274536133, + "logps/chosen": -793.6561889648438, + "logps/rejected": -1262.06640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.539536476135254, + "rewards/margins": 35.477149963378906, + "rewards/rejected": -48.016685485839844, + "step": 3281 + }, + { + "epoch": 2.041679626749611, + "grad_norm": 0.00505446782335639, + "learning_rate": 1.7738589211618259e-06, + "logits/chosen": 1.5577607154846191, + "logits/rejected": 1.9160778522491455, + "logps/chosen": -656.432861328125, + "logps/rejected": -947.4666748046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.272841453552246, + "rewards/margins": 25.096763610839844, + "rewards/rejected": -34.369606018066406, + "step": 3282 + }, + { + "epoch": 2.0423017107309485, + "grad_norm": 4.4753107886208454e-07, + "learning_rate": 1.7727063162747813e-06, + "logits/chosen": 0.7870282530784607, + "logits/rejected": 4.364411354064941, + "logps/chosen": -598.6383056640625, + "logps/rejected": -1166.548828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.089197158813477, + "rewards/margins": 34.72199249267578, + "rewards/rejected": -47.811187744140625, + "step": 3283 + }, + { + "epoch": 2.0429237947122862, + "grad_norm": 0.0002891090407501906, + "learning_rate": 1.7715537113877366e-06, + "logits/chosen": 0.13575269281864166, + "logits/rejected": 2.0142300128936768, + "logps/chosen": -677.8521728515625, + "logps/rejected": -1039.75830078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.51553726196289, + "rewards/margins": 25.58846092224121, + "rewards/rejected": -41.10399627685547, + "step": 3284 + }, + { + "epoch": 2.0435458786936236, + "grad_norm": 1.2564136397941184e-07, + "learning_rate": 1.7704011065006918e-06, + "logits/chosen": -3.027099847793579, + "logits/rejected": 4.060395240783691, + "logps/chosen": -396.2166748046875, + "logps/rejected": -1203.6392822265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.413979530334473, + "rewards/margins": 37.810646057128906, + "rewards/rejected": -45.22462463378906, + "step": 3285 + }, + { + "epoch": 2.044167962674961, + "grad_norm": 29.117576599121094, + "learning_rate": 1.769248501613647e-06, + "logits/chosen": 0.6583254933357239, + "logits/rejected": 2.5260579586029053, + "logps/chosen": -604.4136962890625, + "logps/rejected": -879.0269165039062, + "loss": 0.3192, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.834407806396484, + "rewards/margins": 22.25692367553711, + "rewards/rejected": -31.091331481933594, + "step": 3286 + }, + { + "epoch": 2.0447900466562987, + "grad_norm": 0.0004560309462249279, + "learning_rate": 1.7680958967266024e-06, + "logits/chosen": 0.19445538520812988, + "logits/rejected": 2.462914228439331, + "logps/chosen": -562.7223510742188, + "logps/rejected": -1123.900634765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.67230224609375, + "rewards/margins": 36.951412200927734, + "rewards/rejected": -49.62371826171875, + "step": 3287 + }, + { + "epoch": 2.045412130637636, + "grad_norm": 0.005535934120416641, + "learning_rate": 1.7669432918395577e-06, + "logits/chosen": -0.4256298840045929, + "logits/rejected": 2.6482839584350586, + "logps/chosen": -507.5963134765625, + "logps/rejected": -932.339599609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.090707778930664, + "rewards/margins": 28.398929595947266, + "rewards/rejected": -38.4896354675293, + "step": 3288 + }, + { + "epoch": 2.0460342146189734, + "grad_norm": 1.1572271585464478, + "learning_rate": 1.7657906869525129e-06, + "logits/chosen": 0.5644538998603821, + "logits/rejected": 1.5637774467468262, + "logps/chosen": -629.345458984375, + "logps/rejected": -1000.5640869140625, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.741057395935059, + "rewards/margins": 32.58911895751953, + "rewards/rejected": -46.330177307128906, + "step": 3289 + }, + { + "epoch": 2.046656298600311, + "grad_norm": 1.0075103044509888, + "learning_rate": 1.7646380820654681e-06, + "logits/chosen": 1.313632845878601, + "logits/rejected": 3.4829115867614746, + "logps/chosen": -548.7958984375, + "logps/rejected": -918.281005859375, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.908761978149414, + "rewards/margins": 25.477642059326172, + "rewards/rejected": -35.38640594482422, + "step": 3290 + }, + { + "epoch": 2.0472783825816485, + "grad_norm": 3.8606403904850595e-06, + "learning_rate": 1.7634854771784235e-06, + "logits/chosen": -1.0874159336090088, + "logits/rejected": 3.1560041904449463, + "logps/chosen": -512.3287963867188, + "logps/rejected": -1114.589599609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.583832740783691, + "rewards/margins": 35.8284912109375, + "rewards/rejected": -45.412322998046875, + "step": 3291 + }, + { + "epoch": 2.047900466562986, + "grad_norm": 1.0839836761533661e-07, + "learning_rate": 1.7623328722913788e-06, + "logits/chosen": 0.6895605325698853, + "logits/rejected": 2.0665345191955566, + "logps/chosen": -607.904296875, + "logps/rejected": -1044.9071044921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.638944625854492, + "rewards/margins": 32.77497100830078, + "rewards/rejected": -46.413917541503906, + "step": 3292 + }, + { + "epoch": 2.0485225505443236, + "grad_norm": 0.1251257359981537, + "learning_rate": 1.761180267404334e-06, + "logits/chosen": -0.2770364284515381, + "logits/rejected": 2.772951364517212, + "logps/chosen": -615.9885864257812, + "logps/rejected": -1054.5169677734375, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.234400749206543, + "rewards/margins": 32.018253326416016, + "rewards/rejected": -44.252655029296875, + "step": 3293 + }, + { + "epoch": 2.049144634525661, + "grad_norm": 34.915321350097656, + "learning_rate": 1.7600276625172894e-06, + "logits/chosen": -1.7377254962921143, + "logits/rejected": 0.7142691612243652, + "logps/chosen": -563.9683837890625, + "logps/rejected": -1082.34619140625, + "loss": 0.2039, + "rewards/accuracies": 0.875, + "rewards/chosen": -14.72260570526123, + "rewards/margins": 31.14119529724121, + "rewards/rejected": -45.863800048828125, + "step": 3294 + }, + { + "epoch": 2.0497667185069983, + "grad_norm": 5.692875862121582, + "learning_rate": 1.7588750576302447e-06, + "logits/chosen": -1.636199712753296, + "logits/rejected": 2.049520254135132, + "logps/chosen": -406.2523193359375, + "logps/rejected": -944.5439453125, + "loss": 0.0803, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.766059875488281, + "rewards/margins": 29.158157348632812, + "rewards/rejected": -40.924217224121094, + "step": 3295 + }, + { + "epoch": 2.050388802488336, + "grad_norm": 5.243829946266487e-05, + "learning_rate": 1.7577224527431997e-06, + "logits/chosen": 3.2839221954345703, + "logits/rejected": 4.257046699523926, + "logps/chosen": -686.9495849609375, + "logps/rejected": -1010.24462890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.418400764465332, + "rewards/margins": 29.747093200683594, + "rewards/rejected": -41.16549301147461, + "step": 3296 + }, + { + "epoch": 2.0510108864696734, + "grad_norm": 0.005440168082714081, + "learning_rate": 1.756569847856155e-06, + "logits/chosen": 1.3700108528137207, + "logits/rejected": 2.389298677444458, + "logps/chosen": -632.6429443359375, + "logps/rejected": -1019.9666748046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.622817993164062, + "rewards/margins": 30.42552947998047, + "rewards/rejected": -42.04834747314453, + "step": 3297 + }, + { + "epoch": 2.051632970451011, + "grad_norm": 28.08281707763672, + "learning_rate": 1.7554172429691101e-06, + "logits/chosen": -0.11010386794805527, + "logits/rejected": 3.2671074867248535, + "logps/chosen": -394.72784423828125, + "logps/rejected": -937.2503662109375, + "loss": 0.5615, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.893918991088867, + "rewards/margins": 30.8527774810791, + "rewards/rejected": -40.74669647216797, + "step": 3298 + }, + { + "epoch": 2.0522550544323486, + "grad_norm": 6.840485002612695e-05, + "learning_rate": 1.7542646380820656e-06, + "logits/chosen": 2.731478214263916, + "logits/rejected": 4.624332427978516, + "logps/chosen": -590.8049926757812, + "logps/rejected": -916.9151000976562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.759002685546875, + "rewards/margins": 25.12192153930664, + "rewards/rejected": -34.88092041015625, + "step": 3299 + }, + { + "epoch": 2.052877138413686, + "grad_norm": 0.005580862518399954, + "learning_rate": 1.7531120331950208e-06, + "logits/chosen": -1.3007276058197021, + "logits/rejected": 2.0316357612609863, + "logps/chosen": -466.2745361328125, + "logps/rejected": -922.46044921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.656286239624023, + "rewards/margins": 25.383224487304688, + "rewards/rejected": -36.03950881958008, + "step": 3300 + }, + { + "epoch": 2.0534992223950232, + "grad_norm": 3.2947580814361572, + "learning_rate": 1.751959428307976e-06, + "logits/chosen": -1.4168139696121216, + "logits/rejected": 1.8039307594299316, + "logps/chosen": -466.0013732910156, + "logps/rejected": -957.6085205078125, + "loss": 0.0094, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.91163444519043, + "rewards/margins": 30.08930206298828, + "rewards/rejected": -39.000938415527344, + "step": 3301 + }, + { + "epoch": 2.054121306376361, + "grad_norm": 0.0028978725895285606, + "learning_rate": 1.7508068234209312e-06, + "logits/chosen": 0.9679357409477234, + "logits/rejected": 2.403862237930298, + "logps/chosen": -657.3065185546875, + "logps/rejected": -984.7806396484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.79493236541748, + "rewards/margins": 24.7813720703125, + "rewards/rejected": -34.57630157470703, + "step": 3302 + }, + { + "epoch": 2.0547433903576984, + "grad_norm": 0.0019351892406120896, + "learning_rate": 1.7496542185338867e-06, + "logits/chosen": 2.3574891090393066, + "logits/rejected": 4.697743892669678, + "logps/chosen": -553.681884765625, + "logps/rejected": -966.7857055664062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.650367736816406, + "rewards/margins": 25.174945831298828, + "rewards/rejected": -35.825313568115234, + "step": 3303 + }, + { + "epoch": 2.0553654743390357, + "grad_norm": 0.10551003366708755, + "learning_rate": 1.7485016136468419e-06, + "logits/chosen": 1.1634535789489746, + "logits/rejected": 2.7659077644348145, + "logps/chosen": -536.2604370117188, + "logps/rejected": -869.1211547851562, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.846975326538086, + "rewards/margins": 19.269046783447266, + "rewards/rejected": -27.11602210998535, + "step": 3304 + }, + { + "epoch": 2.055987558320373, + "grad_norm": 0.004820887930691242, + "learning_rate": 1.7473490087597971e-06, + "logits/chosen": 2.5007271766662598, + "logits/rejected": 2.895714282989502, + "logps/chosen": -633.784912109375, + "logps/rejected": -966.4913330078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.281123161315918, + "rewards/margins": 29.767860412597656, + "rewards/rejected": -36.04898452758789, + "step": 3305 + }, + { + "epoch": 2.056609642301711, + "grad_norm": 4.788481237483211e-05, + "learning_rate": 1.7461964038727526e-06, + "logits/chosen": -1.0666640996932983, + "logits/rejected": 2.2224929332733154, + "logps/chosen": -327.1462707519531, + "logps/rejected": -840.1002197265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.442052364349365, + "rewards/margins": 33.94697189331055, + "rewards/rejected": -39.38902282714844, + "step": 3306 + }, + { + "epoch": 2.057231726283048, + "grad_norm": 0.0034600174985826015, + "learning_rate": 1.7450437989857078e-06, + "logits/chosen": 0.42211639881134033, + "logits/rejected": 4.9143571853637695, + "logps/chosen": -468.89068603515625, + "logps/rejected": -1009.6427001953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.933347225189209, + "rewards/margins": 27.643991470336914, + "rewards/rejected": -35.57733917236328, + "step": 3307 + }, + { + "epoch": 2.0578538102643855, + "grad_norm": 0.00637791259214282, + "learning_rate": 1.743891194098663e-06, + "logits/chosen": 0.0010982751846313477, + "logits/rejected": 0.983103334903717, + "logps/chosen": -619.0855712890625, + "logps/rejected": -880.09619140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.18000316619873, + "rewards/margins": 24.80661964416504, + "rewards/rejected": -35.98662185668945, + "step": 3308 + }, + { + "epoch": 2.0584758942457233, + "grad_norm": 2.640011916810181e-05, + "learning_rate": 1.7427385892116182e-06, + "logits/chosen": 0.31347817182540894, + "logits/rejected": 3.793872117996216, + "logps/chosen": -429.73309326171875, + "logps/rejected": -894.3707885742188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.5145440101623535, + "rewards/margins": 27.934934616088867, + "rewards/rejected": -34.44947814941406, + "step": 3309 + }, + { + "epoch": 2.0590979782270606, + "grad_norm": 0.08523139357566833, + "learning_rate": 1.7415859843245737e-06, + "logits/chosen": 1.4979889392852783, + "logits/rejected": 2.8870656490325928, + "logps/chosen": -652.359375, + "logps/rejected": -1072.613037109375, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.882624626159668, + "rewards/margins": 29.20603370666504, + "rewards/rejected": -43.08865737915039, + "step": 3310 + }, + { + "epoch": 2.059720062208398, + "grad_norm": 1.8696160316467285, + "learning_rate": 1.7404333794375289e-06, + "logits/chosen": -1.357941746711731, + "logits/rejected": 5.028107166290283, + "logps/chosen": -366.6391296386719, + "logps/rejected": -1070.263916015625, + "loss": 0.0115, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.512500762939453, + "rewards/margins": 30.921201705932617, + "rewards/rejected": -42.4337043762207, + "step": 3311 + }, + { + "epoch": 2.0603421461897358, + "grad_norm": 4.39236537204124e-05, + "learning_rate": 1.7392807745504841e-06, + "logits/chosen": -0.5997156500816345, + "logits/rejected": 3.348984956741333, + "logps/chosen": -473.5180969238281, + "logps/rejected": -960.445556640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.715170860290527, + "rewards/margins": 27.744476318359375, + "rewards/rejected": -34.45964813232422, + "step": 3312 + }, + { + "epoch": 2.060964230171073, + "grad_norm": 10.655519485473633, + "learning_rate": 1.7381281696634396e-06, + "logits/chosen": 2.1897690296173096, + "logits/rejected": 4.3341875076293945, + "logps/chosen": -429.31158447265625, + "logps/rejected": -806.4674072265625, + "loss": 0.066, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.688857078552246, + "rewards/margins": 27.4539737701416, + "rewards/rejected": -32.1428337097168, + "step": 3313 + }, + { + "epoch": 2.0615863141524104, + "grad_norm": 17.07646369934082, + "learning_rate": 1.7369755647763948e-06, + "logits/chosen": 1.3907215595245361, + "logits/rejected": 2.856285333633423, + "logps/chosen": -531.179443359375, + "logps/rejected": -748.7376708984375, + "loss": 0.0693, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.567995071411133, + "rewards/margins": 19.450092315673828, + "rewards/rejected": -28.01808738708496, + "step": 3314 + }, + { + "epoch": 2.0622083981337482, + "grad_norm": 0.2608616054058075, + "learning_rate": 1.73582295988935e-06, + "logits/chosen": -3.7539005279541016, + "logits/rejected": 3.1033551692962646, + "logps/chosen": -227.97682189941406, + "logps/rejected": -840.460205078125, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.802971363067627, + "rewards/margins": 24.070194244384766, + "rewards/rejected": -27.873165130615234, + "step": 3315 + }, + { + "epoch": 2.0628304821150856, + "grad_norm": 0.00168923893943429, + "learning_rate": 1.7346703550023052e-06, + "logits/chosen": -0.24053560197353363, + "logits/rejected": 1.6045324802398682, + "logps/chosen": -540.8382568359375, + "logps/rejected": -906.5202026367188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.838738441467285, + "rewards/margins": 26.211814880371094, + "rewards/rejected": -37.05055236816406, + "step": 3316 + }, + { + "epoch": 2.063452566096423, + "grad_norm": 19.610145568847656, + "learning_rate": 1.7335177501152607e-06, + "logits/chosen": 1.4513070583343506, + "logits/rejected": 2.9208927154541016, + "logps/chosen": -546.4124145507812, + "logps/rejected": -900.282470703125, + "loss": 0.1005, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.553874969482422, + "rewards/margins": 29.052043914794922, + "rewards/rejected": -37.605918884277344, + "step": 3317 + }, + { + "epoch": 2.0640746500777607, + "grad_norm": 4.993141919840127e-05, + "learning_rate": 1.7323651452282159e-06, + "logits/chosen": -4.351912498474121, + "logits/rejected": 2.076814651489258, + "logps/chosen": -291.6829833984375, + "logps/rejected": -931.6229248046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.039106369018555, + "rewards/margins": 25.343379974365234, + "rewards/rejected": -30.38248634338379, + "step": 3318 + }, + { + "epoch": 2.064696734059098, + "grad_norm": 0.47496530413627625, + "learning_rate": 1.7312125403411711e-06, + "logits/chosen": -1.0925979614257812, + "logits/rejected": 2.5883841514587402, + "logps/chosen": -294.2505187988281, + "logps/rejected": -662.4988403320312, + "loss": 0.0119, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.046053886413574, + "rewards/margins": 20.85523223876953, + "rewards/rejected": -23.901287078857422, + "step": 3319 + }, + { + "epoch": 2.0653188180404354, + "grad_norm": 0.0007322177407331765, + "learning_rate": 1.7300599354541263e-06, + "logits/chosen": -1.0787445306777954, + "logits/rejected": 3.1141369342803955, + "logps/chosen": -682.3014526367188, + "logps/rejected": -1381.6591796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.946907043457031, + "rewards/margins": 40.99354934692383, + "rewards/rejected": -53.940452575683594, + "step": 3320 + }, + { + "epoch": 2.065940902021773, + "grad_norm": 0.7636861205101013, + "learning_rate": 1.7289073305670818e-06, + "logits/chosen": -2.0277695655822754, + "logits/rejected": 1.954400897026062, + "logps/chosen": -455.1981201171875, + "logps/rejected": -830.0636596679688, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.5624308586120605, + "rewards/margins": 14.610343933105469, + "rewards/rejected": -22.172775268554688, + "step": 3321 + }, + { + "epoch": 2.0665629860031105, + "grad_norm": 0.0060247681103646755, + "learning_rate": 1.727754725680037e-06, + "logits/chosen": 0.31880348920822144, + "logits/rejected": 2.8384175300598145, + "logps/chosen": -446.51092529296875, + "logps/rejected": -844.023681640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.652673721313477, + "rewards/margins": 25.704570770263672, + "rewards/rejected": -35.357242584228516, + "step": 3322 + }, + { + "epoch": 2.067185069984448, + "grad_norm": 0.015240863896906376, + "learning_rate": 1.7266021207929922e-06, + "logits/chosen": -1.3900054693222046, + "logits/rejected": 2.368161916732788, + "logps/chosen": -461.9625244140625, + "logps/rejected": -941.9267578125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.914365768432617, + "rewards/margins": 31.33608055114746, + "rewards/rejected": -38.25044631958008, + "step": 3323 + }, + { + "epoch": 2.067807153965785, + "grad_norm": 7.038326543806761e-07, + "learning_rate": 1.7254495159059477e-06, + "logits/chosen": -1.3869534730911255, + "logits/rejected": 2.0281357765197754, + "logps/chosen": -487.2365417480469, + "logps/rejected": -1085.301513671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.76966381072998, + "rewards/margins": 36.14636993408203, + "rewards/rejected": -44.91603469848633, + "step": 3324 + }, + { + "epoch": 2.068429237947123, + "grad_norm": 0.047666098922491074, + "learning_rate": 1.7242969110189029e-06, + "logits/chosen": 2.2894299030303955, + "logits/rejected": 3.7836549282073975, + "logps/chosen": -689.18798828125, + "logps/rejected": -1073.497802734375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.957679748535156, + "rewards/margins": 26.87308120727539, + "rewards/rejected": -40.83075714111328, + "step": 3325 + }, + { + "epoch": 2.0690513219284603, + "grad_norm": 1.717453734784158e-08, + "learning_rate": 1.7231443061318581e-06, + "logits/chosen": 1.9342631101608276, + "logits/rejected": 2.905841588973999, + "logps/chosen": -699.0042724609375, + "logps/rejected": -1064.35498046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.291414260864258, + "rewards/margins": 30.3492488861084, + "rewards/rejected": -41.640663146972656, + "step": 3326 + }, + { + "epoch": 2.0696734059097976, + "grad_norm": 0.1288936734199524, + "learning_rate": 1.7219917012448133e-06, + "logits/chosen": 3.1761245727539062, + "logits/rejected": 3.488016128540039, + "logps/chosen": -561.826904296875, + "logps/rejected": -791.1370849609375, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.260245323181152, + "rewards/margins": 22.008047103881836, + "rewards/rejected": -30.268291473388672, + "step": 3327 + }, + { + "epoch": 2.0702954898911354, + "grad_norm": 4.1853404075808953e-10, + "learning_rate": 1.7208390963577688e-06, + "logits/chosen": 1.2564480304718018, + "logits/rejected": 4.029335021972656, + "logps/chosen": -612.748291015625, + "logps/rejected": -1131.76025390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.627924919128418, + "rewards/margins": 35.778465270996094, + "rewards/rejected": -46.40639114379883, + "step": 3328 + }, + { + "epoch": 2.0709175738724728, + "grad_norm": 0.004606824833899736, + "learning_rate": 1.719686491470724e-06, + "logits/chosen": 1.6285595893859863, + "logits/rejected": 4.791367530822754, + "logps/chosen": -513.8687744140625, + "logps/rejected": -1024.88525390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.402234077453613, + "rewards/margins": 32.802608489990234, + "rewards/rejected": -41.20484161376953, + "step": 3329 + }, + { + "epoch": 2.07153965785381, + "grad_norm": 0.041494857519865036, + "learning_rate": 1.7185338865836792e-06, + "logits/chosen": -1.059615969657898, + "logits/rejected": 2.409153938293457, + "logps/chosen": -484.38726806640625, + "logps/rejected": -1028.5125732421875, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.8254194259643555, + "rewards/margins": 32.270713806152344, + "rewards/rejected": -39.09613037109375, + "step": 3330 + }, + { + "epoch": 2.072161741835148, + "grad_norm": 0.1615261733531952, + "learning_rate": 1.7173812816966344e-06, + "logits/chosen": -0.8603125214576721, + "logits/rejected": 3.1495981216430664, + "logps/chosen": -456.7115173339844, + "logps/rejected": -837.7604370117188, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.486283302307129, + "rewards/margins": 18.79230308532715, + "rewards/rejected": -27.27858543395996, + "step": 3331 + }, + { + "epoch": 2.0727838258164852, + "grad_norm": 1.1796037142630666e-05, + "learning_rate": 1.7162286768095899e-06, + "logits/chosen": 2.506577491760254, + "logits/rejected": 4.20957612991333, + "logps/chosen": -557.8585205078125, + "logps/rejected": -940.8349609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.77398681640625, + "rewards/margins": 27.37487030029297, + "rewards/rejected": -39.14885711669922, + "step": 3332 + }, + { + "epoch": 2.0734059097978226, + "grad_norm": 0.10842271149158478, + "learning_rate": 1.715076071922545e-06, + "logits/chosen": 1.1956841945648193, + "logits/rejected": 2.755904197692871, + "logps/chosen": -685.4227905273438, + "logps/rejected": -1112.8021240234375, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.265910148620605, + "rewards/margins": 30.79898452758789, + "rewards/rejected": -45.06489562988281, + "step": 3333 + }, + { + "epoch": 2.0740279937791604, + "grad_norm": 0.09202323108911514, + "learning_rate": 1.7139234670355003e-06, + "logits/chosen": 0.308301717042923, + "logits/rejected": 3.1039891242980957, + "logps/chosen": -546.5222778320312, + "logps/rejected": -1041.5968017578125, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.068633079528809, + "rewards/margins": 29.088891983032227, + "rewards/rejected": -41.15752410888672, + "step": 3334 + }, + { + "epoch": 2.0746500777604977, + "grad_norm": 1.3221635526861064e-05, + "learning_rate": 1.7127708621484558e-06, + "logits/chosen": 2.04056453704834, + "logits/rejected": 3.642124652862549, + "logps/chosen": -615.4324951171875, + "logps/rejected": -992.840087890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.023149490356445, + "rewards/margins": 27.139328002929688, + "rewards/rejected": -40.162479400634766, + "step": 3335 + }, + { + "epoch": 2.075272161741835, + "grad_norm": 0.001122101442888379, + "learning_rate": 1.711618257261411e-06, + "logits/chosen": 2.35383939743042, + "logits/rejected": 4.766437530517578, + "logps/chosen": -628.700439453125, + "logps/rejected": -1054.2471923828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.110316276550293, + "rewards/margins": 25.99763298034668, + "rewards/rejected": -37.107948303222656, + "step": 3336 + }, + { + "epoch": 2.075894245723173, + "grad_norm": 0.00026429022545926273, + "learning_rate": 1.7104656523743662e-06, + "logits/chosen": 0.6324070692062378, + "logits/rejected": 3.297117233276367, + "logps/chosen": -642.6302490234375, + "logps/rejected": -1133.9599609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.002674102783203, + "rewards/margins": 33.64592361450195, + "rewards/rejected": -44.648597717285156, + "step": 3337 + }, + { + "epoch": 2.07651632970451, + "grad_norm": 0.2311714142560959, + "learning_rate": 1.7093130474873214e-06, + "logits/chosen": 1.3173719644546509, + "logits/rejected": 4.45114803314209, + "logps/chosen": -575.9068603515625, + "logps/rejected": -1084.0186767578125, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.138073921203613, + "rewards/margins": 27.18564224243164, + "rewards/rejected": -39.32371520996094, + "step": 3338 + }, + { + "epoch": 2.0771384136858475, + "grad_norm": 1.0550995284575038e-05, + "learning_rate": 1.7081604426002769e-06, + "logits/chosen": -1.0990958213806152, + "logits/rejected": 1.1802140474319458, + "logps/chosen": -554.207275390625, + "logps/rejected": -947.847900390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.284149169921875, + "rewards/margins": 32.44597244262695, + "rewards/rejected": -44.730125427246094, + "step": 3339 + }, + { + "epoch": 2.0777604976671853, + "grad_norm": 0.31715869903564453, + "learning_rate": 1.707007837713232e-06, + "logits/chosen": 0.9920371770858765, + "logits/rejected": 4.3308634757995605, + "logps/chosen": -586.5159912109375, + "logps/rejected": -1092.0472412109375, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.493955612182617, + "rewards/margins": 28.599620819091797, + "rewards/rejected": -40.09357452392578, + "step": 3340 + }, + { + "epoch": 2.0783825816485226, + "grad_norm": 0.0001579702802700922, + "learning_rate": 1.7058552328261873e-06, + "logits/chosen": 1.2911337614059448, + "logits/rejected": 3.543045997619629, + "logps/chosen": -585.599609375, + "logps/rejected": -1032.3271484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.259281158447266, + "rewards/margins": 31.118511199951172, + "rewards/rejected": -40.37779235839844, + "step": 3341 + }, + { + "epoch": 2.07900466562986, + "grad_norm": 7.858145245620562e-09, + "learning_rate": 1.7047026279391426e-06, + "logits/chosen": -0.6377176642417908, + "logits/rejected": 2.4123430252075195, + "logps/chosen": -489.8824768066406, + "logps/rejected": -1049.6268310546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.102771759033203, + "rewards/margins": 35.268489837646484, + "rewards/rejected": -43.37126159667969, + "step": 3342 + }, + { + "epoch": 2.0796267496111973, + "grad_norm": 8.685908881034266e-09, + "learning_rate": 1.703550023052098e-06, + "logits/chosen": 0.5078180432319641, + "logits/rejected": 3.226416826248169, + "logps/chosen": -583.5855102539062, + "logps/rejected": -1118.15673828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.772845268249512, + "rewards/margins": 36.71405029296875, + "rewards/rejected": -44.48689270019531, + "step": 3343 + }, + { + "epoch": 2.080248833592535, + "grad_norm": 0.0039777737110853195, + "learning_rate": 1.7023974181650532e-06, + "logits/chosen": 0.7485368251800537, + "logits/rejected": 3.2258193492889404, + "logps/chosen": -613.4325561523438, + "logps/rejected": -1030.727783203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.436695098876953, + "rewards/margins": 24.86806869506836, + "rewards/rejected": -36.30476379394531, + "step": 3344 + }, + { + "epoch": 2.0808709175738724, + "grad_norm": 0.1329488307237625, + "learning_rate": 1.7012448132780084e-06, + "logits/chosen": 0.3994715213775635, + "logits/rejected": 1.6882116794586182, + "logps/chosen": -573.140869140625, + "logps/rejected": -1020.3795166015625, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.334638595581055, + "rewards/margins": 33.32188415527344, + "rewards/rejected": -42.656524658203125, + "step": 3345 + }, + { + "epoch": 2.0814930015552098, + "grad_norm": 4.0168673876905814e-05, + "learning_rate": 1.7000922083909639e-06, + "logits/chosen": -1.8198471069335938, + "logits/rejected": 2.12861967086792, + "logps/chosen": -425.8285827636719, + "logps/rejected": -976.2532958984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.025663375854492, + "rewards/margins": 29.32131004333496, + "rewards/rejected": -39.34697723388672, + "step": 3346 + }, + { + "epoch": 2.0821150855365476, + "grad_norm": 0.0009763347334228456, + "learning_rate": 1.698939603503919e-06, + "logits/chosen": -0.5600918531417847, + "logits/rejected": 3.825688600540161, + "logps/chosen": -500.5871276855469, + "logps/rejected": -1128.254638671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.031661987304688, + "rewards/margins": 36.95503234863281, + "rewards/rejected": -48.9866943359375, + "step": 3347 + }, + { + "epoch": 2.082737169517885, + "grad_norm": 5.5919431360962335e-06, + "learning_rate": 1.6977869986168743e-06, + "logits/chosen": 1.264890193939209, + "logits/rejected": 4.474057674407959, + "logps/chosen": -614.8546142578125, + "logps/rejected": -1293.04052734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.77999496459961, + "rewards/margins": 41.97685241699219, + "rewards/rejected": -56.7568473815918, + "step": 3348 + }, + { + "epoch": 2.0833592534992222, + "grad_norm": 0.012092916294932365, + "learning_rate": 1.6966343937298295e-06, + "logits/chosen": 1.270735263824463, + "logits/rejected": 2.718186855316162, + "logps/chosen": -504.2120666503906, + "logps/rejected": -853.6555786132812, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.743149757385254, + "rewards/margins": 22.66098403930664, + "rewards/rejected": -29.404136657714844, + "step": 3349 + }, + { + "epoch": 2.08398133748056, + "grad_norm": 27.2022705078125, + "learning_rate": 1.695481788842785e-06, + "logits/chosen": -0.6622167825698853, + "logits/rejected": 3.0031518936157227, + "logps/chosen": -539.7035522460938, + "logps/rejected": -962.2923583984375, + "loss": 0.181, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.674617767333984, + "rewards/margins": 25.50714111328125, + "rewards/rejected": -37.181758880615234, + "step": 3350 + }, + { + "epoch": 2.0846034214618974, + "grad_norm": 0.1362578123807907, + "learning_rate": 1.6943291839557402e-06, + "logits/chosen": 0.1251983642578125, + "logits/rejected": 1.8397150039672852, + "logps/chosen": -611.310791015625, + "logps/rejected": -951.3914794921875, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.344130516052246, + "rewards/margins": 19.910261154174805, + "rewards/rejected": -31.254390716552734, + "step": 3351 + }, + { + "epoch": 2.0852255054432347, + "grad_norm": 0.47021928429603577, + "learning_rate": 1.6931765790686954e-06, + "logits/chosen": -0.5348381996154785, + "logits/rejected": 5.024543285369873, + "logps/chosen": -355.5381164550781, + "logps/rejected": -895.0904541015625, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.548956394195557, + "rewards/margins": 22.768566131591797, + "rewards/rejected": -28.317522048950195, + "step": 3352 + }, + { + "epoch": 2.0858475894245725, + "grad_norm": 0.0482005700469017, + "learning_rate": 1.6920239741816507e-06, + "logits/chosen": -1.475197672843933, + "logits/rejected": 1.649541974067688, + "logps/chosen": -457.4234619140625, + "logps/rejected": -933.4058837890625, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.065631866455078, + "rewards/margins": 29.441650390625, + "rewards/rejected": -42.50728225708008, + "step": 3353 + }, + { + "epoch": 2.08646967340591, + "grad_norm": 0.024374065920710564, + "learning_rate": 1.690871369294606e-06, + "logits/chosen": 0.28265321254730225, + "logits/rejected": 3.3512191772460938, + "logps/chosen": -564.5770263671875, + "logps/rejected": -1192.9810791015625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.895007133483887, + "rewards/margins": 35.95161819458008, + "rewards/rejected": -42.846622467041016, + "step": 3354 + }, + { + "epoch": 2.087091757387247, + "grad_norm": 0.04075371474027634, + "learning_rate": 1.6897187644075613e-06, + "logits/chosen": 2.054584503173828, + "logits/rejected": 3.301548480987549, + "logps/chosen": -597.0030517578125, + "logps/rejected": -926.091552734375, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.193412780761719, + "rewards/margins": 23.50974464416504, + "rewards/rejected": -32.703155517578125, + "step": 3355 + }, + { + "epoch": 2.087713841368585, + "grad_norm": 0.08765062689781189, + "learning_rate": 1.6885661595205165e-06, + "logits/chosen": 0.8265056610107422, + "logits/rejected": 2.3820858001708984, + "logps/chosen": -497.92254638671875, + "logps/rejected": -863.9046020507812, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.941751480102539, + "rewards/margins": 28.759737014770508, + "rewards/rejected": -36.70149230957031, + "step": 3356 + }, + { + "epoch": 2.0883359253499223, + "grad_norm": 6.599282187380595e-06, + "learning_rate": 1.687413554633472e-06, + "logits/chosen": -1.7757030725479126, + "logits/rejected": 3.9701287746429443, + "logps/chosen": -296.7178649902344, + "logps/rejected": -905.2001953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.456328868865967, + "rewards/margins": 29.916818618774414, + "rewards/rejected": -34.373146057128906, + "step": 3357 + }, + { + "epoch": 2.0889580093312596, + "grad_norm": 0.0847984030842781, + "learning_rate": 1.6862609497464272e-06, + "logits/chosen": -0.9706917405128479, + "logits/rejected": 3.0828208923339844, + "logps/chosen": -487.76605224609375, + "logps/rejected": -959.8536987304688, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.820488452911377, + "rewards/margins": 30.454444885253906, + "rewards/rejected": -36.274932861328125, + "step": 3358 + }, + { + "epoch": 2.0895800933125974, + "grad_norm": 3.220686994609423e-05, + "learning_rate": 1.6851083448593824e-06, + "logits/chosen": 0.18418824672698975, + "logits/rejected": 3.0592291355133057, + "logps/chosen": -435.98345947265625, + "logps/rejected": -890.0090942382812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.9094061851501465, + "rewards/margins": 27.20081329345703, + "rewards/rejected": -34.110225677490234, + "step": 3359 + }, + { + "epoch": 2.0902021772939348, + "grad_norm": 3.048164742835979e-08, + "learning_rate": 1.6839557399723377e-06, + "logits/chosen": -2.1238396167755127, + "logits/rejected": 1.927706241607666, + "logps/chosen": -496.24481201171875, + "logps/rejected": -1024.53955078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.999995231628418, + "rewards/margins": 35.04389953613281, + "rewards/rejected": -43.04389953613281, + "step": 3360 + }, + { + "epoch": 2.090824261275272, + "grad_norm": 0.4297603964805603, + "learning_rate": 1.682803135085293e-06, + "logits/chosen": 1.2899434566497803, + "logits/rejected": 1.664573073387146, + "logps/chosen": -559.8187255859375, + "logps/rejected": -939.0767822265625, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.20655632019043, + "rewards/margins": 28.083295822143555, + "rewards/rejected": -35.289852142333984, + "step": 3361 + }, + { + "epoch": 2.0914463452566094, + "grad_norm": 2.5927631668309914e-06, + "learning_rate": 1.6816505301982483e-06, + "logits/chosen": -0.8606783151626587, + "logits/rejected": 1.8166348934173584, + "logps/chosen": -439.0218200683594, + "logps/rejected": -962.8194580078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.483155250549316, + "rewards/margins": 36.84089660644531, + "rewards/rejected": -44.32405090332031, + "step": 3362 + }, + { + "epoch": 2.0920684292379472, + "grad_norm": 0.004372260067611933, + "learning_rate": 1.6804979253112035e-06, + "logits/chosen": -1.8572988510131836, + "logits/rejected": 2.5752906799316406, + "logps/chosen": -525.1903076171875, + "logps/rejected": -1165.911865234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.921345710754395, + "rewards/margins": 36.948970794677734, + "rewards/rejected": -45.87031936645508, + "step": 3363 + }, + { + "epoch": 2.0926905132192846, + "grad_norm": 0.0018370038596913218, + "learning_rate": 1.6793453204241586e-06, + "logits/chosen": 0.052701711654663086, + "logits/rejected": 2.7828869819641113, + "logps/chosen": -501.08148193359375, + "logps/rejected": -904.0264892578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.115810394287109, + "rewards/margins": 22.015249252319336, + "rewards/rejected": -29.131057739257812, + "step": 3364 + }, + { + "epoch": 2.093312597200622, + "grad_norm": 7.0257151492114644e-06, + "learning_rate": 1.6781927155371138e-06, + "logits/chosen": 1.1880685091018677, + "logits/rejected": 2.866147756576538, + "logps/chosen": -572.2569580078125, + "logps/rejected": -942.1068115234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.220047950744629, + "rewards/margins": 26.508953094482422, + "rewards/rejected": -39.729000091552734, + "step": 3365 + }, + { + "epoch": 2.0939346811819597, + "grad_norm": 0.17728877067565918, + "learning_rate": 1.6770401106500692e-06, + "logits/chosen": 1.9238035678863525, + "logits/rejected": 4.696476936340332, + "logps/chosen": -634.623779296875, + "logps/rejected": -1072.55908203125, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.632692337036133, + "rewards/margins": 28.64840316772461, + "rewards/rejected": -40.281097412109375, + "step": 3366 + }, + { + "epoch": 2.094556765163297, + "grad_norm": 0.02830907702445984, + "learning_rate": 1.6758875057630244e-06, + "logits/chosen": -2.150949001312256, + "logits/rejected": 2.483363628387451, + "logps/chosen": -298.2734069824219, + "logps/rejected": -914.7220458984375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8420822620391846, + "rewards/margins": 32.87299346923828, + "rewards/rejected": -36.71507263183594, + "step": 3367 + }, + { + "epoch": 2.0951788491446344, + "grad_norm": 0.008550606667995453, + "learning_rate": 1.6747349008759797e-06, + "logits/chosen": -0.103985995054245, + "logits/rejected": 2.6301164627075195, + "logps/chosen": -555.9187622070312, + "logps/rejected": -1000.498291015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.633752822875977, + "rewards/margins": 32.593162536621094, + "rewards/rejected": -39.2269172668457, + "step": 3368 + }, + { + "epoch": 2.095800933125972, + "grad_norm": 2.6436568077770062e-05, + "learning_rate": 1.673582295988935e-06, + "logits/chosen": -0.7505025863647461, + "logits/rejected": 3.1368703842163086, + "logps/chosen": -460.7519226074219, + "logps/rejected": -933.426513671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.978192329406738, + "rewards/margins": 23.834163665771484, + "rewards/rejected": -32.812355041503906, + "step": 3369 + }, + { + "epoch": 2.0964230171073095, + "grad_norm": 4.5708739015992705e-09, + "learning_rate": 1.6724296911018903e-06, + "logits/chosen": -0.32886290550231934, + "logits/rejected": 4.3517279624938965, + "logps/chosen": -455.4180908203125, + "logps/rejected": -1061.9700927734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.620471000671387, + "rewards/margins": 33.597320556640625, + "rewards/rejected": -41.21778869628906, + "step": 3370 + }, + { + "epoch": 2.097045101088647, + "grad_norm": 0.005900989286601543, + "learning_rate": 1.6712770862148456e-06, + "logits/chosen": -0.8842804431915283, + "logits/rejected": 2.112257957458496, + "logps/chosen": -372.7100524902344, + "logps/rejected": -772.2435913085938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.723258018493652, + "rewards/margins": 25.396638870239258, + "rewards/rejected": -32.119895935058594, + "step": 3371 + }, + { + "epoch": 2.0976671850699846, + "grad_norm": 0.00877196155488491, + "learning_rate": 1.6701244813278008e-06, + "logits/chosen": 3.7252695560455322, + "logits/rejected": 4.46113395690918, + "logps/chosen": -819.5318603515625, + "logps/rejected": -1156.776123046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.046567916870117, + "rewards/margins": 26.81656265258789, + "rewards/rejected": -41.863128662109375, + "step": 3372 + }, + { + "epoch": 2.098289269051322, + "grad_norm": 0.02428249642252922, + "learning_rate": 1.6689718764407562e-06, + "logits/chosen": -1.1189624071121216, + "logits/rejected": 1.4620922803878784, + "logps/chosen": -482.97113037109375, + "logps/rejected": -990.8453979492188, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.332776069641113, + "rewards/margins": 33.3018798828125, + "rewards/rejected": -46.63465881347656, + "step": 3373 + }, + { + "epoch": 2.0989113530326593, + "grad_norm": 0.001983237685635686, + "learning_rate": 1.6678192715537114e-06, + "logits/chosen": 0.7820011973381042, + "logits/rejected": 4.40946626663208, + "logps/chosen": -550.2945556640625, + "logps/rejected": -1139.11083984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.764097213745117, + "rewards/margins": 37.623233795166016, + "rewards/rejected": -50.3873291015625, + "step": 3374 + }, + { + "epoch": 2.099533437013997, + "grad_norm": 2.0523664545635256e-07, + "learning_rate": 1.6666666666666667e-06, + "logits/chosen": -0.4806283414363861, + "logits/rejected": 4.106095790863037, + "logps/chosen": -491.4980163574219, + "logps/rejected": -1238.3123779296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.256150245666504, + "rewards/margins": 43.39604568481445, + "rewards/rejected": -52.65219497680664, + "step": 3375 + }, + { + "epoch": 2.1001555209953344, + "grad_norm": 0.03712281957268715, + "learning_rate": 1.665514061779622e-06, + "logits/chosen": -0.9505767822265625, + "logits/rejected": 1.9326263666152954, + "logps/chosen": -391.64251708984375, + "logps/rejected": -812.211181640625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.85351037979126, + "rewards/margins": 31.238975524902344, + "rewards/rejected": -36.09248733520508, + "step": 3376 + }, + { + "epoch": 2.1007776049766718, + "grad_norm": 15.193599700927734, + "learning_rate": 1.6643614568925773e-06, + "logits/chosen": 0.22194261848926544, + "logits/rejected": 4.298823833465576, + "logps/chosen": -400.57037353515625, + "logps/rejected": -1007.1203002929688, + "loss": 0.0787, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.647911548614502, + "rewards/margins": 27.597047805786133, + "rewards/rejected": -33.244956970214844, + "step": 3377 + }, + { + "epoch": 2.1013996889580095, + "grad_norm": 0.00013249566836748272, + "learning_rate": 1.6632088520055325e-06, + "logits/chosen": -0.22353875637054443, + "logits/rejected": 3.013558864593506, + "logps/chosen": -383.1407775878906, + "logps/rejected": -873.5018310546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.963991165161133, + "rewards/margins": 32.24616241455078, + "rewards/rejected": -39.21015167236328, + "step": 3378 + }, + { + "epoch": 2.102021772939347, + "grad_norm": 0.010166036896407604, + "learning_rate": 1.6620562471184878e-06, + "logits/chosen": 2.5323030948638916, + "logits/rejected": 3.62776517868042, + "logps/chosen": -519.2050170898438, + "logps/rejected": -897.3560180664062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.967037200927734, + "rewards/margins": 29.247364044189453, + "rewards/rejected": -39.21440124511719, + "step": 3379 + }, + { + "epoch": 2.1026438569206842, + "grad_norm": 0.010199688374996185, + "learning_rate": 1.6609036422314432e-06, + "logits/chosen": -0.9682947397232056, + "logits/rejected": 2.1650946140289307, + "logps/chosen": -393.393798828125, + "logps/rejected": -769.2060546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.042436599731445, + "rewards/margins": 21.88577651977539, + "rewards/rejected": -30.928213119506836, + "step": 3380 + }, + { + "epoch": 2.1032659409020216, + "grad_norm": 3.648558731583762e-06, + "learning_rate": 1.6597510373443984e-06, + "logits/chosen": -1.0206509828567505, + "logits/rejected": 1.8315527439117432, + "logps/chosen": -546.6287231445312, + "logps/rejected": -1130.8519287109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.928306579589844, + "rewards/margins": 32.370277404785156, + "rewards/rejected": -44.298583984375, + "step": 3381 + }, + { + "epoch": 2.1038880248833594, + "grad_norm": 7.011471439000161e-07, + "learning_rate": 1.6585984324573537e-06, + "logits/chosen": 0.48149287700653076, + "logits/rejected": 3.7991549968719482, + "logps/chosen": -459.12738037109375, + "logps/rejected": -922.0175170898438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.526046752929688, + "rewards/margins": 27.56385040283203, + "rewards/rejected": -39.08989715576172, + "step": 3382 + }, + { + "epoch": 2.1045101088646967, + "grad_norm": 0.5247215628623962, + "learning_rate": 1.6574458275703089e-06, + "logits/chosen": 2.620265245437622, + "logits/rejected": 3.077489137649536, + "logps/chosen": -687.5137329101562, + "logps/rejected": -940.1102905273438, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.058935165405273, + "rewards/margins": 25.865474700927734, + "rewards/rejected": -36.924407958984375, + "step": 3383 + }, + { + "epoch": 2.105132192846034, + "grad_norm": 0.48539650440216064, + "learning_rate": 1.6562932226832643e-06, + "logits/chosen": -0.010341644287109375, + "logits/rejected": 0.931721031665802, + "logps/chosen": -578.7786254882812, + "logps/rejected": -866.927734375, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.459258079528809, + "rewards/margins": 21.96541976928711, + "rewards/rejected": -32.42467498779297, + "step": 3384 + }, + { + "epoch": 2.105754276827372, + "grad_norm": 0.00542947044596076, + "learning_rate": 1.6551406177962195e-06, + "logits/chosen": 1.2294321060180664, + "logits/rejected": 3.682133436203003, + "logps/chosen": -672.1929931640625, + "logps/rejected": -1026.928955078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.094335556030273, + "rewards/margins": 23.090843200683594, + "rewards/rejected": -36.1851806640625, + "step": 3385 + }, + { + "epoch": 2.106376360808709, + "grad_norm": 0.00023597065592184663, + "learning_rate": 1.6539880129091748e-06, + "logits/chosen": -0.3777759075164795, + "logits/rejected": 2.5276408195495605, + "logps/chosen": -501.2518310546875, + "logps/rejected": -1042.737060546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.490774154663086, + "rewards/margins": 31.137483596801758, + "rewards/rejected": -42.628257751464844, + "step": 3386 + }, + { + "epoch": 2.1069984447900465, + "grad_norm": 5.728524411097169e-05, + "learning_rate": 1.6528354080221302e-06, + "logits/chosen": -1.2409263849258423, + "logits/rejected": 4.19053316116333, + "logps/chosen": -437.630126953125, + "logps/rejected": -1081.5054931640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.32150650024414, + "rewards/margins": 29.635597229003906, + "rewards/rejected": -39.95710372924805, + "step": 3387 + }, + { + "epoch": 2.1076205287713843, + "grad_norm": 9.662021511758212e-07, + "learning_rate": 1.6516828031350854e-06, + "logits/chosen": -0.9957299828529358, + "logits/rejected": 4.041959285736084, + "logps/chosen": -295.1841125488281, + "logps/rejected": -854.8515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7770609855651855, + "rewards/margins": 31.328670501708984, + "rewards/rejected": -35.10573196411133, + "step": 3388 + }, + { + "epoch": 2.1082426127527216, + "grad_norm": 0.015900224447250366, + "learning_rate": 1.6505301982480407e-06, + "logits/chosen": 0.956409752368927, + "logits/rejected": 2.972102642059326, + "logps/chosen": -722.85546875, + "logps/rejected": -1107.1051025390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.309172630310059, + "rewards/margins": 29.619922637939453, + "rewards/rejected": -43.92909240722656, + "step": 3389 + }, + { + "epoch": 2.108864696734059, + "grad_norm": 9.378606796264648, + "learning_rate": 1.6493775933609959e-06, + "logits/chosen": -0.19955027103424072, + "logits/rejected": 2.5805864334106445, + "logps/chosen": -541.113525390625, + "logps/rejected": -942.9413452148438, + "loss": 0.0381, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.94508695602417, + "rewards/margins": 25.10898208618164, + "rewards/rejected": -33.05406951904297, + "step": 3390 + }, + { + "epoch": 2.1094867807153967, + "grad_norm": 20.579364776611328, + "learning_rate": 1.6482249884739513e-06, + "logits/chosen": 0.4862733781337738, + "logits/rejected": 2.504495859146118, + "logps/chosen": -518.5618896484375, + "logps/rejected": -958.5907592773438, + "loss": 0.6058, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.772661209106445, + "rewards/margins": 31.137737274169922, + "rewards/rejected": -42.910400390625, + "step": 3391 + }, + { + "epoch": 2.110108864696734, + "grad_norm": 0.0027981658931821585, + "learning_rate": 1.6470723835869065e-06, + "logits/chosen": 0.07086589932441711, + "logits/rejected": 2.659494161605835, + "logps/chosen": -546.3900756835938, + "logps/rejected": -847.4092407226562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.78062629699707, + "rewards/margins": 18.225074768066406, + "rewards/rejected": -29.005701065063477, + "step": 3392 + }, + { + "epoch": 2.1107309486780714, + "grad_norm": 0.0020019779913127422, + "learning_rate": 1.6459197786998618e-06, + "logits/chosen": 0.425553560256958, + "logits/rejected": 3.445070266723633, + "logps/chosen": -598.8272705078125, + "logps/rejected": -1064.2998046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.582368850708008, + "rewards/margins": 30.0719051361084, + "rewards/rejected": -41.654273986816406, + "step": 3393 + }, + { + "epoch": 2.111353032659409, + "grad_norm": 10.503533363342285, + "learning_rate": 1.644767173812817e-06, + "logits/chosen": 0.857345461845398, + "logits/rejected": 3.003244638442993, + "logps/chosen": -442.4684753417969, + "logps/rejected": -779.4535522460938, + "loss": 0.0462, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.216536521911621, + "rewards/margins": 25.76353645324707, + "rewards/rejected": -30.980072021484375, + "step": 3394 + }, + { + "epoch": 2.1119751166407466, + "grad_norm": 0.1677616685628891, + "learning_rate": 1.6436145689257724e-06, + "logits/chosen": -0.8002194166183472, + "logits/rejected": 3.6724236011505127, + "logps/chosen": -200.62562561035156, + "logps/rejected": -812.24609375, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.35098123550415, + "rewards/margins": 32.549068450927734, + "rewards/rejected": -36.90005111694336, + "step": 3395 + }, + { + "epoch": 2.112597200622084, + "grad_norm": 1.996715582208708e-06, + "learning_rate": 1.6424619640387277e-06, + "logits/chosen": 0.9634472131729126, + "logits/rejected": 3.9523916244506836, + "logps/chosen": -513.762451171875, + "logps/rejected": -1041.6044921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.443663597106934, + "rewards/margins": 33.710235595703125, + "rewards/rejected": -43.153900146484375, + "step": 3396 + }, + { + "epoch": 2.1132192846034217, + "grad_norm": 0.031665410846471786, + "learning_rate": 1.6413093591516829e-06, + "logits/chosen": 0.4123764634132385, + "logits/rejected": 2.6448452472686768, + "logps/chosen": -600.843505859375, + "logps/rejected": -983.0840454101562, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.967991828918457, + "rewards/margins": 24.33465576171875, + "rewards/rejected": -33.30264663696289, + "step": 3397 + }, + { + "epoch": 2.113841368584759, + "grad_norm": 0.022291993722319603, + "learning_rate": 1.6401567542646383e-06, + "logits/chosen": -0.33354711532592773, + "logits/rejected": 3.285107135772705, + "logps/chosen": -541.2667236328125, + "logps/rejected": -1024.5386962890625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.661428451538086, + "rewards/margins": 31.472816467285156, + "rewards/rejected": -43.134246826171875, + "step": 3398 + }, + { + "epoch": 2.1144634525660964, + "grad_norm": 2.6611015796661377, + "learning_rate": 1.6390041493775935e-06, + "logits/chosen": -0.07903802394866943, + "logits/rejected": 2.2965927124023438, + "logps/chosen": -485.8436279296875, + "logps/rejected": -1028.42138671875, + "loss": 0.0102, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.215842247009277, + "rewards/margins": 34.56201934814453, + "rewards/rejected": -42.777862548828125, + "step": 3399 + }, + { + "epoch": 2.1150855365474337, + "grad_norm": 3.3976443774008658e-06, + "learning_rate": 1.6378515444905488e-06, + "logits/chosen": 0.8789270520210266, + "logits/rejected": 2.5680906772613525, + "logps/chosen": -475.8021240234375, + "logps/rejected": -883.010498046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.538758754730225, + "rewards/margins": 30.40880584716797, + "rewards/rejected": -37.947566986083984, + "step": 3400 + }, + { + "epoch": 2.1157076205287715, + "grad_norm": 1.922818410093896e-05, + "learning_rate": 1.636698939603504e-06, + "logits/chosen": -0.7577810883522034, + "logits/rejected": 3.907153844833374, + "logps/chosen": -377.5826721191406, + "logps/rejected": -1134.6827392578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.530715942382812, + "rewards/margins": 43.018455505371094, + "rewards/rejected": -54.549171447753906, + "step": 3401 + }, + { + "epoch": 2.116329704510109, + "grad_norm": 9.223941802978516, + "learning_rate": 1.6355463347164594e-06, + "logits/chosen": -1.7244789600372314, + "logits/rejected": 2.035127878189087, + "logps/chosen": -496.82421875, + "logps/rejected": -924.963134765625, + "loss": 0.0393, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.925373077392578, + "rewards/margins": 29.017684936523438, + "rewards/rejected": -38.943058013916016, + "step": 3402 + }, + { + "epoch": 2.116951788491446, + "grad_norm": 0.3632352352142334, + "learning_rate": 1.6343937298294146e-06, + "logits/chosen": 0.20769548416137695, + "logits/rejected": 2.323042154312134, + "logps/chosen": -590.65380859375, + "logps/rejected": -1021.50634765625, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.967580795288086, + "rewards/margins": 28.152793884277344, + "rewards/rejected": -36.12037658691406, + "step": 3403 + }, + { + "epoch": 2.117573872472784, + "grad_norm": 0.0003732674231287092, + "learning_rate": 1.6332411249423699e-06, + "logits/chosen": -0.18807101249694824, + "logits/rejected": 3.297891616821289, + "logps/chosen": -446.2677001953125, + "logps/rejected": -963.2024536132812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.80330753326416, + "rewards/margins": 34.15058135986328, + "rewards/rejected": -40.953887939453125, + "step": 3404 + }, + { + "epoch": 2.1181959564541213, + "grad_norm": 0.023219313472509384, + "learning_rate": 1.632088520055325e-06, + "logits/chosen": 1.1393150091171265, + "logits/rejected": 3.2419910430908203, + "logps/chosen": -510.16485595703125, + "logps/rejected": -958.189453125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.300631046295166, + "rewards/margins": 33.69567108154297, + "rewards/rejected": -39.99630355834961, + "step": 3405 + }, + { + "epoch": 2.1188180404354586, + "grad_norm": 0.09577471017837524, + "learning_rate": 1.6309359151682805e-06, + "logits/chosen": -0.7650696635246277, + "logits/rejected": 2.7050087451934814, + "logps/chosen": -434.15020751953125, + "logps/rejected": -878.897216796875, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.322816848754883, + "rewards/margins": 26.0329647064209, + "rewards/rejected": -31.35578155517578, + "step": 3406 + }, + { + "epoch": 2.1194401244167964, + "grad_norm": 0.00028814279357902706, + "learning_rate": 1.6297833102812358e-06, + "logits/chosen": 0.2786529064178467, + "logits/rejected": 3.1810197830200195, + "logps/chosen": -590.8251953125, + "logps/rejected": -1019.7881469726562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.678223609924316, + "rewards/margins": 23.088512420654297, + "rewards/rejected": -36.76673889160156, + "step": 3407 + }, + { + "epoch": 2.1200622083981338, + "grad_norm": 0.001266329549252987, + "learning_rate": 1.628630705394191e-06, + "logits/chosen": 0.22288648784160614, + "logits/rejected": 3.409573554992676, + "logps/chosen": -519.0152587890625, + "logps/rejected": -972.437744140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.787503719329834, + "rewards/margins": 30.214773178100586, + "rewards/rejected": -37.00227737426758, + "step": 3408 + }, + { + "epoch": 2.120684292379471, + "grad_norm": 0.003725613933056593, + "learning_rate": 1.6274781005071464e-06, + "logits/chosen": -0.06311874091625214, + "logits/rejected": 2.947871446609497, + "logps/chosen": -577.43115234375, + "logps/rejected": -1169.604736328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.592594146728516, + "rewards/margins": 36.942161560058594, + "rewards/rejected": -45.534751892089844, + "step": 3409 + }, + { + "epoch": 2.121306376360809, + "grad_norm": 0.0010662629501894116, + "learning_rate": 1.6263254956201016e-06, + "logits/chosen": 0.02673649787902832, + "logits/rejected": 1.3852254152297974, + "logps/chosen": -534.2376708984375, + "logps/rejected": -950.355712890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.195487976074219, + "rewards/margins": 28.430917739868164, + "rewards/rejected": -40.62640380859375, + "step": 3410 + }, + { + "epoch": 2.121928460342146, + "grad_norm": 0.11500487476587296, + "learning_rate": 1.6251728907330569e-06, + "logits/chosen": -1.8423373699188232, + "logits/rejected": 1.2112541198730469, + "logps/chosen": -461.163818359375, + "logps/rejected": -946.5360107421875, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.386780261993408, + "rewards/margins": 32.830833435058594, + "rewards/rejected": -40.217613220214844, + "step": 3411 + }, + { + "epoch": 2.1225505443234836, + "grad_norm": 12.089064598083496, + "learning_rate": 1.624020285846012e-06, + "logits/chosen": 2.3610942363739014, + "logits/rejected": 4.344902992248535, + "logps/chosen": -795.449951171875, + "logps/rejected": -1055.771484375, + "loss": 0.052, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.550273418426514, + "rewards/margins": 17.250858306884766, + "rewards/rejected": -23.801132202148438, + "step": 3412 + }, + { + "epoch": 2.1231726283048213, + "grad_norm": 0.4581535756587982, + "learning_rate": 1.6228676809589675e-06, + "logits/chosen": -0.11082451045513153, + "logits/rejected": 2.212818145751953, + "logps/chosen": -644.3795166015625, + "logps/rejected": -961.1854248046875, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.852404594421387, + "rewards/margins": 21.750226974487305, + "rewards/rejected": -33.602630615234375, + "step": 3413 + }, + { + "epoch": 2.1237947122861587, + "grad_norm": 5.110677193442825e-06, + "learning_rate": 1.6217150760719228e-06, + "logits/chosen": -0.10364311933517456, + "logits/rejected": 3.4550118446350098, + "logps/chosen": -476.4292907714844, + "logps/rejected": -1043.4764404296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.02131462097168, + "rewards/margins": 33.4831428527832, + "rewards/rejected": -40.50445556640625, + "step": 3414 + }, + { + "epoch": 2.124416796267496, + "grad_norm": 1.0429807240086575e-08, + "learning_rate": 1.620562471184878e-06, + "logits/chosen": 0.379156231880188, + "logits/rejected": 3.817188262939453, + "logps/chosen": -701.6832885742188, + "logps/rejected": -1382.80029296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.1474609375, + "rewards/margins": 36.24828338623047, + "rewards/rejected": -48.395748138427734, + "step": 3415 + }, + { + "epoch": 2.125038880248834, + "grad_norm": 3.745609262750804e-07, + "learning_rate": 1.6194098662978332e-06, + "logits/chosen": -0.8152501583099365, + "logits/rejected": 2.818958044052124, + "logps/chosen": -384.119384765625, + "logps/rejected": -905.9598999023438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.373373031616211, + "rewards/margins": 32.04167175292969, + "rewards/rejected": -38.415042877197266, + "step": 3416 + }, + { + "epoch": 2.125660964230171, + "grad_norm": 0.004811655264347792, + "learning_rate": 1.6182572614107886e-06, + "logits/chosen": -2.818270444869995, + "logits/rejected": 4.097588062286377, + "logps/chosen": -214.08514404296875, + "logps/rejected": -970.8871459960938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.288216590881348, + "rewards/margins": 32.85670852661133, + "rewards/rejected": -37.14492416381836, + "step": 3417 + }, + { + "epoch": 2.1262830482115085, + "grad_norm": 3.656461715698242, + "learning_rate": 1.6171046565237439e-06, + "logits/chosen": -0.7900360226631165, + "logits/rejected": 4.9393696784973145, + "logps/chosen": -380.90673828125, + "logps/rejected": -1163.0694580078125, + "loss": 0.0311, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9158077239990234, + "rewards/margins": 39.61353302001953, + "rewards/rejected": -42.52934265136719, + "step": 3418 + }, + { + "epoch": 2.126905132192846, + "grad_norm": 0.00022827822249382734, + "learning_rate": 1.615952051636699e-06, + "logits/chosen": -0.14041665196418762, + "logits/rejected": 0.7800382971763611, + "logps/chosen": -509.66851806640625, + "logps/rejected": -845.5088500976562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.2577290534973145, + "rewards/margins": 25.382957458496094, + "rewards/rejected": -32.64068603515625, + "step": 3419 + }, + { + "epoch": 2.1275272161741836, + "grad_norm": 0.004320131614804268, + "learning_rate": 1.6147994467496545e-06, + "logits/chosen": -0.5010417699813843, + "logits/rejected": 3.5509495735168457, + "logps/chosen": -485.992431640625, + "logps/rejected": -963.5689697265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.654045104980469, + "rewards/margins": 28.273672103881836, + "rewards/rejected": -36.92771530151367, + "step": 3420 + }, + { + "epoch": 2.128149300155521, + "grad_norm": 1.9214366986375353e-08, + "learning_rate": 1.6136468418626098e-06, + "logits/chosen": -1.4153292179107666, + "logits/rejected": 2.755247116088867, + "logps/chosen": -371.322998046875, + "logps/rejected": -966.787841796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.08968448638916, + "rewards/margins": 36.248077392578125, + "rewards/rejected": -41.337764739990234, + "step": 3421 + }, + { + "epoch": 2.1287713841368583, + "grad_norm": 6.000399288552671e-09, + "learning_rate": 1.612494236975565e-06, + "logits/chosen": -2.731515407562256, + "logits/rejected": 2.4720258712768555, + "logps/chosen": -431.3476867675781, + "logps/rejected": -1083.6611328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.9705119132995605, + "rewards/margins": 29.507789611816406, + "rewards/rejected": -37.478302001953125, + "step": 3422 + }, + { + "epoch": 2.129393468118196, + "grad_norm": 0.264360249042511, + "learning_rate": 1.6113416320885202e-06, + "logits/chosen": -0.4833368957042694, + "logits/rejected": 2.7763330936431885, + "logps/chosen": -526.0587768554688, + "logps/rejected": -1065.353271484375, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.281364440917969, + "rewards/margins": 33.10901641845703, + "rewards/rejected": -44.390380859375, + "step": 3423 + }, + { + "epoch": 2.1300155520995334, + "grad_norm": 23.688011169433594, + "learning_rate": 1.6101890272014756e-06, + "logits/chosen": -0.3603689968585968, + "logits/rejected": 3.46712064743042, + "logps/chosen": -398.64654541015625, + "logps/rejected": -885.201904296875, + "loss": 0.1506, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.763185501098633, + "rewards/margins": 26.965011596679688, + "rewards/rejected": -35.72819900512695, + "step": 3424 + }, + { + "epoch": 2.1306376360808708, + "grad_norm": 0.0001959709479706362, + "learning_rate": 1.6090364223144309e-06, + "logits/chosen": 0.1824033260345459, + "logits/rejected": 2.3923895359039307, + "logps/chosen": -480.4774169921875, + "logps/rejected": -901.778076171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.293122291564941, + "rewards/margins": 25.791786193847656, + "rewards/rejected": -38.08490753173828, + "step": 3425 + }, + { + "epoch": 2.1312597200622085, + "grad_norm": 2.947682787635131e-06, + "learning_rate": 1.607883817427386e-06, + "logits/chosen": 1.6706695556640625, + "logits/rejected": 1.5100889205932617, + "logps/chosen": -709.5021362304688, + "logps/rejected": -1123.13623046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.592023849487305, + "rewards/margins": 37.92151641845703, + "rewards/rejected": -50.51354217529297, + "step": 3426 + }, + { + "epoch": 2.131881804043546, + "grad_norm": 8.702866580279078e-06, + "learning_rate": 1.6067312125403415e-06, + "logits/chosen": -3.008915424346924, + "logits/rejected": 3.2472410202026367, + "logps/chosen": -362.72802734375, + "logps/rejected": -1140.3369140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.616853713989258, + "rewards/margins": 39.08551025390625, + "rewards/rejected": -47.70236587524414, + "step": 3427 + }, + { + "epoch": 2.132503888024883, + "grad_norm": 8.457972580799833e-05, + "learning_rate": 1.6055786076532967e-06, + "logits/chosen": 0.2832450866699219, + "logits/rejected": 5.077991008758545, + "logps/chosen": -337.720458984375, + "logps/rejected": -969.3193359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.811567783355713, + "rewards/margins": 29.737640380859375, + "rewards/rejected": -34.5492057800293, + "step": 3428 + }, + { + "epoch": 2.133125972006221, + "grad_norm": 0.019046053290367126, + "learning_rate": 1.604426002766252e-06, + "logits/chosen": 1.4136080741882324, + "logits/rejected": 2.035309314727783, + "logps/chosen": -560.418212890625, + "logps/rejected": -939.0333251953125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.986815929412842, + "rewards/margins": 34.09614181518555, + "rewards/rejected": -41.08295440673828, + "step": 3429 + }, + { + "epoch": 2.1337480559875583, + "grad_norm": 0.00037207978311926126, + "learning_rate": 1.6032733978792072e-06, + "logits/chosen": 1.2080271244049072, + "logits/rejected": 3.942293643951416, + "logps/chosen": -448.4974365234375, + "logps/rejected": -820.5440673828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.193816184997559, + "rewards/margins": 23.287883758544922, + "rewards/rejected": -32.4817008972168, + "step": 3430 + }, + { + "epoch": 2.1343701399688957, + "grad_norm": 5.110921859741211, + "learning_rate": 1.6021207929921626e-06, + "logits/chosen": -1.5018537044525146, + "logits/rejected": 1.764155626296997, + "logps/chosen": -333.2330322265625, + "logps/rejected": -845.230712890625, + "loss": 0.0328, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.84320068359375, + "rewards/margins": 27.2366943359375, + "rewards/rejected": -35.07989501953125, + "step": 3431 + }, + { + "epoch": 2.1349922239502335, + "grad_norm": 0.10313411802053452, + "learning_rate": 1.6009681881051176e-06, + "logits/chosen": 0.2874959111213684, + "logits/rejected": 3.128005027770996, + "logps/chosen": -493.38836669921875, + "logps/rejected": -862.8633422851562, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.106266021728516, + "rewards/margins": 22.064449310302734, + "rewards/rejected": -29.170717239379883, + "step": 3432 + }, + { + "epoch": 2.135614307931571, + "grad_norm": 0.5469374060630798, + "learning_rate": 1.5998155832180729e-06, + "logits/chosen": 0.9954251050949097, + "logits/rejected": 3.078653335571289, + "logps/chosen": -632.5187377929688, + "logps/rejected": -1006.222412109375, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.858264923095703, + "rewards/margins": 20.12818717956543, + "rewards/rejected": -36.9864501953125, + "step": 3433 + }, + { + "epoch": 2.136236391912908, + "grad_norm": 0.04896441847085953, + "learning_rate": 1.598662978331028e-06, + "logits/chosen": -0.063498854637146, + "logits/rejected": 3.525094509124756, + "logps/chosen": -441.2791442871094, + "logps/rejected": -863.0089111328125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.556578636169434, + "rewards/margins": 21.468883514404297, + "rewards/rejected": -29.025463104248047, + "step": 3434 + }, + { + "epoch": 2.136858475894246, + "grad_norm": 8.239752787631005e-06, + "learning_rate": 1.5975103734439833e-06, + "logits/chosen": -0.23733371496200562, + "logits/rejected": 0.8966051936149597, + "logps/chosen": -587.71337890625, + "logps/rejected": -949.0718994140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.718193054199219, + "rewards/margins": 26.919227600097656, + "rewards/rejected": -37.637420654296875, + "step": 3435 + }, + { + "epoch": 2.1374805598755833, + "grad_norm": 0.00015162007184699178, + "learning_rate": 1.5963577685569388e-06, + "logits/chosen": 0.4512918293476105, + "logits/rejected": 3.2689778804779053, + "logps/chosen": -581.7296752929688, + "logps/rejected": -1036.16357421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.346375465393066, + "rewards/margins": 32.81805419921875, + "rewards/rejected": -42.1644287109375, + "step": 3436 + }, + { + "epoch": 2.1381026438569206, + "grad_norm": 0.00044463237281888723, + "learning_rate": 1.595205163669894e-06, + "logits/chosen": -0.19343051314353943, + "logits/rejected": 1.666266679763794, + "logps/chosen": -576.584716796875, + "logps/rejected": -1199.6351318359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.0612592697143555, + "rewards/margins": 40.66623306274414, + "rewards/rejected": -45.72749328613281, + "step": 3437 + }, + { + "epoch": 2.138724727838258, + "grad_norm": 6.797229161747964e-06, + "learning_rate": 1.5940525587828492e-06, + "logits/chosen": 2.218825340270996, + "logits/rejected": 3.9413962364196777, + "logps/chosen": -811.3069458007812, + "logps/rejected": -1181.102294921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.166580200195312, + "rewards/margins": 31.07561492919922, + "rewards/rejected": -44.24219512939453, + "step": 3438 + }, + { + "epoch": 2.1393468118195957, + "grad_norm": 4.010344491689466e-05, + "learning_rate": 1.5928999538958046e-06, + "logits/chosen": 0.6854308843612671, + "logits/rejected": 3.5298032760620117, + "logps/chosen": -543.1837158203125, + "logps/rejected": -993.5977783203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.933540344238281, + "rewards/margins": 31.57440185546875, + "rewards/rejected": -42.50794219970703, + "step": 3439 + }, + { + "epoch": 2.139968895800933, + "grad_norm": 4.72462797164917, + "learning_rate": 1.5917473490087599e-06, + "logits/chosen": 0.36550286412239075, + "logits/rejected": -0.1433129608631134, + "logps/chosen": -605.7955322265625, + "logps/rejected": -873.3226928710938, + "loss": 0.0114, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.314057350158691, + "rewards/margins": 30.274394989013672, + "rewards/rejected": -40.58845520019531, + "step": 3440 + }, + { + "epoch": 2.1405909797822704, + "grad_norm": 2.0681025603153103e-07, + "learning_rate": 1.590594744121715e-06, + "logits/chosen": 0.44030457735061646, + "logits/rejected": 1.914415955543518, + "logps/chosen": -468.28497314453125, + "logps/rejected": -858.67626953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.888754844665527, + "rewards/margins": 29.416000366210938, + "rewards/rejected": -39.30475616455078, + "step": 3441 + }, + { + "epoch": 2.141213063763608, + "grad_norm": 0.01545824483036995, + "learning_rate": 1.5894421392346703e-06, + "logits/chosen": 2.361666202545166, + "logits/rejected": 3.3521080017089844, + "logps/chosen": -671.66455078125, + "logps/rejected": -1096.104248046875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.397945404052734, + "rewards/margins": 27.711437225341797, + "rewards/rejected": -44.10938262939453, + "step": 3442 + }, + { + "epoch": 2.1418351477449455, + "grad_norm": 0.4221373498439789, + "learning_rate": 1.5882895343476258e-06, + "logits/chosen": -1.1415373086929321, + "logits/rejected": 2.070000410079956, + "logps/chosen": -366.7683410644531, + "logps/rejected": -928.0114135742188, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.439271926879883, + "rewards/margins": 36.66456985473633, + "rewards/rejected": -45.103843688964844, + "step": 3443 + }, + { + "epoch": 2.142457231726283, + "grad_norm": 0.022570105269551277, + "learning_rate": 1.587136929460581e-06, + "logits/chosen": -1.1626898050308228, + "logits/rejected": 2.8129069805145264, + "logps/chosen": -484.2257995605469, + "logps/rejected": -1054.75, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.110090255737305, + "rewards/margins": 33.480899810791016, + "rewards/rejected": -40.59099197387695, + "step": 3444 + }, + { + "epoch": 2.1430793157076207, + "grad_norm": 0.1024412214756012, + "learning_rate": 1.5859843245735362e-06, + "logits/chosen": -1.3715943098068237, + "logits/rejected": 1.9006588459014893, + "logps/chosen": -504.47418212890625, + "logps/rejected": -1045.1580810546875, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.661272048950195, + "rewards/margins": 30.334453582763672, + "rewards/rejected": -42.9957275390625, + "step": 3445 + }, + { + "epoch": 2.143701399688958, + "grad_norm": 5.425422668457031, + "learning_rate": 1.5848317196864914e-06, + "logits/chosen": -0.059883177280426025, + "logits/rejected": 2.410794734954834, + "logps/chosen": -361.6816711425781, + "logps/rejected": -728.779052734375, + "loss": 0.0182, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.959995269775391, + "rewards/margins": 21.38794708251953, + "rewards/rejected": -28.347942352294922, + "step": 3446 + }, + { + "epoch": 2.1443234836702953, + "grad_norm": 0.0003700493252836168, + "learning_rate": 1.5836791147994469e-06, + "logits/chosen": 0.10252094268798828, + "logits/rejected": 3.9793825149536133, + "logps/chosen": -514.8165283203125, + "logps/rejected": -1008.554443359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.945167541503906, + "rewards/margins": 31.51917266845703, + "rewards/rejected": -43.4643440246582, + "step": 3447 + }, + { + "epoch": 2.144945567651633, + "grad_norm": 32.00484085083008, + "learning_rate": 1.582526509912402e-06, + "logits/chosen": 3.398038148880005, + "logits/rejected": 2.6710593700408936, + "logps/chosen": -794.04541015625, + "logps/rejected": -1034.32861328125, + "loss": 0.783, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.406631469726562, + "rewards/margins": 20.88903045654297, + "rewards/rejected": -34.29566192626953, + "step": 3448 + }, + { + "epoch": 2.1455676516329705, + "grad_norm": 0.0014944367576390505, + "learning_rate": 1.5813739050253573e-06, + "logits/chosen": -0.21664124727249146, + "logits/rejected": 1.0657644271850586, + "logps/chosen": -538.9214477539062, + "logps/rejected": -952.3651123046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.446172714233398, + "rewards/margins": 31.40801239013672, + "rewards/rejected": -42.854183197021484, + "step": 3449 + }, + { + "epoch": 2.146189735614308, + "grad_norm": 0.009861108846962452, + "learning_rate": 1.5802213001383128e-06, + "logits/chosen": 2.1326751708984375, + "logits/rejected": 3.010751962661743, + "logps/chosen": -634.6590576171875, + "logps/rejected": -910.4227294921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.147429466247559, + "rewards/margins": 24.965938568115234, + "rewards/rejected": -36.11336898803711, + "step": 3450 + }, + { + "epoch": 2.1468118195956456, + "grad_norm": 12.68981647491455, + "learning_rate": 1.579068695251268e-06, + "logits/chosen": 0.1347132921218872, + "logits/rejected": 3.76957368850708, + "logps/chosen": -437.06329345703125, + "logps/rejected": -1030.0235595703125, + "loss": 0.0651, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.579730033874512, + "rewards/margins": 32.528709411621094, + "rewards/rejected": -39.108436584472656, + "step": 3451 + }, + { + "epoch": 2.147433903576983, + "grad_norm": 0.004146880470216274, + "learning_rate": 1.5779160903642232e-06, + "logits/chosen": -3.4257616996765137, + "logits/rejected": 0.663922905921936, + "logps/chosen": -254.12570190429688, + "logps/rejected": -789.0089721679688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.897040843963623, + "rewards/margins": 25.073740005493164, + "rewards/rejected": -30.970783233642578, + "step": 3452 + }, + { + "epoch": 2.1480559875583203, + "grad_norm": 0.004583262838423252, + "learning_rate": 1.5767634854771784e-06, + "logits/chosen": 1.2491068840026855, + "logits/rejected": 4.228781223297119, + "logps/chosen": -570.4674072265625, + "logps/rejected": -1028.893310546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.680624008178711, + "rewards/margins": 32.67273712158203, + "rewards/rejected": -42.353355407714844, + "step": 3453 + }, + { + "epoch": 2.148678071539658, + "grad_norm": 1.6916555978241377e-05, + "learning_rate": 1.5756108805901339e-06, + "logits/chosen": 1.6966251134872437, + "logits/rejected": 2.9529733657836914, + "logps/chosen": -604.318359375, + "logps/rejected": -914.4246826171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.783700942993164, + "rewards/margins": 26.6322021484375, + "rewards/rejected": -38.4159049987793, + "step": 3454 + }, + { + "epoch": 2.1493001555209954, + "grad_norm": 0.05122312530875206, + "learning_rate": 1.574458275703089e-06, + "logits/chosen": -3.1102006435394287, + "logits/rejected": 1.0001440048217773, + "logps/chosen": -423.76300048828125, + "logps/rejected": -1014.8577880859375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.17207145690918, + "rewards/margins": 37.166168212890625, + "rewards/rejected": -47.33823776245117, + "step": 3455 + }, + { + "epoch": 2.1499222395023327, + "grad_norm": 0.00029756067669950426, + "learning_rate": 1.5733056708160443e-06, + "logits/chosen": -1.2222700119018555, + "logits/rejected": 3.7177681922912598, + "logps/chosen": -603.027587890625, + "logps/rejected": -1247.6982421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.437413215637207, + "rewards/margins": 37.02989196777344, + "rewards/rejected": -48.46730422973633, + "step": 3456 + }, + { + "epoch": 2.15054432348367, + "grad_norm": 0.04954065755009651, + "learning_rate": 1.5721530659289995e-06, + "logits/chosen": 1.6569437980651855, + "logits/rejected": 3.8350000381469727, + "logps/chosen": -643.8868408203125, + "logps/rejected": -1028.9217529296875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.637311935424805, + "rewards/margins": 28.497699737548828, + "rewards/rejected": -38.135009765625, + "step": 3457 + }, + { + "epoch": 2.151166407465008, + "grad_norm": 3.508955478668213, + "learning_rate": 1.571000461041955e-06, + "logits/chosen": -2.0015316009521484, + "logits/rejected": 0.9972792863845825, + "logps/chosen": -548.6949462890625, + "logps/rejected": -1086.0467529296875, + "loss": 0.0416, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.5714111328125, + "rewards/margins": 30.41091537475586, + "rewards/rejected": -43.98232650756836, + "step": 3458 + }, + { + "epoch": 2.151788491446345, + "grad_norm": 8.965987035480794e-06, + "learning_rate": 1.5698478561549102e-06, + "logits/chosen": 1.6384658813476562, + "logits/rejected": 4.505178451538086, + "logps/chosen": -584.2413330078125, + "logps/rejected": -1142.488525390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.16094970703125, + "rewards/margins": 36.01255798339844, + "rewards/rejected": -43.17350769042969, + "step": 3459 + }, + { + "epoch": 2.1524105754276825, + "grad_norm": 0.00012471464287955314, + "learning_rate": 1.5686952512678654e-06, + "logits/chosen": 2.938779592514038, + "logits/rejected": 4.181041717529297, + "logps/chosen": -635.9864501953125, + "logps/rejected": -1054.688720703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.60373306274414, + "rewards/margins": 34.09574508666992, + "rewards/rejected": -44.69947814941406, + "step": 3460 + }, + { + "epoch": 2.1530326594090203, + "grad_norm": 2.1145135065125942e-07, + "learning_rate": 1.5675426463808209e-06, + "logits/chosen": 1.1438682079315186, + "logits/rejected": 3.93155574798584, + "logps/chosen": -453.353759765625, + "logps/rejected": -957.1611328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.293970108032227, + "rewards/margins": 37.22956848144531, + "rewards/rejected": -45.523536682128906, + "step": 3461 + }, + { + "epoch": 2.1536547433903577, + "grad_norm": 6.02866823302961e-13, + "learning_rate": 1.566390041493776e-06, + "logits/chosen": -1.322139024734497, + "logits/rejected": 3.6064188480377197, + "logps/chosen": -487.96136474609375, + "logps/rejected": -1197.617431640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.6765775680542, + "rewards/margins": 41.77438735961914, + "rewards/rejected": -50.450965881347656, + "step": 3462 + }, + { + "epoch": 2.154276827371695, + "grad_norm": 0.08622624725103378, + "learning_rate": 1.5652374366067313e-06, + "logits/chosen": 1.3654158115386963, + "logits/rejected": 2.3155899047851562, + "logps/chosen": -730.7118530273438, + "logps/rejected": -1029.7301025390625, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.9762020111084, + "rewards/margins": 20.96411895751953, + "rewards/rejected": -37.9403190612793, + "step": 3463 + }, + { + "epoch": 2.154898911353033, + "grad_norm": 0.009618532843887806, + "learning_rate": 1.5640848317196865e-06, + "logits/chosen": 0.9912912845611572, + "logits/rejected": 2.8535585403442383, + "logps/chosen": -479.0268859863281, + "logps/rejected": -854.8507080078125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.03948450088501, + "rewards/margins": 29.229816436767578, + "rewards/rejected": -36.2692985534668, + "step": 3464 + }, + { + "epoch": 2.15552099533437, + "grad_norm": 0.10931253433227539, + "learning_rate": 1.562932226832642e-06, + "logits/chosen": -0.7888144254684448, + "logits/rejected": 3.1281023025512695, + "logps/chosen": -411.3115234375, + "logps/rejected": -841.8549194335938, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.393655776977539, + "rewards/margins": 25.66720199584961, + "rewards/rejected": -34.06085968017578, + "step": 3465 + }, + { + "epoch": 2.1561430793157075, + "grad_norm": 0.00019762212468776852, + "learning_rate": 1.5617796219455972e-06, + "logits/chosen": -1.8151626586914062, + "logits/rejected": 1.8206268548965454, + "logps/chosen": -378.3106384277344, + "logps/rejected": -955.4374389648438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.059768676757812, + "rewards/margins": 30.776660919189453, + "rewards/rejected": -38.836429595947266, + "step": 3466 + }, + { + "epoch": 2.1567651632970453, + "grad_norm": 1.8507220147512271e-06, + "learning_rate": 1.5606270170585524e-06, + "logits/chosen": 1.1831034421920776, + "logits/rejected": 3.867389678955078, + "logps/chosen": -505.9981994628906, + "logps/rejected": -989.8499755859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.651363372802734, + "rewards/margins": 34.708984375, + "rewards/rejected": -42.36034393310547, + "step": 3467 + }, + { + "epoch": 2.1573872472783826, + "grad_norm": 0.004638043697923422, + "learning_rate": 1.5594744121715076e-06, + "logits/chosen": -1.4236783981323242, + "logits/rejected": 2.9329476356506348, + "logps/chosen": -343.78955078125, + "logps/rejected": -819.7637939453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.332006454467773, + "rewards/margins": 25.30170440673828, + "rewards/rejected": -32.63371276855469, + "step": 3468 + }, + { + "epoch": 2.15800933125972, + "grad_norm": 10.412405014038086, + "learning_rate": 1.558321807284463e-06, + "logits/chosen": -0.23219335079193115, + "logits/rejected": 2.9971132278442383, + "logps/chosen": -595.6908569335938, + "logps/rejected": -1006.50537109375, + "loss": 0.0425, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.477470397949219, + "rewards/margins": 21.702659606933594, + "rewards/rejected": -30.180130004882812, + "step": 3469 + }, + { + "epoch": 2.1586314152410577, + "grad_norm": 0.003561669262126088, + "learning_rate": 1.5571692023974183e-06, + "logits/chosen": 0.5902441740036011, + "logits/rejected": 1.6300718784332275, + "logps/chosen": -610.6525268554688, + "logps/rejected": -981.94580078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.881414413452148, + "rewards/margins": 30.17713165283203, + "rewards/rejected": -45.05854034423828, + "step": 3470 + }, + { + "epoch": 2.159253499222395, + "grad_norm": 0.0019642633851617575, + "learning_rate": 1.5560165975103735e-06, + "logits/chosen": 2.9218506813049316, + "logits/rejected": 4.19120979309082, + "logps/chosen": -704.3328247070312, + "logps/rejected": -1116.96240234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.29806900024414, + "rewards/margins": 34.05742645263672, + "rewards/rejected": -46.35549545288086, + "step": 3471 + }, + { + "epoch": 2.1598755832037324, + "grad_norm": 4.151289867415642e-10, + "learning_rate": 1.554863992623329e-06, + "logits/chosen": 0.576399564743042, + "logits/rejected": 0.6314655542373657, + "logps/chosen": -596.7716064453125, + "logps/rejected": -1056.8067626953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.739250183105469, + "rewards/margins": 37.79853057861328, + "rewards/rejected": -48.53778076171875, + "step": 3472 + }, + { + "epoch": 2.16049766718507, + "grad_norm": 35.283565521240234, + "learning_rate": 1.5537113877362842e-06, + "logits/chosen": -1.2791149616241455, + "logits/rejected": 1.7859420776367188, + "logps/chosen": -557.95166015625, + "logps/rejected": -1006.7755126953125, + "loss": 0.14, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.688139915466309, + "rewards/margins": 21.129825592041016, + "rewards/rejected": -33.81796646118164, + "step": 3473 + }, + { + "epoch": 2.1611197511664075, + "grad_norm": 0.09894830733537674, + "learning_rate": 1.5525587828492394e-06, + "logits/chosen": -0.0892077088356018, + "logits/rejected": 1.6869478225708008, + "logps/chosen": -536.9102172851562, + "logps/rejected": -892.3147583007812, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.899477005004883, + "rewards/margins": 23.70627212524414, + "rewards/rejected": -35.60574722290039, + "step": 3474 + }, + { + "epoch": 2.161741835147745, + "grad_norm": 2.0548529624938965, + "learning_rate": 1.5514061779621946e-06, + "logits/chosen": 1.3693408966064453, + "logits/rejected": 2.579724073410034, + "logps/chosen": -536.6299438476562, + "logps/rejected": -680.3751220703125, + "loss": 0.0071, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.347484588623047, + "rewards/margins": 14.572774887084961, + "rewards/rejected": -21.920257568359375, + "step": 3475 + }, + { + "epoch": 2.162363919129082, + "grad_norm": 0.0224276315420866, + "learning_rate": 1.55025357307515e-06, + "logits/chosen": 0.5975302457809448, + "logits/rejected": 4.302839279174805, + "logps/chosen": -536.1854248046875, + "logps/rejected": -939.0042724609375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.972611427307129, + "rewards/margins": 20.14980125427246, + "rewards/rejected": -28.122413635253906, + "step": 3476 + }, + { + "epoch": 2.16298600311042, + "grad_norm": 0.021996496245265007, + "learning_rate": 1.5491009681881053e-06, + "logits/chosen": -0.18943238258361816, + "logits/rejected": 2.7617297172546387, + "logps/chosen": -565.2724609375, + "logps/rejected": -1072.9237060546875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.456770896911621, + "rewards/margins": 34.614444732666016, + "rewards/rejected": -42.07121658325195, + "step": 3477 + }, + { + "epoch": 2.1636080870917573, + "grad_norm": 1.227846602169791e-09, + "learning_rate": 1.5479483633010605e-06, + "logits/chosen": -1.9056190252304077, + "logits/rejected": 2.281365394592285, + "logps/chosen": -429.4873046875, + "logps/rejected": -1025.097900390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.220512390136719, + "rewards/margins": 37.59743118286133, + "rewards/rejected": -47.81794738769531, + "step": 3478 + }, + { + "epoch": 2.1642301710730947, + "grad_norm": 5.147202500666026e-06, + "learning_rate": 1.5467957584140158e-06, + "logits/chosen": 0.15378397703170776, + "logits/rejected": 4.166439056396484, + "logps/chosen": -535.3529052734375, + "logps/rejected": -1126.279052734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.468807220458984, + "rewards/margins": 37.814125061035156, + "rewards/rejected": -45.28293228149414, + "step": 3479 + }, + { + "epoch": 2.1648522550544325, + "grad_norm": 6.843938899692148e-05, + "learning_rate": 1.5456431535269712e-06, + "logits/chosen": -0.10889559984207153, + "logits/rejected": 2.361340045928955, + "logps/chosen": -568.2152099609375, + "logps/rejected": -1050.350341796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.513697624206543, + "rewards/margins": 35.53955841064453, + "rewards/rejected": -50.053253173828125, + "step": 3480 + }, + { + "epoch": 2.16547433903577, + "grad_norm": 0.0102471224963665, + "learning_rate": 1.5444905486399264e-06, + "logits/chosen": 0.7401965856552124, + "logits/rejected": 3.1332480907440186, + "logps/chosen": -639.3670654296875, + "logps/rejected": -1051.7347412109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.7382612228393555, + "rewards/margins": 27.60154914855957, + "rewards/rejected": -35.339813232421875, + "step": 3481 + }, + { + "epoch": 2.166096423017107, + "grad_norm": 0.00041303454781882465, + "learning_rate": 1.5433379437528816e-06, + "logits/chosen": -0.9328422546386719, + "logits/rejected": 3.0686655044555664, + "logps/chosen": -263.8511047363281, + "logps/rejected": -749.5258178710938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.905520915985107, + "rewards/margins": 27.547443389892578, + "rewards/rejected": -32.452964782714844, + "step": 3482 + }, + { + "epoch": 2.166718506998445, + "grad_norm": 2.258577325164879e-09, + "learning_rate": 1.542185338865837e-06, + "logits/chosen": 2.8638532161712646, + "logits/rejected": 3.6541571617126465, + "logps/chosen": -686.989990234375, + "logps/rejected": -1098.45458984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.719980239868164, + "rewards/margins": 35.45842742919922, + "rewards/rejected": -48.17841339111328, + "step": 3483 + }, + { + "epoch": 2.1673405909797823, + "grad_norm": 0.046206485480070114, + "learning_rate": 1.5410327339787923e-06, + "logits/chosen": 0.6019414067268372, + "logits/rejected": 4.246374130249023, + "logps/chosen": -591.260009765625, + "logps/rejected": -1131.1484375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.808977127075195, + "rewards/margins": 32.77920150756836, + "rewards/rejected": -46.58818054199219, + "step": 3484 + }, + { + "epoch": 2.1679626749611196, + "grad_norm": 1.8323400020599365, + "learning_rate": 1.5398801290917475e-06, + "logits/chosen": 3.3164963722229004, + "logits/rejected": 4.026242256164551, + "logps/chosen": -558.5890502929688, + "logps/rejected": -857.2715454101562, + "loss": 0.0113, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.3273725509643555, + "rewards/margins": 24.035884857177734, + "rewards/rejected": -31.36325454711914, + "step": 3485 + }, + { + "epoch": 2.1685847589424574, + "grad_norm": 0.0004539400397334248, + "learning_rate": 1.5387275242047028e-06, + "logits/chosen": 0.121127188205719, + "logits/rejected": 3.4484081268310547, + "logps/chosen": -671.3143310546875, + "logps/rejected": -1044.74609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.50837516784668, + "rewards/margins": 31.49677276611328, + "rewards/rejected": -40.005149841308594, + "step": 3486 + }, + { + "epoch": 2.1692068429237947, + "grad_norm": 4.255609198366983e-09, + "learning_rate": 1.5375749193176582e-06, + "logits/chosen": -0.1348937749862671, + "logits/rejected": 4.0312323570251465, + "logps/chosen": -372.6984558105469, + "logps/rejected": -992.5408935546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.464959621429443, + "rewards/margins": 36.84416580200195, + "rewards/rejected": -42.30912399291992, + "step": 3487 + }, + { + "epoch": 2.169828926905132, + "grad_norm": 0.03619871661067009, + "learning_rate": 1.5364223144306134e-06, + "logits/chosen": -0.1388590782880783, + "logits/rejected": 0.4599158763885498, + "logps/chosen": -498.08544921875, + "logps/rejected": -965.0997314453125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.397851943969727, + "rewards/margins": 33.254859924316406, + "rewards/rejected": -42.6527099609375, + "step": 3488 + }, + { + "epoch": 2.17045101088647, + "grad_norm": 0.0035405848175287247, + "learning_rate": 1.5352697095435686e-06, + "logits/chosen": 1.951183795928955, + "logits/rejected": 3.039306640625, + "logps/chosen": -680.4669189453125, + "logps/rejected": -1072.6722412109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.782991409301758, + "rewards/margins": 27.88431739807129, + "rewards/rejected": -41.66730880737305, + "step": 3489 + }, + { + "epoch": 2.171073094867807, + "grad_norm": 0.2370285987854004, + "learning_rate": 1.534117104656524e-06, + "logits/chosen": 2.453774929046631, + "logits/rejected": 4.862796783447266, + "logps/chosen": -830.1021728515625, + "logps/rejected": -1309.16845703125, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.799744606018066, + "rewards/margins": 32.265045166015625, + "rewards/rejected": -47.064788818359375, + "step": 3490 + }, + { + "epoch": 2.1716951788491445, + "grad_norm": 0.01857941411435604, + "learning_rate": 1.5329644997694793e-06, + "logits/chosen": 1.4445005655288696, + "logits/rejected": 3.2285866737365723, + "logps/chosen": -647.0142822265625, + "logps/rejected": -1020.450439453125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.633213996887207, + "rewards/margins": 27.143659591674805, + "rewards/rejected": -38.77687072753906, + "step": 3491 + }, + { + "epoch": 2.1723172628304823, + "grad_norm": 6.384193693520501e-05, + "learning_rate": 1.5318118948824345e-06, + "logits/chosen": 0.22042182087898254, + "logits/rejected": 3.2595787048339844, + "logps/chosen": -557.39599609375, + "logps/rejected": -1137.6495361328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.450281143188477, + "rewards/margins": 40.4174919128418, + "rewards/rejected": -49.867767333984375, + "step": 3492 + }, + { + "epoch": 2.1729393468118197, + "grad_norm": 0.024043237790465355, + "learning_rate": 1.5306592899953897e-06, + "logits/chosen": -0.896227240562439, + "logits/rejected": 2.541123390197754, + "logps/chosen": -508.51806640625, + "logps/rejected": -1031.8397216796875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.197294235229492, + "rewards/margins": 32.25333786010742, + "rewards/rejected": -39.45063018798828, + "step": 3493 + }, + { + "epoch": 2.173561430793157, + "grad_norm": 0.1542564034461975, + "learning_rate": 1.5295066851083452e-06, + "logits/chosen": 1.0172396898269653, + "logits/rejected": 2.698474407196045, + "logps/chosen": -455.7784423828125, + "logps/rejected": -922.6686401367188, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.295679092407227, + "rewards/margins": 33.03749084472656, + "rewards/rejected": -46.33317184448242, + "step": 3494 + }, + { + "epoch": 2.1741835147744943, + "grad_norm": 3.4196689128875732, + "learning_rate": 1.5283540802213004e-06, + "logits/chosen": -1.5701600313186646, + "logits/rejected": 0.7547981142997742, + "logps/chosen": -522.6417236328125, + "logps/rejected": -887.583251953125, + "loss": 0.02, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.89109992980957, + "rewards/margins": 22.884368896484375, + "rewards/rejected": -33.77547073364258, + "step": 3495 + }, + { + "epoch": 2.174805598755832, + "grad_norm": 2.236021041870117, + "learning_rate": 1.5272014753342556e-06, + "logits/chosen": 2.0567288398742676, + "logits/rejected": 3.635549306869507, + "logps/chosen": -610.8658447265625, + "logps/rejected": -1079.648193359375, + "loss": 0.0081, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.349096298217773, + "rewards/margins": 30.307376861572266, + "rewards/rejected": -41.656471252441406, + "step": 3496 + }, + { + "epoch": 2.1754276827371695, + "grad_norm": 19.92326545715332, + "learning_rate": 1.5260488704472109e-06, + "logits/chosen": -0.9514064788818359, + "logits/rejected": 2.288212537765503, + "logps/chosen": -524.08349609375, + "logps/rejected": -1010.56298828125, + "loss": 0.0905, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.694738388061523, + "rewards/margins": 33.93772888183594, + "rewards/rejected": -41.632469177246094, + "step": 3497 + }, + { + "epoch": 2.176049766718507, + "grad_norm": 0.17036820948123932, + "learning_rate": 1.5248962655601663e-06, + "logits/chosen": -2.106801748275757, + "logits/rejected": 4.079695224761963, + "logps/chosen": -369.3140869140625, + "logps/rejected": -1085.123779296875, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.724808692932129, + "rewards/margins": 30.634748458862305, + "rewards/rejected": -39.35955810546875, + "step": 3498 + }, + { + "epoch": 2.1766718506998446, + "grad_norm": 1.4225753545761108, + "learning_rate": 1.5237436606731215e-06, + "logits/chosen": 0.6853106617927551, + "logits/rejected": 3.601655960083008, + "logps/chosen": -548.959716796875, + "logps/rejected": -991.363037109375, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.702157974243164, + "rewards/margins": 26.26728630065918, + "rewards/rejected": -38.969444274902344, + "step": 3499 + }, + { + "epoch": 2.177293934681182, + "grad_norm": 0.2219383269548416, + "learning_rate": 1.5225910557860765e-06, + "logits/chosen": 0.3559526205062866, + "logits/rejected": 3.7226366996765137, + "logps/chosen": -449.0332946777344, + "logps/rejected": -1052.79443359375, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.326770782470703, + "rewards/margins": 35.569976806640625, + "rewards/rejected": -43.89674758911133, + "step": 3500 + }, + { + "epoch": 2.1779160186625193, + "grad_norm": 35.52800750732422, + "learning_rate": 1.5214384508990318e-06, + "logits/chosen": 1.4980649948120117, + "logits/rejected": 4.63604736328125, + "logps/chosen": -586.69580078125, + "logps/rejected": -1035.624267578125, + "loss": 0.4524, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.078465461730957, + "rewards/margins": 31.11182975769043, + "rewards/rejected": -41.19029235839844, + "step": 3501 + }, + { + "epoch": 2.178538102643857, + "grad_norm": 1.1494609708506687e-08, + "learning_rate": 1.5202858460119872e-06, + "logits/chosen": -0.8052943348884583, + "logits/rejected": 3.309441089630127, + "logps/chosen": -453.7115173339844, + "logps/rejected": -980.9556884765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.90156364440918, + "rewards/margins": 32.78784942626953, + "rewards/rejected": -41.689414978027344, + "step": 3502 + }, + { + "epoch": 2.1791601866251944, + "grad_norm": 0.46481531858444214, + "learning_rate": 1.5191332411249424e-06, + "logits/chosen": 0.530727744102478, + "logits/rejected": 1.2075937986373901, + "logps/chosen": -715.234619140625, + "logps/rejected": -1115.29931640625, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.125618934631348, + "rewards/margins": 25.974449157714844, + "rewards/rejected": -41.100067138671875, + "step": 3503 + }, + { + "epoch": 2.1797822706065317, + "grad_norm": 0.0032227826304733753, + "learning_rate": 1.5179806362378976e-06, + "logits/chosen": 0.8939819931983948, + "logits/rejected": 2.9865517616271973, + "logps/chosen": -603.0927734375, + "logps/rejected": -1192.5281982421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.699723243713379, + "rewards/margins": 38.47848892211914, + "rewards/rejected": -47.1782112121582, + "step": 3504 + }, + { + "epoch": 2.1804043545878695, + "grad_norm": 0.009147186763584614, + "learning_rate": 1.5168280313508529e-06, + "logits/chosen": 3.3015215396881104, + "logits/rejected": 1.3482310771942139, + "logps/chosen": -738.5409545898438, + "logps/rejected": -845.2588500976562, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.378486633300781, + "rewards/margins": 20.573898315429688, + "rewards/rejected": -33.95238494873047, + "step": 3505 + }, + { + "epoch": 2.181026438569207, + "grad_norm": 6.6773923208529595e-06, + "learning_rate": 1.5156754264638083e-06, + "logits/chosen": -2.602396011352539, + "logits/rejected": 2.857754707336426, + "logps/chosen": -372.400146484375, + "logps/rejected": -1036.571044921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.945173263549805, + "rewards/margins": 36.59382247924805, + "rewards/rejected": -44.53899383544922, + "step": 3506 + }, + { + "epoch": 2.181648522550544, + "grad_norm": 2.750762462615967, + "learning_rate": 1.5145228215767635e-06, + "logits/chosen": 1.6972405910491943, + "logits/rejected": 0.563304603099823, + "logps/chosen": -580.1192016601562, + "logps/rejected": -831.8045654296875, + "loss": 0.0091, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.198119163513184, + "rewards/margins": 26.365081787109375, + "rewards/rejected": -36.56319808959961, + "step": 3507 + }, + { + "epoch": 2.182270606531882, + "grad_norm": 0.04510970413684845, + "learning_rate": 1.5133702166897188e-06, + "logits/chosen": 1.1926695108413696, + "logits/rejected": 2.631222724914551, + "logps/chosen": -626.7939453125, + "logps/rejected": -989.72119140625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.63274097442627, + "rewards/margins": 27.476999282836914, + "rewards/rejected": -38.109737396240234, + "step": 3508 + }, + { + "epoch": 2.1828926905132193, + "grad_norm": 0.1802457571029663, + "learning_rate": 1.512217611802674e-06, + "logits/chosen": -3.372152328491211, + "logits/rejected": 0.6782333254814148, + "logps/chosen": -441.4357604980469, + "logps/rejected": -1135.593505859375, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.280879974365234, + "rewards/margins": 39.73485565185547, + "rewards/rejected": -49.0157356262207, + "step": 3509 + }, + { + "epoch": 2.1835147744945567, + "grad_norm": 0.005605627316981554, + "learning_rate": 1.5110650069156294e-06, + "logits/chosen": -2.9359774589538574, + "logits/rejected": 4.364621162414551, + "logps/chosen": -342.99444580078125, + "logps/rejected": -1007.0554809570312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.298252105712891, + "rewards/margins": 34.19009780883789, + "rewards/rejected": -41.48834991455078, + "step": 3510 + }, + { + "epoch": 2.1841368584758944, + "grad_norm": 14.325651168823242, + "learning_rate": 1.5099124020285846e-06, + "logits/chosen": 0.26200932264328003, + "logits/rejected": 3.8386402130126953, + "logps/chosen": -659.2767944335938, + "logps/rejected": -1086.6053466796875, + "loss": 0.0841, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.393942832946777, + "rewards/margins": 32.03108215332031, + "rewards/rejected": -42.42502212524414, + "step": 3511 + }, + { + "epoch": 2.184758942457232, + "grad_norm": 58.25017547607422, + "learning_rate": 1.5087597971415399e-06, + "logits/chosen": 2.0818560123443604, + "logits/rejected": 3.9699525833129883, + "logps/chosen": -757.948486328125, + "logps/rejected": -1054.1805419921875, + "loss": 0.3581, + "rewards/accuracies": 0.875, + "rewards/chosen": -15.566535949707031, + "rewards/margins": 27.65494155883789, + "rewards/rejected": -43.221473693847656, + "step": 3512 + }, + { + "epoch": 2.185381026438569, + "grad_norm": 20.528139114379883, + "learning_rate": 1.5076071922544953e-06, + "logits/chosen": 1.878481149673462, + "logits/rejected": 3.6764373779296875, + "logps/chosen": -720.4259643554688, + "logps/rejected": -1064.9002685546875, + "loss": 0.1138, + "rewards/accuracies": 0.875, + "rewards/chosen": -15.61282730102539, + "rewards/margins": 23.589069366455078, + "rewards/rejected": -39.20189666748047, + "step": 3513 + }, + { + "epoch": 2.1860031104199065, + "grad_norm": 7.523076055804268e-05, + "learning_rate": 1.5064545873674505e-06, + "logits/chosen": 0.2912430763244629, + "logits/rejected": 4.2857818603515625, + "logps/chosen": -687.9354858398438, + "logps/rejected": -1222.1806640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.304641723632812, + "rewards/margins": 32.19824981689453, + "rewards/rejected": -43.502891540527344, + "step": 3514 + }, + { + "epoch": 2.1866251944012443, + "grad_norm": 0.00023828446865081787, + "learning_rate": 1.5053019824804058e-06, + "logits/chosen": -0.37384992837905884, + "logits/rejected": 3.28475284576416, + "logps/chosen": -357.19329833984375, + "logps/rejected": -861.1342163085938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.559617519378662, + "rewards/margins": 26.56270980834961, + "rewards/rejected": -34.12232971191406, + "step": 3515 + }, + { + "epoch": 2.1872472783825816, + "grad_norm": 2.8168491553515196e-05, + "learning_rate": 1.504149377593361e-06, + "logits/chosen": -0.1479528546333313, + "logits/rejected": 3.6196656227111816, + "logps/chosen": -423.3016662597656, + "logps/rejected": -1042.63134765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.85476016998291, + "rewards/margins": 35.65232467651367, + "rewards/rejected": -41.50708770751953, + "step": 3516 + }, + { + "epoch": 2.187869362363919, + "grad_norm": 0.0005493653588928282, + "learning_rate": 1.5029967727063164e-06, + "logits/chosen": -0.6248797178268433, + "logits/rejected": 2.49371337890625, + "logps/chosen": -552.1177368164062, + "logps/rejected": -1033.814453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.576824188232422, + "rewards/margins": 29.918533325195312, + "rewards/rejected": -41.495357513427734, + "step": 3517 + }, + { + "epoch": 2.1884914463452567, + "grad_norm": 0.05042627453804016, + "learning_rate": 1.5018441678192716e-06, + "logits/chosen": -2.2561912536621094, + "logits/rejected": 2.464254856109619, + "logps/chosen": -401.0296630859375, + "logps/rejected": -1022.3479614257812, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.056978225708008, + "rewards/margins": 33.46770477294922, + "rewards/rejected": -41.524681091308594, + "step": 3518 + }, + { + "epoch": 2.189113530326594, + "grad_norm": 0.0019069320987910032, + "learning_rate": 1.5006915629322269e-06, + "logits/chosen": 0.8858805298805237, + "logits/rejected": 2.8581292629241943, + "logps/chosen": -599.8334350585938, + "logps/rejected": -1049.4576416015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.778345584869385, + "rewards/margins": 33.91278839111328, + "rewards/rejected": -40.691131591796875, + "step": 3519 + }, + { + "epoch": 2.1897356143079314, + "grad_norm": 0.05174775794148445, + "learning_rate": 1.499538958045182e-06, + "logits/chosen": 0.19469568133354187, + "logits/rejected": 2.310316562652588, + "logps/chosen": -497.45458984375, + "logps/rejected": -1019.6409912109375, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.488957405090332, + "rewards/margins": 33.4749870300293, + "rewards/rejected": -44.96394348144531, + "step": 3520 + }, + { + "epoch": 2.190357698289269, + "grad_norm": 0.6884315609931946, + "learning_rate": 1.4983863531581375e-06, + "logits/chosen": -2.5548439025878906, + "logits/rejected": 1.6989535093307495, + "logps/chosen": -362.3365173339844, + "logps/rejected": -820.731689453125, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.82147741317749, + "rewards/margins": 25.978748321533203, + "rewards/rejected": -32.80022430419922, + "step": 3521 + }, + { + "epoch": 2.1909797822706065, + "grad_norm": 0.0009601087076589465, + "learning_rate": 1.4972337482710927e-06, + "logits/chosen": 1.2198925018310547, + "logits/rejected": 2.9722330570220947, + "logps/chosen": -732.6044921875, + "logps/rejected": -1182.9945068359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.560627937316895, + "rewards/margins": 33.31861114501953, + "rewards/rejected": -44.87923812866211, + "step": 3522 + }, + { + "epoch": 2.191601866251944, + "grad_norm": 0.27258390188217163, + "learning_rate": 1.496081143384048e-06, + "logits/chosen": -0.42896389961242676, + "logits/rejected": 3.6511716842651367, + "logps/chosen": -614.4691772460938, + "logps/rejected": -1194.15576171875, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.832674026489258, + "rewards/margins": 40.16541290283203, + "rewards/rejected": -51.99808883666992, + "step": 3523 + }, + { + "epoch": 2.1922239502332816, + "grad_norm": 5.141660690307617, + "learning_rate": 1.4949285384970034e-06, + "logits/chosen": 0.1440633237361908, + "logits/rejected": 3.3952267169952393, + "logps/chosen": -544.6929321289062, + "logps/rejected": -929.3987426757812, + "loss": 0.0274, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.516995429992676, + "rewards/margins": 21.96893310546875, + "rewards/rejected": -32.48592758178711, + "step": 3524 + }, + { + "epoch": 2.192846034214619, + "grad_norm": 0.30395951867103577, + "learning_rate": 1.4937759336099586e-06, + "logits/chosen": -0.002476602792739868, + "logits/rejected": 1.466905951499939, + "logps/chosen": -614.0319213867188, + "logps/rejected": -995.0291748046875, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.193775177001953, + "rewards/margins": 24.900527954101562, + "rewards/rejected": -42.09429931640625, + "step": 3525 + }, + { + "epoch": 2.1934681181959563, + "grad_norm": 30.426597595214844, + "learning_rate": 1.4926233287229139e-06, + "logits/chosen": -0.20709985494613647, + "logits/rejected": 2.804947853088379, + "logps/chosen": -548.359619140625, + "logps/rejected": -1074.06884765625, + "loss": 0.3336, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.806556701660156, + "rewards/margins": 33.09553146362305, + "rewards/rejected": -43.9020881652832, + "step": 3526 + }, + { + "epoch": 2.194090202177294, + "grad_norm": 0.0884014144539833, + "learning_rate": 1.491470723835869e-06, + "logits/chosen": 2.929100513458252, + "logits/rejected": 2.9884305000305176, + "logps/chosen": -615.928466796875, + "logps/rejected": -890.3482666015625, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.841999053955078, + "rewards/margins": 27.70352554321289, + "rewards/rejected": -37.54552459716797, + "step": 3527 + }, + { + "epoch": 2.1947122861586315, + "grad_norm": 0.031009182333946228, + "learning_rate": 1.4903181189488245e-06, + "logits/chosen": 1.8256065845489502, + "logits/rejected": 3.060570478439331, + "logps/chosen": -713.8429565429688, + "logps/rejected": -1134.923828125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.26293659210205, + "rewards/margins": 36.25966262817383, + "rewards/rejected": -45.52259826660156, + "step": 3528 + }, + { + "epoch": 2.195334370139969, + "grad_norm": 0.001949359430000186, + "learning_rate": 1.4891655140617797e-06, + "logits/chosen": -0.24708634614944458, + "logits/rejected": 2.8352670669555664, + "logps/chosen": -556.1632690429688, + "logps/rejected": -1012.2968139648438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.142890930175781, + "rewards/margins": 31.787410736083984, + "rewards/rejected": -36.93030548095703, + "step": 3529 + }, + { + "epoch": 2.1959564541213066, + "grad_norm": 0.008359517902135849, + "learning_rate": 1.488012909174735e-06, + "logits/chosen": -1.7454040050506592, + "logits/rejected": 2.6669399738311768, + "logps/chosen": -508.08868408203125, + "logps/rejected": -1210.76953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.647208213806152, + "rewards/margins": 36.04959487915039, + "rewards/rejected": -45.69680404663086, + "step": 3530 + }, + { + "epoch": 2.196578538102644, + "grad_norm": 5.639753197783648e-08, + "learning_rate": 1.4868603042876902e-06, + "logits/chosen": -1.8157124519348145, + "logits/rejected": 3.0563266277313232, + "logps/chosen": -342.3044738769531, + "logps/rejected": -1138.9464111328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.511418342590332, + "rewards/margins": 37.18653869628906, + "rewards/rejected": -43.697959899902344, + "step": 3531 + }, + { + "epoch": 2.1972006220839813, + "grad_norm": 0.13938391208648682, + "learning_rate": 1.4857076994006456e-06, + "logits/chosen": -0.25851160287857056, + "logits/rejected": 3.7246131896972656, + "logps/chosen": -504.6614990234375, + "logps/rejected": -1072.123291015625, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.598262786865234, + "rewards/margins": 29.225683212280273, + "rewards/rejected": -39.823944091796875, + "step": 3532 + }, + { + "epoch": 2.1978227060653186, + "grad_norm": 0.28094029426574707, + "learning_rate": 1.4845550945136009e-06, + "logits/chosen": -0.2853562831878662, + "logits/rejected": 0.4411306381225586, + "logps/chosen": -508.27093505859375, + "logps/rejected": -726.7429809570312, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.232595443725586, + "rewards/margins": 19.054052352905273, + "rewards/rejected": -28.286649703979492, + "step": 3533 + }, + { + "epoch": 2.1984447900466564, + "grad_norm": 1.8335808249503316e-07, + "learning_rate": 1.483402489626556e-06, + "logits/chosen": -0.7460952401161194, + "logits/rejected": 4.086732864379883, + "logps/chosen": -472.76861572265625, + "logps/rejected": -1131.452392578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.650875568389893, + "rewards/margins": 39.26519012451172, + "rewards/rejected": -45.91606521606445, + "step": 3534 + }, + { + "epoch": 2.1990668740279937, + "grad_norm": 0.0018125500064343214, + "learning_rate": 1.4822498847395115e-06, + "logits/chosen": 0.30365845561027527, + "logits/rejected": 3.7823288440704346, + "logps/chosen": -560.137451171875, + "logps/rejected": -1056.3480224609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.162646293640137, + "rewards/margins": 33.35071563720703, + "rewards/rejected": -44.51335906982422, + "step": 3535 + }, + { + "epoch": 2.199688958009331, + "grad_norm": 0.0004930261638946831, + "learning_rate": 1.4810972798524667e-06, + "logits/chosen": 3.154604911804199, + "logits/rejected": 2.6136741638183594, + "logps/chosen": -609.3040161132812, + "logps/rejected": -887.94482421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.351789474487305, + "rewards/margins": 26.85038185119629, + "rewards/rejected": -41.20217514038086, + "step": 3536 + }, + { + "epoch": 2.200311041990669, + "grad_norm": 0.13322575390338898, + "learning_rate": 1.479944674965422e-06, + "logits/chosen": 1.2375845909118652, + "logits/rejected": 2.894868850708008, + "logps/chosen": -503.1761474609375, + "logps/rejected": -812.9686889648438, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.426453590393066, + "rewards/margins": 21.795515060424805, + "rewards/rejected": -34.22196960449219, + "step": 3537 + }, + { + "epoch": 2.200933125972006, + "grad_norm": 0.002348793437704444, + "learning_rate": 1.4787920700783772e-06, + "logits/chosen": 0.6488022804260254, + "logits/rejected": 0.9976472854614258, + "logps/chosen": -719.718505859375, + "logps/rejected": -1167.53515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.721811294555664, + "rewards/margins": 38.567386627197266, + "rewards/rejected": -53.28919982910156, + "step": 3538 + }, + { + "epoch": 2.2015552099533435, + "grad_norm": 8.701942277866692e-09, + "learning_rate": 1.4776394651913326e-06, + "logits/chosen": -2.041717767715454, + "logits/rejected": 2.4373598098754883, + "logps/chosen": -499.24285888671875, + "logps/rejected": -1228.697998046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.595501899719238, + "rewards/margins": 39.86658477783203, + "rewards/rejected": -53.46208572387695, + "step": 3539 + }, + { + "epoch": 2.2021772939346813, + "grad_norm": 0.0034426345955580473, + "learning_rate": 1.4764868603042879e-06, + "logits/chosen": -0.7779150009155273, + "logits/rejected": 2.817213535308838, + "logps/chosen": -485.165771484375, + "logps/rejected": -1033.595703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.714851379394531, + "rewards/margins": 30.76633644104004, + "rewards/rejected": -39.4811897277832, + "step": 3540 + }, + { + "epoch": 2.2027993779160187, + "grad_norm": 7.423425267916173e-05, + "learning_rate": 1.475334255417243e-06, + "logits/chosen": 1.6081801652908325, + "logits/rejected": 3.639782428741455, + "logps/chosen": -629.6107177734375, + "logps/rejected": -1070.0302734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.458868026733398, + "rewards/margins": 27.184024810791016, + "rewards/rejected": -37.64289474487305, + "step": 3541 + }, + { + "epoch": 2.203421461897356, + "grad_norm": 10.609362602233887, + "learning_rate": 1.4741816505301983e-06, + "logits/chosen": 1.6137707233428955, + "logits/rejected": 3.130721092224121, + "logps/chosen": -699.2923583984375, + "logps/rejected": -1112.9217529296875, + "loss": 0.0336, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.161538124084473, + "rewards/margins": 30.89093589782715, + "rewards/rejected": -43.05247497558594, + "step": 3542 + }, + { + "epoch": 2.2040435458786938, + "grad_norm": 0.0009758673259057105, + "learning_rate": 1.4730290456431537e-06, + "logits/chosen": -2.5931081771850586, + "logits/rejected": 2.1481778621673584, + "logps/chosen": -423.0606689453125, + "logps/rejected": -1051.5849609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.25644302368164, + "rewards/margins": 34.31974792480469, + "rewards/rejected": -45.576194763183594, + "step": 3543 + }, + { + "epoch": 2.204665629860031, + "grad_norm": 5.5923883337527514e-05, + "learning_rate": 1.471876440756109e-06, + "logits/chosen": -0.7050518989562988, + "logits/rejected": 1.3252148628234863, + "logps/chosen": -466.5789489746094, + "logps/rejected": -930.0560302734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.0426225662231445, + "rewards/margins": 27.657917022705078, + "rewards/rejected": -34.70054244995117, + "step": 3544 + }, + { + "epoch": 2.2052877138413685, + "grad_norm": 13.390966415405273, + "learning_rate": 1.4707238358690642e-06, + "logits/chosen": -0.6056236624717712, + "logits/rejected": 2.167903423309326, + "logps/chosen": -402.4957275390625, + "logps/rejected": -790.405029296875, + "loss": 0.0767, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.958978652954102, + "rewards/margins": 24.03119468688965, + "rewards/rejected": -29.99017333984375, + "step": 3545 + }, + { + "epoch": 2.2059097978227062, + "grad_norm": 19.667755126953125, + "learning_rate": 1.4695712309820196e-06, + "logits/chosen": 0.19798988103866577, + "logits/rejected": 3.2702062129974365, + "logps/chosen": -622.1126098632812, + "logps/rejected": -1035.2138671875, + "loss": 0.1538, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.878641128540039, + "rewards/margins": 27.598281860351562, + "rewards/rejected": -38.476924896240234, + "step": 3546 + }, + { + "epoch": 2.2065318818040436, + "grad_norm": 33.967308044433594, + "learning_rate": 1.4684186260949748e-06, + "logits/chosen": 1.461820363998413, + "logits/rejected": 1.3853689432144165, + "logps/chosen": -551.6245727539062, + "logps/rejected": -700.6829833984375, + "loss": 0.2145, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.361347198486328, + "rewards/margins": 20.84174156188965, + "rewards/rejected": -30.203088760375977, + "step": 3547 + }, + { + "epoch": 2.207153965785381, + "grad_norm": 3.286201533114763e-08, + "learning_rate": 1.46726602120793e-06, + "logits/chosen": -0.11517900228500366, + "logits/rejected": 2.263479709625244, + "logps/chosen": -598.5966796875, + "logps/rejected": -1188.088134765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.82167911529541, + "rewards/margins": 40.516597747802734, + "rewards/rejected": -48.338279724121094, + "step": 3548 + }, + { + "epoch": 2.2077760497667187, + "grad_norm": 2.1776845455169678, + "learning_rate": 1.4661134163208853e-06, + "logits/chosen": -0.0010031461715698242, + "logits/rejected": 0.2503492832183838, + "logps/chosen": -622.16064453125, + "logps/rejected": -973.8892211914062, + "loss": 0.0075, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.452604293823242, + "rewards/margins": 29.204866409301758, + "rewards/rejected": -43.657470703125, + "step": 3549 + }, + { + "epoch": 2.208398133748056, + "grad_norm": 0.0007999642984941602, + "learning_rate": 1.4649608114338407e-06, + "logits/chosen": 2.285749912261963, + "logits/rejected": 3.295093059539795, + "logps/chosen": -692.86572265625, + "logps/rejected": -972.4854736328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.163134574890137, + "rewards/margins": 21.923898696899414, + "rewards/rejected": -37.087032318115234, + "step": 3550 + }, + { + "epoch": 2.2090202177293934, + "grad_norm": 0.0001855127193266526, + "learning_rate": 1.463808206546796e-06, + "logits/chosen": -1.0556727647781372, + "logits/rejected": 2.513211727142334, + "logps/chosen": -542.8209838867188, + "logps/rejected": -1080.12060546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.84273910522461, + "rewards/margins": 34.13514709472656, + "rewards/rejected": -42.97788619995117, + "step": 3551 + }, + { + "epoch": 2.2096423017107307, + "grad_norm": 0.2574240565299988, + "learning_rate": 1.4626556016597512e-06, + "logits/chosen": -0.01851367950439453, + "logits/rejected": 4.146186351776123, + "logps/chosen": -456.16351318359375, + "logps/rejected": -1008.7926635742188, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.954230308532715, + "rewards/margins": 23.800861358642578, + "rewards/rejected": -34.75509262084961, + "step": 3552 + }, + { + "epoch": 2.2102643856920685, + "grad_norm": 0.03723328188061714, + "learning_rate": 1.4615029967727066e-06, + "logits/chosen": 1.8422104120254517, + "logits/rejected": 3.3534679412841797, + "logps/chosen": -674.9481201171875, + "logps/rejected": -1139.601806640625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.71571159362793, + "rewards/margins": 39.695892333984375, + "rewards/rejected": -50.41160202026367, + "step": 3553 + }, + { + "epoch": 2.210886469673406, + "grad_norm": 5.6771368690533563e-05, + "learning_rate": 1.4603503918856618e-06, + "logits/chosen": 0.5690759420394897, + "logits/rejected": 3.7443671226501465, + "logps/chosen": -590.1197509765625, + "logps/rejected": -1096.5736083984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.7386474609375, + "rewards/margins": 32.21901321411133, + "rewards/rejected": -46.95766067504883, + "step": 3554 + }, + { + "epoch": 2.211508553654743, + "grad_norm": 2.097417350910291e-09, + "learning_rate": 1.459197786998617e-06, + "logits/chosen": -0.7746211886405945, + "logits/rejected": 2.040572166442871, + "logps/chosen": -559.7862548828125, + "logps/rejected": -1210.46240234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.710878372192383, + "rewards/margins": 44.146278381347656, + "rewards/rejected": -56.857154846191406, + "step": 3555 + }, + { + "epoch": 2.212130637636081, + "grad_norm": 1.0708082337496094e-10, + "learning_rate": 1.4580451821115723e-06, + "logits/chosen": 0.0021647214889526367, + "logits/rejected": 4.682831764221191, + "logps/chosen": -480.7462158203125, + "logps/rejected": -1162.875732421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.413545608520508, + "rewards/margins": 42.82459259033203, + "rewards/rejected": -50.238136291503906, + "step": 3556 + }, + { + "epoch": 2.2127527216174183, + "grad_norm": 0.5422202348709106, + "learning_rate": 1.4568925772245277e-06, + "logits/chosen": 1.54587721824646, + "logits/rejected": 2.656994104385376, + "logps/chosen": -693.7167358398438, + "logps/rejected": -941.8779296875, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.093391418457031, + "rewards/margins": 20.14789581298828, + "rewards/rejected": -31.241287231445312, + "step": 3557 + }, + { + "epoch": 2.2133748055987557, + "grad_norm": 1.6232826709747314, + "learning_rate": 1.455739972337483e-06, + "logits/chosen": 1.7829499244689941, + "logits/rejected": 3.495352268218994, + "logps/chosen": -649.2700805664062, + "logps/rejected": -998.00244140625, + "loss": 0.0069, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.135042190551758, + "rewards/margins": 21.831436157226562, + "rewards/rejected": -33.96647644042969, + "step": 3558 + }, + { + "epoch": 2.2139968895800934, + "grad_norm": 0.001622863463126123, + "learning_rate": 1.4545873674504382e-06, + "logits/chosen": -1.1253859996795654, + "logits/rejected": 2.9452152252197266, + "logps/chosen": -589.4984741210938, + "logps/rejected": -1159.3382568359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.697577476501465, + "rewards/margins": 32.21438980102539, + "rewards/rejected": -44.91196823120117, + "step": 3559 + }, + { + "epoch": 2.214618973561431, + "grad_norm": 9.289454396821384e-07, + "learning_rate": 1.4534347625633934e-06, + "logits/chosen": -0.6384005546569824, + "logits/rejected": 3.2510910034179688, + "logps/chosen": -389.4755554199219, + "logps/rejected": -893.7344970703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.662978172302246, + "rewards/margins": 32.42913818359375, + "rewards/rejected": -39.09211730957031, + "step": 3560 + }, + { + "epoch": 2.215241057542768, + "grad_norm": 0.006149108987301588, + "learning_rate": 1.4522821576763488e-06, + "logits/chosen": 2.7856993675231934, + "logits/rejected": 3.0821292400360107, + "logps/chosen": -736.5506591796875, + "logps/rejected": -1034.728515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.966891765594482, + "rewards/margins": 28.720930099487305, + "rewards/rejected": -35.68782043457031, + "step": 3561 + }, + { + "epoch": 2.215863141524106, + "grad_norm": 2.35528302192688, + "learning_rate": 1.451129552789304e-06, + "logits/chosen": 1.4770406484603882, + "logits/rejected": 4.091999053955078, + "logps/chosen": -556.71435546875, + "logps/rejected": -976.85791015625, + "loss": 0.0072, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.796460151672363, + "rewards/margins": 25.918916702270508, + "rewards/rejected": -39.71537780761719, + "step": 3562 + }, + { + "epoch": 2.2164852255054432, + "grad_norm": 0.032568614929914474, + "learning_rate": 1.4499769479022593e-06, + "logits/chosen": 1.27259361743927, + "logits/rejected": 2.114039659500122, + "logps/chosen": -574.3934936523438, + "logps/rejected": -933.9754638671875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.658870697021484, + "rewards/margins": 28.712207794189453, + "rewards/rejected": -40.37107849121094, + "step": 3563 + }, + { + "epoch": 2.2171073094867806, + "grad_norm": 6.021178705850616e-06, + "learning_rate": 1.4488243430152147e-06, + "logits/chosen": 0.02378600835800171, + "logits/rejected": 3.564028263092041, + "logps/chosen": -474.9874267578125, + "logps/rejected": -1052.9521484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.039875507354736, + "rewards/margins": 36.47960662841797, + "rewards/rejected": -43.51948547363281, + "step": 3564 + }, + { + "epoch": 2.2177293934681184, + "grad_norm": 0.5587344765663147, + "learning_rate": 1.44767173812817e-06, + "logits/chosen": -1.4509693384170532, + "logits/rejected": 2.435011148452759, + "logps/chosen": -430.926513671875, + "logps/rejected": -884.4074096679688, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.396395683288574, + "rewards/margins": 19.22774887084961, + "rewards/rejected": -26.624147415161133, + "step": 3565 + }, + { + "epoch": 2.2183514774494557, + "grad_norm": 0.11176523566246033, + "learning_rate": 1.4465191332411252e-06, + "logits/chosen": 1.9386296272277832, + "logits/rejected": 2.4904065132141113, + "logps/chosen": -648.920166015625, + "logps/rejected": -935.1029052734375, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.740001678466797, + "rewards/margins": 25.321617126464844, + "rewards/rejected": -36.06161880493164, + "step": 3566 + }, + { + "epoch": 2.218973561430793, + "grad_norm": 0.0056029753759503365, + "learning_rate": 1.4453665283540804e-06, + "logits/chosen": 1.5881600379943848, + "logits/rejected": 4.458906650543213, + "logps/chosen": -502.69049072265625, + "logps/rejected": -960.2064819335938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.035712242126465, + "rewards/margins": 29.94509506225586, + "rewards/rejected": -39.98080825805664, + "step": 3567 + }, + { + "epoch": 2.219595645412131, + "grad_norm": 0.002757622394710779, + "learning_rate": 1.4442139234670354e-06, + "logits/chosen": 2.278160572052002, + "logits/rejected": 4.256825923919678, + "logps/chosen": -753.8636474609375, + "logps/rejected": -1213.9505615234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.023143768310547, + "rewards/margins": 33.02347183227539, + "rewards/rejected": -48.04661560058594, + "step": 3568 + }, + { + "epoch": 2.220217729393468, + "grad_norm": 0.0002265808143420145, + "learning_rate": 1.4430613185799909e-06, + "logits/chosen": 0.007342390716075897, + "logits/rejected": 1.5686745643615723, + "logps/chosen": -709.2378540039062, + "logps/rejected": -1120.9376220703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.142677307128906, + "rewards/margins": 34.99839782714844, + "rewards/rejected": -47.141075134277344, + "step": 3569 + }, + { + "epoch": 2.2208398133748055, + "grad_norm": 0.001974264159798622, + "learning_rate": 1.441908713692946e-06, + "logits/chosen": -2.754730701446533, + "logits/rejected": -0.30940690636634827, + "logps/chosen": -438.1318054199219, + "logps/rejected": -972.1848754882812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.748628616333008, + "rewards/margins": 31.43011474609375, + "rewards/rejected": -41.178741455078125, + "step": 3570 + }, + { + "epoch": 2.221461897356143, + "grad_norm": 0.003989961929619312, + "learning_rate": 1.4407561088059013e-06, + "logits/chosen": -2.1097288131713867, + "logits/rejected": 3.656294584274292, + "logps/chosen": -410.3033752441406, + "logps/rejected": -1147.75341796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.758270263671875, + "rewards/margins": 38.09152603149414, + "rewards/rejected": -45.84979248046875, + "step": 3571 + }, + { + "epoch": 2.2220839813374806, + "grad_norm": 13.953394889831543, + "learning_rate": 1.4396035039188565e-06, + "logits/chosen": 0.9869332313537598, + "logits/rejected": 3.99074649810791, + "logps/chosen": -527.534912109375, + "logps/rejected": -1001.27001953125, + "loss": 0.042, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.06263542175293, + "rewards/margins": 32.09954071044922, + "rewards/rejected": -40.162174224853516, + "step": 3572 + }, + { + "epoch": 2.222706065318818, + "grad_norm": 0.041325412690639496, + "learning_rate": 1.438450899031812e-06, + "logits/chosen": -1.103442907333374, + "logits/rejected": 3.769481658935547, + "logps/chosen": -384.44708251953125, + "logps/rejected": -904.393798828125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.902695655822754, + "rewards/margins": 27.485366821289062, + "rewards/rejected": -37.388065338134766, + "step": 3573 + }, + { + "epoch": 2.2233281493001553, + "grad_norm": 8.32625971725065e-07, + "learning_rate": 1.4372982941447672e-06, + "logits/chosen": -1.5317946672439575, + "logits/rejected": 2.258638858795166, + "logps/chosen": -471.34619140625, + "logps/rejected": -1037.48876953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.764509201049805, + "rewards/margins": 33.39957809448242, + "rewards/rejected": -42.164085388183594, + "step": 3574 + }, + { + "epoch": 2.223950233281493, + "grad_norm": 0.0006140259793028235, + "learning_rate": 1.4361456892577224e-06, + "logits/chosen": 3.8182244300842285, + "logits/rejected": 3.684861183166504, + "logps/chosen": -797.9357299804688, + "logps/rejected": -1206.4022216796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.626452445983887, + "rewards/margins": 34.950870513916016, + "rewards/rejected": -49.57732391357422, + "step": 3575 + }, + { + "epoch": 2.2245723172628304, + "grad_norm": 0.0012920801527798176, + "learning_rate": 1.4349930843706778e-06, + "logits/chosen": 1.275935173034668, + "logits/rejected": 2.668433904647827, + "logps/chosen": -682.9571533203125, + "logps/rejected": -1124.5068359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.28410530090332, + "rewards/margins": 31.946544647216797, + "rewards/rejected": -44.23065185546875, + "step": 3576 + }, + { + "epoch": 2.225194401244168, + "grad_norm": 0.09917772561311722, + "learning_rate": 1.433840479483633e-06, + "logits/chosen": -0.4102671146392822, + "logits/rejected": 2.216341495513916, + "logps/chosen": -454.63818359375, + "logps/rejected": -1024.591796875, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.388709545135498, + "rewards/margins": 33.88431167602539, + "rewards/rejected": -41.27302169799805, + "step": 3577 + }, + { + "epoch": 2.2258164852255056, + "grad_norm": 1.067162065737648e-05, + "learning_rate": 1.4326878745965883e-06, + "logits/chosen": -1.766862392425537, + "logits/rejected": 2.5478920936584473, + "logps/chosen": -330.01385498046875, + "logps/rejected": -957.0518798828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.559136867523193, + "rewards/margins": 33.83004379272461, + "rewards/rejected": -40.38917922973633, + "step": 3578 + }, + { + "epoch": 2.226438569206843, + "grad_norm": 0.04910597577691078, + "learning_rate": 1.4315352697095435e-06, + "logits/chosen": -0.8890432119369507, + "logits/rejected": 2.860970973968506, + "logps/chosen": -427.6197814941406, + "logps/rejected": -959.2499389648438, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.983651161193848, + "rewards/margins": 27.483963012695312, + "rewards/rejected": -34.467613220214844, + "step": 3579 + }, + { + "epoch": 2.2270606531881803, + "grad_norm": 7.004135568422498e-06, + "learning_rate": 1.430382664822499e-06, + "logits/chosen": 0.7989118695259094, + "logits/rejected": 4.2649664878845215, + "logps/chosen": -388.40765380859375, + "logps/rejected": -975.993408203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.377376556396484, + "rewards/margins": 30.533058166503906, + "rewards/rejected": -38.91043472290039, + "step": 3580 + }, + { + "epoch": 2.227682737169518, + "grad_norm": 0.020427517592906952, + "learning_rate": 1.4292300599354542e-06, + "logits/chosen": 0.2810332775115967, + "logits/rejected": 3.973931312561035, + "logps/chosen": -458.1094970703125, + "logps/rejected": -945.5018920898438, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.795637130737305, + "rewards/margins": 29.356910705566406, + "rewards/rejected": -39.15254592895508, + "step": 3581 + }, + { + "epoch": 2.2283048211508554, + "grad_norm": 6.906030654907227, + "learning_rate": 1.4280774550484094e-06, + "logits/chosen": -1.385702133178711, + "logits/rejected": 1.523589849472046, + "logps/chosen": -461.1471862792969, + "logps/rejected": -835.5704345703125, + "loss": 0.054, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.676605224609375, + "rewards/margins": 27.159587860107422, + "rewards/rejected": -36.8361930847168, + "step": 3582 + }, + { + "epoch": 2.2289269051321927, + "grad_norm": 4.475971698760986, + "learning_rate": 1.4269248501613646e-06, + "logits/chosen": 2.221020460128784, + "logits/rejected": 2.2579550743103027, + "logps/chosen": -595.5123901367188, + "logps/rejected": -788.226318359375, + "loss": 0.0383, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.11880111694336, + "rewards/margins": 20.095670700073242, + "rewards/rejected": -28.21446990966797, + "step": 3583 + }, + { + "epoch": 2.2295489891135305, + "grad_norm": 8.492868630582961e-08, + "learning_rate": 1.42577224527432e-06, + "logits/chosen": 3.062131881713867, + "logits/rejected": 3.5542662143707275, + "logps/chosen": -652.6409301757812, + "logps/rejected": -1119.76123046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.757875442504883, + "rewards/margins": 34.784400939941406, + "rewards/rejected": -46.542274475097656, + "step": 3584 + }, + { + "epoch": 2.230171073094868, + "grad_norm": 0.07185564935207367, + "learning_rate": 1.4246196403872753e-06, + "logits/chosen": 0.11531239748001099, + "logits/rejected": 3.3846631050109863, + "logps/chosen": -489.6289978027344, + "logps/rejected": -867.05078125, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.214015007019043, + "rewards/margins": 20.78061294555664, + "rewards/rejected": -28.994626998901367, + "step": 3585 + }, + { + "epoch": 2.230793157076205, + "grad_norm": 24.446683883666992, + "learning_rate": 1.4234670355002305e-06, + "logits/chosen": 0.3510545492172241, + "logits/rejected": 3.017117500305176, + "logps/chosen": -532.3301391601562, + "logps/rejected": -973.9273071289062, + "loss": 0.212, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.277337074279785, + "rewards/margins": 32.72923278808594, + "rewards/rejected": -40.006568908691406, + "step": 3586 + }, + { + "epoch": 2.231415241057543, + "grad_norm": 0.00026219518622383475, + "learning_rate": 1.422314430613186e-06, + "logits/chosen": 0.8821203708648682, + "logits/rejected": 3.143800735473633, + "logps/chosen": -518.6629638671875, + "logps/rejected": -997.9705810546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.694472312927246, + "rewards/margins": 35.719844818115234, + "rewards/rejected": -46.41431427001953, + "step": 3587 + }, + { + "epoch": 2.2320373250388803, + "grad_norm": 0.060858700424432755, + "learning_rate": 1.4211618257261412e-06, + "logits/chosen": 1.9092174768447876, + "logits/rejected": 4.185993194580078, + "logps/chosen": -546.7835693359375, + "logps/rejected": -965.928955078125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.56574535369873, + "rewards/margins": 29.70265007019043, + "rewards/rejected": -39.268394470214844, + "step": 3588 + }, + { + "epoch": 2.2326594090202176, + "grad_norm": 36.87541580200195, + "learning_rate": 1.4200092208390964e-06, + "logits/chosen": -1.4777759313583374, + "logits/rejected": 3.420780658721924, + "logps/chosen": -500.4163818359375, + "logps/rejected": -1042.6434326171875, + "loss": 0.5542, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.277176856994629, + "rewards/margins": 30.546443939208984, + "rewards/rejected": -36.82362365722656, + "step": 3589 + }, + { + "epoch": 2.233281493001555, + "grad_norm": 0.6003460884094238, + "learning_rate": 1.4188566159520516e-06, + "logits/chosen": -0.2000950276851654, + "logits/rejected": 3.9854304790496826, + "logps/chosen": -382.73809814453125, + "logps/rejected": -948.181640625, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.3645100593566895, + "rewards/margins": 31.50813865661621, + "rewards/rejected": -35.872650146484375, + "step": 3590 + }, + { + "epoch": 2.2339035769828928, + "grad_norm": 0.3768901526927948, + "learning_rate": 1.417704011065007e-06, + "logits/chosen": -3.7208173274993896, + "logits/rejected": -0.07973974943161011, + "logps/chosen": -310.0085754394531, + "logps/rejected": -711.6843872070312, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.959542274475098, + "rewards/margins": 24.725475311279297, + "rewards/rejected": -29.685020446777344, + "step": 3591 + }, + { + "epoch": 2.23452566096423, + "grad_norm": 3.6291356086730957, + "learning_rate": 1.4165514061779623e-06, + "logits/chosen": -1.0591273307800293, + "logits/rejected": 1.9402902126312256, + "logps/chosen": -393.0709533691406, + "logps/rejected": -802.1497802734375, + "loss": 0.017, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.485906600952148, + "rewards/margins": 24.194093704223633, + "rewards/rejected": -32.68000030517578, + "step": 3592 + }, + { + "epoch": 2.2351477449455674, + "grad_norm": 0.09102648496627808, + "learning_rate": 1.4153988012909175e-06, + "logits/chosen": 0.13417690992355347, + "logits/rejected": 3.4707531929016113, + "logps/chosen": -512.4104614257812, + "logps/rejected": -935.8575439453125, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.069864273071289, + "rewards/margins": 24.324996948242188, + "rewards/rejected": -37.394859313964844, + "step": 3593 + }, + { + "epoch": 2.2357698289269052, + "grad_norm": 9.27643632167019e-05, + "learning_rate": 1.4142461964038727e-06, + "logits/chosen": 0.36809054017066956, + "logits/rejected": 4.21954345703125, + "logps/chosen": -596.3086547851562, + "logps/rejected": -1222.6021728515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.957084655761719, + "rewards/margins": 40.438499450683594, + "rewards/rejected": -50.39558029174805, + "step": 3594 + }, + { + "epoch": 2.2363919129082426, + "grad_norm": 0.04157000035047531, + "learning_rate": 1.4130935915168282e-06, + "logits/chosen": 1.3686137199401855, + "logits/rejected": 3.7084567546844482, + "logps/chosen": -624.705322265625, + "logps/rejected": -1096.7337646484375, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.065147399902344, + "rewards/margins": 31.175518035888672, + "rewards/rejected": -39.24066162109375, + "step": 3595 + }, + { + "epoch": 2.23701399688958, + "grad_norm": 0.00014478390221484005, + "learning_rate": 1.4119409866297834e-06, + "logits/chosen": 2.2126994132995605, + "logits/rejected": 3.524190902709961, + "logps/chosen": -674.3311157226562, + "logps/rejected": -1038.1907958984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.26594352722168, + "rewards/margins": 28.325271606445312, + "rewards/rejected": -39.591217041015625, + "step": 3596 + }, + { + "epoch": 2.2376360808709177, + "grad_norm": 0.001759856822900474, + "learning_rate": 1.4107883817427386e-06, + "logits/chosen": 0.9845359921455383, + "logits/rejected": 2.5849828720092773, + "logps/chosen": -755.294189453125, + "logps/rejected": -1175.09912109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.71370792388916, + "rewards/margins": 28.954248428344727, + "rewards/rejected": -44.6679573059082, + "step": 3597 + }, + { + "epoch": 2.238258164852255, + "grad_norm": 1.3494692439053324e-06, + "learning_rate": 1.409635776855694e-06, + "logits/chosen": 2.1158127784729004, + "logits/rejected": 3.403364419937134, + "logps/chosen": -700.1041259765625, + "logps/rejected": -1197.26611328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.83034610748291, + "rewards/margins": 38.487022399902344, + "rewards/rejected": -49.31736755371094, + "step": 3598 + }, + { + "epoch": 2.2388802488335924, + "grad_norm": 2.0819742679595947, + "learning_rate": 1.4084831719686493e-06, + "logits/chosen": -2.9958994388580322, + "logits/rejected": 2.014313220977783, + "logps/chosen": -350.19500732421875, + "logps/rejected": -974.436279296875, + "loss": 0.0102, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.576704025268555, + "rewards/margins": 32.14638900756836, + "rewards/rejected": -38.72309494018555, + "step": 3599 + }, + { + "epoch": 2.23950233281493, + "grad_norm": 0.08503194898366928, + "learning_rate": 1.4073305670816045e-06, + "logits/chosen": 3.557544708251953, + "logits/rejected": 4.321353435516357, + "logps/chosen": -751.0997314453125, + "logps/rejected": -1050.1031494140625, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.91750431060791, + "rewards/margins": 23.742610931396484, + "rewards/rejected": -36.66011428833008, + "step": 3600 + }, + { + "epoch": 2.2401244167962675, + "grad_norm": 0.005394472740590572, + "learning_rate": 1.4061779621945597e-06, + "logits/chosen": 1.0073996782302856, + "logits/rejected": 2.4288978576660156, + "logps/chosen": -554.6205444335938, + "logps/rejected": -939.342041015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.75421142578125, + "rewards/margins": 23.233583450317383, + "rewards/rejected": -34.98779296875, + "step": 3601 + }, + { + "epoch": 2.240746500777605, + "grad_norm": 2.5468521736016214e-10, + "learning_rate": 1.4050253573075152e-06, + "logits/chosen": -0.81147301197052, + "logits/rejected": 1.3437249660491943, + "logps/chosen": -541.517822265625, + "logps/rejected": -1117.1673583984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.313809394836426, + "rewards/margins": 40.41972732543945, + "rewards/rejected": -49.73353576660156, + "step": 3602 + }, + { + "epoch": 2.2413685847589426, + "grad_norm": 40.08323669433594, + "learning_rate": 1.4038727524204704e-06, + "logits/chosen": -2.013843297958374, + "logits/rejected": 2.682800054550171, + "logps/chosen": -492.3447570800781, + "logps/rejected": -1076.1142578125, + "loss": 0.6501, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.914434909820557, + "rewards/margins": 30.04220962524414, + "rewards/rejected": -36.95664978027344, + "step": 3603 + }, + { + "epoch": 2.24199066874028, + "grad_norm": 1.0822905904817048e-09, + "learning_rate": 1.4027201475334256e-06, + "logits/chosen": -3.263639450073242, + "logits/rejected": 1.29646897315979, + "logps/chosen": -425.00909423828125, + "logps/rejected": -1020.944091796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.435742378234863, + "rewards/margins": 41.301151275634766, + "rewards/rejected": -48.73689270019531, + "step": 3604 + }, + { + "epoch": 2.2426127527216173, + "grad_norm": 6.468580722808838, + "learning_rate": 1.4015675426463808e-06, + "logits/chosen": 1.4337022304534912, + "logits/rejected": 2.421602249145508, + "logps/chosen": -648.6536254882812, + "logps/rejected": -909.980224609375, + "loss": 0.0189, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.684656143188477, + "rewards/margins": 25.83026885986328, + "rewards/rejected": -37.514923095703125, + "step": 3605 + }, + { + "epoch": 2.243234836702955, + "grad_norm": 0.004268967546522617, + "learning_rate": 1.4004149377593363e-06, + "logits/chosen": -1.3249869346618652, + "logits/rejected": 2.7571754455566406, + "logps/chosen": -425.77606201171875, + "logps/rejected": -983.8963012695312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.982041358947754, + "rewards/margins": 32.781829833984375, + "rewards/rejected": -40.76387023925781, + "step": 3606 + }, + { + "epoch": 2.2438569206842924, + "grad_norm": 0.02053695172071457, + "learning_rate": 1.3992623328722915e-06, + "logits/chosen": 3.154069423675537, + "logits/rejected": 2.9946210384368896, + "logps/chosen": -675.71484375, + "logps/rejected": -986.4381713867188, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.358978271484375, + "rewards/margins": 27.62502670288086, + "rewards/rejected": -42.9840087890625, + "step": 3607 + }, + { + "epoch": 2.2444790046656298, + "grad_norm": 0.036014728248119354, + "learning_rate": 1.3981097279852467e-06, + "logits/chosen": 0.1260930299758911, + "logits/rejected": 3.830151081085205, + "logps/chosen": -530.7362670898438, + "logps/rejected": -1056.0645751953125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.860038757324219, + "rewards/margins": 32.02716064453125, + "rewards/rejected": -41.88720703125, + "step": 3608 + }, + { + "epoch": 2.245101088646967, + "grad_norm": 0.06608090549707413, + "learning_rate": 1.3969571230982022e-06, + "logits/chosen": 1.1909940242767334, + "logits/rejected": 3.821021556854248, + "logps/chosen": -635.0616455078125, + "logps/rejected": -1033.9156494140625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.48543930053711, + "rewards/margins": 23.562484741210938, + "rewards/rejected": -40.04792785644531, + "step": 3609 + }, + { + "epoch": 2.245723172628305, + "grad_norm": 0.004007376730442047, + "learning_rate": 1.3958045182111574e-06, + "logits/chosen": -1.2943001985549927, + "logits/rejected": 2.278123378753662, + "logps/chosen": -464.9049072265625, + "logps/rejected": -1044.6634521484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.284080505371094, + "rewards/margins": 35.11205291748047, + "rewards/rejected": -43.39613342285156, + "step": 3610 + }, + { + "epoch": 2.2463452566096422, + "grad_norm": 4.8539391173108015e-06, + "learning_rate": 1.3946519133241126e-06, + "logits/chosen": 1.5299954414367676, + "logits/rejected": 3.668931722640991, + "logps/chosen": -711.9598999023438, + "logps/rejected": -1167.881103515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.675633430480957, + "rewards/margins": 41.793670654296875, + "rewards/rejected": -52.46930694580078, + "step": 3611 + }, + { + "epoch": 2.2469673405909796, + "grad_norm": 0.10625211149454117, + "learning_rate": 1.3934993084370678e-06, + "logits/chosen": 0.4892570972442627, + "logits/rejected": 3.181586503982544, + "logps/chosen": -609.27490234375, + "logps/rejected": -1167.6185302734375, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.308204650878906, + "rewards/margins": 34.49951171875, + "rewards/rejected": -44.80772018432617, + "step": 3612 + }, + { + "epoch": 2.2475894245723174, + "grad_norm": 0.0002764218661468476, + "learning_rate": 1.3923467035500233e-06, + "logits/chosen": -0.6855688095092773, + "logits/rejected": 4.369061470031738, + "logps/chosen": -483.9892883300781, + "logps/rejected": -1195.517333984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.535987854003906, + "rewards/margins": 36.80906295776367, + "rewards/rejected": -47.345054626464844, + "step": 3613 + }, + { + "epoch": 2.2482115085536547, + "grad_norm": 0.04360436648130417, + "learning_rate": 1.3911940986629785e-06, + "logits/chosen": -1.0055224895477295, + "logits/rejected": 1.2355737686157227, + "logps/chosen": -570.8172607421875, + "logps/rejected": -991.0896606445312, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.466044425964355, + "rewards/margins": 27.260345458984375, + "rewards/rejected": -41.72639083862305, + "step": 3614 + }, + { + "epoch": 2.248833592534992, + "grad_norm": 0.01460373867303133, + "learning_rate": 1.3900414937759337e-06, + "logits/chosen": 2.5443148612976074, + "logits/rejected": 3.125467300415039, + "logps/chosen": -656.630859375, + "logps/rejected": -941.162841796875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.8216552734375, + "rewards/margins": 26.766983032226562, + "rewards/rejected": -35.58863830566406, + "step": 3615 + }, + { + "epoch": 2.24945567651633, + "grad_norm": 0.040251053869724274, + "learning_rate": 1.3888888888888892e-06, + "logits/chosen": 1.715376853942871, + "logits/rejected": 2.761099338531494, + "logps/chosen": -572.9150390625, + "logps/rejected": -828.9449462890625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.75190544128418, + "rewards/margins": 23.62649917602539, + "rewards/rejected": -34.37840270996094, + "step": 3616 + }, + { + "epoch": 2.250077760497667, + "grad_norm": 1.6713193329298548e-10, + "learning_rate": 1.3877362840018444e-06, + "logits/chosen": 0.12911105155944824, + "logits/rejected": 3.3643741607666016, + "logps/chosen": -502.5730285644531, + "logps/rejected": -1142.19482421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.952110290527344, + "rewards/margins": 42.303550720214844, + "rewards/rejected": -52.25566482543945, + "step": 3617 + }, + { + "epoch": 2.2506998444790045, + "grad_norm": 13.921367645263672, + "learning_rate": 1.3865836791147996e-06, + "logits/chosen": -0.6900714635848999, + "logits/rejected": 3.1058475971221924, + "logps/chosen": -525.9616088867188, + "logps/rejected": -1013.4569091796875, + "loss": 0.0692, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.485834121704102, + "rewards/margins": 29.015771865844727, + "rewards/rejected": -38.50160598754883, + "step": 3618 + }, + { + "epoch": 2.2513219284603423, + "grad_norm": 0.06612343341112137, + "learning_rate": 1.3854310742277548e-06, + "logits/chosen": 0.1767033338546753, + "logits/rejected": 3.644655466079712, + "logps/chosen": -579.5123291015625, + "logps/rejected": -1140.45263671875, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.06546401977539, + "rewards/margins": 27.96875, + "rewards/rejected": -39.034217834472656, + "step": 3619 + }, + { + "epoch": 2.2519440124416796, + "grad_norm": 0.0021795639768242836, + "learning_rate": 1.3842784693407103e-06, + "logits/chosen": -0.03955802321434021, + "logits/rejected": 3.334292411804199, + "logps/chosen": -396.38818359375, + "logps/rejected": -994.3783569335938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.11500072479248, + "rewards/margins": 34.99541473388672, + "rewards/rejected": -45.11041259765625, + "step": 3620 + }, + { + "epoch": 2.252566096423017, + "grad_norm": 5.8073277614312246e-05, + "learning_rate": 1.3831258644536655e-06, + "logits/chosen": 1.6798940896987915, + "logits/rejected": 3.420667886734009, + "logps/chosen": -564.4171752929688, + "logps/rejected": -992.0733642578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.620368003845215, + "rewards/margins": 30.430877685546875, + "rewards/rejected": -43.051246643066406, + "step": 3621 + }, + { + "epoch": 2.2531881804043548, + "grad_norm": 0.0003387883771210909, + "learning_rate": 1.3819732595666207e-06, + "logits/chosen": -2.5185070037841797, + "logits/rejected": 2.5935957431793213, + "logps/chosen": -357.0126647949219, + "logps/rejected": -965.9202880859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.179485321044922, + "rewards/margins": 34.00236511230469, + "rewards/rejected": -41.181846618652344, + "step": 3622 + }, + { + "epoch": 2.253810264385692, + "grad_norm": 0.001127618015743792, + "learning_rate": 1.380820654679576e-06, + "logits/chosen": 1.6329503059387207, + "logits/rejected": 4.3267927169799805, + "logps/chosen": -644.5361328125, + "logps/rejected": -1160.6734619140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.97283935546875, + "rewards/margins": 32.02046203613281, + "rewards/rejected": -41.99330520629883, + "step": 3623 + }, + { + "epoch": 2.2544323483670294, + "grad_norm": 0.0018201852217316628, + "learning_rate": 1.3796680497925314e-06, + "logits/chosen": -1.341170072555542, + "logits/rejected": 3.551229476928711, + "logps/chosen": -598.9361572265625, + "logps/rejected": -1234.8564453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.985949516296387, + "rewards/margins": 32.776268005371094, + "rewards/rejected": -43.76221466064453, + "step": 3624 + }, + { + "epoch": 2.255054432348367, + "grad_norm": 0.0016448667738586664, + "learning_rate": 1.3785154449054866e-06, + "logits/chosen": 0.6106576919555664, + "logits/rejected": 1.8452892303466797, + "logps/chosen": -522.1275024414062, + "logps/rejected": -885.4285888671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.987752437591553, + "rewards/margins": 24.238380432128906, + "rewards/rejected": -31.226133346557617, + "step": 3625 + }, + { + "epoch": 2.2556765163297046, + "grad_norm": 0.018660522997379303, + "learning_rate": 1.3773628400184418e-06, + "logits/chosen": -1.0240919589996338, + "logits/rejected": 1.6847429275512695, + "logps/chosen": -570.3428955078125, + "logps/rejected": -1055.249755859375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.822690963745117, + "rewards/margins": 32.19108581542969, + "rewards/rejected": -45.01377487182617, + "step": 3626 + }, + { + "epoch": 2.256298600311042, + "grad_norm": 0.037954483181238174, + "learning_rate": 1.3762102351313973e-06, + "logits/chosen": 1.0277810096740723, + "logits/rejected": 1.5962250232696533, + "logps/chosen": -531.65966796875, + "logps/rejected": -910.9324951171875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.959372520446777, + "rewards/margins": 29.962890625, + "rewards/rejected": -35.922264099121094, + "step": 3627 + }, + { + "epoch": 2.2569206842923792, + "grad_norm": 0.0028481753543019295, + "learning_rate": 1.3750576302443525e-06, + "logits/chosen": 1.854551076889038, + "logits/rejected": 2.4571213722229004, + "logps/chosen": -669.2379150390625, + "logps/rejected": -1053.6103515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.859801292419434, + "rewards/margins": 27.96354866027832, + "rewards/rejected": -38.82334899902344, + "step": 3628 + }, + { + "epoch": 2.257542768273717, + "grad_norm": 0.03748118877410889, + "learning_rate": 1.3739050253573077e-06, + "logits/chosen": -2.21651029586792, + "logits/rejected": 1.614790678024292, + "logps/chosen": -392.2901611328125, + "logps/rejected": -836.7474365234375, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.908618927001953, + "rewards/margins": 18.771080017089844, + "rewards/rejected": -26.679697036743164, + "step": 3629 + }, + { + "epoch": 2.2581648522550544, + "grad_norm": 0.24403417110443115, + "learning_rate": 1.372752420470263e-06, + "logits/chosen": -0.4138451814651489, + "logits/rejected": 2.3937408924102783, + "logps/chosen": -594.5587768554688, + "logps/rejected": -991.3351440429688, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.23100471496582, + "rewards/margins": 25.721485137939453, + "rewards/rejected": -39.95248794555664, + "step": 3630 + }, + { + "epoch": 2.258786936236392, + "grad_norm": 1.4337886568682734e-05, + "learning_rate": 1.3715998155832184e-06, + "logits/chosen": 0.9790940284729004, + "logits/rejected": 3.165156126022339, + "logps/chosen": -626.1618041992188, + "logps/rejected": -1086.665283203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.662452697753906, + "rewards/margins": 33.48451232910156, + "rewards/rejected": -48.146968841552734, + "step": 3631 + }, + { + "epoch": 2.2594090202177295, + "grad_norm": 0.0001606412697583437, + "learning_rate": 1.3704472106961736e-06, + "logits/chosen": 0.9021769762039185, + "logits/rejected": 2.9399120807647705, + "logps/chosen": -685.17138671875, + "logps/rejected": -1164.349853515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.966161727905273, + "rewards/margins": 27.477081298828125, + "rewards/rejected": -38.44324493408203, + "step": 3632 + }, + { + "epoch": 2.260031104199067, + "grad_norm": 0.002752943430095911, + "learning_rate": 1.3692946058091288e-06, + "logits/chosen": 0.6641960144042969, + "logits/rejected": 2.6874494552612305, + "logps/chosen": -600.1417236328125, + "logps/rejected": -970.0240478515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.83272933959961, + "rewards/margins": 30.065876007080078, + "rewards/rejected": -40.89860534667969, + "step": 3633 + }, + { + "epoch": 2.260653188180404, + "grad_norm": 0.00016964755195658654, + "learning_rate": 1.368142000922084e-06, + "logits/chosen": -1.6499991416931152, + "logits/rejected": 2.953476667404175, + "logps/chosen": -455.2291259765625, + "logps/rejected": -1102.061279296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.72672176361084, + "rewards/margins": 37.105812072753906, + "rewards/rejected": -45.83253479003906, + "step": 3634 + }, + { + "epoch": 2.261275272161742, + "grad_norm": 6.942920549590781e-07, + "learning_rate": 1.366989396035039e-06, + "logits/chosen": -0.9813140034675598, + "logits/rejected": 2.0054101943969727, + "logps/chosen": -452.45037841796875, + "logps/rejected": -1088.82275390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.222711563110352, + "rewards/margins": 38.482872009277344, + "rewards/rejected": -46.70558547973633, + "step": 3635 + }, + { + "epoch": 2.2618973561430793, + "grad_norm": 46.79713439941406, + "learning_rate": 1.3658367911479945e-06, + "logits/chosen": 1.9303909540176392, + "logits/rejected": 3.462801456451416, + "logps/chosen": -649.6270751953125, + "logps/rejected": -1096.564453125, + "loss": 0.6223, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.076848983764648, + "rewards/margins": 28.809837341308594, + "rewards/rejected": -39.88668441772461, + "step": 3636 + }, + { + "epoch": 2.2625194401244166, + "grad_norm": 0.006422718986868858, + "learning_rate": 1.3646841862609497e-06, + "logits/chosen": 0.4042189121246338, + "logits/rejected": 3.6927623748779297, + "logps/chosen": -472.4306335449219, + "logps/rejected": -1149.659423828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.388535499572754, + "rewards/margins": 36.08953094482422, + "rewards/rejected": -44.47806167602539, + "step": 3637 + }, + { + "epoch": 2.2631415241057544, + "grad_norm": 0.04757959023118019, + "learning_rate": 1.363531581373905e-06, + "logits/chosen": 1.9163646697998047, + "logits/rejected": 2.9809160232543945, + "logps/chosen": -683.015625, + "logps/rejected": -975.9616088867188, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.581829071044922, + "rewards/margins": 26.310970306396484, + "rewards/rejected": -35.892799377441406, + "step": 3638 + }, + { + "epoch": 2.2637636080870918, + "grad_norm": 2.0798819605261087e-05, + "learning_rate": 1.3623789764868604e-06, + "logits/chosen": -2.256040096282959, + "logits/rejected": -0.2808629274368286, + "logps/chosen": -461.394775390625, + "logps/rejected": -837.761962890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.2075300216674805, + "rewards/margins": 27.08609962463379, + "rewards/rejected": -34.29363250732422, + "step": 3639 + }, + { + "epoch": 2.264385692068429, + "grad_norm": 0.0005983648006804287, + "learning_rate": 1.3612263715998156e-06, + "logits/chosen": 1.4837771654129028, + "logits/rejected": 2.5212080478668213, + "logps/chosen": -584.5574951171875, + "logps/rejected": -896.6384887695312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.758228302001953, + "rewards/margins": 25.834224700927734, + "rewards/rejected": -35.59245300292969, + "step": 3640 + }, + { + "epoch": 2.265007776049767, + "grad_norm": 0.01774199865758419, + "learning_rate": 1.3600737667127708e-06, + "logits/chosen": 1.4264962673187256, + "logits/rejected": 1.4264075756072998, + "logps/chosen": -644.2673950195312, + "logps/rejected": -845.5146484375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.36091423034668, + "rewards/margins": 20.632631301879883, + "rewards/rejected": -30.99354362487793, + "step": 3641 + }, + { + "epoch": 2.2656298600311042, + "grad_norm": 0.09057408571243286, + "learning_rate": 1.358921161825726e-06, + "logits/chosen": 0.2792913317680359, + "logits/rejected": 2.1801884174346924, + "logps/chosen": -623.1153564453125, + "logps/rejected": -980.2783203125, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.632190704345703, + "rewards/margins": 21.55695343017578, + "rewards/rejected": -32.189144134521484, + "step": 3642 + }, + { + "epoch": 2.2662519440124416, + "grad_norm": 0.025631356984376907, + "learning_rate": 1.3577685569386815e-06, + "logits/chosen": 0.3164311647415161, + "logits/rejected": 3.2249999046325684, + "logps/chosen": -546.7294921875, + "logps/rejected": -909.2017211914062, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.281386852264404, + "rewards/margins": 24.865476608276367, + "rewards/rejected": -32.1468620300293, + "step": 3643 + }, + { + "epoch": 2.2668740279937794, + "grad_norm": 23.9825439453125, + "learning_rate": 1.3566159520516367e-06, + "logits/chosen": 1.6409586668014526, + "logits/rejected": 2.854196786880493, + "logps/chosen": -668.395263671875, + "logps/rejected": -926.1879272460938, + "loss": 0.1152, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.461042404174805, + "rewards/margins": 21.571842193603516, + "rewards/rejected": -34.03288269042969, + "step": 3644 + }, + { + "epoch": 2.2674961119751167, + "grad_norm": 0.14037181437015533, + "learning_rate": 1.355463347164592e-06, + "logits/chosen": -1.410860300064087, + "logits/rejected": 3.208120822906494, + "logps/chosen": -488.7926330566406, + "logps/rejected": -946.1668701171875, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.458749771118164, + "rewards/margins": 26.631572723388672, + "rewards/rejected": -38.09032440185547, + "step": 3645 + }, + { + "epoch": 2.268118195956454, + "grad_norm": 0.25334399938583374, + "learning_rate": 1.3543107422775472e-06, + "logits/chosen": -1.4088850021362305, + "logits/rejected": 2.939032554626465, + "logps/chosen": -360.3502502441406, + "logps/rejected": -973.505859375, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.68812084197998, + "rewards/margins": 29.315006256103516, + "rewards/rejected": -39.00312423706055, + "step": 3646 + }, + { + "epoch": 2.2687402799377914, + "grad_norm": 0.15394827723503113, + "learning_rate": 1.3531581373905026e-06, + "logits/chosen": 0.8405669331550598, + "logits/rejected": 3.7781574726104736, + "logps/chosen": -464.24200439453125, + "logps/rejected": -895.093505859375, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.683003425598145, + "rewards/margins": 27.76809310913086, + "rewards/rejected": -38.45109939575195, + "step": 3647 + }, + { + "epoch": 2.269362363919129, + "grad_norm": 0.17489789426326752, + "learning_rate": 1.3520055325034578e-06, + "logits/chosen": -1.095194697380066, + "logits/rejected": 1.75686776638031, + "logps/chosen": -472.18212890625, + "logps/rejected": -951.9043579101562, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.827794075012207, + "rewards/margins": 31.027143478393555, + "rewards/rejected": -38.85493850708008, + "step": 3648 + }, + { + "epoch": 2.2699844479004665, + "grad_norm": 0.0005933581851422787, + "learning_rate": 1.350852927616413e-06, + "logits/chosen": -2.832789659500122, + "logits/rejected": 2.9136810302734375, + "logps/chosen": -395.674560546875, + "logps/rejected": -1046.361572265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.632064819335938, + "rewards/margins": 30.736907958984375, + "rewards/rejected": -40.36897659301758, + "step": 3649 + }, + { + "epoch": 2.2706065318818043, + "grad_norm": 3.9638190269470215, + "learning_rate": 1.3497003227293685e-06, + "logits/chosen": 0.07114283740520477, + "logits/rejected": 3.7599639892578125, + "logps/chosen": -526.3443603515625, + "logps/rejected": -1021.1272583007812, + "loss": 0.0161, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.873167991638184, + "rewards/margins": 27.13154411315918, + "rewards/rejected": -38.00471496582031, + "step": 3650 + }, + { + "epoch": 2.2712286158631416, + "grad_norm": 2.231683083664393e-06, + "learning_rate": 1.3485477178423237e-06, + "logits/chosen": -0.4136146306991577, + "logits/rejected": 3.6203694343566895, + "logps/chosen": -429.990234375, + "logps/rejected": -1029.20166015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.910269737243652, + "rewards/margins": 41.052101135253906, + "rewards/rejected": -48.96236801147461, + "step": 3651 + }, + { + "epoch": 2.271850699844479, + "grad_norm": 2.4136397769325413e-05, + "learning_rate": 1.347395112955279e-06, + "logits/chosen": -1.157287836074829, + "logits/rejected": 1.3219666481018066, + "logps/chosen": -509.8829040527344, + "logps/rejected": -1024.821044921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.58205795288086, + "rewards/margins": 38.33229064941406, + "rewards/rejected": -47.91435241699219, + "step": 3652 + }, + { + "epoch": 2.2724727838258163, + "grad_norm": 2.5297253181122414e-10, + "learning_rate": 1.3462425080682342e-06, + "logits/chosen": 1.1511964797973633, + "logits/rejected": 3.777742385864258, + "logps/chosen": -628.3382568359375, + "logps/rejected": -1225.38427734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.839183807373047, + "rewards/margins": 38.61802673339844, + "rewards/rejected": -48.45721435546875, + "step": 3653 + }, + { + "epoch": 2.273094867807154, + "grad_norm": 36.658748626708984, + "learning_rate": 1.3450899031811896e-06, + "logits/chosen": 1.0936846733093262, + "logits/rejected": 2.0753893852233887, + "logps/chosen": -478.0395812988281, + "logps/rejected": -874.01025390625, + "loss": 0.7303, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.23668098449707, + "rewards/margins": 24.970584869384766, + "rewards/rejected": -35.20726776123047, + "step": 3654 + }, + { + "epoch": 2.2737169517884914, + "grad_norm": 0.8795388340950012, + "learning_rate": 1.3439372982941448e-06, + "logits/chosen": -0.4980417788028717, + "logits/rejected": 2.420764684677124, + "logps/chosen": -432.43310546875, + "logps/rejected": -798.7457275390625, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.809070587158203, + "rewards/margins": 18.36756706237793, + "rewards/rejected": -30.176637649536133, + "step": 3655 + }, + { + "epoch": 2.2743390357698288, + "grad_norm": 2.9704248905181885, + "learning_rate": 1.3427846934071e-06, + "logits/chosen": 1.417180061340332, + "logits/rejected": 3.072629690170288, + "logps/chosen": -572.999755859375, + "logps/rejected": -979.4898681640625, + "loss": 0.0108, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.005555629730225, + "rewards/margins": 30.64610481262207, + "rewards/rejected": -37.65166091918945, + "step": 3656 + }, + { + "epoch": 2.2749611197511665, + "grad_norm": 0.036785803735256195, + "learning_rate": 1.3416320885200553e-06, + "logits/chosen": -2.2123665809631348, + "logits/rejected": 1.8106573820114136, + "logps/chosen": -414.67205810546875, + "logps/rejected": -904.7940673828125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.934468746185303, + "rewards/margins": 26.992454528808594, + "rewards/rejected": -31.926923751831055, + "step": 3657 + }, + { + "epoch": 2.275583203732504, + "grad_norm": 6.124455451965332, + "learning_rate": 1.3404794836330107e-06, + "logits/chosen": 2.324453830718994, + "logits/rejected": 3.3866355419158936, + "logps/chosen": -636.7662963867188, + "logps/rejected": -936.2508544921875, + "loss": 0.0272, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.091182708740234, + "rewards/margins": 23.152149200439453, + "rewards/rejected": -31.243331909179688, + "step": 3658 + }, + { + "epoch": 2.2762052877138412, + "grad_norm": 51.40892028808594, + "learning_rate": 1.339326878745966e-06, + "logits/chosen": -0.03686082363128662, + "logits/rejected": 4.1776814460754395, + "logps/chosen": -506.9464416503906, + "logps/rejected": -916.13818359375, + "loss": 2.0112, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.185665130615234, + "rewards/margins": 27.882423400878906, + "rewards/rejected": -39.06808853149414, + "step": 3659 + }, + { + "epoch": 2.276827371695179, + "grad_norm": 0.000569489726331085, + "learning_rate": 1.3381742738589212e-06, + "logits/chosen": 0.07180686295032501, + "logits/rejected": 4.02716588973999, + "logps/chosen": -540.2124633789062, + "logps/rejected": -1095.65576171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.912123680114746, + "rewards/margins": 28.8727970123291, + "rewards/rejected": -42.7849235534668, + "step": 3660 + }, + { + "epoch": 2.2774494556765164, + "grad_norm": 13.124598503112793, + "learning_rate": 1.3370216689718766e-06, + "logits/chosen": 0.4328814744949341, + "logits/rejected": 3.6302595138549805, + "logps/chosen": -453.68194580078125, + "logps/rejected": -852.2915649414062, + "loss": 0.054, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.634185791015625, + "rewards/margins": 22.16658592224121, + "rewards/rejected": -28.800771713256836, + "step": 3661 + }, + { + "epoch": 2.2780715396578537, + "grad_norm": 0.0037217868957668543, + "learning_rate": 1.3358690640848318e-06, + "logits/chosen": 0.699317216873169, + "logits/rejected": 4.026709079742432, + "logps/chosen": -473.6370544433594, + "logps/rejected": -785.733642578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.310136318206787, + "rewards/margins": 21.60482406616211, + "rewards/rejected": -28.914958953857422, + "step": 3662 + }, + { + "epoch": 2.2786936236391915, + "grad_norm": 0.5339617729187012, + "learning_rate": 1.334716459197787e-06, + "logits/chosen": 1.6939356327056885, + "logits/rejected": 3.958897590637207, + "logps/chosen": -640.4788818359375, + "logps/rejected": -973.7042846679688, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.455921173095703, + "rewards/margins": 20.49496078491211, + "rewards/rejected": -28.950881958007812, + "step": 3663 + }, + { + "epoch": 2.279315707620529, + "grad_norm": 20.766817092895508, + "learning_rate": 1.3335638543107423e-06, + "logits/chosen": 0.04786163568496704, + "logits/rejected": 4.414052486419678, + "logps/chosen": -427.1124267578125, + "logps/rejected": -1074.2518310546875, + "loss": 0.0934, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.711767196655273, + "rewards/margins": 34.70350646972656, + "rewards/rejected": -44.4152717590332, + "step": 3664 + }, + { + "epoch": 2.279937791601866, + "grad_norm": 0.001962431240826845, + "learning_rate": 1.3324112494236977e-06, + "logits/chosen": 1.640702247619629, + "logits/rejected": 2.41336989402771, + "logps/chosen": -536.8012084960938, + "logps/rejected": -902.0858154296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.044398784637451, + "rewards/margins": 28.664670944213867, + "rewards/rejected": -33.70907211303711, + "step": 3665 + }, + { + "epoch": 2.2805598755832035, + "grad_norm": 0.03671610355377197, + "learning_rate": 1.331258644536653e-06, + "logits/chosen": -1.8564445972442627, + "logits/rejected": 0.5851628184318542, + "logps/chosen": -477.62872314453125, + "logps/rejected": -787.21337890625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.753632545471191, + "rewards/margins": 22.151416778564453, + "rewards/rejected": -31.905048370361328, + "step": 3666 + }, + { + "epoch": 2.2811819595645413, + "grad_norm": 0.02714928612112999, + "learning_rate": 1.3301060396496082e-06, + "logits/chosen": -1.0161186456680298, + "logits/rejected": 1.8596889972686768, + "logps/chosen": -418.14984130859375, + "logps/rejected": -873.814697265625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.727672576904297, + "rewards/margins": 25.745969772338867, + "rewards/rejected": -32.47364044189453, + "step": 3667 + }, + { + "epoch": 2.2818040435458786, + "grad_norm": 0.0004676782409660518, + "learning_rate": 1.3289534347625634e-06, + "logits/chosen": -0.7326065301895142, + "logits/rejected": 2.5730059146881104, + "logps/chosen": -493.71246337890625, + "logps/rejected": -933.0047607421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.3746919631958, + "rewards/margins": 27.02305030822754, + "rewards/rejected": -36.397743225097656, + "step": 3668 + }, + { + "epoch": 2.2824261275272164, + "grad_norm": 2.2877783578678645e-07, + "learning_rate": 1.3278008298755188e-06, + "logits/chosen": -0.7244665026664734, + "logits/rejected": 3.4912476539611816, + "logps/chosen": -472.2742004394531, + "logps/rejected": -1081.2777099609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.408008575439453, + "rewards/margins": 34.40707778930664, + "rewards/rejected": -43.815086364746094, + "step": 3669 + }, + { + "epoch": 2.2830482115085537, + "grad_norm": 4.795114705302694e-07, + "learning_rate": 1.326648224988474e-06, + "logits/chosen": -0.1821054220199585, + "logits/rejected": 3.4906506538391113, + "logps/chosen": -430.23553466796875, + "logps/rejected": -934.4568481445312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.48881721496582, + "rewards/margins": 31.01263427734375, + "rewards/rejected": -39.50144958496094, + "step": 3670 + }, + { + "epoch": 2.283670295489891, + "grad_norm": 0.00012371873890515417, + "learning_rate": 1.3254956201014293e-06, + "logits/chosen": -1.8504576683044434, + "logits/rejected": 3.2555201053619385, + "logps/chosen": -415.14703369140625, + "logps/rejected": -1054.1136474609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.276524543762207, + "rewards/margins": 35.696468353271484, + "rewards/rejected": -37.972991943359375, + "step": 3671 + }, + { + "epoch": 2.2842923794712284, + "grad_norm": 0.0600084587931633, + "learning_rate": 1.3243430152143847e-06, + "logits/chosen": 2.238312244415283, + "logits/rejected": 2.5055103302001953, + "logps/chosen": -765.4758911132812, + "logps/rejected": -1044.9024658203125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.58240509033203, + "rewards/margins": 22.25725555419922, + "rewards/rejected": -38.83966064453125, + "step": 3672 + }, + { + "epoch": 2.284914463452566, + "grad_norm": 5.0989089012146, + "learning_rate": 1.32319041032734e-06, + "logits/chosen": 1.9975833892822266, + "logits/rejected": 3.660245180130005, + "logps/chosen": -602.4431762695312, + "logps/rejected": -978.6756591796875, + "loss": 0.0193, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.570245742797852, + "rewards/margins": 28.15796661376953, + "rewards/rejected": -37.728214263916016, + "step": 3673 + }, + { + "epoch": 2.2855365474339036, + "grad_norm": 0.001256530056707561, + "learning_rate": 1.3220378054402952e-06, + "logits/chosen": -0.7467068433761597, + "logits/rejected": 1.4600367546081543, + "logps/chosen": -402.07843017578125, + "logps/rejected": -849.447998046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.446656703948975, + "rewards/margins": 30.50889015197754, + "rewards/rejected": -35.95554733276367, + "step": 3674 + }, + { + "epoch": 2.286158631415241, + "grad_norm": 0.01776723749935627, + "learning_rate": 1.3208852005532504e-06, + "logits/chosen": -2.924178123474121, + "logits/rejected": 0.3132919669151306, + "logps/chosen": -428.3075256347656, + "logps/rejected": -909.0307006835938, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.815388679504395, + "rewards/margins": 30.344745635986328, + "rewards/rejected": -39.160133361816406, + "step": 3675 + }, + { + "epoch": 2.2867807153965787, + "grad_norm": 0.021917715668678284, + "learning_rate": 1.3197325956662058e-06, + "logits/chosen": -1.6217856407165527, + "logits/rejected": 3.1677675247192383, + "logps/chosen": -456.10174560546875, + "logps/rejected": -1087.2021484375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.842765808105469, + "rewards/margins": 33.89425277709961, + "rewards/rejected": -42.73701477050781, + "step": 3676 + }, + { + "epoch": 2.287402799377916, + "grad_norm": 37.577537536621094, + "learning_rate": 1.318579990779161e-06, + "logits/chosen": 0.7215325832366943, + "logits/rejected": 3.7547783851623535, + "logps/chosen": -687.1287841796875, + "logps/rejected": -1236.029541015625, + "loss": 0.3868, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.96998405456543, + "rewards/margins": 34.49425506591797, + "rewards/rejected": -45.46424102783203, + "step": 3677 + }, + { + "epoch": 2.2880248833592534, + "grad_norm": 0.00026209407951682806, + "learning_rate": 1.3174273858921163e-06, + "logits/chosen": -2.5744283199310303, + "logits/rejected": 2.5794479846954346, + "logps/chosen": -319.0574951171875, + "logps/rejected": -996.4590454101562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.7528181076049805, + "rewards/margins": 33.89647674560547, + "rewards/rejected": -38.6492919921875, + "step": 3678 + }, + { + "epoch": 2.288646967340591, + "grad_norm": 3.9976178811684804e-08, + "learning_rate": 1.3162747810050717e-06, + "logits/chosen": -1.0795128345489502, + "logits/rejected": 2.9608840942382812, + "logps/chosen": -441.7596740722656, + "logps/rejected": -1100.30078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.616857051849365, + "rewards/margins": 30.922849655151367, + "rewards/rejected": -37.539703369140625, + "step": 3679 + }, + { + "epoch": 2.2892690513219285, + "grad_norm": 0.026899321004748344, + "learning_rate": 1.315122176118027e-06, + "logits/chosen": 1.6019718647003174, + "logits/rejected": 3.604966878890991, + "logps/chosen": -444.7810363769531, + "logps/rejected": -873.5284423828125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.15468978881836, + "rewards/margins": 30.436019897460938, + "rewards/rejected": -38.5907096862793, + "step": 3680 + }, + { + "epoch": 2.289891135303266, + "grad_norm": 0.000182815216248855, + "learning_rate": 1.3139695712309822e-06, + "logits/chosen": -2.478875160217285, + "logits/rejected": 2.828974962234497, + "logps/chosen": -309.87603759765625, + "logps/rejected": -1102.597900390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.028837203979492, + "rewards/margins": 46.795127868652344, + "rewards/rejected": -52.82395935058594, + "step": 3681 + }, + { + "epoch": 2.2905132192846036, + "grad_norm": 17.601627349853516, + "learning_rate": 1.3128169663439374e-06, + "logits/chosen": 1.8089146614074707, + "logits/rejected": 2.5463128089904785, + "logps/chosen": -628.1402587890625, + "logps/rejected": -938.7257080078125, + "loss": 0.1131, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.305665969848633, + "rewards/margins": 21.779991149902344, + "rewards/rejected": -32.085655212402344, + "step": 3682 + }, + { + "epoch": 2.291135303265941, + "grad_norm": 2.2101880858826917e-06, + "learning_rate": 1.3116643614568928e-06, + "logits/chosen": 0.36230725049972534, + "logits/rejected": 3.0662708282470703, + "logps/chosen": -495.42108154296875, + "logps/rejected": -1016.244873046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6825504302978516, + "rewards/margins": 34.046356201171875, + "rewards/rejected": -37.72890853881836, + "step": 3683 + }, + { + "epoch": 2.2917573872472783, + "grad_norm": 6.515670520457206e-06, + "learning_rate": 1.310511756569848e-06, + "logits/chosen": -1.1638526916503906, + "logits/rejected": 3.4499146938323975, + "logps/chosen": -362.0567932128906, + "logps/rejected": -958.96044921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.989096641540527, + "rewards/margins": 33.7954216003418, + "rewards/rejected": -40.78451919555664, + "step": 3684 + }, + { + "epoch": 2.2923794712286156, + "grad_norm": 1.5253701803885633e-06, + "learning_rate": 1.3093591516828033e-06, + "logits/chosen": -0.05208313465118408, + "logits/rejected": 4.282369613647461, + "logps/chosen": -385.57952880859375, + "logps/rejected": -955.6923828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.062870979309082, + "rewards/margins": 30.697647094726562, + "rewards/rejected": -40.760520935058594, + "step": 3685 + }, + { + "epoch": 2.2930015552099534, + "grad_norm": 2.5477407689322717e-05, + "learning_rate": 1.3082065467957585e-06, + "logits/chosen": 0.05571731925010681, + "logits/rejected": 4.041232585906982, + "logps/chosen": -421.4884033203125, + "logps/rejected": -906.364013671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.396073341369629, + "rewards/margins": 27.576976776123047, + "rewards/rejected": -32.973052978515625, + "step": 3686 + }, + { + "epoch": 2.2936236391912908, + "grad_norm": 5.022000550525263e-05, + "learning_rate": 1.307053941908714e-06, + "logits/chosen": 1.8593449592590332, + "logits/rejected": 4.012054443359375, + "logps/chosen": -597.22509765625, + "logps/rejected": -1045.4674072265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.028395652770996, + "rewards/margins": 36.970951080322266, + "rewards/rejected": -47.99934387207031, + "step": 3687 + }, + { + "epoch": 2.2942457231726285, + "grad_norm": 53.18421936035156, + "learning_rate": 1.3059013370216692e-06, + "logits/chosen": -2.442203998565674, + "logits/rejected": 2.732808828353882, + "logps/chosen": -360.65948486328125, + "logps/rejected": -986.3630981445312, + "loss": 0.3469, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.006263732910156, + "rewards/margins": 34.367088317871094, + "rewards/rejected": -43.37335205078125, + "step": 3688 + }, + { + "epoch": 2.294867807153966, + "grad_norm": 0.0002621853200253099, + "learning_rate": 1.3047487321346244e-06, + "logits/chosen": -0.44340795278549194, + "logits/rejected": 3.2911224365234375, + "logps/chosen": -488.1886901855469, + "logps/rejected": -1053.859130859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.731563568115234, + "rewards/margins": 30.538379669189453, + "rewards/rejected": -39.26993942260742, + "step": 3689 + }, + { + "epoch": 2.295489891135303, + "grad_norm": 5.142366409301758, + "learning_rate": 1.3035961272475798e-06, + "logits/chosen": -0.5201451778411865, + "logits/rejected": 3.9743244647979736, + "logps/chosen": -435.2750244140625, + "logps/rejected": -923.180908203125, + "loss": 0.0229, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.475119590759277, + "rewards/margins": 20.42343521118164, + "rewards/rejected": -29.8985538482666, + "step": 3690 + }, + { + "epoch": 2.2961119751166406, + "grad_norm": 0.06476642191410065, + "learning_rate": 1.302443522360535e-06, + "logits/chosen": 0.49854815006256104, + "logits/rejected": 3.6201465129852295, + "logps/chosen": -443.612060546875, + "logps/rejected": -888.561767578125, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.8632965087890625, + "rewards/margins": 26.700164794921875, + "rewards/rejected": -34.56346130371094, + "step": 3691 + }, + { + "epoch": 2.2967340590979783, + "grad_norm": 5.63675121156848e-06, + "learning_rate": 1.3012909174734903e-06, + "logits/chosen": -2.861495018005371, + "logits/rejected": 3.953035593032837, + "logps/chosen": -448.1300048828125, + "logps/rejected": -1275.9620361328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.200240135192871, + "rewards/margins": 41.76849365234375, + "rewards/rejected": -48.96873474121094, + "step": 3692 + }, + { + "epoch": 2.2973561430793157, + "grad_norm": 36.50901794433594, + "learning_rate": 1.3001383125864455e-06, + "logits/chosen": 1.098271369934082, + "logits/rejected": 5.046222686767578, + "logps/chosen": -546.1812744140625, + "logps/rejected": -1057.7093505859375, + "loss": 0.6451, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.096063613891602, + "rewards/margins": 29.526365280151367, + "rewards/rejected": -41.6224250793457, + "step": 3693 + }, + { + "epoch": 2.297978227060653, + "grad_norm": 1.0181839570577722e-05, + "learning_rate": 1.298985707699401e-06, + "logits/chosen": -1.1529643535614014, + "logits/rejected": 2.750061511993408, + "logps/chosen": -423.95098876953125, + "logps/rejected": -995.2757568359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.522923469543457, + "rewards/margins": 31.389129638671875, + "rewards/rejected": -39.912052154541016, + "step": 3694 + }, + { + "epoch": 2.298600311041991, + "grad_norm": 4.717450792668387e-06, + "learning_rate": 1.2978331028123562e-06, + "logits/chosen": -3.618411064147949, + "logits/rejected": 3.5521435737609863, + "logps/chosen": -414.3927001953125, + "logps/rejected": -1095.0592041015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.211316108703613, + "rewards/margins": 30.482646942138672, + "rewards/rejected": -37.69396209716797, + "step": 3695 + }, + { + "epoch": 2.299222395023328, + "grad_norm": 0.20213347673416138, + "learning_rate": 1.2966804979253114e-06, + "logits/chosen": 1.5525379180908203, + "logits/rejected": 3.612203598022461, + "logps/chosen": -565.8038940429688, + "logps/rejected": -1033.8426513671875, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.375448226928711, + "rewards/margins": 29.46129608154297, + "rewards/rejected": -42.83674621582031, + "step": 3696 + }, + { + "epoch": 2.2998444790046655, + "grad_norm": 0.07893336564302444, + "learning_rate": 1.2955278930382666e-06, + "logits/chosen": 0.6578959226608276, + "logits/rejected": 3.951193332672119, + "logps/chosen": -583.0952758789062, + "logps/rejected": -1030.0029296875, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.314534187316895, + "rewards/margins": 26.324974060058594, + "rewards/rejected": -38.63950729370117, + "step": 3697 + }, + { + "epoch": 2.3004665629860033, + "grad_norm": 0.0008938809623941779, + "learning_rate": 1.294375288151222e-06, + "logits/chosen": -0.3660479187965393, + "logits/rejected": 3.558814287185669, + "logps/chosen": -483.8442077636719, + "logps/rejected": -1079.2012939453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.533079147338867, + "rewards/margins": 39.29969787597656, + "rewards/rejected": -48.83277893066406, + "step": 3698 + }, + { + "epoch": 2.3010886469673406, + "grad_norm": 1.2173177003860474, + "learning_rate": 1.2932226832641773e-06, + "logits/chosen": 1.2994377613067627, + "logits/rejected": 2.7357635498046875, + "logps/chosen": -549.0673828125, + "logps/rejected": -878.2985229492188, + "loss": 0.0071, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.204733371734619, + "rewards/margins": 22.077877044677734, + "rewards/rejected": -29.282608032226562, + "step": 3699 + }, + { + "epoch": 2.301710730948678, + "grad_norm": 0.00022056486341170967, + "learning_rate": 1.2920700783771325e-06, + "logits/chosen": -1.6370930671691895, + "logits/rejected": 2.731771469116211, + "logps/chosen": -494.51861572265625, + "logps/rejected": -1008.6734619140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.655258178710938, + "rewards/margins": 25.27872085571289, + "rewards/rejected": -33.93397903442383, + "step": 3700 + }, + { + "epoch": 2.3023328149300157, + "grad_norm": 0.0015736103523522615, + "learning_rate": 1.290917473490088e-06, + "logits/chosen": 0.6173830032348633, + "logits/rejected": 3.2660341262817383, + "logps/chosen": -566.87548828125, + "logps/rejected": -1051.01904296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.942275524139404, + "rewards/margins": 29.693340301513672, + "rewards/rejected": -36.635616302490234, + "step": 3701 + }, + { + "epoch": 2.302954898911353, + "grad_norm": 0.06366714090108871, + "learning_rate": 1.2897648686030432e-06, + "logits/chosen": -3.899381637573242, + "logits/rejected": 1.5277197360992432, + "logps/chosen": -265.9858093261719, + "logps/rejected": -905.1071166992188, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.637141704559326, + "rewards/margins": 31.69051170349121, + "rewards/rejected": -35.32765197753906, + "step": 3702 + }, + { + "epoch": 2.3035769828926904, + "grad_norm": 0.08645754307508469, + "learning_rate": 1.2886122637159982e-06, + "logits/chosen": -0.9479325413703918, + "logits/rejected": 3.7041172981262207, + "logps/chosen": -424.2788391113281, + "logps/rejected": -986.81396484375, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.513542175292969, + "rewards/margins": 28.364822387695312, + "rewards/rejected": -34.878360748291016, + "step": 3703 + }, + { + "epoch": 2.3041990668740278, + "grad_norm": 3.70295765605988e-06, + "learning_rate": 1.2874596588289534e-06, + "logits/chosen": -1.080538272857666, + "logits/rejected": 2.7991552352905273, + "logps/chosen": -477.34503173828125, + "logps/rejected": -989.5711669921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.353320121765137, + "rewards/margins": 30.877479553222656, + "rewards/rejected": -41.23080062866211, + "step": 3704 + }, + { + "epoch": 2.3048211508553655, + "grad_norm": 0.23479920625686646, + "learning_rate": 1.2863070539419086e-06, + "logits/chosen": 0.2994771897792816, + "logits/rejected": 1.2180674076080322, + "logps/chosen": -418.5895690917969, + "logps/rejected": -672.30615234375, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.518365859985352, + "rewards/margins": 20.739582061767578, + "rewards/rejected": -29.257946014404297, + "step": 3705 + }, + { + "epoch": 2.305443234836703, + "grad_norm": 0.11313124746084213, + "learning_rate": 1.285154449054864e-06, + "logits/chosen": -1.9298100471496582, + "logits/rejected": 3.333915948867798, + "logps/chosen": -434.4066162109375, + "logps/rejected": -1057.1881103515625, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.3718719482421875, + "rewards/margins": 29.832077026367188, + "rewards/rejected": -37.203948974609375, + "step": 3706 + }, + { + "epoch": 2.3060653188180407, + "grad_norm": 0.0011367382248863578, + "learning_rate": 1.2840018441678193e-06, + "logits/chosen": -0.14702105522155762, + "logits/rejected": 1.7592847347259521, + "logps/chosen": -625.2750854492188, + "logps/rejected": -1098.627197265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.312629699707031, + "rewards/margins": 31.041439056396484, + "rewards/rejected": -44.35406494140625, + "step": 3707 + }, + { + "epoch": 2.306687402799378, + "grad_norm": 0.00044795998837798834, + "learning_rate": 1.2828492392807745e-06, + "logits/chosen": -0.23750460147857666, + "logits/rejected": 3.907789468765259, + "logps/chosen": -369.7756652832031, + "logps/rejected": -845.3507690429688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.302602767944336, + "rewards/margins": 26.934505462646484, + "rewards/rejected": -33.23710632324219, + "step": 3708 + }, + { + "epoch": 2.3073094867807153, + "grad_norm": 5.019338459533174e-06, + "learning_rate": 1.2816966343937297e-06, + "logits/chosen": 3.057111978530884, + "logits/rejected": 2.887117385864258, + "logps/chosen": -658.5367431640625, + "logps/rejected": -1031.13037109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.958843231201172, + "rewards/margins": 37.6265754699707, + "rewards/rejected": -48.585418701171875, + "step": 3709 + }, + { + "epoch": 2.3079315707620527, + "grad_norm": 0.5342081189155579, + "learning_rate": 1.2805440295066852e-06, + "logits/chosen": 2.534153699874878, + "logits/rejected": 3.9704220294952393, + "logps/chosen": -653.8551025390625, + "logps/rejected": -1223.77392578125, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.44595718383789, + "rewards/margins": 41.70103454589844, + "rewards/rejected": -51.14698791503906, + "step": 3710 + }, + { + "epoch": 2.3085536547433905, + "grad_norm": 0.0002150165819330141, + "learning_rate": 1.2793914246196404e-06, + "logits/chosen": 1.2127676010131836, + "logits/rejected": 4.247666358947754, + "logps/chosen": -579.703125, + "logps/rejected": -1159.847900390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.120686531066895, + "rewards/margins": 35.937217712402344, + "rewards/rejected": -45.057899475097656, + "step": 3711 + }, + { + "epoch": 2.309175738724728, + "grad_norm": 0.14080031216144562, + "learning_rate": 1.2782388197325956e-06, + "logits/chosen": -1.2376487255096436, + "logits/rejected": 3.667673110961914, + "logps/chosen": -463.9857482910156, + "logps/rejected": -885.1686401367188, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.179271697998047, + "rewards/margins": 16.996322631835938, + "rewards/rejected": -27.175594329833984, + "step": 3712 + }, + { + "epoch": 2.309797822706065, + "grad_norm": 2.617736936372239e-05, + "learning_rate": 1.277086214845551e-06, + "logits/chosen": 0.03214012831449509, + "logits/rejected": 5.018609046936035, + "logps/chosen": -528.091796875, + "logps/rejected": -1093.7386474609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.017572402954102, + "rewards/margins": 34.35942840576172, + "rewards/rejected": -42.37700271606445, + "step": 3713 + }, + { + "epoch": 2.310419906687403, + "grad_norm": 2.1107603970449418e-05, + "learning_rate": 1.2759336099585063e-06, + "logits/chosen": -1.4206596612930298, + "logits/rejected": 3.3120598793029785, + "logps/chosen": -543.6156005859375, + "logps/rejected": -1077.357666015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.907739639282227, + "rewards/margins": 34.17880630493164, + "rewards/rejected": -44.0865478515625, + "step": 3714 + }, + { + "epoch": 2.3110419906687403, + "grad_norm": 0.484291136264801, + "learning_rate": 1.2747810050714615e-06, + "logits/chosen": 0.2637067139148712, + "logits/rejected": 4.386711597442627, + "logps/chosen": -595.2559814453125, + "logps/rejected": -1109.7808837890625, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.601914405822754, + "rewards/margins": 31.844104766845703, + "rewards/rejected": -46.446022033691406, + "step": 3715 + }, + { + "epoch": 2.3116640746500776, + "grad_norm": 4.2439531000582065e-08, + "learning_rate": 1.2736284001844167e-06, + "logits/chosen": -1.9014339447021484, + "logits/rejected": 3.727208375930786, + "logps/chosen": -412.7446594238281, + "logps/rejected": -1121.034423828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.875720500946045, + "rewards/margins": 38.0162239074707, + "rewards/rejected": -43.891944885253906, + "step": 3716 + }, + { + "epoch": 2.3122861586314154, + "grad_norm": 5.432393209048314e-06, + "learning_rate": 1.2724757952973722e-06, + "logits/chosen": 0.21426761150360107, + "logits/rejected": 3.167912244796753, + "logps/chosen": -476.23675537109375, + "logps/rejected": -981.8463134765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.670164585113525, + "rewards/margins": 26.259475708007812, + "rewards/rejected": -32.92964172363281, + "step": 3717 + }, + { + "epoch": 2.3129082426127527, + "grad_norm": 0.10672373324632645, + "learning_rate": 1.2713231904103274e-06, + "logits/chosen": -1.5727660655975342, + "logits/rejected": 2.5053091049194336, + "logps/chosen": -513.7788696289062, + "logps/rejected": -1157.0867919921875, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.235604286193848, + "rewards/margins": 41.920467376708984, + "rewards/rejected": -51.156070709228516, + "step": 3718 + }, + { + "epoch": 2.31353032659409, + "grad_norm": 1.4951767921447754, + "learning_rate": 1.2701705855232826e-06, + "logits/chosen": 1.4568402767181396, + "logits/rejected": 4.061356544494629, + "logps/chosen": -634.544921875, + "logps/rejected": -984.9749755859375, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.60744857788086, + "rewards/margins": 18.677616119384766, + "rewards/rejected": -31.285064697265625, + "step": 3719 + }, + { + "epoch": 2.314152410575428, + "grad_norm": 0.27352988719940186, + "learning_rate": 1.2690179806362378e-06, + "logits/chosen": -1.944580078125, + "logits/rejected": 1.7055405378341675, + "logps/chosen": -349.4586181640625, + "logps/rejected": -630.8715209960938, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.460259437561035, + "rewards/margins": 18.522550582885742, + "rewards/rejected": -23.982810974121094, + "step": 3720 + }, + { + "epoch": 2.314774494556765, + "grad_norm": 0.0011469712480902672, + "learning_rate": 1.2678653757491933e-06, + "logits/chosen": -1.8591368198394775, + "logits/rejected": 2.5280864238739014, + "logps/chosen": -326.79632568359375, + "logps/rejected": -907.6506958007812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.100767135620117, + "rewards/margins": 30.164491653442383, + "rewards/rejected": -36.2652587890625, + "step": 3721 + }, + { + "epoch": 2.3153965785381025, + "grad_norm": 1.0237037713523023e-05, + "learning_rate": 1.2667127708621485e-06, + "logits/chosen": -3.6736159324645996, + "logits/rejected": 1.2925745248794556, + "logps/chosen": -230.48849487304688, + "logps/rejected": -831.7344970703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.988120079040527, + "rewards/margins": 32.05707550048828, + "rewards/rejected": -37.045196533203125, + "step": 3722 + }, + { + "epoch": 2.31601866251944, + "grad_norm": 0.000444350007455796, + "learning_rate": 1.2655601659751037e-06, + "logits/chosen": -1.2352526187896729, + "logits/rejected": 2.333470106124878, + "logps/chosen": -599.5797729492188, + "logps/rejected": -1209.1802978515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.441747665405273, + "rewards/margins": 37.17320251464844, + "rewards/rejected": -46.614952087402344, + "step": 3723 + }, + { + "epoch": 2.3166407465007777, + "grad_norm": 0.14415420591831207, + "learning_rate": 1.2644075610880592e-06, + "logits/chosen": -0.7493869662284851, + "logits/rejected": 1.9482616186141968, + "logps/chosen": -424.66912841796875, + "logps/rejected": -905.3004760742188, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.350069999694824, + "rewards/margins": 26.499420166015625, + "rewards/rejected": -34.84949493408203, + "step": 3724 + }, + { + "epoch": 2.317262830482115, + "grad_norm": 0.00046594845480285585, + "learning_rate": 1.2632549562010144e-06, + "logits/chosen": 1.0839658975601196, + "logits/rejected": 3.9135398864746094, + "logps/chosen": -416.83905029296875, + "logps/rejected": -828.8883056640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.0764641761779785, + "rewards/margins": 24.667072296142578, + "rewards/rejected": -31.7435359954834, + "step": 3725 + }, + { + "epoch": 2.317884914463453, + "grad_norm": 5.600808435701765e-05, + "learning_rate": 1.2621023513139696e-06, + "logits/chosen": -1.369974136352539, + "logits/rejected": 3.0765304565429688, + "logps/chosen": -392.28057861328125, + "logps/rejected": -940.5872192382812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.665092945098877, + "rewards/margins": 30.048261642456055, + "rewards/rejected": -36.713356018066406, + "step": 3726 + }, + { + "epoch": 2.31850699844479, + "grad_norm": 0.0072428504936397076, + "learning_rate": 1.2609497464269248e-06, + "logits/chosen": 0.6592839360237122, + "logits/rejected": 2.507603883743286, + "logps/chosen": -591.06396484375, + "logps/rejected": -1104.629150390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.804035186767578, + "rewards/margins": 37.28501510620117, + "rewards/rejected": -46.08905029296875, + "step": 3727 + }, + { + "epoch": 2.3191290824261275, + "grad_norm": 1.4250758795242291e-08, + "learning_rate": 1.2597971415398803e-06, + "logits/chosen": -1.6458063125610352, + "logits/rejected": 3.0889017581939697, + "logps/chosen": -496.560546875, + "logps/rejected": -1106.861328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.856250286102295, + "rewards/margins": 37.5319938659668, + "rewards/rejected": -44.38824462890625, + "step": 3728 + }, + { + "epoch": 2.319751166407465, + "grad_norm": 0.008206356316804886, + "learning_rate": 1.2586445366528355e-06, + "logits/chosen": 1.8181511163711548, + "logits/rejected": 3.783684730529785, + "logps/chosen": -530.9696044921875, + "logps/rejected": -922.9013671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.359803199768066, + "rewards/margins": 23.870697021484375, + "rewards/rejected": -37.230499267578125, + "step": 3729 + }, + { + "epoch": 2.3203732503888026, + "grad_norm": 0.0004938808269798756, + "learning_rate": 1.2574919317657907e-06, + "logits/chosen": -0.7859050035476685, + "logits/rejected": 3.8299612998962402, + "logps/chosen": -334.7008972167969, + "logps/rejected": -828.531982421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.748422145843506, + "rewards/margins": 24.402236938476562, + "rewards/rejected": -30.150657653808594, + "step": 3730 + }, + { + "epoch": 2.32099533437014, + "grad_norm": 0.6841702461242676, + "learning_rate": 1.256339326878746e-06, + "logits/chosen": -2.255345344543457, + "logits/rejected": 1.1976604461669922, + "logps/chosen": -395.2082824707031, + "logps/rejected": -968.39453125, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.6640472412109375, + "rewards/margins": 31.672197341918945, + "rewards/rejected": -39.33624267578125, + "step": 3731 + }, + { + "epoch": 2.3216174183514773, + "grad_norm": 1.9100058423759947e-08, + "learning_rate": 1.2551867219917014e-06, + "logits/chosen": 2.8481311798095703, + "logits/rejected": 4.289071083068848, + "logps/chosen": -520.2772827148438, + "logps/rejected": -1032.16943359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.547584533691406, + "rewards/margins": 37.49254608154297, + "rewards/rejected": -45.04012680053711, + "step": 3732 + }, + { + "epoch": 2.322239502332815, + "grad_norm": 0.3577936291694641, + "learning_rate": 1.2540341171046566e-06, + "logits/chosen": -1.3577563762664795, + "logits/rejected": 2.7698278427124023, + "logps/chosen": -363.3661193847656, + "logps/rejected": -926.281982421875, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.647355079650879, + "rewards/margins": 31.246688842773438, + "rewards/rejected": -37.89404296875, + "step": 3733 + }, + { + "epoch": 2.3228615863141524, + "grad_norm": 5.641165898850886e-06, + "learning_rate": 1.2528815122176118e-06, + "logits/chosen": 2.2829861640930176, + "logits/rejected": 0.9737235307693481, + "logps/chosen": -635.3814697265625, + "logps/rejected": -918.7669067382812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.598127365112305, + "rewards/margins": 28.54684066772461, + "rewards/rejected": -36.14496994018555, + "step": 3734 + }, + { + "epoch": 2.3234836702954897, + "grad_norm": 0.0003707819851115346, + "learning_rate": 1.2517289073305673e-06, + "logits/chosen": 0.35782626271247864, + "logits/rejected": 3.968404769897461, + "logps/chosen": -458.44110107421875, + "logps/rejected": -952.4920654296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.391024589538574, + "rewards/margins": 27.741703033447266, + "rewards/rejected": -34.13272476196289, + "step": 3735 + }, + { + "epoch": 2.3241057542768275, + "grad_norm": 37.11952209472656, + "learning_rate": 1.2505763024435225e-06, + "logits/chosen": -1.1397258043289185, + "logits/rejected": 0.5207135081291199, + "logps/chosen": -553.2008056640625, + "logps/rejected": -839.0127563476562, + "loss": 1.1347, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.864250183105469, + "rewards/margins": 20.787813186645508, + "rewards/rejected": -29.652063369750977, + "step": 3736 + }, + { + "epoch": 2.324727838258165, + "grad_norm": 13.333917617797852, + "learning_rate": 1.2494236975564777e-06, + "logits/chosen": 3.0632495880126953, + "logits/rejected": 2.8352396488189697, + "logps/chosen": -815.4075927734375, + "logps/rejected": -1069.552734375, + "loss": 0.1319, + "rewards/accuracies": 0.875, + "rewards/chosen": -15.841561317443848, + "rewards/margins": 21.505847930908203, + "rewards/rejected": -37.347408294677734, + "step": 3737 + }, + { + "epoch": 2.325349922239502, + "grad_norm": 0.3247649371623993, + "learning_rate": 1.248271092669433e-06, + "logits/chosen": 0.8813507556915283, + "logits/rejected": 2.7931432723999023, + "logps/chosen": -608.7620849609375, + "logps/rejected": -886.5908813476562, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.79469108581543, + "rewards/margins": 22.09004020690918, + "rewards/rejected": -32.88473129272461, + "step": 3738 + }, + { + "epoch": 2.32597200622084, + "grad_norm": 1.6823595762252808, + "learning_rate": 1.2471184877823884e-06, + "logits/chosen": 0.045462846755981445, + "logits/rejected": 0.23739556968212128, + "logps/chosen": -650.0916748046875, + "logps/rejected": -884.3887939453125, + "loss": 0.007, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.383567810058594, + "rewards/margins": 21.003597259521484, + "rewards/rejected": -35.38716506958008, + "step": 3739 + }, + { + "epoch": 2.3265940902021773, + "grad_norm": 0.00010003484931075945, + "learning_rate": 1.2459658828953436e-06, + "logits/chosen": 0.15637314319610596, + "logits/rejected": 3.1810343265533447, + "logps/chosen": -496.74737548828125, + "logps/rejected": -963.8161010742188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.369870662689209, + "rewards/margins": 32.24199676513672, + "rewards/rejected": -38.61186599731445, + "step": 3740 + }, + { + "epoch": 2.3272161741835147, + "grad_norm": 0.00047350634122267365, + "learning_rate": 1.2448132780082988e-06, + "logits/chosen": -2.524026870727539, + "logits/rejected": 0.31463098526000977, + "logps/chosen": -502.8329162597656, + "logps/rejected": -1062.771240234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.350840091705322, + "rewards/margins": 33.23490524291992, + "rewards/rejected": -39.58574676513672, + "step": 3741 + }, + { + "epoch": 2.327838258164852, + "grad_norm": 21.89085578918457, + "learning_rate": 1.2436606731212543e-06, + "logits/chosen": 0.7648021578788757, + "logits/rejected": 2.883105754852295, + "logps/chosen": -567.5341796875, + "logps/rejected": -897.75537109375, + "loss": 0.1512, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.583475112915039, + "rewards/margins": 23.607681274414062, + "rewards/rejected": -33.19115447998047, + "step": 3742 + }, + { + "epoch": 2.32846034214619, + "grad_norm": 0.10280514508485794, + "learning_rate": 1.2425080682342095e-06, + "logits/chosen": 0.34326276183128357, + "logits/rejected": 2.023019552230835, + "logps/chosen": -484.177490234375, + "logps/rejected": -807.7122802734375, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.493699550628662, + "rewards/margins": 22.899011611938477, + "rewards/rejected": -30.392711639404297, + "step": 3743 + }, + { + "epoch": 2.329082426127527, + "grad_norm": 0.00037758261896669865, + "learning_rate": 1.2413554633471647e-06, + "logits/chosen": -0.4873642027378082, + "logits/rejected": 4.205269813537598, + "logps/chosen": -456.52215576171875, + "logps/rejected": -1098.8558349609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.478480339050293, + "rewards/margins": 35.9359130859375, + "rewards/rejected": -40.41439437866211, + "step": 3744 + }, + { + "epoch": 2.329704510108865, + "grad_norm": 2.557085463195108e-05, + "learning_rate": 1.24020285846012e-06, + "logits/chosen": -0.8406679630279541, + "logits/rejected": 3.6342380046844482, + "logps/chosen": -493.90692138671875, + "logps/rejected": -1012.0113525390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.140291213989258, + "rewards/margins": 28.977548599243164, + "rewards/rejected": -37.11783981323242, + "step": 3745 + }, + { + "epoch": 2.3303265940902023, + "grad_norm": 0.00011655557318590581, + "learning_rate": 1.2390502535730754e-06, + "logits/chosen": 0.18190997838974, + "logits/rejected": 2.5602996349334717, + "logps/chosen": -287.9266052246094, + "logps/rejected": -611.602294921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.9288811683654785, + "rewards/margins": 20.42547607421875, + "rewards/rejected": -25.354358673095703, + "step": 3746 + }, + { + "epoch": 2.3309486780715396, + "grad_norm": 0.028517745435237885, + "learning_rate": 1.2378976486860306e-06, + "logits/chosen": 3.981574535369873, + "logits/rejected": 4.015433311462402, + "logps/chosen": -651.6341552734375, + "logps/rejected": -922.1114501953125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.595947265625, + "rewards/margins": 26.155662536621094, + "rewards/rejected": -34.751609802246094, + "step": 3747 + }, + { + "epoch": 2.331570762052877, + "grad_norm": 0.009231762029230595, + "learning_rate": 1.2367450437989858e-06, + "logits/chosen": 0.6380380392074585, + "logits/rejected": 4.268522262573242, + "logps/chosen": -441.7833557128906, + "logps/rejected": -1023.3212890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.484411239624023, + "rewards/margins": 30.92462158203125, + "rewards/rejected": -37.409034729003906, + "step": 3748 + }, + { + "epoch": 2.3321928460342147, + "grad_norm": 3.534213277589515e-08, + "learning_rate": 1.235592438911941e-06, + "logits/chosen": -1.5921106338500977, + "logits/rejected": 1.847858190536499, + "logps/chosen": -451.6917419433594, + "logps/rejected": -1046.4404296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.33981704711914, + "rewards/margins": 37.05653381347656, + "rewards/rejected": -45.39634704589844, + "step": 3749 + }, + { + "epoch": 2.332814930015552, + "grad_norm": 0.020619157701730728, + "learning_rate": 1.2344398340248965e-06, + "logits/chosen": 2.333313465118408, + "logits/rejected": 3.4232683181762695, + "logps/chosen": -633.7111206054688, + "logps/rejected": -949.9092407226562, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.208894729614258, + "rewards/margins": 30.374826431274414, + "rewards/rejected": -38.58372497558594, + "step": 3750 + }, + { + "epoch": 2.3334370139968894, + "grad_norm": 48.95095443725586, + "learning_rate": 1.2332872291378517e-06, + "logits/chosen": 0.785729706287384, + "logits/rejected": 3.3353562355041504, + "logps/chosen": -461.3485412597656, + "logps/rejected": -799.4615478515625, + "loss": 2.0307, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.850112915039062, + "rewards/margins": 21.994503021240234, + "rewards/rejected": -30.844614028930664, + "step": 3751 + }, + { + "epoch": 2.334059097978227, + "grad_norm": 0.24490414559841156, + "learning_rate": 1.232134624250807e-06, + "logits/chosen": -0.022008508443832397, + "logits/rejected": 3.6488747596740723, + "logps/chosen": -464.0959777832031, + "logps/rejected": -1023.082763671875, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.36753511428833, + "rewards/margins": 33.127281188964844, + "rewards/rejected": -39.494815826416016, + "step": 3752 + }, + { + "epoch": 2.3346811819595645, + "grad_norm": 2.7800905399999465e-07, + "learning_rate": 1.2309820193637624e-06, + "logits/chosen": -1.0926700830459595, + "logits/rejected": 4.033783912658691, + "logps/chosen": -524.0411376953125, + "logps/rejected": -1222.209716796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.972342014312744, + "rewards/margins": 34.16625213623047, + "rewards/rejected": -42.13859558105469, + "step": 3753 + }, + { + "epoch": 2.335303265940902, + "grad_norm": 0.027951369062066078, + "learning_rate": 1.2298294144767174e-06, + "logits/chosen": 1.3177515268325806, + "logits/rejected": 3.4625697135925293, + "logps/chosen": -495.0713195800781, + "logps/rejected": -835.1082153320312, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.218081951141357, + "rewards/margins": 19.00119400024414, + "rewards/rejected": -24.219276428222656, + "step": 3754 + }, + { + "epoch": 2.3359253499222397, + "grad_norm": 0.00014580706192646176, + "learning_rate": 1.2286768095896726e-06, + "logits/chosen": 2.1752769947052, + "logits/rejected": 2.639923334121704, + "logps/chosen": -591.5237426757812, + "logps/rejected": -898.7060546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.46114444732666, + "rewards/margins": 24.936874389648438, + "rewards/rejected": -35.39802169799805, + "step": 3755 + }, + { + "epoch": 2.336547433903577, + "grad_norm": 7.0838303565979, + "learning_rate": 1.227524204702628e-06, + "logits/chosen": -3.95389461517334, + "logits/rejected": 3.1995513439178467, + "logps/chosen": -346.8486328125, + "logps/rejected": -1027.637451171875, + "loss": 0.0198, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.685001373291016, + "rewards/margins": 31.18999671936035, + "rewards/rejected": -36.875, + "step": 3756 + }, + { + "epoch": 2.3371695178849143, + "grad_norm": 3.141146421432495, + "learning_rate": 1.2263715998155833e-06, + "logits/chosen": 0.8513885736465454, + "logits/rejected": 2.2651748657226562, + "logps/chosen": -532.1912231445312, + "logps/rejected": -856.8719482421875, + "loss": 0.0103, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.31003475189209, + "rewards/margins": 20.228313446044922, + "rewards/rejected": -30.53835105895996, + "step": 3757 + }, + { + "epoch": 2.337791601866252, + "grad_norm": 0.02694685198366642, + "learning_rate": 1.2252189949285385e-06, + "logits/chosen": -2.7708561420440674, + "logits/rejected": 0.9407970905303955, + "logps/chosen": -405.43402099609375, + "logps/rejected": -961.0596923828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.2348785400390625, + "rewards/margins": 29.521310806274414, + "rewards/rejected": -35.75619125366211, + "step": 3758 + }, + { + "epoch": 2.3384136858475895, + "grad_norm": 0.23336142301559448, + "learning_rate": 1.224066390041494e-06, + "logits/chosen": 1.8623909950256348, + "logits/rejected": 4.030655860900879, + "logps/chosen": -567.2323608398438, + "logps/rejected": -971.25927734375, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.90607738494873, + "rewards/margins": 25.02992057800293, + "rewards/rejected": -37.935997009277344, + "step": 3759 + }, + { + "epoch": 2.339035769828927, + "grad_norm": 9.65563678741455, + "learning_rate": 1.2229137851544492e-06, + "logits/chosen": -1.7390412092208862, + "logits/rejected": 3.4814720153808594, + "logps/chosen": -331.64794921875, + "logps/rejected": -900.2626953125, + "loss": 0.0311, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.608267307281494, + "rewards/margins": 23.121736526489258, + "rewards/rejected": -28.730003356933594, + "step": 3760 + }, + { + "epoch": 2.339657853810264, + "grad_norm": 0.17411291599273682, + "learning_rate": 1.2217611802674044e-06, + "logits/chosen": -1.2542434930801392, + "logits/rejected": 0.9479058384895325, + "logps/chosen": -457.4659118652344, + "logps/rejected": -841.8319702148438, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.046476364135742, + "rewards/margins": 23.45076560974121, + "rewards/rejected": -31.497241973876953, + "step": 3761 + }, + { + "epoch": 2.340279937791602, + "grad_norm": 7.5882954597473145, + "learning_rate": 1.2206085753803596e-06, + "logits/chosen": 3.2613232135772705, + "logits/rejected": 3.1383419036865234, + "logps/chosen": -656.0169677734375, + "logps/rejected": -783.934814453125, + "loss": 0.0345, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.506058692932129, + "rewards/margins": 16.031143188476562, + "rewards/rejected": -24.537200927734375, + "step": 3762 + }, + { + "epoch": 2.3409020217729393, + "grad_norm": 9.263287211069837e-06, + "learning_rate": 1.219455970493315e-06, + "logits/chosen": -5.158586502075195, + "logits/rejected": 0.3858991861343384, + "logps/chosen": -193.34307861328125, + "logps/rejected": -922.373291015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7267749309539795, + "rewards/margins": 30.67427635192871, + "rewards/rejected": -33.40105056762695, + "step": 3763 + }, + { + "epoch": 2.341524105754277, + "grad_norm": 0.0012843944132328033, + "learning_rate": 1.2183033656062703e-06, + "logits/chosen": 2.423126459121704, + "logits/rejected": 3.8941612243652344, + "logps/chosen": -614.8330078125, + "logps/rejected": -967.76708984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.1685209274292, + "rewards/margins": 26.197811126708984, + "rewards/rejected": -37.3663330078125, + "step": 3764 + }, + { + "epoch": 2.3421461897356144, + "grad_norm": 3.2148877835425083e-06, + "learning_rate": 1.2171507607192255e-06, + "logits/chosen": -5.507774829864502, + "logits/rejected": 1.925000786781311, + "logps/chosen": -232.03387451171875, + "logps/rejected": -932.1917114257812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.883286476135254, + "rewards/margins": 28.877832412719727, + "rewards/rejected": -33.7611198425293, + "step": 3765 + }, + { + "epoch": 2.3427682737169517, + "grad_norm": 0.05553580820560455, + "learning_rate": 1.2159981558321807e-06, + "logits/chosen": 1.5427436828613281, + "logits/rejected": 2.6691017150878906, + "logps/chosen": -650.1654052734375, + "logps/rejected": -950.7757568359375, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.771061897277832, + "rewards/margins": 26.58896827697754, + "rewards/rejected": -37.36003112792969, + "step": 3766 + }, + { + "epoch": 2.343390357698289, + "grad_norm": 0.5102798342704773, + "learning_rate": 1.2148455509451362e-06, + "logits/chosen": 0.39026594161987305, + "logits/rejected": 2.184535026550293, + "logps/chosen": -531.0682373046875, + "logps/rejected": -720.0987548828125, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.892273902893066, + "rewards/margins": 13.84701919555664, + "rewards/rejected": -19.73929214477539, + "step": 3767 + }, + { + "epoch": 2.344012441679627, + "grad_norm": 0.000627454777713865, + "learning_rate": 1.2136929460580914e-06, + "logits/chosen": 0.6169248819351196, + "logits/rejected": 4.225925445556641, + "logps/chosen": -434.4745178222656, + "logps/rejected": -1024.3651123046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.809660911560059, + "rewards/margins": 35.267539978027344, + "rewards/rejected": -46.07720184326172, + "step": 3768 + }, + { + "epoch": 2.344634525660964, + "grad_norm": 0.0014896132051944733, + "learning_rate": 1.2125403411710466e-06, + "logits/chosen": -1.1250966787338257, + "logits/rejected": 2.233417272567749, + "logps/chosen": -375.6403503417969, + "logps/rejected": -734.7423706054688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.249678611755371, + "rewards/margins": 25.75054931640625, + "rewards/rejected": -32.00022888183594, + "step": 3769 + }, + { + "epoch": 2.3452566096423015, + "grad_norm": 1.1749808663807926e-06, + "learning_rate": 1.211387736284002e-06, + "logits/chosen": 1.9994325637817383, + "logits/rejected": 3.6004791259765625, + "logps/chosen": -579.676513671875, + "logps/rejected": -1031.6995849609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.611101150512695, + "rewards/margins": 32.72962188720703, + "rewards/rejected": -43.34072494506836, + "step": 3770 + }, + { + "epoch": 2.3458786936236393, + "grad_norm": 3.606675988976349e-07, + "learning_rate": 1.2102351313969573e-06, + "logits/chosen": -3.1550188064575195, + "logits/rejected": 3.289100170135498, + "logps/chosen": -272.6792297363281, + "logps/rejected": -1067.193359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.123805999755859, + "rewards/margins": 40.23991775512695, + "rewards/rejected": -45.36371994018555, + "step": 3771 + }, + { + "epoch": 2.3465007776049767, + "grad_norm": 0.0012340415269136429, + "learning_rate": 1.2090825265099125e-06, + "logits/chosen": 0.42929190397262573, + "logits/rejected": 3.3133544921875, + "logps/chosen": -572.3095703125, + "logps/rejected": -1064.4798583984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.947774887084961, + "rewards/margins": 32.553855895996094, + "rewards/rejected": -43.501625061035156, + "step": 3772 + }, + { + "epoch": 2.347122861586314, + "grad_norm": 1.7274918718612753e-05, + "learning_rate": 1.2079299216228677e-06, + "logits/chosen": 0.21922864019870758, + "logits/rejected": 2.7869038581848145, + "logps/chosen": -489.15069580078125, + "logps/rejected": -969.3250122070312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.346248626708984, + "rewards/margins": 32.81340789794922, + "rewards/rejected": -41.1596565246582, + "step": 3773 + }, + { + "epoch": 2.347744945567652, + "grad_norm": 0.18060894310474396, + "learning_rate": 1.2067773167358231e-06, + "logits/chosen": -0.30459514260292053, + "logits/rejected": 3.024898052215576, + "logps/chosen": -569.720947265625, + "logps/rejected": -1194.462646484375, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.834023475646973, + "rewards/margins": 41.31550216674805, + "rewards/rejected": -50.14952850341797, + "step": 3774 + }, + { + "epoch": 2.348367029548989, + "grad_norm": 1.2039680480957031, + "learning_rate": 1.2056247118487784e-06, + "logits/chosen": 4.079957485198975, + "logits/rejected": 4.113856315612793, + "logps/chosen": -827.6793212890625, + "logps/rejected": -970.1021118164062, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.64090347290039, + "rewards/margins": 18.864543914794922, + "rewards/rejected": -30.505447387695312, + "step": 3775 + }, + { + "epoch": 2.3489891135303265, + "grad_norm": 1.346193790435791, + "learning_rate": 1.2044721069617336e-06, + "logits/chosen": 2.333740711212158, + "logits/rejected": 3.8003430366516113, + "logps/chosen": -715.4950561523438, + "logps/rejected": -994.8343505859375, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.749503135681152, + "rewards/margins": 22.199554443359375, + "rewards/rejected": -34.949058532714844, + "step": 3776 + }, + { + "epoch": 2.3496111975116643, + "grad_norm": 0.28042879700660706, + "learning_rate": 1.2033195020746888e-06, + "logits/chosen": 0.6322274804115295, + "logits/rejected": 2.010160446166992, + "logps/chosen": -608.6675415039062, + "logps/rejected": -957.8283081054688, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.078685760498047, + "rewards/margins": 29.47532844543457, + "rewards/rejected": -37.55401611328125, + "step": 3777 + }, + { + "epoch": 2.3502332814930016, + "grad_norm": 5.428508757177042e-06, + "learning_rate": 1.2021668971876443e-06, + "logits/chosen": -0.7912687659263611, + "logits/rejected": 2.985895872116089, + "logps/chosen": -448.8441162109375, + "logps/rejected": -971.3677368164062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.1015825271606445, + "rewards/margins": 28.909358978271484, + "rewards/rejected": -34.01094055175781, + "step": 3778 + }, + { + "epoch": 2.350855365474339, + "grad_norm": 5.192906246520579e-05, + "learning_rate": 1.2010142923005995e-06, + "logits/chosen": -0.4874184727668762, + "logits/rejected": 2.1849145889282227, + "logps/chosen": -482.5423583984375, + "logps/rejected": -816.16796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.823034286499023, + "rewards/margins": 21.69009017944336, + "rewards/rejected": -31.513124465942383, + "step": 3779 + }, + { + "epoch": 2.3514774494556763, + "grad_norm": 4.981512756785378e-05, + "learning_rate": 1.1998616874135547e-06, + "logits/chosen": 2.9160568714141846, + "logits/rejected": 4.156764030456543, + "logps/chosen": -776.1582641601562, + "logps/rejected": -1080.82763671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.34664535522461, + "rewards/margins": 29.624601364135742, + "rewards/rejected": -40.971248626708984, + "step": 3780 + }, + { + "epoch": 2.352099533437014, + "grad_norm": 7.727591037750244, + "learning_rate": 1.1987090825265101e-06, + "logits/chosen": 2.0500614643096924, + "logits/rejected": 2.1399617195129395, + "logps/chosen": -669.6891479492188, + "logps/rejected": -894.5279541015625, + "loss": 0.0239, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.296379089355469, + "rewards/margins": 20.505508422851562, + "rewards/rejected": -31.80188751220703, + "step": 3781 + }, + { + "epoch": 2.3527216174183514, + "grad_norm": 7.926801117719151e-06, + "learning_rate": 1.1975564776394654e-06, + "logits/chosen": 0.004293203353881836, + "logits/rejected": 2.637110710144043, + "logps/chosen": -400.6033630371094, + "logps/rejected": -920.6117553710938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.349189758300781, + "rewards/margins": 33.85880661010742, + "rewards/rejected": -43.2079963684082, + "step": 3782 + }, + { + "epoch": 2.353343701399689, + "grad_norm": 0.06966577470302582, + "learning_rate": 1.1964038727524206e-06, + "logits/chosen": -0.31672847270965576, + "logits/rejected": 3.4101476669311523, + "logps/chosen": -515.6886596679688, + "logps/rejected": -1000.330078125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.874223232269287, + "rewards/margins": 27.466777801513672, + "rewards/rejected": -34.34100341796875, + "step": 3783 + }, + { + "epoch": 2.3539657853810265, + "grad_norm": 8.371570587158203, + "learning_rate": 1.1952512678653758e-06, + "logits/chosen": 0.27338138222694397, + "logits/rejected": 2.996030330657959, + "logps/chosen": -582.8148193359375, + "logps/rejected": -1207.8359375, + "loss": 0.0456, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.131258010864258, + "rewards/margins": 40.75477600097656, + "rewards/rejected": -53.88603591918945, + "step": 3784 + }, + { + "epoch": 2.354587869362364, + "grad_norm": 0.6094384789466858, + "learning_rate": 1.1940986629783313e-06, + "logits/chosen": -1.153893232345581, + "logits/rejected": 0.9266235828399658, + "logps/chosen": -530.5203247070312, + "logps/rejected": -979.1256713867188, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.635900497436523, + "rewards/margins": 31.250638961791992, + "rewards/rejected": -40.886539459228516, + "step": 3785 + }, + { + "epoch": 2.355209953343701, + "grad_norm": 22.516544342041016, + "learning_rate": 1.1929460580912865e-06, + "logits/chosen": -0.9157943725585938, + "logits/rejected": 3.470613956451416, + "logps/chosen": -388.00054931640625, + "logps/rejected": -1011.9267578125, + "loss": 0.1669, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.712441444396973, + "rewards/margins": 33.6087646484375, + "rewards/rejected": -40.321205139160156, + "step": 3786 + }, + { + "epoch": 2.355832037325039, + "grad_norm": 0.08950123935937881, + "learning_rate": 1.1917934532042417e-06, + "logits/chosen": 1.4992426633834839, + "logits/rejected": 2.879100799560547, + "logps/chosen": -615.9576416015625, + "logps/rejected": -912.9107666015625, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.530094623565674, + "rewards/margins": 21.183012008666992, + "rewards/rejected": -28.713109970092773, + "step": 3787 + }, + { + "epoch": 2.3564541213063763, + "grad_norm": 33.97612762451172, + "learning_rate": 1.190640848317197e-06, + "logits/chosen": 3.4221155643463135, + "logits/rejected": 3.3171300888061523, + "logps/chosen": -829.3216552734375, + "logps/rejected": -1007.546875, + "loss": 0.1565, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.417081832885742, + "rewards/margins": 20.075977325439453, + "rewards/rejected": -32.49305725097656, + "step": 3788 + }, + { + "epoch": 2.3570762052877137, + "grad_norm": 0.1435934156179428, + "learning_rate": 1.1894882434301522e-06, + "logits/chosen": 0.6667469143867493, + "logits/rejected": 2.7253386974334717, + "logps/chosen": -493.4066162109375, + "logps/rejected": -719.7052001953125, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.772191047668457, + "rewards/margins": 16.941204071044922, + "rewards/rejected": -23.713396072387695, + "step": 3789 + }, + { + "epoch": 2.3576982892690515, + "grad_norm": 5.4664010207261526e-08, + "learning_rate": 1.1883356385431074e-06, + "logits/chosen": -0.8374029397964478, + "logits/rejected": 1.7547428607940674, + "logps/chosen": -557.086181640625, + "logps/rejected": -1037.237060546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.583539009094238, + "rewards/margins": 28.51294708251953, + "rewards/rejected": -40.09648513793945, + "step": 3790 + }, + { + "epoch": 2.358320373250389, + "grad_norm": 0.010267133824527264, + "learning_rate": 1.1871830336560628e-06, + "logits/chosen": 1.42992103099823, + "logits/rejected": 4.12758207321167, + "logps/chosen": -597.4346923828125, + "logps/rejected": -991.8429565429688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.071941375732422, + "rewards/margins": 26.114011764526367, + "rewards/rejected": -36.18595504760742, + "step": 3791 + }, + { + "epoch": 2.358942457231726, + "grad_norm": 0.018346259370446205, + "learning_rate": 1.186030428769018e-06, + "logits/chosen": -2.803020477294922, + "logits/rejected": 0.8278892636299133, + "logps/chosen": -464.4111328125, + "logps/rejected": -1009.1217651367188, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.344085693359375, + "rewards/margins": 33.9010009765625, + "rewards/rejected": -42.245086669921875, + "step": 3792 + }, + { + "epoch": 2.359564541213064, + "grad_norm": 0.2930425703525543, + "learning_rate": 1.1848778238819733e-06, + "logits/chosen": 1.185457706451416, + "logits/rejected": 3.8306405544281006, + "logps/chosen": -636.3871459960938, + "logps/rejected": -1095.2010498046875, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.685548782348633, + "rewards/margins": 33.11660385131836, + "rewards/rejected": -47.802154541015625, + "step": 3793 + }, + { + "epoch": 2.3601866251944013, + "grad_norm": 2.025911453529261e-05, + "learning_rate": 1.1837252189949285e-06, + "logits/chosen": 1.1972532272338867, + "logits/rejected": 5.457921504974365, + "logps/chosen": -427.31390380859375, + "logps/rejected": -903.8753662109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.011835098266602, + "rewards/margins": 24.26134490966797, + "rewards/rejected": -31.273181915283203, + "step": 3794 + }, + { + "epoch": 2.3608087091757386, + "grad_norm": 0.002198511268943548, + "learning_rate": 1.182572614107884e-06, + "logits/chosen": 1.3402279615402222, + "logits/rejected": 2.2236735820770264, + "logps/chosen": -726.2381591796875, + "logps/rejected": -1059.640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.72128677368164, + "rewards/margins": 26.143020629882812, + "rewards/rejected": -39.86431121826172, + "step": 3795 + }, + { + "epoch": 2.3614307931570764, + "grad_norm": 3.081467628479004, + "learning_rate": 1.1814200092208392e-06, + "logits/chosen": 2.2028040885925293, + "logits/rejected": 4.056054592132568, + "logps/chosen": -568.4141235351562, + "logps/rejected": -1015.677978515625, + "loss": 0.0129, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.455324172973633, + "rewards/margins": 30.97480010986328, + "rewards/rejected": -40.43012237548828, + "step": 3796 + }, + { + "epoch": 2.3620528771384137, + "grad_norm": 0.00034230336314067245, + "learning_rate": 1.1802674043337944e-06, + "logits/chosen": 0.28844231367111206, + "logits/rejected": 3.227900505065918, + "logps/chosen": -488.81158447265625, + "logps/rejected": -851.3491821289062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.182365417480469, + "rewards/margins": 25.613388061523438, + "rewards/rejected": -34.795753479003906, + "step": 3797 + }, + { + "epoch": 2.362674961119751, + "grad_norm": 0.2191145271062851, + "learning_rate": 1.1791147994467498e-06, + "logits/chosen": -0.4650258421897888, + "logits/rejected": 2.9938926696777344, + "logps/chosen": -462.9337158203125, + "logps/rejected": -946.4666748046875, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.004414081573486, + "rewards/margins": 27.973308563232422, + "rewards/rejected": -34.97772216796875, + "step": 3798 + }, + { + "epoch": 2.3632970451010884, + "grad_norm": 7.152175426483154, + "learning_rate": 1.177962194559705e-06, + "logits/chosen": 1.1919983625411987, + "logits/rejected": 1.542420506477356, + "logps/chosen": -616.858642578125, + "logps/rejected": -875.9241943359375, + "loss": 0.0362, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.420209884643555, + "rewards/margins": 23.151426315307617, + "rewards/rejected": -31.571636199951172, + "step": 3799 + }, + { + "epoch": 2.363919129082426, + "grad_norm": 40.50632858276367, + "learning_rate": 1.1768095896726603e-06, + "logits/chosen": 1.6117632389068604, + "logits/rejected": 3.467606544494629, + "logps/chosen": -672.64111328125, + "logps/rejected": -970.1424560546875, + "loss": 0.4664, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.211545944213867, + "rewards/margins": 18.097660064697266, + "rewards/rejected": -31.309207916259766, + "step": 3800 + }, + { + "epoch": 2.3645412130637635, + "grad_norm": 0.0010592733742669225, + "learning_rate": 1.1756569847856155e-06, + "logits/chosen": -0.8161712884902954, + "logits/rejected": 2.525038003921509, + "logps/chosen": -487.45703125, + "logps/rejected": -999.1900634765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.189896583557129, + "rewards/margins": 30.578916549682617, + "rewards/rejected": -39.76881408691406, + "step": 3801 + }, + { + "epoch": 2.3651632970451013, + "grad_norm": 0.0015007429756224155, + "learning_rate": 1.174504379898571e-06, + "logits/chosen": -0.36155182123184204, + "logits/rejected": 3.1467785835266113, + "logps/chosen": -473.1002197265625, + "logps/rejected": -1033.470458984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.926703453063965, + "rewards/margins": 33.874717712402344, + "rewards/rejected": -42.801422119140625, + "step": 3802 + }, + { + "epoch": 2.3657853810264386, + "grad_norm": 1.6011492334655486e-05, + "learning_rate": 1.1733517750115261e-06, + "logits/chosen": -4.356368064880371, + "logits/rejected": 1.788534164428711, + "logps/chosen": -270.5740051269531, + "logps/rejected": -933.327880859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.053676605224609, + "rewards/margins": 32.06562042236328, + "rewards/rejected": -37.119300842285156, + "step": 3803 + }, + { + "epoch": 2.366407465007776, + "grad_norm": 0.31292688846588135, + "learning_rate": 1.1721991701244814e-06, + "logits/chosen": -0.5484585165977478, + "logits/rejected": 4.379980087280273, + "logps/chosen": -338.7742919921875, + "logps/rejected": -991.6512451171875, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.910788536071777, + "rewards/margins": 33.38456726074219, + "rewards/rejected": -38.29535675048828, + "step": 3804 + }, + { + "epoch": 2.3670295489891133, + "grad_norm": 0.5742166042327881, + "learning_rate": 1.1710465652374368e-06, + "logits/chosen": 1.608454704284668, + "logits/rejected": 3.2188591957092285, + "logps/chosen": -593.5069580078125, + "logps/rejected": -979.3258666992188, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.885505676269531, + "rewards/margins": 25.568574905395508, + "rewards/rejected": -35.45408248901367, + "step": 3805 + }, + { + "epoch": 2.367651632970451, + "grad_norm": 0.19061534106731415, + "learning_rate": 1.169893960350392e-06, + "logits/chosen": 0.8193325400352478, + "logits/rejected": 4.64123010635376, + "logps/chosen": -569.2249755859375, + "logps/rejected": -1197.062744140625, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.428016662597656, + "rewards/margins": 36.00348663330078, + "rewards/rejected": -45.43150329589844, + "step": 3806 + }, + { + "epoch": 2.3682737169517885, + "grad_norm": 0.040032364428043365, + "learning_rate": 1.1687413554633473e-06, + "logits/chosen": 3.153968334197998, + "logits/rejected": 4.467193126678467, + "logps/chosen": -686.541259765625, + "logps/rejected": -936.1351928710938, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.390555381774902, + "rewards/margins": 24.287389755249023, + "rewards/rejected": -35.67794418334961, + "step": 3807 + }, + { + "epoch": 2.368895800933126, + "grad_norm": 0.6300176978111267, + "learning_rate": 1.1675887505763025e-06, + "logits/chosen": -2.918030261993408, + "logits/rejected": 2.932901620864868, + "logps/chosen": -445.3351745605469, + "logps/rejected": -1078.980224609375, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.955216407775879, + "rewards/margins": 36.79884338378906, + "rewards/rejected": -45.754058837890625, + "step": 3808 + }, + { + "epoch": 2.3695178849144636, + "grad_norm": 0.00032534674392081797, + "learning_rate": 1.166436145689258e-06, + "logits/chosen": 0.1272832453250885, + "logits/rejected": 2.286240577697754, + "logps/chosen": -382.1531066894531, + "logps/rejected": -825.8199462890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.393211364746094, + "rewards/margins": 29.338640213012695, + "rewards/rejected": -36.731849670410156, + "step": 3809 + }, + { + "epoch": 2.370139968895801, + "grad_norm": 0.07315727323293686, + "learning_rate": 1.1652835408022131e-06, + "logits/chosen": -0.5689883828163147, + "logits/rejected": 2.811748504638672, + "logps/chosen": -570.2830200195312, + "logps/rejected": -1137.9296875, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.510469436645508, + "rewards/margins": 36.993011474609375, + "rewards/rejected": -46.50347900390625, + "step": 3810 + }, + { + "epoch": 2.3707620528771383, + "grad_norm": 23.180858612060547, + "learning_rate": 1.1641309359151684e-06, + "logits/chosen": 2.40103816986084, + "logits/rejected": 5.413440704345703, + "logps/chosen": -606.066650390625, + "logps/rejected": -1113.716552734375, + "loss": 0.1608, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.312074661254883, + "rewards/margins": 28.769729614257812, + "rewards/rejected": -39.08180236816406, + "step": 3811 + }, + { + "epoch": 2.371384136858476, + "grad_norm": 0.0006819193949922919, + "learning_rate": 1.1629783310281236e-06, + "logits/chosen": 1.8856199979782104, + "logits/rejected": 3.725149631500244, + "logps/chosen": -626.3080444335938, + "logps/rejected": -1122.8994140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.148897171020508, + "rewards/margins": 37.54759216308594, + "rewards/rejected": -46.69648742675781, + "step": 3812 + }, + { + "epoch": 2.3720062208398134, + "grad_norm": 27.375221252441406, + "learning_rate": 1.161825726141079e-06, + "logits/chosen": -0.32031482458114624, + "logits/rejected": 3.144876003265381, + "logps/chosen": -426.2165222167969, + "logps/rejected": -1021.3775634765625, + "loss": 0.3604, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.422128677368164, + "rewards/margins": 35.854530334472656, + "rewards/rejected": -45.27666091918945, + "step": 3813 + }, + { + "epoch": 2.3726283048211507, + "grad_norm": 0.6310350298881531, + "learning_rate": 1.1606731212540343e-06, + "logits/chosen": -2.3323371410369873, + "logits/rejected": 1.3802802562713623, + "logps/chosen": -420.9634704589844, + "logps/rejected": -897.345703125, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.875608444213867, + "rewards/margins": 27.24919319152832, + "rewards/rejected": -34.12480163574219, + "step": 3814 + }, + { + "epoch": 2.3732503888024885, + "grad_norm": 21.276466369628906, + "learning_rate": 1.1595205163669895e-06, + "logits/chosen": 1.3786993026733398, + "logits/rejected": 2.2478296756744385, + "logps/chosen": -536.0009765625, + "logps/rejected": -677.1472778320312, + "loss": 0.0837, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.205318450927734, + "rewards/margins": 18.344764709472656, + "rewards/rejected": -28.55008316040039, + "step": 3815 + }, + { + "epoch": 2.373872472783826, + "grad_norm": 0.13539540767669678, + "learning_rate": 1.158367911479945e-06, + "logits/chosen": 1.7972712516784668, + "logits/rejected": 4.250777244567871, + "logps/chosen": -696.4157104492188, + "logps/rejected": -1102.195068359375, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.6190409660339355, + "rewards/margins": 29.232839584350586, + "rewards/rejected": -34.85187911987305, + "step": 3816 + }, + { + "epoch": 2.374494556765163, + "grad_norm": 0.0012157351011410356, + "learning_rate": 1.1572153065929001e-06, + "logits/chosen": 1.0089893341064453, + "logits/rejected": 3.8036298751831055, + "logps/chosen": -535.8660888671875, + "logps/rejected": -858.5618286132812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.22950553894043, + "rewards/margins": 25.268936157226562, + "rewards/rejected": -36.498443603515625, + "step": 3817 + }, + { + "epoch": 2.3751166407465005, + "grad_norm": 7.838989404262975e-06, + "learning_rate": 1.1560627017058554e-06, + "logits/chosen": -3.151318311691284, + "logits/rejected": 2.062565326690674, + "logps/chosen": -290.9604187011719, + "logps/rejected": -1047.005126953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.1016693115234375, + "rewards/margins": 37.82139205932617, + "rewards/rejected": -43.923065185546875, + "step": 3818 + }, + { + "epoch": 2.3757387247278383, + "grad_norm": 0.09899670630693436, + "learning_rate": 1.1549100968188106e-06, + "logits/chosen": -1.221901535987854, + "logits/rejected": 2.648987293243408, + "logps/chosen": -322.9440612792969, + "logps/rejected": -714.6885375976562, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.6279449462890625, + "rewards/margins": 21.054927825927734, + "rewards/rejected": -26.682872772216797, + "step": 3819 + }, + { + "epoch": 2.3763608087091757, + "grad_norm": 0.756338894367218, + "learning_rate": 1.153757491931766e-06, + "logits/chosen": -2.508833885192871, + "logits/rejected": 3.401827573776245, + "logps/chosen": -351.54278564453125, + "logps/rejected": -971.5526733398438, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.598222732543945, + "rewards/margins": 28.914169311523438, + "rewards/rejected": -36.51239013671875, + "step": 3820 + }, + { + "epoch": 2.3769828926905134, + "grad_norm": 0.47002577781677246, + "learning_rate": 1.1526048870447213e-06, + "logits/chosen": 0.4595339000225067, + "logits/rejected": 2.387575149536133, + "logps/chosen": -424.815185546875, + "logps/rejected": -771.0914916992188, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.066505432128906, + "rewards/margins": 25.422027587890625, + "rewards/rejected": -33.48853302001953, + "step": 3821 + }, + { + "epoch": 2.377604976671851, + "grad_norm": 5.515254088095389e-05, + "learning_rate": 1.1514522821576765e-06, + "logits/chosen": -2.9623970985412598, + "logits/rejected": 2.3023247718811035, + "logps/chosen": -388.5247802734375, + "logps/rejected": -1023.3239135742188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.27711296081543, + "rewards/margins": 34.165740966796875, + "rewards/rejected": -42.44285202026367, + "step": 3822 + }, + { + "epoch": 2.378227060653188, + "grad_norm": 26.249530792236328, + "learning_rate": 1.1502996772706317e-06, + "logits/chosen": 0.5427840948104858, + "logits/rejected": 3.053314208984375, + "logps/chosen": -576.7138671875, + "logps/rejected": -1053.32080078125, + "loss": 0.1586, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.067059516906738, + "rewards/margins": 28.13719940185547, + "rewards/rejected": -39.20425796508789, + "step": 3823 + }, + { + "epoch": 2.3788491446345255, + "grad_norm": 0.00016750558279454708, + "learning_rate": 1.149147072383587e-06, + "logits/chosen": -0.615725576877594, + "logits/rejected": 1.1009749174118042, + "logps/chosen": -504.9210205078125, + "logps/rejected": -933.6993408203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.970019340515137, + "rewards/margins": 28.58142852783203, + "rewards/rejected": -34.551448822021484, + "step": 3824 + }, + { + "epoch": 2.3794712286158632, + "grad_norm": 0.01796378567814827, + "learning_rate": 1.1479944674965422e-06, + "logits/chosen": 0.3534300923347473, + "logits/rejected": 1.401143193244934, + "logps/chosen": -554.0580444335938, + "logps/rejected": -919.526123046875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.12806510925293, + "rewards/margins": 26.107044219970703, + "rewards/rejected": -36.235111236572266, + "step": 3825 + }, + { + "epoch": 2.3800933125972006, + "grad_norm": 0.008272453211247921, + "learning_rate": 1.1468418626094976e-06, + "logits/chosen": 1.3457462787628174, + "logits/rejected": 2.418550491333008, + "logps/chosen": -616.002685546875, + "logps/rejected": -895.5050659179688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.227404594421387, + "rewards/margins": 24.576496124267578, + "rewards/rejected": -34.80390167236328, + "step": 3826 + }, + { + "epoch": 2.380715396578538, + "grad_norm": 1.474096417427063, + "learning_rate": 1.1456892577224528e-06, + "logits/chosen": 0.9951915144920349, + "logits/rejected": 3.792121410369873, + "logps/chosen": -609.1929931640625, + "logps/rejected": -1121.0908203125, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.0488920211792, + "rewards/margins": 34.88578796386719, + "rewards/rejected": -42.9346809387207, + "step": 3827 + }, + { + "epoch": 2.3813374805598757, + "grad_norm": 4.4082865715026855, + "learning_rate": 1.144536652835408e-06, + "logits/chosen": 0.9247102737426758, + "logits/rejected": 2.380889892578125, + "logps/chosen": -692.318359375, + "logps/rejected": -1045.686279296875, + "loss": 0.0215, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.291966438293457, + "rewards/margins": 29.624252319335938, + "rewards/rejected": -40.916221618652344, + "step": 3828 + }, + { + "epoch": 2.381959564541213, + "grad_norm": 0.037429846823215485, + "learning_rate": 1.1433840479483633e-06, + "logits/chosen": -2.244234323501587, + "logits/rejected": 3.7790298461914062, + "logps/chosen": -458.18402099609375, + "logps/rejected": -1182.460693359375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.688623428344727, + "rewards/margins": 36.90248489379883, + "rewards/rejected": -44.59111022949219, + "step": 3829 + }, + { + "epoch": 2.3825816485225504, + "grad_norm": 3.2809522963361815e-05, + "learning_rate": 1.1422314430613187e-06, + "logits/chosen": 0.8496948480606079, + "logits/rejected": 3.935314655303955, + "logps/chosen": -696.5582275390625, + "logps/rejected": -1325.233154296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.319162368774414, + "rewards/margins": 40.249549865722656, + "rewards/rejected": -50.56871032714844, + "step": 3830 + }, + { + "epoch": 2.383203732503888, + "grad_norm": 1.7225253031938337e-05, + "learning_rate": 1.141078838174274e-06, + "logits/chosen": 1.3808107376098633, + "logits/rejected": 1.7068805694580078, + "logps/chosen": -491.5025634765625, + "logps/rejected": -781.4146118164062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.127446174621582, + "rewards/margins": 24.558727264404297, + "rewards/rejected": -30.68617057800293, + "step": 3831 + }, + { + "epoch": 2.3838258164852255, + "grad_norm": 0.08149093389511108, + "learning_rate": 1.1399262332872291e-06, + "logits/chosen": 0.25885581970214844, + "logits/rejected": 4.693746089935303, + "logps/chosen": -454.82904052734375, + "logps/rejected": -899.7261962890625, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.123469352722168, + "rewards/margins": 25.705184936523438, + "rewards/rejected": -32.82865524291992, + "step": 3832 + }, + { + "epoch": 2.384447900466563, + "grad_norm": 0.09399059414863586, + "learning_rate": 1.1387736284001846e-06, + "logits/chosen": -1.8209072351455688, + "logits/rejected": 1.4271893501281738, + "logps/chosen": -344.51904296875, + "logps/rejected": -894.5800170898438, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.07096529006958, + "rewards/margins": 31.631628036499023, + "rewards/rejected": -37.70259094238281, + "step": 3833 + }, + { + "epoch": 2.3850699844479006, + "grad_norm": 31.503684997558594, + "learning_rate": 1.1376210235131398e-06, + "logits/chosen": 1.3686542510986328, + "logits/rejected": 4.16377592086792, + "logps/chosen": -628.8628540039062, + "logps/rejected": -1065.0404052734375, + "loss": 0.2273, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.978160858154297, + "rewards/margins": 29.86235809326172, + "rewards/rejected": -41.84052276611328, + "step": 3834 + }, + { + "epoch": 2.385692068429238, + "grad_norm": 0.10063165426254272, + "learning_rate": 1.136468418626095e-06, + "logits/chosen": -2.5170469284057617, + "logits/rejected": 3.1774446964263916, + "logps/chosen": -293.3402404785156, + "logps/rejected": -847.16455078125, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.719707489013672, + "rewards/margins": 25.059696197509766, + "rewards/rejected": -30.779403686523438, + "step": 3835 + }, + { + "epoch": 2.3863141524105753, + "grad_norm": 0.06871528923511505, + "learning_rate": 1.1353158137390503e-06, + "logits/chosen": 0.4909989833831787, + "logits/rejected": 4.08764123916626, + "logps/chosen": -477.0809631347656, + "logps/rejected": -993.2660522460938, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.19843864440918, + "rewards/margins": 28.097043991088867, + "rewards/rejected": -37.29548645019531, + "step": 3836 + }, + { + "epoch": 2.386936236391913, + "grad_norm": 3.714032192903005e-09, + "learning_rate": 1.1341632088520057e-06, + "logits/chosen": 2.953510284423828, + "logits/rejected": 3.4700748920440674, + "logps/chosen": -828.0033569335938, + "logps/rejected": -1372.037353515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -18.156700134277344, + "rewards/margins": 40.094810485839844, + "rewards/rejected": -58.25151443481445, + "step": 3837 + }, + { + "epoch": 2.3875583203732504, + "grad_norm": 3.741096258163452, + "learning_rate": 1.133010603964961e-06, + "logits/chosen": 2.4105310440063477, + "logits/rejected": 2.8894474506378174, + "logps/chosen": -609.405029296875, + "logps/rejected": -775.5403442382812, + "loss": 0.0198, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.283312797546387, + "rewards/margins": 15.006702423095703, + "rewards/rejected": -26.290014266967773, + "step": 3838 + }, + { + "epoch": 2.388180404354588, + "grad_norm": 0.0006594893056899309, + "learning_rate": 1.1318579990779161e-06, + "logits/chosen": 1.8672668933868408, + "logits/rejected": 3.5272293090820312, + "logps/chosen": -618.9702758789062, + "logps/rejected": -1041.2235107421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.7736897468566895, + "rewards/margins": 31.124055862426758, + "rewards/rejected": -38.897743225097656, + "step": 3839 + }, + { + "epoch": 2.3888024883359256, + "grad_norm": 1.4465635633786889e-10, + "learning_rate": 1.1307053941908714e-06, + "logits/chosen": -0.799127459526062, + "logits/rejected": 3.0651865005493164, + "logps/chosen": -531.7684326171875, + "logps/rejected": -1095.8377685546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.762605667114258, + "rewards/margins": 38.612545013427734, + "rewards/rejected": -47.375152587890625, + "step": 3840 + }, + { + "epoch": 2.389424572317263, + "grad_norm": 3.5946497973782243e-06, + "learning_rate": 1.1295527893038268e-06, + "logits/chosen": -0.4164998531341553, + "logits/rejected": 2.9457249641418457, + "logps/chosen": -339.09429931640625, + "logps/rejected": -858.0877075195312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.4706830978393555, + "rewards/margins": 29.20252799987793, + "rewards/rejected": -33.67321014404297, + "step": 3841 + }, + { + "epoch": 2.3900466562986002, + "grad_norm": 0.37519222497940063, + "learning_rate": 1.128400184416782e-06, + "logits/chosen": -0.43005144596099854, + "logits/rejected": 4.186233043670654, + "logps/chosen": -359.9776916503906, + "logps/rejected": -879.0882568359375, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.06336498260498, + "rewards/margins": 25.423934936523438, + "rewards/rejected": -33.487300872802734, + "step": 3842 + }, + { + "epoch": 2.3906687402799376, + "grad_norm": 2.3764117941027507e-05, + "learning_rate": 1.1272475795297373e-06, + "logits/chosen": -2.9976296424865723, + "logits/rejected": 2.8074846267700195, + "logps/chosen": -442.12554931640625, + "logps/rejected": -1336.9715576171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.334522247314453, + "rewards/margins": 46.972328186035156, + "rewards/rejected": -61.306854248046875, + "step": 3843 + }, + { + "epoch": 2.3912908242612754, + "grad_norm": 0.03182898834347725, + "learning_rate": 1.1260949746426927e-06, + "logits/chosen": 0.5130317807197571, + "logits/rejected": 2.4486565589904785, + "logps/chosen": -550.2568969726562, + "logps/rejected": -907.3411865234375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.3552885055542, + "rewards/margins": 28.428510665893555, + "rewards/rejected": -36.78379821777344, + "step": 3844 + }, + { + "epoch": 2.3919129082426127, + "grad_norm": 0.09916325658559799, + "learning_rate": 1.124942369755648e-06, + "logits/chosen": -0.9081926345825195, + "logits/rejected": 3.497873306274414, + "logps/chosen": -487.79779052734375, + "logps/rejected": -1203.325927734375, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.939855575561523, + "rewards/margins": 35.823936462402344, + "rewards/rejected": -40.7637939453125, + "step": 3845 + }, + { + "epoch": 2.39253499222395, + "grad_norm": 0.006481132935732603, + "learning_rate": 1.1237897648686031e-06, + "logits/chosen": -3.0908753871917725, + "logits/rejected": 2.7621371746063232, + "logps/chosen": -401.870849609375, + "logps/rejected": -1008.1326904296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.672129154205322, + "rewards/margins": 31.289287567138672, + "rewards/rejected": -37.9614143371582, + "step": 3846 + }, + { + "epoch": 2.393157076205288, + "grad_norm": 7.337146598729305e-06, + "learning_rate": 1.1226371599815584e-06, + "logits/chosen": -0.8336185216903687, + "logits/rejected": 2.953880548477173, + "logps/chosen": -547.261962890625, + "logps/rejected": -1239.45263671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.123262405395508, + "rewards/margins": 46.47114181518555, + "rewards/rejected": -55.59440612792969, + "step": 3847 + }, + { + "epoch": 2.393779160186625, + "grad_norm": 2.9044236725894734e-05, + "learning_rate": 1.1214845550945138e-06, + "logits/chosen": 1.4980781078338623, + "logits/rejected": 3.6079821586608887, + "logps/chosen": -619.3064575195312, + "logps/rejected": -1130.49755859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.297479629516602, + "rewards/margins": 36.90601348876953, + "rewards/rejected": -49.20349884033203, + "step": 3848 + }, + { + "epoch": 2.3944012441679625, + "grad_norm": 1.0253877639770508, + "learning_rate": 1.120331950207469e-06, + "logits/chosen": 1.0810809135437012, + "logits/rejected": 4.039799690246582, + "logps/chosen": -595.2739868164062, + "logps/rejected": -1004.406982421875, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.904537200927734, + "rewards/margins": 25.0423583984375, + "rewards/rejected": -33.946895599365234, + "step": 3849 + }, + { + "epoch": 2.3950233281493003, + "grad_norm": 0.003972693812102079, + "learning_rate": 1.1191793453204243e-06, + "logits/chosen": 0.4832313358783722, + "logits/rejected": 2.6967647075653076, + "logps/chosen": -546.1810913085938, + "logps/rejected": -988.8588256835938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.466804504394531, + "rewards/margins": 35.22352600097656, + "rewards/rejected": -43.690330505371094, + "step": 3850 + }, + { + "epoch": 2.3956454121306376, + "grad_norm": 0.010902078822255135, + "learning_rate": 1.1180267404333795e-06, + "logits/chosen": -1.4263477325439453, + "logits/rejected": 1.1521247625350952, + "logps/chosen": -418.4461364746094, + "logps/rejected": -798.664794921875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.85601806640625, + "rewards/margins": 20.761884689331055, + "rewards/rejected": -30.617904663085938, + "step": 3851 + }, + { + "epoch": 2.396267496111975, + "grad_norm": 0.0003886328195221722, + "learning_rate": 1.116874135546335e-06, + "logits/chosen": 1.5249922275543213, + "logits/rejected": 3.464700698852539, + "logps/chosen": -502.8961181640625, + "logps/rejected": -1032.2509765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.9890570640563965, + "rewards/margins": 36.86589431762695, + "rewards/rejected": -42.854949951171875, + "step": 3852 + }, + { + "epoch": 2.3968895800933128, + "grad_norm": 0.0013575759949162602, + "learning_rate": 1.1157215306592901e-06, + "logits/chosen": -0.5717923641204834, + "logits/rejected": 3.2800910472869873, + "logps/chosen": -446.5240478515625, + "logps/rejected": -898.409423828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.479028701782227, + "rewards/margins": 28.62911605834961, + "rewards/rejected": -37.10814666748047, + "step": 3853 + }, + { + "epoch": 2.39751166407465, + "grad_norm": 33.908782958984375, + "learning_rate": 1.1145689257722454e-06, + "logits/chosen": 1.4796611070632935, + "logits/rejected": 2.9911272525787354, + "logps/chosen": -736.0274658203125, + "logps/rejected": -1020.3888549804688, + "loss": 0.7557, + "rewards/accuracies": 0.875, + "rewards/chosen": -17.031911849975586, + "rewards/margins": 19.36725616455078, + "rewards/rejected": -36.399169921875, + "step": 3854 + }, + { + "epoch": 2.3981337480559874, + "grad_norm": 4.587909643305466e-06, + "learning_rate": 1.1134163208852008e-06, + "logits/chosen": 0.44414985179901123, + "logits/rejected": 3.9183576107025146, + "logps/chosen": -463.86669921875, + "logps/rejected": -967.0587768554688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.996728897094727, + "rewards/margins": 27.84886360168457, + "rewards/rejected": -34.8455924987793, + "step": 3855 + }, + { + "epoch": 2.3987558320373252, + "grad_norm": 5.694292326552386e-07, + "learning_rate": 1.1122637159981558e-06, + "logits/chosen": -0.7950145602226257, + "logits/rejected": 3.101360559463501, + "logps/chosen": -555.7838745117188, + "logps/rejected": -1047.280029296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.646661758422852, + "rewards/margins": 28.201671600341797, + "rewards/rejected": -36.84833526611328, + "step": 3856 + }, + { + "epoch": 2.3993779160186626, + "grad_norm": 4.2507390389801e-06, + "learning_rate": 1.111111111111111e-06, + "logits/chosen": 3.1061859130859375, + "logits/rejected": 3.111227035522461, + "logps/chosen": -794.248779296875, + "logps/rejected": -1133.7769775390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.302314758300781, + "rewards/margins": 34.15514373779297, + "rewards/rejected": -45.457462310791016, + "step": 3857 + }, + { + "epoch": 2.4, + "grad_norm": 0.0018757757497951388, + "learning_rate": 1.1099585062240665e-06, + "logits/chosen": 0.4937957525253296, + "logits/rejected": 3.476731300354004, + "logps/chosen": -680.0784301757812, + "logps/rejected": -1116.152099609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.059968948364258, + "rewards/margins": 34.34630584716797, + "rewards/rejected": -42.40627670288086, + "step": 3858 + }, + { + "epoch": 2.4006220839813377, + "grad_norm": 0.0005473470664583147, + "learning_rate": 1.1088059013370217e-06, + "logits/chosen": 0.9986388087272644, + "logits/rejected": 3.233471155166626, + "logps/chosen": -412.994873046875, + "logps/rejected": -988.3786010742188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.322336673736572, + "rewards/margins": 37.020713806152344, + "rewards/rejected": -44.343048095703125, + "step": 3859 + }, + { + "epoch": 2.401244167962675, + "grad_norm": 2.3679943339516285e-08, + "learning_rate": 1.107653296449977e-06, + "logits/chosen": 1.7901991605758667, + "logits/rejected": 3.3682918548583984, + "logps/chosen": -713.5994873046875, + "logps/rejected": -1195.774658203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.00442886352539, + "rewards/margins": 42.90016555786133, + "rewards/rejected": -55.90459442138672, + "step": 3860 + }, + { + "epoch": 2.4018662519440124, + "grad_norm": 52.9873046875, + "learning_rate": 1.1065006915629324e-06, + "logits/chosen": 0.24488091468811035, + "logits/rejected": 1.826075792312622, + "logps/chosen": -396.94622802734375, + "logps/rejected": -682.343017578125, + "loss": 0.7565, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.514383316040039, + "rewards/margins": 20.130586624145508, + "rewards/rejected": -24.644969940185547, + "step": 3861 + }, + { + "epoch": 2.4024883359253497, + "grad_norm": 0.00010500989446882159, + "learning_rate": 1.1053480866758876e-06, + "logits/chosen": -0.6162758469581604, + "logits/rejected": 2.7350568771362305, + "logps/chosen": -506.5060119628906, + "logps/rejected": -1002.4456787109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.694830894470215, + "rewards/margins": 31.44960594177246, + "rewards/rejected": -37.14443588256836, + "step": 3862 + }, + { + "epoch": 2.4031104199066875, + "grad_norm": 0.437195360660553, + "learning_rate": 1.1041954817888428e-06, + "logits/chosen": -2.221411943435669, + "logits/rejected": 2.240596294403076, + "logps/chosen": -397.92572021484375, + "logps/rejected": -886.3359375, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.60426139831543, + "rewards/margins": 26.445598602294922, + "rewards/rejected": -34.049861907958984, + "step": 3863 + }, + { + "epoch": 2.403732503888025, + "grad_norm": 0.0330638512969017, + "learning_rate": 1.103042876901798e-06, + "logits/chosen": -1.5246604681015015, + "logits/rejected": 3.694162368774414, + "logps/chosen": -434.1704406738281, + "logps/rejected": -949.82373046875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.017059326171875, + "rewards/margins": 29.48417854309082, + "rewards/rejected": -37.50123596191406, + "step": 3864 + }, + { + "epoch": 2.404354587869362, + "grad_norm": 0.0012657060287892818, + "learning_rate": 1.1018902720147535e-06, + "logits/chosen": -0.6471145153045654, + "logits/rejected": 2.905531167984009, + "logps/chosen": -530.5743408203125, + "logps/rejected": -1086.3309326171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.827986717224121, + "rewards/margins": 35.81536865234375, + "rewards/rejected": -43.64336013793945, + "step": 3865 + }, + { + "epoch": 2.4049766718507, + "grad_norm": 9.531051858857609e-08, + "learning_rate": 1.1007376671277087e-06, + "logits/chosen": 0.3572736084461212, + "logits/rejected": 3.8673791885375977, + "logps/chosen": -654.2926025390625, + "logps/rejected": -1154.85888671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.924564361572266, + "rewards/margins": 35.805335998535156, + "rewards/rejected": -44.729896545410156, + "step": 3866 + }, + { + "epoch": 2.4055987558320373, + "grad_norm": 0.0006852780352346599, + "learning_rate": 1.099585062240664e-06, + "logits/chosen": -1.9400222301483154, + "logits/rejected": 2.281996726989746, + "logps/chosen": -417.958984375, + "logps/rejected": -963.850830078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.913776397705078, + "rewards/margins": 31.837993621826172, + "rewards/rejected": -37.75177001953125, + "step": 3867 + }, + { + "epoch": 2.4062208398133746, + "grad_norm": 0.7762795090675354, + "learning_rate": 1.0984324573536194e-06, + "logits/chosen": -1.367218255996704, + "logits/rejected": 2.897139072418213, + "logps/chosen": -401.431884765625, + "logps/rejected": -1006.3223876953125, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.719179153442383, + "rewards/margins": 31.281005859375, + "rewards/rejected": -40.000186920166016, + "step": 3868 + }, + { + "epoch": 2.4068429237947124, + "grad_norm": 1.8480974176782183e-05, + "learning_rate": 1.0972798524665746e-06, + "logits/chosen": 0.2994877099990845, + "logits/rejected": 2.9668407440185547, + "logps/chosen": -570.2740478515625, + "logps/rejected": -981.5447998046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.978969573974609, + "rewards/margins": 30.225505828857422, + "rewards/rejected": -35.204471588134766, + "step": 3869 + }, + { + "epoch": 2.4074650077760498, + "grad_norm": 0.4169832170009613, + "learning_rate": 1.0961272475795298e-06, + "logits/chosen": -2.425063133239746, + "logits/rejected": 2.970721960067749, + "logps/chosen": -316.7384948730469, + "logps/rejected": -876.7730712890625, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.8234100341796875, + "rewards/margins": 26.405567169189453, + "rewards/rejected": -32.228973388671875, + "step": 3870 + }, + { + "epoch": 2.408087091757387, + "grad_norm": 0.008011666126549244, + "learning_rate": 1.094974642692485e-06, + "logits/chosen": -1.890117883682251, + "logits/rejected": 1.9428832530975342, + "logps/chosen": -498.21478271484375, + "logps/rejected": -1003.7666015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.868483543395996, + "rewards/margins": 26.999061584472656, + "rewards/rejected": -36.8675422668457, + "step": 3871 + }, + { + "epoch": 2.408709175738725, + "grad_norm": 0.014141597785055637, + "learning_rate": 1.0938220378054405e-06, + "logits/chosen": -3.8100922107696533, + "logits/rejected": 1.1407690048217773, + "logps/chosen": -336.95751953125, + "logps/rejected": -872.5132446289062, + "loss": 0.0867, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.890573501586914, + "rewards/margins": 26.666492462158203, + "rewards/rejected": -33.55706787109375, + "step": 3872 + }, + { + "epoch": 2.4093312597200622, + "grad_norm": 0.0007986845448613167, + "learning_rate": 1.0926694329183957e-06, + "logits/chosen": -2.3703830242156982, + "logits/rejected": 2.6667072772979736, + "logps/chosen": -360.1922302246094, + "logps/rejected": -948.6698608398438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.256704330444336, + "rewards/margins": 34.69185256958008, + "rewards/rejected": -42.94855499267578, + "step": 3873 + }, + { + "epoch": 2.4099533437013996, + "grad_norm": 0.1704350709915161, + "learning_rate": 1.091516828031351e-06, + "logits/chosen": 1.2536743879318237, + "logits/rejected": 1.5354732275009155, + "logps/chosen": -570.9561157226562, + "logps/rejected": -839.4619750976562, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.871989250183105, + "rewards/margins": 22.955930709838867, + "rewards/rejected": -34.827919006347656, + "step": 3874 + }, + { + "epoch": 2.4105754276827374, + "grad_norm": 0.0066949715837836266, + "learning_rate": 1.0903642231443061e-06, + "logits/chosen": -1.0441203117370605, + "logits/rejected": 4.152888298034668, + "logps/chosen": -414.62628173828125, + "logps/rejected": -960.6255493164062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.822000503540039, + "rewards/margins": 23.903718948364258, + "rewards/rejected": -31.725719451904297, + "step": 3875 + }, + { + "epoch": 2.4111975116640747, + "grad_norm": 0.07872223109006882, + "learning_rate": 1.0892116182572616e-06, + "logits/chosen": 0.5356428623199463, + "logits/rejected": 2.545949697494507, + "logps/chosen": -443.02252197265625, + "logps/rejected": -925.462646484375, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.831305503845215, + "rewards/margins": 27.70733070373535, + "rewards/rejected": -34.53863525390625, + "step": 3876 + }, + { + "epoch": 2.411819595645412, + "grad_norm": 0.0002740553754847497, + "learning_rate": 1.0880590133702168e-06, + "logits/chosen": 1.6327204704284668, + "logits/rejected": 4.18537712097168, + "logps/chosen": -581.2978515625, + "logps/rejected": -1052.298583984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.621686935424805, + "rewards/margins": 25.028202056884766, + "rewards/rejected": -34.64988708496094, + "step": 3877 + }, + { + "epoch": 2.41244167962675, + "grad_norm": 0.010209181345999241, + "learning_rate": 1.086906408483172e-06, + "logits/chosen": -0.8912521600723267, + "logits/rejected": 3.182377815246582, + "logps/chosen": -574.5421142578125, + "logps/rejected": -1049.0518798828125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.360973358154297, + "rewards/margins": 24.6096134185791, + "rewards/rejected": -34.970584869384766, + "step": 3878 + }, + { + "epoch": 2.413063763608087, + "grad_norm": 1.7555263184476644e-05, + "learning_rate": 1.0857538035961275e-06, + "logits/chosen": 1.3117332458496094, + "logits/rejected": 5.049984931945801, + "logps/chosen": -458.4806213378906, + "logps/rejected": -928.1309814453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.135470390319824, + "rewards/margins": 27.78437614440918, + "rewards/rejected": -33.91984558105469, + "step": 3879 + }, + { + "epoch": 2.4136858475894245, + "grad_norm": 1.8339464664459229, + "learning_rate": 1.0846011987090827e-06, + "logits/chosen": -0.8407329320907593, + "logits/rejected": 2.9085705280303955, + "logps/chosen": -490.6579895019531, + "logps/rejected": -1031.06787109375, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.46267032623291, + "rewards/margins": 32.82110595703125, + "rewards/rejected": -40.283775329589844, + "step": 3880 + }, + { + "epoch": 2.414307931570762, + "grad_norm": 1.7781831047614105e-05, + "learning_rate": 1.083448593822038e-06, + "logits/chosen": -0.6590343713760376, + "logits/rejected": 2.556335926055908, + "logps/chosen": -533.1664428710938, + "logps/rejected": -1097.8114013671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.707242965698242, + "rewards/margins": 37.932315826416016, + "rewards/rejected": -48.639564514160156, + "step": 3881 + }, + { + "epoch": 2.4149300155520996, + "grad_norm": 0.018035000190138817, + "learning_rate": 1.0822959889349931e-06, + "logits/chosen": 0.9073091149330139, + "logits/rejected": 2.709404945373535, + "logps/chosen": -545.5582885742188, + "logps/rejected": -924.0416870117188, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.252019882202148, + "rewards/margins": 27.85335922241211, + "rewards/rejected": -36.10538101196289, + "step": 3882 + }, + { + "epoch": 2.415552099533437, + "grad_norm": 0.00019639487436506897, + "learning_rate": 1.0811433840479486e-06, + "logits/chosen": 0.6960997581481934, + "logits/rejected": 3.6727519035339355, + "logps/chosen": -628.2368774414062, + "logps/rejected": -1225.331298828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.303197860717773, + "rewards/margins": 40.59136199951172, + "rewards/rejected": -49.89455795288086, + "step": 3883 + }, + { + "epoch": 2.4161741835147743, + "grad_norm": 0.02851576916873455, + "learning_rate": 1.0799907791609038e-06, + "logits/chosen": 0.7198000550270081, + "logits/rejected": 2.796893835067749, + "logps/chosen": -575.7562255859375, + "logps/rejected": -1004.9088745117188, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.440130233764648, + "rewards/margins": 30.360107421875, + "rewards/rejected": -38.80023956298828, + "step": 3884 + }, + { + "epoch": 2.416796267496112, + "grad_norm": 0.005485597066581249, + "learning_rate": 1.078838174273859e-06, + "logits/chosen": -1.3771247863769531, + "logits/rejected": 2.6898508071899414, + "logps/chosen": -405.0865478515625, + "logps/rejected": -927.6209716796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.677170753479004, + "rewards/margins": 31.92575454711914, + "rewards/rejected": -38.60292434692383, + "step": 3885 + }, + { + "epoch": 2.4174183514774494, + "grad_norm": 0.0021522575989365578, + "learning_rate": 1.0776855693868142e-06, + "logits/chosen": 1.534578561782837, + "logits/rejected": 3.6858203411102295, + "logps/chosen": -449.2783203125, + "logps/rejected": -860.0929565429688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.283685684204102, + "rewards/margins": 24.695194244384766, + "rewards/rejected": -33.978878021240234, + "step": 3886 + }, + { + "epoch": 2.4180404354587868, + "grad_norm": 0.00910354033112526, + "learning_rate": 1.0765329644997697e-06, + "logits/chosen": -2.305527448654175, + "logits/rejected": 1.6769919395446777, + "logps/chosen": -355.90106201171875, + "logps/rejected": -976.04443359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.1197309494018555, + "rewards/margins": 37.60846710205078, + "rewards/rejected": -44.72819900512695, + "step": 3887 + }, + { + "epoch": 2.4186625194401246, + "grad_norm": 1.590985831967373e-08, + "learning_rate": 1.075380359612725e-06, + "logits/chosen": -0.454487681388855, + "logits/rejected": 3.099392890930176, + "logps/chosen": -555.4652099609375, + "logps/rejected": -1095.21240234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.411281585693359, + "rewards/margins": 31.243457794189453, + "rewards/rejected": -37.65473937988281, + "step": 3888 + }, + { + "epoch": 2.419284603421462, + "grad_norm": 0.0012218141928315163, + "learning_rate": 1.0742277547256801e-06, + "logits/chosen": -0.7278669476509094, + "logits/rejected": 1.473168969154358, + "logps/chosen": -443.8373107910156, + "logps/rejected": -928.9570922851562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.763683795928955, + "rewards/margins": 28.198726654052734, + "rewards/rejected": -35.96240997314453, + "step": 3889 + }, + { + "epoch": 2.4199066874027992, + "grad_norm": 4.316197009757161e-05, + "learning_rate": 1.0730751498386354e-06, + "logits/chosen": 0.8848274946212769, + "logits/rejected": 3.4951579570770264, + "logps/chosen": -535.738525390625, + "logps/rejected": -1054.58544921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.603253364562988, + "rewards/margins": 32.85493469238281, + "rewards/rejected": -42.458187103271484, + "step": 3890 + }, + { + "epoch": 2.420528771384137, + "grad_norm": 0.00023110301117412746, + "learning_rate": 1.0719225449515906e-06, + "logits/chosen": -2.57528018951416, + "logits/rejected": 3.3211045265197754, + "logps/chosen": -384.0837707519531, + "logps/rejected": -999.553466796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.611985206604004, + "rewards/margins": 29.31735610961914, + "rewards/rejected": -38.92934036254883, + "step": 3891 + }, + { + "epoch": 2.4211508553654744, + "grad_norm": 0.00010306945478077978, + "learning_rate": 1.0707699400645458e-06, + "logits/chosen": 0.6355469822883606, + "logits/rejected": 3.7850501537323, + "logps/chosen": -430.77984619140625, + "logps/rejected": -878.0767822265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.622600078582764, + "rewards/margins": 27.778318405151367, + "rewards/rejected": -33.400917053222656, + "step": 3892 + }, + { + "epoch": 2.4217729393468117, + "grad_norm": 4.880786491412437e-06, + "learning_rate": 1.0696173351775012e-06, + "logits/chosen": -0.20533713698387146, + "logits/rejected": 2.7818212509155273, + "logps/chosen": -484.79132080078125, + "logps/rejected": -1032.011474609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.599923133850098, + "rewards/margins": 35.451377868652344, + "rewards/rejected": -46.051307678222656, + "step": 3893 + }, + { + "epoch": 2.4223950233281495, + "grad_norm": 0.0029728247318416834, + "learning_rate": 1.0684647302904565e-06, + "logits/chosen": -1.0218257904052734, + "logits/rejected": 1.424391269683838, + "logps/chosen": -387.3336181640625, + "logps/rejected": -822.9134521484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.986425876617432, + "rewards/margins": 28.502716064453125, + "rewards/rejected": -35.48914337158203, + "step": 3894 + }, + { + "epoch": 2.423017107309487, + "grad_norm": 3.79450602849829e-06, + "learning_rate": 1.0673121254034117e-06, + "logits/chosen": 1.6423513889312744, + "logits/rejected": 2.8541207313537598, + "logps/chosen": -503.12030029296875, + "logps/rejected": -846.1358032226562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.122786045074463, + "rewards/margins": 26.406681060791016, + "rewards/rejected": -33.52946472167969, + "step": 3895 + }, + { + "epoch": 2.423639191290824, + "grad_norm": 0.07275314629077911, + "learning_rate": 1.0661595205163671e-06, + "logits/chosen": -0.7622900009155273, + "logits/rejected": 3.9825644493103027, + "logps/chosen": -450.01629638671875, + "logps/rejected": -1051.365234375, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.296741485595703, + "rewards/margins": 27.42078971862793, + "rewards/rejected": -35.717533111572266, + "step": 3896 + }, + { + "epoch": 2.424261275272162, + "grad_norm": 1.1182972192764282, + "learning_rate": 1.0650069156293224e-06, + "logits/chosen": 1.8294832706451416, + "logits/rejected": -0.14126360416412354, + "logps/chosen": -713.759033203125, + "logps/rejected": -866.5440673828125, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.934208869934082, + "rewards/margins": 23.563343048095703, + "rewards/rejected": -33.49755096435547, + "step": 3897 + }, + { + "epoch": 2.4248833592534993, + "grad_norm": 4.143659680266865e-06, + "learning_rate": 1.0638543107422776e-06, + "logits/chosen": -1.2462234497070312, + "logits/rejected": 2.659334182739258, + "logps/chosen": -287.9202880859375, + "logps/rejected": -765.412109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.872437000274658, + "rewards/margins": 28.731220245361328, + "rewards/rejected": -32.60365676879883, + "step": 3898 + }, + { + "epoch": 2.4255054432348366, + "grad_norm": 4.975401225237874e-06, + "learning_rate": 1.0627017058552328e-06, + "logits/chosen": 1.7069756984710693, + "logits/rejected": 3.591083526611328, + "logps/chosen": -539.0305786132812, + "logps/rejected": -873.288330078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.884143352508545, + "rewards/margins": 27.36844253540039, + "rewards/rejected": -35.252586364746094, + "step": 3899 + }, + { + "epoch": 2.426127527216174, + "grad_norm": 0.014776756055653095, + "learning_rate": 1.0615491009681882e-06, + "logits/chosen": -0.04439480975270271, + "logits/rejected": 2.4738411903381348, + "logps/chosen": -548.758544921875, + "logps/rejected": -1041.307373046875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.613590717315674, + "rewards/margins": 30.745086669921875, + "rewards/rejected": -37.35867691040039, + "step": 3900 + }, + { + "epoch": 2.4267496111975118, + "grad_norm": 0.00023643742315471172, + "learning_rate": 1.0603964960811435e-06, + "logits/chosen": 1.3689160346984863, + "logits/rejected": 4.3477959632873535, + "logps/chosen": -502.134033203125, + "logps/rejected": -981.1270751953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.091121673583984, + "rewards/margins": 33.34466552734375, + "rewards/rejected": -40.435787200927734, + "step": 3901 + }, + { + "epoch": 2.427371695178849, + "grad_norm": 0.24226395785808563, + "learning_rate": 1.0592438911940987e-06, + "logits/chosen": 2.8122005462646484, + "logits/rejected": 5.458620071411133, + "logps/chosen": -536.3717651367188, + "logps/rejected": -1013.8870849609375, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.13599967956543, + "rewards/margins": 28.089576721191406, + "rewards/rejected": -38.2255744934082, + "step": 3902 + }, + { + "epoch": 2.4279937791601864, + "grad_norm": 2.852854095181101e-06, + "learning_rate": 1.058091286307054e-06, + "logits/chosen": -1.2406141757965088, + "logits/rejected": 3.444408416748047, + "logps/chosen": -490.6299743652344, + "logps/rejected": -1048.380859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.690634727478027, + "rewards/margins": 33.460086822509766, + "rewards/rejected": -45.15072250366211, + "step": 3903 + }, + { + "epoch": 2.4286158631415242, + "grad_norm": 0.05775154381990433, + "learning_rate": 1.0569386814200094e-06, + "logits/chosen": 1.434791922569275, + "logits/rejected": 2.4212846755981445, + "logps/chosen": -573.909423828125, + "logps/rejected": -889.8311157226562, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.915290355682373, + "rewards/margins": 25.13017463684082, + "rewards/rejected": -32.04546356201172, + "step": 3904 + }, + { + "epoch": 2.4292379471228616, + "grad_norm": 0.019974645227193832, + "learning_rate": 1.0557860765329646e-06, + "logits/chosen": 1.4990663528442383, + "logits/rejected": 2.492790699005127, + "logps/chosen": -607.529296875, + "logps/rejected": -915.8584594726562, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.066529273986816, + "rewards/margins": 22.119138717651367, + "rewards/rejected": -29.185667037963867, + "step": 3905 + }, + { + "epoch": 2.429860031104199, + "grad_norm": 0.0003226126718800515, + "learning_rate": 1.0546334716459198e-06, + "logits/chosen": 0.2538611590862274, + "logits/rejected": 3.2478034496307373, + "logps/chosen": -505.583251953125, + "logps/rejected": -998.1884765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.00172233581543, + "rewards/margins": 26.9884033203125, + "rewards/rejected": -37.99012756347656, + "step": 3906 + }, + { + "epoch": 2.4304821150855367, + "grad_norm": 0.36915433406829834, + "learning_rate": 1.0534808667588752e-06, + "logits/chosen": 0.7074757218360901, + "logits/rejected": 2.855924606323242, + "logps/chosen": -600.1455688476562, + "logps/rejected": -972.5783081054688, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.70722770690918, + "rewards/margins": 26.310523986816406, + "rewards/rejected": -36.01775360107422, + "step": 3907 + }, + { + "epoch": 2.431104199066874, + "grad_norm": 1.5006874036771478e-06, + "learning_rate": 1.0523282618718305e-06, + "logits/chosen": -0.2986694574356079, + "logits/rejected": 2.294739246368408, + "logps/chosen": -521.6359252929688, + "logps/rejected": -982.6089477539062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.400278091430664, + "rewards/margins": 31.926898956298828, + "rewards/rejected": -42.327178955078125, + "step": 3908 + }, + { + "epoch": 2.4317262830482114, + "grad_norm": 0.000786642893217504, + "learning_rate": 1.0511756569847857e-06, + "logits/chosen": -1.4148194789886475, + "logits/rejected": 0.4607158899307251, + "logps/chosen": -343.89404296875, + "logps/rejected": -714.0547485351562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.373026371002197, + "rewards/margins": 23.576187133789062, + "rewards/rejected": -28.949214935302734, + "step": 3909 + }, + { + "epoch": 2.432348367029549, + "grad_norm": 8.006913185119629, + "learning_rate": 1.050023052097741e-06, + "logits/chosen": 2.225314140319824, + "logits/rejected": 2.388673782348633, + "logps/chosen": -775.02392578125, + "logps/rejected": -1070.6405029296875, + "loss": 0.0405, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.142297744750977, + "rewards/margins": 25.948198318481445, + "rewards/rejected": -38.09049987792969, + "step": 3910 + }, + { + "epoch": 2.4329704510108865, + "grad_norm": 0.15609769523143768, + "learning_rate": 1.0488704472106964e-06, + "logits/chosen": 1.5960623025894165, + "logits/rejected": 3.988802671432495, + "logps/chosen": -606.6814575195312, + "logps/rejected": -972.6724243164062, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.931401252746582, + "rewards/margins": 21.659502029418945, + "rewards/rejected": -31.590904235839844, + "step": 3911 + }, + { + "epoch": 2.433592534992224, + "grad_norm": 0.0025720945559442043, + "learning_rate": 1.0477178423236516e-06, + "logits/chosen": -0.44914060831069946, + "logits/rejected": 3.133753776550293, + "logps/chosen": -466.37982177734375, + "logps/rejected": -967.54736328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.205682754516602, + "rewards/margins": 31.63439178466797, + "rewards/rejected": -37.84007263183594, + "step": 3912 + }, + { + "epoch": 2.4342146189735616, + "grad_norm": 0.0004372483235783875, + "learning_rate": 1.0465652374366068e-06, + "logits/chosen": 0.052958518266677856, + "logits/rejected": 4.425273895263672, + "logps/chosen": -494.8092041015625, + "logps/rejected": -966.57080078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.776680946350098, + "rewards/margins": 25.838638305664062, + "rewards/rejected": -32.615318298339844, + "step": 3913 + }, + { + "epoch": 2.434836702954899, + "grad_norm": 0.13159841299057007, + "learning_rate": 1.045412632549562e-06, + "logits/chosen": 0.2504895329475403, + "logits/rejected": 2.2940564155578613, + "logps/chosen": -595.7615356445312, + "logps/rejected": -949.5869750976562, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.673824310302734, + "rewards/margins": 22.880603790283203, + "rewards/rejected": -33.55442810058594, + "step": 3914 + }, + { + "epoch": 2.4354587869362363, + "grad_norm": 0.024950237944722176, + "learning_rate": 1.0442600276625175e-06, + "logits/chosen": -0.04401680454611778, + "logits/rejected": 3.7909388542175293, + "logps/chosen": -532.6128540039062, + "logps/rejected": -1144.41650390625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.95002269744873, + "rewards/margins": 38.728492736816406, + "rewards/rejected": -47.67851257324219, + "step": 3915 + }, + { + "epoch": 2.436080870917574, + "grad_norm": 8.862929098540917e-05, + "learning_rate": 1.0431074227754727e-06, + "logits/chosen": 1.5957598686218262, + "logits/rejected": 3.401139497756958, + "logps/chosen": -713.1761474609375, + "logps/rejected": -1047.773193359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.424721717834473, + "rewards/margins": 27.84493064880371, + "rewards/rejected": -39.2696533203125, + "step": 3916 + }, + { + "epoch": 2.4367029548989114, + "grad_norm": 3.7480590435734484e-06, + "learning_rate": 1.041954817888428e-06, + "logits/chosen": -0.12272043526172638, + "logits/rejected": 1.3370565176010132, + "logps/chosen": -578.545166015625, + "logps/rejected": -850.365234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.4892683029174805, + "rewards/margins": 22.693389892578125, + "rewards/rejected": -29.18265724182129, + "step": 3917 + }, + { + "epoch": 2.4373250388802488, + "grad_norm": 47.579586029052734, + "learning_rate": 1.0408022130013833e-06, + "logits/chosen": 2.956791877746582, + "logits/rejected": 3.071183681488037, + "logps/chosen": -763.1408081054688, + "logps/rejected": -979.1478881835938, + "loss": 1.6257, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.412786483764648, + "rewards/margins": 21.867210388183594, + "rewards/rejected": -35.279998779296875, + "step": 3918 + }, + { + "epoch": 2.437947122861586, + "grad_norm": 8.301009802380577e-05, + "learning_rate": 1.0396496081143386e-06, + "logits/chosen": -3.1810388565063477, + "logits/rejected": 0.9512239694595337, + "logps/chosen": -333.6005554199219, + "logps/rejected": -829.9498901367188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.965550422668457, + "rewards/margins": 23.553924560546875, + "rewards/rejected": -28.51947593688965, + "step": 3919 + }, + { + "epoch": 2.438569206842924, + "grad_norm": 8.164431619661627e-08, + "learning_rate": 1.0384970032272938e-06, + "logits/chosen": -1.9742231369018555, + "logits/rejected": 4.877246379852295, + "logps/chosen": -361.27850341796875, + "logps/rejected": -1050.592041015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.751445293426514, + "rewards/margins": 33.7140007019043, + "rewards/rejected": -38.46544647216797, + "step": 3920 + }, + { + "epoch": 2.4391912908242612, + "grad_norm": 7.0156575020519085e-06, + "learning_rate": 1.037344398340249e-06, + "logits/chosen": -0.6629587411880493, + "logits/rejected": 3.9200730323791504, + "logps/chosen": -420.0581970214844, + "logps/rejected": -914.085205078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.827775478363037, + "rewards/margins": 25.929183959960938, + "rewards/rejected": -33.756961822509766, + "step": 3921 + }, + { + "epoch": 2.4398133748055986, + "grad_norm": 4.913921657134779e-06, + "learning_rate": 1.0361917934532045e-06, + "logits/chosen": 0.21196871995925903, + "logits/rejected": 2.7537970542907715, + "logps/chosen": -604.2928466796875, + "logps/rejected": -1049.777099609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.621042251586914, + "rewards/margins": 28.787445068359375, + "rewards/rejected": -41.40848922729492, + "step": 3922 + }, + { + "epoch": 2.4404354587869364, + "grad_norm": 0.0062870606780052185, + "learning_rate": 1.0350391885661595e-06, + "logits/chosen": -0.6903063058853149, + "logits/rejected": 0.6735283136367798, + "logps/chosen": -436.35601806640625, + "logps/rejected": -874.8502197265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.20759391784668, + "rewards/margins": 29.023616790771484, + "rewards/rejected": -36.2312126159668, + "step": 3923 + }, + { + "epoch": 2.4410575427682737, + "grad_norm": 2.6101699859282235e-06, + "learning_rate": 1.033886583679115e-06, + "logits/chosen": 0.8353837728500366, + "logits/rejected": 4.180713176727295, + "logps/chosen": -465.27850341796875, + "logps/rejected": -924.72021484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.832421779632568, + "rewards/margins": 32.02825927734375, + "rewards/rejected": -38.860679626464844, + "step": 3924 + }, + { + "epoch": 2.441679626749611, + "grad_norm": 5.736229013564298e-06, + "learning_rate": 1.0327339787920701e-06, + "logits/chosen": -1.6903998851776123, + "logits/rejected": 3.183885097503662, + "logps/chosen": -390.8701477050781, + "logps/rejected": -957.973388671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.082308769226074, + "rewards/margins": 34.455909729003906, + "rewards/rejected": -39.53821563720703, + "step": 3925 + }, + { + "epoch": 2.442301710730949, + "grad_norm": 0.4600825905799866, + "learning_rate": 1.0315813739050254e-06, + "logits/chosen": -0.5375813245773315, + "logits/rejected": 2.952937126159668, + "logps/chosen": -584.4176025390625, + "logps/rejected": -1137.98681640625, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.461213111877441, + "rewards/margins": 27.309200286865234, + "rewards/rejected": -33.77041244506836, + "step": 3926 + }, + { + "epoch": 2.442923794712286, + "grad_norm": 10.128042221069336, + "learning_rate": 1.0304287690179806e-06, + "logits/chosen": 1.3842074871063232, + "logits/rejected": 2.9823427200317383, + "logps/chosen": -661.162841796875, + "logps/rejected": -954.5634155273438, + "loss": 0.0526, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.083999633789062, + "rewards/margins": 23.011592864990234, + "rewards/rejected": -33.0955924987793, + "step": 3927 + }, + { + "epoch": 2.4435458786936235, + "grad_norm": 0.00012620781490113586, + "learning_rate": 1.029276164130936e-06, + "logits/chosen": 1.541063904762268, + "logits/rejected": 4.459494590759277, + "logps/chosen": -573.2506713867188, + "logps/rejected": -1087.428955078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.81618595123291, + "rewards/margins": 31.231414794921875, + "rewards/rejected": -41.04759979248047, + "step": 3928 + }, + { + "epoch": 2.4441679626749613, + "grad_norm": 0.038373615592718124, + "learning_rate": 1.0281235592438912e-06, + "logits/chosen": 2.5318827629089355, + "logits/rejected": 3.6796557903289795, + "logps/chosen": -503.311767578125, + "logps/rejected": -754.3387451171875, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.870074272155762, + "rewards/margins": 20.023391723632812, + "rewards/rejected": -27.89346694946289, + "step": 3929 + }, + { + "epoch": 2.4447900466562986, + "grad_norm": 0.0009832432260736823, + "learning_rate": 1.0269709543568465e-06, + "logits/chosen": 1.1359636783599854, + "logits/rejected": 3.896103858947754, + "logps/chosen": -525.8956909179688, + "logps/rejected": -1105.4205322265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.109336853027344, + "rewards/margins": 34.28251647949219, + "rewards/rejected": -46.391849517822266, + "step": 3930 + }, + { + "epoch": 2.445412130637636, + "grad_norm": 0.3185408115386963, + "learning_rate": 1.025818349469802e-06, + "logits/chosen": 2.0907115936279297, + "logits/rejected": 2.1502089500427246, + "logps/chosen": -528.4436645507812, + "logps/rejected": -699.7049560546875, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.308149337768555, + "rewards/margins": 16.63261604309082, + "rewards/rejected": -21.940765380859375, + "step": 3931 + }, + { + "epoch": 2.4460342146189737, + "grad_norm": 1.571313212878067e-08, + "learning_rate": 1.0246657445827571e-06, + "logits/chosen": -2.6080470085144043, + "logits/rejected": 2.0387868881225586, + "logps/chosen": -386.8277587890625, + "logps/rejected": -1066.030029296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.407026290893555, + "rewards/margins": 36.1947021484375, + "rewards/rejected": -41.60173034667969, + "step": 3932 + }, + { + "epoch": 2.446656298600311, + "grad_norm": 4.174937930656597e-05, + "learning_rate": 1.0235131396957124e-06, + "logits/chosen": 0.15204091370105743, + "logits/rejected": 3.042105197906494, + "logps/chosen": -577.6109008789062, + "logps/rejected": -972.4326782226562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.046577453613281, + "rewards/margins": 27.047788619995117, + "rewards/rejected": -38.094364166259766, + "step": 3933 + }, + { + "epoch": 2.4472783825816484, + "grad_norm": 2.9769933007628424e-06, + "learning_rate": 1.0223605348086676e-06, + "logits/chosen": 0.32063624262809753, + "logits/rejected": 3.1496458053588867, + "logps/chosen": -464.86419677734375, + "logps/rejected": -989.5301513671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.432979583740234, + "rewards/margins": 32.06596374511719, + "rewards/rejected": -39.49894714355469, + "step": 3934 + }, + { + "epoch": 2.447900466562986, + "grad_norm": 0.002402025042101741, + "learning_rate": 1.021207929921623e-06, + "logits/chosen": 0.38487508893013, + "logits/rejected": 4.500364303588867, + "logps/chosen": -558.4069213867188, + "logps/rejected": -1054.4920654296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.773858070373535, + "rewards/margins": 25.580459594726562, + "rewards/rejected": -36.35431671142578, + "step": 3935 + }, + { + "epoch": 2.4485225505443236, + "grad_norm": 2.8169645247544395e-06, + "learning_rate": 1.0200553250345782e-06, + "logits/chosen": 0.5297533869743347, + "logits/rejected": 2.569145917892456, + "logps/chosen": -503.478515625, + "logps/rejected": -925.1766967773438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.786959648132324, + "rewards/margins": 30.930150985717773, + "rewards/rejected": -38.71710968017578, + "step": 3936 + }, + { + "epoch": 2.449144634525661, + "grad_norm": 2.3240552764036693e-05, + "learning_rate": 1.0189027201475335e-06, + "logits/chosen": 0.3535512089729309, + "logits/rejected": 3.5029115676879883, + "logps/chosen": -500.951416015625, + "logps/rejected": -1047.8524169921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.535778999328613, + "rewards/margins": 39.683143615722656, + "rewards/rejected": -48.21892547607422, + "step": 3937 + }, + { + "epoch": 2.4497667185069982, + "grad_norm": 0.000920445250812918, + "learning_rate": 1.0177501152604887e-06, + "logits/chosen": -0.4453803300857544, + "logits/rejected": 3.93086576461792, + "logps/chosen": -493.6942138671875, + "logps/rejected": -1113.318603515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.954602241516113, + "rewards/margins": 29.42861557006836, + "rewards/rejected": -39.383216857910156, + "step": 3938 + }, + { + "epoch": 2.450388802488336, + "grad_norm": 0.029540177434682846, + "learning_rate": 1.0165975103734441e-06, + "logits/chosen": -0.2629004716873169, + "logits/rejected": 0.7652404308319092, + "logps/chosen": -652.7841186523438, + "logps/rejected": -818.8164672851562, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.874147891998291, + "rewards/margins": 22.586145401000977, + "rewards/rejected": -28.46029281616211, + "step": 3939 + }, + { + "epoch": 2.4510108864696734, + "grad_norm": 0.22855834662914276, + "learning_rate": 1.0154449054863994e-06, + "logits/chosen": 0.16000841557979584, + "logits/rejected": 4.105987548828125, + "logps/chosen": -521.3453979492188, + "logps/rejected": -991.91650390625, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.033821105957031, + "rewards/margins": 25.448020935058594, + "rewards/rejected": -29.481842041015625, + "step": 3940 + }, + { + "epoch": 2.4516329704510107, + "grad_norm": 0.000634959724266082, + "learning_rate": 1.0142923005993546e-06, + "logits/chosen": 0.7463221549987793, + "logits/rejected": 3.813570499420166, + "logps/chosen": -609.4283447265625, + "logps/rejected": -1091.73388671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.175848007202148, + "rewards/margins": 26.847991943359375, + "rewards/rejected": -35.023841857910156, + "step": 3941 + }, + { + "epoch": 2.4522550544323485, + "grad_norm": 0.12524612247943878, + "learning_rate": 1.01313969571231e-06, + "logits/chosen": 1.860915184020996, + "logits/rejected": 4.84216833114624, + "logps/chosen": -530.1736450195312, + "logps/rejected": -1021.5438232421875, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.089037895202637, + "rewards/margins": 31.169198989868164, + "rewards/rejected": -40.258235931396484, + "step": 3942 + }, + { + "epoch": 2.452877138413686, + "grad_norm": 0.09399629384279251, + "learning_rate": 1.0119870908252652e-06, + "logits/chosen": 0.4427666664123535, + "logits/rejected": 3.365427017211914, + "logps/chosen": -567.5911865234375, + "logps/rejected": -981.8191528320312, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.774032592773438, + "rewards/margins": 24.488821029663086, + "rewards/rejected": -33.262855529785156, + "step": 3943 + }, + { + "epoch": 2.453499222395023, + "grad_norm": 1.8980324268341064, + "learning_rate": 1.0108344859382205e-06, + "logits/chosen": 2.3022541999816895, + "logits/rejected": 4.3741888999938965, + "logps/chosen": -640.0008544921875, + "logps/rejected": -1101.875, + "loss": 0.0071, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.442511558532715, + "rewards/margins": 29.548015594482422, + "rewards/rejected": -38.99052810668945, + "step": 3944 + }, + { + "epoch": 2.454121306376361, + "grad_norm": 0.0036252494901418686, + "learning_rate": 1.0096818810511757e-06, + "logits/chosen": 0.5579085350036621, + "logits/rejected": 0.8060776591300964, + "logps/chosen": -473.23016357421875, + "logps/rejected": -832.8922729492188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.750743865966797, + "rewards/margins": 28.729267120361328, + "rewards/rejected": -36.480010986328125, + "step": 3945 + }, + { + "epoch": 2.4547433903576983, + "grad_norm": 18.020030975341797, + "learning_rate": 1.0085292761641311e-06, + "logits/chosen": 3.9792816638946533, + "logits/rejected": 3.7229690551757812, + "logps/chosen": -703.1436767578125, + "logps/rejected": -970.5345458984375, + "loss": 0.0927, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.347864151000977, + "rewards/margins": 25.340940475463867, + "rewards/rejected": -37.688804626464844, + "step": 3946 + }, + { + "epoch": 2.4553654743390356, + "grad_norm": 0.0004174646455794573, + "learning_rate": 1.0073766712770863e-06, + "logits/chosen": 1.2467455863952637, + "logits/rejected": 3.5107333660125732, + "logps/chosen": -625.0026245117188, + "logps/rejected": -1077.629638671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.804898262023926, + "rewards/margins": 27.94451141357422, + "rewards/rejected": -40.74940872192383, + "step": 3947 + }, + { + "epoch": 2.4559875583203734, + "grad_norm": 0.002254350110888481, + "learning_rate": 1.0062240663900416e-06, + "logits/chosen": -1.3291007280349731, + "logits/rejected": 2.1017301082611084, + "logps/chosen": -494.86334228515625, + "logps/rejected": -916.5098266601562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.163529396057129, + "rewards/margins": 22.77774429321289, + "rewards/rejected": -29.941272735595703, + "step": 3948 + }, + { + "epoch": 2.4566096423017107, + "grad_norm": 0.05908467620611191, + "learning_rate": 1.0050714615029968e-06, + "logits/chosen": 2.6013591289520264, + "logits/rejected": 3.2219948768615723, + "logps/chosen": -668.7972412109375, + "logps/rejected": -958.492431640625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.942373275756836, + "rewards/margins": 26.03789520263672, + "rewards/rejected": -35.98026657104492, + "step": 3949 + }, + { + "epoch": 2.457231726283048, + "grad_norm": 0.041411809623241425, + "learning_rate": 1.0039188566159522e-06, + "logits/chosen": 1.6593561172485352, + "logits/rejected": 2.5953595638275146, + "logps/chosen": -699.587646484375, + "logps/rejected": -979.1827392578125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.904213905334473, + "rewards/margins": 24.109346389770508, + "rewards/rejected": -34.01355743408203, + "step": 3950 + }, + { + "epoch": 2.457853810264386, + "grad_norm": 0.012357287108898163, + "learning_rate": 1.0027662517289075e-06, + "logits/chosen": 1.712773084640503, + "logits/rejected": 3.507235288619995, + "logps/chosen": -491.7207336425781, + "logps/rejected": -882.9329833984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.298487663269043, + "rewards/margins": 24.476160049438477, + "rewards/rejected": -33.77465057373047, + "step": 3951 + }, + { + "epoch": 2.458475894245723, + "grad_norm": 0.01435577031224966, + "learning_rate": 1.0016136468418627e-06, + "logits/chosen": -1.2487645149230957, + "logits/rejected": 4.199995994567871, + "logps/chosen": -386.044921875, + "logps/rejected": -972.6033935546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.299281597137451, + "rewards/margins": 23.616321563720703, + "rewards/rejected": -26.91560173034668, + "step": 3952 + }, + { + "epoch": 2.4590979782270606, + "grad_norm": 8.740786142880097e-06, + "learning_rate": 1.0004610419548181e-06, + "logits/chosen": -2.7983384132385254, + "logits/rejected": 2.749145746231079, + "logps/chosen": -270.6513671875, + "logps/rejected": -954.5782470703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.159430980682373, + "rewards/margins": 33.22806930541992, + "rewards/rejected": -37.38750457763672, + "step": 3953 + }, + { + "epoch": 2.4597200622083983, + "grad_norm": 9.067174687515944e-06, + "learning_rate": 9.993084370677733e-07, + "logits/chosen": -3.095334529876709, + "logits/rejected": 4.330985069274902, + "logps/chosen": -353.81378173828125, + "logps/rejected": -1149.3958740234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.141190528869629, + "rewards/margins": 35.44187927246094, + "rewards/rejected": -40.583072662353516, + "step": 3954 + }, + { + "epoch": 2.4603421461897357, + "grad_norm": 5.59990294277668e-05, + "learning_rate": 9.981558321807286e-07, + "logits/chosen": -1.5986160039901733, + "logits/rejected": 4.103527545928955, + "logps/chosen": -443.53765869140625, + "logps/rejected": -1152.404052734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.053462982177734, + "rewards/margins": 37.27567672729492, + "rewards/rejected": -45.329139709472656, + "step": 3955 + }, + { + "epoch": 2.460964230171073, + "grad_norm": 1.8919844251286122e-06, + "learning_rate": 9.970032272936838e-07, + "logits/chosen": -1.1473727226257324, + "logits/rejected": 3.8609635829925537, + "logps/chosen": -369.2405700683594, + "logps/rejected": -948.0873413085938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.060483932495117, + "rewards/margins": 34.174312591552734, + "rewards/rejected": -39.234798431396484, + "step": 3956 + }, + { + "epoch": 2.4615863141524104, + "grad_norm": 1.6699502793926513e-06, + "learning_rate": 9.95850622406639e-07, + "logits/chosen": -1.3253543376922607, + "logits/rejected": 2.317295551300049, + "logps/chosen": -410.41363525390625, + "logps/rejected": -983.7108154296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.472805500030518, + "rewards/margins": 35.197509765625, + "rewards/rejected": -42.670310974121094, + "step": 3957 + }, + { + "epoch": 2.462208398133748, + "grad_norm": 17.654504776000977, + "learning_rate": 9.946980175195942e-07, + "logits/chosen": 1.3565376996994019, + "logits/rejected": 3.2873613834381104, + "logps/chosen": -685.8858642578125, + "logps/rejected": -1045.9437255859375, + "loss": 0.0998, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.6514360904693604, + "rewards/margins": 25.847148895263672, + "rewards/rejected": -29.498584747314453, + "step": 3958 + }, + { + "epoch": 2.4628304821150855, + "grad_norm": 0.0009128287783823907, + "learning_rate": 9.935454126325497e-07, + "logits/chosen": -0.545260488986969, + "logits/rejected": 3.329559803009033, + "logps/chosen": -495.82403564453125, + "logps/rejected": -995.2269287109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.437385559082031, + "rewards/margins": 28.629581451416016, + "rewards/rejected": -40.06696701049805, + "step": 3959 + }, + { + "epoch": 2.463452566096423, + "grad_norm": 0.07026296108961105, + "learning_rate": 9.92392807745505e-07, + "logits/chosen": 0.40682363510131836, + "logits/rejected": 4.326059341430664, + "logps/chosen": -515.2821044921875, + "logps/rejected": -1210.7884521484375, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.251470565795898, + "rewards/margins": 39.29925537109375, + "rewards/rejected": -48.550724029541016, + "step": 3960 + }, + { + "epoch": 2.4640746500777606, + "grad_norm": 0.6919077634811401, + "learning_rate": 9.912402028584601e-07, + "logits/chosen": -1.3996939659118652, + "logits/rejected": 1.9476600885391235, + "logps/chosen": -503.40545654296875, + "logps/rejected": -861.3834228515625, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.144523620605469, + "rewards/margins": 19.993408203125, + "rewards/rejected": -26.13793182373047, + "step": 3961 + }, + { + "epoch": 2.464696734059098, + "grad_norm": 0.33661141991615295, + "learning_rate": 9.900875979714154e-07, + "logits/chosen": -1.489673137664795, + "logits/rejected": 3.2342751026153564, + "logps/chosen": -340.26019287109375, + "logps/rejected": -831.0149536132812, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.035558700561523, + "rewards/margins": 22.35528564453125, + "rewards/rejected": -30.390844345092773, + "step": 3962 + }, + { + "epoch": 2.4653188180404353, + "grad_norm": 0.686103880405426, + "learning_rate": 9.889349930843708e-07, + "logits/chosen": -0.1386818289756775, + "logits/rejected": 3.1802377700805664, + "logps/chosen": -565.9710693359375, + "logps/rejected": -1054.5169677734375, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.683035850524902, + "rewards/margins": 23.252742767333984, + "rewards/rejected": -30.93577766418457, + "step": 3963 + }, + { + "epoch": 2.465940902021773, + "grad_norm": 0.025931192561984062, + "learning_rate": 9.87782388197326e-07, + "logits/chosen": -0.1779177188873291, + "logits/rejected": 1.1597424745559692, + "logps/chosen": -480.335693359375, + "logps/rejected": -774.178955078125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.642938613891602, + "rewards/margins": 17.06674575805664, + "rewards/rejected": -22.709684371948242, + "step": 3964 + }, + { + "epoch": 2.4665629860031104, + "grad_norm": 6.1447601318359375, + "learning_rate": 9.866297833102812e-07, + "logits/chosen": 0.5171284079551697, + "logits/rejected": 2.205604314804077, + "logps/chosen": -633.0610961914062, + "logps/rejected": -972.0784912109375, + "loss": 0.022, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.602591037750244, + "rewards/margins": 23.704069137573242, + "rewards/rejected": -31.306659698486328, + "step": 3965 + }, + { + "epoch": 2.4671850699844478, + "grad_norm": 0.0006154404254630208, + "learning_rate": 9.854771784232365e-07, + "logits/chosen": 0.5229877829551697, + "logits/rejected": 3.7279510498046875, + "logps/chosen": -525.7893676757812, + "logps/rejected": -907.7853393554688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.976510047912598, + "rewards/margins": 23.866352081298828, + "rewards/rejected": -30.84286117553711, + "step": 3966 + }, + { + "epoch": 2.4678071539657855, + "grad_norm": 0.0013302437728270888, + "learning_rate": 9.84324573536192e-07, + "logits/chosen": 0.6051331758499146, + "logits/rejected": 2.128033399581909, + "logps/chosen": -508.7525329589844, + "logps/rejected": -912.324951171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.178055763244629, + "rewards/margins": 27.136598587036133, + "rewards/rejected": -37.31465148925781, + "step": 3967 + }, + { + "epoch": 2.468429237947123, + "grad_norm": 0.0012908873613923788, + "learning_rate": 9.831719686491471e-07, + "logits/chosen": 1.2176079750061035, + "logits/rejected": 2.406261444091797, + "logps/chosen": -675.4288330078125, + "logps/rejected": -951.7867431640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.866052627563477, + "rewards/margins": 23.51856231689453, + "rewards/rejected": -36.384620666503906, + "step": 3968 + }, + { + "epoch": 2.46905132192846, + "grad_norm": 0.008842960000038147, + "learning_rate": 9.820193637621024e-07, + "logits/chosen": 0.6855583786964417, + "logits/rejected": 3.3531930446624756, + "logps/chosen": -558.2369384765625, + "logps/rejected": -1022.9857788085938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.03244686126709, + "rewards/margins": 28.938135147094727, + "rewards/rejected": -37.9705810546875, + "step": 3969 + }, + { + "epoch": 2.469673405909798, + "grad_norm": 0.056224849075078964, + "learning_rate": 9.808667588750578e-07, + "logits/chosen": 1.5705623626708984, + "logits/rejected": 3.3909265995025635, + "logps/chosen": -651.3974609375, + "logps/rejected": -993.1527099609375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.265022277832031, + "rewards/margins": 24.45557403564453, + "rewards/rejected": -35.72059631347656, + "step": 3970 + }, + { + "epoch": 2.4702954898911353, + "grad_norm": 12.649288177490234, + "learning_rate": 9.79714153988013e-07, + "logits/chosen": 0.9670317769050598, + "logits/rejected": 2.3478357791900635, + "logps/chosen": -656.9765625, + "logps/rejected": -1015.3643798828125, + "loss": 0.0526, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.8673834800720215, + "rewards/margins": 23.989347457885742, + "rewards/rejected": -29.856731414794922, + "step": 3971 + }, + { + "epoch": 2.4709175738724727, + "grad_norm": 0.20173630118370056, + "learning_rate": 9.785615491009682e-07, + "logits/chosen": 0.1819133162498474, + "logits/rejected": 3.700805425643921, + "logps/chosen": -584.2685546875, + "logps/rejected": -1063.6900634765625, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.745945930480957, + "rewards/margins": 33.38615417480469, + "rewards/rejected": -41.132102966308594, + "step": 3972 + }, + { + "epoch": 2.4715396578538105, + "grad_norm": 0.0004406968946568668, + "learning_rate": 9.774089442139235e-07, + "logits/chosen": 2.5935163497924805, + "logits/rejected": 3.5712637901306152, + "logps/chosen": -672.4451904296875, + "logps/rejected": -965.9010009765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.891907691955566, + "rewards/margins": 24.52800178527832, + "rewards/rejected": -35.4199104309082, + "step": 3973 + }, + { + "epoch": 2.472161741835148, + "grad_norm": 0.16155090928077698, + "learning_rate": 9.76256339326879e-07, + "logits/chosen": -0.11952055990695953, + "logits/rejected": 3.0214688777923584, + "logps/chosen": -518.565673828125, + "logps/rejected": -985.1463623046875, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.818488121032715, + "rewards/margins": 24.315500259399414, + "rewards/rejected": -31.133987426757812, + "step": 3974 + }, + { + "epoch": 2.472783825816485, + "grad_norm": 1.4269421626522671e-05, + "learning_rate": 9.751037344398341e-07, + "logits/chosen": 0.11723226308822632, + "logits/rejected": 2.796970844268799, + "logps/chosen": -579.084716796875, + "logps/rejected": -1109.278564453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.15622329711914, + "rewards/margins": 32.47679138183594, + "rewards/rejected": -42.63301467895508, + "step": 3975 + }, + { + "epoch": 2.4734059097978225, + "grad_norm": 0.01373250875622034, + "learning_rate": 9.739511295527893e-07, + "logits/chosen": 0.29508113861083984, + "logits/rejected": 3.23545503616333, + "logps/chosen": -455.6143798828125, + "logps/rejected": -863.3740844726562, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.188558578491211, + "rewards/margins": 25.89873504638672, + "rewards/rejected": -33.08729553222656, + "step": 3976 + }, + { + "epoch": 2.4740279937791603, + "grad_norm": 0.6342032551765442, + "learning_rate": 9.727985246657446e-07, + "logits/chosen": -1.2206673622131348, + "logits/rejected": 2.5578463077545166, + "logps/chosen": -481.9017028808594, + "logps/rejected": -1017.375244140625, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.518524169921875, + "rewards/margins": 29.58747100830078, + "rewards/rejected": -42.105995178222656, + "step": 3977 + }, + { + "epoch": 2.4746500777604976, + "grad_norm": 6.656295681750635e-06, + "learning_rate": 9.716459197787e-07, + "logits/chosen": 0.7014501094818115, + "logits/rejected": 3.1492178440093994, + "logps/chosen": -630.0687255859375, + "logps/rejected": -972.5453491210938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.163032531738281, + "rewards/margins": 29.593048095703125, + "rewards/rejected": -38.756080627441406, + "step": 3978 + }, + { + "epoch": 2.4752721617418354, + "grad_norm": 6.173561996547505e-05, + "learning_rate": 9.704933148916552e-07, + "logits/chosen": 1.347499966621399, + "logits/rejected": 2.789525032043457, + "logps/chosen": -606.9569091796875, + "logps/rejected": -1078.0966796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.588987350463867, + "rewards/margins": 33.38792037963867, + "rewards/rejected": -41.976905822753906, + "step": 3979 + }, + { + "epoch": 2.4758942457231727, + "grad_norm": 0.266066312789917, + "learning_rate": 9.693407100046105e-07, + "logits/chosen": 0.7726764678955078, + "logits/rejected": 3.6062095165252686, + "logps/chosen": -460.413330078125, + "logps/rejected": -1039.5133056640625, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.640885353088379, + "rewards/margins": 36.07872772216797, + "rewards/rejected": -43.71961212158203, + "step": 3980 + }, + { + "epoch": 2.47651632970451, + "grad_norm": 0.06791546195745468, + "learning_rate": 9.68188105117566e-07, + "logits/chosen": -0.8611456751823425, + "logits/rejected": 4.008203983306885, + "logps/chosen": -452.2554931640625, + "logps/rejected": -1070.7613525390625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.840206146240234, + "rewards/margins": 29.160036087036133, + "rewards/rejected": -35.000244140625, + "step": 3981 + }, + { + "epoch": 2.4771384136858474, + "grad_norm": 4.322809843415598e-07, + "learning_rate": 9.670355002305211e-07, + "logits/chosen": 1.3722554445266724, + "logits/rejected": 4.346024990081787, + "logps/chosen": -635.0540771484375, + "logps/rejected": -1267.4791259765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.10350513458252, + "rewards/margins": 36.77997589111328, + "rewards/rejected": -48.88347625732422, + "step": 3982 + }, + { + "epoch": 2.477760497667185, + "grad_norm": 1.7631964510655962e-06, + "learning_rate": 9.658828953434763e-07, + "logits/chosen": -1.542866826057434, + "logits/rejected": 3.955106735229492, + "logps/chosen": -379.46466064453125, + "logps/rejected": -956.01416015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.101266384124756, + "rewards/margins": 32.51762390136719, + "rewards/rejected": -38.618892669677734, + "step": 3983 + }, + { + "epoch": 2.4783825816485225, + "grad_norm": 0.19948433339595795, + "learning_rate": 9.647302904564316e-07, + "logits/chosen": -0.9693173170089722, + "logits/rejected": 1.6268550157546997, + "logps/chosen": -582.1820068359375, + "logps/rejected": -1092.36328125, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.305274963378906, + "rewards/margins": 35.87574005126953, + "rewards/rejected": -44.18102264404297, + "step": 3984 + }, + { + "epoch": 2.47900466562986, + "grad_norm": 0.23587602376937866, + "learning_rate": 9.63577685569387e-07, + "logits/chosen": 1.4217019081115723, + "logits/rejected": 1.7731695175170898, + "logps/chosen": -605.095458984375, + "logps/rejected": -938.7332763671875, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.258292198181152, + "rewards/margins": 30.493221282958984, + "rewards/rejected": -39.75151443481445, + "step": 3985 + }, + { + "epoch": 2.4796267496111977, + "grad_norm": 0.008622845634818077, + "learning_rate": 9.624250806823422e-07, + "logits/chosen": 1.8287463188171387, + "logits/rejected": 4.628846168518066, + "logps/chosen": -664.932861328125, + "logps/rejected": -1096.986328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.658853530883789, + "rewards/margins": 25.907957077026367, + "rewards/rejected": -34.566810607910156, + "step": 3986 + }, + { + "epoch": 2.480248833592535, + "grad_norm": 0.00017098673561122268, + "learning_rate": 9.612724757952975e-07, + "logits/chosen": -2.468224048614502, + "logits/rejected": 3.166496515274048, + "logps/chosen": -466.069091796875, + "logps/rejected": -1267.13671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.445119857788086, + "rewards/margins": 34.76084899902344, + "rewards/rejected": -45.205970764160156, + "step": 3987 + }, + { + "epoch": 2.4808709175738723, + "grad_norm": 5.38523199793417e-05, + "learning_rate": 9.601198709082529e-07, + "logits/chosen": 0.47611796855926514, + "logits/rejected": 3.0029196739196777, + "logps/chosen": -367.03289794921875, + "logps/rejected": -808.068359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.614110469818115, + "rewards/margins": 26.540752410888672, + "rewards/rejected": -33.15486526489258, + "step": 3988 + }, + { + "epoch": 2.48149300155521, + "grad_norm": 0.11246740072965622, + "learning_rate": 9.589672660212081e-07, + "logits/chosen": 1.517196774482727, + "logits/rejected": 1.8936448097229004, + "logps/chosen": -591.6846313476562, + "logps/rejected": -892.910888671875, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.089322090148926, + "rewards/margins": 25.842926025390625, + "rewards/rejected": -34.932247161865234, + "step": 3989 + }, + { + "epoch": 2.4821150855365475, + "grad_norm": 0.001232079230248928, + "learning_rate": 9.578146611341633e-07, + "logits/chosen": -0.5079509019851685, + "logits/rejected": 2.6780648231506348, + "logps/chosen": -469.6315612792969, + "logps/rejected": -972.140380859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.966546535491943, + "rewards/margins": 27.8428897857666, + "rewards/rejected": -34.8094367980957, + "step": 3990 + }, + { + "epoch": 2.482737169517885, + "grad_norm": 13.245752334594727, + "learning_rate": 9.566620562471186e-07, + "logits/chosen": -0.15342366695404053, + "logits/rejected": 3.0701396465301514, + "logps/chosen": -640.5465087890625, + "logps/rejected": -959.1682739257812, + "loss": 0.0525, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.822296142578125, + "rewards/margins": 20.660457611083984, + "rewards/rejected": -34.48275375366211, + "step": 3991 + }, + { + "epoch": 2.4833592534992226, + "grad_norm": 4.577951884243703e-08, + "learning_rate": 9.555094513600738e-07, + "logits/chosen": 0.28653281927108765, + "logits/rejected": 3.837348699569702, + "logps/chosen": -554.57763671875, + "logps/rejected": -1132.2530517578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.2543363571167, + "rewards/margins": 41.47886657714844, + "rewards/rejected": -50.73320007324219, + "step": 3992 + }, + { + "epoch": 2.48398133748056, + "grad_norm": 0.0004249585035722703, + "learning_rate": 9.54356846473029e-07, + "logits/chosen": 0.6426808834075928, + "logits/rejected": 3.9812469482421875, + "logps/chosen": -543.9054565429688, + "logps/rejected": -978.4523315429688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.190975189208984, + "rewards/margins": 26.68214225769043, + "rewards/rejected": -33.87311553955078, + "step": 3993 + }, + { + "epoch": 2.4846034214618973, + "grad_norm": 0.00025467202067375183, + "learning_rate": 9.532042415859843e-07, + "logits/chosen": -1.1799622774124146, + "logits/rejected": 3.2941060066223145, + "logps/chosen": -320.71478271484375, + "logps/rejected": -886.0281372070312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6708829402923584, + "rewards/margins": 34.079280853271484, + "rewards/rejected": -37.750160217285156, + "step": 3994 + }, + { + "epoch": 2.4852255054432346, + "grad_norm": 38.0293083190918, + "learning_rate": 9.520516366989397e-07, + "logits/chosen": 2.034773349761963, + "logits/rejected": 2.832022190093994, + "logps/chosen": -488.5412902832031, + "logps/rejected": -745.2386474609375, + "loss": 0.2294, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.242122173309326, + "rewards/margins": 22.864137649536133, + "rewards/rejected": -26.106258392333984, + "step": 3995 + }, + { + "epoch": 2.4858475894245724, + "grad_norm": 0.8855314254760742, + "learning_rate": 9.508990318118949e-07, + "logits/chosen": -0.5881878733634949, + "logits/rejected": 3.464144229888916, + "logps/chosen": -478.4122314453125, + "logps/rejected": -1075.0064697265625, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.319843292236328, + "rewards/margins": 38.023109436035156, + "rewards/rejected": -49.34294891357422, + "step": 3996 + }, + { + "epoch": 2.4864696734059097, + "grad_norm": 0.46036460995674133, + "learning_rate": 9.497464269248502e-07, + "logits/chosen": 0.5766007900238037, + "logits/rejected": 3.380526065826416, + "logps/chosen": -558.318359375, + "logps/rejected": -1159.97900390625, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.872662544250488, + "rewards/margins": 37.3363037109375, + "rewards/rejected": -48.20896530151367, + "step": 3997 + }, + { + "epoch": 2.4870917573872475, + "grad_norm": 9.886953193927184e-06, + "learning_rate": 9.485938220378055e-07, + "logits/chosen": 1.076728105545044, + "logits/rejected": 3.4405856132507324, + "logps/chosen": -507.3965759277344, + "logps/rejected": -938.2133178710938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.397248268127441, + "rewards/margins": 25.83782196044922, + "rewards/rejected": -36.235069274902344, + "step": 3998 + }, + { + "epoch": 2.487713841368585, + "grad_norm": 0.018210668116807938, + "learning_rate": 9.474412171507608e-07, + "logits/chosen": -0.3068583607673645, + "logits/rejected": 3.235687494277954, + "logps/chosen": -429.76239013671875, + "logps/rejected": -877.342529296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.804065704345703, + "rewards/margins": 25.279605865478516, + "rewards/rejected": -34.08367156982422, + "step": 3999 + }, + { + "epoch": 2.488335925349922, + "grad_norm": 0.000381840713089332, + "learning_rate": 9.46288612263716e-07, + "logits/chosen": -1.823272466659546, + "logits/rejected": 2.943857192993164, + "logps/chosen": -354.3667907714844, + "logps/rejected": -927.176513671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.204986572265625, + "rewards/margins": 28.99197769165039, + "rewards/rejected": -34.196964263916016, + "step": 4000 + }, + { + "epoch": 2.4889580093312595, + "grad_norm": 0.0003201756626367569, + "learning_rate": 9.451360073766713e-07, + "logits/chosen": 1.1355750560760498, + "logits/rejected": 4.724059104919434, + "logps/chosen": -541.935302734375, + "logps/rejected": -1017.6828002929688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.335550785064697, + "rewards/margins": 31.088001251220703, + "rewards/rejected": -37.423553466796875, + "step": 4001 + }, + { + "epoch": 2.4895800933125973, + "grad_norm": 0.5558943152427673, + "learning_rate": 9.439834024896266e-07, + "logits/chosen": 1.82474684715271, + "logits/rejected": 4.334096908569336, + "logps/chosen": -533.4063720703125, + "logps/rejected": -910.9434814453125, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.618231296539307, + "rewards/margins": 23.29483413696289, + "rewards/rejected": -30.91306495666504, + "step": 4002 + }, + { + "epoch": 2.4902021772939347, + "grad_norm": 0.09100416302680969, + "learning_rate": 9.428307976025819e-07, + "logits/chosen": 0.3936919569969177, + "logits/rejected": 2.573723077774048, + "logps/chosen": -588.6266479492188, + "logps/rejected": -961.934326171875, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.027005195617676, + "rewards/margins": 23.147462844848633, + "rewards/rejected": -32.174468994140625, + "step": 4003 + }, + { + "epoch": 2.490824261275272, + "grad_norm": 0.05002165213227272, + "learning_rate": 9.416781927155372e-07, + "logits/chosen": 0.522924542427063, + "logits/rejected": 3.548407554626465, + "logps/chosen": -479.7599792480469, + "logps/rejected": -1006.958984375, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.0311279296875, + "rewards/margins": 27.491552352905273, + "rewards/rejected": -35.522682189941406, + "step": 4004 + }, + { + "epoch": 2.49144634525661, + "grad_norm": 0.1994076520204544, + "learning_rate": 9.405255878284925e-07, + "logits/chosen": -0.5984686017036438, + "logits/rejected": 3.4913418292999268, + "logps/chosen": -327.8326110839844, + "logps/rejected": -711.6600341796875, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.165585994720459, + "rewards/margins": 22.621118545532227, + "rewards/rejected": -26.786705017089844, + "step": 4005 + }, + { + "epoch": 2.492068429237947, + "grad_norm": 0.0002858602092601359, + "learning_rate": 9.393729829414478e-07, + "logits/chosen": 3.4327967166900635, + "logits/rejected": 5.520690441131592, + "logps/chosen": -697.5659790039062, + "logps/rejected": -1217.9140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.23094367980957, + "rewards/margins": 35.31251525878906, + "rewards/rejected": -46.54345703125, + "step": 4006 + }, + { + "epoch": 2.4926905132192845, + "grad_norm": 1.2931499441037886e-05, + "learning_rate": 9.38220378054403e-07, + "logits/chosen": -1.832653522491455, + "logits/rejected": 1.7804694175720215, + "logps/chosen": -484.6544189453125, + "logps/rejected": -992.6403198242188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.544832229614258, + "rewards/margins": 28.360626220703125, + "rewards/rejected": -38.905460357666016, + "step": 4007 + }, + { + "epoch": 2.4933125972006223, + "grad_norm": 0.00022435266873799264, + "learning_rate": 9.370677731673583e-07, + "logits/chosen": 1.0319801568984985, + "logits/rejected": 1.8618669509887695, + "logps/chosen": -561.7014770507812, + "logps/rejected": -932.447998046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.712716102600098, + "rewards/margins": 28.847881317138672, + "rewards/rejected": -40.56060028076172, + "step": 4008 + }, + { + "epoch": 2.4939346811819596, + "grad_norm": 8.498398528899997e-05, + "learning_rate": 9.359151682803136e-07, + "logits/chosen": -1.1835764646530151, + "logits/rejected": 3.480799674987793, + "logps/chosen": -463.39447021484375, + "logps/rejected": -1009.1600341796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.3580193519592285, + "rewards/margins": 32.19048309326172, + "rewards/rejected": -38.54850387573242, + "step": 4009 + }, + { + "epoch": 2.494556765163297, + "grad_norm": 0.1589866578578949, + "learning_rate": 9.347625633932689e-07, + "logits/chosen": -1.8735637664794922, + "logits/rejected": 3.5191726684570312, + "logps/chosen": -357.97406005859375, + "logps/rejected": -895.1856689453125, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.487676620483398, + "rewards/margins": 25.94668197631836, + "rewards/rejected": -32.434356689453125, + "step": 4010 + }, + { + "epoch": 2.4951788491446347, + "grad_norm": 0.31055301427841187, + "learning_rate": 9.336099585062241e-07, + "logits/chosen": 0.4072756767272949, + "logits/rejected": 4.024853706359863, + "logps/chosen": -391.751953125, + "logps/rejected": -752.5792236328125, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.09948444366455, + "rewards/margins": 15.251541137695312, + "rewards/rejected": -23.351028442382812, + "step": 4011 + }, + { + "epoch": 2.495800933125972, + "grad_norm": 0.0002815905027091503, + "learning_rate": 9.324573536191794e-07, + "logits/chosen": 0.9224781394004822, + "logits/rejected": 3.1477391719818115, + "logps/chosen": -464.29803466796875, + "logps/rejected": -789.9879150390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.706248760223389, + "rewards/margins": 18.78945541381836, + "rewards/rejected": -25.495704650878906, + "step": 4012 + }, + { + "epoch": 2.4964230171073094, + "grad_norm": 0.013372275978326797, + "learning_rate": 9.313047487321347e-07, + "logits/chosen": 1.7922581434249878, + "logits/rejected": 3.2224082946777344, + "logps/chosen": -683.3353881835938, + "logps/rejected": -1059.2672119140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.253475189208984, + "rewards/margins": 33.029808044433594, + "rewards/rejected": -42.283287048339844, + "step": 4013 + }, + { + "epoch": 2.4970451010886467, + "grad_norm": 1.9914675704058027e-06, + "learning_rate": 9.3015214384509e-07, + "logits/chosen": -2.2508482933044434, + "logits/rejected": 1.2407808303833008, + "logps/chosen": -380.6495666503906, + "logps/rejected": -867.527099609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3229730129241943, + "rewards/margins": 31.814903259277344, + "rewards/rejected": -34.137874603271484, + "step": 4014 + }, + { + "epoch": 2.4976671850699845, + "grad_norm": 2.6373650143796112e-06, + "learning_rate": 9.289995389580453e-07, + "logits/chosen": -0.6663360595703125, + "logits/rejected": 2.0772130489349365, + "logps/chosen": -367.455078125, + "logps/rejected": -937.1649780273438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.327749252319336, + "rewards/margins": 29.579191207885742, + "rewards/rejected": -36.90694046020508, + "step": 4015 + }, + { + "epoch": 2.498289269051322, + "grad_norm": 0.12220276147127151, + "learning_rate": 9.278469340710006e-07, + "logits/chosen": -0.1861596703529358, + "logits/rejected": 3.5406994819641113, + "logps/chosen": -510.4183654785156, + "logps/rejected": -1062.176513671875, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.92317008972168, + "rewards/margins": 27.95861053466797, + "rewards/rejected": -37.881778717041016, + "step": 4016 + }, + { + "epoch": 2.4989113530326597, + "grad_norm": 0.0015416694805026054, + "learning_rate": 9.266943291839559e-07, + "logits/chosen": -1.3664590120315552, + "logits/rejected": 3.208428144454956, + "logps/chosen": -441.11431884765625, + "logps/rejected": -867.0030517578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.116560935974121, + "rewards/margins": 21.813411712646484, + "rewards/rejected": -29.929973602294922, + "step": 4017 + }, + { + "epoch": 2.499533437013997, + "grad_norm": 1.4782327525608707e-05, + "learning_rate": 9.255417242969111e-07, + "logits/chosen": 0.12820547819137573, + "logits/rejected": 2.7607710361480713, + "logps/chosen": -450.5256652832031, + "logps/rejected": -907.6300048828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.952629089355469, + "rewards/margins": 28.850107192993164, + "rewards/rejected": -37.802734375, + "step": 4018 + }, + { + "epoch": 2.5001555209953343, + "grad_norm": 0.007651094812899828, + "learning_rate": 9.243891194098664e-07, + "logits/chosen": 2.0740067958831787, + "logits/rejected": 3.646566390991211, + "logps/chosen": -650.134521484375, + "logps/rejected": -1027.8922119140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.123130798339844, + "rewards/margins": 30.832536697387695, + "rewards/rejected": -40.95566940307617, + "step": 4019 + }, + { + "epoch": 2.5007776049766717, + "grad_norm": 0.010663536377251148, + "learning_rate": 9.232365145228217e-07, + "logits/chosen": 2.3909530639648438, + "logits/rejected": 2.9909896850585938, + "logps/chosen": -634.50146484375, + "logps/rejected": -975.9163818359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.708215713500977, + "rewards/margins": 28.67369842529297, + "rewards/rejected": -39.38191604614258, + "step": 4020 + }, + { + "epoch": 2.5013996889580095, + "grad_norm": 3.569019099813886e-05, + "learning_rate": 9.22083909635777e-07, + "logits/chosen": 0.3791891932487488, + "logits/rejected": 3.304218292236328, + "logps/chosen": -655.1154174804688, + "logps/rejected": -1068.7027587890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.697490692138672, + "rewards/margins": 30.89398956298828, + "rewards/rejected": -43.59148025512695, + "step": 4021 + }, + { + "epoch": 2.502021772939347, + "grad_norm": 41.986019134521484, + "learning_rate": 9.209313047487322e-07, + "logits/chosen": 1.777288794517517, + "logits/rejected": 1.3985198736190796, + "logps/chosen": -622.2395629882812, + "logps/rejected": -823.553466796875, + "loss": 0.5233, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.957545280456543, + "rewards/margins": 16.908954620361328, + "rewards/rejected": -24.866500854492188, + "step": 4022 + }, + { + "epoch": 2.502643856920684, + "grad_norm": 1.0342598777413059e-08, + "learning_rate": 9.197786998616876e-07, + "logits/chosen": -1.3321645259857178, + "logits/rejected": 2.8050286769866943, + "logps/chosen": -496.466064453125, + "logps/rejected": -1060.14453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.479050636291504, + "rewards/margins": 32.64373779296875, + "rewards/rejected": -41.12278747558594, + "step": 4023 + }, + { + "epoch": 2.503265940902022, + "grad_norm": 0.008871527388691902, + "learning_rate": 9.186260949746428e-07, + "logits/chosen": -2.635526418685913, + "logits/rejected": 2.7529656887054443, + "logps/chosen": -385.7414245605469, + "logps/rejected": -1028.83837890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.784934997558594, + "rewards/margins": 33.65802764892578, + "rewards/rejected": -40.442962646484375, + "step": 4024 + }, + { + "epoch": 2.5038880248833593, + "grad_norm": 3.446579648880288e-05, + "learning_rate": 9.17473490087598e-07, + "logits/chosen": 1.5855050086975098, + "logits/rejected": 3.839909553527832, + "logps/chosen": -599.10498046875, + "logps/rejected": -959.9203491210938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.241596221923828, + "rewards/margins": 25.976350784301758, + "rewards/rejected": -36.21794509887695, + "step": 4025 + }, + { + "epoch": 2.5045101088646966, + "grad_norm": 10.38563060760498, + "learning_rate": 9.163208852005532e-07, + "logits/chosen": 0.5513242483139038, + "logits/rejected": 2.5588459968566895, + "logps/chosen": -555.89208984375, + "logps/rejected": -875.7945556640625, + "loss": 0.0715, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.112430572509766, + "rewards/margins": 20.556514739990234, + "rewards/rejected": -28.668941497802734, + "step": 4026 + }, + { + "epoch": 2.505132192846034, + "grad_norm": 0.0003059516893699765, + "learning_rate": 9.151682803135086e-07, + "logits/chosen": 3.751962423324585, + "logits/rejected": 5.804811954498291, + "logps/chosen": -769.1310424804688, + "logps/rejected": -1162.177001953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.83383846282959, + "rewards/margins": 24.885425567626953, + "rewards/rejected": -31.719261169433594, + "step": 4027 + }, + { + "epoch": 2.5057542768273717, + "grad_norm": 7.580235251225531e-05, + "learning_rate": 9.140156754264638e-07, + "logits/chosen": 0.8473861217498779, + "logits/rejected": 3.997392177581787, + "logps/chosen": -625.984619140625, + "logps/rejected": -1178.9935302734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.654664993286133, + "rewards/margins": 34.67824172973633, + "rewards/rejected": -46.332908630371094, + "step": 4028 + }, + { + "epoch": 2.506376360808709, + "grad_norm": 0.07776268571615219, + "learning_rate": 9.128630705394191e-07, + "logits/chosen": -1.9163434505462646, + "logits/rejected": 4.200723171234131, + "logps/chosen": -433.5632629394531, + "logps/rejected": -1088.713134765625, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.527142524719238, + "rewards/margins": 27.72796058654785, + "rewards/rejected": -35.255104064941406, + "step": 4029 + }, + { + "epoch": 2.506998444790047, + "grad_norm": 1.2928827345604077e-05, + "learning_rate": 9.117104656523743e-07, + "logits/chosen": -1.044769525527954, + "logits/rejected": 1.8422355651855469, + "logps/chosen": -442.69696044921875, + "logps/rejected": -911.9422607421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.976151466369629, + "rewards/margins": 30.48032569885254, + "rewards/rejected": -39.45647430419922, + "step": 4030 + }, + { + "epoch": 2.507620528771384, + "grad_norm": 9.74557679001009e-07, + "learning_rate": 9.105578607653297e-07, + "logits/chosen": 2.877901554107666, + "logits/rejected": 3.4200408458709717, + "logps/chosen": -625.1708984375, + "logps/rejected": -876.2762451171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.222400665283203, + "rewards/margins": 25.483015060424805, + "rewards/rejected": -36.705413818359375, + "step": 4031 + }, + { + "epoch": 2.5082426127527215, + "grad_norm": 0.00010444582585478202, + "learning_rate": 9.09405255878285e-07, + "logits/chosen": 0.2044382095336914, + "logits/rejected": 2.0310511589050293, + "logps/chosen": -485.7327880859375, + "logps/rejected": -900.3209838867188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.183245658874512, + "rewards/margins": 29.076282501220703, + "rewards/rejected": -36.25952911376953, + "step": 4032 + }, + { + "epoch": 2.508864696734059, + "grad_norm": 0.0003882237651851028, + "learning_rate": 9.082526509912402e-07, + "logits/chosen": -3.3604073524475098, + "logits/rejected": 2.7178521156311035, + "logps/chosen": -393.2567138671875, + "logps/rejected": -1042.647216796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.976676940917969, + "rewards/margins": 30.853498458862305, + "rewards/rejected": -37.830177307128906, + "step": 4033 + }, + { + "epoch": 2.5094867807153967, + "grad_norm": 1.6229850053787231, + "learning_rate": 9.071000461041956e-07, + "logits/chosen": 1.8078299760818481, + "logits/rejected": 2.902855157852173, + "logps/chosen": -531.661376953125, + "logps/rejected": -825.8492431640625, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.28935718536377, + "rewards/margins": 19.35369110107422, + "rewards/rejected": -28.643049240112305, + "step": 4034 + }, + { + "epoch": 2.510108864696734, + "grad_norm": 1.8153090476989746, + "learning_rate": 9.059474412171508e-07, + "logits/chosen": 1.5191748142242432, + "logits/rejected": 3.1580471992492676, + "logps/chosen": -539.547607421875, + "logps/rejected": -956.634521484375, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.679039001464844, + "rewards/margins": 28.129350662231445, + "rewards/rejected": -38.808387756347656, + "step": 4035 + }, + { + "epoch": 2.510730948678072, + "grad_norm": 3.546811580657959, + "learning_rate": 9.047948363301061e-07, + "logits/chosen": -0.5803591012954712, + "logits/rejected": 1.0151821374893188, + "logps/chosen": -532.4237060546875, + "logps/rejected": -790.9904174804688, + "loss": 0.0239, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.401581764221191, + "rewards/margins": 18.95673370361328, + "rewards/rejected": -25.35831642150879, + "step": 4036 + }, + { + "epoch": 2.511353032659409, + "grad_norm": 23.09521484375, + "learning_rate": 9.036422314430613e-07, + "logits/chosen": 0.25977957248687744, + "logits/rejected": 3.5475993156433105, + "logps/chosen": -555.8200073242188, + "logps/rejected": -1017.760498046875, + "loss": 0.0648, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.457131385803223, + "rewards/margins": 29.075380325317383, + "rewards/rejected": -38.532508850097656, + "step": 4037 + }, + { + "epoch": 2.5119751166407465, + "grad_norm": 4.518855348578654e-05, + "learning_rate": 9.024896265560167e-07, + "logits/chosen": -1.4445290565490723, + "logits/rejected": 3.5027639865875244, + "logps/chosen": -242.11343383789062, + "logps/rejected": -775.8106689453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7559638023376465, + "rewards/margins": 27.689367294311523, + "rewards/rejected": -31.445327758789062, + "step": 4038 + }, + { + "epoch": 2.512597200622084, + "grad_norm": 0.0008194184629246593, + "learning_rate": 9.013370216689719e-07, + "logits/chosen": -1.9092761278152466, + "logits/rejected": 3.484623908996582, + "logps/chosen": -335.6897277832031, + "logps/rejected": -973.5714111328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.017749786376953, + "rewards/margins": 33.87322998046875, + "rewards/rejected": -38.8909797668457, + "step": 4039 + }, + { + "epoch": 2.5132192846034216, + "grad_norm": 0.06275061517953873, + "learning_rate": 9.001844167819272e-07, + "logits/chosen": 1.002368450164795, + "logits/rejected": 2.0995421409606934, + "logps/chosen": -733.36083984375, + "logps/rejected": -1132.36328125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.596542358398438, + "rewards/margins": 32.84700012207031, + "rewards/rejected": -44.443546295166016, + "step": 4040 + }, + { + "epoch": 2.513841368584759, + "grad_norm": 0.14462482929229736, + "learning_rate": 8.990318118948826e-07, + "logits/chosen": 3.533893346786499, + "logits/rejected": 2.424785614013672, + "logps/chosen": -831.066650390625, + "logps/rejected": -966.3888549804688, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.034292221069336, + "rewards/margins": 20.877782821655273, + "rewards/rejected": -32.91207504272461, + "step": 4041 + }, + { + "epoch": 2.5144634525660963, + "grad_norm": 1.888901923763342e-09, + "learning_rate": 8.978792070078378e-07, + "logits/chosen": 3.350219249725342, + "logits/rejected": 4.0092010498046875, + "logps/chosen": -673.218017578125, + "logps/rejected": -1049.03564453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.207067489624023, + "rewards/margins": 31.169370651245117, + "rewards/rejected": -40.37643814086914, + "step": 4042 + }, + { + "epoch": 2.515085536547434, + "grad_norm": 5.1957790958567784e-08, + "learning_rate": 8.967266021207931e-07, + "logits/chosen": -0.6556673049926758, + "logits/rejected": 2.5189614295959473, + "logps/chosen": -485.61602783203125, + "logps/rejected": -928.2960205078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.341263771057129, + "rewards/margins": 31.709503173828125, + "rewards/rejected": -40.05076599121094, + "step": 4043 + }, + { + "epoch": 2.5157076205287714, + "grad_norm": 0.9537517428398132, + "learning_rate": 8.955739972337483e-07, + "logits/chosen": -2.3854219913482666, + "logits/rejected": 3.221060037612915, + "logps/chosen": -317.2149658203125, + "logps/rejected": -987.2667846679688, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.228043556213379, + "rewards/margins": 37.49151611328125, + "rewards/rejected": -41.71955871582031, + "step": 4044 + }, + { + "epoch": 2.5163297045101087, + "grad_norm": 9.389303158968687e-06, + "learning_rate": 8.944213923467037e-07, + "logits/chosen": -0.7763998508453369, + "logits/rejected": 1.6843314170837402, + "logps/chosen": -446.32879638671875, + "logps/rejected": -863.89111328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.270142555236816, + "rewards/margins": 29.047107696533203, + "rewards/rejected": -35.31725311279297, + "step": 4045 + }, + { + "epoch": 2.516951788491446, + "grad_norm": 0.00035843587829731405, + "learning_rate": 8.932687874596589e-07, + "logits/chosen": -2.0564632415771484, + "logits/rejected": 0.005192816257476807, + "logps/chosen": -446.6129150390625, + "logps/rejected": -927.9066162109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.743563175201416, + "rewards/margins": 36.39313507080078, + "rewards/rejected": -44.13669967651367, + "step": 4046 + }, + { + "epoch": 2.517573872472784, + "grad_norm": 0.007599648553878069, + "learning_rate": 8.921161825726142e-07, + "logits/chosen": 0.054366230964660645, + "logits/rejected": 3.8880622386932373, + "logps/chosen": -459.69073486328125, + "logps/rejected": -890.9481201171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.929388046264648, + "rewards/margins": 22.893136978149414, + "rewards/rejected": -29.822526931762695, + "step": 4047 + }, + { + "epoch": 2.518195956454121, + "grad_norm": 0.008917691186070442, + "learning_rate": 8.909635776855694e-07, + "logits/chosen": 1.7491514682769775, + "logits/rejected": 3.895791530609131, + "logps/chosen": -583.1383056640625, + "logps/rejected": -971.4913330078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.932445526123047, + "rewards/margins": 25.728708267211914, + "rewards/rejected": -35.66115188598633, + "step": 4048 + }, + { + "epoch": 2.518818040435459, + "grad_norm": 1.1484187841415405, + "learning_rate": 8.898109727985248e-07, + "logits/chosen": -0.4084659516811371, + "logits/rejected": 0.8221626281738281, + "logps/chosen": -441.1219787597656, + "logps/rejected": -798.0272827148438, + "loss": 0.0181, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.016278266906738, + "rewards/margins": 27.955223083496094, + "rewards/rejected": -33.971500396728516, + "step": 4049 + }, + { + "epoch": 2.5194401244167963, + "grad_norm": 2.94388484954834, + "learning_rate": 8.8865836791148e-07, + "logits/chosen": 0.5502256751060486, + "logits/rejected": 3.039674758911133, + "logps/chosen": -525.4452514648438, + "logps/rejected": -871.894775390625, + "loss": 0.0155, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.72391128540039, + "rewards/margins": 26.043746948242188, + "rewards/rejected": -36.76765441894531, + "step": 4050 + }, + { + "epoch": 2.5200622083981337, + "grad_norm": 0.008566655218601227, + "learning_rate": 8.875057630244353e-07, + "logits/chosen": -1.273643136024475, + "logits/rejected": 3.792297601699829, + "logps/chosen": -435.6821594238281, + "logps/rejected": -1057.8905029296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.56712532043457, + "rewards/margins": 27.878154754638672, + "rewards/rejected": -37.445281982421875, + "step": 4051 + }, + { + "epoch": 2.520684292379471, + "grad_norm": 0.903061032295227, + "learning_rate": 8.863531581373907e-07, + "logits/chosen": -1.5747716426849365, + "logits/rejected": 2.0196101665496826, + "logps/chosen": -382.30938720703125, + "logps/rejected": -926.2044677734375, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.719232559204102, + "rewards/margins": 33.08274459838867, + "rewards/rejected": -39.801979064941406, + "step": 4052 + }, + { + "epoch": 2.521306376360809, + "grad_norm": 0.003625160548835993, + "learning_rate": 8.852005532503459e-07, + "logits/chosen": 0.947603702545166, + "logits/rejected": 3.8829972743988037, + "logps/chosen": -536.180419921875, + "logps/rejected": -991.75146484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.84277868270874, + "rewards/margins": 31.205059051513672, + "rewards/rejected": -38.04783630371094, + "step": 4053 + }, + { + "epoch": 2.521928460342146, + "grad_norm": 0.37235498428344727, + "learning_rate": 8.840479483633012e-07, + "logits/chosen": -0.7206171751022339, + "logits/rejected": 2.378139019012451, + "logps/chosen": -513.6537475585938, + "logps/rejected": -921.5626831054688, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.372352600097656, + "rewards/margins": 25.354045867919922, + "rewards/rejected": -36.72639846801758, + "step": 4054 + }, + { + "epoch": 2.522550544323484, + "grad_norm": 0.06400436162948608, + "learning_rate": 8.828953434762564e-07, + "logits/chosen": 2.3853330612182617, + "logits/rejected": 3.798861265182495, + "logps/chosen": -658.4878540039062, + "logps/rejected": -968.2791748046875, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.424330711364746, + "rewards/margins": 25.68733024597168, + "rewards/rejected": -34.111663818359375, + "step": 4055 + }, + { + "epoch": 2.5231726283048213, + "grad_norm": 1.104139982999186e-06, + "learning_rate": 8.817427385892118e-07, + "logits/chosen": -1.199948787689209, + "logits/rejected": 2.331671953201294, + "logps/chosen": -509.634033203125, + "logps/rejected": -1042.027099609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.206770896911621, + "rewards/margins": 30.182756423950195, + "rewards/rejected": -37.3895263671875, + "step": 4056 + }, + { + "epoch": 2.5237947122861586, + "grad_norm": 0.0002867870789486915, + "learning_rate": 8.80590133702167e-07, + "logits/chosen": -2.356788158416748, + "logits/rejected": 3.516348123550415, + "logps/chosen": -356.5927734375, + "logps/rejected": -1011.0084838867188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.522395133972168, + "rewards/margins": 32.249908447265625, + "rewards/rejected": -39.772300720214844, + "step": 4057 + }, + { + "epoch": 2.524416796267496, + "grad_norm": 8.064653229666874e-06, + "learning_rate": 8.794375288151223e-07, + "logits/chosen": -0.5611940622329712, + "logits/rejected": 3.5149805545806885, + "logps/chosen": -508.0080871582031, + "logps/rejected": -1095.08935546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.362550735473633, + "rewards/margins": 35.61705780029297, + "rewards/rejected": -40.97960662841797, + "step": 4058 + }, + { + "epoch": 2.5250388802488337, + "grad_norm": 0.00042883484275080264, + "learning_rate": 8.782849239280774e-07, + "logits/chosen": 1.1045475006103516, + "logits/rejected": 3.2674155235290527, + "logps/chosen": -499.8591613769531, + "logps/rejected": -891.91748046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.630616664886475, + "rewards/margins": 25.193370819091797, + "rewards/rejected": -32.82398986816406, + "step": 4059 + }, + { + "epoch": 2.525660964230171, + "grad_norm": 1.2721602615783922e-06, + "learning_rate": 8.771323190410328e-07, + "logits/chosen": 3.0189056396484375, + "logits/rejected": 4.2439775466918945, + "logps/chosen": -627.486083984375, + "logps/rejected": -1018.70263671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.340555191040039, + "rewards/margins": 31.816898345947266, + "rewards/rejected": -45.15745544433594, + "step": 4060 + }, + { + "epoch": 2.5262830482115084, + "grad_norm": 0.6605531573295593, + "learning_rate": 8.75979714153988e-07, + "logits/chosen": 0.18107308447360992, + "logits/rejected": 3.3815932273864746, + "logps/chosen": -640.6924438476562, + "logps/rejected": -1271.03173828125, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -18.90386962890625, + "rewards/margins": 34.992061614990234, + "rewards/rejected": -53.895931243896484, + "step": 4061 + }, + { + "epoch": 2.526905132192846, + "grad_norm": 0.000140212316182442, + "learning_rate": 8.748271092669433e-07, + "logits/chosen": 2.1460206508636475, + "logits/rejected": 2.4291000366210938, + "logps/chosen": -774.4283447265625, + "logps/rejected": -1192.8189697265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.135278701782227, + "rewards/margins": 32.91741180419922, + "rewards/rejected": -44.05268859863281, + "step": 4062 + }, + { + "epoch": 2.5275272161741835, + "grad_norm": 37.31892013549805, + "learning_rate": 8.736745043798986e-07, + "logits/chosen": -0.12430325150489807, + "logits/rejected": 2.1037116050720215, + "logps/chosen": -624.890380859375, + "logps/rejected": -914.820556640625, + "loss": 1.4105, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.072190284729004, + "rewards/margins": 18.019466400146484, + "rewards/rejected": -29.091657638549805, + "step": 4063 + }, + { + "epoch": 2.528149300155521, + "grad_norm": 0.008869586512446404, + "learning_rate": 8.725218994928539e-07, + "logits/chosen": 0.9427728652954102, + "logits/rejected": 2.078561782836914, + "logps/chosen": -580.1802978515625, + "logps/rejected": -899.5389404296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.347139358520508, + "rewards/margins": 27.37150001525879, + "rewards/rejected": -39.7186393737793, + "step": 4064 + }, + { + "epoch": 2.528771384136858, + "grad_norm": 0.43725720047950745, + "learning_rate": 8.713692946058091e-07, + "logits/chosen": -0.928475022315979, + "logits/rejected": 2.0003530979156494, + "logps/chosen": -554.10205078125, + "logps/rejected": -907.9371948242188, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.801774978637695, + "rewards/margins": 20.24555206298828, + "rewards/rejected": -30.04732894897461, + "step": 4065 + }, + { + "epoch": 2.529393468118196, + "grad_norm": 0.5570970177650452, + "learning_rate": 8.702166897187644e-07, + "logits/chosen": -0.08360552787780762, + "logits/rejected": 2.402754306793213, + "logps/chosen": -647.493896484375, + "logps/rejected": -1055.98828125, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.159369468688965, + "rewards/margins": 29.314403533935547, + "rewards/rejected": -41.47377014160156, + "step": 4066 + }, + { + "epoch": 2.5300155520995333, + "grad_norm": 0.013835887424647808, + "learning_rate": 8.690640848317198e-07, + "logits/chosen": 2.6564700603485107, + "logits/rejected": 4.413903713226318, + "logps/chosen": -677.5421752929688, + "logps/rejected": -1058.09619140625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.916563987731934, + "rewards/margins": 25.28716468811035, + "rewards/rejected": -35.20372772216797, + "step": 4067 + }, + { + "epoch": 2.530637636080871, + "grad_norm": 1.2323258715696284e-06, + "learning_rate": 8.67911479944675e-07, + "logits/chosen": 2.8368053436279297, + "logits/rejected": 3.4437544345855713, + "logps/chosen": -592.5868530273438, + "logps/rejected": -943.364990234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.618453979492188, + "rewards/margins": 29.4837646484375, + "rewards/rejected": -40.10221862792969, + "step": 4068 + }, + { + "epoch": 2.5312597200622085, + "grad_norm": 0.00047519218060187995, + "learning_rate": 8.667588750576303e-07, + "logits/chosen": -2.7477948665618896, + "logits/rejected": 3.0729355812072754, + "logps/chosen": -308.309326171875, + "logps/rejected": -936.3649291992188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9158785343170166, + "rewards/margins": 30.847537994384766, + "rewards/rejected": -34.7634162902832, + "step": 4069 + }, + { + "epoch": 2.531881804043546, + "grad_norm": 0.0025224664714187384, + "learning_rate": 8.656062701705856e-07, + "logits/chosen": -0.4186660647392273, + "logits/rejected": 2.772813558578491, + "logps/chosen": -483.03546142578125, + "logps/rejected": -1053.2744140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.237714767456055, + "rewards/margins": 34.6431999206543, + "rewards/rejected": -40.880916595458984, + "step": 4070 + }, + { + "epoch": 2.532503888024883, + "grad_norm": 6.943326980035636e-07, + "learning_rate": 8.644536652835409e-07, + "logits/chosen": -1.8856847286224365, + "logits/rejected": 2.604322671890259, + "logps/chosen": -385.1220703125, + "logps/rejected": -934.83447265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.365512371063232, + "rewards/margins": 30.374441146850586, + "rewards/rejected": -34.73995590209961, + "step": 4071 + }, + { + "epoch": 2.533125972006221, + "grad_norm": 0.07411924749612808, + "learning_rate": 8.633010603964961e-07, + "logits/chosen": 0.5203073024749756, + "logits/rejected": 2.409273862838745, + "logps/chosen": -471.6881103515625, + "logps/rejected": -914.433349609375, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.214423656463623, + "rewards/margins": 29.525753021240234, + "rewards/rejected": -36.74017333984375, + "step": 4072 + }, + { + "epoch": 2.5337480559875583, + "grad_norm": 8.959432307165116e-06, + "learning_rate": 8.621484555094514e-07, + "logits/chosen": -0.09636279940605164, + "logits/rejected": 2.9820473194122314, + "logps/chosen": -553.0858764648438, + "logps/rejected": -1140.2852783203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.259970664978027, + "rewards/margins": 36.687904357910156, + "rewards/rejected": -42.947872161865234, + "step": 4073 + }, + { + "epoch": 2.534370139968896, + "grad_norm": 36.08942413330078, + "learning_rate": 8.609958506224067e-07, + "logits/chosen": -0.2618432343006134, + "logits/rejected": 2.2460336685180664, + "logps/chosen": -635.515380859375, + "logps/rejected": -1065.712646484375, + "loss": 0.4531, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.69479751586914, + "rewards/margins": 28.274282455444336, + "rewards/rejected": -41.969078063964844, + "step": 4074 + }, + { + "epoch": 2.5349922239502334, + "grad_norm": 0.05928949639201164, + "learning_rate": 8.59843245735362e-07, + "logits/chosen": 0.16672658920288086, + "logits/rejected": 2.908172369003296, + "logps/chosen": -507.34716796875, + "logps/rejected": -994.593505859375, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.214090347290039, + "rewards/margins": 30.99824333190918, + "rewards/rejected": -41.21233367919922, + "step": 4075 + }, + { + "epoch": 2.5356143079315707, + "grad_norm": 0.017843371257185936, + "learning_rate": 8.586906408483172e-07, + "logits/chosen": -1.8564605712890625, + "logits/rejected": 0.6809482574462891, + "logps/chosen": -592.2178344726562, + "logps/rejected": -1067.1971435546875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.823671340942383, + "rewards/margins": 25.124004364013672, + "rewards/rejected": -37.94767761230469, + "step": 4076 + }, + { + "epoch": 2.536236391912908, + "grad_norm": 1.2402077231854491e-07, + "learning_rate": 8.575380359612726e-07, + "logits/chosen": -0.0989261269569397, + "logits/rejected": 4.3484086990356445, + "logps/chosen": -504.6379699707031, + "logps/rejected": -1159.091552734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.602766036987305, + "rewards/margins": 37.80310821533203, + "rewards/rejected": -46.40587615966797, + "step": 4077 + }, + { + "epoch": 2.536858475894246, + "grad_norm": 39.21305847167969, + "learning_rate": 8.563854310742279e-07, + "logits/chosen": 1.0437443256378174, + "logits/rejected": 2.75134539604187, + "logps/chosen": -677.68212890625, + "logps/rejected": -988.155517578125, + "loss": 0.4726, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.888240814208984, + "rewards/margins": 20.878808975219727, + "rewards/rejected": -29.76704978942871, + "step": 4078 + }, + { + "epoch": 2.537480559875583, + "grad_norm": 0.02777073159813881, + "learning_rate": 8.552328261871831e-07, + "logits/chosen": 1.8793046474456787, + "logits/rejected": 3.102606773376465, + "logps/chosen": -432.9072265625, + "logps/rejected": -698.9886474609375, + "loss": 0.0867, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.23219108581543, + "rewards/margins": 21.439613342285156, + "rewards/rejected": -29.67180633544922, + "step": 4079 + }, + { + "epoch": 2.5381026438569205, + "grad_norm": 0.00032307422952726483, + "learning_rate": 8.540802213001384e-07, + "logits/chosen": -0.17820918560028076, + "logits/rejected": 2.1350302696228027, + "logps/chosen": -531.5789794921875, + "logps/rejected": -941.7060546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.655010223388672, + "rewards/margins": 26.932619094848633, + "rewards/rejected": -34.58762741088867, + "step": 4080 + }, + { + "epoch": 2.5387247278382583, + "grad_norm": 0.33138206601142883, + "learning_rate": 8.529276164130937e-07, + "logits/chosen": -0.9489991664886475, + "logits/rejected": 3.2138314247131348, + "logps/chosen": -483.31146240234375, + "logps/rejected": -1007.58056640625, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.168546676635742, + "rewards/margins": 27.229930877685547, + "rewards/rejected": -34.39847946166992, + "step": 4081 + }, + { + "epoch": 2.5393468118195957, + "grad_norm": 0.010503578931093216, + "learning_rate": 8.51775011526049e-07, + "logits/chosen": 1.4573490619659424, + "logits/rejected": 2.668405771255493, + "logps/chosen": -647.7239990234375, + "logps/rejected": -1154.552001953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.74909782409668, + "rewards/margins": 39.35409927368164, + "rewards/rejected": -50.10319137573242, + "step": 4082 + }, + { + "epoch": 2.539968895800933, + "grad_norm": 0.0037981884088367224, + "learning_rate": 8.506224066390042e-07, + "logits/chosen": 0.5995239019393921, + "logits/rejected": 2.5143589973449707, + "logps/chosen": -560.736083984375, + "logps/rejected": -825.2479858398438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.262418746948242, + "rewards/margins": 21.219337463378906, + "rewards/rejected": -29.481754302978516, + "step": 4083 + }, + { + "epoch": 2.5405909797822703, + "grad_norm": 0.054800570011138916, + "learning_rate": 8.494698017519595e-07, + "logits/chosen": 3.720679521560669, + "logits/rejected": 3.344923973083496, + "logps/chosen": -735.9842529296875, + "logps/rejected": -956.2283325195312, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.82901668548584, + "rewards/margins": 21.535974502563477, + "rewards/rejected": -28.364992141723633, + "step": 4084 + }, + { + "epoch": 2.541213063763608, + "grad_norm": 5.5997816161834635e-06, + "learning_rate": 8.483171968649148e-07, + "logits/chosen": -2.129356622695923, + "logits/rejected": 2.2183260917663574, + "logps/chosen": -430.0462646484375, + "logps/rejected": -1063.35302734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.9142327308654785, + "rewards/margins": 35.044158935546875, + "rewards/rejected": -40.95839309692383, + "step": 4085 + }, + { + "epoch": 2.5418351477449455, + "grad_norm": 1.1461665630340576, + "learning_rate": 8.471645919778701e-07, + "logits/chosen": -1.286311149597168, + "logits/rejected": 0.3549751043319702, + "logps/chosen": -487.452392578125, + "logps/rejected": -805.5723876953125, + "loss": 0.0106, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.289743900299072, + "rewards/margins": 23.007722854614258, + "rewards/rejected": -30.297466278076172, + "step": 4086 + }, + { + "epoch": 2.5424572317262832, + "grad_norm": 0.21102778613567352, + "learning_rate": 8.460119870908253e-07, + "logits/chosen": -2.918485641479492, + "logits/rejected": 0.9315952062606812, + "logps/chosen": -469.7716064453125, + "logps/rejected": -934.5232543945312, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.402835845947266, + "rewards/margins": 27.498199462890625, + "rewards/rejected": -35.90103530883789, + "step": 4087 + }, + { + "epoch": 2.5430793157076206, + "grad_norm": 9.886176109313965, + "learning_rate": 8.448593822037807e-07, + "logits/chosen": 1.0713324546813965, + "logits/rejected": 2.689504861831665, + "logps/chosen": -556.312744140625, + "logps/rejected": -890.0252685546875, + "loss": 0.1282, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.405909538269043, + "rewards/margins": 25.110034942626953, + "rewards/rejected": -32.51594543457031, + "step": 4088 + }, + { + "epoch": 2.543701399688958, + "grad_norm": 1.8649814592208713e-05, + "learning_rate": 8.43706777316736e-07, + "logits/chosen": 0.28653520345687866, + "logits/rejected": 3.634840488433838, + "logps/chosen": -469.40191650390625, + "logps/rejected": -930.4893798828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.568150043487549, + "rewards/margins": 29.26486587524414, + "rewards/rejected": -35.83301544189453, + "step": 4089 + }, + { + "epoch": 2.5443234836702953, + "grad_norm": 0.00011260463361395523, + "learning_rate": 8.425541724296912e-07, + "logits/chosen": 2.7270326614379883, + "logits/rejected": 1.5111112594604492, + "logps/chosen": -816.8228149414062, + "logps/rejected": -1039.581298828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.286932945251465, + "rewards/margins": 25.909936904907227, + "rewards/rejected": -36.196868896484375, + "step": 4090 + }, + { + "epoch": 2.544945567651633, + "grad_norm": 0.0286729633808136, + "learning_rate": 8.414015675426465e-07, + "logits/chosen": 0.7567600011825562, + "logits/rejected": 2.3368992805480957, + "logps/chosen": -482.7024841308594, + "logps/rejected": -1020.5185546875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.911073684692383, + "rewards/margins": 35.30067443847656, + "rewards/rejected": -40.21174621582031, + "step": 4091 + }, + { + "epoch": 2.5455676516329704, + "grad_norm": 0.0016783374594524503, + "learning_rate": 8.402489626556018e-07, + "logits/chosen": 0.7466878294944763, + "logits/rejected": 3.6340603828430176, + "logps/chosen": -568.1629028320312, + "logps/rejected": -1043.47900390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.166143417358398, + "rewards/margins": 33.148521423339844, + "rewards/rejected": -41.314666748046875, + "step": 4092 + }, + { + "epoch": 2.546189735614308, + "grad_norm": 0.5343481302261353, + "learning_rate": 8.390963577685569e-07, + "logits/chosen": 2.2212343215942383, + "logits/rejected": 4.400791645050049, + "logps/chosen": -635.5879516601562, + "logps/rejected": -1042.9224853515625, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.624446868896484, + "rewards/margins": 27.347734451293945, + "rewards/rejected": -36.9721794128418, + "step": 4093 + }, + { + "epoch": 2.5468118195956455, + "grad_norm": 0.19887447357177734, + "learning_rate": 8.379437528815122e-07, + "logits/chosen": 0.01381150633096695, + "logits/rejected": 1.177337408065796, + "logps/chosen": -422.07208251953125, + "logps/rejected": -756.161865234375, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.822951793670654, + "rewards/margins": 22.88251495361328, + "rewards/rejected": -27.705467224121094, + "step": 4094 + }, + { + "epoch": 2.547433903576983, + "grad_norm": 0.007180861197412014, + "learning_rate": 8.367911479944676e-07, + "logits/chosen": 1.0952752828598022, + "logits/rejected": 2.7882447242736816, + "logps/chosen": -600.28759765625, + "logps/rejected": -1076.0926513671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.935996055603027, + "rewards/margins": 35.04020690917969, + "rewards/rejected": -44.976200103759766, + "step": 4095 + }, + { + "epoch": 2.54805598755832, + "grad_norm": 0.00010127165296580642, + "learning_rate": 8.356385431074228e-07, + "logits/chosen": 0.8541593551635742, + "logits/rejected": 3.7372541427612305, + "logps/chosen": -481.94366455078125, + "logps/rejected": -1099.032958984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.173066139221191, + "rewards/margins": 35.346473693847656, + "rewards/rejected": -41.51953887939453, + "step": 4096 + }, + { + "epoch": 2.548678071539658, + "grad_norm": 0.0006400636048056185, + "learning_rate": 8.344859382203781e-07, + "logits/chosen": -0.9102742671966553, + "logits/rejected": 2.529773712158203, + "logps/chosen": -532.9279174804688, + "logps/rejected": -923.9201049804688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2139296531677246, + "rewards/margins": 27.08374786376953, + "rewards/rejected": -30.297679901123047, + "step": 4097 + }, + { + "epoch": 2.5493001555209953, + "grad_norm": 0.0010316974949091673, + "learning_rate": 8.333333333333333e-07, + "logits/chosen": -0.8006272315979004, + "logits/rejected": 2.707360029220581, + "logps/chosen": -453.3794860839844, + "logps/rejected": -1052.589111328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.090571403503418, + "rewards/margins": 31.266225814819336, + "rewards/rejected": -40.35679626464844, + "step": 4098 + }, + { + "epoch": 2.5499222395023327, + "grad_norm": 0.002551464596763253, + "learning_rate": 8.321807284462887e-07, + "logits/chosen": -0.6266838312149048, + "logits/rejected": 3.1678671836853027, + "logps/chosen": -529.3460693359375, + "logps/rejected": -1069.198974609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.927661895751953, + "rewards/margins": 32.658512115478516, + "rewards/rejected": -44.58617401123047, + "step": 4099 + }, + { + "epoch": 2.5505443234836704, + "grad_norm": 3.0759070796193555e-05, + "learning_rate": 8.310281235592439e-07, + "logits/chosen": 1.4877252578735352, + "logits/rejected": 3.491295337677002, + "logps/chosen": -675.742431640625, + "logps/rejected": -984.6753540039062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.454578399658203, + "rewards/margins": 24.514137268066406, + "rewards/rejected": -32.96871566772461, + "step": 4100 + }, + { + "epoch": 2.551166407465008, + "grad_norm": 0.09172773361206055, + "learning_rate": 8.298755186721992e-07, + "logits/chosen": 3.627704620361328, + "logits/rejected": 4.454146385192871, + "logps/chosen": -761.8814697265625, + "logps/rejected": -997.3330688476562, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.802077293395996, + "rewards/margins": 21.68487548828125, + "rewards/rejected": -32.48695373535156, + "step": 4101 + }, + { + "epoch": 2.551788491446345, + "grad_norm": 0.2365218549966812, + "learning_rate": 8.287229137851544e-07, + "logits/chosen": 2.003429412841797, + "logits/rejected": 3.921147346496582, + "logps/chosen": -653.858642578125, + "logps/rejected": -891.6544189453125, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.306427955627441, + "rewards/margins": 15.111352920532227, + "rewards/rejected": -23.41777992248535, + "step": 4102 + }, + { + "epoch": 2.5524105754276825, + "grad_norm": 3.3804278700699797e-06, + "learning_rate": 8.275703088981098e-07, + "logits/chosen": 2.205820083618164, + "logits/rejected": 3.166776657104492, + "logps/chosen": -649.1009521484375, + "logps/rejected": -1086.208984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.009281158447266, + "rewards/margins": 33.14641189575195, + "rewards/rejected": -43.155696868896484, + "step": 4103 + }, + { + "epoch": 2.5530326594090202, + "grad_norm": 4.083518981933594, + "learning_rate": 8.264177040110651e-07, + "logits/chosen": 0.7583400011062622, + "logits/rejected": 2.963535785675049, + "logps/chosen": -558.18701171875, + "logps/rejected": -928.1401977539062, + "loss": 0.0201, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.89256763458252, + "rewards/margins": 23.71649932861328, + "rewards/rejected": -34.609066009521484, + "step": 4104 + }, + { + "epoch": 2.5536547433903576, + "grad_norm": 2.8601727990462678e-06, + "learning_rate": 8.252650991240203e-07, + "logits/chosen": -2.2715840339660645, + "logits/rejected": 4.363325119018555, + "logps/chosen": -292.86273193359375, + "logps/rejected": -984.2553100585938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.256927490234375, + "rewards/margins": 29.46300506591797, + "rewards/rejected": -35.71993637084961, + "step": 4105 + }, + { + "epoch": 2.5542768273716954, + "grad_norm": 38.109676361083984, + "learning_rate": 8.241124942369757e-07, + "logits/chosen": -0.6005755662918091, + "logits/rejected": 3.6361074447631836, + "logps/chosen": -520.59912109375, + "logps/rejected": -1044.981201171875, + "loss": 0.2634, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.620247840881348, + "rewards/margins": 26.426218032836914, + "rewards/rejected": -39.04646682739258, + "step": 4106 + }, + { + "epoch": 2.5548989113530327, + "grad_norm": 0.018291635438799858, + "learning_rate": 8.229598893499309e-07, + "logits/chosen": -0.0760377049446106, + "logits/rejected": 3.9962425231933594, + "logps/chosen": -563.3379516601562, + "logps/rejected": -1018.1607666015625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.269431114196777, + "rewards/margins": 21.766199111938477, + "rewards/rejected": -31.03563117980957, + "step": 4107 + }, + { + "epoch": 2.55552099533437, + "grad_norm": 0.09162867069244385, + "learning_rate": 8.218072844628862e-07, + "logits/chosen": 2.3309199810028076, + "logits/rejected": 5.0157623291015625, + "logps/chosen": -610.57568359375, + "logps/rejected": -1068.809814453125, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.771368980407715, + "rewards/margins": 27.12932586669922, + "rewards/rejected": -36.900691986083984, + "step": 4108 + }, + { + "epoch": 2.5561430793157074, + "grad_norm": 0.004627412185072899, + "learning_rate": 8.206546795758414e-07, + "logits/chosen": 1.278794765472412, + "logits/rejected": 3.1798346042633057, + "logps/chosen": -514.0498046875, + "logps/rejected": -854.1100463867188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.337404251098633, + "rewards/margins": 23.32642364501953, + "rewards/rejected": -32.66382598876953, + "step": 4109 + }, + { + "epoch": 2.556765163297045, + "grad_norm": 5.289896011352539, + "learning_rate": 8.195020746887968e-07, + "logits/chosen": -1.7098119258880615, + "logits/rejected": 1.4635165929794312, + "logps/chosen": -458.4467468261719, + "logps/rejected": -922.351806640625, + "loss": 0.0316, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.773290157318115, + "rewards/margins": 28.729084014892578, + "rewards/rejected": -34.502376556396484, + "step": 4110 + }, + { + "epoch": 2.5573872472783825, + "grad_norm": 17.51113510131836, + "learning_rate": 8.18349469801752e-07, + "logits/chosen": 0.880405843257904, + "logits/rejected": 2.761213541030884, + "logps/chosen": -465.3739318847656, + "logps/rejected": -771.1871337890625, + "loss": 0.1645, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.4234137535095215, + "rewards/margins": 18.73223876953125, + "rewards/rejected": -25.155654907226562, + "step": 4111 + }, + { + "epoch": 2.5580093312597203, + "grad_norm": 0.12192290276288986, + "learning_rate": 8.171968649147073e-07, + "logits/chosen": -0.6888011693954468, + "logits/rejected": 3.9244179725646973, + "logps/chosen": -356.2187194824219, + "logps/rejected": -837.59716796875, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.990688323974609, + "rewards/margins": 20.161766052246094, + "rewards/rejected": -26.152454376220703, + "step": 4112 + }, + { + "epoch": 2.5586314152410576, + "grad_norm": 0.01936480775475502, + "learning_rate": 8.160442600276625e-07, + "logits/chosen": -1.2800441980361938, + "logits/rejected": 1.316270112991333, + "logps/chosen": -446.74847412109375, + "logps/rejected": -940.384033203125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.286430835723877, + "rewards/margins": 28.183921813964844, + "rewards/rejected": -33.47035598754883, + "step": 4113 + }, + { + "epoch": 2.559253499222395, + "grad_norm": 0.00013975970796309412, + "learning_rate": 8.148916551406179e-07, + "logits/chosen": -1.8969745635986328, + "logits/rejected": 3.673656702041626, + "logps/chosen": -452.6147766113281, + "logps/rejected": -1037.419921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.33104419708252, + "rewards/margins": 31.03522491455078, + "rewards/rejected": -39.36627197265625, + "step": 4114 + }, + { + "epoch": 2.5598755832037323, + "grad_norm": 3.6346375509310747e-06, + "learning_rate": 8.137390502535732e-07, + "logits/chosen": -0.2831879258155823, + "logits/rejected": 4.18792200088501, + "logps/chosen": -457.58074951171875, + "logps/rejected": -1098.911376953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.230313301086426, + "rewards/margins": 34.850311279296875, + "rewards/rejected": -44.08062744140625, + "step": 4115 + }, + { + "epoch": 2.56049766718507, + "grad_norm": 0.004861139692366123, + "learning_rate": 8.125864453665284e-07, + "logits/chosen": 2.9572341442108154, + "logits/rejected": 3.5238685607910156, + "logps/chosen": -657.5555419921875, + "logps/rejected": -984.3204345703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.549543380737305, + "rewards/margins": 28.880319595336914, + "rewards/rejected": -38.42986297607422, + "step": 4116 + }, + { + "epoch": 2.5611197511664074, + "grad_norm": 5.274324621495907e-07, + "learning_rate": 8.114338404794838e-07, + "logits/chosen": -1.9102250337600708, + "logits/rejected": 2.5244147777557373, + "logps/chosen": -435.52679443359375, + "logps/rejected": -883.0416259765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.913174629211426, + "rewards/margins": 28.199817657470703, + "rewards/rejected": -35.11299133300781, + "step": 4117 + }, + { + "epoch": 2.561741835147745, + "grad_norm": 0.00010716221731854603, + "learning_rate": 8.10281235592439e-07, + "logits/chosen": -0.9655880331993103, + "logits/rejected": 2.9578256607055664, + "logps/chosen": -345.596435546875, + "logps/rejected": -819.7735595703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.622903347015381, + "rewards/margins": 25.935901641845703, + "rewards/rejected": -30.55880355834961, + "step": 4118 + }, + { + "epoch": 2.5623639191290826, + "grad_norm": 0.08033602684736252, + "learning_rate": 8.091286307053943e-07, + "logits/chosen": -2.8053464889526367, + "logits/rejected": 2.981839179992676, + "logps/chosen": -304.0949401855469, + "logps/rejected": -941.830322265625, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.969958305358887, + "rewards/margins": 36.09151077270508, + "rewards/rejected": -43.06147003173828, + "step": 4119 + }, + { + "epoch": 2.56298600311042, + "grad_norm": 7.09777232259512e-05, + "learning_rate": 8.079760258183495e-07, + "logits/chosen": -1.7569762468338013, + "logits/rejected": 1.9847595691680908, + "logps/chosen": -415.6924133300781, + "logps/rejected": -955.1690673828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.376535892486572, + "rewards/margins": 31.938705444335938, + "rewards/rejected": -37.31523895263672, + "step": 4120 + }, + { + "epoch": 2.5636080870917572, + "grad_norm": 0.000233715123613365, + "learning_rate": 8.068234209313049e-07, + "logits/chosen": -2.1115782260894775, + "logits/rejected": 1.553727388381958, + "logps/chosen": -362.45758056640625, + "logps/rejected": -961.16357421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.228207588195801, + "rewards/margins": 34.83417510986328, + "rewards/rejected": -39.0623779296875, + "step": 4121 + }, + { + "epoch": 2.564230171073095, + "grad_norm": 1.8830749988555908, + "learning_rate": 8.056708160442601e-07, + "logits/chosen": 3.103546142578125, + "logits/rejected": 3.7607216835021973, + "logps/chosen": -695.34033203125, + "logps/rejected": -887.8741455078125, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.275184631347656, + "rewards/margins": 15.883432388305664, + "rewards/rejected": -27.15861701965332, + "step": 4122 + }, + { + "epoch": 2.5648522550544324, + "grad_norm": 0.0042056795209646225, + "learning_rate": 8.045182111572154e-07, + "logits/chosen": 0.4300188720226288, + "logits/rejected": 3.1653892993927, + "logps/chosen": -509.19171142578125, + "logps/rejected": -915.0361938476562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.071253776550293, + "rewards/margins": 25.42142677307129, + "rewards/rejected": -32.492679595947266, + "step": 4123 + }, + { + "epoch": 2.5654743390357697, + "grad_norm": 1.0041102170944214, + "learning_rate": 8.033656062701708e-07, + "logits/chosen": -0.7266380190849304, + "logits/rejected": 1.9405543804168701, + "logps/chosen": -543.4242553710938, + "logps/rejected": -904.788330078125, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.144115447998047, + "rewards/margins": 25.701583862304688, + "rewards/rejected": -35.84569549560547, + "step": 4124 + }, + { + "epoch": 2.5660964230171075, + "grad_norm": 0.12149225175380707, + "learning_rate": 8.02213001383126e-07, + "logits/chosen": 1.6091845035552979, + "logits/rejected": 3.8638434410095215, + "logps/chosen": -571.7221069335938, + "logps/rejected": -1004.748291015625, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.035507202148438, + "rewards/margins": 25.663440704345703, + "rewards/rejected": -34.69894790649414, + "step": 4125 + }, + { + "epoch": 2.566718506998445, + "grad_norm": 2.1839077472686768, + "learning_rate": 8.010603964960813e-07, + "logits/chosen": 0.6656918525695801, + "logits/rejected": 4.665249824523926, + "logps/chosen": -567.3508911132812, + "logps/rejected": -1130.19580078125, + "loss": 0.0069, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.491843223571777, + "rewards/margins": 30.916309356689453, + "rewards/rejected": -39.40815734863281, + "step": 4126 + }, + { + "epoch": 2.567340590979782, + "grad_norm": 3.404394374229014e-05, + "learning_rate": 7.999077916090364e-07, + "logits/chosen": -1.7685790061950684, + "logits/rejected": 2.534496307373047, + "logps/chosen": -428.60205078125, + "logps/rejected": -940.8599853515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.921274185180664, + "rewards/margins": 29.669403076171875, + "rewards/rejected": -35.590675354003906, + "step": 4127 + }, + { + "epoch": 2.5679626749611195, + "grad_norm": 1.0856837034225464, + "learning_rate": 7.987551867219917e-07, + "logits/chosen": -0.630801796913147, + "logits/rejected": 0.9611508846282959, + "logps/chosen": -548.1029663085938, + "logps/rejected": -929.326416015625, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.630784511566162, + "rewards/margins": 31.183979034423828, + "rewards/rejected": -37.814762115478516, + "step": 4128 + }, + { + "epoch": 2.5685847589424573, + "grad_norm": 0.026657918468117714, + "learning_rate": 7.97602581834947e-07, + "logits/chosen": 0.9505861401557922, + "logits/rejected": 2.9759345054626465, + "logps/chosen": -468.04266357421875, + "logps/rejected": -777.418701171875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.444450855255127, + "rewards/margins": 22.632177352905273, + "rewards/rejected": -27.076627731323242, + "step": 4129 + }, + { + "epoch": 2.5692068429237946, + "grad_norm": 0.0002953787916339934, + "learning_rate": 7.964499769479023e-07, + "logits/chosen": -2.286724090576172, + "logits/rejected": 2.6744308471679688, + "logps/chosen": -414.42919921875, + "logps/rejected": -985.4376220703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.565918922424316, + "rewards/margins": 33.906394958496094, + "rewards/rejected": -40.472312927246094, + "step": 4130 + }, + { + "epoch": 2.5698289269051324, + "grad_norm": 4.818135721507133e-07, + "learning_rate": 7.952973720608575e-07, + "logits/chosen": 0.27260005474090576, + "logits/rejected": 4.570462226867676, + "logps/chosen": -350.6351013183594, + "logps/rejected": -968.77294921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.497164726257324, + "rewards/margins": 35.60032653808594, + "rewards/rejected": -41.09749221801758, + "step": 4131 + }, + { + "epoch": 2.5704510108864698, + "grad_norm": 5.349870207282947e-06, + "learning_rate": 7.941447671738129e-07, + "logits/chosen": -1.763448715209961, + "logits/rejected": 3.7824950218200684, + "logps/chosen": -398.9009704589844, + "logps/rejected": -1069.40380859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.347376823425293, + "rewards/margins": 31.30596351623535, + "rewards/rejected": -40.653343200683594, + "step": 4132 + }, + { + "epoch": 2.571073094867807, + "grad_norm": 0.0005738705513067544, + "learning_rate": 7.929921622867681e-07, + "logits/chosen": 2.2045533657073975, + "logits/rejected": 4.384173393249512, + "logps/chosen": -643.0028076171875, + "logps/rejected": -1008.7015380859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.411272048950195, + "rewards/margins": 25.97161865234375, + "rewards/rejected": -38.38289260864258, + "step": 4133 + }, + { + "epoch": 2.5716951788491444, + "grad_norm": 0.05061626806855202, + "learning_rate": 7.918395573997234e-07, + "logits/chosen": 0.16258734464645386, + "logits/rejected": 1.2322566509246826, + "logps/chosen": -564.4116821289062, + "logps/rejected": -876.4520263671875, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.655303955078125, + "rewards/margins": 24.78386878967285, + "rewards/rejected": -35.43917465209961, + "step": 4134 + }, + { + "epoch": 2.5723172628304822, + "grad_norm": 0.00019210392201784998, + "learning_rate": 7.906869525126787e-07, + "logits/chosen": 0.23870764672756195, + "logits/rejected": 3.414243698120117, + "logps/chosen": -517.4119262695312, + "logps/rejected": -1035.8192138671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.417430877685547, + "rewards/margins": 25.290889739990234, + "rewards/rejected": -32.70832061767578, + "step": 4135 + }, + { + "epoch": 2.5729393468118196, + "grad_norm": 5.645513738272712e-06, + "learning_rate": 7.89534347625634e-07, + "logits/chosen": -0.7859196066856384, + "logits/rejected": 4.74886417388916, + "logps/chosen": -418.10443115234375, + "logps/rejected": -1050.834716796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.610685348510742, + "rewards/margins": 30.351909637451172, + "rewards/rejected": -38.96259307861328, + "step": 4136 + }, + { + "epoch": 2.573561430793157, + "grad_norm": 2.9019644260406494, + "learning_rate": 7.883817427385892e-07, + "logits/chosen": 2.8801445960998535, + "logits/rejected": 3.751919746398926, + "logps/chosen": -731.3704833984375, + "logps/rejected": -1120.801025390625, + "loss": 0.017, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.350638389587402, + "rewards/margins": 25.67172622680664, + "rewards/rejected": -36.02236557006836, + "step": 4137 + }, + { + "epoch": 2.5741835147744947, + "grad_norm": 2.1129725524815512e-08, + "learning_rate": 7.872291378515445e-07, + "logits/chosen": -1.025829553604126, + "logits/rejected": 2.914036989212036, + "logps/chosen": -440.2637939453125, + "logps/rejected": -1089.4420166015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.31710147857666, + "rewards/margins": 42.682861328125, + "rewards/rejected": -50.999969482421875, + "step": 4138 + }, + { + "epoch": 2.574805598755832, + "grad_norm": 0.3254028856754303, + "learning_rate": 7.860765329644998e-07, + "logits/chosen": -0.7475310564041138, + "logits/rejected": 3.239452362060547, + "logps/chosen": -574.8551635742188, + "logps/rejected": -1061.8125, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.53398323059082, + "rewards/margins": 30.228036880493164, + "rewards/rejected": -40.762020111083984, + "step": 4139 + }, + { + "epoch": 2.5754276827371694, + "grad_norm": 2.3314505597227253e-06, + "learning_rate": 7.849239280774551e-07, + "logits/chosen": 1.8743324279785156, + "logits/rejected": 4.593924522399902, + "logps/chosen": -735.50341796875, + "logps/rejected": -1268.7764892578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.629220962524414, + "rewards/margins": 36.92953872680664, + "rewards/rejected": -47.55876159667969, + "step": 4140 + }, + { + "epoch": 2.576049766718507, + "grad_norm": 0.32990846037864685, + "learning_rate": 7.837713231904104e-07, + "logits/chosen": -0.5682883262634277, + "logits/rejected": 2.464709758758545, + "logps/chosen": -625.6953125, + "logps/rejected": -1091.140869140625, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.468364715576172, + "rewards/margins": 29.091880798339844, + "rewards/rejected": -40.560245513916016, + "step": 4141 + }, + { + "epoch": 2.5766718506998445, + "grad_norm": 0.0006232153391465545, + "learning_rate": 7.826187183033657e-07, + "logits/chosen": 0.9526250958442688, + "logits/rejected": 4.00775146484375, + "logps/chosen": -487.50927734375, + "logps/rejected": -968.1290283203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.43969440460205, + "rewards/margins": 27.86787986755371, + "rewards/rejected": -37.30757522583008, + "step": 4142 + }, + { + "epoch": 2.577293934681182, + "grad_norm": 8.784759870650305e-07, + "learning_rate": 7.81466113416321e-07, + "logits/chosen": -1.9092073440551758, + "logits/rejected": 1.6192662715911865, + "logps/chosen": -502.8148193359375, + "logps/rejected": -995.5576171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.434041976928711, + "rewards/margins": 29.768796920776367, + "rewards/rejected": -37.202842712402344, + "step": 4143 + }, + { + "epoch": 2.5779160186625196, + "grad_norm": 0.00010087468399433419, + "learning_rate": 7.803135085292762e-07, + "logits/chosen": 2.192214012145996, + "logits/rejected": 3.749067544937134, + "logps/chosen": -692.218505859375, + "logps/rejected": -1114.0401611328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.803261756896973, + "rewards/margins": 32.304405212402344, + "rewards/rejected": -40.107666015625, + "step": 4144 + }, + { + "epoch": 2.578538102643857, + "grad_norm": 2.3301174223888665e-05, + "learning_rate": 7.791609036422315e-07, + "logits/chosen": 0.2807917594909668, + "logits/rejected": 4.68834924697876, + "logps/chosen": -480.9891662597656, + "logps/rejected": -1048.3934326171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.968709945678711, + "rewards/margins": 30.439815521240234, + "rewards/rejected": -41.40852355957031, + "step": 4145 + }, + { + "epoch": 2.5791601866251943, + "grad_norm": 0.9815332889556885, + "learning_rate": 7.780082987551868e-07, + "logits/chosen": 0.13096857070922852, + "logits/rejected": 3.9499459266662598, + "logps/chosen": -571.0997924804688, + "logps/rejected": -1036.63134765625, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.803068161010742, + "rewards/margins": 27.141456604003906, + "rewards/rejected": -37.944522857666016, + "step": 4146 + }, + { + "epoch": 2.5797822706065316, + "grad_norm": 7.814910411834717, + "learning_rate": 7.768556938681421e-07, + "logits/chosen": 0.5178133249282837, + "logits/rejected": 1.794018268585205, + "logps/chosen": -532.0501098632812, + "logps/rejected": -758.7357177734375, + "loss": 0.0258, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.415571212768555, + "rewards/margins": 16.038002014160156, + "rewards/rejected": -24.453575134277344, + "step": 4147 + }, + { + "epoch": 2.5804043545878694, + "grad_norm": 2.3325019693487548e-09, + "learning_rate": 7.757030889810973e-07, + "logits/chosen": 0.5286740064620972, + "logits/rejected": 3.41743540763855, + "logps/chosen": -509.2735900878906, + "logps/rejected": -1076.5255126953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.825985908508301, + "rewards/margins": 37.71453094482422, + "rewards/rejected": -42.54051971435547, + "step": 4148 + }, + { + "epoch": 2.5810264385692068, + "grad_norm": 1.063908712239936e-06, + "learning_rate": 7.745504840940527e-07, + "logits/chosen": -1.1882922649383545, + "logits/rejected": 4.256172180175781, + "logps/chosen": -430.41729736328125, + "logps/rejected": -1153.875732421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.591100692749023, + "rewards/margins": 39.84674835205078, + "rewards/rejected": -48.437843322753906, + "step": 4149 + }, + { + "epoch": 2.5816485225505446, + "grad_norm": 1.0655003279680386e-05, + "learning_rate": 7.733978792070079e-07, + "logits/chosen": 0.24003244936466217, + "logits/rejected": 3.3438003063201904, + "logps/chosen": -398.0082092285156, + "logps/rejected": -968.8842163085938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.836625099182129, + "rewards/margins": 35.46302795410156, + "rewards/rejected": -44.29965591430664, + "step": 4150 + }, + { + "epoch": 2.582270606531882, + "grad_norm": 0.35538250207901, + "learning_rate": 7.722452743199632e-07, + "logits/chosen": -0.1121729165315628, + "logits/rejected": 3.873859405517578, + "logps/chosen": -515.9468994140625, + "logps/rejected": -1137.430419921875, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.515294075012207, + "rewards/margins": 38.027626037597656, + "rewards/rejected": -47.54292297363281, + "step": 4151 + }, + { + "epoch": 2.5828926905132192, + "grad_norm": 0.008247281424701214, + "learning_rate": 7.710926694329185e-07, + "logits/chosen": 0.6036416888237, + "logits/rejected": 3.62092924118042, + "logps/chosen": -544.2308349609375, + "logps/rejected": -1042.4071044921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.00452184677124, + "rewards/margins": 30.49679183959961, + "rewards/rejected": -35.501312255859375, + "step": 4152 + }, + { + "epoch": 2.5835147744945566, + "grad_norm": 0.019443973898887634, + "learning_rate": 7.699400645458738e-07, + "logits/chosen": -0.25185853242874146, + "logits/rejected": 3.4223010540008545, + "logps/chosen": -470.89715576171875, + "logps/rejected": -1007.6849365234375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.071162223815918, + "rewards/margins": 29.602224349975586, + "rewards/rejected": -34.67338562011719, + "step": 4153 + }, + { + "epoch": 2.5841368584758944, + "grad_norm": 0.03014482371509075, + "learning_rate": 7.687874596588291e-07, + "logits/chosen": 0.9726011157035828, + "logits/rejected": 3.408278226852417, + "logps/chosen": -525.7872314453125, + "logps/rejected": -966.32666015625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.844058990478516, + "rewards/margins": 27.53978729248047, + "rewards/rejected": -35.383846282958984, + "step": 4154 + }, + { + "epoch": 2.5847589424572317, + "grad_norm": 0.0024584888014942408, + "learning_rate": 7.676348547717843e-07, + "logits/chosen": -1.4146912097930908, + "logits/rejected": -0.32575997710227966, + "logps/chosen": -458.1917724609375, + "logps/rejected": -849.0999145507812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.201642990112305, + "rewards/margins": 30.501832962036133, + "rewards/rejected": -39.70347595214844, + "step": 4155 + }, + { + "epoch": 2.585381026438569, + "grad_norm": 1.9921230887121055e-07, + "learning_rate": 7.664822498847396e-07, + "logits/chosen": -1.1639642715454102, + "logits/rejected": 1.7085157632827759, + "logps/chosen": -621.3360595703125, + "logps/rejected": -1296.9610595703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.151021957397461, + "rewards/margins": 46.671287536621094, + "rewards/rejected": -54.82231140136719, + "step": 4156 + }, + { + "epoch": 2.586003110419907, + "grad_norm": 1.6931274091080972e-11, + "learning_rate": 7.653296449976949e-07, + "logits/chosen": -1.5741413831710815, + "logits/rejected": 2.825228214263916, + "logps/chosen": -515.9703369140625, + "logps/rejected": -1107.7664794921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.022581100463867, + "rewards/margins": 36.04176330566406, + "rewards/rejected": -40.0643424987793, + "step": 4157 + }, + { + "epoch": 2.586625194401244, + "grad_norm": 0.12946701049804688, + "learning_rate": 7.641770401106502e-07, + "logits/chosen": 1.9064527750015259, + "logits/rejected": 3.7687430381774902, + "logps/chosen": -698.0882568359375, + "logps/rejected": -973.91845703125, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.335290908813477, + "rewards/margins": 25.910003662109375, + "rewards/rejected": -35.24529266357422, + "step": 4158 + }, + { + "epoch": 2.5872472783825815, + "grad_norm": 28.270483016967773, + "learning_rate": 7.630244352236054e-07, + "logits/chosen": 1.3119711875915527, + "logits/rejected": 1.9648979902267456, + "logps/chosen": -762.5568237304688, + "logps/rejected": -919.1646728515625, + "loss": 0.1872, + "rewards/accuracies": 0.875, + "rewards/chosen": -12.725616455078125, + "rewards/margins": 16.814102172851562, + "rewards/rejected": -29.539718627929688, + "step": 4159 + }, + { + "epoch": 2.5878693623639193, + "grad_norm": 1.0774549991765525e-05, + "learning_rate": 7.618718303365608e-07, + "logits/chosen": 0.4979928135871887, + "logits/rejected": 3.370811700820923, + "logps/chosen": -485.9146423339844, + "logps/rejected": -1032.0020751953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.306400299072266, + "rewards/margins": 30.54342269897461, + "rewards/rejected": -38.849822998046875, + "step": 4160 + }, + { + "epoch": 2.5884914463452566, + "grad_norm": 0.6942704916000366, + "learning_rate": 7.607192254495159e-07, + "logits/chosen": 1.4790562391281128, + "logits/rejected": 4.277195930480957, + "logps/chosen": -449.6810302734375, + "logps/rejected": -966.2116088867188, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.123173713684082, + "rewards/margins": 26.700340270996094, + "rewards/rejected": -32.82351303100586, + "step": 4161 + }, + { + "epoch": 2.589113530326594, + "grad_norm": 0.10654214769601822, + "learning_rate": 7.595666205624712e-07, + "logits/chosen": -1.6439287662506104, + "logits/rejected": 4.115915298461914, + "logps/chosen": -498.05078125, + "logps/rejected": -1107.330810546875, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.457066059112549, + "rewards/margins": 31.892675399780273, + "rewards/rejected": -38.34973907470703, + "step": 4162 + }, + { + "epoch": 2.5897356143079318, + "grad_norm": 2.1090505697429762e-07, + "learning_rate": 7.584140156754264e-07, + "logits/chosen": 1.5892367362976074, + "logits/rejected": 3.2688088417053223, + "logps/chosen": -545.9234619140625, + "logps/rejected": -955.6949462890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.612250328063965, + "rewards/margins": 31.79043197631836, + "rewards/rejected": -38.402679443359375, + "step": 4163 + }, + { + "epoch": 2.590357698289269, + "grad_norm": 0.054204028099775314, + "learning_rate": 7.572614107883818e-07, + "logits/chosen": 1.239471673965454, + "logits/rejected": 2.5942583084106445, + "logps/chosen": -691.4280395507812, + "logps/rejected": -923.4697265625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.972702026367188, + "rewards/margins": 16.35270881652832, + "rewards/rejected": -25.325408935546875, + "step": 4164 + }, + { + "epoch": 2.5909797822706064, + "grad_norm": 0.05186247453093529, + "learning_rate": 7.56108805901337e-07, + "logits/chosen": 1.7655837535858154, + "logits/rejected": 3.956921100616455, + "logps/chosen": -590.0676879882812, + "logps/rejected": -1094.4619140625, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.105931282043457, + "rewards/margins": 33.443580627441406, + "rewards/rejected": -41.54951477050781, + "step": 4165 + }, + { + "epoch": 2.5916018662519438, + "grad_norm": 0.020832421258091927, + "learning_rate": 7.549562010142923e-07, + "logits/chosen": -0.06915748119354248, + "logits/rejected": 4.698666572570801, + "logps/chosen": -433.6332702636719, + "logps/rejected": -938.1735229492188, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.690306663513184, + "rewards/margins": 25.77753448486328, + "rewards/rejected": -32.46784210205078, + "step": 4166 + }, + { + "epoch": 2.5922239502332816, + "grad_norm": 0.0976177230477333, + "learning_rate": 7.538035961272477e-07, + "logits/chosen": 3.007206916809082, + "logits/rejected": 3.8235087394714355, + "logps/chosen": -775.51904296875, + "logps/rejected": -1105.434814453125, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.156403541564941, + "rewards/margins": 24.128873825073242, + "rewards/rejected": -34.2852783203125, + "step": 4167 + }, + { + "epoch": 2.592846034214619, + "grad_norm": 0.0019352661911398172, + "learning_rate": 7.526509912402029e-07, + "logits/chosen": -1.8974697589874268, + "logits/rejected": 3.135394334793091, + "logps/chosen": -375.7920837402344, + "logps/rejected": -905.7713012695312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.973499298095703, + "rewards/margins": 31.2205867767334, + "rewards/rejected": -37.19408416748047, + "step": 4168 + }, + { + "epoch": 2.5934681181959567, + "grad_norm": 0.020845483988523483, + "learning_rate": 7.514983863531582e-07, + "logits/chosen": 1.5700950622558594, + "logits/rejected": 4.825531482696533, + "logps/chosen": -542.24951171875, + "logps/rejected": -868.7842407226562, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.495453834533691, + "rewards/margins": 17.05913734436035, + "rewards/rejected": -24.55459213256836, + "step": 4169 + }, + { + "epoch": 2.594090202177294, + "grad_norm": 1.3278538801841933e-07, + "learning_rate": 7.503457814661134e-07, + "logits/chosen": -1.3590807914733887, + "logits/rejected": 3.1065785884857178, + "logps/chosen": -428.7196044921875, + "logps/rejected": -1092.657470703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.047929763793945, + "rewards/margins": 36.87749099731445, + "rewards/rejected": -44.92542266845703, + "step": 4170 + }, + { + "epoch": 2.5947122861586314, + "grad_norm": 0.5693565011024475, + "learning_rate": 7.491931765790688e-07, + "logits/chosen": -1.617149829864502, + "logits/rejected": 2.5822722911834717, + "logps/chosen": -352.824951171875, + "logps/rejected": -885.1978759765625, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.319247245788574, + "rewards/margins": 29.19968032836914, + "rewards/rejected": -36.51892852783203, + "step": 4171 + }, + { + "epoch": 2.5953343701399687, + "grad_norm": 0.016859617084264755, + "learning_rate": 7.48040571692024e-07, + "logits/chosen": -1.226083755493164, + "logits/rejected": 1.5435649156570435, + "logps/chosen": -426.6649169921875, + "logps/rejected": -858.7387084960938, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.303621292114258, + "rewards/margins": 23.738893508911133, + "rewards/rejected": -31.04251480102539, + "step": 4172 + }, + { + "epoch": 2.5959564541213065, + "grad_norm": 1.0266690830817993e-09, + "learning_rate": 7.468879668049793e-07, + "logits/chosen": -1.7223788499832153, + "logits/rejected": 3.0244596004486084, + "logps/chosen": -518.0752563476562, + "logps/rejected": -1203.6407470703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.524657249450684, + "rewards/margins": 37.38650131225586, + "rewards/rejected": -43.911155700683594, + "step": 4173 + }, + { + "epoch": 2.596578538102644, + "grad_norm": 1.537376556370873e-05, + "learning_rate": 7.457353619179345e-07, + "logits/chosen": -2.407674789428711, + "logits/rejected": 2.881885051727295, + "logps/chosen": -268.8407897949219, + "logps/rejected": -897.4033203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.9550981521606445, + "rewards/margins": 34.330726623535156, + "rewards/rejected": -39.285823822021484, + "step": 4174 + }, + { + "epoch": 2.5972006220839816, + "grad_norm": 7.735238614259288e-05, + "learning_rate": 7.445827570308899e-07, + "logits/chosen": -0.7990444898605347, + "logits/rejected": 3.784900665283203, + "logps/chosen": -430.090087890625, + "logps/rejected": -1010.958984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.116263389587402, + "rewards/margins": 29.52863311767578, + "rewards/rejected": -37.6448974609375, + "step": 4175 + }, + { + "epoch": 2.597822706065319, + "grad_norm": 0.20494690537452698, + "learning_rate": 7.434301521438451e-07, + "logits/chosen": -1.503658652305603, + "logits/rejected": 3.8516783714294434, + "logps/chosen": -463.29217529296875, + "logps/rejected": -1050.017578125, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.306408405303955, + "rewards/margins": 33.17970275878906, + "rewards/rejected": -39.486106872558594, + "step": 4176 + }, + { + "epoch": 2.5984447900466563, + "grad_norm": 0.01972721703350544, + "learning_rate": 7.422775472568004e-07, + "logits/chosen": 0.09046536684036255, + "logits/rejected": 4.269755840301514, + "logps/chosen": -529.8043212890625, + "logps/rejected": -1187.9677734375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.779526710510254, + "rewards/margins": 39.77001190185547, + "rewards/rejected": -49.54953384399414, + "step": 4177 + }, + { + "epoch": 2.5990668740279936, + "grad_norm": 0.051523786038160324, + "learning_rate": 7.411249423697558e-07, + "logits/chosen": 1.389330267906189, + "logits/rejected": 4.2536211013793945, + "logps/chosen": -682.2140502929688, + "logps/rejected": -1222.2752685546875, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.666873931884766, + "rewards/margins": 33.9172477722168, + "rewards/rejected": -46.58412170410156, + "step": 4178 + }, + { + "epoch": 2.5996889580093314, + "grad_norm": 0.0033380291424691677, + "learning_rate": 7.39972337482711e-07, + "logits/chosen": 1.5119990110397339, + "logits/rejected": 3.1918954849243164, + "logps/chosen": -627.9554443359375, + "logps/rejected": -1112.017822265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.91726303100586, + "rewards/margins": 32.120941162109375, + "rewards/rejected": -47.038204193115234, + "step": 4179 + }, + { + "epoch": 2.6003110419906688, + "grad_norm": 0.20266841351985931, + "learning_rate": 7.388197325956663e-07, + "logits/chosen": 1.38478422164917, + "logits/rejected": 3.9541900157928467, + "logps/chosen": -599.4830322265625, + "logps/rejected": -1026.7489013671875, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.972454071044922, + "rewards/margins": 31.13959503173828, + "rewards/rejected": -39.11205291748047, + "step": 4180 + }, + { + "epoch": 2.600933125972006, + "grad_norm": 35.50774002075195, + "learning_rate": 7.376671277086215e-07, + "logits/chosen": 0.7602481245994568, + "logits/rejected": 2.566112995147705, + "logps/chosen": -683.4918823242188, + "logps/rejected": -1027.614501953125, + "loss": 0.3128, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.960643768310547, + "rewards/margins": 21.349445343017578, + "rewards/rejected": -35.310089111328125, + "step": 4181 + }, + { + "epoch": 2.601555209953344, + "grad_norm": 0.0006896215490996838, + "learning_rate": 7.365145228215769e-07, + "logits/chosen": -0.33327698707580566, + "logits/rejected": 1.4698470830917358, + "logps/chosen": -489.9069519042969, + "logps/rejected": -799.717041015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.703483581542969, + "rewards/margins": 22.704559326171875, + "rewards/rejected": -32.408042907714844, + "step": 4182 + }, + { + "epoch": 2.6021772939346812, + "grad_norm": 1.4685786962509155, + "learning_rate": 7.353619179345321e-07, + "logits/chosen": 2.564486265182495, + "logits/rejected": 5.060420989990234, + "logps/chosen": -447.52593994140625, + "logps/rejected": -886.5484619140625, + "loss": 0.0381, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.470832347869873, + "rewards/margins": 30.235698699951172, + "rewards/rejected": -36.70652770996094, + "step": 4183 + }, + { + "epoch": 2.6027993779160186, + "grad_norm": 8.54348618304357e-05, + "learning_rate": 7.342093130474874e-07, + "logits/chosen": 1.4889253377914429, + "logits/rejected": 3.0097999572753906, + "logps/chosen": -586.3075561523438, + "logps/rejected": -1041.024658203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.637677192687988, + "rewards/margins": 31.820865631103516, + "rewards/rejected": -40.45854187011719, + "step": 4184 + }, + { + "epoch": 2.603421461897356, + "grad_norm": 13.519923210144043, + "learning_rate": 7.330567081604426e-07, + "logits/chosen": -0.6276093125343323, + "logits/rejected": 3.1709046363830566, + "logps/chosen": -478.00921630859375, + "logps/rejected": -950.1629028320312, + "loss": 0.0534, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.828385353088379, + "rewards/margins": 25.66582489013672, + "rewards/rejected": -33.49420928955078, + "step": 4185 + }, + { + "epoch": 2.6040435458786937, + "grad_norm": 1.037954278082509e-09, + "learning_rate": 7.31904103273398e-07, + "logits/chosen": -0.8776130080223083, + "logits/rejected": 3.4253296852111816, + "logps/chosen": -388.2559509277344, + "logps/rejected": -1020.8163452148438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.2152509689331055, + "rewards/margins": 38.56673049926758, + "rewards/rejected": -44.781982421875, + "step": 4186 + }, + { + "epoch": 2.604665629860031, + "grad_norm": 0.04148663580417633, + "learning_rate": 7.307514983863533e-07, + "logits/chosen": -2.193047046661377, + "logits/rejected": 2.826978921890259, + "logps/chosen": -340.1023254394531, + "logps/rejected": -961.1428833007812, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.069295883178711, + "rewards/margins": 31.567243576049805, + "rewards/rejected": -37.636539459228516, + "step": 4187 + }, + { + "epoch": 2.605287713841369, + "grad_norm": 0.0005553133087232709, + "learning_rate": 7.295988934993085e-07, + "logits/chosen": 0.2553267478942871, + "logits/rejected": 2.0529046058654785, + "logps/chosen": -596.5380249023438, + "logps/rejected": -1032.4805908203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.72260856628418, + "rewards/margins": 35.551048278808594, + "rewards/rejected": -46.27365493774414, + "step": 4188 + }, + { + "epoch": 2.605909797822706, + "grad_norm": 0.00013095911708660424, + "learning_rate": 7.284462886122639e-07, + "logits/chosen": 0.29065048694610596, + "logits/rejected": 3.1158859729766846, + "logps/chosen": -503.50836181640625, + "logps/rejected": -957.869384765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.637614727020264, + "rewards/margins": 27.772708892822266, + "rewards/rejected": -34.41032409667969, + "step": 4189 + }, + { + "epoch": 2.6065318818040435, + "grad_norm": 0.001143694738857448, + "learning_rate": 7.272936837252191e-07, + "logits/chosen": -0.21890854835510254, + "logits/rejected": 3.120697498321533, + "logps/chosen": -594.9986572265625, + "logps/rejected": -1057.1905517578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.957072257995605, + "rewards/margins": 29.239805221557617, + "rewards/rejected": -41.196876525878906, + "step": 4190 + }, + { + "epoch": 2.607153965785381, + "grad_norm": 9.123002087108034e-08, + "learning_rate": 7.261410788381744e-07, + "logits/chosen": 3.8174686431884766, + "logits/rejected": 4.417840003967285, + "logps/chosen": -737.0982055664062, + "logps/rejected": -1032.296142578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.874161720275879, + "rewards/margins": 29.188743591308594, + "rewards/rejected": -37.06290817260742, + "step": 4191 + }, + { + "epoch": 2.6077760497667186, + "grad_norm": 0.0028207304421812296, + "learning_rate": 7.249884739511296e-07, + "logits/chosen": -0.5841156244277954, + "logits/rejected": 2.005009889602661, + "logps/chosen": -495.2757568359375, + "logps/rejected": -946.2617797851562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.724778175354004, + "rewards/margins": 30.42086410522461, + "rewards/rejected": -39.14564514160156, + "step": 4192 + }, + { + "epoch": 2.608398133748056, + "grad_norm": 0.032835643738508224, + "learning_rate": 7.23835869064085e-07, + "logits/chosen": 0.36219626665115356, + "logits/rejected": 3.0236282348632812, + "logps/chosen": -582.063720703125, + "logps/rejected": -1049.0797119140625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.190608978271484, + "rewards/margins": 28.47592544555664, + "rewards/rejected": -38.666534423828125, + "step": 4193 + }, + { + "epoch": 2.6090202177293937, + "grad_norm": 9.084986651863858e-10, + "learning_rate": 7.226832641770402e-07, + "logits/chosen": -0.41260039806365967, + "logits/rejected": 3.4702682495117188, + "logps/chosen": -510.906982421875, + "logps/rejected": -1089.784912109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.021139144897461, + "rewards/margins": 36.50783157348633, + "rewards/rejected": -45.528968811035156, + "step": 4194 + }, + { + "epoch": 2.609642301710731, + "grad_norm": 0.00016271619824692607, + "learning_rate": 7.215306592899954e-07, + "logits/chosen": -1.4239130020141602, + "logits/rejected": 3.2922325134277344, + "logps/chosen": -454.74200439453125, + "logps/rejected": -1156.666748046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.663017272949219, + "rewards/margins": 35.54042053222656, + "rewards/rejected": -45.20343780517578, + "step": 4195 + }, + { + "epoch": 2.6102643856920684, + "grad_norm": 0.011077502742409706, + "learning_rate": 7.203780544029507e-07, + "logits/chosen": 0.8665391802787781, + "logits/rejected": 3.879478931427002, + "logps/chosen": -521.7965087890625, + "logps/rejected": -966.2922973632812, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.905708312988281, + "rewards/margins": 27.83869171142578, + "rewards/rejected": -36.74440002441406, + "step": 4196 + }, + { + "epoch": 2.6108864696734058, + "grad_norm": 3.622733856900595e-05, + "learning_rate": 7.19225449515906e-07, + "logits/chosen": 0.4364289343357086, + "logits/rejected": 3.5153493881225586, + "logps/chosen": -476.50048828125, + "logps/rejected": -972.5951538085938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.1321916580200195, + "rewards/margins": 33.4731330871582, + "rewards/rejected": -38.605323791503906, + "step": 4197 + }, + { + "epoch": 2.6115085536547435, + "grad_norm": 0.9528161883354187, + "learning_rate": 7.180728446288612e-07, + "logits/chosen": 2.507401943206787, + "logits/rejected": 3.8850791454315186, + "logps/chosen": -719.9276733398438, + "logps/rejected": -1035.9569091796875, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.24326229095459, + "rewards/margins": 26.770294189453125, + "rewards/rejected": -36.01355743408203, + "step": 4198 + }, + { + "epoch": 2.612130637636081, + "grad_norm": 1.709455204945698e-06, + "learning_rate": 7.169202397418165e-07, + "logits/chosen": -4.1853742599487305, + "logits/rejected": 3.332636594772339, + "logps/chosen": -352.78643798828125, + "logps/rejected": -1202.97021484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.660493850708008, + "rewards/margins": 36.10469055175781, + "rewards/rejected": -40.76518249511719, + "step": 4199 + }, + { + "epoch": 2.6127527216174182, + "grad_norm": 0.04437851160764694, + "learning_rate": 7.157676348547718e-07, + "logits/chosen": -1.551134467124939, + "logits/rejected": 1.4714164733886719, + "logps/chosen": -543.7584838867188, + "logps/rejected": -1100.864013671875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.794875621795654, + "rewards/margins": 33.39886474609375, + "rewards/rejected": -41.19374084472656, + "step": 4200 + }, + { + "epoch": 2.613374805598756, + "grad_norm": 2.520590305328369, + "learning_rate": 7.146150299677271e-07, + "logits/chosen": 0.9646257162094116, + "logits/rejected": 3.8723604679107666, + "logps/chosen": -569.6227416992188, + "logps/rejected": -960.802734375, + "loss": 0.0257, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.596766471862793, + "rewards/margins": 24.772981643676758, + "rewards/rejected": -33.3697509765625, + "step": 4201 + }, + { + "epoch": 2.6139968895800934, + "grad_norm": 0.025293413549661636, + "learning_rate": 7.134624250806823e-07, + "logits/chosen": 0.3296043872833252, + "logits/rejected": 3.7700836658477783, + "logps/chosen": -566.0994873046875, + "logps/rejected": -1046.325439453125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.096038818359375, + "rewards/margins": 27.5986328125, + "rewards/rejected": -36.694671630859375, + "step": 4202 + }, + { + "epoch": 2.6146189735614307, + "grad_norm": 0.2624208629131317, + "learning_rate": 7.123098201936376e-07, + "logits/chosen": 1.185504674911499, + "logits/rejected": 3.8389434814453125, + "logps/chosen": -528.7069702148438, + "logps/rejected": -997.303466796875, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.241028785705566, + "rewards/margins": 23.622182846069336, + "rewards/rejected": -31.86321258544922, + "step": 4203 + }, + { + "epoch": 2.615241057542768, + "grad_norm": 0.08798151463270187, + "learning_rate": 7.11157215306593e-07, + "logits/chosen": -4.298510551452637, + "logits/rejected": 1.096667766571045, + "logps/chosen": -258.92425537109375, + "logps/rejected": -887.087646484375, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.425273895263672, + "rewards/margins": 31.15479278564453, + "rewards/rejected": -35.5800666809082, + "step": 4204 + }, + { + "epoch": 2.615863141524106, + "grad_norm": 0.2985028326511383, + "learning_rate": 7.100046104195482e-07, + "logits/chosen": 1.4374034404754639, + "logits/rejected": 4.240245342254639, + "logps/chosen": -624.4095458984375, + "logps/rejected": -1162.5343017578125, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.385326385498047, + "rewards/margins": 32.3258056640625, + "rewards/rejected": -40.71113204956055, + "step": 4205 + }, + { + "epoch": 2.616485225505443, + "grad_norm": 16.65906524658203, + "learning_rate": 7.088520055325035e-07, + "logits/chosen": 1.678907036781311, + "logits/rejected": 4.096469402313232, + "logps/chosen": -486.4000244140625, + "logps/rejected": -908.15576171875, + "loss": 0.0819, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.184075355529785, + "rewards/margins": 24.814620971679688, + "rewards/rejected": -29.998699188232422, + "step": 4206 + }, + { + "epoch": 2.617107309486781, + "grad_norm": 0.00014799633936490864, + "learning_rate": 7.076994006454588e-07, + "logits/chosen": 0.9702072739601135, + "logits/rejected": 3.7747740745544434, + "logps/chosen": -652.5638427734375, + "logps/rejected": -1131.428955078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.863973617553711, + "rewards/margins": 33.608184814453125, + "rewards/rejected": -42.47215270996094, + "step": 4207 + }, + { + "epoch": 2.6177293934681183, + "grad_norm": 8.743905345909297e-05, + "learning_rate": 7.065467957584141e-07, + "logits/chosen": 2.3410634994506836, + "logits/rejected": 2.0488579273223877, + "logps/chosen": -680.400146484375, + "logps/rejected": -1035.8314208984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.8915433883667, + "rewards/margins": 28.13915252685547, + "rewards/rejected": -37.030696868896484, + "step": 4208 + }, + { + "epoch": 2.6183514774494556, + "grad_norm": 5.205459956414416e-07, + "learning_rate": 7.053941908713693e-07, + "logits/chosen": -0.7137738466262817, + "logits/rejected": 3.436619997024536, + "logps/chosen": -405.1341247558594, + "logps/rejected": -900.43310546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.368278980255127, + "rewards/margins": 26.511754989624023, + "rewards/rejected": -31.880033493041992, + "step": 4209 + }, + { + "epoch": 2.618973561430793, + "grad_norm": 0.01867208629846573, + "learning_rate": 7.042415859843246e-07, + "logits/chosen": -0.14740872383117676, + "logits/rejected": 0.6668645739555359, + "logps/chosen": -590.059326171875, + "logps/rejected": -878.8848876953125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.67483901977539, + "rewards/margins": 27.501344680786133, + "rewards/rejected": -36.176185607910156, + "step": 4210 + }, + { + "epoch": 2.6195956454121307, + "grad_norm": 0.36106786131858826, + "learning_rate": 7.030889810972799e-07, + "logits/chosen": -1.5582423210144043, + "logits/rejected": 1.5341298580169678, + "logps/chosen": -435.0976257324219, + "logps/rejected": -801.2057495117188, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.826573371887207, + "rewards/margins": 23.22080421447754, + "rewards/rejected": -32.04737854003906, + "step": 4211 + }, + { + "epoch": 2.620217729393468, + "grad_norm": 0.06946823000907898, + "learning_rate": 7.019363762102352e-07, + "logits/chosen": -0.030036628246307373, + "logits/rejected": 3.2860264778137207, + "logps/chosen": -477.8688049316406, + "logps/rejected": -938.0089111328125, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.1399126052856445, + "rewards/margins": 25.384279251098633, + "rewards/rejected": -30.524192810058594, + "step": 4212 + }, + { + "epoch": 2.620839813374806, + "grad_norm": 4.984333038330078, + "learning_rate": 7.007837713231904e-07, + "logits/chosen": -2.0527803897857666, + "logits/rejected": 2.763148307800293, + "logps/chosen": -382.4664306640625, + "logps/rejected": -986.9502563476562, + "loss": 0.0232, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.928335189819336, + "rewards/margins": 27.36969757080078, + "rewards/rejected": -35.298030853271484, + "step": 4213 + }, + { + "epoch": 2.621461897356143, + "grad_norm": 0.27239108085632324, + "learning_rate": 6.996311664361458e-07, + "logits/chosen": 0.09156519174575806, + "logits/rejected": 4.214585304260254, + "logps/chosen": -508.62384033203125, + "logps/rejected": -1109.5693359375, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.311394691467285, + "rewards/margins": 32.705902099609375, + "rewards/rejected": -43.017295837402344, + "step": 4214 + }, + { + "epoch": 2.6220839813374806, + "grad_norm": 5.2649149438366294e-05, + "learning_rate": 6.984785615491011e-07, + "logits/chosen": -0.7687242031097412, + "logits/rejected": 2.682828903198242, + "logps/chosen": -498.4187927246094, + "logps/rejected": -976.85107421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.1673665046691895, + "rewards/margins": 24.873815536499023, + "rewards/rejected": -32.04118347167969, + "step": 4215 + }, + { + "epoch": 2.622706065318818, + "grad_norm": 0.0028433254919946194, + "learning_rate": 6.973259566620563e-07, + "logits/chosen": 0.5688213109970093, + "logits/rejected": 1.6913548707962036, + "logps/chosen": -529.4207763671875, + "logps/rejected": -961.5525512695312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.866388320922852, + "rewards/margins": 31.74820327758789, + "rewards/rejected": -40.61458969116211, + "step": 4216 + }, + { + "epoch": 2.6233281493001557, + "grad_norm": 0.11423134058713913, + "learning_rate": 6.961733517750116e-07, + "logits/chosen": 1.4282294511795044, + "logits/rejected": 3.881552219390869, + "logps/chosen": -537.96533203125, + "logps/rejected": -983.5394897460938, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.764850616455078, + "rewards/margins": 26.096805572509766, + "rewards/rejected": -34.86166000366211, + "step": 4217 + }, + { + "epoch": 2.623950233281493, + "grad_norm": 2.352363480895292e-05, + "learning_rate": 6.950207468879669e-07, + "logits/chosen": 1.5548824071884155, + "logits/rejected": 3.7878293991088867, + "logps/chosen": -506.8604736328125, + "logps/rejected": -974.7597045898438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.410656929016113, + "rewards/margins": 29.260358810424805, + "rewards/rejected": -36.671016693115234, + "step": 4218 + }, + { + "epoch": 2.6245723172628304, + "grad_norm": 0.002875835634768009, + "learning_rate": 6.938681420009222e-07, + "logits/chosen": 1.1211720705032349, + "logits/rejected": 1.8514307737350464, + "logps/chosen": -586.6343994140625, + "logps/rejected": -912.5908813476562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.89848518371582, + "rewards/margins": 24.800308227539062, + "rewards/rejected": -35.698795318603516, + "step": 4219 + }, + { + "epoch": 2.625194401244168, + "grad_norm": 29.071430206298828, + "learning_rate": 6.927155371138774e-07, + "logits/chosen": 1.4410120248794556, + "logits/rejected": 5.300469875335693, + "logps/chosen": -619.1028442382812, + "logps/rejected": -1141.4307861328125, + "loss": 0.1668, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.94244384765625, + "rewards/margins": 26.28636932373047, + "rewards/rejected": -38.22881317138672, + "step": 4220 + }, + { + "epoch": 2.6258164852255055, + "grad_norm": 0.12146138399839401, + "learning_rate": 6.915629322268328e-07, + "logits/chosen": 0.5489065051078796, + "logits/rejected": 2.0479743480682373, + "logps/chosen": -506.0579833984375, + "logps/rejected": -892.6707153320312, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.431719779968262, + "rewards/margins": 28.322105407714844, + "rewards/rejected": -35.75382614135742, + "step": 4221 + }, + { + "epoch": 2.626438569206843, + "grad_norm": 48.736846923828125, + "learning_rate": 6.90410327339788e-07, + "logits/chosen": -0.009646058082580566, + "logits/rejected": 3.260806083679199, + "logps/chosen": -595.076171875, + "logps/rejected": -1000.2465209960938, + "loss": 0.2363, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.931441307067871, + "rewards/margins": 22.281841278076172, + "rewards/rejected": -33.21328353881836, + "step": 4222 + }, + { + "epoch": 2.62706065318818, + "grad_norm": 0.11191926151514053, + "learning_rate": 6.892577224527433e-07, + "logits/chosen": 0.5235657691955566, + "logits/rejected": 1.8013144731521606, + "logps/chosen": -515.4962158203125, + "logps/rejected": -802.4577026367188, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.3546319007873535, + "rewards/margins": 20.22857093811035, + "rewards/rejected": -26.583202362060547, + "step": 4223 + }, + { + "epoch": 2.627682737169518, + "grad_norm": 6.194474266862926e-09, + "learning_rate": 6.881051175656986e-07, + "logits/chosen": -1.7713825702667236, + "logits/rejected": 0.584291398525238, + "logps/chosen": -435.9535827636719, + "logps/rejected": -888.5193481445312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.4394989013671875, + "rewards/margins": 29.70003318786621, + "rewards/rejected": -37.13953399658203, + "step": 4224 + }, + { + "epoch": 2.6283048211508553, + "grad_norm": 0.0005869278102181852, + "learning_rate": 6.869525126786539e-07, + "logits/chosen": 0.9013730883598328, + "logits/rejected": 3.2150769233703613, + "logps/chosen": -431.56793212890625, + "logps/rejected": -943.7103271484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.473167419433594, + "rewards/margins": 33.50733184814453, + "rewards/rejected": -41.980499267578125, + "step": 4225 + }, + { + "epoch": 2.628926905132193, + "grad_norm": 3.5658481121063232, + "learning_rate": 6.857999077916092e-07, + "logits/chosen": 1.838148832321167, + "logits/rejected": 2.4090681076049805, + "logps/chosen": -568.6758422851562, + "logps/rejected": -890.5926513671875, + "loss": 0.0123, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.6999053955078125, + "rewards/margins": 27.453750610351562, + "rewards/rejected": -35.153656005859375, + "step": 4226 + }, + { + "epoch": 2.6295489891135304, + "grad_norm": 0.2820209562778473, + "learning_rate": 6.846473029045644e-07, + "logits/chosen": -0.5001500844955444, + "logits/rejected": 3.3401925563812256, + "logps/chosen": -506.65838623046875, + "logps/rejected": -1072.645751953125, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.734173774719238, + "rewards/margins": 38.09546661376953, + "rewards/rejected": -44.82963943481445, + "step": 4227 + }, + { + "epoch": 2.6301710730948678, + "grad_norm": 0.4728274345397949, + "learning_rate": 6.834946980175195e-07, + "logits/chosen": 1.5491831302642822, + "logits/rejected": 2.317342519760132, + "logps/chosen": -675.6054077148438, + "logps/rejected": -868.92236328125, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.160175323486328, + "rewards/margins": 17.74323844909668, + "rewards/rejected": -26.903411865234375, + "step": 4228 + }, + { + "epoch": 2.630793157076205, + "grad_norm": 9.116470336914062, + "learning_rate": 6.823420931304749e-07, + "logits/chosen": -1.4375827312469482, + "logits/rejected": 2.915127754211426, + "logps/chosen": -476.7994384765625, + "logps/rejected": -1074.4156494140625, + "loss": 0.0514, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.458136558532715, + "rewards/margins": 31.899354934692383, + "rewards/rejected": -40.35749053955078, + "step": 4229 + }, + { + "epoch": 2.631415241057543, + "grad_norm": 5.379433787311427e-07, + "learning_rate": 6.811894882434302e-07, + "logits/chosen": -0.5646690130233765, + "logits/rejected": 2.736670970916748, + "logps/chosen": -390.4752197265625, + "logps/rejected": -953.4425048828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.021817684173584, + "rewards/margins": 34.986270904541016, + "rewards/rejected": -41.008087158203125, + "step": 4230 + }, + { + "epoch": 2.63203732503888, + "grad_norm": 0.08292002230882645, + "learning_rate": 6.800368833563854e-07, + "logits/chosen": -0.05222213268280029, + "logits/rejected": 3.956101179122925, + "logps/chosen": -325.4035339355469, + "logps/rejected": -900.9954833984375, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.976347923278809, + "rewards/margins": 30.259437561035156, + "rewards/rejected": -36.23578643798828, + "step": 4231 + }, + { + "epoch": 2.632659409020218, + "grad_norm": 0.009427044540643692, + "learning_rate": 6.788842784693408e-07, + "logits/chosen": 1.4390075206756592, + "logits/rejected": 3.480053186416626, + "logps/chosen": -663.7787475585938, + "logps/rejected": -959.8733520507812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.171966552734375, + "rewards/margins": 21.327939987182617, + "rewards/rejected": -31.49990463256836, + "step": 4232 + }, + { + "epoch": 2.6332814930015553, + "grad_norm": 0.06294838339090347, + "learning_rate": 6.77731673582296e-07, + "logits/chosen": -0.3899800181388855, + "logits/rejected": 3.326247215270996, + "logps/chosen": -421.4250793457031, + "logps/rejected": -1036.8131103515625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.207780361175537, + "rewards/margins": 35.97682189941406, + "rewards/rejected": -40.184600830078125, + "step": 4233 + }, + { + "epoch": 2.6339035769828927, + "grad_norm": 0.1459599733352661, + "learning_rate": 6.765790686952513e-07, + "logits/chosen": 0.2932206392288208, + "logits/rejected": 3.4834704399108887, + "logps/chosen": -529.5743408203125, + "logps/rejected": -1094.4615478515625, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.644756317138672, + "rewards/margins": 33.56782150268555, + "rewards/rejected": -46.21257781982422, + "step": 4234 + }, + { + "epoch": 2.63452566096423, + "grad_norm": 3.442986098889378e-06, + "learning_rate": 6.754264638082065e-07, + "logits/chosen": -1.2240034341812134, + "logits/rejected": 2.0704448223114014, + "logps/chosen": -433.75238037109375, + "logps/rejected": -1079.27978515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.548064231872559, + "rewards/margins": 40.05144119262695, + "rewards/rejected": -45.59950637817383, + "step": 4235 + }, + { + "epoch": 2.635147744945568, + "grad_norm": 2.317560911178589, + "learning_rate": 6.742738589211619e-07, + "logits/chosen": -0.23785221576690674, + "logits/rejected": 4.1717119216918945, + "logps/chosen": -499.7503662109375, + "logps/rejected": -1082.9619140625, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.164165496826172, + "rewards/margins": 29.362163543701172, + "rewards/rejected": -35.526329040527344, + "step": 4236 + }, + { + "epoch": 2.635769828926905, + "grad_norm": 0.22036594152450562, + "learning_rate": 6.731212540341171e-07, + "logits/chosen": 1.2997395992279053, + "logits/rejected": 2.5405406951904297, + "logps/chosen": -779.3462524414062, + "logps/rejected": -1011.1041870117188, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.838093757629395, + "rewards/margins": 20.889070510864258, + "rewards/rejected": -34.72716522216797, + "step": 4237 + }, + { + "epoch": 2.6363919129082425, + "grad_norm": 0.002922237850725651, + "learning_rate": 6.719686491470724e-07, + "logits/chosen": -0.7122418880462646, + "logits/rejected": 2.964954376220703, + "logps/chosen": -419.7462463378906, + "logps/rejected": -898.150634765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.0358076095581055, + "rewards/margins": 24.98333168029785, + "rewards/rejected": -31.019140243530273, + "step": 4238 + }, + { + "epoch": 2.6370139968895803, + "grad_norm": 0.96134352684021, + "learning_rate": 6.708160442600276e-07, + "logits/chosen": 0.594042181968689, + "logits/rejected": 2.318164348602295, + "logps/chosen": -532.7374877929688, + "logps/rejected": -886.8975830078125, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.02415657043457, + "rewards/margins": 23.79339599609375, + "rewards/rejected": -35.81755447387695, + "step": 4239 + }, + { + "epoch": 2.6376360808709176, + "grad_norm": 44.25870132446289, + "learning_rate": 6.69663439372983e-07, + "logits/chosen": 2.3529224395751953, + "logits/rejected": 1.3345285654067993, + "logps/chosen": -757.313232421875, + "logps/rejected": -974.56298828125, + "loss": 0.6363, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.141068458557129, + "rewards/margins": 16.502092361450195, + "rewards/rejected": -29.643163681030273, + "step": 4240 + }, + { + "epoch": 2.638258164852255, + "grad_norm": 22.81918716430664, + "learning_rate": 6.685108344859383e-07, + "logits/chosen": 2.233950614929199, + "logits/rejected": 3.8848304748535156, + "logps/chosen": -753.2386474609375, + "logps/rejected": -1093.29833984375, + "loss": 0.1278, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.840153694152832, + "rewards/margins": 23.329586029052734, + "rewards/rejected": -34.16973876953125, + "step": 4241 + }, + { + "epoch": 2.6388802488335923, + "grad_norm": 0.006664901971817017, + "learning_rate": 6.673582295988935e-07, + "logits/chosen": 0.8757166266441345, + "logits/rejected": 1.6427512168884277, + "logps/chosen": -549.896728515625, + "logps/rejected": -811.0697631835938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.502927780151367, + "rewards/margins": 22.282499313354492, + "rewards/rejected": -30.78542709350586, + "step": 4242 + }, + { + "epoch": 2.63950233281493, + "grad_norm": 0.22620686888694763, + "learning_rate": 6.662056247118489e-07, + "logits/chosen": 0.053068965673446655, + "logits/rejected": 2.0805678367614746, + "logps/chosen": -502.27984619140625, + "logps/rejected": -819.5906982421875, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.042535781860352, + "rewards/margins": 20.76211166381836, + "rewards/rejected": -29.804649353027344, + "step": 4243 + }, + { + "epoch": 2.6401244167962674, + "grad_norm": 0.23999188840389252, + "learning_rate": 6.650530198248041e-07, + "logits/chosen": 2.5088069438934326, + "logits/rejected": 3.2695717811584473, + "logps/chosen": -784.7813110351562, + "logps/rejected": -889.4437255859375, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.6802592277526855, + "rewards/margins": 17.840471267700195, + "rewards/rejected": -25.520729064941406, + "step": 4244 + }, + { + "epoch": 2.640746500777605, + "grad_norm": 0.001498526893556118, + "learning_rate": 6.639004149377594e-07, + "logits/chosen": 1.8689944744110107, + "logits/rejected": 4.283603191375732, + "logps/chosen": -619.4450073242188, + "logps/rejected": -1062.1727294921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.174517631530762, + "rewards/margins": 32.78430938720703, + "rewards/rejected": -42.95882797241211, + "step": 4245 + }, + { + "epoch": 2.6413685847589425, + "grad_norm": 1.6747339032008313e-05, + "learning_rate": 6.627478100507146e-07, + "logits/chosen": 1.833518385887146, + "logits/rejected": 3.115447759628296, + "logps/chosen": -584.89404296875, + "logps/rejected": -947.2235717773438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.330589294433594, + "rewards/margins": 30.510927200317383, + "rewards/rejected": -37.841514587402344, + "step": 4246 + }, + { + "epoch": 2.64199066874028, + "grad_norm": 0.00851528998464346, + "learning_rate": 6.6159520516367e-07, + "logits/chosen": -2.964029312133789, + "logits/rejected": 2.7679834365844727, + "logps/chosen": -350.8724365234375, + "logps/rejected": -992.7243041992188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.659777641296387, + "rewards/margins": 33.55049133300781, + "rewards/rejected": -42.21027374267578, + "step": 4247 + }, + { + "epoch": 2.642612752721617, + "grad_norm": 0.0012625380186364055, + "learning_rate": 6.604426002766252e-07, + "logits/chosen": -0.7499178051948547, + "logits/rejected": 2.6562108993530273, + "logps/chosen": -449.5132751464844, + "logps/rejected": -996.1448364257812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.215665817260742, + "rewards/margins": 30.846622467041016, + "rewards/rejected": -36.062286376953125, + "step": 4248 + }, + { + "epoch": 2.643234836702955, + "grad_norm": 9.423551716736256e-09, + "learning_rate": 6.592899953895805e-07, + "logits/chosen": -0.9357733130455017, + "logits/rejected": 3.503596067428589, + "logps/chosen": -386.92279052734375, + "logps/rejected": -1057.938232421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.58002233505249, + "rewards/margins": 38.17155838012695, + "rewards/rejected": -45.75157928466797, + "step": 4249 + }, + { + "epoch": 2.6438569206842923, + "grad_norm": 0.0967218279838562, + "learning_rate": 6.581373905025359e-07, + "logits/chosen": 0.3321017622947693, + "logits/rejected": 2.5047409534454346, + "logps/chosen": -614.6363525390625, + "logps/rejected": -1019.7175903320312, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.16756534576416, + "rewards/margins": 29.467573165893555, + "rewards/rejected": -39.63513946533203, + "step": 4250 + }, + { + "epoch": 2.64447900466563, + "grad_norm": 0.010432337410748005, + "learning_rate": 6.569847856154911e-07, + "logits/chosen": -0.8032090663909912, + "logits/rejected": 1.1724261045455933, + "logps/chosen": -439.4854736328125, + "logps/rejected": -797.3453369140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.029415607452393, + "rewards/margins": 22.670787811279297, + "rewards/rejected": -29.70020294189453, + "step": 4251 + }, + { + "epoch": 2.6451010886469675, + "grad_norm": 0.0005230961251072586, + "learning_rate": 6.558321807284464e-07, + "logits/chosen": -1.4568581581115723, + "logits/rejected": 2.3403358459472656, + "logps/chosen": -377.98309326171875, + "logps/rejected": -865.370361328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.655060291290283, + "rewards/margins": 29.115367889404297, + "rewards/rejected": -35.77043151855469, + "step": 4252 + }, + { + "epoch": 2.645723172628305, + "grad_norm": 0.31717145442962646, + "learning_rate": 6.546795758414016e-07, + "logits/chosen": -1.226088285446167, + "logits/rejected": 2.644490957260132, + "logps/chosen": -423.13421630859375, + "logps/rejected": -875.599853515625, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.5810723304748535, + "rewards/margins": 20.505836486816406, + "rewards/rejected": -28.086910247802734, + "step": 4253 + }, + { + "epoch": 2.646345256609642, + "grad_norm": 0.00606823805719614, + "learning_rate": 6.53526970954357e-07, + "logits/chosen": -1.1096882820129395, + "logits/rejected": 3.2076497077941895, + "logps/chosen": -490.0120849609375, + "logps/rejected": -1097.8856201171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.587215900421143, + "rewards/margins": 33.65058898925781, + "rewards/rejected": -41.23780059814453, + "step": 4254 + }, + { + "epoch": 2.64696734059098, + "grad_norm": 0.004925860557705164, + "learning_rate": 6.523743660673122e-07, + "logits/chosen": -1.5207152366638184, + "logits/rejected": 0.664267361164093, + "logps/chosen": -440.29119873046875, + "logps/rejected": -877.4176025390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.222831726074219, + "rewards/margins": 26.639625549316406, + "rewards/rejected": -35.862457275390625, + "step": 4255 + }, + { + "epoch": 2.6475894245723173, + "grad_norm": 20.94559097290039, + "learning_rate": 6.512217611802675e-07, + "logits/chosen": -0.6214895248413086, + "logits/rejected": 2.9224040508270264, + "logps/chosen": -532.4989013671875, + "logps/rejected": -1065.483642578125, + "loss": 0.1216, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.967597007751465, + "rewards/margins": 25.820096969604492, + "rewards/rejected": -35.78769302368164, + "step": 4256 + }, + { + "epoch": 2.6482115085536546, + "grad_norm": 0.10524414479732513, + "learning_rate": 6.500691562932227e-07, + "logits/chosen": 1.6900782585144043, + "logits/rejected": 5.192551136016846, + "logps/chosen": -601.140625, + "logps/rejected": -1032.5745849609375, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.735567092895508, + "rewards/margins": 22.786855697631836, + "rewards/rejected": -37.522422790527344, + "step": 4257 + }, + { + "epoch": 2.6488335925349924, + "grad_norm": 0.00883454643189907, + "learning_rate": 6.489165514061781e-07, + "logits/chosen": 1.3252453804016113, + "logits/rejected": 3.5295932292938232, + "logps/chosen": -600.4019775390625, + "logps/rejected": -1145.2413330078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.716447830200195, + "rewards/margins": 39.48173141479492, + "rewards/rejected": -48.19818115234375, + "step": 4258 + }, + { + "epoch": 2.6494556765163297, + "grad_norm": 1.2012814295303542e-05, + "learning_rate": 6.477639465191333e-07, + "logits/chosen": -1.301712155342102, + "logits/rejected": 3.1368601322174072, + "logps/chosen": -518.835205078125, + "logps/rejected": -1065.5616455078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.9481201171875, + "rewards/margins": 32.27141571044922, + "rewards/rejected": -40.219539642333984, + "step": 4259 + }, + { + "epoch": 2.650077760497667, + "grad_norm": 5.147276897332631e-05, + "learning_rate": 6.466113416320886e-07, + "logits/chosen": -0.972149670124054, + "logits/rejected": 3.9147684574127197, + "logps/chosen": -478.2498779296875, + "logps/rejected": -1068.1552734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.849447250366211, + "rewards/margins": 34.413818359375, + "rewards/rejected": -43.263267517089844, + "step": 4260 + }, + { + "epoch": 2.6506998444790044, + "grad_norm": 32.1840934753418, + "learning_rate": 6.45458736745044e-07, + "logits/chosen": 0.1018313467502594, + "logits/rejected": 1.8838446140289307, + "logps/chosen": -594.6072998046875, + "logps/rejected": -981.5043334960938, + "loss": 0.2721, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.896601676940918, + "rewards/margins": 23.032018661499023, + "rewards/rejected": -31.928619384765625, + "step": 4261 + }, + { + "epoch": 2.651321928460342, + "grad_norm": 0.035230863839387894, + "learning_rate": 6.443061318579991e-07, + "logits/chosen": -1.6414093971252441, + "logits/rejected": 2.646522283554077, + "logps/chosen": -417.251220703125, + "logps/rejected": -1044.8880615234375, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.817538261413574, + "rewards/margins": 37.852577209472656, + "rewards/rejected": -42.67011642456055, + "step": 4262 + }, + { + "epoch": 2.6519440124416795, + "grad_norm": 1.312959341248643e-07, + "learning_rate": 6.431535269709543e-07, + "logits/chosen": -0.18359392881393433, + "logits/rejected": 3.251091718673706, + "logps/chosen": -436.75213623046875, + "logps/rejected": -925.1771240234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.020618438720703, + "rewards/margins": 33.024742126464844, + "rewards/rejected": -39.04536437988281, + "step": 4263 + }, + { + "epoch": 2.6525660964230173, + "grad_norm": 0.014321111142635345, + "learning_rate": 6.420009220839096e-07, + "logits/chosen": -2.6356351375579834, + "logits/rejected": 2.7543556690216064, + "logps/chosen": -335.8462829589844, + "logps/rejected": -937.9099731445312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.071978569030762, + "rewards/margins": 26.716182708740234, + "rewards/rejected": -32.78816223144531, + "step": 4264 + }, + { + "epoch": 2.6531881804043547, + "grad_norm": 0.016877740621566772, + "learning_rate": 6.408483171968649e-07, + "logits/chosen": 1.5884143114089966, + "logits/rejected": 2.4657225608825684, + "logps/chosen": -485.8375549316406, + "logps/rejected": -776.3801879882812, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.200467109680176, + "rewards/margins": 20.69041633605957, + "rewards/rejected": -29.890884399414062, + "step": 4265 + }, + { + "epoch": 2.653810264385692, + "grad_norm": 0.009289245121181011, + "learning_rate": 6.396957123098202e-07, + "logits/chosen": -1.7212367057800293, + "logits/rejected": 0.07681708037853241, + "logps/chosen": -541.7117309570312, + "logps/rejected": -913.375732421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.911618232727051, + "rewards/margins": 26.93120574951172, + "rewards/rejected": -34.84282302856445, + "step": 4266 + }, + { + "epoch": 2.6544323483670293, + "grad_norm": 1.5398443053982191e-07, + "learning_rate": 6.385431074227755e-07, + "logits/chosen": 2.1448214054107666, + "logits/rejected": 4.380794048309326, + "logps/chosen": -633.0781860351562, + "logps/rejected": -1132.920654296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.251069068908691, + "rewards/margins": 30.245288848876953, + "rewards/rejected": -40.49635696411133, + "step": 4267 + }, + { + "epoch": 2.655054432348367, + "grad_norm": 0.00034798384876921773, + "learning_rate": 6.373905025357307e-07, + "logits/chosen": -0.5650123357772827, + "logits/rejected": 3.188966989517212, + "logps/chosen": -424.6273193359375, + "logps/rejected": -955.9008178710938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.289674758911133, + "rewards/margins": 34.93653869628906, + "rewards/rejected": -40.22621154785156, + "step": 4268 + }, + { + "epoch": 2.6556765163297045, + "grad_norm": 3.408320903778076, + "learning_rate": 6.362378976486861e-07, + "logits/chosen": 0.25153806805610657, + "logits/rejected": 2.325662612915039, + "logps/chosen": -574.9864501953125, + "logps/rejected": -909.0303955078125, + "loss": 0.0124, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.206477165222168, + "rewards/margins": 25.306110382080078, + "rewards/rejected": -31.51258659362793, + "step": 4269 + }, + { + "epoch": 2.6562986003110423, + "grad_norm": 0.0008155781542882323, + "learning_rate": 6.350852927616413e-07, + "logits/chosen": 1.7965188026428223, + "logits/rejected": 3.6532082557678223, + "logps/chosen": -685.572509765625, + "logps/rejected": -1040.648193359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.555252075195312, + "rewards/margins": 24.258285522460938, + "rewards/rejected": -33.813533782958984, + "step": 4270 + }, + { + "epoch": 2.6569206842923796, + "grad_norm": 0.0809859037399292, + "learning_rate": 6.339326878745966e-07, + "logits/chosen": 2.1448750495910645, + "logits/rejected": 3.0449576377868652, + "logps/chosen": -635.3004150390625, + "logps/rejected": -1008.4331665039062, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.257442474365234, + "rewards/margins": 29.437252044677734, + "rewards/rejected": -37.6946907043457, + "step": 4271 + }, + { + "epoch": 2.657542768273717, + "grad_norm": 0.06920414417982101, + "learning_rate": 6.327800829875519e-07, + "logits/chosen": 0.02657720446586609, + "logits/rejected": 0.1941850781440735, + "logps/chosen": -641.6112060546875, + "logps/rejected": -884.4364624023438, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.948108673095703, + "rewards/margins": 23.97146224975586, + "rewards/rejected": -35.91957092285156, + "step": 4272 + }, + { + "epoch": 2.6581648522550543, + "grad_norm": 1.436835527420044, + "learning_rate": 6.316274781005072e-07, + "logits/chosen": 0.5097701549530029, + "logits/rejected": 2.3904218673706055, + "logps/chosen": -774.1339111328125, + "logps/rejected": -1126.98681640625, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.555923461914062, + "rewards/margins": 26.73133087158203, + "rewards/rejected": -42.287254333496094, + "step": 4273 + }, + { + "epoch": 2.658786936236392, + "grad_norm": 0.03071177937090397, + "learning_rate": 6.304748732134624e-07, + "logits/chosen": 1.7224812507629395, + "logits/rejected": 5.085427284240723, + "logps/chosen": -598.9939575195312, + "logps/rejected": -1024.482177734375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.686386108398438, + "rewards/margins": 31.341815948486328, + "rewards/rejected": -40.028202056884766, + "step": 4274 + }, + { + "epoch": 2.6594090202177294, + "grad_norm": 4.7121588977461215e-06, + "learning_rate": 6.293222683264177e-07, + "logits/chosen": -0.3162381649017334, + "logits/rejected": 2.9769697189331055, + "logps/chosen": -521.7335815429688, + "logps/rejected": -1134.259521484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.706584930419922, + "rewards/margins": 37.74868392944336, + "rewards/rejected": -46.45526885986328, + "step": 4275 + }, + { + "epoch": 2.6600311041990667, + "grad_norm": 2.563671588897705, + "learning_rate": 6.28169663439373e-07, + "logits/chosen": 1.0807900428771973, + "logits/rejected": 2.9324049949645996, + "logps/chosen": -572.3258056640625, + "logps/rejected": -898.4095458984375, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.48967170715332, + "rewards/margins": 23.60786247253418, + "rewards/rejected": -34.0975341796875, + "step": 4276 + }, + { + "epoch": 2.6606531881804045, + "grad_norm": 0.10538554191589355, + "learning_rate": 6.270170585523283e-07, + "logits/chosen": 2.7478926181793213, + "logits/rejected": 4.047188758850098, + "logps/chosen": -658.8651733398438, + "logps/rejected": -1066.6217041015625, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.698392868041992, + "rewards/margins": 29.148723602294922, + "rewards/rejected": -36.84711456298828, + "step": 4277 + }, + { + "epoch": 2.661275272161742, + "grad_norm": 1.1563502550125122, + "learning_rate": 6.258644536652836e-07, + "logits/chosen": -0.7198779582977295, + "logits/rejected": 1.4708242416381836, + "logps/chosen": -403.05499267578125, + "logps/rejected": -645.0602416992188, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.093057155609131, + "rewards/margins": 15.91860580444336, + "rewards/rejected": -22.01166343688965, + "step": 4278 + }, + { + "epoch": 2.661897356143079, + "grad_norm": 0.0004465764795895666, + "learning_rate": 6.247118487782389e-07, + "logits/chosen": -1.3386738300323486, + "logits/rejected": 3.840517520904541, + "logps/chosen": -447.1226501464844, + "logps/rejected": -1076.5313720703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7728443145751953, + "rewards/margins": 30.563493728637695, + "rewards/rejected": -34.33633804321289, + "step": 4279 + }, + { + "epoch": 2.6625194401244165, + "grad_norm": 0.08408825099468231, + "learning_rate": 6.235592438911942e-07, + "logits/chosen": 0.062289535999298096, + "logits/rejected": 1.252807855606079, + "logps/chosen": -457.46234130859375, + "logps/rejected": -842.30517578125, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.778433322906494, + "rewards/margins": 23.938804626464844, + "rewards/rejected": -31.71723747253418, + "step": 4280 + }, + { + "epoch": 2.6631415241057543, + "grad_norm": 5.8949448430212215e-05, + "learning_rate": 6.224066390041494e-07, + "logits/chosen": -1.4612160921096802, + "logits/rejected": 3.185497283935547, + "logps/chosen": -463.0811462402344, + "logps/rejected": -1052.7869873046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.640124797821045, + "rewards/margins": 29.35283660888672, + "rewards/rejected": -35.992958068847656, + "step": 4281 + }, + { + "epoch": 2.6637636080870917, + "grad_norm": 0.01346707995980978, + "learning_rate": 6.212540341171047e-07, + "logits/chosen": 1.049379825592041, + "logits/rejected": 1.272294282913208, + "logps/chosen": -743.18115234375, + "logps/rejected": -1055.4158935546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.955890655517578, + "rewards/margins": 30.826642990112305, + "rewards/rejected": -41.782535552978516, + "step": 4282 + }, + { + "epoch": 2.6643856920684295, + "grad_norm": 14.5199556350708, + "learning_rate": 6.2010142923006e-07, + "logits/chosen": -0.46947622299194336, + "logits/rejected": 3.629220962524414, + "logps/chosen": -535.6415405273438, + "logps/rejected": -1027.4896240234375, + "loss": 0.0661, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.940256118774414, + "rewards/margins": 23.86663818359375, + "rewards/rejected": -34.80689239501953, + "step": 4283 + }, + { + "epoch": 2.665007776049767, + "grad_norm": 0.0023726599756628275, + "learning_rate": 6.189488243430153e-07, + "logits/chosen": 0.3610234558582306, + "logits/rejected": 2.9661874771118164, + "logps/chosen": -427.2809753417969, + "logps/rejected": -845.6876220703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.674696922302246, + "rewards/margins": 25.183311462402344, + "rewards/rejected": -32.858009338378906, + "step": 4284 + }, + { + "epoch": 2.665629860031104, + "grad_norm": 40.15415573120117, + "learning_rate": 6.177962194559705e-07, + "logits/chosen": -0.06265552341938019, + "logits/rejected": 2.3903772830963135, + "logps/chosen": -485.881591796875, + "logps/rejected": -884.794921875, + "loss": 1.5431, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.829121589660645, + "rewards/margins": 25.358768463134766, + "rewards/rejected": -35.18789291381836, + "step": 4285 + }, + { + "epoch": 2.6662519440124415, + "grad_norm": 0.6707028746604919, + "learning_rate": 6.166436145689259e-07, + "logits/chosen": 0.2432861328125, + "logits/rejected": 2.304138660430908, + "logps/chosen": -518.6677856445312, + "logps/rejected": -877.533447265625, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.821560859680176, + "rewards/margins": 27.72966766357422, + "rewards/rejected": -39.55122756958008, + "step": 4286 + }, + { + "epoch": 2.6668740279937793, + "grad_norm": 2.2541589714819565e-05, + "learning_rate": 6.154910096818812e-07, + "logits/chosen": -0.033841878175735474, + "logits/rejected": 2.4506354331970215, + "logps/chosen": -722.0032958984375, + "logps/rejected": -1138.4862060546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.616849899291992, + "rewards/margins": 28.83795738220215, + "rewards/rejected": -40.45480728149414, + "step": 4287 + }, + { + "epoch": 2.6674961119751166, + "grad_norm": 0.0012312207836657763, + "learning_rate": 6.143384047948363e-07, + "logits/chosen": -2.3437626361846924, + "logits/rejected": 2.3316233158111572, + "logps/chosen": -242.3722381591797, + "logps/rejected": -801.2603759765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.0629377365112305, + "rewards/margins": 33.32283020019531, + "rewards/rejected": -38.385765075683594, + "step": 4288 + }, + { + "epoch": 2.6681181959564544, + "grad_norm": 0.00012092386896256357, + "learning_rate": 6.131857999077916e-07, + "logits/chosen": 0.732921302318573, + "logits/rejected": 3.0997118949890137, + "logps/chosen": -423.0313415527344, + "logps/rejected": -842.7108764648438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.711177825927734, + "rewards/margins": 28.025440216064453, + "rewards/rejected": -34.73662185668945, + "step": 4289 + }, + { + "epoch": 2.6687402799377917, + "grad_norm": 0.4224456548690796, + "learning_rate": 6.12033195020747e-07, + "logits/chosen": 0.03448355197906494, + "logits/rejected": 1.8089535236358643, + "logps/chosen": -472.80010986328125, + "logps/rejected": -842.5576171875, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.802806854248047, + "rewards/margins": 22.627397537231445, + "rewards/rejected": -31.43020248413086, + "step": 4290 + }, + { + "epoch": 2.669362363919129, + "grad_norm": 2.015106446151549e-07, + "learning_rate": 6.108805901337022e-07, + "logits/chosen": -2.042537212371826, + "logits/rejected": 2.324296474456787, + "logps/chosen": -361.269287109375, + "logps/rejected": -1081.171630859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.318136692047119, + "rewards/margins": 41.60655212402344, + "rewards/rejected": -48.92469024658203, + "step": 4291 + }, + { + "epoch": 2.6699844479004664, + "grad_norm": 0.09945330768823624, + "learning_rate": 6.097279852466575e-07, + "logits/chosen": -1.8823904991149902, + "logits/rejected": 3.3184216022491455, + "logps/chosen": -398.1663818359375, + "logps/rejected": -925.0068359375, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.197899341583252, + "rewards/margins": 24.55364990234375, + "rewards/rejected": -29.751550674438477, + "step": 4292 + }, + { + "epoch": 2.670606531881804, + "grad_norm": 0.008346091955900192, + "learning_rate": 6.085753803596127e-07, + "logits/chosen": 0.5325585603713989, + "logits/rejected": 1.5247480869293213, + "logps/chosen": -607.7169799804688, + "logps/rejected": -879.230224609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.277191162109375, + "rewards/margins": 23.130996704101562, + "rewards/rejected": -34.40818786621094, + "step": 4293 + }, + { + "epoch": 2.6712286158631415, + "grad_norm": 0.059563759714365005, + "learning_rate": 6.074227754725681e-07, + "logits/chosen": 1.1496225595474243, + "logits/rejected": 2.825745105743408, + "logps/chosen": -553.5374145507812, + "logps/rejected": -909.6398315429688, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.7813825607299805, + "rewards/margins": 24.857816696166992, + "rewards/rejected": -32.63920211791992, + "step": 4294 + }, + { + "epoch": 2.671850699844479, + "grad_norm": 0.03193175047636032, + "learning_rate": 6.062701705855233e-07, + "logits/chosen": 1.2888555526733398, + "logits/rejected": 1.7476494312286377, + "logps/chosen": -692.4351806640625, + "logps/rejected": -1036.4249267578125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.082293510437012, + "rewards/margins": 30.193002700805664, + "rewards/rejected": -45.27529525756836, + "step": 4295 + }, + { + "epoch": 2.6724727838258167, + "grad_norm": 6.026350456522778e-05, + "learning_rate": 6.051175656984786e-07, + "logits/chosen": -1.0907959938049316, + "logits/rejected": 1.9298566579818726, + "logps/chosen": -275.3390808105469, + "logps/rejected": -685.6051635742188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2236406803131104, + "rewards/margins": 23.281383514404297, + "rewards/rejected": -25.505023956298828, + "step": 4296 + }, + { + "epoch": 2.673094867807154, + "grad_norm": 28.11423683166504, + "learning_rate": 6.039649608114339e-07, + "logits/chosen": -2.3258821964263916, + "logits/rejected": 1.7522019147872925, + "logps/chosen": -397.2674255371094, + "logps/rejected": -995.8222045898438, + "loss": 0.5865, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.696331024169922, + "rewards/margins": 34.149314880371094, + "rewards/rejected": -41.845645904541016, + "step": 4297 + }, + { + "epoch": 2.6737169517884913, + "grad_norm": 0.022131238132715225, + "learning_rate": 6.028123559243892e-07, + "logits/chosen": 1.659833312034607, + "logits/rejected": 2.845536231994629, + "logps/chosen": -592.1507568359375, + "logps/rejected": -993.156982421875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.496284484863281, + "rewards/margins": 26.269489288330078, + "rewards/rejected": -36.76577377319336, + "step": 4298 + }, + { + "epoch": 2.6743390357698287, + "grad_norm": 0.003015428315848112, + "learning_rate": 6.016597510373444e-07, + "logits/chosen": 0.17328539490699768, + "logits/rejected": 3.8262343406677246, + "logps/chosen": -496.84283447265625, + "logps/rejected": -1015.587646484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.074370384216309, + "rewards/margins": 27.95060920715332, + "rewards/rejected": -38.02498245239258, + "step": 4299 + }, + { + "epoch": 2.6749611197511665, + "grad_norm": 0.28797683119773865, + "learning_rate": 6.005071461502997e-07, + "logits/chosen": 1.7421389818191528, + "logits/rejected": 3.9504408836364746, + "logps/chosen": -627.1280517578125, + "logps/rejected": -947.1497192382812, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.968725204467773, + "rewards/margins": 22.320602416992188, + "rewards/rejected": -34.289329528808594, + "step": 4300 + }, + { + "epoch": 2.675583203732504, + "grad_norm": 33.13662338256836, + "learning_rate": 5.993545412632551e-07, + "logits/chosen": 0.9550960659980774, + "logits/rejected": 3.220900297164917, + "logps/chosen": -402.09564208984375, + "logps/rejected": -741.6087036132812, + "loss": 0.3385, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.56776237487793, + "rewards/margins": 21.219226837158203, + "rewards/rejected": -29.7869873046875, + "step": 4301 + }, + { + "epoch": 2.6762052877138416, + "grad_norm": 0.04643818736076355, + "learning_rate": 5.982019363762103e-07, + "logits/chosen": -1.7970244884490967, + "logits/rejected": 2.1972224712371826, + "logps/chosen": -345.9138488769531, + "logps/rejected": -744.7276611328125, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.161243438720703, + "rewards/margins": 22.81188201904297, + "rewards/rejected": -29.973127365112305, + "step": 4302 + }, + { + "epoch": 2.676827371695179, + "grad_norm": 3.3912474606268006e-08, + "learning_rate": 5.970493314891656e-07, + "logits/chosen": 0.6632111072540283, + "logits/rejected": 4.397903919219971, + "logps/chosen": -500.2611999511719, + "logps/rejected": -1201.0867919921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.991384506225586, + "rewards/margins": 40.941043853759766, + "rewards/rejected": -48.932430267333984, + "step": 4303 + }, + { + "epoch": 2.6774494556765163, + "grad_norm": 3.7330451011657715, + "learning_rate": 5.958967266021209e-07, + "logits/chosen": 2.151575803756714, + "logits/rejected": 3.545137405395508, + "logps/chosen": -677.5784912109375, + "logps/rejected": -964.5394287109375, + "loss": 0.0119, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.74315071105957, + "rewards/margins": 20.06719970703125, + "rewards/rejected": -33.81035232543945, + "step": 4304 + }, + { + "epoch": 2.6780715396578536, + "grad_norm": 0.014254480600357056, + "learning_rate": 5.947441217150761e-07, + "logits/chosen": 1.86721670627594, + "logits/rejected": 3.160707950592041, + "logps/chosen": -618.7957153320312, + "logps/rejected": -1003.787109375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.814743041992188, + "rewards/margins": 26.127132415771484, + "rewards/rejected": -37.94187927246094, + "step": 4305 + }, + { + "epoch": 2.6786936236391914, + "grad_norm": 0.6654040813446045, + "learning_rate": 5.935915168280314e-07, + "logits/chosen": -1.4605464935302734, + "logits/rejected": 2.656975507736206, + "logps/chosen": -592.65185546875, + "logps/rejected": -1145.5179443359375, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.840351104736328, + "rewards/margins": 33.5650749206543, + "rewards/rejected": -44.405426025390625, + "step": 4306 + }, + { + "epoch": 2.6793157076205287, + "grad_norm": 4.114058017730713, + "learning_rate": 5.924389119409866e-07, + "logits/chosen": -1.903027057647705, + "logits/rejected": 2.2601137161254883, + "logps/chosen": -370.8560791015625, + "logps/rejected": -853.153076171875, + "loss": 0.0159, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.281823635101318, + "rewards/margins": 29.14508819580078, + "rewards/rejected": -36.426910400390625, + "step": 4307 + }, + { + "epoch": 2.6799377916018665, + "grad_norm": 0.0327109694480896, + "learning_rate": 5.91286307053942e-07, + "logits/chosen": -0.14002388715744019, + "logits/rejected": 3.2458126544952393, + "logps/chosen": -501.65673828125, + "logps/rejected": -1063.7379150390625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.754554748535156, + "rewards/margins": 33.78563690185547, + "rewards/rejected": -44.540191650390625, + "step": 4308 + }, + { + "epoch": 2.680559875583204, + "grad_norm": 0.024720264598727226, + "learning_rate": 5.901337021668972e-07, + "logits/chosen": 1.7462257146835327, + "logits/rejected": 4.350376605987549, + "logps/chosen": -506.29852294921875, + "logps/rejected": -969.9747924804688, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.304910659790039, + "rewards/margins": 30.68954849243164, + "rewards/rejected": -37.99446105957031, + "step": 4309 + }, + { + "epoch": 2.681181959564541, + "grad_norm": 2.5972089767456055, + "learning_rate": 5.889810972798525e-07, + "logits/chosen": 1.9655194282531738, + "logits/rejected": 3.740055561065674, + "logps/chosen": -506.2864074707031, + "logps/rejected": -855.5678100585938, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.148515701293945, + "rewards/margins": 22.7083683013916, + "rewards/rejected": -31.856887817382812, + "step": 4310 + }, + { + "epoch": 2.6818040435458785, + "grad_norm": 0.045466069132089615, + "learning_rate": 5.878284923928077e-07, + "logits/chosen": 0.3978482484817505, + "logits/rejected": 4.564357280731201, + "logps/chosen": -550.11181640625, + "logps/rejected": -1136.945068359375, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.231959342956543, + "rewards/margins": 34.99109649658203, + "rewards/rejected": -43.223052978515625, + "step": 4311 + }, + { + "epoch": 2.6824261275272163, + "grad_norm": 1.9469133860638976e-07, + "learning_rate": 5.866758875057631e-07, + "logits/chosen": -0.7411868572235107, + "logits/rejected": 3.781776189804077, + "logps/chosen": -480.9675598144531, + "logps/rejected": -1085.2298583984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.085660934448242, + "rewards/margins": 36.49788284301758, + "rewards/rejected": -42.58354949951172, + "step": 4312 + }, + { + "epoch": 2.6830482115085537, + "grad_norm": 0.014485174790024757, + "learning_rate": 5.855232826187184e-07, + "logits/chosen": 1.9372586011886597, + "logits/rejected": 2.3054134845733643, + "logps/chosen": -616.2396240234375, + "logps/rejected": -815.8970947265625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.824966430664062, + "rewards/margins": 19.153440475463867, + "rewards/rejected": -27.97840690612793, + "step": 4313 + }, + { + "epoch": 2.683670295489891, + "grad_norm": 0.05898513272404671, + "learning_rate": 5.843706777316736e-07, + "logits/chosen": 1.3665399551391602, + "logits/rejected": 3.381399154663086, + "logps/chosen": -555.00341796875, + "logps/rejected": -955.5720825195312, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.5432586669921875, + "rewards/margins": 29.97431755065918, + "rewards/rejected": -35.517578125, + "step": 4314 + }, + { + "epoch": 2.684292379471229, + "grad_norm": 1.3017769560974557e-05, + "learning_rate": 5.83218072844629e-07, + "logits/chosen": -2.1644983291625977, + "logits/rejected": 1.2285041809082031, + "logps/chosen": -467.1058654785156, + "logps/rejected": -1055.35400390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.010286331176758, + "rewards/margins": 35.792327880859375, + "rewards/rejected": -45.8026123046875, + "step": 4315 + }, + { + "epoch": 2.684914463452566, + "grad_norm": 3.057724952697754, + "learning_rate": 5.820654679575842e-07, + "logits/chosen": -0.4935888648033142, + "logits/rejected": 2.8731842041015625, + "logps/chosen": -602.6840209960938, + "logps/rejected": -1008.2568359375, + "loss": 0.0086, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.988034248352051, + "rewards/margins": 27.111400604248047, + "rewards/rejected": -34.09943771362305, + "step": 4316 + }, + { + "epoch": 2.6855365474339035, + "grad_norm": 0.0020969114266335964, + "learning_rate": 5.809128630705395e-07, + "logits/chosen": -1.1788969039916992, + "logits/rejected": 1.914081335067749, + "logps/chosen": -526.4003295898438, + "logps/rejected": -928.8703002929688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.427753925323486, + "rewards/margins": 26.911319732666016, + "rewards/rejected": -34.33907699584961, + "step": 4317 + }, + { + "epoch": 2.686158631415241, + "grad_norm": 0.1595601886510849, + "learning_rate": 5.797602581834947e-07, + "logits/chosen": -0.34268662333488464, + "logits/rejected": 3.1043288707733154, + "logps/chosen": -554.5541381835938, + "logps/rejected": -1099.8505859375, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.969440937042236, + "rewards/margins": 32.59528350830078, + "rewards/rejected": -40.564727783203125, + "step": 4318 + }, + { + "epoch": 2.6867807153965786, + "grad_norm": 0.0005578529671765864, + "learning_rate": 5.786076532964501e-07, + "logits/chosen": -1.6664105653762817, + "logits/rejected": 1.3208072185516357, + "logps/chosen": -356.27099609375, + "logps/rejected": -855.5338134765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.887505054473877, + "rewards/margins": 26.851097106933594, + "rewards/rejected": -32.73860168457031, + "step": 4319 + }, + { + "epoch": 2.687402799377916, + "grad_norm": 0.017177654430270195, + "learning_rate": 5.774550484094053e-07, + "logits/chosen": 2.851513624191284, + "logits/rejected": 3.758535861968994, + "logps/chosen": -751.7113037109375, + "logps/rejected": -1025.2801513671875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.320971488952637, + "rewards/margins": 24.983421325683594, + "rewards/rejected": -35.30438995361328, + "step": 4320 + }, + { + "epoch": 2.6880248833592537, + "grad_norm": 1.4212414026260376, + "learning_rate": 5.763024435223606e-07, + "logits/chosen": -1.5535696744918823, + "logits/rejected": 2.343139886856079, + "logps/chosen": -427.4454345703125, + "logps/rejected": -885.6641845703125, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.263914108276367, + "rewards/margins": 28.99860191345215, + "rewards/rejected": -40.26251220703125, + "step": 4321 + }, + { + "epoch": 2.688646967340591, + "grad_norm": 2.3622999378858367e-07, + "learning_rate": 5.751498386353159e-07, + "logits/chosen": -1.9685401916503906, + "logits/rejected": 4.371389389038086, + "logps/chosen": -284.93084716796875, + "logps/rejected": -861.482666015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5671565532684326, + "rewards/margins": 29.86787223815918, + "rewards/rejected": -32.435028076171875, + "step": 4322 + }, + { + "epoch": 2.6892690513219284, + "grad_norm": 0.2193666398525238, + "learning_rate": 5.739972337482711e-07, + "logits/chosen": 0.7725979685783386, + "logits/rejected": 2.998575448989868, + "logps/chosen": -511.34527587890625, + "logps/rejected": -871.75927734375, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.83578109741211, + "rewards/margins": 22.641033172607422, + "rewards/rejected": -32.47681427001953, + "step": 4323 + }, + { + "epoch": 2.6898911353032657, + "grad_norm": 0.0006468931096605957, + "learning_rate": 5.728446288612264e-07, + "logits/chosen": -0.7275848388671875, + "logits/rejected": 0.5731906890869141, + "logps/chosen": -552.6305541992188, + "logps/rejected": -949.397705078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.355508804321289, + "rewards/margins": 27.570133209228516, + "rewards/rejected": -36.92564392089844, + "step": 4324 + }, + { + "epoch": 2.6905132192846035, + "grad_norm": 0.6407142281532288, + "learning_rate": 5.716920239741816e-07, + "logits/chosen": -2.447523355484009, + "logits/rejected": 0.9723237156867981, + "logps/chosen": -477.42950439453125, + "logps/rejected": -824.8375854492188, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.186221599578857, + "rewards/margins": 21.85727310180664, + "rewards/rejected": -26.04349136352539, + "step": 4325 + }, + { + "epoch": 2.691135303265941, + "grad_norm": 2.10383404919412e-06, + "learning_rate": 5.70539419087137e-07, + "logits/chosen": -1.00802743434906, + "logits/rejected": 3.259660243988037, + "logps/chosen": -507.43731689453125, + "logps/rejected": -1148.611083984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.593570709228516, + "rewards/margins": 36.32147216796875, + "rewards/rejected": -44.91504669189453, + "step": 4326 + }, + { + "epoch": 2.6917573872472786, + "grad_norm": 0.01024108286947012, + "learning_rate": 5.693868142000923e-07, + "logits/chosen": -0.2035624384880066, + "logits/rejected": 3.430694103240967, + "logps/chosen": -612.5335693359375, + "logps/rejected": -1008.0703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.98814868927002, + "rewards/margins": 26.49085235595703, + "rewards/rejected": -35.479000091552734, + "step": 4327 + }, + { + "epoch": 2.692379471228616, + "grad_norm": 0.0010729931527748704, + "learning_rate": 5.682342093130475e-07, + "logits/chosen": 1.0878336429595947, + "logits/rejected": 2.785027027130127, + "logps/chosen": -598.495849609375, + "logps/rejected": -1020.3568725585938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.128654479980469, + "rewards/margins": 30.003232955932617, + "rewards/rejected": -38.13188934326172, + "step": 4328 + }, + { + "epoch": 2.6930015552099533, + "grad_norm": 4.427639484405518, + "learning_rate": 5.670816044260028e-07, + "logits/chosen": 2.2699520587921143, + "logits/rejected": 3.813429832458496, + "logps/chosen": -613.0655517578125, + "logps/rejected": -893.4889526367188, + "loss": 0.0134, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.60464096069336, + "rewards/margins": 25.513607025146484, + "rewards/rejected": -36.11824417114258, + "step": 4329 + }, + { + "epoch": 2.6936236391912907, + "grad_norm": 0.0002954995143227279, + "learning_rate": 5.659289995389581e-07, + "logits/chosen": -1.6988110542297363, + "logits/rejected": 3.1969075202941895, + "logps/chosen": -366.23980712890625, + "logps/rejected": -999.2174682617188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.072360038757324, + "rewards/margins": 27.513774871826172, + "rewards/rejected": -34.58613586425781, + "step": 4330 + }, + { + "epoch": 2.6942457231726284, + "grad_norm": 1.4735642671585083, + "learning_rate": 5.647763946519134e-07, + "logits/chosen": 0.06061077117919922, + "logits/rejected": 3.5560169219970703, + "logps/chosen": -548.827392578125, + "logps/rejected": -1081.3961181640625, + "loss": 0.0072, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.822676658630371, + "rewards/margins": 34.31581497192383, + "rewards/rejected": -43.13848876953125, + "step": 4331 + }, + { + "epoch": 2.694867807153966, + "grad_norm": 0.0016329983482137322, + "learning_rate": 5.636237897648686e-07, + "logits/chosen": 1.7709248065948486, + "logits/rejected": 2.6699976921081543, + "logps/chosen": -579.9384765625, + "logps/rejected": -892.5574340820312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.768032073974609, + "rewards/margins": 26.118501663208008, + "rewards/rejected": -33.886531829833984, + "step": 4332 + }, + { + "epoch": 2.695489891135303, + "grad_norm": 0.009099734015762806, + "learning_rate": 5.62471184877824e-07, + "logits/chosen": 1.7401294708251953, + "logits/rejected": 3.335153102874756, + "logps/chosen": -595.302490234375, + "logps/rejected": -866.0000610351562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.306253433227539, + "rewards/margins": 18.30873680114746, + "rewards/rejected": -28.614992141723633, + "step": 4333 + }, + { + "epoch": 2.696111975116641, + "grad_norm": 31.583667755126953, + "learning_rate": 5.613185799907792e-07, + "logits/chosen": -0.46423840522766113, + "logits/rejected": 3.5936923027038574, + "logps/chosen": -613.9580078125, + "logps/rejected": -1366.32666015625, + "loss": 0.3121, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.536033630371094, + "rewards/margins": 39.808067321777344, + "rewards/rejected": -51.34410095214844, + "step": 4334 + }, + { + "epoch": 2.6967340590979783, + "grad_norm": 0.0018100934103131294, + "learning_rate": 5.601659751037345e-07, + "logits/chosen": -0.2569233775138855, + "logits/rejected": 2.3816943168640137, + "logps/chosen": -449.564453125, + "logps/rejected": -746.4315185546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.061365127563477, + "rewards/margins": 23.170801162719727, + "rewards/rejected": -29.23216438293457, + "step": 4335 + }, + { + "epoch": 2.6973561430793156, + "grad_norm": 9.191144840769994e-07, + "learning_rate": 5.590133702166897e-07, + "logits/chosen": 2.2798638343811035, + "logits/rejected": 4.180978775024414, + "logps/chosen": -511.2894287109375, + "logps/rejected": -893.4908447265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.659784317016602, + "rewards/margins": 29.60740089416504, + "rewards/rejected": -40.26718521118164, + "step": 4336 + }, + { + "epoch": 2.697978227060653, + "grad_norm": 8.901963610696839e-09, + "learning_rate": 5.578607653296451e-07, + "logits/chosen": -1.8893110752105713, + "logits/rejected": 4.048734664916992, + "logps/chosen": -402.8697509765625, + "logps/rejected": -1160.6318359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.05605697631836, + "rewards/margins": 40.020328521728516, + "rewards/rejected": -51.076385498046875, + "step": 4337 + }, + { + "epoch": 2.6986003110419907, + "grad_norm": 17.029449462890625, + "learning_rate": 5.567081604426004e-07, + "logits/chosen": 1.0266895294189453, + "logits/rejected": 2.2230403423309326, + "logps/chosen": -618.4317016601562, + "logps/rejected": -924.9429931640625, + "loss": 0.1044, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.199832916259766, + "rewards/margins": 21.95380210876465, + "rewards/rejected": -37.15363693237305, + "step": 4338 + }, + { + "epoch": 2.699222395023328, + "grad_norm": 0.0002132367080776021, + "learning_rate": 5.555555555555555e-07, + "logits/chosen": -0.0829852819442749, + "logits/rejected": 3.98368501663208, + "logps/chosen": -583.0263061523438, + "logps/rejected": -1198.749755859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.993731498718262, + "rewards/margins": 35.4033088684082, + "rewards/rejected": -46.39704132080078, + "step": 4339 + }, + { + "epoch": 2.699844479004666, + "grad_norm": 0.432494580745697, + "learning_rate": 5.544029506685108e-07, + "logits/chosen": 1.575255036354065, + "logits/rejected": 2.5109803676605225, + "logps/chosen": -545.0003662109375, + "logps/rejected": -840.7965698242188, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.728708267211914, + "rewards/margins": 22.338848114013672, + "rewards/rejected": -29.06755828857422, + "step": 4340 + }, + { + "epoch": 2.700466562986003, + "grad_norm": 0.13241006433963776, + "learning_rate": 5.532503457814662e-07, + "logits/chosen": -0.6970356106758118, + "logits/rejected": 0.8758705854415894, + "logps/chosen": -580.6110229492188, + "logps/rejected": -915.1602783203125, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.863323211669922, + "rewards/margins": 21.47724151611328, + "rewards/rejected": -32.3405647277832, + "step": 4341 + }, + { + "epoch": 2.7010886469673405, + "grad_norm": 0.00010155046038562432, + "learning_rate": 5.520977408944214e-07, + "logits/chosen": -1.4229021072387695, + "logits/rejected": 1.0566169023513794, + "logps/chosen": -456.4710388183594, + "logps/rejected": -824.149169921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.78543758392334, + "rewards/margins": 26.162662506103516, + "rewards/rejected": -31.948097229003906, + "step": 4342 + }, + { + "epoch": 2.701710730948678, + "grad_norm": 3.846147792613275e-12, + "learning_rate": 5.509451360073767e-07, + "logits/chosen": 0.1892796754837036, + "logits/rejected": 3.4125478267669678, + "logps/chosen": -550.337158203125, + "logps/rejected": -1173.93212890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.669839859008789, + "rewards/margins": 40.65834045410156, + "rewards/rejected": -49.328182220458984, + "step": 4343 + }, + { + "epoch": 2.7023328149300156, + "grad_norm": 0.17231476306915283, + "learning_rate": 5.49792531120332e-07, + "logits/chosen": 2.59763240814209, + "logits/rejected": 3.944274425506592, + "logps/chosen": -720.7247924804688, + "logps/rejected": -1106.485595703125, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.691411972045898, + "rewards/margins": 26.218000411987305, + "rewards/rejected": -34.9094123840332, + "step": 4344 + }, + { + "epoch": 2.702954898911353, + "grad_norm": 7.438750617438927e-05, + "learning_rate": 5.486399262332873e-07, + "logits/chosen": -0.7106736302375793, + "logits/rejected": 3.2730722427368164, + "logps/chosen": -424.83203125, + "logps/rejected": -997.3363037109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.957796096801758, + "rewards/margins": 32.552249908447266, + "rewards/rejected": -41.510047912597656, + "step": 4345 + }, + { + "epoch": 2.7035769828926908, + "grad_norm": 8.103870641207322e-07, + "learning_rate": 5.474873213462425e-07, + "logits/chosen": 0.1780504584312439, + "logits/rejected": 2.3171892166137695, + "logps/chosen": -485.7713317871094, + "logps/rejected": -894.5025634765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.043638229370117, + "rewards/margins": 26.9890193939209, + "rewards/rejected": -34.032657623291016, + "step": 4346 + }, + { + "epoch": 2.704199066874028, + "grad_norm": 4.82890427520033e-05, + "learning_rate": 5.463347164591978e-07, + "logits/chosen": -0.6762113571166992, + "logits/rejected": 3.4721696376800537, + "logps/chosen": -483.78619384765625, + "logps/rejected": -1048.41748046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.2759599685668945, + "rewards/margins": 34.23728942871094, + "rewards/rejected": -39.51325225830078, + "step": 4347 + }, + { + "epoch": 2.7048211508553655, + "grad_norm": 0.0010803096229210496, + "learning_rate": 5.451821115721531e-07, + "logits/chosen": -0.5647343397140503, + "logits/rejected": 3.118108034133911, + "logps/chosen": -507.31219482421875, + "logps/rejected": -1145.51220703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.005629539489746, + "rewards/margins": 35.81145477294922, + "rewards/rejected": -41.81708526611328, + "step": 4348 + }, + { + "epoch": 2.705443234836703, + "grad_norm": 0.03288925066590309, + "learning_rate": 5.440295066851084e-07, + "logits/chosen": -0.9186725616455078, + "logits/rejected": 1.690115213394165, + "logps/chosen": -437.88800048828125, + "logps/rejected": -734.5858154296875, + "loss": 0.0866, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.427893161773682, + "rewards/margins": 18.78788185119629, + "rewards/rejected": -24.215774536132812, + "step": 4349 + }, + { + "epoch": 2.7060653188180406, + "grad_norm": 1.1404665201553144e-05, + "learning_rate": 5.428769017980637e-07, + "logits/chosen": -1.5076873302459717, + "logits/rejected": 3.565868377685547, + "logps/chosen": -260.5267333984375, + "logps/rejected": -897.2808227539062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.2050933837890625, + "rewards/margins": 32.63178253173828, + "rewards/rejected": -36.836875915527344, + "step": 4350 + }, + { + "epoch": 2.706687402799378, + "grad_norm": 5.416849489847664e-06, + "learning_rate": 5.41724296911019e-07, + "logits/chosen": -1.6162619590759277, + "logits/rejected": 3.268805742263794, + "logps/chosen": -361.4254455566406, + "logps/rejected": -998.807861328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.2600626945495605, + "rewards/margins": 31.510923385620117, + "rewards/rejected": -37.7709846496582, + "step": 4351 + }, + { + "epoch": 2.7073094867807153, + "grad_norm": 0.08973027765750885, + "learning_rate": 5.405716920239743e-07, + "logits/chosen": -0.4660053551197052, + "logits/rejected": 1.4866576194763184, + "logps/chosen": -479.0644836425781, + "logps/rejected": -909.916748046875, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.39615249633789, + "rewards/margins": 30.067825317382812, + "rewards/rejected": -38.4639778137207, + "step": 4352 + }, + { + "epoch": 2.707931570762053, + "grad_norm": 8.65224046719959e-06, + "learning_rate": 5.394190871369295e-07, + "logits/chosen": -0.05632424354553223, + "logits/rejected": 3.01267671585083, + "logps/chosen": -523.555419921875, + "logps/rejected": -921.5439453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.667513847351074, + "rewards/margins": 25.65847396850586, + "rewards/rejected": -37.32598876953125, + "step": 4353 + }, + { + "epoch": 2.7085536547433904, + "grad_norm": 31.374597549438477, + "learning_rate": 5.382664822498848e-07, + "logits/chosen": 1.0141353607177734, + "logits/rejected": 2.414851427078247, + "logps/chosen": -523.1561279296875, + "logps/rejected": -741.9181518554688, + "loss": 0.4007, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.2355732917785645, + "rewards/margins": 15.42635726928711, + "rewards/rejected": -21.661930084228516, + "step": 4354 + }, + { + "epoch": 2.7091757387247277, + "grad_norm": 0.00032730703242123127, + "learning_rate": 5.371138773628401e-07, + "logits/chosen": -1.0623137950897217, + "logits/rejected": 1.790582537651062, + "logps/chosen": -350.048828125, + "logps/rejected": -757.1088256835938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.168100357055664, + "rewards/margins": 23.775062561035156, + "rewards/rejected": -29.943164825439453, + "step": 4355 + }, + { + "epoch": 2.709797822706065, + "grad_norm": 5.184572219848633, + "learning_rate": 5.359612724757953e-07, + "logits/chosen": -0.6515331268310547, + "logits/rejected": 2.19075083732605, + "logps/chosen": -407.2192077636719, + "logps/rejected": -849.9705200195312, + "loss": 0.0331, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.066438674926758, + "rewards/margins": 25.576189041137695, + "rewards/rejected": -29.64262580871582, + "step": 4356 + }, + { + "epoch": 2.710419906687403, + "grad_norm": 30.339340209960938, + "learning_rate": 5.348086675887506e-07, + "logits/chosen": 0.5228167176246643, + "logits/rejected": 1.639932632446289, + "logps/chosen": -682.2943115234375, + "logps/rejected": -1072.2547607421875, + "loss": 0.1604, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.397087097167969, + "rewards/margins": 25.680978775024414, + "rewards/rejected": -37.07806396484375, + "step": 4357 + }, + { + "epoch": 2.71104199066874, + "grad_norm": 0.006391817703843117, + "learning_rate": 5.336560627017058e-07, + "logits/chosen": 1.4407583475112915, + "logits/rejected": 3.6678295135498047, + "logps/chosen": -530.218994140625, + "logps/rejected": -1002.1013793945312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.938558578491211, + "rewards/margins": 32.15758514404297, + "rewards/rejected": -41.09614562988281, + "step": 4358 + }, + { + "epoch": 2.711664074650078, + "grad_norm": 0.00031976267928257585, + "learning_rate": 5.325034578146612e-07, + "logits/chosen": 1.9172717332839966, + "logits/rejected": 3.0557916164398193, + "logps/chosen": -624.5967407226562, + "logps/rejected": -958.0452880859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.343128204345703, + "rewards/margins": 23.5101318359375, + "rewards/rejected": -31.85325813293457, + "step": 4359 + }, + { + "epoch": 2.7122861586314153, + "grad_norm": 0.2668883800506592, + "learning_rate": 5.313508529276164e-07, + "logits/chosen": -2.6089425086975098, + "logits/rejected": 3.2084131240844727, + "logps/chosen": -315.37908935546875, + "logps/rejected": -944.3118896484375, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.8562798500061035, + "rewards/margins": 26.441707611083984, + "rewards/rejected": -32.29798889160156, + "step": 4360 + }, + { + "epoch": 2.7129082426127527, + "grad_norm": 0.12905797362327576, + "learning_rate": 5.301982480405717e-07, + "logits/chosen": -3.010551929473877, + "logits/rejected": 1.19906485080719, + "logps/chosen": -329.3901672363281, + "logps/rejected": -893.433837890625, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.323816776275635, + "rewards/margins": 28.397506713867188, + "rewards/rejected": -35.7213249206543, + "step": 4361 + }, + { + "epoch": 2.71353032659409, + "grad_norm": 0.02742931619286537, + "learning_rate": 5.29045643153527e-07, + "logits/chosen": 0.43825221061706543, + "logits/rejected": 2.532029628753662, + "logps/chosen": -554.6838989257812, + "logps/rejected": -1005.2730102539062, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.257204532623291, + "rewards/margins": 28.405176162719727, + "rewards/rejected": -33.66238021850586, + "step": 4362 + }, + { + "epoch": 2.7141524105754278, + "grad_norm": 0.008566766045987606, + "learning_rate": 5.278930382664823e-07, + "logits/chosen": -1.9675729274749756, + "logits/rejected": 2.2211780548095703, + "logps/chosen": -344.1619567871094, + "logps/rejected": -868.6703491210938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.205824851989746, + "rewards/margins": 20.807655334472656, + "rewards/rejected": -29.013479232788086, + "step": 4363 + }, + { + "epoch": 2.714774494556765, + "grad_norm": 0.11387711763381958, + "learning_rate": 5.267404333794376e-07, + "logits/chosen": -0.49191659688949585, + "logits/rejected": 3.8870506286621094, + "logps/chosen": -493.625, + "logps/rejected": -1037.103759765625, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.6121931076049805, + "rewards/margins": 28.777477264404297, + "rewards/rejected": -35.389671325683594, + "step": 4364 + }, + { + "epoch": 2.715396578538103, + "grad_norm": 0.0002502195711713284, + "learning_rate": 5.255878284923928e-07, + "logits/chosen": 0.7257356643676758, + "logits/rejected": 3.592747688293457, + "logps/chosen": -431.878173828125, + "logps/rejected": -801.303466796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.508918762207031, + "rewards/margins": 22.942853927612305, + "rewards/rejected": -30.451770782470703, + "step": 4365 + }, + { + "epoch": 2.7160186625194402, + "grad_norm": 2.134223461151123, + "learning_rate": 5.244352236053482e-07, + "logits/chosen": 2.029142379760742, + "logits/rejected": 3.5649404525756836, + "logps/chosen": -642.5010986328125, + "logps/rejected": -990.53173828125, + "loss": 0.0074, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.469472885131836, + "rewards/margins": 29.194623947143555, + "rewards/rejected": -37.66409683227539, + "step": 4366 + }, + { + "epoch": 2.7166407465007776, + "grad_norm": 0.014666501432657242, + "learning_rate": 5.232826187183034e-07, + "logits/chosen": 0.12603336572647095, + "logits/rejected": 2.8921942710876465, + "logps/chosen": -487.640380859375, + "logps/rejected": -989.414306640625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.62231731414795, + "rewards/margins": 25.92264175415039, + "rewards/rejected": -38.544960021972656, + "step": 4367 + }, + { + "epoch": 2.717262830482115, + "grad_norm": 0.15587158501148224, + "learning_rate": 5.221300138312587e-07, + "logits/chosen": -0.2560267150402069, + "logits/rejected": 2.216273307800293, + "logps/chosen": -526.2840576171875, + "logps/rejected": -870.5951538085938, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.446675300598145, + "rewards/margins": 24.011863708496094, + "rewards/rejected": -32.45853805541992, + "step": 4368 + }, + { + "epoch": 2.7178849144634527, + "grad_norm": 7.080076102283783e-06, + "learning_rate": 5.20977408944214e-07, + "logits/chosen": -2.0199179649353027, + "logits/rejected": 2.431148052215576, + "logps/chosen": -395.56085205078125, + "logps/rejected": -969.3992919921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.538591384887695, + "rewards/margins": 36.18790054321289, + "rewards/rejected": -42.72649383544922, + "step": 4369 + }, + { + "epoch": 2.71850699844479, + "grad_norm": 2.202810492235585e-06, + "learning_rate": 5.198248040571693e-07, + "logits/chosen": -2.0969204902648926, + "logits/rejected": 1.8270785808563232, + "logps/chosen": -397.2028503417969, + "logps/rejected": -844.73876953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.526747703552246, + "rewards/margins": 28.255794525146484, + "rewards/rejected": -35.78254318237305, + "step": 4370 + }, + { + "epoch": 2.7191290824261274, + "grad_norm": 0.0031678418163210154, + "learning_rate": 5.186721991701245e-07, + "logits/chosen": 3.7845406532287598, + "logits/rejected": 3.3417134284973145, + "logps/chosen": -703.6448974609375, + "logps/rejected": -964.923828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.213335990905762, + "rewards/margins": 25.453189849853516, + "rewards/rejected": -38.666526794433594, + "step": 4371 + }, + { + "epoch": 2.719751166407465, + "grad_norm": 0.00013797474093735218, + "learning_rate": 5.175195942830797e-07, + "logits/chosen": 0.5534344911575317, + "logits/rejected": 3.253185272216797, + "logps/chosen": -467.7929992675781, + "logps/rejected": -796.52197265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.673547744750977, + "rewards/margins": 23.377058029174805, + "rewards/rejected": -29.05060577392578, + "step": 4372 + }, + { + "epoch": 2.7203732503888025, + "grad_norm": 0.17591939866542816, + "learning_rate": 5.163669893960351e-07, + "logits/chosen": -2.0380706787109375, + "logits/rejected": 3.314939022064209, + "logps/chosen": -471.1107177734375, + "logps/rejected": -1084.6961669921875, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.316633224487305, + "rewards/margins": 33.47694396972656, + "rewards/rejected": -42.7935791015625, + "step": 4373 + }, + { + "epoch": 2.72099533437014, + "grad_norm": 1.0730345820775256e-05, + "learning_rate": 5.152143845089903e-07, + "logits/chosen": -1.1209220886230469, + "logits/rejected": 1.6226786375045776, + "logps/chosen": -454.8038024902344, + "logps/rejected": -1002.0517578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.348759651184082, + "rewards/margins": 34.19036102294922, + "rewards/rejected": -42.539119720458984, + "step": 4374 + }, + { + "epoch": 2.721617418351477, + "grad_norm": 8.572400838602334e-05, + "learning_rate": 5.140617796219456e-07, + "logits/chosen": 1.7390587329864502, + "logits/rejected": 3.3332607746124268, + "logps/chosen": -618.982666015625, + "logps/rejected": -1069.13720703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.608708381652832, + "rewards/margins": 30.47606086730957, + "rewards/rejected": -43.08477020263672, + "step": 4375 + }, + { + "epoch": 2.722239502332815, + "grad_norm": 1.4397851089142932e-07, + "learning_rate": 5.12909174734901e-07, + "logits/chosen": -1.9789304733276367, + "logits/rejected": 2.7752182483673096, + "logps/chosen": -425.43988037109375, + "logps/rejected": -1153.5184326171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.725929260253906, + "rewards/margins": 41.87081527709961, + "rewards/rejected": -49.596744537353516, + "step": 4376 + }, + { + "epoch": 2.7228615863141523, + "grad_norm": 2.710033550101798e-05, + "learning_rate": 5.117565698478562e-07, + "logits/chosen": 0.715092658996582, + "logits/rejected": 1.869808316230774, + "logps/chosen": -517.2560424804688, + "logps/rejected": -950.8013916015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.218734264373779, + "rewards/margins": 30.94306182861328, + "rewards/rejected": -36.16179656982422, + "step": 4377 + }, + { + "epoch": 2.72348367029549, + "grad_norm": 0.2727895677089691, + "learning_rate": 5.106039649608115e-07, + "logits/chosen": 0.3004588186740875, + "logits/rejected": 4.007742404937744, + "logps/chosen": -517.6912231445312, + "logps/rejected": -988.560302734375, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.110770225524902, + "rewards/margins": 24.466075897216797, + "rewards/rejected": -32.57684326171875, + "step": 4378 + }, + { + "epoch": 2.7241057542768274, + "grad_norm": 5.683639301423682e-07, + "learning_rate": 5.094513600737667e-07, + "logits/chosen": -2.6395342350006104, + "logits/rejected": 1.850848913192749, + "logps/chosen": -472.1200256347656, + "logps/rejected": -1047.662109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.703500747680664, + "rewards/margins": 36.30821228027344, + "rewards/rejected": -46.011714935302734, + "step": 4379 + }, + { + "epoch": 2.724727838258165, + "grad_norm": 0.015439534559845924, + "learning_rate": 5.082987551867221e-07, + "logits/chosen": -1.267011046409607, + "logits/rejected": 3.612168550491333, + "logps/chosen": -418.2823486328125, + "logps/rejected": -964.836669921875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.654834747314453, + "rewards/margins": 27.616714477539062, + "rewards/rejected": -36.271549224853516, + "step": 4380 + }, + { + "epoch": 2.725349922239502, + "grad_norm": 0.00018610125698614866, + "learning_rate": 5.071461502996773e-07, + "logits/chosen": 0.0017415881156921387, + "logits/rejected": 2.2741801738739014, + "logps/chosen": -552.3558349609375, + "logps/rejected": -904.6097412109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.071257591247559, + "rewards/margins": 26.6653995513916, + "rewards/rejected": -35.736656188964844, + "step": 4381 + }, + { + "epoch": 2.72597200622084, + "grad_norm": 0.07816386222839355, + "learning_rate": 5.059935454126326e-07, + "logits/chosen": -2.1264593601226807, + "logits/rejected": 2.295696496963501, + "logps/chosen": -424.7033386230469, + "logps/rejected": -1066.138671875, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.95219898223877, + "rewards/margins": 36.07676315307617, + "rewards/rejected": -45.028961181640625, + "step": 4382 + }, + { + "epoch": 2.7265940902021772, + "grad_norm": 0.1827636957168579, + "learning_rate": 5.048409405255878e-07, + "logits/chosen": 0.587814211845398, + "logits/rejected": 1.550323247909546, + "logps/chosen": -459.2896728515625, + "logps/rejected": -797.319091796875, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.950087547302246, + "rewards/margins": 27.087358474731445, + "rewards/rejected": -37.037445068359375, + "step": 4383 + }, + { + "epoch": 2.727216174183515, + "grad_norm": 0.018102193251252174, + "learning_rate": 5.036883356385432e-07, + "logits/chosen": -1.172330617904663, + "logits/rejected": 2.191157817840576, + "logps/chosen": -502.16253662109375, + "logps/rejected": -955.5294799804688, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.382427215576172, + "rewards/margins": 27.705997467041016, + "rewards/rejected": -37.08842849731445, + "step": 4384 + }, + { + "epoch": 2.7278382581648524, + "grad_norm": 0.0841863602399826, + "learning_rate": 5.025357307514984e-07, + "logits/chosen": 1.7941412925720215, + "logits/rejected": 2.9506564140319824, + "logps/chosen": -797.8958129882812, + "logps/rejected": -1073.514892578125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.37309455871582, + "rewards/margins": 23.541181564331055, + "rewards/rejected": -37.914276123046875, + "step": 4385 + }, + { + "epoch": 2.7284603421461897, + "grad_norm": 0.05608074739575386, + "learning_rate": 5.013831258644537e-07, + "logits/chosen": 0.6043331623077393, + "logits/rejected": 1.9576436281204224, + "logps/chosen": -613.4882202148438, + "logps/rejected": -1038.0992431640625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.326175689697266, + "rewards/margins": 32.297691345214844, + "rewards/rejected": -39.623870849609375, + "step": 4386 + }, + { + "epoch": 2.729082426127527, + "grad_norm": 0.09061995893716812, + "learning_rate": 5.002305209774091e-07, + "logits/chosen": 2.2635750770568848, + "logits/rejected": 3.9595303535461426, + "logps/chosen": -664.990966796875, + "logps/rejected": -1020.809326171875, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.448055267333984, + "rewards/margins": 25.402923583984375, + "rewards/rejected": -32.85097885131836, + "step": 4387 + }, + { + "epoch": 2.729704510108865, + "grad_norm": 0.0019873238634318113, + "learning_rate": 4.990779160903643e-07, + "logits/chosen": -3.2694735527038574, + "logits/rejected": 1.7833689451217651, + "logps/chosen": -427.5120849609375, + "logps/rejected": -1065.823486328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.329551696777344, + "rewards/margins": 34.55972671508789, + "rewards/rejected": -44.889278411865234, + "step": 4388 + }, + { + "epoch": 2.730326594090202, + "grad_norm": 0.08666915446519852, + "learning_rate": 4.979253112033195e-07, + "logits/chosen": -1.542961597442627, + "logits/rejected": 2.0044682025909424, + "logps/chosen": -452.8818664550781, + "logps/rejected": -1085.8095703125, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.248226642608643, + "rewards/margins": 37.20758056640625, + "rewards/rejected": -43.455806732177734, + "step": 4389 + }, + { + "epoch": 2.7309486780715395, + "grad_norm": 9.85058879852295, + "learning_rate": 4.967727063162748e-07, + "logits/chosen": 1.9844534397125244, + "logits/rejected": 3.299943685531616, + "logps/chosen": -642.399169921875, + "logps/rejected": -1097.474609375, + "loss": 0.0413, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.411534309387207, + "rewards/margins": 29.400665283203125, + "rewards/rejected": -40.81220245361328, + "step": 4390 + }, + { + "epoch": 2.7315707620528773, + "grad_norm": 0.009679671376943588, + "learning_rate": 4.956201014292301e-07, + "logits/chosen": 0.39165472984313965, + "logits/rejected": 4.176124572753906, + "logps/chosen": -613.4664306640625, + "logps/rejected": -1131.4798583984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.620540142059326, + "rewards/margins": 28.63414192199707, + "rewards/rejected": -35.25468444824219, + "step": 4391 + }, + { + "epoch": 2.7321928460342146, + "grad_norm": 6.542808478116058e-06, + "learning_rate": 4.944674965421854e-07, + "logits/chosen": 1.009442925453186, + "logits/rejected": 2.719297409057617, + "logps/chosen": -727.4083251953125, + "logps/rejected": -1082.810791015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.238219261169434, + "rewards/margins": 30.47528076171875, + "rewards/rejected": -41.7135009765625, + "step": 4392 + }, + { + "epoch": 2.732814930015552, + "grad_norm": 1.4480074644088745, + "learning_rate": 4.933148916551406e-07, + "logits/chosen": -0.8078665733337402, + "logits/rejected": 2.1155059337615967, + "logps/chosen": -473.40826416015625, + "logps/rejected": -780.3966064453125, + "loss": 0.0052, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.8958587646484375, + "rewards/margins": 15.986709594726562, + "rewards/rejected": -22.882568359375, + "step": 4393 + }, + { + "epoch": 2.7334370139968893, + "grad_norm": 0.0005617666174657643, + "learning_rate": 4.92162286768096e-07, + "logits/chosen": -1.7720906734466553, + "logits/rejected": 2.736274003982544, + "logps/chosen": -399.5456848144531, + "logps/rejected": -1101.6707763671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.80557918548584, + "rewards/margins": 35.24809265136719, + "rewards/rejected": -42.053672790527344, + "step": 4394 + }, + { + "epoch": 2.734059097978227, + "grad_norm": 0.002742925425991416, + "learning_rate": 4.910096818810512e-07, + "logits/chosen": -1.521073341369629, + "logits/rejected": 2.829493999481201, + "logps/chosen": -433.4801330566406, + "logps/rejected": -994.6790771484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.705349445343018, + "rewards/margins": 33.4708251953125, + "rewards/rejected": -40.176177978515625, + "step": 4395 + }, + { + "epoch": 2.7346811819595644, + "grad_norm": 0.03669571876525879, + "learning_rate": 4.898570769940065e-07, + "logits/chosen": -1.7508848905563354, + "logits/rejected": 1.0612797737121582, + "logps/chosen": -548.3234252929688, + "logps/rejected": -1094.0908203125, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.284712791442871, + "rewards/margins": 34.96992492675781, + "rewards/rejected": -47.254634857177734, + "step": 4396 + }, + { + "epoch": 2.7353032659409022, + "grad_norm": 0.10406485944986343, + "learning_rate": 4.887044721069617e-07, + "logits/chosen": -0.0743609368801117, + "logits/rejected": 4.600546836853027, + "logps/chosen": -567.2088623046875, + "logps/rejected": -1132.5198974609375, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.125041961669922, + "rewards/margins": 28.387611389160156, + "rewards/rejected": -37.51264953613281, + "step": 4397 + }, + { + "epoch": 2.7359253499222396, + "grad_norm": 0.18575268983840942, + "learning_rate": 4.875518672199171e-07, + "logits/chosen": -0.16322201490402222, + "logits/rejected": 4.049489974975586, + "logps/chosen": -525.8208618164062, + "logps/rejected": -1135.487060546875, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.548019409179688, + "rewards/margins": 35.76894760131836, + "rewards/rejected": -44.31696701049805, + "step": 4398 + }, + { + "epoch": 2.736547433903577, + "grad_norm": 0.05440472811460495, + "learning_rate": 4.863992623328723e-07, + "logits/chosen": -0.8141235113143921, + "logits/rejected": 4.115907192230225, + "logps/chosen": -467.80584716796875, + "logps/rejected": -1172.6866455078125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.733298301696777, + "rewards/margins": 39.912601470947266, + "rewards/rejected": -50.645896911621094, + "step": 4399 + }, + { + "epoch": 2.7371695178849142, + "grad_norm": 1.0240157166663266e-08, + "learning_rate": 4.852466574458276e-07, + "logits/chosen": -3.2142281532287598, + "logits/rejected": 3.145437479019165, + "logps/chosen": -326.4961242675781, + "logps/rejected": -974.7950439453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.854359149932861, + "rewards/margins": 35.600135803222656, + "rewards/rejected": -43.454498291015625, + "step": 4400 + }, + { + "epoch": 2.737791601866252, + "grad_norm": 19.847810745239258, + "learning_rate": 4.84094052558783e-07, + "logits/chosen": 0.38483256101608276, + "logits/rejected": 2.9243876934051514, + "logps/chosen": -568.7760620117188, + "logps/rejected": -860.8410034179688, + "loss": 0.1395, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.1824369430542, + "rewards/margins": 18.987302780151367, + "rewards/rejected": -30.169742584228516, + "step": 4401 + }, + { + "epoch": 2.7384136858475894, + "grad_norm": 0.007317651528865099, + "learning_rate": 4.829414476717382e-07, + "logits/chosen": 0.06382274627685547, + "logits/rejected": 2.503971576690674, + "logps/chosen": -651.1002197265625, + "logps/rejected": -989.615234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.62253189086914, + "rewards/margins": 26.692480087280273, + "rewards/rejected": -36.31501007080078, + "step": 4402 + }, + { + "epoch": 2.739035769828927, + "grad_norm": 0.0010312871308997273, + "learning_rate": 4.817888427846935e-07, + "logits/chosen": 0.0007169246673583984, + "logits/rejected": 3.0172805786132812, + "logps/chosen": -429.4022216796875, + "logps/rejected": -827.8827514648438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.983163833618164, + "rewards/margins": 23.772552490234375, + "rewards/rejected": -29.755714416503906, + "step": 4403 + }, + { + "epoch": 2.7396578538102645, + "grad_norm": 2.757115602493286, + "learning_rate": 4.806362378976487e-07, + "logits/chosen": 0.38977721333503723, + "logits/rejected": 2.635216236114502, + "logps/chosen": -659.2049560546875, + "logps/rejected": -903.1165771484375, + "loss": 0.0153, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.251816749572754, + "rewards/margins": 14.301630973815918, + "rewards/rejected": -21.553447723388672, + "step": 4404 + }, + { + "epoch": 2.740279937791602, + "grad_norm": 0.03386557102203369, + "learning_rate": 4.794836330106041e-07, + "logits/chosen": 2.021665573120117, + "logits/rejected": 3.6409528255462646, + "logps/chosen": -555.582763671875, + "logps/rejected": -898.3143310546875, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.261466979980469, + "rewards/margins": 22.672351837158203, + "rewards/rejected": -29.933818817138672, + "step": 4405 + }, + { + "epoch": 2.740902021772939, + "grad_norm": 0.00021831082995049655, + "learning_rate": 4.783310281235593e-07, + "logits/chosen": -1.1020753383636475, + "logits/rejected": 2.6016407012939453, + "logps/chosen": -515.0692749023438, + "logps/rejected": -1046.1241455078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.593591213226318, + "rewards/margins": 31.335071563720703, + "rewards/rejected": -38.92866134643555, + "step": 4406 + }, + { + "epoch": 2.741524105754277, + "grad_norm": 1.8972793817520142, + "learning_rate": 4.771784232365145e-07, + "logits/chosen": 3.971691846847534, + "logits/rejected": 4.097461223602295, + "logps/chosen": -809.5933227539062, + "logps/rejected": -1073.0018310546875, + "loss": 0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.114361763000488, + "rewards/margins": 22.81982421875, + "rewards/rejected": -30.934188842773438, + "step": 4407 + }, + { + "epoch": 2.7421461897356143, + "grad_norm": 3.0011490252945805e-06, + "learning_rate": 4.7602581834946984e-07, + "logits/chosen": -0.5607624650001526, + "logits/rejected": 0.6232677102088928, + "logps/chosen": -511.8693542480469, + "logps/rejected": -843.8756103515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.825777530670166, + "rewards/margins": 25.076919555664062, + "rewards/rejected": -32.9026985168457, + "step": 4408 + }, + { + "epoch": 2.7427682737169516, + "grad_norm": 51.671817779541016, + "learning_rate": 4.748732134624251e-07, + "logits/chosen": 0.5230734944343567, + "logits/rejected": 2.3547849655151367, + "logps/chosen": -531.5377807617188, + "logps/rejected": -943.6278076171875, + "loss": 0.3686, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.170397758483887, + "rewards/margins": 27.10638999938965, + "rewards/rejected": -37.27678680419922, + "step": 4409 + }, + { + "epoch": 2.7433903576982894, + "grad_norm": 1.494800550005948e-08, + "learning_rate": 4.737206085753804e-07, + "logits/chosen": 1.9931923151016235, + "logits/rejected": 3.8050477504730225, + "logps/chosen": -661.6253662109375, + "logps/rejected": -1054.855224609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.947713851928711, + "rewards/margins": 32.363162994384766, + "rewards/rejected": -43.31087875366211, + "step": 4410 + }, + { + "epoch": 2.7440124416796268, + "grad_norm": 2.0981217403459596e-06, + "learning_rate": 4.7256800368833567e-07, + "logits/chosen": 0.8072133660316467, + "logits/rejected": 4.635097026824951, + "logps/chosen": -504.37249755859375, + "logps/rejected": -1036.0225830078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.078475952148438, + "rewards/margins": 30.97477149963379, + "rewards/rejected": -40.053245544433594, + "step": 4411 + }, + { + "epoch": 2.744634525660964, + "grad_norm": 1.6119975043693557e-05, + "learning_rate": 4.7141539880129095e-07, + "logits/chosen": 0.30014097690582275, + "logits/rejected": 2.809941291809082, + "logps/chosen": -473.0091857910156, + "logps/rejected": -881.51611328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.943344116210938, + "rewards/margins": 25.527257919311523, + "rewards/rejected": -34.47060012817383, + "step": 4412 + }, + { + "epoch": 2.7452566096423014, + "grad_norm": 0.10137374699115753, + "learning_rate": 4.702627939142462e-07, + "logits/chosen": 1.4836374521255493, + "logits/rejected": 3.366061210632324, + "logps/chosen": -570.8435668945312, + "logps/rejected": -958.2086181640625, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.571364402770996, + "rewards/margins": 27.339611053466797, + "rewards/rejected": -38.910972595214844, + "step": 4413 + }, + { + "epoch": 2.7458786936236392, + "grad_norm": 0.029282765462994576, + "learning_rate": 4.691101890272015e-07, + "logits/chosen": -0.6731588840484619, + "logits/rejected": 2.8208022117614746, + "logps/chosen": -574.0529174804688, + "logps/rejected": -1081.0164794921875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.4242472648620605, + "rewards/margins": 30.81102752685547, + "rewards/rejected": -38.23527526855469, + "step": 4414 + }, + { + "epoch": 2.7465007776049766, + "grad_norm": 0.43205007910728455, + "learning_rate": 4.679575841401568e-07, + "logits/chosen": 0.5426101088523865, + "logits/rejected": 2.9084668159484863, + "logps/chosen": -659.4035034179688, + "logps/rejected": -1100.140625, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.613444328308105, + "rewards/margins": 25.915149688720703, + "rewards/rejected": -36.528594970703125, + "step": 4415 + }, + { + "epoch": 2.7471228615863144, + "grad_norm": 8.718097524251789e-05, + "learning_rate": 4.6680497925311206e-07, + "logits/chosen": -3.586723804473877, + "logits/rejected": -0.23334333300590515, + "logps/chosen": -410.1449890136719, + "logps/rejected": -930.001953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.797764301300049, + "rewards/margins": 33.449073791503906, + "rewards/rejected": -38.2468376159668, + "step": 4416 + }, + { + "epoch": 2.7477449455676517, + "grad_norm": 0.34267765283584595, + "learning_rate": 4.6565237436606734e-07, + "logits/chosen": 2.4590861797332764, + "logits/rejected": 4.483171463012695, + "logps/chosen": -570.5287475585938, + "logps/rejected": -870.416015625, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.81176233291626, + "rewards/margins": 20.070743560791016, + "rewards/rejected": -27.882505416870117, + "step": 4417 + }, + { + "epoch": 2.748367029548989, + "grad_norm": 10.427054405212402, + "learning_rate": 4.6449976947902267e-07, + "logits/chosen": -0.17008568346500397, + "logits/rejected": 2.956789016723633, + "logps/chosen": -639.9490966796875, + "logps/rejected": -1002.490478515625, + "loss": 0.0448, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.339786529541016, + "rewards/margins": 24.56946563720703, + "rewards/rejected": -34.90925216674805, + "step": 4418 + }, + { + "epoch": 2.7489891135303264, + "grad_norm": 0.023447509855031967, + "learning_rate": 4.6334716459197795e-07, + "logits/chosen": 1.941941738128662, + "logits/rejected": 5.219603061676025, + "logps/chosen": -556.0087280273438, + "logps/rejected": -958.0289306640625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.532724380493164, + "rewards/margins": 26.292644500732422, + "rewards/rejected": -33.82536697387695, + "step": 4419 + }, + { + "epoch": 2.749611197511664, + "grad_norm": 0.8535892963409424, + "learning_rate": 4.621945597049332e-07, + "logits/chosen": 0.18626666069030762, + "logits/rejected": 3.3300750255584717, + "logps/chosen": -559.193603515625, + "logps/rejected": -988.401611328125, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.761933326721191, + "rewards/margins": 21.86284065246582, + "rewards/rejected": -33.62477111816406, + "step": 4420 + }, + { + "epoch": 2.7502332814930015, + "grad_norm": 0.0011312151327729225, + "learning_rate": 4.610419548178885e-07, + "logits/chosen": 0.1989070177078247, + "logits/rejected": 4.104744911193848, + "logps/chosen": -471.160400390625, + "logps/rejected": -1040.41552734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.696352005004883, + "rewards/margins": 33.65787887573242, + "rewards/rejected": -44.35422897338867, + "step": 4421 + }, + { + "epoch": 2.7508553654743393, + "grad_norm": 0.02870684675872326, + "learning_rate": 4.598893499308438e-07, + "logits/chosen": -2.251566171646118, + "logits/rejected": 3.607893705368042, + "logps/chosen": -386.98480224609375, + "logps/rejected": -1097.69384765625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.63227653503418, + "rewards/margins": 32.436458587646484, + "rewards/rejected": -41.06873321533203, + "step": 4422 + }, + { + "epoch": 2.7514774494556766, + "grad_norm": 41.48528289794922, + "learning_rate": 4.58736745043799e-07, + "logits/chosen": -2.2110953330993652, + "logits/rejected": 2.933452606201172, + "logps/chosen": -511.3760070800781, + "logps/rejected": -1112.38916015625, + "loss": 0.5971, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.268409729003906, + "rewards/margins": 28.931161880493164, + "rewards/rejected": -37.1995735168457, + "step": 4423 + }, + { + "epoch": 2.752099533437014, + "grad_norm": 0.0033406876027584076, + "learning_rate": 4.575841401567543e-07, + "logits/chosen": -0.09376183152198792, + "logits/rejected": 2.7225241661071777, + "logps/chosen": -523.5830078125, + "logps/rejected": -1034.5584716796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.04576301574707, + "rewards/margins": 32.29011154174805, + "rewards/rejected": -41.33587646484375, + "step": 4424 + }, + { + "epoch": 2.7527216174183513, + "grad_norm": 0.0003896571579389274, + "learning_rate": 4.5643153526970956e-07, + "logits/chosen": -0.23421478271484375, + "logits/rejected": 3.9801552295684814, + "logps/chosen": -536.8067626953125, + "logps/rejected": -1102.0413818359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.031307220458984, + "rewards/margins": 35.871116638183594, + "rewards/rejected": -44.90242004394531, + "step": 4425 + }, + { + "epoch": 2.753343701399689, + "grad_norm": 0.13651464879512787, + "learning_rate": 4.5527893038266484e-07, + "logits/chosen": 1.7377511262893677, + "logits/rejected": 2.5145273208618164, + "logps/chosen": -729.896484375, + "logps/rejected": -944.122802734375, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.97828483581543, + "rewards/margins": 22.620149612426758, + "rewards/rejected": -33.59843444824219, + "step": 4426 + }, + { + "epoch": 2.7539657853810264, + "grad_norm": 0.0006300930981524289, + "learning_rate": 4.541263254956201e-07, + "logits/chosen": -1.6989445686340332, + "logits/rejected": 4.08779239654541, + "logps/chosen": -300.9146728515625, + "logps/rejected": -1021.2611083984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.203524112701416, + "rewards/margins": 39.75101089477539, + "rewards/rejected": -43.95453643798828, + "step": 4427 + }, + { + "epoch": 2.7545878693623638, + "grad_norm": 7.86553391662892e-06, + "learning_rate": 4.529737206085754e-07, + "logits/chosen": 1.9402748346328735, + "logits/rejected": 3.665144443511963, + "logps/chosen": -575.7200927734375, + "logps/rejected": -916.6759643554688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.00657844543457, + "rewards/margins": 26.892932891845703, + "rewards/rejected": -35.899513244628906, + "step": 4428 + }, + { + "epoch": 2.7552099533437016, + "grad_norm": 2.7728248824132606e-05, + "learning_rate": 4.5182111572153067e-07, + "logits/chosen": 2.1965889930725098, + "logits/rejected": 2.6689367294311523, + "logps/chosen": -504.8052062988281, + "logps/rejected": -885.7496337890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.474845886230469, + "rewards/margins": 29.399259567260742, + "rewards/rejected": -38.874107360839844, + "step": 4429 + }, + { + "epoch": 2.755832037325039, + "grad_norm": 0.06133288890123367, + "learning_rate": 4.5066851083448595e-07, + "logits/chosen": 1.7043501138687134, + "logits/rejected": 3.910205125808716, + "logps/chosen": -513.8598022460938, + "logps/rejected": -825.887451171875, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.094804763793945, + "rewards/margins": 23.070405960083008, + "rewards/rejected": -31.165210723876953, + "step": 4430 + }, + { + "epoch": 2.7564541213063762, + "grad_norm": 0.05586942657828331, + "learning_rate": 4.495159059474413e-07, + "logits/chosen": 0.8189829587936401, + "logits/rejected": 2.6572515964508057, + "logps/chosen": -621.164794921875, + "logps/rejected": -890.162353515625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.553102493286133, + "rewards/margins": 17.43643569946289, + "rewards/rejected": -28.989540100097656, + "step": 4431 + }, + { + "epoch": 2.7570762052877136, + "grad_norm": 0.06295454502105713, + "learning_rate": 4.4836330106039656e-07, + "logits/chosen": 0.7370268106460571, + "logits/rejected": 2.5953383445739746, + "logps/chosen": -539.400390625, + "logps/rejected": -952.5663452148438, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.905994415283203, + "rewards/margins": 27.762670516967773, + "rewards/rejected": -36.668663024902344, + "step": 4432 + }, + { + "epoch": 2.7576982892690514, + "grad_norm": 26.892684936523438, + "learning_rate": 4.4721069617335183e-07, + "logits/chosen": -2.1785507202148438, + "logits/rejected": 2.2861528396606445, + "logps/chosen": -470.45953369140625, + "logps/rejected": -901.58642578125, + "loss": 0.1868, + "rewards/accuracies": 0.875, + "rewards/chosen": -7.75625467300415, + "rewards/margins": 17.602706909179688, + "rewards/rejected": -25.358963012695312, + "step": 4433 + }, + { + "epoch": 2.7583203732503887, + "grad_norm": 6.219872375368141e-06, + "learning_rate": 4.460580912863071e-07, + "logits/chosen": 1.8374208211898804, + "logits/rejected": 3.0011253356933594, + "logps/chosen": -687.8511352539062, + "logps/rejected": -1078.059814453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.497608184814453, + "rewards/margins": 29.768260955810547, + "rewards/rejected": -44.265869140625, + "step": 4434 + }, + { + "epoch": 2.7589424572317265, + "grad_norm": 0.0023374557495117188, + "learning_rate": 4.449054863992624e-07, + "logits/chosen": -0.7675460577011108, + "logits/rejected": 4.285062789916992, + "logps/chosen": -476.12841796875, + "logps/rejected": -1053.994140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.0022873878479, + "rewards/margins": 27.987964630126953, + "rewards/rejected": -33.99024963378906, + "step": 4435 + }, + { + "epoch": 2.759564541213064, + "grad_norm": 6.796044181101024e-05, + "learning_rate": 4.4375288151221767e-07, + "logits/chosen": -3.5497636795043945, + "logits/rejected": 2.2416834831237793, + "logps/chosen": -369.75775146484375, + "logps/rejected": -1062.5948486328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.879331588745117, + "rewards/margins": 31.077716827392578, + "rewards/rejected": -38.95705032348633, + "step": 4436 + }, + { + "epoch": 2.760186625194401, + "grad_norm": 5.898369015433502e-10, + "learning_rate": 4.4260027662517294e-07, + "logits/chosen": 0.45849087834358215, + "logits/rejected": 2.6137187480926514, + "logps/chosen": -578.7003173828125, + "logps/rejected": -978.3519287109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.692137241363525, + "rewards/margins": 34.749053955078125, + "rewards/rejected": -40.441192626953125, + "step": 4437 + }, + { + "epoch": 2.7608087091757385, + "grad_norm": 0.0017215252155438066, + "learning_rate": 4.414476717381282e-07, + "logits/chosen": -1.5381698608398438, + "logits/rejected": 3.0950610637664795, + "logps/chosen": -471.3437194824219, + "logps/rejected": -1032.288330078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.139705657958984, + "rewards/margins": 33.45232009887695, + "rewards/rejected": -44.59202575683594, + "step": 4438 + }, + { + "epoch": 2.7614307931570763, + "grad_norm": 3.451422691345215, + "learning_rate": 4.402950668510835e-07, + "logits/chosen": 0.4012671113014221, + "logits/rejected": 2.3096537590026855, + "logps/chosen": -562.41796875, + "logps/rejected": -933.130126953125, + "loss": 0.012, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.880684852600098, + "rewards/margins": 20.48157501220703, + "rewards/rejected": -33.36226272583008, + "step": 4439 + }, + { + "epoch": 2.7620528771384136, + "grad_norm": 0.32829925417900085, + "learning_rate": 4.391424619640387e-07, + "logits/chosen": 0.8047013878822327, + "logits/rejected": 4.585402488708496, + "logps/chosen": -422.2301025390625, + "logps/rejected": -914.16748046875, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.537834644317627, + "rewards/margins": 23.84255599975586, + "rewards/rejected": -31.380390167236328, + "step": 4440 + }, + { + "epoch": 2.7626749611197514, + "grad_norm": 0.0013651353074237704, + "learning_rate": 4.37989857076994e-07, + "logits/chosen": -1.0214922428131104, + "logits/rejected": 3.771331787109375, + "logps/chosen": -444.5788269042969, + "logps/rejected": -1036.6204833984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.631132125854492, + "rewards/margins": 32.456321716308594, + "rewards/rejected": -44.08745193481445, + "step": 4441 + }, + { + "epoch": 2.7632970451010888, + "grad_norm": 0.030825063586235046, + "learning_rate": 4.368372521899493e-07, + "logits/chosen": 1.3546017408370972, + "logits/rejected": 4.663175106048584, + "logps/chosen": -685.845458984375, + "logps/rejected": -1188.11474609375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.846428871154785, + "rewards/margins": 31.212594985961914, + "rewards/rejected": -39.059024810791016, + "step": 4442 + }, + { + "epoch": 2.763919129082426, + "grad_norm": 1.188171625137329, + "learning_rate": 4.3568464730290456e-07, + "logits/chosen": 0.717941164970398, + "logits/rejected": 3.334520101547241, + "logps/chosen": -399.744140625, + "logps/rejected": -762.46923828125, + "loss": 0.0148, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.898489952087402, + "rewards/margins": 20.447917938232422, + "rewards/rejected": -26.346406936645508, + "step": 4443 + }, + { + "epoch": 2.7645412130637634, + "grad_norm": 1.4317882061004639, + "learning_rate": 4.345320424158599e-07, + "logits/chosen": 1.807151198387146, + "logits/rejected": 3.9615252017974854, + "logps/chosen": -687.8176879882812, + "logps/rejected": -1120.4713134765625, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.907222747802734, + "rewards/margins": 27.348201751708984, + "rewards/rejected": -40.25542449951172, + "step": 4444 + }, + { + "epoch": 2.765163297045101, + "grad_norm": 0.00014324445510283113, + "learning_rate": 4.3337943752881517e-07, + "logits/chosen": 0.6386379599571228, + "logits/rejected": 3.6459169387817383, + "logps/chosen": -518.3583984375, + "logps/rejected": -903.1107788085938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.458932876586914, + "rewards/margins": 24.56821632385254, + "rewards/rejected": -33.02714538574219, + "step": 4445 + }, + { + "epoch": 2.7657853810264386, + "grad_norm": 2.535586190788308e-06, + "learning_rate": 4.3222683264177044e-07, + "logits/chosen": -0.27193400263786316, + "logits/rejected": 1.8687951564788818, + "logps/chosen": -444.036376953125, + "logps/rejected": -907.522216796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.857758045196533, + "rewards/margins": 32.78850555419922, + "rewards/rejected": -40.646263122558594, + "step": 4446 + }, + { + "epoch": 2.766407465007776, + "grad_norm": 0.7894816994667053, + "learning_rate": 4.310742277547257e-07, + "logits/chosen": -0.8826746940612793, + "logits/rejected": 2.4022674560546875, + "logps/chosen": -437.020263671875, + "logps/rejected": -951.0142822265625, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.185009956359863, + "rewards/margins": 29.816877365112305, + "rewards/rejected": -35.001888275146484, + "step": 4447 + }, + { + "epoch": 2.7670295489891137, + "grad_norm": 4.225468001095578e-05, + "learning_rate": 4.29921622867681e-07, + "logits/chosen": 0.9749264717102051, + "logits/rejected": 2.5457139015197754, + "logps/chosen": -483.837646484375, + "logps/rejected": -826.4235229492188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.304150104522705, + "rewards/margins": 26.983564376831055, + "rewards/rejected": -33.28771209716797, + "step": 4448 + }, + { + "epoch": 2.767651632970451, + "grad_norm": 8.3215105405543e-05, + "learning_rate": 4.287690179806363e-07, + "logits/chosen": -1.8269789218902588, + "logits/rejected": 2.920605182647705, + "logps/chosen": -286.757080078125, + "logps/rejected": -869.705810546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.869862079620361, + "rewards/margins": 28.93183708190918, + "rewards/rejected": -35.801700592041016, + "step": 4449 + }, + { + "epoch": 2.7682737169517884, + "grad_norm": 0.00034528595278970897, + "learning_rate": 4.2761641309359155e-07, + "logits/chosen": 1.6075466871261597, + "logits/rejected": 3.9959115982055664, + "logps/chosen": -627.35205078125, + "logps/rejected": -1138.360107421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.445072174072266, + "rewards/margins": 32.44855880737305, + "rewards/rejected": -43.89363098144531, + "step": 4450 + }, + { + "epoch": 2.7688958009331257, + "grad_norm": 0.00025485100923106074, + "learning_rate": 4.2646380820654683e-07, + "logits/chosen": 0.14457732439041138, + "logits/rejected": 3.260528564453125, + "logps/chosen": -507.02459716796875, + "logps/rejected": -1023.6856689453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.381438255310059, + "rewards/margins": 34.37653732299805, + "rewards/rejected": -42.757972717285156, + "step": 4451 + }, + { + "epoch": 2.7695178849144635, + "grad_norm": 0.012433369643986225, + "learning_rate": 4.253112033195021e-07, + "logits/chosen": 0.6450670957565308, + "logits/rejected": 2.940737009048462, + "logps/chosen": -562.45263671875, + "logps/rejected": -1125.9609375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.368751525878906, + "rewards/margins": 34.92451858520508, + "rewards/rejected": -42.29327392578125, + "step": 4452 + }, + { + "epoch": 2.770139968895801, + "grad_norm": 6.034801117493771e-05, + "learning_rate": 4.241585984324574e-07, + "logits/chosen": 1.3404961824417114, + "logits/rejected": 3.4674181938171387, + "logps/chosen": -577.1162109375, + "logps/rejected": -947.2723999023438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.663448333740234, + "rewards/margins": 31.020864486694336, + "rewards/rejected": -39.6843147277832, + "step": 4453 + }, + { + "epoch": 2.7707620528771386, + "grad_norm": 38.70772933959961, + "learning_rate": 4.2300599354541266e-07, + "logits/chosen": -0.14834386110305786, + "logits/rejected": 2.671330213546753, + "logps/chosen": -505.04937744140625, + "logps/rejected": -908.2825927734375, + "loss": 0.4691, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.919454574584961, + "rewards/margins": 24.19499397277832, + "rewards/rejected": -33.114444732666016, + "step": 4454 + }, + { + "epoch": 2.771384136858476, + "grad_norm": 0.008293528109788895, + "learning_rate": 4.21853388658368e-07, + "logits/chosen": -1.5592821836471558, + "logits/rejected": 2.4315786361694336, + "logps/chosen": -591.888671875, + "logps/rejected": -1101.0301513671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.967531204223633, + "rewards/margins": 32.232948303222656, + "rewards/rejected": -43.20048141479492, + "step": 4455 + }, + { + "epoch": 2.7720062208398133, + "grad_norm": 0.4753230810165405, + "learning_rate": 4.2070078377132327e-07, + "logits/chosen": -1.0049407482147217, + "logits/rejected": 1.6527700424194336, + "logps/chosen": -504.166748046875, + "logps/rejected": -987.4268798828125, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.276983261108398, + "rewards/margins": 26.664608001708984, + "rewards/rejected": -35.94158935546875, + "step": 4456 + }, + { + "epoch": 2.7726283048211506, + "grad_norm": 0.0030523869208991528, + "learning_rate": 4.1954817888427844e-07, + "logits/chosen": -0.23812870681285858, + "logits/rejected": 2.4745800495147705, + "logps/chosen": -390.5793762207031, + "logps/rejected": -880.6826171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.854804515838623, + "rewards/margins": 28.500247955322266, + "rewards/rejected": -35.35505294799805, + "step": 4457 + }, + { + "epoch": 2.7732503888024884, + "grad_norm": 1.311348825083769e-07, + "learning_rate": 4.183955739972338e-07, + "logits/chosen": 0.2634304463863373, + "logits/rejected": 2.0421676635742188, + "logps/chosen": -627.1038208007812, + "logps/rejected": -1146.34423828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.411871910095215, + "rewards/margins": 36.43199920654297, + "rewards/rejected": -46.843868255615234, + "step": 4458 + }, + { + "epoch": 2.7738724727838258, + "grad_norm": 0.019252663478255272, + "learning_rate": 4.1724296911018905e-07, + "logits/chosen": -0.02178037166595459, + "logits/rejected": 2.942920207977295, + "logps/chosen": -406.655517578125, + "logps/rejected": -904.6901245117188, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.956550598144531, + "rewards/margins": 27.37434959411621, + "rewards/rejected": -36.330902099609375, + "step": 4459 + }, + { + "epoch": 2.7744945567651635, + "grad_norm": 0.012821480631828308, + "learning_rate": 4.1609036422314433e-07, + "logits/chosen": 1.586922526359558, + "logits/rejected": 0.9899404048919678, + "logps/chosen": -728.854736328125, + "logps/rejected": -999.9464111328125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.720413208007812, + "rewards/margins": 25.15831184387207, + "rewards/rejected": -36.878726959228516, + "step": 4460 + }, + { + "epoch": 2.775116640746501, + "grad_norm": 0.0007586081046611071, + "learning_rate": 4.149377593360996e-07, + "logits/chosen": 1.5977089405059814, + "logits/rejected": 3.58918833732605, + "logps/chosen": -558.2069702148438, + "logps/rejected": -910.0013427734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.199235916137695, + "rewards/margins": 22.81451416015625, + "rewards/rejected": -27.013748168945312, + "step": 4461 + }, + { + "epoch": 2.7757387247278382, + "grad_norm": 0.08420630544424057, + "learning_rate": 4.137851544490549e-07, + "logits/chosen": 0.057189732789993286, + "logits/rejected": 1.9672027826309204, + "logps/chosen": -508.4757080078125, + "logps/rejected": -935.8024291992188, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.608369827270508, + "rewards/margins": 26.938785552978516, + "rewards/rejected": -33.547157287597656, + "step": 4462 + }, + { + "epoch": 2.7763608087091756, + "grad_norm": 4.442808130988851e-05, + "learning_rate": 4.1263254956201016e-07, + "logits/chosen": 2.554918050765991, + "logits/rejected": 3.760105848312378, + "logps/chosen": -675.0526123046875, + "logps/rejected": -1031.32080078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.590851783752441, + "rewards/margins": 26.76144027709961, + "rewards/rejected": -34.352291107177734, + "step": 4463 + }, + { + "epoch": 2.7769828926905133, + "grad_norm": 29.725894927978516, + "learning_rate": 4.1147994467496544e-07, + "logits/chosen": 2.8413314819335938, + "logits/rejected": 5.103183746337891, + "logps/chosen": -694.7820434570312, + "logps/rejected": -1091.3458251953125, + "loss": 0.2112, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.534599304199219, + "rewards/margins": 25.128934860229492, + "rewards/rejected": -35.66353225708008, + "step": 4464 + }, + { + "epoch": 2.7776049766718507, + "grad_norm": 2.02376651763916, + "learning_rate": 4.103273397879207e-07, + "logits/chosen": -0.6693498492240906, + "logits/rejected": 1.4553828239440918, + "logps/chosen": -387.061279296875, + "logps/rejected": -801.5008544921875, + "loss": 0.0119, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7194764614105225, + "rewards/margins": 25.815839767456055, + "rewards/rejected": -29.535316467285156, + "step": 4465 + }, + { + "epoch": 2.778227060653188, + "grad_norm": 0.07350405305624008, + "learning_rate": 4.09174734900876e-07, + "logits/chosen": 0.9615667462348938, + "logits/rejected": 3.035022258758545, + "logps/chosen": -574.1995239257812, + "logps/rejected": -965.72607421875, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.539543151855469, + "rewards/margins": 29.673593521118164, + "rewards/rejected": -35.213134765625, + "step": 4466 + }, + { + "epoch": 2.778849144634526, + "grad_norm": 1.4751883745193481, + "learning_rate": 4.080221300138313e-07, + "logits/chosen": -1.4429993629455566, + "logits/rejected": 0.7418410778045654, + "logps/chosen": -419.0539245605469, + "logps/rejected": -773.825439453125, + "loss": 0.0278, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.026326179504395, + "rewards/margins": 21.981840133666992, + "rewards/rejected": -30.008167266845703, + "step": 4467 + }, + { + "epoch": 2.779471228615863, + "grad_norm": 4.9203539674635977e-05, + "learning_rate": 4.068695251267866e-07, + "logits/chosen": -1.568742275238037, + "logits/rejected": 1.5835626125335693, + "logps/chosen": -339.059814453125, + "logps/rejected": -995.8690185546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.751646518707275, + "rewards/margins": 37.416934967041016, + "rewards/rejected": -42.168582916259766, + "step": 4468 + }, + { + "epoch": 2.7800933125972005, + "grad_norm": 0.014542018994688988, + "learning_rate": 4.057169202397419e-07, + "logits/chosen": 0.9699658751487732, + "logits/rejected": 0.5567260980606079, + "logps/chosen": -613.11572265625, + "logps/rejected": -860.3040771484375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.781393051147461, + "rewards/margins": 26.79910659790039, + "rewards/rejected": -35.58049774169922, + "step": 4469 + }, + { + "epoch": 2.780715396578538, + "grad_norm": 0.810660183429718, + "learning_rate": 4.0456431535269716e-07, + "logits/chosen": 1.6214731931686401, + "logits/rejected": 1.755047082901001, + "logps/chosen": -628.4044799804688, + "logps/rejected": -830.4536743164062, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.023491859436035, + "rewards/margins": 22.268905639648438, + "rewards/rejected": -31.292400360107422, + "step": 4470 + }, + { + "epoch": 2.7813374805598756, + "grad_norm": 0.725084662437439, + "learning_rate": 4.0341171046565244e-07, + "logits/chosen": -2.426719903945923, + "logits/rejected": 3.6258738040924072, + "logps/chosen": -349.99017333984375, + "logps/rejected": -1080.127685546875, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.08149528503418, + "rewards/margins": 35.7020149230957, + "rewards/rejected": -40.78350830078125, + "step": 4471 + }, + { + "epoch": 2.781959564541213, + "grad_norm": 0.2610771656036377, + "learning_rate": 4.022591055786077e-07, + "logits/chosen": -2.8877463340759277, + "logits/rejected": 0.4570479989051819, + "logps/chosen": -481.33953857421875, + "logps/rejected": -955.6806030273438, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.037692070007324, + "rewards/margins": 28.204362869262695, + "rewards/rejected": -37.24205780029297, + "step": 4472 + }, + { + "epoch": 2.7825816485225507, + "grad_norm": 2.493027925491333, + "learning_rate": 4.01106500691563e-07, + "logits/chosen": -1.6067466735839844, + "logits/rejected": 3.3695433139801025, + "logps/chosen": -488.156005859375, + "logps/rejected": -1194.8486328125, + "loss": 0.0188, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.307488441467285, + "rewards/margins": 40.60865020751953, + "rewards/rejected": -51.916141510009766, + "step": 4473 + }, + { + "epoch": 2.783203732503888, + "grad_norm": 24.57866668701172, + "learning_rate": 3.999538958045182e-07, + "logits/chosen": -0.47966277599334717, + "logits/rejected": 3.1841583251953125, + "logps/chosen": -600.14013671875, + "logps/rejected": -1150.5703125, + "loss": 0.1397, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.52299976348877, + "rewards/margins": 30.642662048339844, + "rewards/rejected": -41.1656608581543, + "step": 4474 + }, + { + "epoch": 2.7838258164852254, + "grad_norm": 0.0020669519435614347, + "learning_rate": 3.988012909174735e-07, + "logits/chosen": -0.7197089195251465, + "logits/rejected": 2.831148624420166, + "logps/chosen": -545.8742065429688, + "logps/rejected": -1166.651123046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.606180191040039, + "rewards/margins": 38.95869064331055, + "rewards/rejected": -46.56487274169922, + "step": 4475 + }, + { + "epoch": 2.7844479004665628, + "grad_norm": 1.5602360008415417e-06, + "learning_rate": 3.976486860304288e-07, + "logits/chosen": -0.7320283651351929, + "logits/rejected": 2.878640651702881, + "logps/chosen": -419.9617919921875, + "logps/rejected": -982.1232299804688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.648433685302734, + "rewards/margins": 34.59681701660156, + "rewards/rejected": -40.24524688720703, + "step": 4476 + }, + { + "epoch": 2.7850699844479005, + "grad_norm": 0.0013564362889155746, + "learning_rate": 3.9649608114338405e-07, + "logits/chosen": 0.333895206451416, + "logits/rejected": 4.933841705322266, + "logps/chosen": -350.03466796875, + "logps/rejected": -985.833740234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.056970119476318, + "rewards/margins": 32.718589782714844, + "rewards/rejected": -39.77556610107422, + "step": 4477 + }, + { + "epoch": 2.785692068429238, + "grad_norm": 0.7813869118690491, + "learning_rate": 3.9534347625633933e-07, + "logits/chosen": -0.41498851776123047, + "logits/rejected": 3.3700923919677734, + "logps/chosen": -596.0548095703125, + "logps/rejected": -1053.457275390625, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.659764766693115, + "rewards/margins": 23.89316177368164, + "rewards/rejected": -30.55292510986328, + "step": 4478 + }, + { + "epoch": 2.7863141524105757, + "grad_norm": 0.2598787546157837, + "learning_rate": 3.941908713692946e-07, + "logits/chosen": -2.235633134841919, + "logits/rejected": 0.9133669137954712, + "logps/chosen": -472.2154541015625, + "logps/rejected": -889.4255981445312, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.542850494384766, + "rewards/margins": 19.466215133666992, + "rewards/rejected": -26.00906753540039, + "step": 4479 + }, + { + "epoch": 2.786936236391913, + "grad_norm": 1.2075815200805664, + "learning_rate": 3.930382664822499e-07, + "logits/chosen": 2.1868131160736084, + "logits/rejected": 1.7234981060028076, + "logps/chosen": -726.6332397460938, + "logps/rejected": -897.5386962890625, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.894367694854736, + "rewards/margins": 21.893020629882812, + "rewards/rejected": -27.787389755249023, + "step": 4480 + }, + { + "epoch": 2.7875583203732504, + "grad_norm": 0.0008331398130394518, + "learning_rate": 3.918856615952052e-07, + "logits/chosen": -2.639051914215088, + "logits/rejected": 2.49995493888855, + "logps/chosen": -304.4178466796875, + "logps/rejected": -796.910400390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.245255947113037, + "rewards/margins": 26.872276306152344, + "rewards/rejected": -34.11753463745117, + "step": 4481 + }, + { + "epoch": 2.7881804043545877, + "grad_norm": 0.01270584762096405, + "learning_rate": 3.907330567081605e-07, + "logits/chosen": 1.176623821258545, + "logits/rejected": 2.877804756164551, + "logps/chosen": -648.4818115234375, + "logps/rejected": -917.58740234375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.145034790039062, + "rewards/margins": 25.277450561523438, + "rewards/rejected": -35.4224853515625, + "step": 4482 + }, + { + "epoch": 2.7888024883359255, + "grad_norm": 0.6171900033950806, + "learning_rate": 3.8958045182111577e-07, + "logits/chosen": 0.6409913301467896, + "logits/rejected": 1.7398476600646973, + "logps/chosen": -533.915771484375, + "logps/rejected": -829.6786499023438, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.03984260559082, + "rewards/margins": 21.923110961914062, + "rewards/rejected": -32.96295166015625, + "step": 4483 + }, + { + "epoch": 2.789424572317263, + "grad_norm": 0.49171316623687744, + "learning_rate": 3.8842784693407105e-07, + "logits/chosen": -0.3405499756336212, + "logits/rejected": 1.7967162132263184, + "logps/chosen": -491.95098876953125, + "logps/rejected": -943.3385620117188, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.940799713134766, + "rewards/margins": 28.583152770996094, + "rewards/rejected": -35.52395248413086, + "step": 4484 + }, + { + "epoch": 2.7900466562986, + "grad_norm": 1.9615070812051272e-07, + "learning_rate": 3.872752420470263e-07, + "logits/chosen": 0.26819831132888794, + "logits/rejected": 4.310084819793701, + "logps/chosen": -454.3849182128906, + "logps/rejected": -1021.4901733398438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.043222427368164, + "rewards/margins": 30.55453109741211, + "rewards/rejected": -39.597755432128906, + "step": 4485 + }, + { + "epoch": 2.790668740279938, + "grad_norm": 0.460784912109375, + "learning_rate": 3.861226371599816e-07, + "logits/chosen": -1.8704692125320435, + "logits/rejected": 2.6920218467712402, + "logps/chosen": -410.005126953125, + "logps/rejected": -965.3362426757812, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.3175435066223145, + "rewards/margins": 34.1805534362793, + "rewards/rejected": -40.49809265136719, + "step": 4486 + }, + { + "epoch": 2.7912908242612753, + "grad_norm": 5.327062535798177e-05, + "learning_rate": 3.849700322729369e-07, + "logits/chosen": -2.625596523284912, + "logits/rejected": 1.3865752220153809, + "logps/chosen": -388.6800842285156, + "logps/rejected": -889.3480224609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.086573600769043, + "rewards/margins": 23.496519088745117, + "rewards/rejected": -29.583093643188477, + "step": 4487 + }, + { + "epoch": 2.7919129082426126, + "grad_norm": 32.0152702331543, + "learning_rate": 3.8381742738589216e-07, + "logits/chosen": 0.7535173892974854, + "logits/rejected": 2.33378005027771, + "logps/chosen": -550.8919677734375, + "logps/rejected": -866.2294921875, + "loss": 0.5518, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.010148048400879, + "rewards/margins": 23.95358657836914, + "rewards/rejected": -31.963733673095703, + "step": 4488 + }, + { + "epoch": 2.79253499222395, + "grad_norm": 0.002230971585959196, + "learning_rate": 3.8266482249884744e-07, + "logits/chosen": -0.4587140381336212, + "logits/rejected": -0.24935391545295715, + "logps/chosen": -423.4178161621094, + "logps/rejected": -713.5352783203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.603036880493164, + "rewards/margins": 25.637386322021484, + "rewards/rejected": -34.240421295166016, + "step": 4489 + }, + { + "epoch": 2.7931570762052877, + "grad_norm": 0.02404855750501156, + "learning_rate": 3.815122176118027e-07, + "logits/chosen": 1.2902207374572754, + "logits/rejected": 2.8667750358581543, + "logps/chosen": -725.5426635742188, + "logps/rejected": -1105.2545166015625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.346529006958008, + "rewards/margins": 27.529224395751953, + "rewards/rejected": -40.875755310058594, + "step": 4490 + }, + { + "epoch": 2.793779160186625, + "grad_norm": 0.003159100888296962, + "learning_rate": 3.8035961272475794e-07, + "logits/chosen": -0.68532794713974, + "logits/rejected": 3.2057533264160156, + "logps/chosen": -442.5166015625, + "logps/rejected": -1119.332275390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.750860214233398, + "rewards/margins": 41.39916229248047, + "rewards/rejected": -52.1500244140625, + "step": 4491 + }, + { + "epoch": 2.794401244167963, + "grad_norm": 0.9106011390686035, + "learning_rate": 3.792070078377132e-07, + "logits/chosen": 2.084537982940674, + "logits/rejected": 3.1576859951019287, + "logps/chosen": -626.247802734375, + "logps/rejected": -874.1102905273438, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.74012565612793, + "rewards/margins": 18.01773452758789, + "rewards/rejected": -26.75786018371582, + "step": 4492 + }, + { + "epoch": 2.7950233281493, + "grad_norm": 0.0044584497809410095, + "learning_rate": 3.780544029506685e-07, + "logits/chosen": 2.107027769088745, + "logits/rejected": 4.1379594802856445, + "logps/chosen": -637.359130859375, + "logps/rejected": -1030.0760498046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.089095115661621, + "rewards/margins": 30.950481414794922, + "rewards/rejected": -41.039573669433594, + "step": 4493 + }, + { + "epoch": 2.7956454121306376, + "grad_norm": 3.15701836370863e-05, + "learning_rate": 3.769017980636238e-07, + "logits/chosen": -1.1554534435272217, + "logits/rejected": 2.479703664779663, + "logps/chosen": -464.4109802246094, + "logps/rejected": -899.0807495117188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.289630889892578, + "rewards/margins": 26.90167999267578, + "rewards/rejected": -34.19131088256836, + "step": 4494 + }, + { + "epoch": 2.796267496111975, + "grad_norm": 2.108475200657267e-06, + "learning_rate": 3.757491931765791e-07, + "logits/chosen": 0.3373467028141022, + "logits/rejected": 3.91178297996521, + "logps/chosen": -618.9368286132812, + "logps/rejected": -1257.044189453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.883026599884033, + "rewards/margins": 38.584205627441406, + "rewards/rejected": -45.46723175048828, + "step": 4495 + }, + { + "epoch": 2.7968895800933127, + "grad_norm": 0.10682783275842667, + "learning_rate": 3.745965882895344e-07, + "logits/chosen": 0.47683557868003845, + "logits/rejected": 2.9689526557922363, + "logps/chosen": -599.391357421875, + "logps/rejected": -1023.8408203125, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.146797180175781, + "rewards/margins": 27.955692291259766, + "rewards/rejected": -39.10249328613281, + "step": 4496 + }, + { + "epoch": 2.79751166407465, + "grad_norm": 2.7964730262756348, + "learning_rate": 3.7344398340248966e-07, + "logits/chosen": 0.1441301703453064, + "logits/rejected": 2.867811679840088, + "logps/chosen": -398.3563232421875, + "logps/rejected": -792.4764404296875, + "loss": 0.014, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.013823986053467, + "rewards/margins": 22.58769416809082, + "rewards/rejected": -29.601516723632812, + "step": 4497 + }, + { + "epoch": 2.798133748055988, + "grad_norm": 2.440301250317134e-05, + "learning_rate": 3.7229137851544494e-07, + "logits/chosen": 2.6866021156311035, + "logits/rejected": 4.955380439758301, + "logps/chosen": -597.2278442382812, + "logps/rejected": -968.24755859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.944086074829102, + "rewards/margins": 27.064598083496094, + "rewards/rejected": -38.00868606567383, + "step": 4498 + }, + { + "epoch": 2.798755832037325, + "grad_norm": 35.87205505371094, + "learning_rate": 3.711387736284002e-07, + "logits/chosen": 1.2314321994781494, + "logits/rejected": 3.3555502891540527, + "logps/chosen": -576.2147216796875, + "logps/rejected": -910.40869140625, + "loss": 0.3366, + "rewards/accuracies": 0.875, + "rewards/chosen": -9.402275085449219, + "rewards/margins": 21.1616153717041, + "rewards/rejected": -30.563892364501953, + "step": 4499 + }, + { + "epoch": 2.7993779160186625, + "grad_norm": 0.0009058943251147866, + "learning_rate": 3.699861687413555e-07, + "logits/chosen": -0.4183881878852844, + "logits/rejected": 1.6836124658584595, + "logps/chosen": -588.3963012695312, + "logps/rejected": -1056.568603515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.816722869873047, + "rewards/margins": 34.03743362426758, + "rewards/rejected": -41.854156494140625, + "step": 4500 + } + ], + "logging_steps": 1, + "max_steps": 4821, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}