diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,8684 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 2000, + "global_step": 5733, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005232862375719519, + "grad_norm": 84.98858388556421, + "learning_rate": 8.710801393728223e-10, + "logits/chosen": -3.3605234622955322, + "logits/rejected": -3.29974365234375, + "logps/chosen": -511.38861083984375, + "logps/rejected": -608.7561645507812, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0052328623757195184, + "grad_norm": 79.43213747615805, + "learning_rate": 8.710801393728223e-09, + "logits/chosen": -2.7466022968292236, + "logits/rejected": -2.7475805282592773, + "logps/chosen": -345.8673095703125, + "logps/rejected": -288.7480163574219, + "loss": 0.69, + "rewards/accuracies": 0.4722222089767456, + "rewards/chosen": 0.007846314460039139, + "rewards/margins": 0.002511692699044943, + "rewards/rejected": 0.005334621295332909, + "step": 10 + }, + { + "epoch": 0.010465724751439037, + "grad_norm": 78.17853332408012, + "learning_rate": 1.7421602787456446e-08, + "logits/chosen": -2.7542061805725098, + "logits/rejected": -2.746408224105835, + "logps/chosen": -234.43270874023438, + "logps/rejected": -222.5880584716797, + "loss": 0.6938, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.005010186228901148, + "rewards/margins": -0.007448701653629541, + "rewards/rejected": 0.0024385168217122555, + "step": 20 + }, + { + "epoch": 0.015698587127158554, + "grad_norm": 76.48297301996568, + "learning_rate": 2.6132404181184667e-08, + "logits/chosen": -2.611788749694824, + "logits/rejected": -2.5875256061553955, + "logps/chosen": -311.51861572265625, + "logps/rejected": -283.8876953125, + "loss": 0.6939, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0030460101552307606, + "rewards/margins": 0.00527621153742075, + "rewards/rejected": -0.00223020208068192, + "step": 30 + }, + { + "epoch": 0.020931449502878074, + "grad_norm": 79.23012990922979, + "learning_rate": 3.484320557491289e-08, + "logits/chosen": -2.8673622608184814, + "logits/rejected": -2.744708299636841, + "logps/chosen": -327.95245361328125, + "logps/rejected": -314.35400390625, + "loss": 0.693, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.007563448045402765, + "rewards/margins": 0.004399295896291733, + "rewards/rejected": 0.003164151683449745, + "step": 40 + }, + { + "epoch": 0.026164311878597593, + "grad_norm": 83.70050104654835, + "learning_rate": 4.355400696864111e-08, + "logits/chosen": -2.904484987258911, + "logits/rejected": -2.7473223209381104, + "logps/chosen": -304.32318115234375, + "logps/rejected": -276.08258056640625, + "loss": 0.6934, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.012552691623568535, + "rewards/margins": -0.014694769866764545, + "rewards/rejected": 0.0021420782431960106, + "step": 50 + }, + { + "epoch": 0.03139717425431711, + "grad_norm": 81.16686963609796, + "learning_rate": 5.2264808362369334e-08, + "logits/chosen": -2.760773181915283, + "logits/rejected": -2.7229771614074707, + "logps/chosen": -281.700927734375, + "logps/rejected": -275.43499755859375, + "loss": 0.6924, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0024287248961627483, + "rewards/margins": 0.00016310028149746358, + "rewards/rejected": -0.002591826021671295, + "step": 60 + }, + { + "epoch": 0.03663003663003663, + "grad_norm": 87.29945601840559, + "learning_rate": 6.097560975609756e-08, + "logits/chosen": -2.875711679458618, + "logits/rejected": -2.777132511138916, + "logps/chosen": -340.478515625, + "logps/rejected": -273.4681701660156, + "loss": 0.6923, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.0026040722150355577, + "rewards/margins": -0.0013601541286334395, + "rewards/rejected": 0.003964226692914963, + "step": 70 + }, + { + "epoch": 0.04186289900575615, + "grad_norm": 80.29181059309379, + "learning_rate": 6.968641114982578e-08, + "logits/chosen": -2.816094398498535, + "logits/rejected": -2.787306308746338, + "logps/chosen": -315.94891357421875, + "logps/rejected": -313.4256896972656, + "loss": 0.6941, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.003847701009362936, + "rewards/margins": -0.0019189619924873114, + "rewards/rejected": -0.0019287395989522338, + "step": 80 + }, + { + "epoch": 0.04709576138147567, + "grad_norm": 72.93777220164954, + "learning_rate": 7.8397212543554e-08, + "logits/chosen": -2.819770574569702, + "logits/rejected": -2.770904302597046, + "logps/chosen": -254.18325805664062, + "logps/rejected": -221.6609649658203, + "loss": 0.6959, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.003520088968798518, + "rewards/margins": 0.007436218671500683, + "rewards/rejected": -0.003916130401194096, + "step": 90 + }, + { + "epoch": 0.052328623757195186, + "grad_norm": 71.94653775183346, + "learning_rate": 8.710801393728223e-08, + "logits/chosen": -2.8072497844696045, + "logits/rejected": -2.679840564727783, + "logps/chosen": -229.0762939453125, + "logps/rejected": -204.06356811523438, + "loss": 0.6902, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.005685704294592142, + "rewards/margins": 0.012103366665542126, + "rewards/rejected": -0.006417661905288696, + "step": 100 + }, + { + "epoch": 0.0575614861329147, + "grad_norm": 78.88850026339597, + "learning_rate": 9.581881533101045e-08, + "logits/chosen": -2.7377512454986572, + "logits/rejected": -2.602003574371338, + "logps/chosen": -271.032470703125, + "logps/rejected": -203.76358032226562, + "loss": 0.6899, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.007942494004964828, + "rewards/margins": -0.012209742330014706, + "rewards/rejected": 0.004267249722033739, + "step": 110 + }, + { + "epoch": 0.06279434850863422, + "grad_norm": 76.99435072846764, + "learning_rate": 1.0452961672473867e-07, + "logits/chosen": -2.879337787628174, + "logits/rejected": -2.874720811843872, + "logps/chosen": -381.11151123046875, + "logps/rejected": -352.8544921875, + "loss": 0.6886, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0011743694776669145, + "rewards/margins": -0.005505544599145651, + "rewards/rejected": 0.004331176169216633, + "step": 120 + }, + { + "epoch": 0.06802721088435375, + "grad_norm": 77.8746093168026, + "learning_rate": 1.132404181184669e-07, + "logits/chosen": -2.7185428142547607, + "logits/rejected": -2.693519115447998, + "logps/chosen": -235.90042114257812, + "logps/rejected": -245.9110870361328, + "loss": 0.691, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.00215378450229764, + "rewards/margins": 0.008324772119522095, + "rewards/rejected": -0.010478556156158447, + "step": 130 + }, + { + "epoch": 0.07326007326007326, + "grad_norm": 77.75033765594083, + "learning_rate": 1.219512195121951e-07, + "logits/chosen": -2.6036200523376465, + "logits/rejected": -2.63554048538208, + "logps/chosen": -307.88336181640625, + "logps/rejected": -318.00921630859375, + "loss": 0.6876, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.010539834387600422, + "rewards/margins": 0.01989991031587124, + "rewards/rejected": -0.009360076859593391, + "step": 140 + }, + { + "epoch": 0.07849293563579278, + "grad_norm": 83.66004050529598, + "learning_rate": 1.3066202090592334e-07, + "logits/chosen": -2.842623233795166, + "logits/rejected": -2.7181859016418457, + "logps/chosen": -272.5040588378906, + "logps/rejected": -254.56149291992188, + "loss": 0.6853, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0004568263830151409, + "rewards/margins": -0.01074077095836401, + "rewards/rejected": 0.011197598651051521, + "step": 150 + }, + { + "epoch": 0.0837257980115123, + "grad_norm": 80.22034194722806, + "learning_rate": 1.3937282229965157e-07, + "logits/chosen": -2.8325233459472656, + "logits/rejected": -2.6719284057617188, + "logps/chosen": -273.96197509765625, + "logps/rejected": -241.1158447265625, + "loss": 0.6822, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.022296609356999397, + "rewards/margins": 0.03128524124622345, + "rewards/rejected": -0.008988635614514351, + "step": 160 + }, + { + "epoch": 0.08895866038723181, + "grad_norm": 90.62330477252651, + "learning_rate": 1.480836236933798e-07, + "logits/chosen": -2.7778377532958984, + "logits/rejected": -2.786107301712036, + "logps/chosen": -246.99569702148438, + "logps/rejected": -278.2984619140625, + "loss": 0.6836, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.000674112350679934, + "rewards/margins": 0.008748064748942852, + "rewards/rejected": -0.008073952980339527, + "step": 170 + }, + { + "epoch": 0.09419152276295134, + "grad_norm": 65.34284837182982, + "learning_rate": 1.56794425087108e-07, + "logits/chosen": -2.7959346771240234, + "logits/rejected": -2.655682325363159, + "logps/chosen": -301.09344482421875, + "logps/rejected": -274.50665283203125, + "loss": 0.6801, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0035839497577399015, + "rewards/margins": 0.035895925015211105, + "rewards/rejected": -0.032311975955963135, + "step": 180 + }, + { + "epoch": 0.09942438513867086, + "grad_norm": 85.2461007419555, + "learning_rate": 1.6550522648083622e-07, + "logits/chosen": -2.8159756660461426, + "logits/rejected": -2.6639835834503174, + "logps/chosen": -314.5155029296875, + "logps/rejected": -264.17413330078125, + "loss": 0.6679, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.0449281670153141, + "rewards/margins": 0.06909157335758209, + "rewards/rejected": -0.02416340447962284, + "step": 190 + }, + { + "epoch": 0.10465724751439037, + "grad_norm": 70.83663131764116, + "learning_rate": 1.7421602787456445e-07, + "logits/chosen": -2.7046008110046387, + "logits/rejected": -2.6014657020568848, + "logps/chosen": -280.58026123046875, + "logps/rejected": -305.3539733886719, + "loss": 0.6675, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.0308912992477417, + "rewards/margins": 0.053800441324710846, + "rewards/rejected": -0.022909147664904594, + "step": 200 + }, + { + "epoch": 0.10989010989010989, + "grad_norm": 77.22019352282102, + "learning_rate": 1.8292682926829268e-07, + "logits/chosen": -2.7690446376800537, + "logits/rejected": -2.667555332183838, + "logps/chosen": -219.1191864013672, + "logps/rejected": -202.34437561035156, + "loss": 0.6804, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.03057698905467987, + "rewards/margins": 0.01921568438410759, + "rewards/rejected": 0.011361300945281982, + "step": 210 + }, + { + "epoch": 0.1151229722658294, + "grad_norm": 71.09147529011994, + "learning_rate": 1.916376306620209e-07, + "logits/chosen": -2.8125951290130615, + "logits/rejected": -2.7477786540985107, + "logps/chosen": -289.70404052734375, + "logps/rejected": -251.4872283935547, + "loss": 0.6673, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.08494623005390167, + "rewards/margins": 0.10478035360574722, + "rewards/rejected": -0.01983409747481346, + "step": 220 + }, + { + "epoch": 0.12035583464154893, + "grad_norm": 70.50795570252929, + "learning_rate": 2.003484320557491e-07, + "logits/chosen": -2.8188929557800293, + "logits/rejected": -2.805239200592041, + "logps/chosen": -253.07046508789062, + "logps/rejected": -333.8083190917969, + "loss": 0.6506, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.052991289645433426, + "rewards/margins": 0.08006981760263443, + "rewards/rejected": -0.027078529819846153, + "step": 230 + }, + { + "epoch": 0.12558869701726844, + "grad_norm": 76.4931381931025, + "learning_rate": 2.0905923344947734e-07, + "logits/chosen": -2.7703301906585693, + "logits/rejected": -2.6502628326416016, + "logps/chosen": -238.9730682373047, + "logps/rejected": -223.41329956054688, + "loss": 0.6559, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.011081100441515446, + "rewards/margins": 0.06371158361434937, + "rewards/rejected": -0.05263049155473709, + "step": 240 + }, + { + "epoch": 0.13082155939298795, + "grad_norm": 73.95230061923809, + "learning_rate": 2.1777003484320556e-07, + "logits/chosen": -2.6704938411712646, + "logits/rejected": -2.6541714668273926, + "logps/chosen": -298.54742431640625, + "logps/rejected": -254.73251342773438, + "loss": 0.6453, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.07010379433631897, + "rewards/margins": 0.1818067580461502, + "rewards/rejected": -0.11170294135808945, + "step": 250 + }, + { + "epoch": 0.1360544217687075, + "grad_norm": 83.13952056039882, + "learning_rate": 2.264808362369338e-07, + "logits/chosen": -2.8034861087799072, + "logits/rejected": -2.700900077819824, + "logps/chosen": -300.6609802246094, + "logps/rejected": -277.47882080078125, + "loss": 0.648, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.08231760561466217, + "rewards/margins": 0.169277623295784, + "rewards/rejected": -0.08696001768112183, + "step": 260 + }, + { + "epoch": 0.141287284144427, + "grad_norm": 61.076305133734294, + "learning_rate": 2.3519163763066202e-07, + "logits/chosen": -2.745058536529541, + "logits/rejected": -2.6726226806640625, + "logps/chosen": -280.98431396484375, + "logps/rejected": -280.2472839355469, + "loss": 0.6259, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.012805774807929993, + "rewards/margins": 0.050133805721998215, + "rewards/rejected": -0.06293957680463791, + "step": 270 + }, + { + "epoch": 0.14652014652014653, + "grad_norm": 79.19192659206922, + "learning_rate": 2.439024390243902e-07, + "logits/chosen": -2.6895556449890137, + "logits/rejected": -2.7086892127990723, + "logps/chosen": -311.8687744140625, + "logps/rejected": -293.9984130859375, + "loss": 0.6612, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.05314619094133377, + "rewards/margins": 0.11733772605657578, + "rewards/rejected": -0.064191535115242, + "step": 280 + }, + { + "epoch": 0.15175300889586604, + "grad_norm": 67.77150674233079, + "learning_rate": 2.526132404181184e-07, + "logits/chosen": -2.8477418422698975, + "logits/rejected": -2.6748757362365723, + "logps/chosen": -333.6998291015625, + "logps/rejected": -245.5762176513672, + "loss": 0.6423, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.04657986760139465, + "rewards/margins": 0.14008301496505737, + "rewards/rejected": -0.09350315481424332, + "step": 290 + }, + { + "epoch": 0.15698587127158556, + "grad_norm": 73.86139382185733, + "learning_rate": 2.613240418118467e-07, + "logits/chosen": -2.8373353481292725, + "logits/rejected": -2.7571821212768555, + "logps/chosen": -321.17181396484375, + "logps/rejected": -259.47735595703125, + "loss": 0.6403, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.055416040122509, + "rewards/margins": 0.08808239549398422, + "rewards/rejected": -0.14349845051765442, + "step": 300 + }, + { + "epoch": 0.16221873364730507, + "grad_norm": 65.10105933983293, + "learning_rate": 2.700348432055749e-07, + "logits/chosen": -2.8734145164489746, + "logits/rejected": -2.725494384765625, + "logps/chosen": -328.14971923828125, + "logps/rejected": -306.2589416503906, + "loss": 0.6475, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.12547266483306885, + "rewards/margins": 0.19133004546165466, + "rewards/rejected": -0.06585738807916641, + "step": 310 + }, + { + "epoch": 0.1674515960230246, + "grad_norm": 78.08733800783429, + "learning_rate": 2.7874564459930313e-07, + "logits/chosen": -2.8610997200012207, + "logits/rejected": -2.632071018218994, + "logps/chosen": -340.14752197265625, + "logps/rejected": -229.4875946044922, + "loss": 0.6443, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.007843856699764729, + "rewards/margins": 0.18785050511360168, + "rewards/rejected": -0.19569435715675354, + "step": 320 + }, + { + "epoch": 0.1726844583987441, + "grad_norm": 76.52379245557374, + "learning_rate": 2.874564459930314e-07, + "logits/chosen": -2.680572032928467, + "logits/rejected": -2.4787521362304688, + "logps/chosen": -258.25518798828125, + "logps/rejected": -200.94171142578125, + "loss": 0.6432, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.007691374514251947, + "rewards/margins": 0.15264154970645905, + "rewards/rejected": -0.14495018124580383, + "step": 330 + }, + { + "epoch": 0.17791732077446362, + "grad_norm": 81.16170652139007, + "learning_rate": 2.961672473867596e-07, + "logits/chosen": -2.794625997543335, + "logits/rejected": -2.813786268234253, + "logps/chosen": -285.979736328125, + "logps/rejected": -308.09039306640625, + "loss": 0.6168, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.07205596566200256, + "rewards/margins": 0.21761822700500488, + "rewards/rejected": -0.14556226134300232, + "step": 340 + }, + { + "epoch": 0.18315018315018314, + "grad_norm": 72.97655641270019, + "learning_rate": 3.048780487804878e-07, + "logits/chosen": -2.724668025970459, + "logits/rejected": -2.8093247413635254, + "logps/chosen": -191.9694061279297, + "logps/rejected": -229.0023193359375, + "loss": 0.6085, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04489254206418991, + "rewards/margins": 0.38289427757263184, + "rewards/rejected": -0.3380017578601837, + "step": 350 + }, + { + "epoch": 0.18838304552590268, + "grad_norm": 67.08610224330708, + "learning_rate": 3.13588850174216e-07, + "logits/chosen": -2.804530382156372, + "logits/rejected": -2.7216782569885254, + "logps/chosen": -271.109619140625, + "logps/rejected": -278.29595947265625, + "loss": 0.6135, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.03914060443639755, + "rewards/margins": 0.2538668215274811, + "rewards/rejected": -0.21472623944282532, + "step": 360 + }, + { + "epoch": 0.1936159079016222, + "grad_norm": 68.81979171610475, + "learning_rate": 3.2229965156794425e-07, + "logits/chosen": -2.731799602508545, + "logits/rejected": -2.6366994380950928, + "logps/chosen": -303.15216064453125, + "logps/rejected": -274.8804016113281, + "loss": 0.6031, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.013546006754040718, + "rewards/margins": 0.3800819516181946, + "rewards/rejected": -0.39362794160842896, + "step": 370 + }, + { + "epoch": 0.1988487702773417, + "grad_norm": 70.10460080093853, + "learning_rate": 3.3101045296167245e-07, + "logits/chosen": -2.6483497619628906, + "logits/rejected": -2.7345848083496094, + "logps/chosen": -207.73190307617188, + "logps/rejected": -335.92767333984375, + "loss": 0.6155, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03089023567736149, + "rewards/margins": 0.2889792323112488, + "rewards/rejected": -0.25808900594711304, + "step": 380 + }, + { + "epoch": 0.20408163265306123, + "grad_norm": 71.01223451909964, + "learning_rate": 3.3972125435540065e-07, + "logits/chosen": -2.8010830879211426, + "logits/rejected": -2.641580104827881, + "logps/chosen": -393.96368408203125, + "logps/rejected": -272.9259033203125, + "loss": 0.616, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.14445623755455017, + "rewards/margins": 0.19453056156635284, + "rewards/rejected": -0.05007433146238327, + "step": 390 + }, + { + "epoch": 0.20931449502878074, + "grad_norm": 66.6694321514813, + "learning_rate": 3.484320557491289e-07, + "logits/chosen": -2.8600525856018066, + "logits/rejected": -2.6983630657196045, + "logps/chosen": -301.33587646484375, + "logps/rejected": -254.32473754882812, + "loss": 0.6109, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.061463791877031326, + "rewards/margins": 0.3578440845012665, + "rewards/rejected": -0.29638028144836426, + "step": 400 + }, + { + "epoch": 0.21454735740450026, + "grad_norm": 69.21792732884464, + "learning_rate": 3.5714285714285716e-07, + "logits/chosen": -2.744187831878662, + "logits/rejected": -2.722655773162842, + "logps/chosen": -285.7259826660156, + "logps/rejected": -321.0186462402344, + "loss": 0.637, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.012902788817882538, + "rewards/margins": 0.26276642084121704, + "rewards/rejected": -0.2756691873073578, + "step": 410 + }, + { + "epoch": 0.21978021978021978, + "grad_norm": 70.14765627243237, + "learning_rate": 3.6585365853658536e-07, + "logits/chosen": -2.809702157974243, + "logits/rejected": -2.777172088623047, + "logps/chosen": -287.27740478515625, + "logps/rejected": -300.46856689453125, + "loss": 0.6015, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.104688361287117, + "rewards/margins": 0.07888265699148178, + "rewards/rejected": -0.18357104063034058, + "step": 420 + }, + { + "epoch": 0.2250130821559393, + "grad_norm": 77.0206106620458, + "learning_rate": 3.7456445993031356e-07, + "logits/chosen": -2.9213788509368896, + "logits/rejected": -2.7455177307128906, + "logps/chosen": -320.3589172363281, + "logps/rejected": -255.3843536376953, + "loss": 0.6119, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.08016510307788849, + "rewards/margins": 0.522097647190094, + "rewards/rejected": -0.4419324994087219, + "step": 430 + }, + { + "epoch": 0.2302459445316588, + "grad_norm": 83.29817536015507, + "learning_rate": 3.832752613240418e-07, + "logits/chosen": -2.768024444580078, + "logits/rejected": -2.7121284008026123, + "logps/chosen": -279.93682861328125, + "logps/rejected": -253.1497344970703, + "loss": 0.6162, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.15743741393089294, + "rewards/margins": 0.15811249613761902, + "rewards/rejected": -0.31554991006851196, + "step": 440 + }, + { + "epoch": 0.23547880690737832, + "grad_norm": 74.0680805972226, + "learning_rate": 3.9198606271777e-07, + "logits/chosen": -2.662078380584717, + "logits/rejected": -2.679370641708374, + "logps/chosen": -240.2390899658203, + "logps/rejected": -248.5304718017578, + "loss": 0.5777, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.15076106786727905, + "rewards/margins": 0.22496457397937775, + "rewards/rejected": -0.375725656747818, + "step": 450 + }, + { + "epoch": 0.24071166928309787, + "grad_norm": 68.31121974793525, + "learning_rate": 4.006968641114982e-07, + "logits/chosen": -2.7916362285614014, + "logits/rejected": -2.662065029144287, + "logps/chosen": -295.880859375, + "logps/rejected": -268.3017578125, + "loss": 0.605, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.04480681195855141, + "rewards/margins": 0.40198850631713867, + "rewards/rejected": -0.35718169808387756, + "step": 460 + }, + { + "epoch": 0.24594453165881738, + "grad_norm": 94.24984253858831, + "learning_rate": 4.0940766550522647e-07, + "logits/chosen": -2.7813093662261963, + "logits/rejected": -2.7455015182495117, + "logps/chosen": -293.1402587890625, + "logps/rejected": -295.59295654296875, + "loss": 0.5783, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.1348876804113388, + "rewards/margins": 0.3201070725917816, + "rewards/rejected": -0.4549947679042816, + "step": 470 + }, + { + "epoch": 0.25117739403453687, + "grad_norm": 79.26291299471244, + "learning_rate": 4.1811846689895467e-07, + "logits/chosen": -2.763723850250244, + "logits/rejected": -2.769278049468994, + "logps/chosen": -247.20993041992188, + "logps/rejected": -239.54849243164062, + "loss": 0.6004, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.041330404579639435, + "rewards/margins": 0.4440532624721527, + "rewards/rejected": -0.40272289514541626, + "step": 480 + }, + { + "epoch": 0.2564102564102564, + "grad_norm": 65.08505114883965, + "learning_rate": 4.268292682926829e-07, + "logits/chosen": -2.7704079151153564, + "logits/rejected": -2.7451767921447754, + "logps/chosen": -300.32598876953125, + "logps/rejected": -264.40692138671875, + "loss": 0.576, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.03305836394429207, + "rewards/margins": 0.30561283230781555, + "rewards/rejected": -0.2725544571876526, + "step": 490 + }, + { + "epoch": 0.2616431187859759, + "grad_norm": 80.1379910637347, + "learning_rate": 4.3554006968641113e-07, + "logits/chosen": -2.7437386512756348, + "logits/rejected": -2.746515989303589, + "logps/chosen": -264.9486389160156, + "logps/rejected": -279.97015380859375, + "loss": 0.5657, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.012850580736994743, + "rewards/margins": 0.6681180000305176, + "rewards/rejected": -0.6552674770355225, + "step": 500 + }, + { + "epoch": 0.2668759811616955, + "grad_norm": 75.29078775030078, + "learning_rate": 4.442508710801394e-07, + "logits/chosen": -2.8460960388183594, + "logits/rejected": -2.6950936317443848, + "logps/chosen": -293.77777099609375, + "logps/rejected": -257.11553955078125, + "loss": 0.5891, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.003244167659431696, + "rewards/margins": 0.35232454538345337, + "rewards/rejected": -0.3490803837776184, + "step": 510 + }, + { + "epoch": 0.272108843537415, + "grad_norm": 73.71242826599446, + "learning_rate": 4.529616724738676e-07, + "logits/chosen": -2.799469232559204, + "logits/rejected": -2.779831647872925, + "logps/chosen": -274.6687927246094, + "logps/rejected": -294.1000671386719, + "loss": 0.5846, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.15885034203529358, + "rewards/margins": 0.26622113585472107, + "rewards/rejected": -0.4250714182853699, + "step": 520 + }, + { + "epoch": 0.2773417059131345, + "grad_norm": 67.00484336857802, + "learning_rate": 4.616724738675958e-07, + "logits/chosen": -2.762690305709839, + "logits/rejected": -2.7298333644866943, + "logps/chosen": -349.2732849121094, + "logps/rejected": -257.4483947753906, + "loss": 0.5752, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.08133769035339355, + "rewards/margins": 0.6366723775863647, + "rewards/rejected": -0.5553346872329712, + "step": 530 + }, + { + "epoch": 0.282574568288854, + "grad_norm": 87.37301032099326, + "learning_rate": 4.7038327526132404e-07, + "logits/chosen": -2.734611988067627, + "logits/rejected": -2.682957887649536, + "logps/chosen": -300.8519592285156, + "logps/rejected": -287.28131103515625, + "loss": 0.5654, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.16697955131530762, + "rewards/margins": 0.40015920996665955, + "rewards/rejected": -0.5671387910842896, + "step": 540 + }, + { + "epoch": 0.28780743066457354, + "grad_norm": 75.87340132424606, + "learning_rate": 4.790940766550523e-07, + "logits/chosen": -2.881843328475952, + "logits/rejected": -2.744377374649048, + "logps/chosen": -343.0857238769531, + "logps/rejected": -284.81341552734375, + "loss": 0.623, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.0778597965836525, + "rewards/margins": 0.3903868496417999, + "rewards/rejected": -0.4682466387748718, + "step": 550 + }, + { + "epoch": 0.29304029304029305, + "grad_norm": 64.37139615879414, + "learning_rate": 4.878048780487804e-07, + "logits/chosen": -2.8723576068878174, + "logits/rejected": -2.8006961345672607, + "logps/chosen": -295.2679748535156, + "logps/rejected": -279.12310791015625, + "loss": 0.5644, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.01374953705817461, + "rewards/margins": 0.5936576128005981, + "rewards/rejected": -0.579908013343811, + "step": 560 + }, + { + "epoch": 0.29827315541601257, + "grad_norm": 76.23718102747013, + "learning_rate": 4.965156794425087e-07, + "logits/chosen": -2.9436142444610596, + "logits/rejected": -2.799950361251831, + "logps/chosen": -290.10546875, + "logps/rejected": -265.4535217285156, + "loss": 0.5782, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.015673398971557617, + "rewards/margins": 0.5041199922561646, + "rewards/rejected": -0.5197933912277222, + "step": 570 + }, + { + "epoch": 0.3035060177917321, + "grad_norm": 64.81448819369754, + "learning_rate": 4.999983312905696e-07, + "logits/chosen": -2.8923089504241943, + "logits/rejected": -2.7781293392181396, + "logps/chosen": -324.54986572265625, + "logps/rejected": -225.8867645263672, + "loss": 0.582, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.1136259064078331, + "rewards/margins": 0.435842901468277, + "rewards/rejected": -0.5494688153266907, + "step": 580 + }, + { + "epoch": 0.3087388801674516, + "grad_norm": 64.62422308445984, + "learning_rate": 4.999881337025014e-07, + "logits/chosen": -2.710305690765381, + "logits/rejected": -2.7066807746887207, + "logps/chosen": -223.7068328857422, + "logps/rejected": -228.61758422851562, + "loss": 0.5895, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.13477621972560883, + "rewards/margins": 0.29476475715637207, + "rewards/rejected": -0.4295410215854645, + "step": 590 + }, + { + "epoch": 0.3139717425431711, + "grad_norm": 69.1793063849544, + "learning_rate": 4.999686659648518e-07, + "logits/chosen": -2.7720563411712646, + "logits/rejected": -2.7554426193237305, + "logps/chosen": -291.6083679199219, + "logps/rejected": -280.64044189453125, + "loss": 0.6149, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.017333343625068665, + "rewards/margins": 0.35141366720199585, + "rewards/rejected": -0.3340803384780884, + "step": 600 + }, + { + "epoch": 0.31920460491889063, + "grad_norm": 71.16580461514887, + "learning_rate": 4.999399287995302e-07, + "logits/chosen": -2.8262219429016113, + "logits/rejected": -2.720231533050537, + "logps/chosen": -211.25503540039062, + "logps/rejected": -231.1370086669922, + "loss": 0.5364, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.009411575272679329, + "rewards/margins": 0.47361668944358826, + "rewards/rejected": -0.48302823305130005, + "step": 610 + }, + { + "epoch": 0.32443746729461015, + "grad_norm": 69.94030395472988, + "learning_rate": 4.999019232721791e-07, + "logits/chosen": -2.93601655960083, + "logits/rejected": -2.685936450958252, + "logps/chosen": -368.6300354003906, + "logps/rejected": -219.16061401367188, + "loss": 0.5753, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16614532470703125, + "rewards/margins": 0.7339236736297607, + "rewards/rejected": -0.5677784085273743, + "step": 620 + }, + { + "epoch": 0.32967032967032966, + "grad_norm": 59.26941308002417, + "learning_rate": 4.998546507921325e-07, + "logits/chosen": -2.7378721237182617, + "logits/rejected": -2.7604966163635254, + "logps/chosen": -233.4509735107422, + "logps/rejected": -278.7649841308594, + "loss": 0.6109, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.15916521847248077, + "rewards/margins": 0.500957190990448, + "rewards/rejected": -0.6601223945617676, + "step": 630 + }, + { + "epoch": 0.3349031920460492, + "grad_norm": 71.51739828519624, + "learning_rate": 4.997981131123656e-07, + "logits/chosen": -2.8753511905670166, + "logits/rejected": -2.7751049995422363, + "logps/chosen": -296.11126708984375, + "logps/rejected": -306.3322448730469, + "loss": 0.5664, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11808276176452637, + "rewards/margins": 0.7307382822036743, + "rewards/rejected": -0.612655520439148, + "step": 640 + }, + { + "epoch": 0.3401360544217687, + "grad_norm": 63.71373253795031, + "learning_rate": 4.997323123294291e-07, + "logits/chosen": -2.8338544368743896, + "logits/rejected": -2.7727811336517334, + "logps/chosen": -275.8819274902344, + "logps/rejected": -252.6679229736328, + "loss": 0.5886, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.004488348960876465, + "rewards/margins": 0.5318983197212219, + "rewards/rejected": -0.5274099707603455, + "step": 650 + }, + { + "epoch": 0.3453689167974882, + "grad_norm": 68.36944996832233, + "learning_rate": 4.99657250883371e-07, + "logits/chosen": -2.7883529663085938, + "logits/rejected": -2.727940797805786, + "logps/chosen": -244.5763702392578, + "logps/rejected": -236.46859741210938, + "loss": 0.5638, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.09980012476444244, + "rewards/margins": 0.3573591113090515, + "rewards/rejected": -0.4571591913700104, + "step": 660 + }, + { + "epoch": 0.35060177917320773, + "grad_norm": 83.50524399420453, + "learning_rate": 4.995729315576468e-07, + "logits/chosen": -2.6909804344177246, + "logits/rejected": -2.654859781265259, + "logps/chosen": -270.6436462402344, + "logps/rejected": -242.53622436523438, + "loss": 0.5697, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.08693110942840576, + "rewards/margins": 0.42284971475601196, + "rewards/rejected": -0.5097808241844177, + "step": 670 + }, + { + "epoch": 0.35583464154892724, + "grad_norm": 56.509818255549085, + "learning_rate": 4.99479357479016e-07, + "logits/chosen": -2.649606943130493, + "logits/rejected": -2.6005656719207764, + "logps/chosen": -251.02987670898438, + "logps/rejected": -230.5948486328125, + "loss": 0.5576, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.2845116853713989, + "rewards/margins": 0.42809027433395386, + "rewards/rejected": -0.7126020193099976, + "step": 680 + }, + { + "epoch": 0.36106750392464676, + "grad_norm": 63.6023380966652, + "learning_rate": 4.993765321174261e-07, + "logits/chosen": -2.8176732063293457, + "logits/rejected": -2.7134623527526855, + "logps/chosen": -247.86801147460938, + "logps/rejected": -237.22915649414062, + "loss": 0.5227, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0951828807592392, + "rewards/margins": 0.6302462816238403, + "rewards/rejected": -0.5350633859634399, + "step": 690 + }, + { + "epoch": 0.3663003663003663, + "grad_norm": 72.72474978480933, + "learning_rate": 4.992644592858842e-07, + "logits/chosen": -2.7131919860839844, + "logits/rejected": -2.6791841983795166, + "logps/chosen": -265.3692932128906, + "logps/rejected": -248.10281372070312, + "loss": 0.5801, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.20485153794288635, + "rewards/margins": 0.4856719374656677, + "rewards/rejected": -0.6905235052108765, + "step": 700 + }, + { + "epoch": 0.3715332286760858, + "grad_norm": 77.60639105406315, + "learning_rate": 4.991431431403148e-07, + "logits/chosen": -2.802262783050537, + "logits/rejected": -2.6851892471313477, + "logps/chosen": -340.07659912109375, + "logps/rejected": -285.58380126953125, + "loss": 0.5271, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07976872473955154, + "rewards/margins": 0.7483940720558167, + "rewards/rejected": -0.8281628489494324, + "step": 710 + }, + { + "epoch": 0.37676609105180536, + "grad_norm": 64.78758225968055, + "learning_rate": 4.99012588179407e-07, + "logits/chosen": -2.6652474403381348, + "logits/rejected": -2.7214195728302, + "logps/chosen": -220.8406219482422, + "logps/rejected": -237.08779907226562, + "loss": 0.5494, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.024497246369719505, + "rewards/margins": 0.5303259491920471, + "rewards/rejected": -0.5548231601715088, + "step": 720 + }, + { + "epoch": 0.3819989534275249, + "grad_norm": 65.34881911690515, + "learning_rate": 4.988727992444467e-07, + "logits/chosen": -2.7291436195373535, + "logits/rejected": -2.7527527809143066, + "logps/chosen": -272.0304870605469, + "logps/rejected": -307.57769775390625, + "loss": 0.5785, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.24241900444030762, + "rewards/margins": 0.6918538808822632, + "rewards/rejected": -0.9342729449272156, + "step": 730 + }, + { + "epoch": 0.3872318158032444, + "grad_norm": 68.99686104085876, + "learning_rate": 4.98723781519137e-07, + "logits/chosen": -2.7245595455169678, + "logits/rejected": -2.7607216835021973, + "logps/chosen": -246.9656524658203, + "logps/rejected": -233.1698455810547, + "loss": 0.5419, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.11478378623723984, + "rewards/margins": 0.5238386392593384, + "rewards/rejected": -0.6386224031448364, + "step": 740 + }, + { + "epoch": 0.3924646781789639, + "grad_norm": 73.8726217450531, + "learning_rate": 4.98565540529407e-07, + "logits/chosen": -2.645646572113037, + "logits/rejected": -2.5875344276428223, + "logps/chosen": -288.4378356933594, + "logps/rejected": -307.13775634765625, + "loss": 0.5328, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.03566960245370865, + "rewards/margins": 0.5397855639457703, + "rewards/rejected": -0.5041159987449646, + "step": 750 + }, + { + "epoch": 0.3976975405546834, + "grad_norm": 70.42082071557074, + "learning_rate": 4.983980821432054e-07, + "logits/chosen": -2.692359447479248, + "logits/rejected": -2.6449592113494873, + "logps/chosen": -238.6630859375, + "logps/rejected": -204.46713256835938, + "loss": 0.5833, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.18362995982170105, + "rewards/margins": 0.5266052484512329, + "rewards/rejected": -0.7102352380752563, + "step": 760 + }, + { + "epoch": 0.40293040293040294, + "grad_norm": 59.90591958586741, + "learning_rate": 4.982214125702845e-07, + "logits/chosen": -2.688281297683716, + "logits/rejected": -2.6551880836486816, + "logps/chosen": -249.35751342773438, + "logps/rejected": -288.7128601074219, + "loss": 0.603, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2397182285785675, + "rewards/margins": 0.5740364789962769, + "rewards/rejected": -0.8137545585632324, + "step": 770 + }, + { + "epoch": 0.40816326530612246, + "grad_norm": 85.68097562700152, + "learning_rate": 4.980355383619684e-07, + "logits/chosen": -2.683443069458008, + "logits/rejected": -2.6418633460998535, + "logps/chosen": -244.8722381591797, + "logps/rejected": -213.88235473632812, + "loss": 0.5469, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2509283721446991, + "rewards/margins": 0.6306599974632263, + "rewards/rejected": -0.8815882802009583, + "step": 780 + }, + { + "epoch": 0.413396127681842, + "grad_norm": 70.10204209462056, + "learning_rate": 4.978404664109113e-07, + "logits/chosen": -2.6783461570739746, + "logits/rejected": -2.700972080230713, + "logps/chosen": -231.48727416992188, + "logps/rejected": -295.9053649902344, + "loss": 0.5492, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.4138612747192383, + "rewards/margins": 0.3425648808479309, + "rewards/rejected": -0.7564261555671692, + "step": 790 + }, + { + "epoch": 0.4186289900575615, + "grad_norm": 81.25028374863271, + "learning_rate": 4.97636203950841e-07, + "logits/chosen": -2.6948161125183105, + "logits/rejected": -2.6822495460510254, + "logps/chosen": -304.7908630371094, + "logps/rejected": -309.82183837890625, + "loss": 0.6007, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.19240470230579376, + "rewards/margins": 0.5690978765487671, + "rewards/rejected": -0.7615026235580444, + "step": 800 + }, + { + "epoch": 0.423861852433281, + "grad_norm": 71.71281427481196, + "learning_rate": 4.974227585562916e-07, + "logits/chosen": -2.68057918548584, + "logits/rejected": -2.581986427307129, + "logps/chosen": -313.5508117675781, + "logps/rejected": -278.8346862792969, + "loss": 0.5493, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.42161816358566284, + "rewards/margins": 0.6813873648643494, + "rewards/rejected": -1.1030056476593018, + "step": 810 + }, + { + "epoch": 0.4290947148090005, + "grad_norm": 54.56461253021572, + "learning_rate": 4.972001381423214e-07, + "logits/chosen": -2.74100399017334, + "logits/rejected": -2.6623387336730957, + "logps/chosen": -292.7416076660156, + "logps/rejected": -246.9758758544922, + "loss": 0.5516, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.38912060856819153, + "rewards/margins": 0.729678213596344, + "rewards/rejected": -1.118798851966858, + "step": 820 + }, + { + "epoch": 0.43432757718472004, + "grad_norm": 76.67534708024763, + "learning_rate": 4.969683509642206e-07, + "logits/chosen": -2.8405652046203613, + "logits/rejected": -2.7737975120544434, + "logps/chosen": -243.72720336914062, + "logps/rejected": -246.10107421875, + "loss": 0.6121, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.42866888642311096, + "rewards/margins": 0.4650656282901764, + "rewards/rejected": -0.8937345743179321, + "step": 830 + }, + { + "epoch": 0.43956043956043955, + "grad_norm": 64.79089069454054, + "learning_rate": 4.967274056172044e-07, + "logits/chosen": -2.8649442195892334, + "logits/rejected": -2.603626251220703, + "logps/chosen": -413.15191650390625, + "logps/rejected": -301.858642578125, + "loss": 0.5475, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2905053496360779, + "rewards/margins": 0.8141521215438843, + "rewards/rejected": -1.1046574115753174, + "step": 840 + }, + { + "epoch": 0.44479330193615907, + "grad_norm": 82.06458260690994, + "learning_rate": 4.964773110360944e-07, + "logits/chosen": -2.750856876373291, + "logits/rejected": -2.5980868339538574, + "logps/chosen": -262.92462158203125, + "logps/rejected": -228.48593139648438, + "loss": 0.5918, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.44581151008605957, + "rewards/margins": 0.5122817754745483, + "rewards/rejected": -0.9580932855606079, + "step": 850 + }, + { + "epoch": 0.4500261643118786, + "grad_norm": 89.51335715472315, + "learning_rate": 4.962180764949876e-07, + "logits/chosen": -2.764763355255127, + "logits/rejected": -2.7525763511657715, + "logps/chosen": -194.84877014160156, + "logps/rejected": -269.7641296386719, + "loss": 0.562, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.1448897421360016, + "rewards/margins": 0.8955299258232117, + "rewards/rejected": -1.040419578552246, + "step": 860 + }, + { + "epoch": 0.4552590266875981, + "grad_norm": 58.74124593162302, + "learning_rate": 4.959497116069122e-07, + "logits/chosen": -2.4614791870117188, + "logits/rejected": -2.5169665813446045, + "logps/chosen": -229.3387908935547, + "logps/rejected": -241.858154296875, + "loss": 0.5677, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.39239010214805603, + "rewards/margins": 0.721211314201355, + "rewards/rejected": -1.1136014461517334, + "step": 870 + }, + { + "epoch": 0.4604918890633176, + "grad_norm": 77.15915949137676, + "learning_rate": 4.956722263234711e-07, + "logits/chosen": -2.7549219131469727, + "logits/rejected": -2.714980363845825, + "logps/chosen": -282.6243896484375, + "logps/rejected": -263.4341735839844, + "loss": 0.5399, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.48048511147499084, + "rewards/margins": 0.4717450737953186, + "rewards/rejected": -0.9522300958633423, + "step": 880 + }, + { + "epoch": 0.46572475143903713, + "grad_norm": 73.9394228989263, + "learning_rate": 4.95385630934473e-07, + "logits/chosen": -2.8114538192749023, + "logits/rejected": -2.7364768981933594, + "logps/chosen": -319.81170654296875, + "logps/rejected": -247.07186889648438, + "loss": 0.5491, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2333296239376068, + "rewards/margins": 0.6494348645210266, + "rewards/rejected": -0.8827645182609558, + "step": 890 + }, + { + "epoch": 0.47095761381475665, + "grad_norm": 96.7409696372864, + "learning_rate": 4.950899360675511e-07, + "logits/chosen": -2.672738552093506, + "logits/rejected": -2.633599042892456, + "logps/chosen": -256.09771728515625, + "logps/rejected": -304.5975646972656, + "loss": 0.5862, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6127587556838989, + "rewards/margins": 0.8507426977157593, + "rewards/rejected": -1.4635014533996582, + "step": 900 + }, + { + "epoch": 0.47619047619047616, + "grad_norm": 86.8453977856647, + "learning_rate": 4.947851526877681e-07, + "logits/chosen": -2.6899325847625732, + "logits/rejected": -2.6272430419921875, + "logps/chosen": -185.05880737304688, + "logps/rejected": -197.94110107421875, + "loss": 0.5669, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.34028100967407227, + "rewards/margins": 0.9680425524711609, + "rewards/rejected": -1.308323621749878, + "step": 910 + }, + { + "epoch": 0.48142333856619574, + "grad_norm": 70.8964866934123, + "learning_rate": 4.944712920972108e-07, + "logits/chosen": -2.8275413513183594, + "logits/rejected": -2.68281888961792, + "logps/chosen": -314.7341613769531, + "logps/rejected": -263.71209716796875, + "loss": 0.5696, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4585978090763092, + "rewards/margins": 0.6378489136695862, + "rewards/rejected": -1.0964467525482178, + "step": 920 + }, + { + "epoch": 0.48665620094191525, + "grad_norm": 77.19210699477038, + "learning_rate": 4.9414836593457e-07, + "logits/chosen": -2.6968119144439697, + "logits/rejected": -2.6806297302246094, + "logps/chosen": -274.0387268066406, + "logps/rejected": -273.5841979980469, + "loss": 0.5291, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5575527548789978, + "rewards/margins": 0.4331057071685791, + "rewards/rejected": -0.9906584620475769, + "step": 930 + }, + { + "epoch": 0.49188906331763477, + "grad_norm": 63.91924634228564, + "learning_rate": 4.938163861747094e-07, + "logits/chosen": -2.7842488288879395, + "logits/rejected": -2.6924023628234863, + "logps/chosen": -305.1507568359375, + "logps/rejected": -256.9998779296875, + "loss": 0.5008, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4544626772403717, + "rewards/margins": 0.8790037035942078, + "rewards/rejected": -1.3334662914276123, + "step": 940 + }, + { + "epoch": 0.4971219256933543, + "grad_norm": 74.90193395092072, + "learning_rate": 4.934753651282215e-07, + "logits/chosen": -2.7118842601776123, + "logits/rejected": -2.5842690467834473, + "logps/chosen": -295.0773620605469, + "logps/rejected": -286.43194580078125, + "loss": 0.5441, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3371061384677887, + "rewards/margins": 0.9876976013183594, + "rewards/rejected": -1.3248035907745361, + "step": 950 + }, + { + "epoch": 0.5023547880690737, + "grad_norm": 68.4374675451541, + "learning_rate": 4.93125315440971e-07, + "logits/chosen": -2.7849133014678955, + "logits/rejected": -2.670518159866333, + "logps/chosen": -293.0834045410156, + "logps/rejected": -283.2849426269531, + "loss": 0.5556, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5373663902282715, + "rewards/margins": 0.6889173984527588, + "rewards/rejected": -1.2262837886810303, + "step": 960 + }, + { + "epoch": 0.5075876504447933, + "grad_norm": 71.79203747934943, + "learning_rate": 4.92766250093626e-07, + "logits/chosen": -2.679919958114624, + "logits/rejected": -2.5925917625427246, + "logps/chosen": -302.0339050292969, + "logps/rejected": -267.0357360839844, + "loss": 0.5464, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.4768869876861572, + "rewards/margins": 1.1046160459518433, + "rewards/rejected": -1.581502914428711, + "step": 970 + }, + { + "epoch": 0.5128205128205128, + "grad_norm": 46.31017463847887, + "learning_rate": 4.92398182401176e-07, + "logits/chosen": -2.788491725921631, + "logits/rejected": -2.5828099250793457, + "logps/chosen": -315.658203125, + "logps/rejected": -256.7175598144531, + "loss": 0.5354, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16082218289375305, + "rewards/margins": 1.0569546222686768, + "rewards/rejected": -1.2177767753601074, + "step": 980 + }, + { + "epoch": 0.5180533751962323, + "grad_norm": 59.880098009341154, + "learning_rate": 4.920211260124395e-07, + "logits/chosen": -2.675851583480835, + "logits/rejected": -2.5794055461883545, + "logps/chosen": -254.67105102539062, + "logps/rejected": -239.4071044921875, + "loss": 0.5319, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.3605723977088928, + "rewards/margins": 0.8557994961738586, + "rewards/rejected": -1.216371774673462, + "step": 990 + }, + { + "epoch": 0.5232862375719518, + "grad_norm": 72.44078518479554, + "learning_rate": 4.916350949095566e-07, + "logits/chosen": -2.706974506378174, + "logits/rejected": -2.6369376182556152, + "logps/chosen": -247.0148468017578, + "logps/rejected": -239.7918243408203, + "loss": 0.557, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6575168371200562, + "rewards/margins": 0.6040263772010803, + "rewards/rejected": -1.2615431547164917, + "step": 1000 + }, + { + "epoch": 0.5285190999476713, + "grad_norm": 65.88304223948026, + "learning_rate": 4.912401034074708e-07, + "logits/chosen": -2.6823890209198, + "logits/rejected": -2.649326801300049, + "logps/chosen": -243.98629760742188, + "logps/rejected": -268.6001892089844, + "loss": 0.5605, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.3754280209541321, + "rewards/margins": 0.6592222452163696, + "rewards/rejected": -1.034650206565857, + "step": 1010 + }, + { + "epoch": 0.533751962323391, + "grad_norm": 67.67480154154286, + "learning_rate": 4.908361661533989e-07, + "logits/chosen": -2.740328788757324, + "logits/rejected": -2.6959586143493652, + "logps/chosen": -302.77130126953125, + "logps/rejected": -265.9853210449219, + "loss": 0.5287, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.13270536065101624, + "rewards/margins": 1.0353169441223145, + "rewards/rejected": -1.1680222749710083, + "step": 1020 + }, + { + "epoch": 0.5389848246991105, + "grad_norm": 63.344533946977315, + "learning_rate": 4.904232981262866e-07, + "logits/chosen": -2.7135579586029053, + "logits/rejected": -2.6516432762145996, + "logps/chosen": -269.3987121582031, + "logps/rejected": -233.5369110107422, + "loss": 0.5625, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.6198391318321228, + "rewards/margins": 0.37689751386642456, + "rewards/rejected": -0.9967366456985474, + "step": 1030 + }, + { + "epoch": 0.54421768707483, + "grad_norm": 67.80712674562271, + "learning_rate": 4.900015146362544e-07, + "logits/chosen": -2.7513439655303955, + "logits/rejected": -2.8110382556915283, + "logps/chosen": -259.6036071777344, + "logps/rejected": -279.2042541503906, + "loss": 0.547, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4451045095920563, + "rewards/margins": 0.7799221277236938, + "rewards/rejected": -1.2250266075134277, + "step": 1040 + }, + { + "epoch": 0.5494505494505495, + "grad_norm": 84.53833487464527, + "learning_rate": 4.895708313240285e-07, + "logits/chosen": -2.8216004371643066, + "logits/rejected": -2.696685314178467, + "logps/chosen": -341.482421875, + "logps/rejected": -309.2986755371094, + "loss": 0.5597, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.31436505913734436, + "rewards/margins": 0.9949013590812683, + "rewards/rejected": -1.3092663288116455, + "step": 1050 + }, + { + "epoch": 0.554683411826269, + "grad_norm": 71.06241027666316, + "learning_rate": 4.891312641603623e-07, + "logits/chosen": -2.6985182762145996, + "logits/rejected": -2.704181432723999, + "logps/chosen": -273.2723693847656, + "logps/rejected": -284.52557373046875, + "loss": 0.5384, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1811438351869583, + "rewards/margins": 1.0559108257293701, + "rewards/rejected": -1.2370548248291016, + "step": 1060 + }, + { + "epoch": 0.5599162742019885, + "grad_norm": 60.42257436255781, + "learning_rate": 4.886828294454426e-07, + "logits/chosen": -2.7313995361328125, + "logits/rejected": -2.717003107070923, + "logps/chosen": -338.99810791015625, + "logps/rejected": -283.36285400390625, + "loss": 0.5669, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.2885645925998688, + "rewards/margins": 0.6337456107139587, + "rewards/rejected": -0.9223102331161499, + "step": 1070 + }, + { + "epoch": 0.565149136577708, + "grad_norm": 65.79545292284378, + "learning_rate": 4.882255438082863e-07, + "logits/chosen": -2.7875468730926514, + "logits/rejected": -2.7002549171447754, + "logps/chosen": -242.84634399414062, + "logps/rejected": -242.5367889404297, + "loss": 0.5585, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3658864200115204, + "rewards/margins": 0.8278558850288391, + "rewards/rejected": -1.193742275238037, + "step": 1080 + }, + { + "epoch": 0.5703819989534276, + "grad_norm": 107.1693968556409, + "learning_rate": 4.877594242061233e-07, + "logits/chosen": -2.7447891235351562, + "logits/rejected": -2.5921199321746826, + "logps/chosen": -318.3252258300781, + "logps/rejected": -203.28018188476562, + "loss": 0.595, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.646048903465271, + "rewards/margins": 0.5135602951049805, + "rewards/rejected": -1.1596091985702515, + "step": 1090 + }, + { + "epoch": 0.5756148613291471, + "grad_norm": 62.65334429545523, + "learning_rate": 4.87284487923768e-07, + "logits/chosen": -2.6979050636291504, + "logits/rejected": -2.6344146728515625, + "logps/chosen": -273.78277587890625, + "logps/rejected": -289.87689208984375, + "loss": 0.52, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4887138307094574, + "rewards/margins": 0.7608338594436646, + "rewards/rejected": -1.2495477199554443, + "step": 1100 + }, + { + "epoch": 0.5808477237048666, + "grad_norm": 75.4519203317871, + "learning_rate": 4.868007525729775e-07, + "logits/chosen": -2.473546028137207, + "logits/rejected": -2.4677727222442627, + "logps/chosen": -183.38070678710938, + "logps/rejected": -213.664794921875, + "loss": 0.589, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.3946594297885895, + "rewards/margins": 0.7618762254714966, + "rewards/rejected": -1.1565356254577637, + "step": 1110 + }, + { + "epoch": 0.5860805860805861, + "grad_norm": 77.31972728639262, + "learning_rate": 4.863082360917998e-07, + "logits/chosen": -2.6780097484588623, + "logits/rejected": -2.617384433746338, + "logps/chosen": -287.62469482421875, + "logps/rejected": -266.36541748046875, + "loss": 0.5183, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.42290663719177246, + "rewards/margins": 0.6520038843154907, + "rewards/rejected": -1.0749104022979736, + "step": 1120 + }, + { + "epoch": 0.5913134484563056, + "grad_norm": 81.30630179926432, + "learning_rate": 4.858069567439072e-07, + "logits/chosen": -2.590245246887207, + "logits/rejected": -2.565310478210449, + "logps/chosen": -232.45999145507812, + "logps/rejected": -280.054931640625, + "loss": 0.566, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8007476925849915, + "rewards/margins": 0.5347889065742493, + "rewards/rejected": -1.3355367183685303, + "step": 1130 + }, + { + "epoch": 0.5965463108320251, + "grad_norm": 68.70456072258055, + "learning_rate": 4.852969331179206e-07, + "logits/chosen": -2.8644909858703613, + "logits/rejected": -2.809370756149292, + "logps/chosen": -266.9808349609375, + "logps/rejected": -286.6571350097656, + "loss": 0.5083, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2700379490852356, + "rewards/margins": 0.9382151365280151, + "rewards/rejected": -1.2082529067993164, + "step": 1140 + }, + { + "epoch": 0.6017791732077447, + "grad_norm": 87.24229108451696, + "learning_rate": 4.847781841267185e-07, + "logits/chosen": -2.828613042831421, + "logits/rejected": -2.640158176422119, + "logps/chosen": -279.5322570800781, + "logps/rejected": -249.63223266601562, + "loss": 0.5443, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4706583023071289, + "rewards/margins": 0.6991799473762512, + "rewards/rejected": -1.169838309288025, + "step": 1150 + }, + { + "epoch": 0.6070120355834642, + "grad_norm": 70.40297431881797, + "learning_rate": 4.842507290067374e-07, + "logits/chosen": -2.5668742656707764, + "logits/rejected": -2.577543258666992, + "logps/chosen": -230.92886352539062, + "logps/rejected": -199.5034942626953, + "loss": 0.5571, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5253152251243591, + "rewards/margins": 0.41240495443344116, + "rewards/rejected": -0.9377201795578003, + "step": 1160 + }, + { + "epoch": 0.6122448979591837, + "grad_norm": 78.31414773218133, + "learning_rate": 4.837145873172567e-07, + "logits/chosen": -2.739893913269043, + "logits/rejected": -2.6579720973968506, + "logps/chosen": -284.1490173339844, + "logps/rejected": -297.1120300292969, + "loss": 0.563, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.15587486326694489, + "rewards/margins": 1.329700231552124, + "rewards/rejected": -1.4855751991271973, + "step": 1170 + }, + { + "epoch": 0.6174777603349032, + "grad_norm": 53.670817774925865, + "learning_rate": 4.83169778939675e-07, + "logits/chosen": -2.7941668033599854, + "logits/rejected": -2.727989673614502, + "logps/chosen": -326.0960693359375, + "logps/rejected": -282.9150085449219, + "loss": 0.5015, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.48152121901512146, + "rewards/margins": 0.5162063837051392, + "rewards/rejected": -0.9977277517318726, + "step": 1180 + }, + { + "epoch": 0.6227106227106227, + "grad_norm": 70.60560817856916, + "learning_rate": 4.826163240767716e-07, + "logits/chosen": -2.8005542755126953, + "logits/rejected": -2.71057391166687, + "logps/chosen": -372.0180969238281, + "logps/rejected": -289.7819519042969, + "loss": 0.4971, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.12531840801239014, + "rewards/margins": 0.7546466588973999, + "rewards/rejected": -0.8799650073051453, + "step": 1190 + }, + { + "epoch": 0.6279434850863422, + "grad_norm": 58.79565369701411, + "learning_rate": 4.820542432519584e-07, + "logits/chosen": -2.5247480869293213, + "logits/rejected": -2.4183754920959473, + "logps/chosen": -315.38555908203125, + "logps/rejected": -283.47479248046875, + "loss": 0.5096, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.6097077131271362, + "rewards/margins": 0.6779664158821106, + "rewards/rejected": -1.2876741886138916, + "step": 1200 + }, + { + "epoch": 0.6331763474620618, + "grad_norm": 56.823488768307215, + "learning_rate": 4.814835573085176e-07, + "logits/chosen": -2.818470001220703, + "logits/rejected": -2.7555344104766846, + "logps/chosen": -304.1570129394531, + "logps/rejected": -276.91168212890625, + "loss": 0.5567, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4183574616909027, + "rewards/margins": 0.8381322026252747, + "rewards/rejected": -1.256489634513855, + "step": 1210 + }, + { + "epoch": 0.6384092098377813, + "grad_norm": 75.5878427723409, + "learning_rate": 4.809042874088304e-07, + "logits/chosen": -2.78570556640625, + "logits/rejected": -2.745189666748047, + "logps/chosen": -333.1932373046875, + "logps/rejected": -298.15252685546875, + "loss": 0.5416, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5144246220588684, + "rewards/margins": 0.9846269488334656, + "rewards/rejected": -1.4990516901016235, + "step": 1220 + }, + { + "epoch": 0.6436420722135008, + "grad_norm": 67.0851501322478, + "learning_rate": 4.803164550335905e-07, + "logits/chosen": -2.6935131549835205, + "logits/rejected": -2.560260534286499, + "logps/chosen": -352.7795104980469, + "logps/rejected": -254.9468231201172, + "loss": 0.4895, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4828592836856842, + "rewards/margins": 1.221268892288208, + "rewards/rejected": -1.7041280269622803, + "step": 1230 + }, + { + "epoch": 0.6488749345892203, + "grad_norm": 55.066837550240976, + "learning_rate": 4.797200819810089e-07, + "logits/chosen": -2.7344272136688232, + "logits/rejected": -2.714303493499756, + "logps/chosen": -251.71859741210938, + "logps/rejected": -230.49465942382812, + "loss": 0.5232, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6297019720077515, + "rewards/margins": 0.7004765272140503, + "rewards/rejected": -1.3301784992218018, + "step": 1240 + }, + { + "epoch": 0.6541077969649398, + "grad_norm": 70.88810338766321, + "learning_rate": 4.79115190366005e-07, + "logits/chosen": -2.7695717811584473, + "logits/rejected": -2.6756415367126465, + "logps/chosen": -281.10321044921875, + "logps/rejected": -303.2259216308594, + "loss": 0.5633, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5369859337806702, + "rewards/margins": 0.8963001370429993, + "rewards/rejected": -1.4332859516143799, + "step": 1250 + }, + { + "epoch": 0.6593406593406593, + "grad_norm": 58.6326409397173, + "learning_rate": 4.785018026193862e-07, + "logits/chosen": -2.721715211868286, + "logits/rejected": -2.6623756885528564, + "logps/chosen": -276.13470458984375, + "logps/rejected": -191.18539428710938, + "loss": 0.5178, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.45906153321266174, + "rewards/margins": 0.90235835313797, + "rewards/rejected": -1.361419916152954, + "step": 1260 + }, + { + "epoch": 0.6645735217163788, + "grad_norm": 64.32538103431956, + "learning_rate": 4.77879941487017e-07, + "logits/chosen": -2.6778359413146973, + "logits/rejected": -2.616879463195801, + "logps/chosen": -243.45571899414062, + "logps/rejected": -230.1975860595703, + "loss": 0.5098, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4738277792930603, + "rewards/margins": 1.05690598487854, + "rewards/rejected": -1.5307337045669556, + "step": 1270 + }, + { + "epoch": 0.6698063840920984, + "grad_norm": 83.93310106998426, + "learning_rate": 4.772496300289748e-07, + "logits/chosen": -2.718397855758667, + "logits/rejected": -2.5793282985687256, + "logps/chosen": -258.0218200683594, + "logps/rejected": -233.40280151367188, + "loss": 0.5174, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.8180875778198242, + "rewards/margins": 0.8874326944351196, + "rewards/rejected": -1.7055202722549438, + "step": 1280 + }, + { + "epoch": 0.6750392464678179, + "grad_norm": 71.21172285981052, + "learning_rate": 4.766108916186949e-07, + "logits/chosen": -2.6712429523468018, + "logits/rejected": -2.6534574031829834, + "logps/chosen": -252.097412109375, + "logps/rejected": -296.68096923828125, + "loss": 0.5653, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.21600313484668732, + "rewards/margins": 1.2593408823013306, + "rewards/rejected": -1.475343942642212, + "step": 1290 + }, + { + "epoch": 0.6802721088435374, + "grad_norm": 70.06827063972005, + "learning_rate": 4.759637499421042e-07, + "logits/chosen": -2.6452338695526123, + "logits/rejected": -2.6825003623962402, + "logps/chosen": -275.8140563964844, + "logps/rejected": -300.26690673828125, + "loss": 0.5194, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6661376953125, + "rewards/margins": 0.8043373823165894, + "rewards/rejected": -1.4704749584197998, + "step": 1300 + }, + { + "epoch": 0.6855049712192569, + "grad_norm": 80.34333468537173, + "learning_rate": 4.7530822899674207e-07, + "logits/chosen": -2.863783836364746, + "logits/rejected": -2.825059413909912, + "logps/chosen": -254.4219207763672, + "logps/rejected": -225.2198486328125, + "loss": 0.4917, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.39257654547691345, + "rewards/margins": 1.1294041872024536, + "rewards/rejected": -1.5219806432724, + "step": 1310 + }, + { + "epoch": 0.6907378335949764, + "grad_norm": 101.81615459041603, + "learning_rate": 4.7464435309087137e-07, + "logits/chosen": -2.750309467315674, + "logits/rejected": -2.73734712600708, + "logps/chosen": -300.87109375, + "logps/rejected": -313.13214111328125, + "loss": 0.5185, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.704337477684021, + "rewards/margins": 0.6578517556190491, + "rewards/rejected": -1.3621892929077148, + "step": 1320 + }, + { + "epoch": 0.6959706959706959, + "grad_norm": 71.48706237174991, + "learning_rate": 4.739721468425763e-07, + "logits/chosen": -2.741844654083252, + "logits/rejected": -2.7405097484588623, + "logps/chosen": -275.5357971191406, + "logps/rejected": -317.1656188964844, + "loss": 0.4807, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.26859548687934875, + "rewards/margins": 1.2116283178329468, + "rewards/rejected": -1.4802236557006836, + "step": 1330 + }, + { + "epoch": 0.7012035583464155, + "grad_norm": 49.2715636482983, + "learning_rate": 4.7329163517885e-07, + "logits/chosen": -2.718933582305908, + "logits/rejected": -2.6036744117736816, + "logps/chosen": -272.68865966796875, + "logps/rejected": -228.1563720703125, + "loss": 0.4875, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.37356796860694885, + "rewards/margins": 1.084763765335083, + "rewards/rejected": -1.4583313465118408, + "step": 1340 + }, + { + "epoch": 0.706436420722135, + "grad_norm": 70.78565545572422, + "learning_rate": 4.7260284333466973e-07, + "logits/chosen": -2.8037075996398926, + "logits/rejected": -2.768467426300049, + "logps/chosen": -298.2441711425781, + "logps/rejected": -270.0771179199219, + "loss": 0.5372, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5205308198928833, + "rewards/margins": 0.7837169766426086, + "rewards/rejected": -1.3042476177215576, + "step": 1350 + }, + { + "epoch": 0.7116692830978545, + "grad_norm": 84.19956425522926, + "learning_rate": 4.719057968520617e-07, + "logits/chosen": -2.6150059700012207, + "logits/rejected": -2.578220844268799, + "logps/chosen": -350.07525634765625, + "logps/rejected": -318.77899169921875, + "loss": 0.5837, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.7203130722045898, + "rewards/margins": 0.6178886294364929, + "rewards/rejected": -1.3382017612457275, + "step": 1360 + }, + { + "epoch": 0.716902145473574, + "grad_norm": 71.32390552400585, + "learning_rate": 4.7120052157915345e-07, + "logits/chosen": -2.8357949256896973, + "logits/rejected": -2.632596254348755, + "logps/chosen": -324.78045654296875, + "logps/rejected": -229.32681274414062, + "loss": 0.4823, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.5584132671356201, + "rewards/margins": 0.9202592968940735, + "rewards/rejected": -1.4786723852157593, + "step": 1370 + }, + { + "epoch": 0.7221350078492935, + "grad_norm": 69.3531933754779, + "learning_rate": 4.7048704366921537e-07, + "logits/chosen": -2.802358627319336, + "logits/rejected": -2.7255759239196777, + "logps/chosen": -224.1294708251953, + "logps/rejected": -263.2400207519531, + "loss": 0.5116, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.37427574396133423, + "rewards/margins": 1.2040784358978271, + "rewards/rejected": -1.5783542394638062, + "step": 1380 + }, + { + "epoch": 0.727367870225013, + "grad_norm": 57.566188366989095, + "learning_rate": 4.6976538957969114e-07, + "logits/chosen": -2.695610523223877, + "logits/rejected": -2.584303379058838, + "logps/chosen": -267.1957092285156, + "logps/rejected": -235.9038543701172, + "loss": 0.5166, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5423551201820374, + "rewards/margins": 1.2531085014343262, + "rewards/rejected": -1.7954635620117188, + "step": 1390 + }, + { + "epoch": 0.7326007326007326, + "grad_norm": 75.33520016312441, + "learning_rate": 4.690355860712163e-07, + "logits/chosen": -2.690704822540283, + "logits/rejected": -2.6887764930725098, + "logps/chosen": -244.60501098632812, + "logps/rejected": -269.39752197265625, + "loss": 0.5501, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.7962597608566284, + "rewards/margins": 0.6051412224769592, + "rewards/rejected": -1.4014009237289429, + "step": 1400 + }, + { + "epoch": 0.7378335949764521, + "grad_norm": 84.04471122585693, + "learning_rate": 4.682976602066262e-07, + "logits/chosen": -2.6003170013427734, + "logits/rejected": -2.5461487770080566, + "logps/chosen": -260.6416931152344, + "logps/rejected": -262.48638916015625, + "loss": 0.5595, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.6298843622207642, + "rewards/margins": 1.3133777379989624, + "rewards/rejected": -1.9432621002197266, + "step": 1410 + }, + { + "epoch": 0.7430664573521716, + "grad_norm": 83.86573291558813, + "learning_rate": 4.6755163934995224e-07, + "logits/chosen": -2.7487921714782715, + "logits/rejected": -2.669774055480957, + "logps/chosen": -320.82354736328125, + "logps/rejected": -268.43853759765625, + "loss": 0.5686, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5370005965232849, + "rewards/margins": 0.9020074009895325, + "rewards/rejected": -1.4390079975128174, + "step": 1420 + }, + { + "epoch": 0.7482993197278912, + "grad_norm": 55.25890313638634, + "learning_rate": 4.667975511654072e-07, + "logits/chosen": -2.7589454650878906, + "logits/rejected": -2.6432929039001465, + "logps/chosen": -304.049560546875, + "logps/rejected": -270.5584411621094, + "loss": 0.4897, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4467310309410095, + "rewards/margins": 1.0153915882110596, + "rewards/rejected": -1.4621226787567139, + "step": 1430 + }, + { + "epoch": 0.7535321821036107, + "grad_norm": 84.87447605280406, + "learning_rate": 4.660354236163595e-07, + "logits/chosen": -2.8089873790740967, + "logits/rejected": -2.702833414077759, + "logps/chosen": -362.2886962890625, + "logps/rejected": -322.1484680175781, + "loss": 0.5516, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.03776503726840019, + "rewards/margins": 1.153167486190796, + "rewards/rejected": -1.1909326314926147, + "step": 1440 + }, + { + "epoch": 0.7587650444793302, + "grad_norm": 62.630365097419194, + "learning_rate": 4.6526528496429606e-07, + "logits/chosen": -2.7678020000457764, + "logits/rejected": -2.6640100479125977, + "logps/chosen": -304.62579345703125, + "logps/rejected": -283.0849609375, + "loss": 0.5381, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.6487807631492615, + "rewards/margins": 1.1606749296188354, + "rewards/rejected": -1.8094558715820312, + "step": 1450 + }, + { + "epoch": 0.7639979068550498, + "grad_norm": 57.142971090558646, + "learning_rate": 4.644871637677745e-07, + "logits/chosen": -2.6548714637756348, + "logits/rejected": -2.6578633785247803, + "logps/chosen": -221.1512908935547, + "logps/rejected": -237.13601684570312, + "loss": 0.5584, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4943154752254486, + "rewards/margins": 0.6672312021255493, + "rewards/rejected": -1.1615464687347412, + "step": 1460 + }, + { + "epoch": 0.7692307692307693, + "grad_norm": 56.71803507289668, + "learning_rate": 4.637010888813638e-07, + "logits/chosen": -2.7698307037353516, + "logits/rejected": -2.6017284393310547, + "logps/chosen": -339.74176025390625, + "logps/rejected": -253.5137481689453, + "loss": 0.4972, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.47833189368247986, + "rewards/margins": 0.9227139353752136, + "rewards/rejected": -1.4010460376739502, + "step": 1470 + }, + { + "epoch": 0.7744636316064888, + "grad_norm": 59.84525328670555, + "learning_rate": 4.6290708945457493e-07, + "logits/chosen": -2.658843517303467, + "logits/rejected": -2.6468276977539062, + "logps/chosen": -258.4879455566406, + "logps/rejected": -245.56283569335938, + "loss": 0.5689, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7929760217666626, + "rewards/margins": 0.5606399178504944, + "rewards/rejected": -1.3536159992218018, + "step": 1480 + }, + { + "epoch": 0.7796964939822083, + "grad_norm": 80.8077698995965, + "learning_rate": 4.6210519493077887e-07, + "logits/chosen": -2.454429864883423, + "logits/rejected": -2.4639267921447754, + "logps/chosen": -291.4048156738281, + "logps/rejected": -282.96368408203125, + "loss": 0.5162, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.7014742493629456, + "rewards/margins": 0.7910742163658142, + "rewards/rejected": -1.4925483465194702, + "step": 1490 + }, + { + "epoch": 0.7849293563579278, + "grad_norm": 66.51030465502379, + "learning_rate": 4.6129543504611607e-07, + "logits/chosen": -2.6922767162323, + "logits/rejected": -2.643317461013794, + "logps/chosen": -220.99935913085938, + "logps/rejected": -281.58038330078125, + "loss": 0.4868, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8320234417915344, + "rewards/margins": 1.1816871166229248, + "rewards/rejected": -2.0137104988098145, + "step": 1500 + }, + { + "epoch": 0.7901622187336473, + "grad_norm": 53.99248963391964, + "learning_rate": 4.604778398283927e-07, + "logits/chosen": -2.622396469116211, + "logits/rejected": -2.6255924701690674, + "logps/chosen": -271.9859619140625, + "logps/rejected": -316.2346496582031, + "loss": 0.5778, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1589412689208984, + "rewards/margins": 0.8864221572875977, + "rewards/rejected": -2.045363426208496, + "step": 1510 + }, + { + "epoch": 0.7953950811093669, + "grad_norm": 56.49013069960226, + "learning_rate": 4.596524395959678e-07, + "logits/chosen": -2.6993722915649414, + "logits/rejected": -2.637159824371338, + "logps/chosen": -231.75247192382812, + "logps/rejected": -268.9724426269531, + "loss": 0.52, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.6520730257034302, + "rewards/margins": 1.2404626607894897, + "rewards/rejected": -1.8925358057022095, + "step": 1520 + }, + { + "epoch": 0.8006279434850864, + "grad_norm": 67.5153392153966, + "learning_rate": 4.588192649566285e-07, + "logits/chosen": -2.8271138668060303, + "logits/rejected": -2.790332794189453, + "logps/chosen": -330.3937683105469, + "logps/rejected": -391.48492431640625, + "loss": 0.489, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7549493312835693, + "rewards/margins": 0.98554927110672, + "rewards/rejected": -1.7404985427856445, + "step": 1530 + }, + { + "epoch": 0.8058608058608059, + "grad_norm": 54.8196546474935, + "learning_rate": 4.5797834680645553e-07, + "logits/chosen": -2.734687566757202, + "logits/rejected": -2.7426223754882812, + "logps/chosen": -369.28692626953125, + "logps/rejected": -329.3565979003906, + "loss": 0.5567, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.7556370496749878, + "rewards/margins": 0.6318597793579102, + "rewards/rejected": -1.3874967098236084, + "step": 1540 + }, + { + "epoch": 0.8110936682365254, + "grad_norm": 63.54050063708392, + "learning_rate": 4.5712971632867715e-07, + "logits/chosen": -2.7076220512390137, + "logits/rejected": -2.5632991790771484, + "logps/chosen": -329.8143310546875, + "logps/rejected": -240.5828094482422, + "loss": 0.5079, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.22298932075500488, + "rewards/margins": 1.1761170625686646, + "rewards/rejected": -1.3991062641143799, + "step": 1550 + }, + { + "epoch": 0.8163265306122449, + "grad_norm": 57.19800278535567, + "learning_rate": 4.562734049925129e-07, + "logits/chosen": -2.7164652347564697, + "logits/rejected": -2.6165108680725098, + "logps/chosen": -356.35986328125, + "logps/rejected": -309.109130859375, + "loss": 0.5143, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.5088120698928833, + "rewards/margins": 1.126982569694519, + "rewards/rejected": -1.6357946395874023, + "step": 1560 + }, + { + "epoch": 0.8215593929879644, + "grad_norm": 64.23711870846407, + "learning_rate": 4.5540944455200663e-07, + "logits/chosen": -2.697077512741089, + "logits/rejected": -2.6353626251220703, + "logps/chosen": -244.924072265625, + "logps/rejected": -262.2737121582031, + "loss": 0.4999, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.5048820972442627, + "rewards/margins": 1.0286469459533691, + "rewards/rejected": -1.5335289239883423, + "step": 1570 + }, + { + "epoch": 0.826792255363684, + "grad_norm": 76.683910640695, + "learning_rate": 4.545378670448492e-07, + "logits/chosen": -2.749056816101074, + "logits/rejected": -2.5895638465881348, + "logps/chosen": -298.3635559082031, + "logps/rejected": -256.74139404296875, + "loss": 0.5736, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.8221967816352844, + "rewards/margins": 0.8676072359085083, + "rewards/rejected": -1.6898040771484375, + "step": 1580 + }, + { + "epoch": 0.8320251177394035, + "grad_norm": 78.25471044424866, + "learning_rate": 4.5365870479119014e-07, + "logits/chosen": -2.612903356552124, + "logits/rejected": -2.4850778579711914, + "logps/chosen": -246.4439697265625, + "logps/rejected": -225.95718383789062, + "loss": 0.4848, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.502545177936554, + "rewards/margins": 1.2864090204238892, + "rewards/rejected": -1.7889540195465088, + "step": 1590 + }, + { + "epoch": 0.837257980115123, + "grad_norm": 73.52834770469094, + "learning_rate": 4.5277199039243917e-07, + "logits/chosen": -2.6008079051971436, + "logits/rejected": -2.623173475265503, + "logps/chosen": -266.70184326171875, + "logps/rejected": -297.88494873046875, + "loss": 0.5043, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8646143078804016, + "rewards/margins": 0.8069089651107788, + "rewards/rejected": -1.6715233325958252, + "step": 1600 + }, + { + "epoch": 0.8424908424908425, + "grad_norm": 72.95183490586152, + "learning_rate": 4.5187775673005744e-07, + "logits/chosen": -2.783461093902588, + "logits/rejected": -2.638617753982544, + "logps/chosen": -384.44561767578125, + "logps/rejected": -337.8665466308594, + "loss": 0.5219, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.5897997617721558, + "rewards/margins": 1.0438328981399536, + "rewards/rejected": -1.6336326599121094, + "step": 1610 + }, + { + "epoch": 0.847723704866562, + "grad_norm": 53.15633631946215, + "learning_rate": 4.509760369643384e-07, + "logits/chosen": -2.680973529815674, + "logits/rejected": -2.54726243019104, + "logps/chosen": -295.14544677734375, + "logps/rejected": -250.4654998779297, + "loss": 0.5654, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.894762396812439, + "rewards/margins": 0.6794382333755493, + "rewards/rejected": -1.5742003917694092, + "step": 1620 + }, + { + "epoch": 0.8529565672422815, + "grad_norm": 71.14002291944531, + "learning_rate": 4.5006686453317734e-07, + "logits/chosen": -2.824927806854248, + "logits/rejected": -2.827450752258301, + "logps/chosen": -247.74349975585938, + "logps/rejected": -260.1580505371094, + "loss": 0.5422, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5920854806900024, + "rewards/margins": 0.9330133199691772, + "rewards/rejected": -1.5250988006591797, + "step": 1630 + }, + { + "epoch": 0.858189429618001, + "grad_norm": 74.27481044176433, + "learning_rate": 4.4915027315083243e-07, + "logits/chosen": -2.720771074295044, + "logits/rejected": -2.7052905559539795, + "logps/chosen": -318.4162902832031, + "logps/rejected": -299.65234375, + "loss": 0.5582, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5697705745697021, + "rewards/margins": 0.8112031817436218, + "rewards/rejected": -1.3809736967086792, + "step": 1640 + }, + { + "epoch": 0.8634222919937206, + "grad_norm": 71.63285416460687, + "learning_rate": 4.482262968066737e-07, + "logits/chosen": -2.69111704826355, + "logits/rejected": -2.627052068710327, + "logps/chosen": -290.1894836425781, + "logps/rejected": -296.7138366699219, + "loss": 0.5193, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6927599906921387, + "rewards/margins": 0.6575680375099182, + "rewards/rejected": -1.3503280878067017, + "step": 1650 + }, + { + "epoch": 0.8686551543694401, + "grad_norm": 63.09467029844952, + "learning_rate": 4.4729496976392324e-07, + "logits/chosen": -2.6803653240203857, + "logits/rejected": -2.628178596496582, + "logps/chosen": -222.30813598632812, + "logps/rejected": -258.65277099609375, + "loss": 0.5362, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.6566628813743591, + "rewards/margins": 0.8711689114570618, + "rewards/rejected": -1.5278319120407104, + "step": 1660 + }, + { + "epoch": 0.8738880167451596, + "grad_norm": 88.87966503532546, + "learning_rate": 4.463563265583843e-07, + "logits/chosen": -2.821124315261841, + "logits/rejected": -2.6980767250061035, + "logps/chosen": -271.47381591796875, + "logps/rejected": -271.1028747558594, + "loss": 0.5337, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6489533185958862, + "rewards/margins": 0.9798796772956848, + "rewards/rejected": -1.6288330554962158, + "step": 1670 + }, + { + "epoch": 0.8791208791208791, + "grad_norm": 62.47341360667123, + "learning_rate": 4.4541040199716063e-07, + "logits/chosen": -2.639287233352661, + "logits/rejected": -2.6262917518615723, + "logps/chosen": -263.83380126953125, + "logps/rejected": -286.5048828125, + "loss": 0.4636, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5826305150985718, + "rewards/margins": 1.0568525791168213, + "rewards/rejected": -1.639483094215393, + "step": 1680 + }, + { + "epoch": 0.8843537414965986, + "grad_norm": 76.00018439668553, + "learning_rate": 4.4445723115736587e-07, + "logits/chosen": -2.6233866214752197, + "logits/rejected": -2.572371244430542, + "logps/chosen": -259.8497619628906, + "logps/rejected": -245.5025634765625, + "loss": 0.4983, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.5224615335464478, + "rewards/margins": 1.167961597442627, + "rewards/rejected": -1.6904230117797852, + "step": 1690 + }, + { + "epoch": 0.8895866038723181, + "grad_norm": 66.43267421558812, + "learning_rate": 4.434968493848228e-07, + "logits/chosen": -2.676736354827881, + "logits/rejected": -2.6138041019439697, + "logps/chosen": -284.3275146484375, + "logps/rejected": -275.8121032714844, + "loss": 0.5111, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8793582916259766, + "rewards/margins": 0.8545160293579102, + "rewards/rejected": -1.7338743209838867, + "step": 1700 + }, + { + "epoch": 0.8948194662480377, + "grad_norm": 80.56001605676286, + "learning_rate": 4.425292922927525e-07, + "logits/chosen": -2.70340895652771, + "logits/rejected": -2.6097865104675293, + "logps/chosen": -342.5941162109375, + "logps/rejected": -331.31744384765625, + "loss": 0.521, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.5397204756736755, + "rewards/margins": 0.9078131914138794, + "rewards/rejected": -1.4475336074829102, + "step": 1710 + }, + { + "epoch": 0.9000523286237572, + "grad_norm": 60.35160268438662, + "learning_rate": 4.41554595760454e-07, + "logits/chosen": -2.775596857070923, + "logits/rejected": -2.607522487640381, + "logps/chosen": -302.60076904296875, + "logps/rejected": -264.5050354003906, + "loss": 0.5385, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.9988101720809937, + "rewards/margins": 0.4468079209327698, + "rewards/rejected": -1.4456180334091187, + "step": 1720 + }, + { + "epoch": 0.9052851909994767, + "grad_norm": 75.91645655507445, + "learning_rate": 4.4057279593197326e-07, + "logits/chosen": -2.7369158267974854, + "logits/rejected": -2.657431125640869, + "logps/chosen": -237.8176727294922, + "logps/rejected": -199.45449829101562, + "loss": 0.5123, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5934224724769592, + "rewards/margins": 0.9565766453742981, + "rewards/rejected": -1.5499989986419678, + "step": 1730 + }, + { + "epoch": 0.9105180533751962, + "grad_norm": 87.85842673612727, + "learning_rate": 4.395839292147637e-07, + "logits/chosen": -2.774777412414551, + "logits/rejected": -2.6052792072296143, + "logps/chosen": -262.7558288574219, + "logps/rejected": -229.07363891601562, + "loss": 0.564, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.9033422470092773, + "rewards/margins": 0.7843619585037231, + "rewards/rejected": -1.68770432472229, + "step": 1740 + }, + { + "epoch": 0.9157509157509157, + "grad_norm": 69.63472701534312, + "learning_rate": 4.3858803227833526e-07, + "logits/chosen": -2.750851631164551, + "logits/rejected": -2.7061922550201416, + "logps/chosen": -333.9599914550781, + "logps/rejected": -302.74261474609375, + "loss": 0.5569, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.48318320512771606, + "rewards/margins": 1.0502393245697021, + "rewards/rejected": -1.5334227085113525, + "step": 1750 + }, + { + "epoch": 0.9209837781266352, + "grad_norm": 77.24089380633716, + "learning_rate": 4.375851420528951e-07, + "logits/chosen": -2.7777843475341797, + "logits/rejected": -2.7423300743103027, + "logps/chosen": -234.72314453125, + "logps/rejected": -225.20687866210938, + "loss": 0.4943, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7239280343055725, + "rewards/margins": 0.6287710070610046, + "rewards/rejected": -1.3526990413665771, + "step": 1760 + }, + { + "epoch": 0.9262166405023547, + "grad_norm": 73.03220449201594, + "learning_rate": 4.36575295727978e-07, + "logits/chosen": -2.6381800174713135, + "logits/rejected": -2.5485033988952637, + "logps/chosen": -306.7220764160156, + "logps/rejected": -267.9762268066406, + "loss": 0.5263, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7330409288406372, + "rewards/margins": 0.9656373262405396, + "rewards/rejected": -1.6986782550811768, + "step": 1770 + }, + { + "epoch": 0.9314495028780743, + "grad_norm": 58.57471867811895, + "learning_rate": 4.355585307510675e-07, + "logits/chosen": -2.619809865951538, + "logits/rejected": -2.5870203971862793, + "logps/chosen": -253.3949432373047, + "logps/rejected": -229.19570922851562, + "loss": 0.5223, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8501235842704773, + "rewards/margins": 0.9419552683830261, + "rewards/rejected": -1.792078971862793, + "step": 1780 + }, + { + "epoch": 0.9366823652537938, + "grad_norm": 76.61140880327854, + "learning_rate": 4.345348848262068e-07, + "logits/chosen": -2.713588237762451, + "logits/rejected": -2.7371697425842285, + "logps/chosen": -337.8150329589844, + "logps/rejected": -341.7342224121094, + "loss": 0.5213, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4116416573524475, + "rewards/margins": 0.8930836915969849, + "rewards/rejected": -1.3047252893447876, + "step": 1790 + }, + { + "epoch": 0.9419152276295133, + "grad_norm": 84.84756785476401, + "learning_rate": 4.33504395912601e-07, + "logits/chosen": -2.5321755409240723, + "logits/rejected": -2.4565305709838867, + "logps/chosen": -244.6804962158203, + "logps/rejected": -289.1626892089844, + "loss": 0.5003, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.976446807384491, + "rewards/margins": 1.38921058177948, + "rewards/rejected": -2.365657329559326, + "step": 1800 + }, + { + "epoch": 0.9471480900052328, + "grad_norm": 76.15234371104347, + "learning_rate": 4.324671022232095e-07, + "logits/chosen": -2.739748477935791, + "logits/rejected": -2.6528263092041016, + "logps/chosen": -266.6080017089844, + "logps/rejected": -242.7993927001953, + "loss": 0.5395, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.6612841486930847, + "rewards/margins": 1.1026535034179688, + "rewards/rejected": -1.7639377117156982, + "step": 1810 + }, + { + "epoch": 0.9523809523809523, + "grad_norm": 59.54173043288726, + "learning_rate": 4.314230422233286e-07, + "logits/chosen": -2.656968593597412, + "logits/rejected": -2.564446449279785, + "logps/chosen": -226.46499633789062, + "logps/rejected": -199.26419067382812, + "loss": 0.554, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7357887029647827, + "rewards/margins": 0.8301893472671509, + "rewards/rejected": -1.565977931022644, + "step": 1820 + }, + { + "epoch": 0.957613814756672, + "grad_norm": 90.01723409563466, + "learning_rate": 4.303722546291655e-07, + "logits/chosen": -2.8147635459899902, + "logits/rejected": -2.748579502105713, + "logps/chosen": -293.80706787109375, + "logps/rejected": -253.5705108642578, + "loss": 0.5496, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6129995584487915, + "rewards/margins": 0.715448260307312, + "rewards/rejected": -1.3284478187561035, + "step": 1830 + }, + { + "epoch": 0.9628466771323915, + "grad_norm": 66.06404656121876, + "learning_rate": 4.2931477840640243e-07, + "logits/chosen": -2.7455453872680664, + "logits/rejected": -2.5494322776794434, + "logps/chosen": -342.5234069824219, + "logps/rejected": -287.9923095703125, + "loss": 0.5235, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.6869091391563416, + "rewards/margins": 0.9585253000259399, + "rewards/rejected": -1.6454343795776367, + "step": 1840 + }, + { + "epoch": 0.968079539508111, + "grad_norm": 70.59339859610029, + "learning_rate": 4.282506527687517e-07, + "logits/chosen": -2.6523709297180176, + "logits/rejected": -2.596681833267212, + "logps/chosen": -384.2684326171875, + "logps/rejected": -315.46728515625, + "loss": 0.5384, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2603829503059387, + "rewards/margins": 1.1419236660003662, + "rewards/rejected": -1.4023066759109497, + "step": 1850 + }, + { + "epoch": 0.9733124018838305, + "grad_norm": 56.435468927974156, + "learning_rate": 4.271799171765016e-07, + "logits/chosen": -2.6715855598449707, + "logits/rejected": -2.499748706817627, + "logps/chosen": -336.3532409667969, + "logps/rejected": -253.0960693359375, + "loss": 0.5253, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9096790552139282, + "rewards/margins": 0.7897397875785828, + "rewards/rejected": -1.6994187831878662, + "step": 1860 + }, + { + "epoch": 0.97854526425955, + "grad_norm": 79.21208604111206, + "learning_rate": 4.2610261133505323e-07, + "logits/chosen": -2.7125673294067383, + "logits/rejected": -2.6025753021240234, + "logps/chosen": -257.30023193359375, + "logps/rejected": -251.1166534423828, + "loss": 0.5067, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5088145136833191, + "rewards/margins": 0.9294927716255188, + "rewards/rejected": -1.438307285308838, + "step": 1870 + }, + { + "epoch": 0.9837781266352695, + "grad_norm": 67.10889828757449, + "learning_rate": 4.250187751934479e-07, + "logits/chosen": -2.759009838104248, + "logits/rejected": -2.835824966430664, + "logps/chosen": -262.1882019042969, + "logps/rejected": -329.35089111328125, + "loss": 0.513, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.34654396772384644, + "rewards/margins": 0.9472799301147461, + "rewards/rejected": -1.2938238382339478, + "step": 1880 + }, + { + "epoch": 0.989010989010989, + "grad_norm": 68.91612684145302, + "learning_rate": 4.2392844894288605e-07, + "logits/chosen": -2.685486078262329, + "logits/rejected": -2.623790740966797, + "logps/chosen": -402.46978759765625, + "logps/rejected": -347.0303039550781, + "loss": 0.5157, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5702678561210632, + "rewards/margins": 0.7701340913772583, + "rewards/rejected": -1.3404020071029663, + "step": 1890 + }, + { + "epoch": 0.9942438513867086, + "grad_norm": 63.01428510274556, + "learning_rate": 4.2283167301523634e-07, + "logits/chosen": -2.8204257488250732, + "logits/rejected": -2.680922508239746, + "logps/chosen": -238.2408447265625, + "logps/rejected": -238.68704223632812, + "loss": 0.5127, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9641359448432922, + "rewards/margins": 0.9542892575263977, + "rewards/rejected": -1.91842520236969, + "step": 1900 + }, + { + "epoch": 0.9994767137624281, + "grad_norm": 82.43417623887333, + "learning_rate": 4.217284880815369e-07, + "logits/chosen": -2.667724370956421, + "logits/rejected": -2.6773922443389893, + "logps/chosen": -338.7535705566406, + "logps/rejected": -349.4644470214844, + "loss": 0.5238, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.45123291015625, + "rewards/margins": 1.566555380821228, + "rewards/rejected": -2.0177884101867676, + "step": 1910 + }, + { + "epoch": 1.0047095761381475, + "grad_norm": 41.11062818319924, + "learning_rate": 4.2061893505048694e-07, + "logits/chosen": -2.667792558670044, + "logits/rejected": -2.6502389907836914, + "logps/chosen": -202.4060821533203, + "logps/rejected": -271.0544128417969, + "loss": 0.1721, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08865700662136078, + "rewards/margins": 2.546302318572998, + "rewards/rejected": -2.4576454162597656, + "step": 1920 + }, + { + "epoch": 1.0099424385138671, + "grad_norm": 16.348882259373745, + "learning_rate": 4.1950305506692967e-07, + "logits/chosen": -2.8264620304107666, + "logits/rejected": -2.6654040813446045, + "logps/chosen": -318.1976013183594, + "logps/rejected": -296.55914306640625, + "loss": 0.1149, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2823026180267334, + "rewards/margins": 4.868466377258301, + "rewards/rejected": -3.5861637592315674, + "step": 1930 + }, + { + "epoch": 1.0151753008895865, + "grad_norm": 20.175971999612642, + "learning_rate": 4.1838088951032656e-07, + "logits/chosen": -2.559612989425659, + "logits/rejected": -2.506347179412842, + "logps/chosen": -345.3162536621094, + "logps/rejected": -316.6770935058594, + "loss": 0.1262, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.7579983472824097, + "rewards/margins": 4.48593807220459, + "rewards/rejected": -3.7279388904571533, + "step": 1940 + }, + { + "epoch": 1.0204081632653061, + "grad_norm": 15.520890307252973, + "learning_rate": 4.172524799932231e-07, + "logits/chosen": -2.6519670486450195, + "logits/rejected": -2.6060433387756348, + "logps/chosen": -217.2434539794922, + "logps/rejected": -279.20989990234375, + "loss": 0.1184, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.11215732246637344, + "rewards/margins": 3.2862472534179688, + "rewards/rejected": -3.1740899085998535, + "step": 1950 + }, + { + "epoch": 1.0256410256410255, + "grad_norm": 13.628134487221653, + "learning_rate": 4.161178683597054e-07, + "logits/chosen": -2.804748773574829, + "logits/rejected": -2.621863842010498, + "logps/chosen": -257.8443603515625, + "logps/rejected": -234.2767333984375, + "loss": 0.0981, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.13752229511737823, + "rewards/margins": 3.750183582305908, + "rewards/rejected": -3.612661361694336, + "step": 1960 + }, + { + "epoch": 1.0308738880167452, + "grad_norm": 24.172632805742488, + "learning_rate": 4.1497709668384885e-07, + "logits/chosen": -2.7590997219085693, + "logits/rejected": -2.6684045791625977, + "logps/chosen": -342.3634338378906, + "logps/rejected": -321.36480712890625, + "loss": 0.1018, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8379875421524048, + "rewards/margins": 4.788851737976074, + "rewards/rejected": -3.95086407661438, + "step": 1970 + }, + { + "epoch": 1.0361067503924646, + "grad_norm": 33.73220087524629, + "learning_rate": 4.1383020726815745e-07, + "logits/chosen": -2.7631189823150635, + "logits/rejected": -2.651642084121704, + "logps/chosen": -245.7153778076172, + "logps/rejected": -274.5694885253906, + "loss": 0.1152, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14847412705421448, + "rewards/margins": 3.823739528656006, + "rewards/rejected": -3.9722137451171875, + "step": 1980 + }, + { + "epoch": 1.0413396127681842, + "grad_norm": 22.264648453297795, + "learning_rate": 4.126772426419959e-07, + "logits/chosen": -2.642127275466919, + "logits/rejected": -2.661806583404541, + "logps/chosen": -261.2215881347656, + "logps/rejected": -302.48541259765625, + "loss": 0.1445, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09080305695533752, + "rewards/margins": 3.6442935466766357, + "rewards/rejected": -3.7350971698760986, + "step": 1990 + }, + { + "epoch": 1.0465724751439036, + "grad_norm": 13.044758008413115, + "learning_rate": 4.1151824556001145e-07, + "logits/chosen": -2.7277259826660156, + "logits/rejected": -2.6704182624816895, + "logps/chosen": -231.52023315429688, + "logps/rejected": -295.1512145996094, + "loss": 0.1219, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.004637342877686024, + "rewards/margins": 4.0213823318481445, + "rewards/rejected": -4.016745090484619, + "step": 2000 + }, + { + "epoch": 1.0465724751439036, + "eval_logits/chosen": -2.681251049041748, + "eval_logits/rejected": -2.62642502784729, + "eval_logps/chosen": -280.007568359375, + "eval_logps/rejected": -295.7981872558594, + "eval_loss": 0.5597859025001526, + "eval_rewards/accuracies": 0.75390625, + "eval_rewards/chosen": -1.2750576734542847, + "eval_rewards/margins": 1.320380449295044, + "eval_rewards/rejected": -2.595438003540039, + "eval_runtime": 101.4599, + "eval_samples_per_second": 19.712, + "eval_steps_per_second": 0.315, + "step": 2000 + }, + { + "epoch": 1.0518053375196232, + "grad_norm": 5.757416919621772, + "learning_rate": 4.103532590005495e-07, + "logits/chosen": -2.795039653778076, + "logits/rejected": -2.6831464767456055, + "logps/chosen": -273.0514831542969, + "logps/rejected": -247.02188110351562, + "loss": 0.0915, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.48246365785598755, + "rewards/margins": 4.765566825866699, + "rewards/rejected": -4.283102512359619, + "step": 2010 + }, + { + "epoch": 1.0570381998953426, + "grad_norm": 14.083494320034111, + "learning_rate": 4.091823261640592e-07, + "logits/chosen": -2.7425522804260254, + "logits/rejected": -2.6334598064422607, + "logps/chosen": -253.7746124267578, + "logps/rejected": -252.0037078857422, + "loss": 0.117, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.34954172372817993, + "rewards/margins": 4.962024211883545, + "rewards/rejected": -4.612483024597168, + "step": 2020 + }, + { + "epoch": 1.0622710622710623, + "grad_norm": 19.89811596162904, + "learning_rate": 4.080054904714917e-07, + "logits/chosen": -2.693772792816162, + "logits/rejected": -2.6306285858154297, + "logps/chosen": -240.82015991210938, + "logps/rejected": -271.550048828125, + "loss": 0.106, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.5872729420661926, + "rewards/margins": 3.7091548442840576, + "rewards/rejected": -4.296427249908447, + "step": 2030 + }, + { + "epoch": 1.0675039246467817, + "grad_norm": 18.48630840724053, + "learning_rate": 4.0682279556268993e-07, + "logits/chosen": -2.687135696411133, + "logits/rejected": -2.6527206897735596, + "logps/chosen": -321.5030822753906, + "logps/rejected": -355.2987365722656, + "loss": 0.1024, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.37303978204727173, + "rewards/margins": 5.5488667488098145, + "rewards/rejected": -5.175827503204346, + "step": 2040 + }, + { + "epoch": 1.0727367870225013, + "grad_norm": 72.20430790189955, + "learning_rate": 4.056342852947706e-07, + "logits/chosen": -2.857966661453247, + "logits/rejected": -2.657653331756592, + "logps/chosen": -358.5530700683594, + "logps/rejected": -333.0049743652344, + "loss": 0.1054, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.41347384452819824, + "rewards/margins": 5.176710605621338, + "rewards/rejected": -4.763236999511719, + "step": 2050 + }, + { + "epoch": 1.077969649398221, + "grad_norm": 15.825306334222, + "learning_rate": 4.044400037404973e-07, + "logits/chosen": -2.7194700241088867, + "logits/rejected": -2.6635639667510986, + "logps/chosen": -209.5430450439453, + "logps/rejected": -231.8815155029297, + "loss": 0.0934, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.1871255487203598, + "rewards/margins": 3.7858691215515137, + "rewards/rejected": -3.598743438720703, + "step": 2060 + }, + { + "epoch": 1.0832025117739403, + "grad_norm": 24.60722719393446, + "learning_rate": 4.032399951866468e-07, + "logits/chosen": -2.6166274547576904, + "logits/rejected": -2.493044853210449, + "logps/chosen": -220.2571563720703, + "logps/rejected": -224.3196563720703, + "loss": 0.1081, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.0679265633225441, + "rewards/margins": 3.6435153484344482, + "rewards/rejected": -3.7114417552948, + "step": 2070 + }, + { + "epoch": 1.08843537414966, + "grad_norm": 11.124081035150457, + "learning_rate": 4.0203430413236637e-07, + "logits/chosen": -2.7528700828552246, + "logits/rejected": -2.6914639472961426, + "logps/chosen": -290.67840576171875, + "logps/rejected": -337.6015625, + "loss": 0.1114, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.13888368010520935, + "rewards/margins": 4.532259464263916, + "rewards/rejected": -4.393376350402832, + "step": 2080 + }, + { + "epoch": 1.0936682365253794, + "grad_norm": 26.233852464976422, + "learning_rate": 4.0082297528752407e-07, + "logits/chosen": -2.6731064319610596, + "logits/rejected": -2.5635428428649902, + "logps/chosen": -190.65933227539062, + "logps/rejected": -245.5774688720703, + "loss": 0.1073, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.025271248072385788, + "rewards/margins": 4.773611545562744, + "rewards/rejected": -4.798882484436035, + "step": 2090 + }, + { + "epoch": 1.098901098901099, + "grad_norm": 26.69458051765813, + "learning_rate": 3.9960605357105e-07, + "logits/chosen": -2.754662275314331, + "logits/rejected": -2.672816753387451, + "logps/chosen": -271.67108154296875, + "logps/rejected": -290.9272155761719, + "loss": 0.1017, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.26762229204177856, + "rewards/margins": 4.417377471923828, + "rewards/rejected": -4.684999465942383, + "step": 2100 + }, + { + "epoch": 1.1041339612768184, + "grad_norm": 15.294518524435599, + "learning_rate": 3.983835841092716e-07, + "logits/chosen": -2.7272603511810303, + "logits/rejected": -2.5137414932250977, + "logps/chosen": -303.4950256347656, + "logps/rejected": -244.9526824951172, + "loss": 0.112, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.03372827544808388, + "rewards/margins": 4.539273262023926, + "rewards/rejected": -4.573000907897949, + "step": 2110 + }, + { + "epoch": 1.109366823652538, + "grad_norm": 22.579864784334834, + "learning_rate": 3.971556122342398e-07, + "logits/chosen": -2.7080860137939453, + "logits/rejected": -2.598214626312256, + "logps/chosen": -262.7431945800781, + "logps/rejected": -255.9542999267578, + "loss": 0.1188, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2957598567008972, + "rewards/margins": 3.6235549449920654, + "rewards/rejected": -3.9193148612976074, + "step": 2120 + }, + { + "epoch": 1.1145996860282574, + "grad_norm": 32.3871406216201, + "learning_rate": 3.9592218348204766e-07, + "logits/chosen": -2.754770278930664, + "logits/rejected": -2.6260383129119873, + "logps/chosen": -282.0311584472656, + "logps/rejected": -283.6141052246094, + "loss": 0.094, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.2089064121246338, + "rewards/margins": 3.732781171798706, + "rewards/rejected": -3.9416871070861816, + "step": 2130 + }, + { + "epoch": 1.119832548403977, + "grad_norm": 26.50872335052292, + "learning_rate": 3.946833435911423e-07, + "logits/chosen": -2.7639455795288086, + "logits/rejected": -2.5854954719543457, + "logps/chosen": -241.7471923828125, + "logps/rejected": -255.3394775390625, + "loss": 0.1284, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.10643292963504791, + "rewards/margins": 5.131852149963379, + "rewards/rejected": -5.025418281555176, + "step": 2140 + }, + { + "epoch": 1.1250654107796965, + "grad_norm": 13.796588381910421, + "learning_rate": 3.9343913850062856e-07, + "logits/chosen": -2.653146982192993, + "logits/rejected": -2.7795047760009766, + "logps/chosen": -219.7615966796875, + "logps/rejected": -321.1592712402344, + "loss": 0.1194, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.6270453929901123, + "rewards/margins": 4.35850715637207, + "rewards/rejected": -4.985552787780762, + "step": 2150 + }, + { + "epoch": 1.130298273155416, + "grad_norm": 18.129359689281564, + "learning_rate": 3.921896143485657e-07, + "logits/chosen": -2.6613283157348633, + "logits/rejected": -2.5742406845092773, + "logps/chosen": -274.45330810546875, + "logps/rejected": -293.07037353515625, + "loss": 0.1367, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.5230139493942261, + "rewards/margins": 4.180461883544922, + "rewards/rejected": -4.703475475311279, + "step": 2160 + }, + { + "epoch": 1.1355311355311355, + "grad_norm": 31.59182203614497, + "learning_rate": 3.9093481747025615e-07, + "logits/chosen": -2.8258137702941895, + "logits/rejected": -2.696892023086548, + "logps/chosen": -304.492919921875, + "logps/rejected": -299.60235595703125, + "loss": 0.1077, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.1446405053138733, + "rewards/margins": 4.210850715637207, + "rewards/rejected": -4.3554911613464355, + "step": 2170 + }, + { + "epoch": 1.1407639979068551, + "grad_norm": 16.54672073873017, + "learning_rate": 3.896747943965275e-07, + "logits/chosen": -2.798146963119507, + "logits/rejected": -2.6052534580230713, + "logps/chosen": -246.67373657226562, + "logps/rejected": -273.1455993652344, + "loss": 0.1118, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.24677284061908722, + "rewards/margins": 4.432964324951172, + "rewards/rejected": -4.679737567901611, + "step": 2180 + }, + { + "epoch": 1.1459968602825745, + "grad_norm": 11.0165801283736, + "learning_rate": 3.8840959185200717e-07, + "logits/chosen": -2.6495652198791504, + "logits/rejected": -2.696471691131592, + "logps/chosen": -264.830322265625, + "logps/rejected": -284.7891540527344, + "loss": 0.1036, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.07012466341257095, + "rewards/margins": 4.172719955444336, + "rewards/rejected": -4.102595329284668, + "step": 2190 + }, + { + "epoch": 1.1512297226582942, + "grad_norm": 25.08165850770016, + "learning_rate": 3.871392567533893e-07, + "logits/chosen": -2.758479595184326, + "logits/rejected": -2.6219115257263184, + "logps/chosen": -309.43341064453125, + "logps/rejected": -308.2267761230469, + "loss": 0.0932, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03831760957837105, + "rewards/margins": 4.258543968200684, + "rewards/rejected": -4.220226764678955, + "step": 2200 + }, + { + "epoch": 1.1564625850340136, + "grad_norm": 19.945909449447672, + "learning_rate": 3.858638362076953e-07, + "logits/chosen": -2.649031162261963, + "logits/rejected": -2.521435260772705, + "logps/chosen": -267.05474853515625, + "logps/rejected": -279.1736755371094, + "loss": 0.0981, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.2997003197669983, + "rewards/margins": 4.197286128997803, + "rewards/rejected": -3.8975861072540283, + "step": 2210 + }, + { + "epoch": 1.1616954474097332, + "grad_norm": 8.116785041619545, + "learning_rate": 3.845833775105272e-07, + "logits/chosen": -2.716078996658325, + "logits/rejected": -2.6938929557800293, + "logps/chosen": -250.26473999023438, + "logps/rejected": -304.3114013671875, + "loss": 0.0701, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.47546157240867615, + "rewards/margins": 4.779170989990234, + "rewards/rejected": -4.303709030151367, + "step": 2220 + }, + { + "epoch": 1.1669283097854526, + "grad_norm": 52.93274090068003, + "learning_rate": 3.832979281443133e-07, + "logits/chosen": -2.7947661876678467, + "logits/rejected": -2.7615036964416504, + "logps/chosen": -251.92333984375, + "logps/rejected": -279.75067138671875, + "loss": 0.1208, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.4818398952484131, + "rewards/margins": 4.38036584854126, + "rewards/rejected": -3.8985257148742676, + "step": 2230 + }, + { + "epoch": 1.1721611721611722, + "grad_norm": 14.730493618178814, + "learning_rate": 3.8200753577654765e-07, + "logits/chosen": -2.7494163513183594, + "logits/rejected": -2.624601364135742, + "logps/chosen": -240.26797485351562, + "logps/rejected": -286.29998779296875, + "loss": 0.12, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.16159066557884216, + "rewards/margins": 4.487453460693359, + "rewards/rejected": -4.649044513702393, + "step": 2240 + }, + { + "epoch": 1.1773940345368916, + "grad_norm": 20.5831665916214, + "learning_rate": 3.8071224825802273e-07, + "logits/chosen": -2.821059226989746, + "logits/rejected": -2.809110641479492, + "logps/chosen": -298.7447814941406, + "logps/rejected": -364.2626647949219, + "loss": 0.0916, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.017295408993959427, + "rewards/margins": 4.2460737228393555, + "rewards/rejected": -4.228778839111328, + "step": 2250 + }, + { + "epoch": 1.1826268969126112, + "grad_norm": 21.38960333109125, + "learning_rate": 3.7941211362105453e-07, + "logits/chosen": -2.8299174308776855, + "logits/rejected": -2.7152066230773926, + "logps/chosen": -307.16558837890625, + "logps/rejected": -360.36529541015625, + "loss": 0.1011, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.2989030182361603, + "rewards/margins": 4.798842430114746, + "rewards/rejected": -4.499939918518066, + "step": 2260 + }, + { + "epoch": 1.1878597592883307, + "grad_norm": 24.962369035454596, + "learning_rate": 3.781071800777017e-07, + "logits/chosen": -2.6308321952819824, + "logits/rejected": -2.6083009243011475, + "logps/chosen": -299.7275390625, + "logps/rejected": -337.92413330078125, + "loss": 0.105, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.05873597413301468, + "rewards/margins": 5.657949924468994, + "rewards/rejected": -5.599213123321533, + "step": 2270 + }, + { + "epoch": 1.1930926216640503, + "grad_norm": 30.89580342581049, + "learning_rate": 3.767974960179776e-07, + "logits/chosen": -2.758455514907837, + "logits/rejected": -2.7137277126312256, + "logps/chosen": -249.8431854248047, + "logps/rejected": -286.2932434082031, + "loss": 0.0806, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5361717939376831, + "rewards/margins": 4.667669773101807, + "rewards/rejected": -5.203841686248779, + "step": 2280 + }, + { + "epoch": 1.1983254840397697, + "grad_norm": 30.283751234272458, + "learning_rate": 3.7548311000805605e-07, + "logits/chosen": -2.6532771587371826, + "logits/rejected": -2.6755683422088623, + "logps/chosen": -265.91107177734375, + "logps/rejected": -353.2947692871094, + "loss": 0.1133, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.3525296151638031, + "rewards/margins": 4.7483320236206055, + "rewards/rejected": -5.100861549377441, + "step": 2290 + }, + { + "epoch": 1.2035583464154893, + "grad_norm": 9.859761545067276, + "learning_rate": 3.7416407078847015e-07, + "logits/chosen": -2.8084030151367188, + "logits/rejected": -2.7766425609588623, + "logps/chosen": -293.06781005859375, + "logps/rejected": -352.6015319824219, + "loss": 0.1005, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.38035669922828674, + "rewards/margins": 4.947328567504883, + "rewards/rejected": -5.327685356140137, + "step": 2300 + }, + { + "epoch": 1.2087912087912087, + "grad_norm": 18.03917985289818, + "learning_rate": 3.7284042727230506e-07, + "logits/chosen": -2.787038803100586, + "logits/rejected": -2.644801616668701, + "logps/chosen": -216.9444122314453, + "logps/rejected": -270.38037109375, + "loss": 0.1084, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.41174182295799255, + "rewards/margins": 5.080317497253418, + "rewards/rejected": -5.492059230804443, + "step": 2310 + }, + { + "epoch": 1.2140240711669283, + "grad_norm": 23.654847155769655, + "learning_rate": 3.7151222854338413e-07, + "logits/chosen": -2.8254880905151367, + "logits/rejected": -2.6135454177856445, + "logps/chosen": -311.6871032714844, + "logps/rejected": -318.1632080078125, + "loss": 0.1159, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.1879824846982956, + "rewards/margins": 5.800552845001221, + "rewards/rejected": -5.612570762634277, + "step": 2320 + }, + { + "epoch": 1.2192569335426477, + "grad_norm": 35.04893748896877, + "learning_rate": 3.701795238544488e-07, + "logits/chosen": -2.763002395629883, + "logits/rejected": -2.6701645851135254, + "logps/chosen": -298.36529541015625, + "logps/rejected": -325.75146484375, + "loss": 0.1037, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.038963984698057175, + "rewards/margins": 5.101754665374756, + "rewards/rejected": -5.062790870666504, + "step": 2330 + }, + { + "epoch": 1.2244897959183674, + "grad_norm": 7.125936445736221, + "learning_rate": 3.688423626253318e-07, + "logits/chosen": -2.615354299545288, + "logits/rejected": -2.674558401107788, + "logps/chosen": -211.46652221679688, + "logps/rejected": -271.6877136230469, + "loss": 0.1071, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.3125256896018982, + "rewards/margins": 4.879128456115723, + "rewards/rejected": -5.191654682159424, + "step": 2340 + }, + { + "epoch": 1.2297226582940868, + "grad_norm": 21.624796429967716, + "learning_rate": 3.675007944411253e-07, + "logits/chosen": -2.78330659866333, + "logits/rejected": -2.6863913536071777, + "logps/chosen": -288.99591064453125, + "logps/rejected": -279.0249938964844, + "loss": 0.1455, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.3831740617752075, + "rewards/margins": 4.454309940338135, + "rewards/rejected": -4.071135520935059, + "step": 2350 + }, + { + "epoch": 1.2349555206698064, + "grad_norm": 27.255995328708035, + "learning_rate": 3.6615486905034167e-07, + "logits/chosen": -2.7918009757995605, + "logits/rejected": -2.698138475418091, + "logps/chosen": -304.58099365234375, + "logps/rejected": -295.0577392578125, + "loss": 0.104, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.02554398775100708, + "rewards/margins": 4.034098148345947, + "rewards/rejected": -4.008553981781006, + "step": 2360 + }, + { + "epoch": 1.2401883830455258, + "grad_norm": 61.76929752931291, + "learning_rate": 3.6480463636306846e-07, + "logits/chosen": -2.792266368865967, + "logits/rejected": -2.713168144226074, + "logps/chosen": -319.86151123046875, + "logps/rejected": -338.32769775390625, + "loss": 0.1403, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.04835640266537666, + "rewards/margins": 4.156170845031738, + "rewards/rejected": -4.204527854919434, + "step": 2370 + }, + { + "epoch": 1.2454212454212454, + "grad_norm": 15.002806398318375, + "learning_rate": 3.634501464491183e-07, + "logits/chosen": -2.7825894355773926, + "logits/rejected": -2.7090325355529785, + "logps/chosen": -247.15719604492188, + "logps/rejected": -306.4307556152344, + "loss": 0.088, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.2551979422569275, + "rewards/margins": 4.776516437530518, + "rewards/rejected": -4.5213189125061035, + "step": 2380 + }, + { + "epoch": 1.250654107796965, + "grad_norm": 13.601330406207861, + "learning_rate": 3.6209144953617175e-07, + "logits/chosen": -2.5528080463409424, + "logits/rejected": -2.5722885131835938, + "logps/chosen": -350.04766845703125, + "logps/rejected": -439.79510498046875, + "loss": 0.0795, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.187852144241333, + "rewards/margins": 5.684319496154785, + "rewards/rejected": -5.496466636657715, + "step": 2390 + }, + { + "epoch": 1.2558869701726845, + "grad_norm": 31.574093609154257, + "learning_rate": 3.607285960079146e-07, + "logits/chosen": -2.7976462841033936, + "logits/rejected": -2.683702230453491, + "logps/chosen": -323.38592529296875, + "logps/rejected": -345.6888122558594, + "loss": 0.1042, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.437208890914917, + "rewards/margins": 5.890275001525879, + "rewards/rejected": -5.453065395355225, + "step": 2400 + }, + { + "epoch": 1.2611198325484039, + "grad_norm": 36.854098923875576, + "learning_rate": 3.593616364021701e-07, + "logits/chosen": -2.840149402618408, + "logits/rejected": -2.7021851539611816, + "logps/chosen": -299.9290771484375, + "logps/rejected": -336.386962890625, + "loss": 0.1085, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.1426982879638672, + "rewards/margins": 4.883759021759033, + "rewards/rejected": -5.0264573097229, + "step": 2410 + }, + { + "epoch": 1.2663526949241235, + "grad_norm": 39.2447615051275, + "learning_rate": 3.5799062140902413e-07, + "logits/chosen": -2.70076322555542, + "logits/rejected": -2.5660500526428223, + "logps/chosen": -319.2273254394531, + "logps/rejected": -310.43475341796875, + "loss": 0.1012, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06131463497877121, + "rewards/margins": 5.338719367980957, + "rewards/rejected": -5.277405261993408, + "step": 2420 + }, + { + "epoch": 1.2715855572998431, + "grad_norm": 25.822343644852324, + "learning_rate": 3.566156018689462e-07, + "logits/chosen": -2.757275342941284, + "logits/rejected": -2.4784798622131348, + "logps/chosen": -279.9647521972656, + "logps/rejected": -255.85922241210938, + "loss": 0.1481, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.0078494548797607, + "rewards/margins": 4.228066444396973, + "rewards/rejected": -5.2359161376953125, + "step": 2430 + }, + { + "epoch": 1.2768184196755625, + "grad_norm": 31.654947438021473, + "learning_rate": 3.552366287709038e-07, + "logits/chosen": -2.631016254425049, + "logits/rejected": -2.719831943511963, + "logps/chosen": -314.0375061035156, + "logps/rejected": -351.51788330078125, + "loss": 0.0966, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.245171457529068, + "rewards/margins": 6.115281105041504, + "rewards/rejected": -6.360452651977539, + "step": 2440 + }, + { + "epoch": 1.282051282051282, + "grad_norm": 44.6834650269034, + "learning_rate": 3.5385375325047163e-07, + "logits/chosen": -2.6979851722717285, + "logits/rejected": -2.687857151031494, + "logps/chosen": -265.47509765625, + "logps/rejected": -299.4295654296875, + "loss": 0.0971, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.41958731412887573, + "rewards/margins": 4.668161392211914, + "rewards/rejected": -5.087749481201172, + "step": 2450 + }, + { + "epoch": 1.2872841444270016, + "grad_norm": 16.30151054829352, + "learning_rate": 3.524670265879353e-07, + "logits/chosen": -2.729936122894287, + "logits/rejected": -2.6401467323303223, + "logps/chosen": -230.04672241210938, + "logps/rejected": -255.76913452148438, + "loss": 0.1051, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.09567772597074509, + "rewards/margins": 4.742767810821533, + "rewards/rejected": -4.83844518661499, + "step": 2460 + }, + { + "epoch": 1.2925170068027212, + "grad_norm": 11.429201947078635, + "learning_rate": 3.510765002063901e-07, + "logits/chosen": -2.7029623985290527, + "logits/rejected": -2.683657646179199, + "logps/chosen": -256.6478576660156, + "logps/rejected": -329.90338134765625, + "loss": 0.0976, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.30263346433639526, + "rewards/margins": 5.271126747131348, + "rewards/rejected": -5.573760032653809, + "step": 2470 + }, + { + "epoch": 1.2977498691784406, + "grad_norm": 12.885983344976001, + "learning_rate": 3.4968222566983367e-07, + "logits/chosen": -2.8190150260925293, + "logits/rejected": -2.648803234100342, + "logps/chosen": -260.2120666503906, + "logps/rejected": -254.6842041015625, + "loss": 0.1215, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6284938454627991, + "rewards/margins": 4.271325588226318, + "rewards/rejected": -4.899819374084473, + "step": 2480 + }, + { + "epoch": 1.30298273155416, + "grad_norm": 22.628591752205843, + "learning_rate": 3.482842546812543e-07, + "logits/chosen": -2.772792339324951, + "logits/rejected": -2.627633571624756, + "logps/chosen": -348.0332336425781, + "logps/rejected": -337.7981262207031, + "loss": 0.0892, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25340384244918823, + "rewards/margins": 5.028731346130371, + "rewards/rejected": -5.282134056091309, + "step": 2490 + }, + { + "epoch": 1.3082155939298796, + "grad_norm": 9.442159742249636, + "learning_rate": 3.4688263908071307e-07, + "logits/chosen": -2.669041156768799, + "logits/rejected": -2.577634334564209, + "logps/chosen": -236.87039184570312, + "logps/rejected": -274.69769287109375, + "loss": 0.1074, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.6515072584152222, + "rewards/margins": 4.638780117034912, + "rewards/rejected": -5.290286540985107, + "step": 2500 + }, + { + "epoch": 1.3134484563055993, + "grad_norm": 21.334255686437192, + "learning_rate": 3.454774308434222e-07, + "logits/chosen": -2.718568801879883, + "logits/rejected": -2.6813645362854004, + "logps/chosen": -252.12698364257812, + "logps/rejected": -354.6980285644531, + "loss": 0.1008, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1903163492679596, + "rewards/margins": 5.55415153503418, + "rewards/rejected": -5.7444682121276855, + "step": 2510 + }, + { + "epoch": 1.3186813186813187, + "grad_norm": 23.562620607422794, + "learning_rate": 3.4406868207781725e-07, + "logits/chosen": -2.7359769344329834, + "logits/rejected": -2.6525063514709473, + "logps/chosen": -250.84408569335938, + "logps/rejected": -237.7877655029297, + "loss": 0.1239, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.18924112617969513, + "rewards/margins": 4.812147617340088, + "rewards/rejected": -5.001389026641846, + "step": 2520 + }, + { + "epoch": 1.323914181057038, + "grad_norm": 34.75893074799262, + "learning_rate": 3.426564450236249e-07, + "logits/chosen": -2.726229429244995, + "logits/rejected": -2.536306142807007, + "logps/chosen": -262.1828918457031, + "logps/rejected": -262.5714416503906, + "loss": 0.1047, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.25592824816703796, + "rewards/margins": 4.934244155883789, + "rewards/rejected": -5.1901726722717285, + "step": 2530 + }, + { + "epoch": 1.3291470434327577, + "grad_norm": 41.053053847742135, + "learning_rate": 3.4124077204992576e-07, + "logits/chosen": -2.5745468139648438, + "logits/rejected": -2.548739194869995, + "logps/chosen": -198.30911254882812, + "logps/rejected": -282.16412353515625, + "loss": 0.0918, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.39724957942962646, + "rewards/margins": 5.94715690612793, + "rewards/rejected": -5.549907684326172, + "step": 2540 + }, + { + "epoch": 1.3343799058084773, + "grad_norm": 16.140284709994052, + "learning_rate": 3.398217156532125e-07, + "logits/chosen": -2.7923262119293213, + "logits/rejected": -2.654839038848877, + "logps/chosen": -292.51959228515625, + "logps/rejected": -311.15374755859375, + "loss": 0.081, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.07992169260978699, + "rewards/margins": 5.313913822174072, + "rewards/rejected": -5.393835067749023, + "step": 2550 + }, + { + "epoch": 1.3396127681841967, + "grad_norm": 21.152699853059136, + "learning_rate": 3.383993284554431e-07, + "logits/chosen": -2.7588140964508057, + "logits/rejected": -2.6776671409606934, + "logps/chosen": -268.39166259765625, + "logps/rejected": -297.9541931152344, + "loss": 0.0903, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.17908921837806702, + "rewards/margins": 5.116482734680176, + "rewards/rejected": -5.295571327209473, + "step": 2560 + }, + { + "epoch": 1.3448456305599163, + "grad_norm": 31.131582166759053, + "learning_rate": 3.3697366320208955e-07, + "logits/chosen": -2.6680309772491455, + "logits/rejected": -2.5980541706085205, + "logps/chosen": -304.59014892578125, + "logps/rejected": -317.4519958496094, + "loss": 0.0835, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.25217151641845703, + "rewards/margins": 4.94983434677124, + "rewards/rejected": -5.202005863189697, + "step": 2570 + }, + { + "epoch": 1.3500784929356358, + "grad_norm": 23.014778992969436, + "learning_rate": 3.355447727601816e-07, + "logits/chosen": -2.6754276752471924, + "logits/rejected": -2.528729200363159, + "logps/chosen": -262.725341796875, + "logps/rejected": -312.8125305175781, + "loss": 0.1066, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.5141451358795166, + "rewards/margins": 5.295892238616943, + "rewards/rejected": -5.810038089752197, + "step": 2580 + }, + { + "epoch": 1.3553113553113554, + "grad_norm": 32.05773769051717, + "learning_rate": 3.3411271011634697e-07, + "logits/chosen": -2.6619973182678223, + "logits/rejected": -2.695253610610962, + "logps/chosen": -320.0899658203125, + "logps/rejected": -372.51043701171875, + "loss": 0.1251, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.9629742503166199, + "rewards/margins": 4.419332027435303, + "rewards/rejected": -5.382306098937988, + "step": 2590 + }, + { + "epoch": 1.3605442176870748, + "grad_norm": 25.542660245310532, + "learning_rate": 3.3267752837484587e-07, + "logits/chosen": -2.617685556411743, + "logits/rejected": -2.566694498062134, + "logps/chosen": -241.1839599609375, + "logps/rejected": -275.4180908203125, + "loss": 0.1188, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5535500645637512, + "rewards/margins": 4.251681327819824, + "rewards/rejected": -4.805230140686035, + "step": 2600 + }, + { + "epoch": 1.3657770800627944, + "grad_norm": 43.00702974824487, + "learning_rate": 3.31239280755602e-07, + "logits/chosen": -2.699061870574951, + "logits/rejected": -2.583082675933838, + "logps/chosen": -310.5912170410156, + "logps/rejected": -307.32171630859375, + "loss": 0.1019, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.540542483329773, + "rewards/margins": 4.341391086578369, + "rewards/rejected": -4.881933689117432, + "step": 2610 + }, + { + "epoch": 1.3710099424385138, + "grad_norm": 18.9420177678843, + "learning_rate": 3.2979802059222936e-07, + "logits/chosen": -2.711057662963867, + "logits/rejected": -2.570793867111206, + "logps/chosen": -295.284423828125, + "logps/rejected": -278.60174560546875, + "loss": 0.1096, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.2152744084596634, + "rewards/margins": 4.452646732330322, + "rewards/rejected": -4.6679205894470215, + "step": 2620 + }, + { + "epoch": 1.3762428048142334, + "grad_norm": 30.053089697599695, + "learning_rate": 3.283538013300537e-07, + "logits/chosen": -2.5685324668884277, + "logits/rejected": -2.5950772762298584, + "logps/chosen": -223.4586944580078, + "logps/rejected": -321.4066467285156, + "loss": 0.0852, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.5875605344772339, + "rewards/margins": 4.971282005310059, + "rewards/rejected": -5.558842658996582, + "step": 2630 + }, + { + "epoch": 1.3814756671899528, + "grad_norm": 17.373110941185526, + "learning_rate": 3.269066765241314e-07, + "logits/chosen": -2.748260974884033, + "logits/rejected": -2.680387496948242, + "logps/chosen": -284.13665771484375, + "logps/rejected": -295.009033203125, + "loss": 0.092, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.7688173055648804, + "rewards/margins": 4.525506019592285, + "rewards/rejected": -5.294323921203613, + "step": 2640 + }, + { + "epoch": 1.3867085295656725, + "grad_norm": 41.146953621312406, + "learning_rate": 3.254566998372634e-07, + "logits/chosen": -2.5892138481140137, + "logits/rejected": -2.644347667694092, + "logps/chosen": -216.08114624023438, + "logps/rejected": -313.13763427734375, + "loss": 0.1371, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.8023455739021301, + "rewards/margins": 5.653847694396973, + "rewards/rejected": -6.456193447113037, + "step": 2650 + }, + { + "epoch": 1.3919413919413919, + "grad_norm": 25.812159427086776, + "learning_rate": 3.2400392503800477e-07, + "logits/chosen": -2.6864781379699707, + "logits/rejected": -2.681652069091797, + "logps/chosen": -313.7518005371094, + "logps/rejected": -412.9150390625, + "loss": 0.1002, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.4538533091545105, + "rewards/margins": 4.896275520324707, + "rewards/rejected": -5.350129127502441, + "step": 2660 + }, + { + "epoch": 1.3971742543171115, + "grad_norm": 11.560347498002699, + "learning_rate": 3.225484059986715e-07, + "logits/chosen": -2.7019784450531006, + "logits/rejected": -2.574402093887329, + "logps/chosen": -263.1709899902344, + "logps/rejected": -301.37896728515625, + "loss": 0.0824, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5435574054718018, + "rewards/margins": 4.627568244934082, + "rewards/rejected": -5.1711249351501465, + "step": 2670 + }, + { + "epoch": 1.402407116692831, + "grad_norm": 36.77469677124266, + "learning_rate": 3.2109019669334215e-07, + "logits/chosen": -2.6268675327301025, + "logits/rejected": -2.5382723808288574, + "logps/chosen": -345.9691467285156, + "logps/rejected": -365.1632080078125, + "loss": 0.1124, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.5941765904426575, + "rewards/margins": 5.553042411804199, + "rewards/rejected": -6.147218704223633, + "step": 2680 + }, + { + "epoch": 1.4076399790685505, + "grad_norm": 20.51859627224848, + "learning_rate": 3.19629351195857e-07, + "logits/chosen": -2.6670846939086914, + "logits/rejected": -2.5615813732147217, + "logps/chosen": -259.65057373046875, + "logps/rejected": -329.07183837890625, + "loss": 0.1092, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14442971348762512, + "rewards/margins": 5.420745372772217, + "rewards/rejected": -5.565175533294678, + "step": 2690 + }, + { + "epoch": 1.41287284144427, + "grad_norm": 22.506796085146252, + "learning_rate": 3.1816592367781236e-07, + "logits/chosen": -2.629802942276001, + "logits/rejected": -2.4548943042755127, + "logps/chosen": -325.84893798828125, + "logps/rejected": -310.5500183105469, + "loss": 0.0889, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.2448773384094238, + "rewards/margins": 4.603526592254639, + "rewards/rejected": -5.848404407501221, + "step": 2700 + }, + { + "epoch": 1.4181057038199896, + "grad_norm": 30.24510148007379, + "learning_rate": 3.166999684065521e-07, + "logits/chosen": -2.6534583568573, + "logits/rejected": -2.5290088653564453, + "logps/chosen": -265.690185546875, + "logps/rejected": -276.3821716308594, + "loss": 0.1173, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.9110490679740906, + "rewards/margins": 4.266907691955566, + "rewards/rejected": -5.177957057952881, + "step": 2710 + }, + { + "epoch": 1.423338566195709, + "grad_norm": 14.93135102150587, + "learning_rate": 3.1523153974315497e-07, + "logits/chosen": -2.67543625831604, + "logits/rejected": -2.6198840141296387, + "logps/chosen": -276.8697814941406, + "logps/rejected": -302.6800842285156, + "loss": 0.1174, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.37720829248428345, + "rewards/margins": 5.005181312561035, + "rewards/rejected": -5.382389545440674, + "step": 2720 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 63.23687917036645, + "learning_rate": 3.137606921404191e-07, + "logits/chosen": -2.613619327545166, + "logits/rejected": -2.511970043182373, + "logps/chosen": -288.4800109863281, + "logps/rejected": -270.8289489746094, + "loss": 0.1467, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.8941848874092102, + "rewards/margins": 3.921539783477783, + "rewards/rejected": -4.815724849700928, + "step": 2730 + }, + { + "epoch": 1.433804290947148, + "grad_norm": 41.96001880791081, + "learning_rate": 3.1228748014084243e-07, + "logits/chosen": -2.45249342918396, + "logits/rejected": -2.4123005867004395, + "logps/chosen": -288.26263427734375, + "logps/rejected": -298.2550354003906, + "loss": 0.1287, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.5363945960998535, + "rewards/margins": 4.373542785644531, + "rewards/rejected": -4.909936904907227, + "step": 2740 + }, + { + "epoch": 1.4390371533228676, + "grad_norm": 11.588371849208805, + "learning_rate": 3.108119583746005e-07, + "logits/chosen": -2.555427312850952, + "logits/rejected": -2.5331249237060547, + "logps/chosen": -234.6470947265625, + "logps/rejected": -295.04730224609375, + "loss": 0.1127, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.062489427626132965, + "rewards/margins": 5.1241350173950195, + "rewards/rejected": -5.061646461486816, + "step": 2750 + }, + { + "epoch": 1.4442700156985873, + "grad_norm": 28.25560385042178, + "learning_rate": 3.093341815575202e-07, + "logits/chosen": -2.606266498565674, + "logits/rejected": -2.4748575687408447, + "logps/chosen": -277.20196533203125, + "logps/rejected": -246.5891571044922, + "loss": 0.089, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.3125464916229248, + "rewards/margins": 4.526204586029053, + "rewards/rejected": -4.838751792907715, + "step": 2760 + }, + { + "epoch": 1.4495028780743067, + "grad_norm": 13.776303617966635, + "learning_rate": 3.078542044890513e-07, + "logits/chosen": -2.6697518825531006, + "logits/rejected": -2.4930496215820312, + "logps/chosen": -332.73040771484375, + "logps/rejected": -347.8982238769531, + "loss": 0.1193, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.4222649037837982, + "rewards/margins": 5.524443626403809, + "rewards/rejected": -5.946708679199219, + "step": 2770 + }, + { + "epoch": 1.454735740450026, + "grad_norm": 19.363326441674072, + "learning_rate": 3.0637208205023386e-07, + "logits/chosen": -2.771562099456787, + "logits/rejected": -2.588222026824951, + "logps/chosen": -310.1404724121094, + "logps/rejected": -287.75677490234375, + "loss": 0.1085, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.3928970396518707, + "rewards/margins": 4.459607124328613, + "rewards/rejected": -4.852504730224609, + "step": 2780 + }, + { + "epoch": 1.4599686028257457, + "grad_norm": 15.266068407971215, + "learning_rate": 3.0488786920166343e-07, + "logits/chosen": -2.6312241554260254, + "logits/rejected": -2.6789093017578125, + "logps/chosen": -300.2938232421875, + "logps/rejected": -374.1767883300781, + "loss": 0.0948, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.1141195073723793, + "rewards/margins": 5.68836784362793, + "rewards/rejected": -5.574248313903809, + "step": 2790 + }, + { + "epoch": 1.4652014652014653, + "grad_norm": 25.032616813940045, + "learning_rate": 3.034016209814529e-07, + "logits/chosen": -2.6470537185668945, + "logits/rejected": -2.5894312858581543, + "logps/chosen": -265.81683349609375, + "logps/rejected": -308.41424560546875, + "loss": 0.1024, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.5616492033004761, + "rewards/margins": 4.933659076690674, + "rewards/rejected": -5.495308876037598, + "step": 2800 + }, + { + "epoch": 1.4704343275771847, + "grad_norm": 17.419430600072115, + "learning_rate": 3.0191339250319147e-07, + "logits/chosen": -2.668083906173706, + "logits/rejected": -2.698418378829956, + "logps/chosen": -287.32818603515625, + "logps/rejected": -362.6585388183594, + "loss": 0.0869, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.04601895809173584, + "rewards/margins": 5.480045795440674, + "rewards/rejected": -5.526064872741699, + "step": 2810 + }, + { + "epoch": 1.4756671899529041, + "grad_norm": 17.132397160309466, + "learning_rate": 3.004232389539011e-07, + "logits/chosen": -2.7672345638275146, + "logits/rejected": -2.6934361457824707, + "logps/chosen": -272.5774841308594, + "logps/rejected": -327.59991455078125, + "loss": 0.0906, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.19477109611034393, + "rewards/margins": 6.1975226402282715, + "rewards/rejected": -6.392293930053711, + "step": 2820 + }, + { + "epoch": 1.4809000523286238, + "grad_norm": 6.197135471084547, + "learning_rate": 2.989312155919898e-07, + "logits/chosen": -2.6563096046447754, + "logits/rejected": -2.5722525119781494, + "logps/chosen": -264.55035400390625, + "logps/rejected": -326.4817199707031, + "loss": 0.1031, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.24614736437797546, + "rewards/margins": 4.663209915161133, + "rewards/rejected": -4.909357070922852, + "step": 2830 + }, + { + "epoch": 1.4861329147043434, + "grad_norm": 23.032654080389246, + "learning_rate": 2.9743737774520266e-07, + "logits/chosen": -2.6749980449676514, + "logits/rejected": -2.6537280082702637, + "logps/chosen": -272.3828430175781, + "logps/rejected": -325.3900451660156, + "loss": 0.1257, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11704929172992706, + "rewards/margins": 6.074965953826904, + "rewards/rejected": -5.957917213439941, + "step": 2840 + }, + { + "epoch": 1.4913657770800628, + "grad_norm": 21.090303903647936, + "learning_rate": 2.959417808085702e-07, + "logits/chosen": -2.61649751663208, + "logits/rejected": -2.6308839321136475, + "logps/chosen": -227.5465850830078, + "logps/rejected": -272.5039367675781, + "loss": 0.0964, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.8368734121322632, + "rewards/margins": 4.769274711608887, + "rewards/rejected": -5.6061482429504395, + "step": 2850 + }, + { + "epoch": 1.4965986394557822, + "grad_norm": 29.6313494223259, + "learning_rate": 2.944444802423542e-07, + "logits/chosen": -2.805844306945801, + "logits/rejected": -2.721168041229248, + "logps/chosen": -317.97711181640625, + "logps/rejected": -380.91412353515625, + "loss": 0.0985, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21231107413768768, + "rewards/margins": 5.917481899261475, + "rewards/rejected": -6.129792213439941, + "step": 2860 + }, + { + "epoch": 1.5018315018315018, + "grad_norm": 22.819722944342093, + "learning_rate": 2.929455315699908e-07, + "logits/chosen": -2.660499334335327, + "logits/rejected": -2.4537293910980225, + "logps/chosen": -310.8629455566406, + "logps/rejected": -357.47894287109375, + "loss": 0.1059, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.07246246188879013, + "rewards/margins": 6.156960964202881, + "rewards/rejected": -6.229423522949219, + "step": 2870 + }, + { + "epoch": 1.5070643642072215, + "grad_norm": 15.35991384284126, + "learning_rate": 2.9144499037603204e-07, + "logits/chosen": -2.739063262939453, + "logits/rejected": -2.6308000087738037, + "logps/chosen": -257.1328430175781, + "logps/rejected": -290.7419128417969, + "loss": 0.1176, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.5068913698196411, + "rewards/margins": 5.11258602142334, + "rewards/rejected": -5.61947774887085, + "step": 2880 + }, + { + "epoch": 1.5122972265829409, + "grad_norm": 17.13609231258807, + "learning_rate": 2.899429123040843e-07, + "logits/chosen": -2.7439727783203125, + "logits/rejected": -2.716459274291992, + "logps/chosen": -262.11224365234375, + "logps/rejected": -316.2181091308594, + "loss": 0.0886, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5449376702308655, + "rewards/margins": 4.632278919219971, + "rewards/rejected": -5.177216529846191, + "step": 2890 + }, + { + "epoch": 1.5175300889586603, + "grad_norm": 42.3086232496891, + "learning_rate": 2.884393530547452e-07, + "logits/chosen": -2.8514835834503174, + "logits/rejected": -2.7157387733459473, + "logps/chosen": -298.56719970703125, + "logps/rejected": -331.87890625, + "loss": 0.1074, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.17806626856327057, + "rewards/margins": 4.861222267150879, + "rewards/rejected": -4.6831560134887695, + "step": 2900 + }, + { + "epoch": 1.5227629513343799, + "grad_norm": 23.22923804226432, + "learning_rate": 2.869343683835376e-07, + "logits/chosen": -2.709092140197754, + "logits/rejected": -2.59346342086792, + "logps/chosen": -239.79421997070312, + "logps/rejected": -342.177978515625, + "loss": 0.09, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.3563653826713562, + "rewards/margins": 5.73477840423584, + "rewards/rejected": -6.09114408493042, + "step": 2910 + }, + { + "epoch": 1.5279958137100995, + "grad_norm": 6.575306568756043, + "learning_rate": 2.8542801409884253e-07, + "logits/chosen": -2.766979932785034, + "logits/rejected": -2.666149139404297, + "logps/chosen": -330.9672546386719, + "logps/rejected": -371.9808349609375, + "loss": 0.0779, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.5268090963363647, + "rewards/margins": 4.619814872741699, + "rewards/rejected": -5.1466240882873535, + "step": 2920 + }, + { + "epoch": 1.533228676085819, + "grad_norm": 29.973424802649863, + "learning_rate": 2.839203460598297e-07, + "logits/chosen": -2.7906851768493652, + "logits/rejected": -2.756540298461914, + "logps/chosen": -349.83331298828125, + "logps/rejected": -379.7663269042969, + "loss": 0.1136, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.3085012137889862, + "rewards/margins": 5.198960304260254, + "rewards/rejected": -5.5074615478515625, + "step": 2930 + }, + { + "epoch": 1.5384615384615383, + "grad_norm": 11.458737713264366, + "learning_rate": 2.8241142017438557e-07, + "logits/chosen": -2.7677807807922363, + "logits/rejected": -2.7230429649353027, + "logps/chosen": -320.4291687011719, + "logps/rejected": -336.20355224609375, + "loss": 0.0997, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.15537458658218384, + "rewards/margins": 5.858824253082275, + "rewards/rejected": -5.7034502029418945, + "step": 2940 + }, + { + "epoch": 1.543694400837258, + "grad_norm": 22.881053961998443, + "learning_rate": 2.8090129239704083e-07, + "logits/chosen": -2.7229790687561035, + "logits/rejected": -2.5809950828552246, + "logps/chosen": -307.8843078613281, + "logps/rejected": -268.83984375, + "loss": 0.1282, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.1330840587615967, + "rewards/margins": 4.647091388702393, + "rewards/rejected": -5.780175685882568, + "step": 2950 + }, + { + "epoch": 1.5489272632129776, + "grad_norm": 14.391989006889863, + "learning_rate": 2.7939001872689496e-07, + "logits/chosen": -2.6156535148620605, + "logits/rejected": -2.547813653945923, + "logps/chosen": -214.3231658935547, + "logps/rejected": -244.09432983398438, + "loss": 0.1027, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9809154272079468, + "rewards/margins": 4.217734336853027, + "rewards/rejected": -5.198649883270264, + "step": 2960 + }, + { + "epoch": 1.554160125588697, + "grad_norm": 47.42639642849719, + "learning_rate": 2.778776552055398e-07, + "logits/chosen": -2.628086566925049, + "logits/rejected": -2.4464826583862305, + "logps/chosen": -306.44354248046875, + "logps/rejected": -310.10003662109375, + "loss": 0.0903, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.7761075496673584, + "rewards/margins": 4.894813537597656, + "rewards/rejected": -5.6709208488464355, + "step": 2970 + }, + { + "epoch": 1.5593929879644164, + "grad_norm": 26.12349026205923, + "learning_rate": 2.763642579149817e-07, + "logits/chosen": -2.5449676513671875, + "logits/rejected": -2.521742343902588, + "logps/chosen": -250.126953125, + "logps/rejected": -307.7526550292969, + "loss": 0.1035, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.5303369760513306, + "rewards/margins": 4.585556983947754, + "rewards/rejected": -5.115893840789795, + "step": 2980 + }, + { + "epoch": 1.564625850340136, + "grad_norm": 55.23272129571936, + "learning_rate": 2.748498829755615e-07, + "logits/chosen": -2.6269307136535645, + "logits/rejected": -2.5769810676574707, + "logps/chosen": -268.09967041015625, + "logps/rejected": -364.55712890625, + "loss": 0.0956, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.2840079367160797, + "rewards/margins": 5.313620090484619, + "rewards/rejected": -5.597627639770508, + "step": 2990 + }, + { + "epoch": 1.5698587127158556, + "grad_norm": 16.075641960102608, + "learning_rate": 2.7333458654387344e-07, + "logits/chosen": -2.7007858753204346, + "logits/rejected": -2.648383855819702, + "logps/chosen": -303.09234619140625, + "logps/rejected": -318.8572082519531, + "loss": 0.0843, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.17780855298042297, + "rewards/margins": 5.029606819152832, + "rewards/rejected": -5.207415580749512, + "step": 3000 + }, + { + "epoch": 1.575091575091575, + "grad_norm": 23.826973600299446, + "learning_rate": 2.718184248106828e-07, + "logits/chosen": -2.797697067260742, + "logits/rejected": -2.664506435394287, + "logps/chosen": -342.29132080078125, + "logps/rejected": -380.3262023925781, + "loss": 0.0953, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23088698089122772, + "rewards/margins": 5.9646759033203125, + "rewards/rejected": -6.195563316345215, + "step": 3010 + }, + { + "epoch": 1.5803244374672945, + "grad_norm": 23.5189098138642, + "learning_rate": 2.7030145399884275e-07, + "logits/chosen": -2.7051501274108887, + "logits/rejected": -2.552473783493042, + "logps/chosen": -363.2586364746094, + "logps/rejected": -355.2478942871094, + "loss": 0.1077, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.46280232071876526, + "rewards/margins": 4.690756320953369, + "rewards/rejected": -5.153558254241943, + "step": 3020 + }, + { + "epoch": 1.585557299843014, + "grad_norm": 19.565421791870527, + "learning_rate": 2.687837303612085e-07, + "logits/chosen": -2.8115456104278564, + "logits/rejected": -2.6637587547302246, + "logps/chosen": -344.15252685546875, + "logps/rejected": -364.0646057128906, + "loss": 0.1045, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.13493283092975616, + "rewards/margins": 5.432315349578857, + "rewards/rejected": -5.567248344421387, + "step": 3030 + }, + { + "epoch": 1.5907901622187337, + "grad_norm": 21.638865724708594, + "learning_rate": 2.672653101785519e-07, + "logits/chosen": -2.5982906818389893, + "logits/rejected": -2.5806329250335693, + "logps/chosen": -305.8501892089844, + "logps/rejected": -348.97747802734375, + "loss": 0.0805, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1506006270647049, + "rewards/margins": 5.457698345184326, + "rewards/rejected": -5.6082987785339355, + "step": 3040 + }, + { + "epoch": 1.5960230245944533, + "grad_norm": 21.673302232000832, + "learning_rate": 2.657462497574747e-07, + "logits/chosen": -2.7238268852233887, + "logits/rejected": -2.709031581878662, + "logps/chosen": -238.9906005859375, + "logps/rejected": -276.9991760253906, + "loss": 0.0811, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.5081533193588257, + "rewards/margins": 4.087741374969482, + "rewards/rejected": -4.595894813537598, + "step": 3050 + }, + { + "epoch": 1.6012558869701727, + "grad_norm": 5.575995707313206, + "learning_rate": 2.642266054283198e-07, + "logits/chosen": -2.7808871269226074, + "logits/rejected": -2.5478625297546387, + "logps/chosen": -366.7637023925781, + "logps/rejected": -281.3974914550781, + "loss": 0.0798, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06902514398097992, + "rewards/margins": 5.459024429321289, + "rewards/rejected": -5.528049468994141, + "step": 3060 + }, + { + "epoch": 1.6064887493458921, + "grad_norm": 24.531441519428913, + "learning_rate": 2.627064335430829e-07, + "logits/chosen": -2.713057279586792, + "logits/rejected": -2.571733236312866, + "logps/chosen": -321.02972412109375, + "logps/rejected": -332.35870361328125, + "loss": 0.0769, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3989798128604889, + "rewards/margins": 5.438555717468262, + "rewards/rejected": -5.837535858154297, + "step": 3070 + }, + { + "epoch": 1.6117216117216118, + "grad_norm": 32.83788593503646, + "learning_rate": 2.611857904733227e-07, + "logits/chosen": -2.697803497314453, + "logits/rejected": -2.5155491828918457, + "logps/chosen": -309.11212158203125, + "logps/rejected": -297.1321716308594, + "loss": 0.0916, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.6532170176506042, + "rewards/margins": 5.0370306968688965, + "rewards/rejected": -5.690248012542725, + "step": 3080 + }, + { + "epoch": 1.6169544740973314, + "grad_norm": 35.85854683798888, + "learning_rate": 2.5966473260807076e-07, + "logits/chosen": -2.7497193813323975, + "logits/rejected": -2.6400251388549805, + "logps/chosen": -353.42706298828125, + "logps/rejected": -394.00653076171875, + "loss": 0.0988, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.09235533326864243, + "rewards/margins": 6.314970970153809, + "rewards/rejected": -6.407326698303223, + "step": 3090 + }, + { + "epoch": 1.6221873364730508, + "grad_norm": 24.863574110452976, + "learning_rate": 2.5814331635173987e-07, + "logits/chosen": -2.6918444633483887, + "logits/rejected": -2.622985601425171, + "logps/chosen": -312.4832763671875, + "logps/rejected": -347.6034240722656, + "loss": 0.1311, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.5141621828079224, + "rewards/margins": 4.177474498748779, + "rewards/rejected": -4.6916375160217285, + "step": 3100 + }, + { + "epoch": 1.6274201988487702, + "grad_norm": 43.07833472677117, + "learning_rate": 2.566215981220331e-07, + "logits/chosen": -2.6178548336029053, + "logits/rejected": -2.5270726680755615, + "logps/chosen": -311.21502685546875, + "logps/rejected": -358.38018798828125, + "loss": 0.105, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.01169753074646, + "rewards/margins": 4.856429100036621, + "rewards/rejected": -5.868125915527344, + "step": 3110 + }, + { + "epoch": 1.6326530612244898, + "grad_norm": 26.45332527831414, + "learning_rate": 2.550996343478514e-07, + "logits/chosen": -2.633014440536499, + "logits/rejected": -2.5973496437072754, + "logps/chosen": -304.20880126953125, + "logps/rejected": -340.53369140625, + "loss": 0.1131, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.1011536568403244, + "rewards/margins": 5.776803016662598, + "rewards/rejected": -5.877956390380859, + "step": 3120 + }, + { + "epoch": 1.6378859236002095, + "grad_norm": 10.618056558670146, + "learning_rate": 2.5357748146720076e-07, + "logits/chosen": -2.6644821166992188, + "logits/rejected": -2.5023844242095947, + "logps/chosen": -209.9169158935547, + "logps/rejected": -265.97967529296875, + "loss": 0.0813, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.550481915473938, + "rewards/margins": 5.262048721313477, + "rewards/rejected": -5.812530517578125, + "step": 3130 + }, + { + "epoch": 1.6431187859759289, + "grad_norm": 28.997064937091526, + "learning_rate": 2.5205519592509993e-07, + "logits/chosen": -2.6454596519470215, + "logits/rejected": -2.534219264984131, + "logps/chosen": -272.67926025390625, + "logps/rejected": -313.68365478515625, + "loss": 0.1097, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.5680716633796692, + "rewards/margins": 5.149653434753418, + "rewards/rejected": -5.717724800109863, + "step": 3140 + }, + { + "epoch": 1.6483516483516483, + "grad_norm": 29.98175418753011, + "learning_rate": 2.505328341714873e-07, + "logits/chosen": -2.7715792655944824, + "logits/rejected": -2.577012538909912, + "logps/chosen": -317.41448974609375, + "logps/rejected": -342.84649658203125, + "loss": 0.0798, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03935631364583969, + "rewards/margins": 6.34078311920166, + "rewards/rejected": -6.301426410675049, + "step": 3150 + }, + { + "epoch": 1.653584510727368, + "grad_norm": 30.141923742595296, + "learning_rate": 2.4901045265912687e-07, + "logits/chosen": -2.7321019172668457, + "logits/rejected": -2.657607078552246, + "logps/chosen": -314.8836975097656, + "logps/rejected": -374.3489685058594, + "loss": 0.1033, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.007449048571288586, + "rewards/margins": 6.138455390930176, + "rewards/rejected": -6.145905017852783, + "step": 3160 + }, + { + "epoch": 1.6588173731030875, + "grad_norm": 24.24799993269991, + "learning_rate": 2.4748810784151555e-07, + "logits/chosen": -2.685410976409912, + "logits/rejected": -2.6096012592315674, + "logps/chosen": -332.5256042480469, + "logps/rejected": -311.61358642578125, + "loss": 0.0815, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7060562372207642, + "rewards/margins": 4.939300537109375, + "rewards/rejected": -5.645357131958008, + "step": 3170 + }, + { + "epoch": 1.664050235478807, + "grad_norm": 38.71590851754405, + "learning_rate": 2.459658561707898e-07, + "logits/chosen": -2.694617986679077, + "logits/rejected": -2.61140513420105, + "logps/chosen": -319.1321716308594, + "logps/rejected": -358.1539611816406, + "loss": 0.0954, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.4491146504878998, + "rewards/margins": 4.88021993637085, + "rewards/rejected": -5.329334735870361, + "step": 3180 + }, + { + "epoch": 1.6692830978545263, + "grad_norm": 27.476389343518058, + "learning_rate": 2.4444375409563145e-07, + "logits/chosen": -2.716930627822876, + "logits/rejected": -2.5793745517730713, + "logps/chosen": -314.3849182128906, + "logps/rejected": -336.908203125, + "loss": 0.0794, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.6793092489242554, + "rewards/margins": 5.401398658752441, + "rewards/rejected": -6.080708026885986, + "step": 3190 + }, + { + "epoch": 1.674515960230246, + "grad_norm": 25.09742034299905, + "learning_rate": 2.429218580591753e-07, + "logits/chosen": -2.5837604999542236, + "logits/rejected": -2.4644665718078613, + "logps/chosen": -329.11737060546875, + "logps/rejected": -298.93109130859375, + "loss": 0.1147, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.27274736762046814, + "rewards/margins": 5.840318202972412, + "rewards/rejected": -6.113064765930176, + "step": 3200 + }, + { + "epoch": 1.6797488226059656, + "grad_norm": 48.48521359435761, + "learning_rate": 2.414002244969158e-07, + "logits/chosen": -2.553994655609131, + "logits/rejected": -2.5049333572387695, + "logps/chosen": -283.4375, + "logps/rejected": -335.94732666015625, + "loss": 0.1058, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.013826847076416, + "rewards/margins": 6.045845031738281, + "rewards/rejected": -7.059671878814697, + "step": 3210 + }, + { + "epoch": 1.684981684981685, + "grad_norm": 25.83087216367204, + "learning_rate": 2.3987890983461403e-07, + "logits/chosen": -2.7287845611572266, + "logits/rejected": -2.6246705055236816, + "logps/chosen": -321.7281799316406, + "logps/rejected": -388.2299499511719, + "loss": 0.0967, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.6154420375823975, + "rewards/margins": 5.903476238250732, + "rewards/rejected": -6.518918037414551, + "step": 3220 + }, + { + "epoch": 1.6902145473574044, + "grad_norm": 26.08572202767128, + "learning_rate": 2.3835797048620564e-07, + "logits/chosen": -2.754236936569214, + "logits/rejected": -2.670435905456543, + "logps/chosen": -292.9166564941406, + "logps/rejected": -303.14361572265625, + "loss": 0.1092, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6038793325424194, + "rewards/margins": 4.955503940582275, + "rewards/rejected": -5.559382915496826, + "step": 3230 + }, + { + "epoch": 1.695447409733124, + "grad_norm": 52.292973177723304, + "learning_rate": 2.368374628517088e-07, + "logits/chosen": -2.5498924255371094, + "logits/rejected": -2.4609293937683105, + "logps/chosen": -298.534423828125, + "logps/rejected": -317.09130859375, + "loss": 0.121, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5181295275688171, + "rewards/margins": 5.679882049560547, + "rewards/rejected": -6.198011875152588, + "step": 3240 + }, + { + "epoch": 1.7006802721088436, + "grad_norm": 6.4370144608987845, + "learning_rate": 2.3531744331513247e-07, + "logits/chosen": -2.6444571018218994, + "logits/rejected": -2.669032335281372, + "logps/chosen": -245.9746856689453, + "logps/rejected": -309.67620849609375, + "loss": 0.0912, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.6865631341934204, + "rewards/margins": 5.3827409744262695, + "rewards/rejected": -6.0693039894104, + "step": 3250 + }, + { + "epoch": 1.705913134484563, + "grad_norm": 54.288581382516014, + "learning_rate": 2.3379796824238608e-07, + "logits/chosen": -2.6092073917388916, + "logits/rejected": -2.5859627723693848, + "logps/chosen": -235.48385620117188, + "logps/rejected": -263.9149169921875, + "loss": 0.1273, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.5413236618041992, + "rewards/margins": 4.505577564239502, + "rewards/rejected": -6.046900749206543, + "step": 3260 + }, + { + "epoch": 1.7111459968602825, + "grad_norm": 20.066394526978875, + "learning_rate": 2.3227909397918894e-07, + "logits/chosen": -2.861052989959717, + "logits/rejected": -2.7720844745635986, + "logps/chosen": -339.01605224609375, + "logps/rejected": -389.1539001464844, + "loss": 0.0829, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003049397375434637, + "rewards/margins": 6.6782331466674805, + "rewards/rejected": -6.681282997131348, + "step": 3270 + }, + { + "epoch": 1.716378859236002, + "grad_norm": 36.56452804031963, + "learning_rate": 2.3076087684898076e-07, + "logits/chosen": -2.6908583641052246, + "logits/rejected": -2.5621867179870605, + "logps/chosen": -282.6653137207031, + "logps/rejected": -338.23291015625, + "loss": 0.1145, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.5438255667686462, + "rewards/margins": 5.641651153564453, + "rewards/rejected": -6.185477256774902, + "step": 3280 + }, + { + "epoch": 1.7216117216117217, + "grad_norm": 58.679378010104585, + "learning_rate": 2.2924337315083353e-07, + "logits/chosen": -2.7397890090942383, + "logits/rejected": -2.5919830799102783, + "logps/chosen": -377.1933898925781, + "logps/rejected": -388.566162109375, + "loss": 0.0779, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.14386649429798126, + "rewards/margins": 5.803525447845459, + "rewards/rejected": -5.947391986846924, + "step": 3290 + }, + { + "epoch": 1.7268445839874411, + "grad_norm": 22.7359891911816, + "learning_rate": 2.277266391573633e-07, + "logits/chosen": -2.6971657276153564, + "logits/rejected": -2.660691738128662, + "logps/chosen": -338.55120849609375, + "logps/rejected": -340.1263732910156, + "loss": 0.0776, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.48269566893577576, + "rewards/margins": 7.384807586669922, + "rewards/rejected": -6.902112007141113, + "step": 3300 + }, + { + "epoch": 1.7320774463631605, + "grad_norm": 51.56529486380081, + "learning_rate": 2.2621073111264357e-07, + "logits/chosen": -2.5473222732543945, + "logits/rejected": -2.5487756729125977, + "logps/chosen": -273.0016784667969, + "logps/rejected": -290.00872802734375, + "loss": 0.0931, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2249443233013153, + "rewards/margins": 5.341296672821045, + "rewards/rejected": -5.5662407875061035, + "step": 3310 + }, + { + "epoch": 1.7373103087388801, + "grad_norm": 29.535893549004907, + "learning_rate": 2.2469570523011993e-07, + "logits/chosen": -2.6070313453674316, + "logits/rejected": -2.6149723529815674, + "logps/chosen": -270.3302917480469, + "logps/rejected": -319.1665344238281, + "loss": 0.096, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.9645935893058777, + "rewards/margins": 4.5530548095703125, + "rewards/rejected": -5.517648696899414, + "step": 3320 + }, + { + "epoch": 1.7425431711145998, + "grad_norm": 4.27376987274256, + "learning_rate": 2.2318161769052525e-07, + "logits/chosen": -2.659609317779541, + "logits/rejected": -2.533578395843506, + "logps/chosen": -279.2534484863281, + "logps/rejected": -334.467041015625, + "loss": 0.1114, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.5925628542900085, + "rewards/margins": 5.468548774719238, + "rewards/rejected": -6.0611114501953125, + "step": 3330 + }, + { + "epoch": 1.7477760334903192, + "grad_norm": 11.254161444342154, + "learning_rate": 2.2166852463979624e-07, + "logits/chosen": -2.5818896293640137, + "logits/rejected": -2.4699618816375732, + "logps/chosen": -266.83355712890625, + "logps/rejected": -278.3815002441406, + "loss": 0.1153, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40772876143455505, + "rewards/margins": 5.459908485412598, + "rewards/rejected": -5.867638111114502, + "step": 3340 + }, + { + "epoch": 1.7530088958660386, + "grad_norm": 9.929057072796711, + "learning_rate": 2.20156482186992e-07, + "logits/chosen": -2.58548903465271, + "logits/rejected": -2.5589373111724854, + "logps/chosen": -282.1081237792969, + "logps/rejected": -333.6109924316406, + "loss": 0.1012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4569730758666992, + "rewards/margins": 5.3031792640686035, + "rewards/rejected": -5.7601518630981445, + "step": 3350 + }, + { + "epoch": 1.7582417582417582, + "grad_norm": 14.366597370754016, + "learning_rate": 2.1864554640221244e-07, + "logits/chosen": -2.486529588699341, + "logits/rejected": -2.5694050788879395, + "logps/chosen": -220.8901824951172, + "logps/rejected": -341.2018127441406, + "loss": 0.1006, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.232269048690796, + "rewards/margins": 5.7011613845825195, + "rewards/rejected": -6.933429718017578, + "step": 3360 + }, + { + "epoch": 1.7634746206174778, + "grad_norm": 9.734727207350373, + "learning_rate": 2.1713577331452016e-07, + "logits/chosen": -2.7107510566711426, + "logits/rejected": -2.590179681777954, + "logps/chosen": -282.2343444824219, + "logps/rejected": -292.1159362792969, + "loss": 0.0792, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.47104889154434204, + "rewards/margins": 5.032910346984863, + "rewards/rejected": -5.5039591789245605, + "step": 3370 + }, + { + "epoch": 1.7687074829931972, + "grad_norm": 30.73138043602282, + "learning_rate": 2.1562721890986199e-07, + "logits/chosen": -2.5853171348571777, + "logits/rejected": -2.427089214324951, + "logps/chosen": -258.68096923828125, + "logps/rejected": -263.1603698730469, + "loss": 0.0943, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.8167365789413452, + "rewards/margins": 4.762653350830078, + "rewards/rejected": -5.579390048980713, + "step": 3380 + }, + { + "epoch": 1.7739403453689166, + "grad_norm": 6.900770481857403, + "learning_rate": 2.1411993912899285e-07, + "logits/chosen": -2.6068625450134277, + "logits/rejected": -2.7255072593688965, + "logps/chosen": -251.8668670654297, + "logps/rejected": -388.23553466796875, + "loss": 0.0956, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3402079641819, + "rewards/margins": 5.3291215896606445, + "rewards/rejected": -5.669328689575195, + "step": 3390 + }, + { + "epoch": 1.7791732077446363, + "grad_norm": 32.88005271542385, + "learning_rate": 2.126139898654021e-07, + "logits/chosen": -2.6007299423217773, + "logits/rejected": -2.564429759979248, + "logps/chosen": -247.95761108398438, + "logps/rejected": -304.67218017578125, + "loss": 0.1105, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0324971675872803, + "rewards/margins": 4.652859687805176, + "rewards/rejected": -5.685357093811035, + "step": 3400 + }, + { + "epoch": 1.784406070120356, + "grad_norm": 14.049845011208202, + "learning_rate": 2.1110942696324012e-07, + "logits/chosen": -2.8334975242614746, + "logits/rejected": -2.7319045066833496, + "logps/chosen": -330.34368896484375, + "logps/rejected": -337.543701171875, + "loss": 0.1172, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.16336342692375183, + "rewards/margins": 5.068566799163818, + "rewards/rejected": -5.231931209564209, + "step": 3410 + }, + { + "epoch": 1.7896389324960753, + "grad_norm": 35.23957207470887, + "learning_rate": 2.0960630621524762e-07, + "logits/chosen": -2.604213237762451, + "logits/rejected": -2.5069069862365723, + "logps/chosen": -327.94659423828125, + "logps/rejected": -286.60223388671875, + "loss": 0.0987, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.45767927169799805, + "rewards/margins": 4.978998184204102, + "rewards/rejected": -5.436676979064941, + "step": 3420 + }, + { + "epoch": 1.7948717948717947, + "grad_norm": 22.48343495873177, + "learning_rate": 2.0810468336068697e-07, + "logits/chosen": -2.60780668258667, + "logits/rejected": -2.6505560874938965, + "logps/chosen": -256.4802551269531, + "logps/rejected": -312.28045654296875, + "loss": 0.1047, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.9076415300369263, + "rewards/margins": 5.676671028137207, + "rewards/rejected": -6.584311485290527, + "step": 3430 + }, + { + "epoch": 1.8001046572475143, + "grad_norm": 24.57145458361626, + "learning_rate": 2.0660461408327535e-07, + "logits/chosen": -2.759155511856079, + "logits/rejected": -2.6342787742614746, + "logps/chosen": -315.87750244140625, + "logps/rejected": -288.4949951171875, + "loss": 0.079, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.7314565181732178, + "rewards/margins": 4.713875770568848, + "rewards/rejected": -5.445333003997803, + "step": 3440 + }, + { + "epoch": 1.805337519623234, + "grad_norm": 12.426591771574008, + "learning_rate": 2.0510615400911906e-07, + "logits/chosen": -2.7850677967071533, + "logits/rejected": -2.7042925357818604, + "logps/chosen": -288.1392517089844, + "logps/rejected": -294.2490539550781, + "loss": 0.1018, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.1649543195962906, + "rewards/margins": 5.316279888153076, + "rewards/rejected": -5.481234550476074, + "step": 3450 + }, + { + "epoch": 1.8105703819989536, + "grad_norm": 24.464622606853844, + "learning_rate": 2.0360935870465185e-07, + "logits/chosen": -2.7435317039489746, + "logits/rejected": -2.550192356109619, + "logps/chosen": -350.0984802246094, + "logps/rejected": -342.10076904296875, + "loss": 0.0851, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.36250513792037964, + "rewards/margins": 6.055690765380859, + "rewards/rejected": -5.693184852600098, + "step": 3460 + }, + { + "epoch": 1.815803244374673, + "grad_norm": 37.4562098904198, + "learning_rate": 2.021142836745739e-07, + "logits/chosen": -2.6443090438842773, + "logits/rejected": -2.5326123237609863, + "logps/chosen": -294.5022888183594, + "logps/rejected": -312.35699462890625, + "loss": 0.0874, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3399999141693115, + "rewards/margins": 4.801590919494629, + "rewards/rejected": -5.1415910720825195, + "step": 3470 + }, + { + "epoch": 1.8210361067503924, + "grad_norm": 39.078137882685, + "learning_rate": 2.0062098435979308e-07, + "logits/chosen": -2.508788585662842, + "logits/rejected": -2.467001438140869, + "logps/chosen": -313.6659851074219, + "logps/rejected": -303.41864013671875, + "loss": 0.1166, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.8432501554489136, + "rewards/margins": 4.548348426818848, + "rewards/rejected": -5.391598701477051, + "step": 3480 + }, + { + "epoch": 1.826268969126112, + "grad_norm": 20.477804157343822, + "learning_rate": 1.9912951613536997e-07, + "logits/chosen": -2.7366108894348145, + "logits/rejected": -2.5674004554748535, + "logps/chosen": -307.49871826171875, + "logps/rejected": -292.94140625, + "loss": 0.0804, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.4221610426902771, + "rewards/margins": 5.448634147644043, + "rewards/rejected": -5.870795249938965, + "step": 3490 + }, + { + "epoch": 1.8315018315018317, + "grad_norm": 45.80462658045936, + "learning_rate": 1.9763993430846392e-07, + "logits/chosen": -2.7082409858703613, + "logits/rejected": -2.4875802993774414, + "logps/chosen": -282.53564453125, + "logps/rejected": -249.9893798828125, + "loss": 0.1, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5224814414978027, + "rewards/margins": 4.795421600341797, + "rewards/rejected": -5.317903518676758, + "step": 3500 + }, + { + "epoch": 1.836734693877551, + "grad_norm": 25.28320711526729, + "learning_rate": 1.9615229411628212e-07, + "logits/chosen": -2.5923116207122803, + "logits/rejected": -2.5757219791412354, + "logps/chosen": -211.8919677734375, + "logps/rejected": -322.93310546875, + "loss": 0.0908, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.9356802701950073, + "rewards/margins": 5.03036642074585, + "rewards/rejected": -5.966047286987305, + "step": 3510 + }, + { + "epoch": 1.8419675562532705, + "grad_norm": 12.228474470448438, + "learning_rate": 1.946666507240314e-07, + "logits/chosen": -2.6713318824768066, + "logits/rejected": -2.5800724029541016, + "logps/chosen": -322.16314697265625, + "logps/rejected": -353.5315856933594, + "loss": 0.0878, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.723814845085144, + "rewards/margins": 5.294778347015381, + "rewards/rejected": -6.018592834472656, + "step": 3520 + }, + { + "epoch": 1.84720041862899, + "grad_norm": 17.755134246003884, + "learning_rate": 1.9318305922287268e-07, + "logits/chosen": -2.5991291999816895, + "logits/rejected": -2.5720906257629395, + "logps/chosen": -276.58441162109375, + "logps/rejected": -305.8017883300781, + "loss": 0.0765, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.35180261731147766, + "rewards/margins": 6.045158386230469, + "rewards/rejected": -6.396960735321045, + "step": 3530 + }, + { + "epoch": 1.8524332810047097, + "grad_norm": 31.75867086902602, + "learning_rate": 1.9170157462787762e-07, + "logits/chosen": -2.6986277103424072, + "logits/rejected": -2.5714402198791504, + "logps/chosen": -349.29437255859375, + "logps/rejected": -312.58148193359375, + "loss": 0.0887, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.348706990480423, + "rewards/margins": 5.397832870483398, + "rewards/rejected": -5.74653959274292, + "step": 3540 + }, + { + "epoch": 1.8576661433804291, + "grad_norm": 23.05380759982723, + "learning_rate": 1.902222518759891e-07, + "logits/chosen": -2.819509267807007, + "logits/rejected": -2.602756977081299, + "logps/chosen": -371.6258239746094, + "logps/rejected": -367.76983642578125, + "loss": 0.1148, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.30471277236938477, + "rewards/margins": 5.3604960441589355, + "rewards/rejected": -5.6652092933654785, + "step": 3550 + }, + { + "epoch": 1.8628990057561485, + "grad_norm": 17.855176900716568, + "learning_rate": 1.8874514582398368e-07, + "logits/chosen": -2.6623425483703613, + "logits/rejected": -2.7280948162078857, + "logps/chosen": -315.5396423339844, + "logps/rejected": -357.8938903808594, + "loss": 0.0992, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.47962045669555664, + "rewards/margins": 6.602120876312256, + "rewards/rejected": -7.081740379333496, + "step": 3560 + }, + { + "epoch": 1.8681318681318682, + "grad_norm": 10.749236071749618, + "learning_rate": 1.8727031124643738e-07, + "logits/chosen": -2.668679714202881, + "logits/rejected": -2.608853578567505, + "logps/chosen": -242.77444458007812, + "logps/rejected": -285.3056335449219, + "loss": 0.098, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.6894394755363464, + "rewards/margins": 5.269094944000244, + "rewards/rejected": -5.958534240722656, + "step": 3570 + }, + { + "epoch": 1.8733647305075878, + "grad_norm": 22.614211293694257, + "learning_rate": 1.8579780283369472e-07, + "logits/chosen": -2.591667652130127, + "logits/rejected": -2.419919967651367, + "logps/chosen": -305.64959716796875, + "logps/rejected": -277.2184143066406, + "loss": 0.0907, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.9471152424812317, + "rewards/margins": 4.886394500732422, + "rewards/rejected": -5.83350944519043, + "step": 3580 + }, + { + "epoch": 1.8785975928833072, + "grad_norm": 25.65636556195965, + "learning_rate": 1.8432767518984043e-07, + "logits/chosen": -2.623939037322998, + "logits/rejected": -2.5226635932922363, + "logps/chosen": -309.31317138671875, + "logps/rejected": -318.83453369140625, + "loss": 0.1102, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.6891399621963501, + "rewards/margins": 5.144829273223877, + "rewards/rejected": -5.833970069885254, + "step": 3590 + }, + { + "epoch": 1.8838304552590266, + "grad_norm": 63.30301150806038, + "learning_rate": 1.8285998283067478e-07, + "logits/chosen": -2.726407289505005, + "logits/rejected": -2.6477532386779785, + "logps/chosen": -278.20355224609375, + "logps/rejected": -310.3710021972656, + "loss": 0.108, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.6373482942581177, + "rewards/margins": 5.8870849609375, + "rewards/rejected": -6.524433135986328, + "step": 3600 + }, + { + "epoch": 1.8890633176347462, + "grad_norm": 24.79124814354351, + "learning_rate": 1.8139478018169197e-07, + "logits/chosen": -2.6029670238494873, + "logits/rejected": -2.540987014770508, + "logps/chosen": -258.3710021972656, + "logps/rejected": -285.44000244140625, + "loss": 0.0944, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.813319981098175, + "rewards/margins": 4.891506671905518, + "rewards/rejected": -5.704827308654785, + "step": 3610 + }, + { + "epoch": 1.8942961800104658, + "grad_norm": 14.002181196398896, + "learning_rate": 1.799321215760617e-07, + "logits/chosen": -2.6256556510925293, + "logits/rejected": -2.5938618183135986, + "logps/chosen": -276.6966247558594, + "logps/rejected": -285.2062683105469, + "loss": 0.124, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.1776187419891357, + "rewards/margins": 4.646222114562988, + "rewards/rejected": -5.823840141296387, + "step": 3620 + }, + { + "epoch": 1.8995290423861853, + "grad_norm": 44.217394704413046, + "learning_rate": 1.7847206125261476e-07, + "logits/chosen": -2.6064388751983643, + "logits/rejected": -2.598203659057617, + "logps/chosen": -248.5193328857422, + "logps/rejected": -276.5867614746094, + "loss": 0.121, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.913386344909668, + "rewards/margins": 5.072216987609863, + "rewards/rejected": -5.985602855682373, + "step": 3630 + }, + { + "epoch": 1.9047619047619047, + "grad_norm": 29.521492574877282, + "learning_rate": 1.7701465335383148e-07, + "logits/chosen": -2.717740535736084, + "logits/rejected": -2.582475185394287, + "logps/chosen": -292.51507568359375, + "logps/rejected": -279.9605712890625, + "loss": 0.0892, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.993351936340332, + "rewards/margins": 4.013619422912598, + "rewards/rejected": -5.00697135925293, + "step": 3640 + }, + { + "epoch": 1.9099947671376243, + "grad_norm": 37.81792131343119, + "learning_rate": 1.7555995192383377e-07, + "logits/chosen": -2.6133148670196533, + "logits/rejected": -2.720210552215576, + "logps/chosen": -251.6924285888672, + "logps/rejected": -428.26251220703125, + "loss": 0.0763, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.34535667300224304, + "rewards/margins": 5.914216041564941, + "rewards/rejected": -6.259573459625244, + "step": 3650 + }, + { + "epoch": 1.915227629513344, + "grad_norm": 45.15317768585093, + "learning_rate": 1.7410801090638166e-07, + "logits/chosen": -2.6592507362365723, + "logits/rejected": -2.568499803543091, + "logps/chosen": -309.73883056640625, + "logps/rejected": -313.0323181152344, + "loss": 0.1276, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.4672146439552307, + "rewards/margins": 5.2041521072387695, + "rewards/rejected": -5.6713666915893555, + "step": 3660 + }, + { + "epoch": 1.9204604918890633, + "grad_norm": 14.599937120025656, + "learning_rate": 1.7265888414287245e-07, + "logits/chosen": -2.712362766265869, + "logits/rejected": -2.679932117462158, + "logps/chosen": -296.2633056640625, + "logps/rejected": -332.5452575683594, + "loss": 0.1004, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.4468228816986084, + "rewards/margins": 6.227715015411377, + "rewards/rejected": -6.674537658691406, + "step": 3670 + }, + { + "epoch": 1.9256933542647827, + "grad_norm": 42.370504342423075, + "learning_rate": 1.7121262537034396e-07, + "logits/chosen": -2.75547456741333, + "logits/rejected": -2.597177028656006, + "logps/chosen": -320.3974609375, + "logps/rejected": -316.8236389160156, + "loss": 0.1182, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.6133803129196167, + "rewards/margins": 4.7373881340026855, + "rewards/rejected": -5.350768566131592, + "step": 3680 + }, + { + "epoch": 1.9309262166405023, + "grad_norm": 24.109588654694512, + "learning_rate": 1.697692882194826e-07, + "logits/chosen": -2.544801712036133, + "logits/rejected": -2.5433974266052246, + "logps/chosen": -236.13455200195312, + "logps/rejected": -306.4762268066406, + "loss": 0.0824, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.6495293378829956, + "rewards/margins": 5.061087608337402, + "rewards/rejected": -5.710616588592529, + "step": 3690 + }, + { + "epoch": 1.936159079016222, + "grad_norm": 27.927751296332747, + "learning_rate": 1.6832892621263406e-07, + "logits/chosen": -2.9226527214050293, + "logits/rejected": -2.6958699226379395, + "logps/chosen": -353.6954040527344, + "logps/rejected": -371.58599853515625, + "loss": 0.1095, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.03616750240325928, + "rewards/margins": 5.9524006843566895, + "rewards/rejected": -5.988568305969238, + "step": 3700 + }, + { + "epoch": 1.9413919413919414, + "grad_norm": 10.612592863711846, + "learning_rate": 1.668915927618183e-07, + "logits/chosen": -2.614467144012451, + "logits/rejected": -2.618373394012451, + "logps/chosen": -222.18557739257812, + "logps/rejected": -293.89935302734375, + "loss": 0.0847, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7807853817939758, + "rewards/margins": 4.5860748291015625, + "rewards/rejected": -5.366860389709473, + "step": 3710 + }, + { + "epoch": 1.9466248037676608, + "grad_norm": 8.767377121592377, + "learning_rate": 1.6545734116674965e-07, + "logits/chosen": -2.7589080333709717, + "logits/rejected": -2.6732800006866455, + "logps/chosen": -295.6678161621094, + "logps/rejected": -279.3349609375, + "loss": 0.1025, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.009974551387131214, + "rewards/margins": 5.947516918182373, + "rewards/rejected": -5.937542915344238, + "step": 3720 + }, + { + "epoch": 1.9518576661433804, + "grad_norm": 27.89405099547432, + "learning_rate": 1.6402622461286e-07, + "logits/chosen": -2.6114070415496826, + "logits/rejected": -2.5180039405822754, + "logps/chosen": -314.15069580078125, + "logps/rejected": -316.9871520996094, + "loss": 0.0879, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7035598158836365, + "rewards/margins": 5.666778087615967, + "rewards/rejected": -6.370337963104248, + "step": 3730 + }, + { + "epoch": 1.9570905285191, + "grad_norm": 28.15538164661059, + "learning_rate": 1.625982961693262e-07, + "logits/chosen": -2.8003592491149902, + "logits/rejected": -2.5596506595611572, + "logps/chosen": -346.68701171875, + "logps/rejected": -297.2021789550781, + "loss": 0.0817, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.3486330807209015, + "rewards/margins": 5.3151984214782715, + "rewards/rejected": -5.663832187652588, + "step": 3740 + }, + { + "epoch": 1.9623233908948194, + "grad_norm": 41.38111514577999, + "learning_rate": 1.6117360878710266e-07, + "logits/chosen": -2.775566339492798, + "logits/rejected": -2.5968217849731445, + "logps/chosen": -321.11370849609375, + "logps/rejected": -345.9370422363281, + "loss": 0.116, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.368887722492218, + "rewards/margins": 5.136349678039551, + "rewards/rejected": -5.505237579345703, + "step": 3750 + }, + { + "epoch": 1.9675562532705388, + "grad_norm": 24.28749177179343, + "learning_rate": 1.5975221529695773e-07, + "logits/chosen": -2.661539316177368, + "logits/rejected": -2.532170057296753, + "logps/chosen": -232.6519317626953, + "logps/rejected": -238.15737915039062, + "loss": 0.1139, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.823898196220398, + "rewards/margins": 4.739381313323975, + "rewards/rejected": -5.563279151916504, + "step": 3760 + }, + { + "epoch": 1.9727891156462585, + "grad_norm": 23.092035590815094, + "learning_rate": 1.5833416840751406e-07, + "logits/chosen": -2.6281533241271973, + "logits/rejected": -2.4063587188720703, + "logps/chosen": -248.28378295898438, + "logps/rejected": -231.88119506835938, + "loss": 0.1124, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.9242329597473145, + "rewards/margins": 4.783598899841309, + "rewards/rejected": -5.707831382751465, + "step": 3770 + }, + { + "epoch": 1.978021978021978, + "grad_norm": 33.814206997216125, + "learning_rate": 1.5691952070329493e-07, + "logits/chosen": -2.756333112716675, + "logits/rejected": -2.683297634124756, + "logps/chosen": -342.19439697265625, + "logps/rejected": -398.38116455078125, + "loss": 0.1152, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.2830882668495178, + "rewards/margins": 5.8143815994262695, + "rewards/rejected": -6.097469806671143, + "step": 3780 + }, + { + "epoch": 1.9832548403976975, + "grad_norm": 32.594756198703855, + "learning_rate": 1.555083246427734e-07, + "logits/chosen": -2.586789846420288, + "logits/rejected": -2.5754427909851074, + "logps/chosen": -325.0731201171875, + "logps/rejected": -351.6557922363281, + "loss": 0.095, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6281405687332153, + "rewards/margins": 5.96632194519043, + "rewards/rejected": -6.594461917877197, + "step": 3790 + }, + { + "epoch": 1.988487702773417, + "grad_norm": 27.78143850000216, + "learning_rate": 1.5410063255642767e-07, + "logits/chosen": -2.5559310913085938, + "logits/rejected": -2.545598030090332, + "logps/chosen": -280.82281494140625, + "logps/rejected": -320.7041931152344, + "loss": 0.0993, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.4847874641418457, + "rewards/margins": 5.9905595779418945, + "rewards/rejected": -6.47534704208374, + "step": 3800 + }, + { + "epoch": 1.9937205651491365, + "grad_norm": 9.345286196197668, + "learning_rate": 1.5269649664480037e-07, + "logits/chosen": -2.5976288318634033, + "logits/rejected": -2.568420171737671, + "logps/chosen": -328.0233459472656, + "logps/rejected": -364.7568054199219, + "loss": 0.1041, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.7843068838119507, + "rewards/margins": 5.072054386138916, + "rewards/rejected": -5.856361389160156, + "step": 3810 + }, + { + "epoch": 1.9989534275248562, + "grad_norm": 22.221429490070964, + "learning_rate": 1.5129596897656255e-07, + "logits/chosen": -2.5863964557647705, + "logits/rejected": -2.479538917541504, + "logps/chosen": -294.79095458984375, + "logps/rejected": -299.70538330078125, + "loss": 0.0743, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.4342220425605774, + "rewards/margins": 5.279918670654297, + "rewards/rejected": -5.714139938354492, + "step": 3820 + }, + { + "epoch": 2.004186289900576, + "grad_norm": 3.151471772421924, + "learning_rate": 1.4989910148658324e-07, + "logits/chosen": -2.7527716159820557, + "logits/rejected": -2.647697925567627, + "logps/chosen": -294.06201171875, + "logps/rejected": -342.06939697265625, + "loss": 0.0257, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6213226318359375, + "rewards/margins": 5.365971565246582, + "rewards/rejected": -5.987294673919678, + "step": 3830 + }, + { + "epoch": 2.009419152276295, + "grad_norm": 5.945101789435311, + "learning_rate": 1.485059459740035e-07, + "logits/chosen": -2.6567201614379883, + "logits/rejected": -2.5174503326416016, + "logps/chosen": -315.97198486328125, + "logps/rejected": -366.2234802246094, + "loss": 0.0185, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.6567188501358032, + "rewards/margins": 6.468540191650391, + "rewards/rejected": -7.1252593994140625, + "step": 3840 + }, + { + "epoch": 2.0146520146520146, + "grad_norm": 4.294861211832759, + "learning_rate": 1.4711655410031536e-07, + "logits/chosen": -2.6191234588623047, + "logits/rejected": -2.5405681133270264, + "logps/chosen": -251.25662231445312, + "logps/rejected": -297.3587951660156, + "loss": 0.0164, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7149181365966797, + "rewards/margins": 6.6346235275268555, + "rewards/rejected": -7.349541664123535, + "step": 3850 + }, + { + "epoch": 2.0198848770277342, + "grad_norm": 7.023199575059913, + "learning_rate": 1.4573097738744623e-07, + "logits/chosen": -2.5686943531036377, + "logits/rejected": -2.5673136711120605, + "logps/chosen": -257.622802734375, + "logps/rejected": -332.653564453125, + "loss": 0.0204, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0377533435821533, + "rewards/margins": 6.569632053375244, + "rewards/rejected": -7.607385158538818, + "step": 3860 + }, + { + "epoch": 2.025117739403454, + "grad_norm": 4.796659300478697, + "learning_rate": 1.4434926721584865e-07, + "logits/chosen": -2.654580593109131, + "logits/rejected": -2.5008137226104736, + "logps/chosen": -285.6098937988281, + "logps/rejected": -343.22662353515625, + "loss": 0.0143, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.090226650238037, + "rewards/margins": 6.6490278244018555, + "rewards/rejected": -7.739254951477051, + "step": 3870 + }, + { + "epoch": 2.030350601779173, + "grad_norm": 2.7489006427714218, + "learning_rate": 1.4297147482259424e-07, + "logits/chosen": -2.642270803451538, + "logits/rejected": -2.5516154766082764, + "logps/chosen": -281.71600341796875, + "logps/rejected": -305.82373046875, + "loss": 0.0081, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2659205198287964, + "rewards/margins": 7.210963249206543, + "rewards/rejected": -8.476883888244629, + "step": 3880 + }, + { + "epoch": 2.0355834641548927, + "grad_norm": 1.494445263725559, + "learning_rate": 1.4159765129947443e-07, + "logits/chosen": -2.708300828933716, + "logits/rejected": -2.6663706302642822, + "logps/chosen": -259.70416259765625, + "logps/rejected": -313.52825927734375, + "loss": 0.0162, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.073347806930542, + "rewards/margins": 8.003740310668945, + "rewards/rejected": -9.07708740234375, + "step": 3890 + }, + { + "epoch": 2.0408163265306123, + "grad_norm": 3.496998555606532, + "learning_rate": 1.4022784759110576e-07, + "logits/chosen": -2.5498406887054443, + "logits/rejected": -2.4592509269714355, + "logps/chosen": -284.31732177734375, + "logps/rejected": -343.39630126953125, + "loss": 0.0118, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.5686169862747192, + "rewards/margins": 6.356328010559082, + "rewards/rejected": -7.9249444007873535, + "step": 3900 + }, + { + "epoch": 2.046049188906332, + "grad_norm": 17.389942240056595, + "learning_rate": 1.3886211449304002e-07, + "logits/chosen": -2.5802905559539795, + "logits/rejected": -2.6096935272216797, + "logps/chosen": -255.0489044189453, + "logps/rejected": -419.2701110839844, + "loss": 0.0112, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4543758630752563, + "rewards/margins": 7.66497802734375, + "rewards/rejected": -9.119354248046875, + "step": 3910 + }, + { + "epoch": 2.051282051282051, + "grad_norm": 2.150445047937338, + "learning_rate": 1.3750050264988172e-07, + "logits/chosen": -2.548017740249634, + "logits/rejected": -2.626844644546509, + "logps/chosen": -199.0328369140625, + "logps/rejected": -322.7221984863281, + "loss": 0.0119, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8127008676528931, + "rewards/margins": 8.354021072387695, + "rewards/rejected": -9.166723251342773, + "step": 3920 + }, + { + "epoch": 2.0565149136577707, + "grad_norm": 1.660952211206376, + "learning_rate": 1.3614306255340918e-07, + "logits/chosen": -2.7289199829101562, + "logits/rejected": -2.495772123336792, + "logps/chosen": -294.88165283203125, + "logps/rejected": -300.2463684082031, + "loss": 0.0167, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5807745456695557, + "rewards/margins": 7.85580587387085, + "rewards/rejected": -8.4365816116333, + "step": 3930 + }, + { + "epoch": 2.0617477760334904, + "grad_norm": 4.318158186232724, + "learning_rate": 1.347898445407027e-07, + "logits/chosen": -2.6153035163879395, + "logits/rejected": -2.520585536956787, + "logps/chosen": -312.289794921875, + "logps/rejected": -370.9855041503906, + "loss": 0.0113, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25612935423851013, + "rewards/margins": 8.64028263092041, + "rewards/rejected": -8.896411895751953, + "step": 3940 + }, + { + "epoch": 2.06698063840921, + "grad_norm": 2.821049446764142, + "learning_rate": 1.3344089879227768e-07, + "logits/chosen": -2.6195080280303955, + "logits/rejected": -2.528824806213379, + "logps/chosen": -330.6834411621094, + "logps/rejected": -354.763671875, + "loss": 0.0134, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2032630443572998, + "rewards/margins": 8.363024711608887, + "rewards/rejected": -9.56628704071045, + "step": 3950 + }, + { + "epoch": 2.072213500784929, + "grad_norm": 0.7524659808370527, + "learning_rate": 1.3209627533022393e-07, + "logits/chosen": -2.459660768508911, + "logits/rejected": -2.45542311668396, + "logps/chosen": -317.22442626953125, + "logps/rejected": -374.64154052734375, + "loss": 0.0087, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.9240306615829468, + "rewards/margins": 8.787074089050293, + "rewards/rejected": -9.711104393005371, + "step": 3960 + }, + { + "epoch": 2.077446363160649, + "grad_norm": 3.885112950468523, + "learning_rate": 1.3075602401635056e-07, + "logits/chosen": -2.5620055198669434, + "logits/rejected": -2.480083465576172, + "logps/chosen": -235.87051391601562, + "logps/rejected": -239.4990692138672, + "loss": 0.0147, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0387330055236816, + "rewards/margins": 6.51049280166626, + "rewards/rejected": -8.549224853515625, + "step": 3970 + }, + { + "epoch": 2.0826792255363684, + "grad_norm": 5.124178285131869, + "learning_rate": 1.2942019455033715e-07, + "logits/chosen": -2.595177412033081, + "logits/rejected": -2.55249285697937, + "logps/chosen": -363.89544677734375, + "logps/rejected": -383.08050537109375, + "loss": 0.0127, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0965200662612915, + "rewards/margins": 7.803582668304443, + "rewards/rejected": -8.900102615356445, + "step": 3980 + }, + { + "epoch": 2.087912087912088, + "grad_norm": 1.72781923689808, + "learning_rate": 1.2808883646789088e-07, + "logits/chosen": -2.6225199699401855, + "logits/rejected": -2.5271828174591064, + "logps/chosen": -278.67962646484375, + "logps/rejected": -342.74578857421875, + "loss": 0.0092, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2907451391220093, + "rewards/margins": 8.017008781433105, + "rewards/rejected": -9.307754516601562, + "step": 3990 + }, + { + "epoch": 2.0931449502878072, + "grad_norm": 2.3452944302465863, + "learning_rate": 1.2676199913890933e-07, + "logits/chosen": -2.4738712310791016, + "logits/rejected": -2.35957670211792, + "logps/chosen": -301.91607666015625, + "logps/rejected": -312.5462951660156, + "loss": 0.0099, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5129525661468506, + "rewards/margins": 6.101428031921387, + "rewards/rejected": -7.614381313323975, + "step": 4000 + }, + { + "epoch": 2.0931449502878072, + "eval_logits/chosen": -2.5508530139923096, + "eval_logits/rejected": -2.4970502853393555, + "eval_logps/chosen": -307.0093994140625, + "eval_logps/rejected": -333.84161376953125, + "eval_loss": 0.6922265887260437, + "eval_rewards/accuracies": 0.78515625, + "eval_rewards/chosen": -3.975245952606201, + "eval_rewards/margins": 2.4245357513427734, + "eval_rewards/rejected": -6.399781703948975, + "eval_runtime": 95.5712, + "eval_samples_per_second": 20.927, + "eval_steps_per_second": 0.335, + "step": 4000 + }, + { + "epoch": 2.098377812663527, + "grad_norm": 3.865042704199612, + "learning_rate": 1.2543973176565012e-07, + "logits/chosen": -2.5314152240753174, + "logits/rejected": -2.4536736011505127, + "logps/chosen": -235.0891571044922, + "logps/rejected": -319.36614990234375, + "loss": 0.019, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.152523994445801, + "rewards/margins": 7.690751075744629, + "rewards/rejected": -9.843273162841797, + "step": 4010 + }, + { + "epoch": 2.1036106750392465, + "grad_norm": 7.733074515561227, + "learning_rate": 1.2412208338090565e-07, + "logits/chosen": -2.6669182777404785, + "logits/rejected": -2.5965628623962402, + "logps/chosen": -357.357177734375, + "logps/rejected": -405.3863525390625, + "loss": 0.0158, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6724307537078857, + "rewards/margins": 7.350949764251709, + "rewards/rejected": -9.023381233215332, + "step": 4020 + }, + { + "epoch": 2.108843537414966, + "grad_norm": 2.108764330623009, + "learning_rate": 1.228091028461858e-07, + "logits/chosen": -2.63122820854187, + "logits/rejected": -2.5630502700805664, + "logps/chosen": -276.1968994140625, + "logps/rejected": -384.46856689453125, + "loss": 0.0128, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7323462963104248, + "rewards/margins": 8.009744644165039, + "rewards/rejected": -9.742091178894043, + "step": 4030 + }, + { + "epoch": 2.1140763997906853, + "grad_norm": 5.205840373586514, + "learning_rate": 1.2150083884990536e-07, + "logits/chosen": -2.647475242614746, + "logits/rejected": -2.5325350761413574, + "logps/chosen": -296.84710693359375, + "logps/rejected": -351.752197265625, + "loss": 0.0099, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0414936542510986, + "rewards/margins": 7.303194999694824, + "rewards/rejected": -9.34468936920166, + "step": 4040 + }, + { + "epoch": 2.119309262166405, + "grad_norm": 3.7420471789718266, + "learning_rate": 1.201973399055788e-07, + "logits/chosen": -2.737910032272339, + "logits/rejected": -2.6582093238830566, + "logps/chosen": -336.29315185546875, + "logps/rejected": -373.5148010253906, + "loss": 0.0104, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1442760229110718, + "rewards/margins": 7.766120910644531, + "rewards/rejected": -8.91039752960205, + "step": 4050 + }, + { + "epoch": 2.1245421245421245, + "grad_norm": 1.4855886157432574, + "learning_rate": 1.1889865435002117e-07, + "logits/chosen": -2.7083852291107178, + "logits/rejected": -2.6565423011779785, + "logps/chosen": -302.7213134765625, + "logps/rejected": -359.2713928222656, + "loss": 0.0099, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1232277154922485, + "rewards/margins": 7.309815406799316, + "rewards/rejected": -8.433042526245117, + "step": 4060 + }, + { + "epoch": 2.129774986917844, + "grad_norm": 1.138207703913642, + "learning_rate": 1.1760483034155588e-07, + "logits/chosen": -2.6401944160461426, + "logits/rejected": -2.5925869941711426, + "logps/chosen": -286.7811279296875, + "logps/rejected": -368.1828918457031, + "loss": 0.0097, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.698175072669983, + "rewards/margins": 8.620880126953125, + "rewards/rejected": -10.31905460357666, + "step": 4070 + }, + { + "epoch": 2.1350078492935634, + "grad_norm": 1.1583347223952818, + "learning_rate": 1.163159158582284e-07, + "logits/chosen": -2.4908974170684814, + "logits/rejected": -2.490464448928833, + "logps/chosen": -299.4886779785156, + "logps/rejected": -369.92938232421875, + "loss": 0.02, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.3759181499481201, + "rewards/margins": 8.674256324768066, + "rewards/rejected": -10.05017375946045, + "step": 4080 + }, + { + "epoch": 2.140240711669283, + "grad_norm": 1.292595681436358, + "learning_rate": 1.1503195869602766e-07, + "logits/chosen": -2.5776543617248535, + "logits/rejected": -2.437288522720337, + "logps/chosen": -280.73193359375, + "logps/rejected": -321.9732666015625, + "loss": 0.0161, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7706005573272705, + "rewards/margins": 8.112302780151367, + "rewards/rejected": -9.882905006408691, + "step": 4090 + }, + { + "epoch": 2.1454735740450026, + "grad_norm": 3.1355222478979137, + "learning_rate": 1.137530064671135e-07, + "logits/chosen": -2.55924391746521, + "logits/rejected": -2.634777545928955, + "logps/chosen": -249.09994506835938, + "logps/rejected": -356.64141845703125, + "loss": 0.0156, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.4390732049942017, + "rewards/margins": 7.6839189529418945, + "rewards/rejected": -9.122990608215332, + "step": 4100 + }, + { + "epoch": 2.1507064364207222, + "grad_norm": 5.68759855658989, + "learning_rate": 1.1247910659805063e-07, + "logits/chosen": -2.6641414165496826, + "logits/rejected": -2.5627830028533936, + "logps/chosen": -325.3441467285156, + "logps/rejected": -298.83612060546875, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3880575895309448, + "rewards/margins": 7.806868553161621, + "rewards/rejected": -9.194926261901855, + "step": 4110 + }, + { + "epoch": 2.155939298796442, + "grad_norm": 1.9973676382803611, + "learning_rate": 1.112103063280509e-07, + "logits/chosen": -2.560502052307129, + "logits/rejected": -2.4194540977478027, + "logps/chosen": -265.48956298828125, + "logps/rejected": -404.8538818359375, + "loss": 0.0095, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9298257827758789, + "rewards/margins": 8.695016860961914, + "rewards/rejected": -9.624841690063477, + "step": 4120 + }, + { + "epoch": 2.161172161172161, + "grad_norm": 1.7882510195806205, + "learning_rate": 1.099466527072207e-07, + "logits/chosen": -2.56492018699646, + "logits/rejected": -2.5750932693481445, + "logps/chosen": -235.86630249023438, + "logps/rejected": -365.64300537109375, + "loss": 0.0088, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7126439809799194, + "rewards/margins": 8.225854873657227, + "rewards/rejected": -9.938499450683594, + "step": 4130 + }, + { + "epoch": 2.1664050235478807, + "grad_norm": 2.72661087691154, + "learning_rate": 1.0868819259481638e-07, + "logits/chosen": -2.5510354042053223, + "logits/rejected": -2.3607094287872314, + "logps/chosen": -303.17156982421875, + "logps/rejected": -292.5484924316406, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.530797004699707, + "rewards/margins": 7.4277496337890625, + "rewards/rejected": -9.95854663848877, + "step": 4140 + }, + { + "epoch": 2.1716378859236003, + "grad_norm": 2.48899655088663, + "learning_rate": 1.0743497265750701e-07, + "logits/chosen": -2.6701772212982178, + "logits/rejected": -2.575355052947998, + "logps/chosen": -289.3881530761719, + "logps/rejected": -360.732177734375, + "loss": 0.0113, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7940318584442139, + "rewards/margins": 7.533411502838135, + "rewards/rejected": -9.327444076538086, + "step": 4150 + }, + { + "epoch": 2.17687074829932, + "grad_norm": 5.329556069710662, + "learning_rate": 1.0618703936764359e-07, + "logits/chosen": -2.6774773597717285, + "logits/rejected": -2.5098540782928467, + "logps/chosen": -328.7174072265625, + "logps/rejected": -393.78729248046875, + "loss": 0.0151, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.827301502227783, + "rewards/margins": 8.235798835754395, + "rewards/rejected": -11.06309986114502, + "step": 4160 + }, + { + "epoch": 2.182103610675039, + "grad_norm": 1.9008399468133936, + "learning_rate": 1.0494443900153557e-07, + "logits/chosen": -2.6789798736572266, + "logits/rejected": -2.463243007659912, + "logps/chosen": -317.5295715332031, + "logps/rejected": -363.3247375488281, + "loss": 0.0156, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1199839115142822, + "rewards/margins": 8.223379135131836, + "rewards/rejected": -9.343362808227539, + "step": 4170 + }, + { + "epoch": 2.1873364730507587, + "grad_norm": 1.7033088321775203, + "learning_rate": 1.0370721763773507e-07, + "logits/chosen": -2.6211695671081543, + "logits/rejected": -2.423337459564209, + "logps/chosen": -352.43670654296875, + "logps/rejected": -357.2591247558594, + "loss": 0.0127, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3701328039169312, + "rewards/margins": 8.98823356628418, + "rewards/rejected": -10.358366966247559, + "step": 4180 + }, + { + "epoch": 2.1925693354264784, + "grad_norm": 20.353360868351782, + "learning_rate": 1.0247542115532845e-07, + "logits/chosen": -2.5802175998687744, + "logits/rejected": -2.5106277465820312, + "logps/chosen": -301.53375244140625, + "logps/rejected": -362.0812072753906, + "loss": 0.0196, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.9264440536499023, + "rewards/margins": 8.778693199157715, + "rewards/rejected": -10.705137252807617, + "step": 4190 + }, + { + "epoch": 2.197802197802198, + "grad_norm": 10.710391123253114, + "learning_rate": 1.0124909523223418e-07, + "logits/chosen": -2.572200059890747, + "logits/rejected": -2.5115675926208496, + "logps/chosen": -312.39404296875, + "logps/rejected": -369.1626892089844, + "loss": 0.0122, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4592777490615845, + "rewards/margins": 8.952848434448242, + "rewards/rejected": -10.412126541137695, + "step": 4200 + }, + { + "epoch": 2.203035060177917, + "grad_norm": 1.4553015565286538, + "learning_rate": 1.0002828534350987e-07, + "logits/chosen": -2.6698861122131348, + "logits/rejected": -2.534928321838379, + "logps/chosen": -343.0860595703125, + "logps/rejected": -360.8196716308594, + "loss": 0.0188, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.8917086124420166, + "rewards/margins": 7.229773044586182, + "rewards/rejected": -9.121480941772461, + "step": 4210 + }, + { + "epoch": 2.208267922553637, + "grad_norm": 3.801200126371489, + "learning_rate": 9.881303675966524e-08, + "logits/chosen": -2.6052310466766357, + "logits/rejected": -2.4696848392486572, + "logps/chosen": -296.0155334472656, + "logps/rejected": -352.32928466796875, + "loss": 0.009, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.727985143661499, + "rewards/margins": 7.7842817306518555, + "rewards/rejected": -9.512266159057617, + "step": 4220 + }, + { + "epoch": 2.2135007849293564, + "grad_norm": 1.8608998334743396, + "learning_rate": 9.760339454498393e-08, + "logits/chosen": -2.4305007457733154, + "logits/rejected": -2.4329895973205566, + "logps/chosen": -256.9643859863281, + "logps/rejected": -316.918701171875, + "loss": 0.0149, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.382755756378174, + "rewards/margins": 7.783593654632568, + "rewards/rejected": -10.166349411010742, + "step": 4230 + }, + { + "epoch": 2.218733647305076, + "grad_norm": 3.9057026589225528, + "learning_rate": 9.639940355585218e-08, + "logits/chosen": -2.6919617652893066, + "logits/rejected": -2.6409287452697754, + "logps/chosen": -295.90374755859375, + "logps/rejected": -379.12591552734375, + "loss": 0.009, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4674360752105713, + "rewards/margins": 7.371214866638184, + "rewards/rejected": -9.838650703430176, + "step": 4240 + }, + { + "epoch": 2.2239665096807952, + "grad_norm": 4.163349722355175, + "learning_rate": 9.52011084390954e-08, + "logits/chosen": -2.6314711570739746, + "logits/rejected": -2.589735746383667, + "logps/chosen": -290.67755126953125, + "logps/rejected": -351.04119873046875, + "loss": 0.0122, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.112610340118408, + "rewards/margins": 7.909343719482422, + "rewards/rejected": -10.021953582763672, + "step": 4250 + }, + { + "epoch": 2.229199372056515, + "grad_norm": 11.426975588603128, + "learning_rate": 9.400855363032262e-08, + "logits/chosen": -2.6521573066711426, + "logits/rejected": -2.6732687950134277, + "logps/chosen": -317.30401611328125, + "logps/rejected": -395.8109436035156, + "loss": 0.0186, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.533034324645996, + "rewards/margins": 8.412559509277344, + "rewards/rejected": -9.945592880249023, + "step": 4260 + }, + { + "epoch": 2.2344322344322345, + "grad_norm": 1.0643961722343311, + "learning_rate": 9.282178335227883e-08, + "logits/chosen": -2.6245615482330322, + "logits/rejected": -2.5420284271240234, + "logps/chosen": -278.80419921875, + "logps/rejected": -372.5382995605469, + "loss": 0.0108, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.101954936981201, + "rewards/margins": 8.370219230651855, + "rewards/rejected": -10.472173690795898, + "step": 4270 + }, + { + "epoch": 2.239665096807954, + "grad_norm": 1.488877344015738, + "learning_rate": 9.164084161320471e-08, + "logits/chosen": -2.598118543624878, + "logits/rejected": -2.4516539573669434, + "logps/chosen": -281.11492919921875, + "logps/rejected": -353.3921813964844, + "loss": 0.0179, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5900779962539673, + "rewards/margins": 9.468457221984863, + "rewards/rejected": -11.058534622192383, + "step": 4280 + }, + { + "epoch": 2.2448979591836733, + "grad_norm": 1.484123392002169, + "learning_rate": 9.046577220520518e-08, + "logits/chosen": -2.589380979537964, + "logits/rejected": -2.49733567237854, + "logps/chosen": -276.93499755859375, + "logps/rejected": -347.20880126953125, + "loss": 0.0077, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0041606426239014, + "rewards/margins": 7.949263095855713, + "rewards/rejected": -9.953423500061035, + "step": 4290 + }, + { + "epoch": 2.250130821559393, + "grad_norm": 18.225211308951582, + "learning_rate": 8.929661870262525e-08, + "logits/chosen": -2.7803542613983154, + "logits/rejected": -2.62983775138855, + "logps/chosen": -398.0860900878906, + "logps/rejected": -380.4949951171875, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7444343566894531, + "rewards/margins": 7.942718505859375, + "rewards/rejected": -9.687153816223145, + "step": 4300 + }, + { + "epoch": 2.2553636839351126, + "grad_norm": 3.199591487607983, + "learning_rate": 8.813342446043423e-08, + "logits/chosen": -2.6458935737609863, + "logits/rejected": -2.5061302185058594, + "logps/chosen": -297.6692810058594, + "logps/rejected": -316.699462890625, + "loss": 0.0221, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.22013521194458, + "rewards/margins": 7.4093337059021, + "rewards/rejected": -9.62946891784668, + "step": 4310 + }, + { + "epoch": 2.260596546310832, + "grad_norm": 1.2640296355883016, + "learning_rate": 8.697623261261788e-08, + "logits/chosen": -2.575767755508423, + "logits/rejected": -2.5581982135772705, + "logps/chosen": -264.8335876464844, + "logps/rejected": -372.76446533203125, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0652940273284912, + "rewards/margins": 9.945289611816406, + "rewards/rejected": -11.010583877563477, + "step": 4320 + }, + { + "epoch": 2.2658294086865514, + "grad_norm": 1.0437475906469669, + "learning_rate": 8.58250860705792e-08, + "logits/chosen": -2.742846965789795, + "logits/rejected": -2.6475348472595215, + "logps/chosen": -338.3741760253906, + "logps/rejected": -381.1085510253906, + "loss": 0.0086, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.54091477394104, + "rewards/margins": 8.124018669128418, + "rewards/rejected": -9.664934158325195, + "step": 4330 + }, + { + "epoch": 2.271062271062271, + "grad_norm": 0.9690911735364153, + "learning_rate": 8.468002752154671e-08, + "logits/chosen": -2.7305939197540283, + "logits/rejected": -2.57236385345459, + "logps/chosen": -332.5419006347656, + "logps/rejected": -361.2933044433594, + "loss": 0.0122, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5212883949279785, + "rewards/margins": 8.489986419677734, + "rewards/rejected": -10.011274337768555, + "step": 4340 + }, + { + "epoch": 2.2762951334379906, + "grad_norm": 7.071080275710779, + "learning_rate": 8.354109942699208e-08, + "logits/chosen": -2.6169490814208984, + "logits/rejected": -2.551487445831299, + "logps/chosen": -297.1095886230469, + "logps/rejected": -354.46722412109375, + "loss": 0.0127, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9308229684829712, + "rewards/margins": 7.707949638366699, + "rewards/rejected": -9.638772964477539, + "step": 4350 + }, + { + "epoch": 2.2815279958137102, + "grad_norm": 0.6167801750895417, + "learning_rate": 8.240834402105524e-08, + "logits/chosen": -2.5586185455322266, + "logits/rejected": -2.452181577682495, + "logps/chosen": -327.6608581542969, + "logps/rejected": -337.8875427246094, + "loss": 0.0141, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0531277656555176, + "rewards/margins": 8.026483535766602, + "rewards/rejected": -9.079609870910645, + "step": 4360 + }, + { + "epoch": 2.2867608581894294, + "grad_norm": 4.717373542275004, + "learning_rate": 8.128180330897791e-08, + "logits/chosen": -2.503763437271118, + "logits/rejected": -2.557732105255127, + "logps/chosen": -310.40924072265625, + "logps/rejected": -419.1622009277344, + "loss": 0.0129, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9562809467315674, + "rewards/margins": 9.202638626098633, + "rewards/rejected": -11.158918380737305, + "step": 4370 + }, + { + "epoch": 2.291993720565149, + "grad_norm": 1.1978166487584516, + "learning_rate": 8.016151906554683e-08, + "logits/chosen": -2.621993064880371, + "logits/rejected": -2.636842727661133, + "logps/chosen": -289.68780517578125, + "logps/rejected": -449.16424560546875, + "loss": 0.015, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.7936582565307617, + "rewards/margins": 8.596534729003906, + "rewards/rejected": -10.390192985534668, + "step": 4380 + }, + { + "epoch": 2.2972265829408687, + "grad_norm": 1.2996428679554775, + "learning_rate": 7.90475328335439e-08, + "logits/chosen": -2.605773448944092, + "logits/rejected": -2.5295023918151855, + "logps/chosen": -243.2452850341797, + "logps/rejected": -304.86346435546875, + "loss": 0.0127, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.1631274223327637, + "rewards/margins": 7.450549125671387, + "rewards/rejected": -9.613676071166992, + "step": 4390 + }, + { + "epoch": 2.3024594453165883, + "grad_norm": 7.310470803121754, + "learning_rate": 7.793988592220568e-08, + "logits/chosen": -2.5795531272888184, + "logits/rejected": -2.4490249156951904, + "logps/chosen": -301.59979248046875, + "logps/rejected": -335.76605224609375, + "loss": 0.0189, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.096242904663086, + "rewards/margins": 7.220043182373047, + "rewards/rejected": -9.316286087036133, + "step": 4400 + }, + { + "epoch": 2.3076923076923075, + "grad_norm": 4.938780046264363, + "learning_rate": 7.683861940569217e-08, + "logits/chosen": -2.605426073074341, + "logits/rejected": -2.507072925567627, + "logps/chosen": -361.077880859375, + "logps/rejected": -363.7043762207031, + "loss": 0.0108, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.157196521759033, + "rewards/margins": 7.572789192199707, + "rewards/rejected": -9.729986190795898, + "step": 4410 + }, + { + "epoch": 2.312925170068027, + "grad_norm": 21.771326079556008, + "learning_rate": 7.574377412156291e-08, + "logits/chosen": -2.6205391883850098, + "logits/rejected": -2.4058001041412354, + "logps/chosen": -293.1470642089844, + "logps/rejected": -329.00177001953125, + "loss": 0.0222, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.156372547149658, + "rewards/margins": 8.254606246948242, + "rewards/rejected": -10.410979270935059, + "step": 4420 + }, + { + "epoch": 2.3181580324437467, + "grad_norm": 2.3118779959043683, + "learning_rate": 7.465539066926322e-08, + "logits/chosen": -2.5743918418884277, + "logits/rejected": -2.5314648151397705, + "logps/chosen": -313.2865905761719, + "logps/rejected": -342.49481201171875, + "loss": 0.0159, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.2431182861328125, + "rewards/margins": 8.129000663757324, + "rewards/rejected": -10.372118949890137, + "step": 4430 + }, + { + "epoch": 2.3233908948194664, + "grad_norm": 4.4751527013255705, + "learning_rate": 7.357350940861845e-08, + "logits/chosen": -2.6586811542510986, + "logits/rejected": -2.613595962524414, + "logps/chosen": -352.1698303222656, + "logps/rejected": -448.161376953125, + "loss": 0.0069, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0548558235168457, + "rewards/margins": 8.786008834838867, + "rewards/rejected": -10.840865135192871, + "step": 4440 + }, + { + "epoch": 2.328623757195186, + "grad_norm": 3.451212891606658, + "learning_rate": 7.249817045833726e-08, + "logits/chosen": -2.5706191062927246, + "logits/rejected": -2.5019803047180176, + "logps/chosen": -305.5481262207031, + "logps/rejected": -333.54180908203125, + "loss": 0.0169, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.8968453407287598, + "rewards/margins": 7.913191318511963, + "rewards/rejected": -10.810035705566406, + "step": 4450 + }, + { + "epoch": 2.333856619570905, + "grad_norm": 1.017008288548852, + "learning_rate": 7.14294136945241e-08, + "logits/chosen": -2.5936567783355713, + "logits/rejected": -2.5028891563415527, + "logps/chosen": -313.73687744140625, + "logps/rejected": -381.7586669921875, + "loss": 0.0114, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4407036304473877, + "rewards/margins": 9.367916107177734, + "rewards/rejected": -10.808621406555176, + "step": 4460 + }, + { + "epoch": 2.339089481946625, + "grad_norm": 2.3105528237228374, + "learning_rate": 7.036727874920043e-08, + "logits/chosen": -2.4267191886901855, + "logits/rejected": -2.4052541255950928, + "logps/chosen": -292.8412780761719, + "logps/rejected": -387.59417724609375, + "loss": 0.008, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7293214797973633, + "rewards/margins": 8.727388381958008, + "rewards/rejected": -11.456710815429688, + "step": 4470 + }, + { + "epoch": 2.3443223443223444, + "grad_norm": 3.1199674855344504, + "learning_rate": 6.931180500883484e-08, + "logits/chosen": -2.5494871139526367, + "logits/rejected": -2.4779820442199707, + "logps/chosen": -246.21481323242188, + "logps/rejected": -301.2300720214844, + "loss": 0.0094, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.211869716644287, + "rewards/margins": 8.118663787841797, + "rewards/rejected": -10.330533027648926, + "step": 4480 + }, + { + "epoch": 2.3495552066980636, + "grad_norm": 2.05083376936891, + "learning_rate": 6.826303161288302e-08, + "logits/chosen": -2.451153039932251, + "logits/rejected": -2.3369247913360596, + "logps/chosen": -266.6675720214844, + "logps/rejected": -341.8884582519531, + "loss": 0.0099, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5440659523010254, + "rewards/margins": 8.815584182739258, + "rewards/rejected": -11.359649658203125, + "step": 4490 + }, + { + "epoch": 2.3547880690737832, + "grad_norm": 16.55836084559718, + "learning_rate": 6.722099745233594e-08, + "logits/chosen": -2.7528815269470215, + "logits/rejected": -2.578997850418091, + "logps/chosen": -346.63006591796875, + "logps/rejected": -374.9027099609375, + "loss": 0.0086, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.028027296066284, + "rewards/margins": 8.339401245117188, + "rewards/rejected": -10.367429733276367, + "step": 4500 + }, + { + "epoch": 2.360020931449503, + "grad_norm": 6.028287869962129, + "learning_rate": 6.618574116827786e-08, + "logits/chosen": -2.593658924102783, + "logits/rejected": -2.5544655323028564, + "logps/chosen": -266.7757568359375, + "logps/rejected": -348.6341247558594, + "loss": 0.0129, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2745323181152344, + "rewards/margins": 8.218457221984863, + "rewards/rejected": -10.492990493774414, + "step": 4510 + }, + { + "epoch": 2.3652537938252225, + "grad_norm": 3.0886539584738575, + "learning_rate": 6.515730115045339e-08, + "logits/chosen": -2.684044361114502, + "logits/rejected": -2.547943592071533, + "logps/chosen": -336.9212951660156, + "logps/rejected": -375.62701416015625, + "loss": 0.0177, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6338365077972412, + "rewards/margins": 9.30997371673584, + "rewards/rejected": -10.94381046295166, + "step": 4520 + }, + { + "epoch": 2.370486656200942, + "grad_norm": 2.6957279498558315, + "learning_rate": 6.413571553584399e-08, + "logits/chosen": -2.5638465881347656, + "logits/rejected": -2.4773478507995605, + "logps/chosen": -296.7364196777344, + "logps/rejected": -357.34027099609375, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7816386222839355, + "rewards/margins": 7.2836761474609375, + "rewards/rejected": -10.065314292907715, + "step": 4530 + }, + { + "epoch": 2.3757195185766613, + "grad_norm": 12.196071977619017, + "learning_rate": 6.312102220725346e-08, + "logits/chosen": -2.7183175086975098, + "logits/rejected": -2.5329537391662598, + "logps/chosen": -383.75970458984375, + "logps/rejected": -392.5120849609375, + "loss": 0.0072, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3232245445251465, + "rewards/margins": 8.773935317993164, + "rewards/rejected": -11.097159385681152, + "step": 4540 + }, + { + "epoch": 2.380952380952381, + "grad_norm": 3.2820692452777402, + "learning_rate": 6.21132587919036e-08, + "logits/chosen": -2.678323984146118, + "logits/rejected": -2.569042444229126, + "logps/chosen": -320.4778137207031, + "logps/rejected": -380.771728515625, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.385861873626709, + "rewards/margins": 7.826394081115723, + "rewards/rejected": -10.212255477905273, + "step": 4550 + }, + { + "epoch": 2.3861852433281006, + "grad_norm": 0.47087194855750114, + "learning_rate": 6.111246266003859e-08, + "logits/chosen": -2.5360188484191895, + "logits/rejected": -2.4553494453430176, + "logps/chosen": -358.38580322265625, + "logps/rejected": -437.56103515625, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.868281364440918, + "rewards/margins": 9.001848220825195, + "rewards/rejected": -11.870129585266113, + "step": 4560 + }, + { + "epoch": 2.3914181057038197, + "grad_norm": 2.4784234358738675, + "learning_rate": 6.011867092353934e-08, + "logits/chosen": -2.640843152999878, + "logits/rejected": -2.4602251052856445, + "logps/chosen": -320.490966796875, + "logps/rejected": -328.3841247558594, + "loss": 0.0143, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0051512718200684, + "rewards/margins": 8.660110473632812, + "rewards/rejected": -10.665262222290039, + "step": 4570 + }, + { + "epoch": 2.3966509680795394, + "grad_norm": 1.140894037098134, + "learning_rate": 5.9131920434547235e-08, + "logits/chosen": -2.5126101970672607, + "logits/rejected": -2.5434188842773438, + "logps/chosen": -365.4574279785156, + "logps/rejected": -453.28094482421875, + "loss": 0.0174, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.6535847187042236, + "rewards/margins": 8.610297203063965, + "rewards/rejected": -11.263882637023926, + "step": 4580 + }, + { + "epoch": 2.401883830455259, + "grad_norm": 1.5548591343733114, + "learning_rate": 5.8152247784097664e-08, + "logits/chosen": -2.645655632019043, + "logits/rejected": -2.5567336082458496, + "logps/chosen": -361.2928771972656, + "logps/rejected": -426.90740966796875, + "loss": 0.0077, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3525370359420776, + "rewards/margins": 10.510432243347168, + "rewards/rejected": -11.862970352172852, + "step": 4590 + }, + { + "epoch": 2.4071166928309786, + "grad_norm": 1.5180709415239144, + "learning_rate": 5.717968930076289e-08, + "logits/chosen": -2.6549229621887207, + "logits/rejected": -2.6011130809783936, + "logps/chosen": -257.218505859375, + "logps/rejected": -340.18231201171875, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1633079051971436, + "rewards/margins": 9.276578903198242, + "rewards/rejected": -11.439887046813965, + "step": 4600 + }, + { + "epoch": 2.4123495552066982, + "grad_norm": 0.7612063953273129, + "learning_rate": 5.621428104930528e-08, + "logits/chosen": -2.4110500812530518, + "logits/rejected": -2.356020450592041, + "logps/chosen": -236.77297973632812, + "logps/rejected": -355.3910827636719, + "loss": 0.0127, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2140114307403564, + "rewards/margins": 10.140518188476562, + "rewards/rejected": -13.354528427124023, + "step": 4610 + }, + { + "epoch": 2.4175824175824174, + "grad_norm": 3.6824412193515514, + "learning_rate": 5.525605882933965e-08, + "logits/chosen": -2.549696922302246, + "logits/rejected": -2.5376381874084473, + "logps/chosen": -297.99847412109375, + "logps/rejected": -380.44793701171875, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1107754707336426, + "rewards/margins": 9.53602409362793, + "rewards/rejected": -11.646799087524414, + "step": 4620 + }, + { + "epoch": 2.422815279958137, + "grad_norm": 5.031761601034184, + "learning_rate": 5.4305058174005853e-08, + "logits/chosen": -2.4668195247650146, + "logits/rejected": -2.4291744232177734, + "logps/chosen": -418.4002990722656, + "logps/rejected": -465.49273681640625, + "loss": 0.0107, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.6363046169281006, + "rewards/margins": 10.962209701538086, + "rewards/rejected": -12.598515510559082, + "step": 4630 + }, + { + "epoch": 2.4280481423338567, + "grad_norm": 1.2710580407310335, + "learning_rate": 5.33613143486511e-08, + "logits/chosen": -2.5716395378112793, + "logits/rejected": -2.3918890953063965, + "logps/chosen": -351.7398986816406, + "logps/rejected": -351.0377197265625, + "loss": 0.012, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.486716628074646, + "rewards/margins": 9.630250930786133, + "rewards/rejected": -11.116966247558594, + "step": 4640 + }, + { + "epoch": 2.4332810047095763, + "grad_norm": 0.6204091483102194, + "learning_rate": 5.242486234952206e-08, + "logits/chosen": -2.5600039958953857, + "logits/rejected": -2.4350457191467285, + "logps/chosen": -319.65411376953125, + "logps/rejected": -355.7321472167969, + "loss": 0.011, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.7288930416107178, + "rewards/margins": 8.188405990600586, + "rewards/rejected": -10.917299270629883, + "step": 4650 + }, + { + "epoch": 2.4385138670852955, + "grad_norm": 1.4435258368690114, + "learning_rate": 5.149573690246758e-08, + "logits/chosen": -2.5671067237854004, + "logits/rejected": -2.4775681495666504, + "logps/chosen": -341.93096923828125, + "logps/rejected": -381.197998046875, + "loss": 0.0123, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.316157341003418, + "rewards/margins": 9.097368240356445, + "rewards/rejected": -11.41352653503418, + "step": 4660 + }, + { + "epoch": 2.443746729461015, + "grad_norm": 1.4347004985829015, + "learning_rate": 5.057397246165052e-08, + "logits/chosen": -2.5920205116271973, + "logits/rejected": -2.477240800857544, + "logps/chosen": -387.96722412109375, + "logps/rejected": -392.7718200683594, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3120877742767334, + "rewards/margins": 9.547895431518555, + "rewards/rejected": -11.859980583190918, + "step": 4670 + }, + { + "epoch": 2.4489795918367347, + "grad_norm": 2.6229197053857294, + "learning_rate": 4.9659603208270173e-08, + "logits/chosen": -2.6845996379852295, + "logits/rejected": -2.446159839630127, + "logps/chosen": -382.69000244140625, + "logps/rejected": -356.66473388671875, + "loss": 0.0094, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.651026725769043, + "rewards/margins": 7.654710292816162, + "rewards/rejected": -10.305737495422363, + "step": 4680 + }, + { + "epoch": 2.4542124542124544, + "grad_norm": 3.8843985711978384, + "learning_rate": 4.875266304929496e-08, + "logits/chosen": -2.411546230316162, + "logits/rejected": -2.312110424041748, + "logps/chosen": -259.7157897949219, + "logps/rejected": -315.6478271484375, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8747894763946533, + "rewards/margins": 8.53257942199707, + "rewards/rejected": -11.407368659973145, + "step": 4690 + }, + { + "epoch": 2.4594453165881736, + "grad_norm": 5.646092509090637, + "learning_rate": 4.785318561620511e-08, + "logits/chosen": -2.464293956756592, + "logits/rejected": -2.4637789726257324, + "logps/chosen": -268.41290283203125, + "logps/rejected": -385.8301696777344, + "loss": 0.0143, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5808568000793457, + "rewards/margins": 9.485494613647461, + "rewards/rejected": -13.066350936889648, + "step": 4700 + }, + { + "epoch": 2.464678178963893, + "grad_norm": 2.1455296693383383, + "learning_rate": 4.696120426374503e-08, + "logits/chosen": -2.4258320331573486, + "logits/rejected": -2.463759660720825, + "logps/chosen": -279.572509765625, + "logps/rejected": -390.50244140625, + "loss": 0.0137, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.916973829269409, + "rewards/margins": 9.4240083694458, + "rewards/rejected": -12.340982437133789, + "step": 4710 + }, + { + "epoch": 2.469911041339613, + "grad_norm": 36.72082911020149, + "learning_rate": 4.607675206868705e-08, + "logits/chosen": -2.6820778846740723, + "logits/rejected": -2.558410882949829, + "logps/chosen": -298.0652770996094, + "logps/rejected": -333.7609558105469, + "loss": 0.0106, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.282726764678955, + "rewards/margins": 9.007135391235352, + "rewards/rejected": -11.289862632751465, + "step": 4720 + }, + { + "epoch": 2.4751439037153324, + "grad_norm": 2.745357634365693, + "learning_rate": 4.519986182860452e-08, + "logits/chosen": -2.567898988723755, + "logits/rejected": -2.390460252761841, + "logps/chosen": -315.73736572265625, + "logps/rejected": -331.95635986328125, + "loss": 0.0087, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1501383781433105, + "rewards/margins": 8.154587745666504, + "rewards/rejected": -10.304725646972656, + "step": 4730 + }, + { + "epoch": 2.4803767660910516, + "grad_norm": 5.338635929796716, + "learning_rate": 4.433056606065552e-08, + "logits/chosen": -2.5544776916503906, + "logits/rejected": -2.521737575531006, + "logps/chosen": -266.3556213378906, + "logps/rejected": -366.9061279296875, + "loss": 0.0092, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6391263008117676, + "rewards/margins": 8.17832088470459, + "rewards/rejected": -10.817447662353516, + "step": 4740 + }, + { + "epoch": 2.4856096284667712, + "grad_norm": 0.5914614001413402, + "learning_rate": 4.3468897000377427e-08, + "logits/chosen": -2.7590155601501465, + "logits/rejected": -2.6252360343933105, + "logps/chosen": -310.00030517578125, + "logps/rejected": -352.8393249511719, + "loss": 0.0081, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7302494049072266, + "rewards/margins": 8.767389297485352, + "rewards/rejected": -10.497637748718262, + "step": 4750 + }, + { + "epoch": 2.490842490842491, + "grad_norm": 5.805864346843536, + "learning_rate": 4.2614886600491115e-08, + "logits/chosen": -2.664672613143921, + "logits/rejected": -2.5826516151428223, + "logps/chosen": -305.0514831542969, + "logps/rejected": -394.5857849121094, + "loss": 0.0076, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.7474026679992676, + "rewards/margins": 8.99746322631836, + "rewards/rejected": -11.744866371154785, + "step": 4760 + }, + { + "epoch": 2.4960753532182105, + "grad_norm": 2.946932792892311, + "learning_rate": 4.1768566529716415e-08, + "logits/chosen": -2.53653883934021, + "logits/rejected": -2.5299153327941895, + "logps/chosen": -270.1798095703125, + "logps/rejected": -344.4693603515625, + "loss": 0.0155, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.985710620880127, + "rewards/margins": 8.387160301208496, + "rewards/rejected": -11.372869491577148, + "step": 4770 + }, + { + "epoch": 2.50130821559393, + "grad_norm": 1.5669617016018436, + "learning_rate": 4.0929968171597526e-08, + "logits/chosen": -2.5235400199890137, + "logits/rejected": -2.4382717609405518, + "logps/chosen": -299.82257080078125, + "logps/rejected": -312.45953369140625, + "loss": 0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.107891082763672, + "rewards/margins": 9.178747177124023, + "rewards/rejected": -11.286638259887695, + "step": 4780 + }, + { + "epoch": 2.5065410779696493, + "grad_norm": 1.5503193629564926, + "learning_rate": 4.009912262333942e-08, + "logits/chosen": -2.63818621635437, + "logits/rejected": -2.524019479751587, + "logps/chosen": -290.4059753417969, + "logps/rejected": -368.35968017578125, + "loss": 0.0133, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5685508251190186, + "rewards/margins": 9.202163696289062, + "rewards/rejected": -12.770713806152344, + "step": 4790 + }, + { + "epoch": 2.511773940345369, + "grad_norm": 2.4404410351397003, + "learning_rate": 3.927606069465442e-08, + "logits/chosen": -2.5241026878356934, + "logits/rejected": -2.3206825256347656, + "logps/chosen": -339.6543884277344, + "logps/rejected": -364.41876220703125, + "loss": 0.0123, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0121209621429443, + "rewards/margins": 9.030576705932617, + "rewards/rejected": -12.042699813842773, + "step": 4800 + }, + { + "epoch": 2.5170068027210886, + "grad_norm": 1.832113698737391, + "learning_rate": 3.8460812906620037e-08, + "logits/chosen": -2.6327362060546875, + "logits/rejected": -2.5126953125, + "logps/chosen": -330.55572509765625, + "logps/rejected": -386.8505554199219, + "loss": 0.0064, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.357273578643799, + "rewards/margins": 8.54776668548584, + "rewards/rejected": -10.905040740966797, + "step": 4810 + }, + { + "epoch": 2.5222396650968077, + "grad_norm": 3.2729938935364786, + "learning_rate": 3.765340949054696e-08, + "logits/chosen": -2.5752532482147217, + "logits/rejected": -2.4276397228240967, + "logps/chosen": -348.06451416015625, + "logps/rejected": -348.2574462890625, + "loss": 0.0121, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.5180726051330566, + "rewards/margins": 8.617886543273926, + "rewards/rejected": -11.135960578918457, + "step": 4820 + }, + { + "epoch": 2.5274725274725274, + "grad_norm": 2.5047863029948156, + "learning_rate": 3.685388038685811e-08, + "logits/chosen": -2.6267170906066895, + "logits/rejected": -2.539578914642334, + "logps/chosen": -384.8499755859375, + "logps/rejected": -444.0081481933594, + "loss": 0.0125, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9967598915100098, + "rewards/margins": 9.045783042907715, + "rewards/rejected": -12.042542457580566, + "step": 4830 + }, + { + "epoch": 2.532705389848247, + "grad_norm": 3.2326566350857786, + "learning_rate": 3.60622552439783e-08, + "logits/chosen": -2.4952991008758545, + "logits/rejected": -2.404175043106079, + "logps/chosen": -300.92340087890625, + "logps/rejected": -385.161865234375, + "loss": 0.0187, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.1940178871154785, + "rewards/margins": 10.167230606079102, + "rewards/rejected": -12.361248016357422, + "step": 4840 + }, + { + "epoch": 2.5379382522239666, + "grad_norm": 6.726664707588461, + "learning_rate": 3.527856341723479e-08, + "logits/chosen": -2.4801971912384033, + "logits/rejected": -2.4623141288757324, + "logps/chosen": -258.33795166015625, + "logps/rejected": -395.36358642578125, + "loss": 0.0162, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.240489482879639, + "rewards/margins": 9.309966087341309, + "rewards/rejected": -13.550456047058105, + "step": 4850 + }, + { + "epoch": 2.5431711145996863, + "grad_norm": 6.788349753702106, + "learning_rate": 3.4502833967768816e-08, + "logits/chosen": -2.5217742919921875, + "logits/rejected": -2.5007529258728027, + "logps/chosen": -360.753662109375, + "logps/rejected": -389.7902526855469, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.387969970703125, + "rewards/margins": 8.528416633605957, + "rewards/rejected": -11.916386604309082, + "step": 4860 + }, + { + "epoch": 2.5484039769754054, + "grad_norm": 0.5141841992567054, + "learning_rate": 3.373509566145793e-08, + "logits/chosen": -2.5920393466949463, + "logits/rejected": -2.4291844367980957, + "logps/chosen": -409.9930725097656, + "logps/rejected": -404.51812744140625, + "loss": 0.0146, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0096020698547363, + "rewards/margins": 9.102025985717773, + "rewards/rejected": -12.111627578735352, + "step": 4870 + }, + { + "epoch": 2.553636839351125, + "grad_norm": 1.9901574241674767, + "learning_rate": 3.2975376967849104e-08, + "logits/chosen": -2.604759931564331, + "logits/rejected": -2.465873956680298, + "logps/chosen": -292.3291015625, + "logps/rejected": -366.34002685546875, + "loss": 0.0141, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3565335273742676, + "rewards/margins": 8.622732162475586, + "rewards/rejected": -10.979265213012695, + "step": 4880 + }, + { + "epoch": 2.5588697017268447, + "grad_norm": 2.635976519171494, + "learning_rate": 3.222370605910332e-08, + "logits/chosen": -2.5512473583221436, + "logits/rejected": -2.478119134902954, + "logps/chosen": -325.674072265625, + "logps/rejected": -369.12762451171875, + "loss": 0.0117, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.2583580017089844, + "rewards/margins": 9.271775245666504, + "rewards/rejected": -11.530134201049805, + "step": 4890 + }, + { + "epoch": 2.564102564102564, + "grad_norm": 9.016425567948197, + "learning_rate": 3.1480110808950746e-08, + "logits/chosen": -2.397136926651001, + "logits/rejected": -2.472404956817627, + "logps/chosen": -286.8371276855469, + "logps/rejected": -421.8182067871094, + "loss": 0.0138, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.3170464038848877, + "rewards/margins": 9.02189826965332, + "rewards/rejected": -12.338946342468262, + "step": 4900 + }, + { + "epoch": 2.5693354264782835, + "grad_norm": 0.6713458458157762, + "learning_rate": 3.07446187916568e-08, + "logits/chosen": -2.6027634143829346, + "logits/rejected": -2.5397934913635254, + "logps/chosen": -318.99664306640625, + "logps/rejected": -387.5476989746094, + "loss": 0.0117, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.911219596862793, + "rewards/margins": 8.30928897857666, + "rewards/rejected": -11.220507621765137, + "step": 4910 + }, + { + "epoch": 2.574568288854003, + "grad_norm": 1.1125319314771425, + "learning_rate": 3.001725728100021e-08, + "logits/chosen": -2.6207592487335205, + "logits/rejected": -2.4813547134399414, + "logps/chosen": -344.18505859375, + "logps/rejected": -352.73004150390625, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0130362510681152, + "rewards/margins": 9.275091171264648, + "rewards/rejected": -12.288125991821289, + "step": 4920 + }, + { + "epoch": 2.5798011512297228, + "grad_norm": 5.299981453787648, + "learning_rate": 2.9298053249261238e-08, + "logits/chosen": -2.5177340507507324, + "logits/rejected": -2.5581235885620117, + "logps/chosen": -242.15701293945312, + "logps/rejected": -332.94146728515625, + "loss": 0.0105, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.712165117263794, + "rewards/margins": 8.18096923828125, + "rewards/rejected": -11.893133163452148, + "step": 4930 + }, + { + "epoch": 2.5850340136054424, + "grad_norm": 4.723758865427596, + "learning_rate": 2.8587033366221534e-08, + "logits/chosen": -2.496917963027954, + "logits/rejected": -2.480437994003296, + "logps/chosen": -273.2696533203125, + "logps/rejected": -365.79193115234375, + "loss": 0.0194, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.484445095062256, + "rewards/margins": 8.450800895690918, + "rewards/rejected": -11.935247421264648, + "step": 4940 + }, + { + "epoch": 2.5902668759811616, + "grad_norm": 2.0339103072849096, + "learning_rate": 2.7884223998175248e-08, + "logits/chosen": -2.616367816925049, + "logits/rejected": -2.547131061553955, + "logps/chosen": -285.77667236328125, + "logps/rejected": -393.36920166015625, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.509310245513916, + "rewards/margins": 8.665257453918457, + "rewards/rejected": -12.174566268920898, + "step": 4950 + }, + { + "epoch": 2.595499738356881, + "grad_norm": 2.5244815062612562, + "learning_rate": 2.718965120695141e-08, + "logits/chosen": -2.6186976432800293, + "logits/rejected": -2.637716054916382, + "logps/chosen": -319.0390625, + "logps/rejected": -411.00177001953125, + "loss": 0.0179, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.704867362976074, + "rewards/margins": 8.904386520385742, + "rewards/rejected": -11.6092529296875, + "step": 4960 + }, + { + "epoch": 2.600732600732601, + "grad_norm": 2.9436991950391382, + "learning_rate": 2.6503340748947083e-08, + "logits/chosen": -2.6223256587982178, + "logits/rejected": -2.651494026184082, + "logps/chosen": -322.9923095703125, + "logps/rejected": -477.2806701660156, + "loss": 0.0158, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5072782039642334, + "rewards/margins": 9.954888343811035, + "rewards/rejected": -12.462165832519531, + "step": 4970 + }, + { + "epoch": 2.60596546310832, + "grad_norm": 1.6420492628107515, + "learning_rate": 2.5825318074172763e-08, + "logits/chosen": -2.7298004627227783, + "logits/rejected": -2.5805752277374268, + "logps/chosen": -316.26580810546875, + "logps/rejected": -383.5366516113281, + "loss": 0.012, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.4945201873779297, + "rewards/margins": 9.040359497070312, + "rewards/rejected": -11.534879684448242, + "step": 4980 + }, + { + "epoch": 2.6111983254840396, + "grad_norm": 2.9897115138402066, + "learning_rate": 2.5155608325308358e-08, + "logits/chosen": -2.6544556617736816, + "logits/rejected": -2.496344804763794, + "logps/chosen": -357.4461975097656, + "logps/rejected": -405.1456298828125, + "loss": 0.0087, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.524782180786133, + "rewards/margins": 9.047494888305664, + "rewards/rejected": -11.572277069091797, + "step": 4990 + }, + { + "epoch": 2.6164311878597593, + "grad_norm": 1.2733440450810523, + "learning_rate": 2.4494236336770695e-08, + "logits/chosen": -2.658801555633545, + "logits/rejected": -2.6000139713287354, + "logps/chosen": -301.155029296875, + "logps/rejected": -394.56976318359375, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3924107551574707, + "rewards/margins": 8.720831871032715, + "rewards/rejected": -11.113243103027344, + "step": 5000 + }, + { + "epoch": 2.621664050235479, + "grad_norm": 8.752405545719595, + "learning_rate": 2.3841226633792983e-08, + "logits/chosen": -2.5913498401641846, + "logits/rejected": -2.4253532886505127, + "logps/chosen": -346.916259765625, + "logps/rejected": -367.08001708984375, + "loss": 0.0064, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.553851842880249, + "rewards/margins": 7.920147895812988, + "rewards/rejected": -10.473998069763184, + "step": 5010 + }, + { + "epoch": 2.6268969126111985, + "grad_norm": 5.937834540038537, + "learning_rate": 2.319660343151511e-08, + "logits/chosen": -2.606656551361084, + "logits/rejected": -2.5195579528808594, + "logps/chosen": -300.65838623046875, + "logps/rejected": -324.0450439453125, + "loss": 0.008, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.727539539337158, + "rewards/margins": 8.226611137390137, + "rewards/rejected": -10.954151153564453, + "step": 5020 + }, + { + "epoch": 2.6321297749869177, + "grad_norm": 4.19168135519844, + "learning_rate": 2.2560390634085715e-08, + "logits/chosen": -2.38765549659729, + "logits/rejected": -2.407247543334961, + "logps/chosen": -271.3846740722656, + "logps/rejected": -430.45233154296875, + "loss": 0.016, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.0975801944732666, + "rewards/margins": 9.703760147094727, + "rewards/rejected": -12.801342964172363, + "step": 5030 + }, + { + "epoch": 2.6373626373626373, + "grad_norm": 4.438339211076942, + "learning_rate": 2.1932611833775843e-08, + "logits/chosen": -2.5761990547180176, + "logits/rejected": -2.4454965591430664, + "logps/chosen": -284.6046447753906, + "logps/rejected": -365.8029479980469, + "loss": 0.0115, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3015778064727783, + "rewards/margins": 10.762308120727539, + "rewards/rejected": -13.063886642456055, + "step": 5040 + }, + { + "epoch": 2.642595499738357, + "grad_norm": 9.842355468779951, + "learning_rate": 2.1313290310103897e-08, + "logits/chosen": -2.562697410583496, + "logits/rejected": -2.459880828857422, + "logps/chosen": -263.71429443359375, + "logps/rejected": -355.89349365234375, + "loss": 0.0095, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0439720153808594, + "rewards/margins": 8.155313491821289, + "rewards/rejected": -11.199285507202148, + "step": 5050 + }, + { + "epoch": 2.647828362114076, + "grad_norm": 1.3521199827350725, + "learning_rate": 2.0702449028972696e-08, + "logits/chosen": -2.510012626647949, + "logits/rejected": -2.523059368133545, + "logps/chosen": -317.8623962402344, + "logps/rejected": -404.1944885253906, + "loss": 0.0143, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.8070850372314453, + "rewards/margins": 8.632943153381348, + "rewards/rejected": -11.440028190612793, + "step": 5060 + }, + { + "epoch": 2.6530612244897958, + "grad_norm": 1.463322874368762, + "learning_rate": 2.0100110641817547e-08, + "logits/chosen": -2.5922181606292725, + "logits/rejected": -2.414249897003174, + "logps/chosen": -339.21002197265625, + "logps/rejected": -382.40435791015625, + "loss": 0.0107, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.406454563140869, + "rewards/margins": 10.262566566467285, + "rewards/rejected": -13.66901969909668, + "step": 5070 + }, + { + "epoch": 2.6582940868655154, + "grad_norm": 2.1574773610595863, + "learning_rate": 1.9506297484766427e-08, + "logits/chosen": -2.666602611541748, + "logits/rejected": -2.5114593505859375, + "logps/chosen": -420.3592834472656, + "logps/rejected": -346.78863525390625, + "loss": 0.012, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.242121458053589, + "rewards/margins": 9.135951042175293, + "rewards/rejected": -11.378072738647461, + "step": 5080 + }, + { + "epoch": 2.663526949241235, + "grad_norm": 36.010723824965844, + "learning_rate": 1.8921031577811692e-08, + "logits/chosen": -2.354315996170044, + "logits/rejected": -2.2910566329956055, + "logps/chosen": -299.4645080566406, + "logps/rejected": -375.19195556640625, + "loss": 0.0111, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.713761568069458, + "rewards/margins": 8.878849983215332, + "rewards/rejected": -12.592611312866211, + "step": 5090 + }, + { + "epoch": 2.6687598116169546, + "grad_norm": 1.0986132222203888, + "learning_rate": 1.834433462399351e-08, + "logits/chosen": -2.6104464530944824, + "logits/rejected": -2.4752371311187744, + "logps/chosen": -317.61346435546875, + "logps/rejected": -378.275390625, + "loss": 0.0108, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.577064037322998, + "rewards/margins": 8.46171760559082, + "rewards/rejected": -11.03878116607666, + "step": 5100 + }, + { + "epoch": 2.6739926739926743, + "grad_norm": 7.0005850905584275, + "learning_rate": 1.7776228008594962e-08, + "logits/chosen": -2.6077401638031006, + "logits/rejected": -2.5730576515197754, + "logps/chosen": -302.76171875, + "logps/rejected": -442.8404846191406, + "loss": 0.0092, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5833659172058105, + "rewards/margins": 9.618488311767578, + "rewards/rejected": -12.201854705810547, + "step": 5110 + }, + { + "epoch": 2.6792255363683934, + "grad_norm": 3.361896665859258, + "learning_rate": 1.721673279834926e-08, + "logits/chosen": -2.560586929321289, + "logits/rejected": -2.4337735176086426, + "logps/chosen": -304.4768371582031, + "logps/rejected": -360.99249267578125, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9418601989746094, + "rewards/margins": 8.619203567504883, + "rewards/rejected": -12.561063766479492, + "step": 5120 + }, + { + "epoch": 2.684458398744113, + "grad_norm": 2.8782909461152175, + "learning_rate": 1.666586974065831e-08, + "logits/chosen": -2.588582754135132, + "logits/rejected": -2.5598676204681396, + "logps/chosen": -331.9123229980469, + "logps/rejected": -444.1719665527344, + "loss": 0.0089, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.416478395462036, + "rewards/margins": 9.728887557983398, + "rewards/rejected": -12.145366668701172, + "step": 5130 + }, + { + "epoch": 2.6896912611198327, + "grad_norm": 3.06657213255932, + "learning_rate": 1.6123659262823497e-08, + "logits/chosen": -2.5465493202209473, + "logits/rejected": -2.467817544937134, + "logps/chosen": -324.09197998046875, + "logps/rejected": -339.67120361328125, + "loss": 0.0121, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.714784622192383, + "rewards/margins": 8.506978988647461, + "rewards/rejected": -11.221763610839844, + "step": 5140 + }, + { + "epoch": 2.694924123495552, + "grad_norm": 10.956318628505112, + "learning_rate": 1.5590121471288104e-08, + "logits/chosen": -2.489410877227783, + "logits/rejected": -2.5203540325164795, + "logps/chosen": -240.01138305664062, + "logps/rejected": -359.5928039550781, + "loss": 0.0103, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6659531593322754, + "rewards/margins": 10.252822875976562, + "rewards/rejected": -12.918774604797363, + "step": 5150 + }, + { + "epoch": 2.7001569858712715, + "grad_norm": 4.341827826318645, + "learning_rate": 1.5065276150891787e-08, + "logits/chosen": -2.4828386306762695, + "logits/rejected": -2.436096429824829, + "logps/chosen": -274.3287658691406, + "logps/rejected": -380.4012145996094, + "loss": 0.0134, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.955925941467285, + "rewards/margins": 9.629919052124023, + "rewards/rejected": -12.585844993591309, + "step": 5160 + }, + { + "epoch": 2.705389848246991, + "grad_norm": 2.168504147683603, + "learning_rate": 1.4549142764136768e-08, + "logits/chosen": -2.496258020401001, + "logits/rejected": -2.3795084953308105, + "logps/chosen": -290.3370056152344, + "logps/rejected": -382.50799560546875, + "loss": 0.0144, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.412745952606201, + "rewards/margins": 9.241251945495605, + "rewards/rejected": -12.653998374938965, + "step": 5170 + }, + { + "epoch": 2.7106227106227108, + "grad_norm": 3.1888282613852583, + "learning_rate": 1.4041740450466383e-08, + "logits/chosen": -2.5063326358795166, + "logits/rejected": -2.498027801513672, + "logps/chosen": -305.488037109375, + "logps/rejected": -390.25543212890625, + "loss": 0.0139, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.42686128616333, + "rewards/margins": 8.476984977722168, + "rewards/rejected": -11.90384578704834, + "step": 5180 + }, + { + "epoch": 2.7158555729984304, + "grad_norm": 1.5952711132379955, + "learning_rate": 1.3543088025555094e-08, + "logits/chosen": -2.4821343421936035, + "logits/rejected": -2.4512901306152344, + "logps/chosen": -296.63531494140625, + "logps/rejected": -329.72296142578125, + "loss": 0.0129, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.5127968788146973, + "rewards/margins": 8.06564998626709, + "rewards/rejected": -10.578447341918945, + "step": 5190 + }, + { + "epoch": 2.7210884353741496, + "grad_norm": 1.4040294684460386, + "learning_rate": 1.3053203980610744e-08, + "logits/chosen": -2.442251682281494, + "logits/rejected": -2.433336019515991, + "logps/chosen": -353.4270935058594, + "logps/rejected": -437.695556640625, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.559645175933838, + "rewards/margins": 10.471541404724121, + "rewards/rejected": -13.031187057495117, + "step": 5200 + }, + { + "epoch": 2.726321297749869, + "grad_norm": 5.531095570318318, + "learning_rate": 1.2572106481689243e-08, + "logits/chosen": -2.5265042781829834, + "logits/rejected": -2.406580686569214, + "logps/chosen": -278.25384521484375, + "logps/rejected": -334.8019714355469, + "loss": 0.0164, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.6399970054626465, + "rewards/margins": 8.102693557739258, + "rewards/rejected": -11.742691040039062, + "step": 5210 + }, + { + "epoch": 2.731554160125589, + "grad_norm": 0.7569273137668125, + "learning_rate": 1.2099813369020467e-08, + "logits/chosen": -2.6435656547546387, + "logits/rejected": -2.569716215133667, + "logps/chosen": -326.84368896484375, + "logps/rejected": -419.45440673828125, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.801252603530884, + "rewards/margins": 8.470110893249512, + "rewards/rejected": -11.271363258361816, + "step": 5220 + }, + { + "epoch": 2.736787022501308, + "grad_norm": 1.015615968536147, + "learning_rate": 1.1636342156346846e-08, + "logits/chosen": -2.6142373085021973, + "logits/rejected": -2.4517297744750977, + "logps/chosen": -296.00433349609375, + "logps/rejected": -368.3468017578125, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2934257984161377, + "rewards/margins": 8.836563110351562, + "rewards/rejected": -12.129989624023438, + "step": 5230 + }, + { + "epoch": 2.7420198848770276, + "grad_norm": 5.469101579410441, + "learning_rate": 1.1181710030274043e-08, + "logits/chosen": -2.3535382747650146, + "logits/rejected": -2.2188241481781006, + "logps/chosen": -249.70327758789062, + "logps/rejected": -329.28094482421875, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7749602794647217, + "rewards/margins": 10.26407241821289, + "rewards/rejected": -13.039031982421875, + "step": 5240 + }, + { + "epoch": 2.7472527472527473, + "grad_norm": 1.8575488304333363, + "learning_rate": 1.0735933849633561e-08, + "logits/chosen": -2.6143882274627686, + "logits/rejected": -2.464627504348755, + "logps/chosen": -348.214111328125, + "logps/rejected": -357.53399658203125, + "loss": 0.0124, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9914422035217285, + "rewards/margins": 9.351872444152832, + "rewards/rejected": -12.343315124511719, + "step": 5250 + }, + { + "epoch": 2.752485609628467, + "grad_norm": 0.8727476983735399, + "learning_rate": 1.0299030144857445e-08, + "logits/chosen": -2.4906797409057617, + "logits/rejected": -2.5234460830688477, + "logps/chosen": -261.9267883300781, + "logps/rejected": -377.07049560546875, + "loss": 0.0109, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.1616902351379395, + "rewards/margins": 10.171551704406738, + "rewards/rejected": -13.333239555358887, + "step": 5260 + }, + { + "epoch": 2.7577184720041865, + "grad_norm": 1.4211558713802048, + "learning_rate": 9.871015117365516e-09, + "logits/chosen": -2.536323070526123, + "logits/rejected": -2.4908180236816406, + "logps/chosen": -265.8121643066406, + "logps/rejected": -334.70745849609375, + "loss": 0.0102, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.533576488494873, + "rewards/margins": 8.419042587280273, + "rewards/rejected": -11.952618598937988, + "step": 5270 + }, + { + "epoch": 2.7629513343799057, + "grad_norm": 1.6323427663620214, + "learning_rate": 9.451904638964447e-09, + "logits/chosen": -2.5910801887512207, + "logits/rejected": -2.4406516551971436, + "logps/chosen": -348.68878173828125, + "logps/rejected": -361.7004089355469, + "loss": 0.0107, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7282471656799316, + "rewards/margins": 8.851235389709473, + "rewards/rejected": -11.579483032226562, + "step": 5280 + }, + { + "epoch": 2.7681841967556253, + "grad_norm": 14.086043155752852, + "learning_rate": 9.041714251259214e-09, + "logits/chosen": -2.4262852668762207, + "logits/rejected": -2.2455453872680664, + "logps/chosen": -325.5466613769531, + "logps/rejected": -377.5086364746094, + "loss": 0.0092, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.007089376449585, + "rewards/margins": 9.666694641113281, + "rewards/rejected": -12.673784255981445, + "step": 5290 + }, + { + "epoch": 2.773417059131345, + "grad_norm": 35.01863706023192, + "learning_rate": 8.640459165076857e-09, + "logits/chosen": -2.4602913856506348, + "logits/rejected": -2.5747923851013184, + "logps/chosen": -250.23727416992188, + "logps/rejected": -391.7991638183594, + "loss": 0.0119, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3855373859405518, + "rewards/margins": 9.337056159973145, + "rewards/rejected": -12.722593307495117, + "step": 5300 + }, + { + "epoch": 2.778649921507064, + "grad_norm": 6.363328775308636, + "learning_rate": 8.248154259902246e-09, + "logits/chosen": -2.5952470302581787, + "logits/rejected": -2.371581554412842, + "logps/chosen": -329.0868835449219, + "logps/rejected": -326.04925537109375, + "loss": 0.011, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -4.090206623077393, + "rewards/margins": 7.8073248863220215, + "rewards/rejected": -11.897531509399414, + "step": 5310 + }, + { + "epoch": 2.7838827838827838, + "grad_norm": 0.9592032356713021, + "learning_rate": 7.86481408332651e-09, + "logits/chosen": -2.581714630126953, + "logits/rejected": -2.4266419410705566, + "logps/chosen": -266.65960693359375, + "logps/rejected": -347.1474914550781, + "loss": 0.0101, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.535461902618408, + "rewards/margins": 7.9888787269592285, + "rewards/rejected": -11.524340629577637, + "step": 5320 + }, + { + "epoch": 2.7891156462585034, + "grad_norm": 1.8356349413284074, + "learning_rate": 7.490452850507506e-09, + "logits/chosen": -2.5398435592651367, + "logits/rejected": -2.4736721515655518, + "logps/chosen": -305.2430114746094, + "logps/rejected": -341.65008544921875, + "loss": 0.0124, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.192692995071411, + "rewards/margins": 8.169621467590332, + "rewards/rejected": -11.362314224243164, + "step": 5330 + }, + { + "epoch": 2.794348508634223, + "grad_norm": 2.3463398326078164, + "learning_rate": 7.1250844436426535e-09, + "logits/chosen": -2.4499154090881348, + "logits/rejected": -2.3645966053009033, + "logps/chosen": -262.638671875, + "logps/rejected": -363.5102233886719, + "loss": 0.0125, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.4711711406707764, + "rewards/margins": 10.573376655578613, + "rewards/rejected": -14.044549942016602, + "step": 5340 + }, + { + "epoch": 2.7995813710099426, + "grad_norm": 6.784060648739115, + "learning_rate": 6.768722411454153e-09, + "logits/chosen": -2.477108955383301, + "logits/rejected": -2.440368890762329, + "logps/chosen": -284.68536376953125, + "logps/rejected": -355.6117858886719, + "loss": 0.0188, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4868836402893066, + "rewards/margins": 8.629325866699219, + "rewards/rejected": -12.116209030151367, + "step": 5350 + }, + { + "epoch": 2.804814233385662, + "grad_norm": 2.5718272328645564, + "learning_rate": 6.421379968686663e-09, + "logits/chosen": -2.6738717555999756, + "logits/rejected": -2.4878640174865723, + "logps/chosen": -408.24224853515625, + "logps/rejected": -405.9677429199219, + "loss": 0.0086, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7108867168426514, + "rewards/margins": 7.706369876861572, + "rewards/rejected": -10.417256355285645, + "step": 5360 + }, + { + "epoch": 2.8100470957613815, + "grad_norm": 0.5429034331939131, + "learning_rate": 6.083069995617113e-09, + "logits/chosen": -2.4864702224731445, + "logits/rejected": -2.316953659057617, + "logps/chosen": -293.7408142089844, + "logps/rejected": -360.5414123535156, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3920187950134277, + "rewards/margins": 8.823633193969727, + "rewards/rejected": -12.215652465820312, + "step": 5370 + }, + { + "epoch": 2.815279958137101, + "grad_norm": 3.2205880935954916, + "learning_rate": 5.753805037577192e-09, + "logits/chosen": -2.3979969024658203, + "logits/rejected": -2.4518043994903564, + "logps/chosen": -286.2723083496094, + "logps/rejected": -358.1659851074219, + "loss": 0.0105, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3278305530548096, + "rewards/margins": 9.405172348022461, + "rewards/rejected": -11.733002662658691, + "step": 5380 + }, + { + "epoch": 2.8205128205128203, + "grad_norm": 4.313345968600854, + "learning_rate": 5.433597304488113e-09, + "logits/chosen": -2.5451204776763916, + "logits/rejected": -2.3840582370758057, + "logps/chosen": -340.4458923339844, + "logps/rejected": -439.3768615722656, + "loss": 0.0194, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.964724063873291, + "rewards/margins": 9.207846641540527, + "rewards/rejected": -12.172571182250977, + "step": 5390 + }, + { + "epoch": 2.82574568288854, + "grad_norm": 5.264269877259486, + "learning_rate": 5.122458670407836e-09, + "logits/chosen": -2.6159005165100098, + "logits/rejected": -2.4100537300109863, + "logps/chosen": -292.0515441894531, + "logps/rejected": -290.3874206542969, + "loss": 0.0097, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.151519775390625, + "rewards/margins": 6.9995832443237305, + "rewards/rejected": -10.151103973388672, + "step": 5400 + }, + { + "epoch": 2.8309785452642595, + "grad_norm": 4.515185713671941, + "learning_rate": 4.820400673090669e-09, + "logits/chosen": -2.5118215084075928, + "logits/rejected": -2.586270570755005, + "logps/chosen": -360.7964782714844, + "logps/rejected": -450.97003173828125, + "loss": 0.007, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.430558443069458, + "rewards/margins": 10.006916999816895, + "rewards/rejected": -13.437475204467773, + "step": 5410 + }, + { + "epoch": 2.836211407639979, + "grad_norm": 3.1818990142647396, + "learning_rate": 4.5274345135595525e-09, + "logits/chosen": -2.603188991546631, + "logits/rejected": -2.5455174446105957, + "logps/chosen": -384.5435485839844, + "logps/rejected": -441.33270263671875, + "loss": 0.0106, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.382079601287842, + "rewards/margins": 10.478212356567383, + "rewards/rejected": -12.860292434692383, + "step": 5420 + }, + { + "epoch": 2.8414442700156988, + "grad_norm": 2.7363660749645393, + "learning_rate": 4.243571055690648e-09, + "logits/chosen": -2.677584409713745, + "logits/rejected": -2.6243412494659424, + "logps/chosen": -386.9283752441406, + "logps/rejected": -446.9391174316406, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8604724407196045, + "rewards/margins": 9.12932014465332, + "rewards/rejected": -11.98979377746582, + "step": 5430 + }, + { + "epoch": 2.846677132391418, + "grad_norm": 30.216614020048112, + "learning_rate": 3.968820825810431e-09, + "logits/chosen": -2.334294319152832, + "logits/rejected": -2.228971242904663, + "logps/chosen": -290.4869079589844, + "logps/rejected": -327.9725646972656, + "loss": 0.0152, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.2857768535614014, + "rewards/margins": 8.18476676940918, + "rewards/rejected": -11.47054386138916, + "step": 5440 + }, + { + "epoch": 2.8519099947671376, + "grad_norm": 10.307838000491031, + "learning_rate": 3.7031940123053997e-09, + "logits/chosen": -2.4512031078338623, + "logits/rejected": -2.3894190788269043, + "logps/chosen": -277.6960754394531, + "logps/rejected": -375.0935363769531, + "loss": 0.0111, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.572457790374756, + "rewards/margins": 8.654683113098145, + "rewards/rejected": -12.227140426635742, + "step": 5450 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 1.1118530816981196, + "learning_rate": 3.4467004652442842e-09, + "logits/chosen": -2.4142355918884277, + "logits/rejected": -2.3406777381896973, + "logps/chosen": -250.6748504638672, + "logps/rejected": -334.2477111816406, + "loss": 0.0099, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2360546588897705, + "rewards/margins": 8.070561408996582, + "rewards/rejected": -11.306615829467773, + "step": 5460 + }, + { + "epoch": 2.8623757195185764, + "grad_norm": 7.408246277035073, + "learning_rate": 3.1993496960127653e-09, + "logits/chosen": -2.5402913093566895, + "logits/rejected": -2.471750020980835, + "logps/chosen": -267.28509521484375, + "logps/rejected": -328.6303405761719, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3718109130859375, + "rewards/margins": 8.653961181640625, + "rewards/rejected": -12.025772094726562, + "step": 5470 + }, + { + "epoch": 2.867608581894296, + "grad_norm": 2.2244454243376888, + "learning_rate": 2.9611508769606663e-09, + "logits/chosen": -2.6168313026428223, + "logits/rejected": -2.6207103729248047, + "logps/chosen": -335.3636169433594, + "logps/rejected": -391.04510498046875, + "loss": 0.0135, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4004509449005127, + "rewards/margins": 7.857171535491943, + "rewards/rejected": -11.257621765136719, + "step": 5480 + }, + { + "epoch": 2.8728414442700156, + "grad_norm": 0.8061031545246838, + "learning_rate": 2.7321128410620344e-09, + "logits/chosen": -2.4226486682891846, + "logits/rejected": -2.2207863330841064, + "logps/chosen": -266.69989013671875, + "logps/rejected": -311.1686096191406, + "loss": 0.013, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3387820720672607, + "rewards/margins": 7.805467128753662, + "rewards/rejected": -11.144248008728027, + "step": 5490 + }, + { + "epoch": 2.8780743066457353, + "grad_norm": 1.237817233976751, + "learning_rate": 2.5122440815873724e-09, + "logits/chosen": -2.54459547996521, + "logits/rejected": -2.35339617729187, + "logps/chosen": -372.35467529296875, + "logps/rejected": -341.9369201660156, + "loss": 0.0096, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.0608162879943848, + "rewards/margins": 8.385961532592773, + "rewards/rejected": -11.44677734375, + "step": 5500 + }, + { + "epoch": 2.883307169021455, + "grad_norm": 1.4592021216320497, + "learning_rate": 2.301552751788838e-09, + "logits/chosen": -2.401576519012451, + "logits/rejected": -2.4742298126220703, + "logps/chosen": -301.2481689453125, + "logps/rejected": -432.27459716796875, + "loss": 0.0166, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.694408416748047, + "rewards/margins": 9.245023727416992, + "rewards/rejected": -11.939432144165039, + "step": 5510 + }, + { + "epoch": 2.8885400313971745, + "grad_norm": 6.7253892662738854, + "learning_rate": 2.1000466645978433e-09, + "logits/chosen": -2.612203598022461, + "logits/rejected": -2.5522124767303467, + "logps/chosen": -252.40243530273438, + "logps/rejected": -316.52178955078125, + "loss": 0.011, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.106694221496582, + "rewards/margins": 7.8931169509887695, + "rewards/rejected": -10.999811172485352, + "step": 5520 + }, + { + "epoch": 2.8937728937728937, + "grad_norm": 2.521063730398925, + "learning_rate": 1.9077332923353728e-09, + "logits/chosen": -2.5552637577056885, + "logits/rejected": -2.4974677562713623, + "logps/chosen": -348.4877014160156, + "logps/rejected": -410.57403564453125, + "loss": 0.0072, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.973598003387451, + "rewards/margins": 8.762843132019043, + "rewards/rejected": -11.736442565917969, + "step": 5530 + }, + { + "epoch": 2.8990057561486133, + "grad_norm": 1.8208189324306472, + "learning_rate": 1.7246197664347872e-09, + "logits/chosen": -2.682417392730713, + "logits/rejected": -2.5886688232421875, + "logps/chosen": -334.28973388671875, + "logps/rejected": -496.99957275390625, + "loss": 0.013, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5573782920837402, + "rewards/margins": 9.289700508117676, + "rewards/rejected": -11.847077369689941, + "step": 5540 + }, + { + "epoch": 2.904238618524333, + "grad_norm": 1.8289670826499012, + "learning_rate": 1.5507128771775346e-09, + "logits/chosen": -2.4755916595458984, + "logits/rejected": -2.3878397941589355, + "logps/chosen": -306.183837890625, + "logps/rejected": -390.5394287109375, + "loss": 0.0101, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.636819362640381, + "rewards/margins": 8.681589126586914, + "rewards/rejected": -12.318408012390137, + "step": 5550 + }, + { + "epoch": 2.909471480900052, + "grad_norm": 1.3092391514959099, + "learning_rate": 1.3860190734411858e-09, + "logits/chosen": -2.5638322830200195, + "logits/rejected": -2.4192492961883545, + "logps/chosen": -346.40765380859375, + "logps/rejected": -422.3858947753906, + "loss": 0.0161, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.551558256149292, + "rewards/margins": 9.023459434509277, + "rewards/rejected": -11.575017929077148, + "step": 5560 + }, + { + "epoch": 2.9147043432757718, + "grad_norm": 6.938403304922241, + "learning_rate": 1.2305444624604034e-09, + "logits/chosen": -2.6479740142822266, + "logits/rejected": -2.628225088119507, + "logps/chosen": -338.06195068359375, + "logps/rejected": -417.61456298828125, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0014538764953613, + "rewards/margins": 10.071534156799316, + "rewards/rejected": -12.07298755645752, + "step": 5570 + }, + { + "epoch": 2.9199372056514914, + "grad_norm": 1.5146419848534598, + "learning_rate": 1.0842948096004835e-09, + "logits/chosen": -2.5002403259277344, + "logits/rejected": -2.4548568725585938, + "logps/chosen": -284.13372802734375, + "logps/rejected": -385.5391845703125, + "loss": 0.0174, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.2518792152404785, + "rewards/margins": 8.641477584838867, + "rewards/rejected": -11.893355369567871, + "step": 5580 + }, + { + "epoch": 2.925170068027211, + "grad_norm": 7.445325284727672, + "learning_rate": 9.472755381434161e-10, + "logits/chosen": -2.4903171062469482, + "logits/rejected": -2.287623167037964, + "logps/chosen": -323.30670166015625, + "logps/rejected": -314.6568603515625, + "loss": 0.0135, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.513911008834839, + "rewards/margins": 9.13040542602539, + "rewards/rejected": -11.644315719604492, + "step": 5590 + }, + { + "epoch": 2.9304029304029307, + "grad_norm": 2.602340314549884, + "learning_rate": 8.194917290869907e-10, + "logits/chosen": -2.5437402725219727, + "logits/rejected": -2.448638439178467, + "logps/chosen": -337.36602783203125, + "logps/rejected": -394.43756103515625, + "loss": 0.0145, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3356971740722656, + "rewards/margins": 10.068342208862305, + "rewards/rejected": -12.404041290283203, + "step": 5600 + }, + { + "epoch": 2.93563579277865, + "grad_norm": 1.1068213174152017, + "learning_rate": 7.009481209561685e-10, + "logits/chosen": -2.5758707523345947, + "logits/rejected": -2.5155177116394043, + "logps/chosen": -259.2216491699219, + "logps/rejected": -384.0183410644531, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.294078826904297, + "rewards/margins": 9.293085098266602, + "rewards/rejected": -12.587165832519531, + "step": 5610 + }, + { + "epoch": 2.9408686551543695, + "grad_norm": 1.5286464770057508, + "learning_rate": 5.916491096275845e-10, + "logits/chosen": -2.636427402496338, + "logits/rejected": -2.594538450241089, + "logps/chosen": -328.97796630859375, + "logps/rejected": -415.40582275390625, + "loss": 0.0157, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2140936851501465, + "rewards/margins": 9.737319946289062, + "rewards/rejected": -12.951414108276367, + "step": 5620 + }, + { + "epoch": 2.946101517530089, + "grad_norm": 0.9575300257762934, + "learning_rate": 4.915987481662887e-10, + "logits/chosen": -2.4386096000671387, + "logits/rejected": -2.3729395866394043, + "logps/chosen": -261.83331298828125, + "logps/rejected": -344.73443603515625, + "loss": 0.0088, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.9984965324401855, + "rewards/margins": 8.80710506439209, + "rewards/rejected": -11.805601119995117, + "step": 5630 + }, + { + "epoch": 2.9513343799058083, + "grad_norm": 0.7442669698182585, + "learning_rate": 4.0080074667570017e-10, + "logits/chosen": -2.5603861808776855, + "logits/rejected": -2.483668327331543, + "logps/chosen": -278.2349853515625, + "logps/rejected": -418.85760498046875, + "loss": 0.0118, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.313835859298706, + "rewards/margins": 8.679365158081055, + "rewards/rejected": -11.993200302124023, + "step": 5640 + }, + { + "epoch": 2.956567242281528, + "grad_norm": 1.114513621058661, + "learning_rate": 3.1925847215980017e-10, + "logits/chosen": -2.604092836380005, + "logits/rejected": -2.4918534755706787, + "logps/chosen": -304.39678955078125, + "logps/rejected": -384.1284484863281, + "loss": 0.0147, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.602552652359009, + "rewards/margins": 8.72242546081543, + "rewards/rejected": -11.324975967407227, + "step": 5650 + }, + { + "epoch": 2.9618001046572475, + "grad_norm": 10.222264124984395, + "learning_rate": 2.469749483985095e-10, + "logits/chosen": -2.5285439491271973, + "logits/rejected": -2.419832944869995, + "logps/chosen": -299.08013916015625, + "logps/rejected": -385.5743408203125, + "loss": 0.0166, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.964160442352295, + "rewards/margins": 9.242414474487305, + "rewards/rejected": -12.206574440002441, + "step": 5660 + }, + { + "epoch": 2.967032967032967, + "grad_norm": 8.039843420640908, + "learning_rate": 1.8395285583530652e-10, + "logits/chosen": -2.542475700378418, + "logits/rejected": -2.422661066055298, + "logps/chosen": -323.48297119140625, + "logps/rejected": -365.52923583984375, + "loss": 0.0133, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.63987398147583, + "rewards/margins": 9.397886276245117, + "rewards/rejected": -12.037759780883789, + "step": 5670 + }, + { + "epoch": 2.9722658294086868, + "grad_norm": 1.4255702175275826, + "learning_rate": 1.3019453147805614e-10, + "logits/chosen": -2.5691847801208496, + "logits/rejected": -2.424009084701538, + "logps/chosen": -312.40118408203125, + "logps/rejected": -402.6890563964844, + "loss": 0.0069, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.699326992034912, + "rewards/margins": 9.890931129455566, + "rewards/rejected": -13.59025764465332, + "step": 5680 + }, + { + "epoch": 2.977498691784406, + "grad_norm": 2.8176122633799983, + "learning_rate": 8.570196881216297e-11, + "logits/chosen": -2.3424875736236572, + "logits/rejected": -2.354027271270752, + "logps/chosen": -256.85028076171875, + "logps/rejected": -372.85394287109375, + "loss": 0.0089, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.546224355697632, + "rewards/margins": 9.345178604125977, + "rewards/rejected": -11.891403198242188, + "step": 5690 + }, + { + "epoch": 2.9827315541601256, + "grad_norm": 2.0227865741921054, + "learning_rate": 5.0476817726852194e-11, + "logits/chosen": -2.490391254425049, + "logits/rejected": -2.5428659915924072, + "logps/chosen": -343.39263916015625, + "logps/rejected": -448.39495849609375, + "loss": 0.0272, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.258328914642334, + "rewards/margins": 9.148773193359375, + "rewards/rejected": -12.407099723815918, + "step": 5700 + }, + { + "epoch": 2.987964416535845, + "grad_norm": 0.8541881834134185, + "learning_rate": 2.4520384453746712e-11, + "logits/chosen": -2.4259555339813232, + "logits/rejected": -2.4047558307647705, + "logps/chosen": -341.0238037109375, + "logps/rejected": -441.8011779785156, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.361252546310425, + "rewards/margins": 9.091890335083008, + "rewards/rejected": -12.453143119812012, + "step": 5710 + }, + { + "epoch": 2.9931972789115644, + "grad_norm": 1.8692261524092926, + "learning_rate": 7.833631518627815e-12, + "logits/chosen": -2.3938581943511963, + "logits/rejected": -2.3685402870178223, + "logps/chosen": -296.4942626953125, + "logps/rejected": -387.60223388671875, + "loss": 0.0126, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.046889305114746, + "rewards/margins": 9.50677490234375, + "rewards/rejected": -12.553665161132812, + "step": 5720 + }, + { + "epoch": 2.998430141287284, + "grad_norm": 5.334919186361271, + "learning_rate": 4.1717770565830033e-13, + "logits/chosen": -2.628361940383911, + "logits/rejected": -2.5328991413116455, + "logps/chosen": -305.0509033203125, + "logps/rejected": -323.2393493652344, + "loss": 0.0156, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.800204277038574, + "rewards/margins": 7.784017086029053, + "rewards/rejected": -10.584222793579102, + "step": 5730 + }, + { + "epoch": 3.0, + "step": 5733, + "total_flos": 0.0, + "train_loss": 0.22836191894055405, + "train_runtime": 34822.6853, + "train_samples_per_second": 5.267, + "train_steps_per_second": 0.165 + } + ], + "logging_steps": 10, + "max_steps": 5733, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}