{ "best_metric": 0.6631070971488953, "best_model_checkpoint": "./output/checkpoints/2024-05-27_09-03-33/checkpoint-1100", "epoch": 1.0, "eval_steps": 100, "global_step": 1271, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003933910306845004, "grad_norm": 27.324785232543945, "learning_rate": 6.25e-07, "logits/chosen": -0.23312029242515564, "logits/rejected": -0.7136957049369812, "logps/chosen": -206.98876953125, "logps/rejected": -177.72207641601562, "loss": 0.6946, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": -0.0011991311330348253, "rewards/margins": -0.0031457520090043545, "rewards/rejected": 0.001946620992384851, "step": 5 }, { "epoch": 0.007867820613690008, "grad_norm": 26.920639038085938, "learning_rate": 1.40625e-06, "logits/chosen": -0.3985660672187805, "logits/rejected": -0.7379584908485413, "logps/chosen": -201.005859375, "logps/rejected": -177.08181762695312, "loss": 0.688, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.010929527692496777, "rewards/margins": 0.013672275468707085, "rewards/rejected": -0.002742747776210308, "step": 10 }, { "epoch": 0.011801730920535013, "grad_norm": 34.40425109863281, "learning_rate": 2.1875000000000002e-06, "logits/chosen": -0.35717901587486267, "logits/rejected": -0.660548746585846, "logps/chosen": -217.42825317382812, "logps/rejected": -194.10195922851562, "loss": 0.6924, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.034006841480731964, "rewards/margins": 0.0028066448867321014, "rewards/rejected": 0.03120020031929016, "step": 15 }, { "epoch": 0.015735641227380016, "grad_norm": 27.097261428833008, "learning_rate": 2.96875e-06, "logits/chosen": -0.3896491825580597, "logits/rejected": -0.7307055592536926, "logps/chosen": -209.29373168945312, "logps/rejected": -179.78488159179688, "loss": 0.6839, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.08366340398788452, "rewards/margins": 0.025963936001062393, "rewards/rejected": 0.05769947171211243, "step": 20 }, { "epoch": 0.01966955153422502, "grad_norm": 29.19064712524414, "learning_rate": 3.7500000000000005e-06, "logits/chosen": -0.24666282534599304, "logits/rejected": -0.7009283900260925, "logps/chosen": -196.3118438720703, "logps/rejected": -178.7552032470703, "loss": 0.683, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.13745614886283875, "rewards/margins": 0.03245489299297333, "rewards/rejected": 0.1050012856721878, "step": 25 }, { "epoch": 0.023603461841070025, "grad_norm": 31.083709716796875, "learning_rate": 4.53125e-06, "logits/chosen": -0.3193593919277191, "logits/rejected": -0.6126649379730225, "logps/chosen": -208.44863891601562, "logps/rejected": -184.2353057861328, "loss": 0.6852, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.22270426154136658, "rewards/margins": 0.03411892056465149, "rewards/rejected": 0.18858537077903748, "step": 30 }, { "epoch": 0.02753737214791503, "grad_norm": 25.83799171447754, "learning_rate": 5.3125e-06, "logits/chosen": -0.46783486008644104, "logits/rejected": -0.7504000067710876, "logps/chosen": -221.98843383789062, "logps/rejected": -199.54000854492188, "loss": 0.6705, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.3932109773159027, "rewards/margins": 0.08941729366779327, "rewards/rejected": 0.30379369854927063, "step": 35 }, { "epoch": 0.03147128245476003, "grad_norm": 24.734338760375977, "learning_rate": 6.093750000000001e-06, "logits/chosen": -0.3396364748477936, "logits/rejected": -0.7113901376724243, "logps/chosen": -196.3134765625, "logps/rejected": -179.5933380126953, "loss": 0.6879, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.4551575779914856, "rewards/margins": 0.05487058684229851, "rewards/rejected": 0.4002869725227356, "step": 40 }, { "epoch": 0.03540519276160504, "grad_norm": 28.34064292907715, "learning_rate": 6.718750000000001e-06, "logits/chosen": -0.667598307132721, "logits/rejected": -1.014026403427124, "logps/chosen": -196.5115966796875, "logps/rejected": -165.67092895507812, "loss": 0.6852, "rewards/accuracies": 0.5, "rewards/chosen": 0.5074445605278015, "rewards/margins": 0.09343204647302628, "rewards/rejected": 0.41401252150535583, "step": 45 }, { "epoch": 0.03933910306845004, "grad_norm": 30.12347984313965, "learning_rate": 7.500000000000001e-06, "logits/chosen": -0.2210284173488617, "logits/rejected": -0.32401731610298157, "logps/chosen": -210.63818359375, "logps/rejected": -205.76895141601562, "loss": 0.6641, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.5923845171928406, "rewards/margins": 0.12340062856674194, "rewards/rejected": 0.4689839482307434, "step": 50 }, { "epoch": 0.043273013375295044, "grad_norm": 47.19338607788086, "learning_rate": 8.281250000000001e-06, "logits/chosen": -0.5629546642303467, "logits/rejected": -0.7718995213508606, "logps/chosen": -194.5259552001953, "logps/rejected": -179.5989532470703, "loss": 0.6902, "rewards/accuracies": 0.5625, "rewards/chosen": 0.6510985493659973, "rewards/margins": 0.10642552375793457, "rewards/rejected": 0.544672966003418, "step": 55 }, { "epoch": 0.04720692368214005, "grad_norm": 23.202775955200195, "learning_rate": 9.0625e-06, "logits/chosen": -0.3029821217060089, "logits/rejected": -0.7788914442062378, "logps/chosen": -214.9969940185547, "logps/rejected": -167.64263916015625, "loss": 0.6472, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.6703575849533081, "rewards/margins": 0.22439488768577576, "rewards/rejected": 0.44596266746520996, "step": 60 }, { "epoch": 0.05114083398898505, "grad_norm": 35.26408386230469, "learning_rate": 9.84375e-06, "logits/chosen": -0.4141275882720947, "logits/rejected": -0.7083785533905029, "logps/chosen": -212.9031524658203, "logps/rejected": -198.8483428955078, "loss": 0.662, "rewards/accuracies": 0.5625, "rewards/chosen": 0.7610660791397095, "rewards/margins": 0.2469903975725174, "rewards/rejected": 0.514075756072998, "step": 65 }, { "epoch": 0.05507474429583006, "grad_norm": 19.10537338256836, "learning_rate": 1.0625e-05, "logits/chosen": -0.4033733308315277, "logits/rejected": -0.7651963829994202, "logps/chosen": -212.84487915039062, "logps/rejected": -174.28073120117188, "loss": 0.6534, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.7663796544075012, "rewards/margins": 0.24841317534446716, "rewards/rejected": 0.5179664492607117, "step": 70 }, { "epoch": 0.059008654602675056, "grad_norm": 26.261890411376953, "learning_rate": 1.1406250000000001e-05, "logits/chosen": -0.10389180481433868, "logits/rejected": -0.5258628129959106, "logps/chosen": -206.84921264648438, "logps/rejected": -186.50869750976562, "loss": 0.6808, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.8156352043151855, "rewards/margins": 0.17864595353603363, "rewards/rejected": 0.6369892358779907, "step": 75 }, { "epoch": 0.06294256490952006, "grad_norm": 32.33486557006836, "learning_rate": 1.2187500000000001e-05, "logits/chosen": -0.22502727806568146, "logits/rejected": -0.49946776032447815, "logps/chosen": -209.71426391601562, "logps/rejected": -198.34292602539062, "loss": 0.6852, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.6370053291320801, "rewards/margins": 0.15727970004081726, "rewards/rejected": 0.4797256886959076, "step": 80 }, { "epoch": 0.06687647521636507, "grad_norm": 38.13333511352539, "learning_rate": 1.2968750000000002e-05, "logits/chosen": -0.25742509961128235, "logits/rejected": -0.7358572483062744, "logps/chosen": -206.3865966796875, "logps/rejected": -178.12637329101562, "loss": 0.6652, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.3785225450992584, "rewards/margins": 0.16723336279392242, "rewards/rejected": 0.2112891674041748, "step": 85 }, { "epoch": 0.07081038552321008, "grad_norm": 23.647096633911133, "learning_rate": 1.375e-05, "logits/chosen": -0.3365253806114197, "logits/rejected": -0.5771717429161072, "logps/chosen": -208.416748046875, "logps/rejected": -184.40476989746094, "loss": 0.7024, "rewards/accuracies": 0.625, "rewards/chosen": 0.2850777506828308, "rewards/margins": 0.14986075460910797, "rewards/rejected": 0.13521698117256165, "step": 90 }, { "epoch": 0.07474429583005507, "grad_norm": 22.20098114013672, "learning_rate": 1.453125e-05, "logits/chosen": -0.21254411339759827, "logits/rejected": -0.6303216218948364, "logps/chosen": -201.83139038085938, "logps/rejected": -183.7214813232422, "loss": 0.6843, "rewards/accuracies": 0.625, "rewards/chosen": 0.3604539632797241, "rewards/margins": 0.1408630609512329, "rewards/rejected": 0.2195909023284912, "step": 95 }, { "epoch": 0.07867820613690008, "grad_norm": 29.343482971191406, "learning_rate": 1.5312500000000003e-05, "logits/chosen": -0.41852107644081116, "logits/rejected": -0.7636915445327759, "logps/chosen": -208.08035278320312, "logps/rejected": -178.69972229003906, "loss": 0.6731, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.4267478585243225, "rewards/margins": 0.16477522253990173, "rewards/rejected": 0.2619726061820984, "step": 100 }, { "epoch": 0.07867820613690008, "eval_logits/chosen": 1.3246409893035889, "eval_logits/rejected": 1.0977884531021118, "eval_logps/chosen": -206.3737030029297, "eval_logps/rejected": -179.28366088867188, "eval_loss": 0.6665228009223938, "eval_rewards/accuracies": 0.635937511920929, "eval_rewards/chosen": 0.6386381387710571, "eval_rewards/margins": 0.19896559417247772, "eval_rewards/rejected": 0.4396725594997406, "eval_runtime": 307.3381, "eval_samples_per_second": 2.082, "eval_steps_per_second": 0.13, "step": 100 }, { "epoch": 0.08261211644374508, "grad_norm": 24.263774871826172, "learning_rate": 1.609375e-05, "logits/chosen": -0.16335585713386536, "logits/rejected": -0.4457281231880188, "logps/chosen": -201.37017822265625, "logps/rejected": -176.67379760742188, "loss": 0.6641, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.6822856068611145, "rewards/margins": 0.18253257870674133, "rewards/rejected": 0.4997529983520508, "step": 105 }, { "epoch": 0.08654602675059009, "grad_norm": 25.775903701782227, "learning_rate": 1.6875e-05, "logits/chosen": -0.436201810836792, "logits/rejected": -0.9347764849662781, "logps/chosen": -195.61062622070312, "logps/rejected": -169.15048217773438, "loss": 0.6596, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.7915258407592773, "rewards/margins": 0.2815794348716736, "rewards/rejected": 0.5099464654922485, "step": 110 }, { "epoch": 0.0904799370574351, "grad_norm": 30.208763122558594, "learning_rate": 1.7656250000000002e-05, "logits/chosen": -0.5659558176994324, "logits/rejected": -0.855063796043396, "logps/chosen": -198.71206665039062, "logps/rejected": -174.78524780273438, "loss": 0.7202, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.6501097679138184, "rewards/margins": 0.17246408760547638, "rewards/rejected": 0.4776456952095032, "step": 115 }, { "epoch": 0.0944138473642801, "grad_norm": 23.550596237182617, "learning_rate": 1.84375e-05, "logits/chosen": -0.5133547186851501, "logits/rejected": -0.734718382358551, "logps/chosen": -193.6223602294922, "logps/rejected": -179.42771911621094, "loss": 0.7313, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.5589785575866699, "rewards/margins": 0.10251788794994354, "rewards/rejected": 0.4564606547355652, "step": 120 }, { "epoch": 0.0983477576711251, "grad_norm": 29.921533584594727, "learning_rate": 1.9062500000000003e-05, "logits/chosen": -0.3889247179031372, "logits/rejected": -0.6225888133049011, "logps/chosen": -187.0243377685547, "logps/rejected": -176.29808044433594, "loss": 0.6273, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.40915530920028687, "rewards/margins": 0.3502606451511383, "rewards/rejected": 0.058894671499729156, "step": 125 }, { "epoch": 0.1022816679779701, "grad_norm": 29.90145492553711, "learning_rate": 1.984375e-05, "logits/chosen": -0.34609144926071167, "logits/rejected": -0.7598401308059692, "logps/chosen": -201.13104248046875, "logps/rejected": -173.50753784179688, "loss": 0.6626, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.1300664246082306, "rewards/margins": 0.21786466240882874, "rewards/rejected": -0.08779821544885635, "step": 130 }, { "epoch": 0.10621557828481511, "grad_norm": 23.906503677368164, "learning_rate": 1.9999395643917957e-05, "logits/chosen": -0.41295546293258667, "logits/rejected": -0.8447906374931335, "logps/chosen": -201.5752716064453, "logps/rejected": -165.7244415283203, "loss": 0.6405, "rewards/accuracies": 0.625, "rewards/chosen": 0.21347875893115997, "rewards/margins": 0.3085169196128845, "rewards/rejected": -0.09503819793462753, "step": 135 }, { "epoch": 0.11014948859166011, "grad_norm": 25.38832664489746, "learning_rate": 1.999694057253083e-05, "logits/chosen": -0.2702675759792328, "logits/rejected": -0.6757915019989014, "logps/chosen": -198.8104705810547, "logps/rejected": -175.73355102539062, "loss": 0.6331, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.7145684361457825, "rewards/margins": 0.3979041576385498, "rewards/rejected": 0.31666427850723267, "step": 140 }, { "epoch": 0.11408339889850512, "grad_norm": 25.388601303100586, "learning_rate": 1.9992597476892096e-05, "logits/chosen": -0.20559760928153992, "logits/rejected": -0.6221147775650024, "logps/chosen": -203.33877563476562, "logps/rejected": -177.6593780517578, "loss": 0.6278, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.8970493078231812, "rewards/margins": 0.41804951429367065, "rewards/rejected": 0.4789998531341553, "step": 145 }, { "epoch": 0.11801730920535011, "grad_norm": 26.157350540161133, "learning_rate": 1.9986367177239688e-05, "logits/chosen": -0.34933823347091675, "logits/rejected": -0.5474187135696411, "logps/chosen": -192.22409057617188, "logps/rejected": -179.11972045898438, "loss": 0.7403, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.7421566843986511, "rewards/margins": 0.24275951087474823, "rewards/rejected": 0.4993972182273865, "step": 150 }, { "epoch": 0.12195121951219512, "grad_norm": 27.657987594604492, "learning_rate": 1.9978250850229278e-05, "logits/chosen": -0.5602678060531616, "logits/rejected": -0.7431076765060425, "logps/chosen": -197.28172302246094, "logps/rejected": -180.0853271484375, "loss": 0.718, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.6257942914962769, "rewards/margins": 0.286087304353714, "rewards/rejected": 0.3397069573402405, "step": 155 }, { "epoch": 0.12588512981904013, "grad_norm": 27.8662166595459, "learning_rate": 1.996825002871205e-05, "logits/chosen": -0.3598572611808777, "logits/rejected": -0.8388012647628784, "logps/chosen": -192.58541870117188, "logps/rejected": -165.87228393554688, "loss": 0.6815, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.49491995573043823, "rewards/margins": 0.3221299648284912, "rewards/rejected": 0.1727900207042694, "step": 160 }, { "epoch": 0.12981904012588513, "grad_norm": 21.444156646728516, "learning_rate": 1.9956366601445212e-05, "logits/chosen": -0.18239173293113708, "logits/rejected": -0.6315879225730896, "logps/chosen": -214.19509887695312, "logps/rejected": -185.4246368408203, "loss": 0.6328, "rewards/accuracies": 0.625, "rewards/chosen": 0.5859188437461853, "rewards/margins": 0.4131649136543274, "rewards/rejected": 0.1727539300918579, "step": 165 }, { "epoch": 0.13375295043273014, "grad_norm": 22.295812606811523, "learning_rate": 1.994260281273529e-05, "logits/chosen": -0.27679482102394104, "logits/rejected": -0.7712021470069885, "logps/chosen": -206.1096954345703, "logps/rejected": -173.62576293945312, "loss": 0.6613, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.6996821165084839, "rewards/margins": 0.3059811294078827, "rewards/rejected": 0.3937010169029236, "step": 170 }, { "epoch": 0.13768686073957515, "grad_norm": 33.50761413574219, "learning_rate": 1.9926961262014237e-05, "logits/chosen": -0.3116024136543274, "logits/rejected": -0.625832736492157, "logps/chosen": -219.8788604736328, "logps/rejected": -187.32510375976562, "loss": 0.746, "rewards/accuracies": 0.625, "rewards/chosen": 1.1785697937011719, "rewards/margins": 0.21889865398406982, "rewards/rejected": 0.9596711993217468, "step": 175 }, { "epoch": 0.14162077104642015, "grad_norm": 15.657761573791504, "learning_rate": 1.9909444903348546e-05, "logits/chosen": -0.005524394102394581, "logits/rejected": -0.3487216532230377, "logps/chosen": -228.5839385986328, "logps/rejected": -201.77001953125, "loss": 0.7435, "rewards/accuracies": 0.5625, "rewards/chosen": 1.1422548294067383, "rewards/margins": 0.15804262459278107, "rewards/rejected": 0.9842122793197632, "step": 180 }, { "epoch": 0.14555468135326516, "grad_norm": 26.140518188476562, "learning_rate": 1.9890057044881308e-05, "logits/chosen": -0.12314258515834808, "logits/rejected": -0.5814956426620483, "logps/chosen": -201.1555633544922, "logps/rejected": -167.4046173095703, "loss": 0.6795, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.0470006465911865, "rewards/margins": 0.35150283575057983, "rewards/rejected": 0.6954978108406067, "step": 185 }, { "epoch": 0.14948859166011014, "grad_norm": 19.782007217407227, "learning_rate": 1.9868801348207467e-05, "logits/chosen": -0.11235501617193222, "logits/rejected": -0.5538455247879028, "logps/chosen": -204.25839233398438, "logps/rejected": -181.46743774414062, "loss": 0.685, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 1.1285860538482666, "rewards/margins": 0.33753544092178345, "rewards/rejected": 0.7910505533218384, "step": 190 }, { "epoch": 0.15342250196695514, "grad_norm": 19.97163200378418, "learning_rate": 1.9845681827682263e-05, "logits/chosen": -0.16671855747699738, "logits/rejected": -0.540806233882904, "logps/chosen": -194.2422332763672, "logps/rejected": -163.8104705810547, "loss": 0.6713, "rewards/accuracies": 0.625, "rewards/chosen": 0.6743755340576172, "rewards/margins": 0.26031339168548584, "rewards/rejected": 0.4140622019767761, "step": 195 }, { "epoch": 0.15735641227380015, "grad_norm": 18.71397590637207, "learning_rate": 1.982070284966309e-05, "logits/chosen": -0.1493137627840042, "logits/rejected": -0.43618321418762207, "logps/chosen": -202.78318786621094, "logps/rejected": -177.56668090820312, "loss": 0.6528, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.5738676190376282, "rewards/margins": 0.31430238485336304, "rewards/rejected": 0.25956520438194275, "step": 200 }, { "epoch": 0.15735641227380015, "eval_logits/chosen": 1.3314845561981201, "eval_logits/rejected": 1.1080169677734375, "eval_logps/chosen": -206.4569549560547, "eval_logps/rejected": -179.43057250976562, "eval_loss": 0.6942009329795837, "eval_rewards/accuracies": 0.604687511920929, "eval_rewards/chosen": 0.6053363680839539, "eval_rewards/margins": 0.2244330197572708, "eval_rewards/rejected": 0.38090336322784424, "eval_runtime": 309.8464, "eval_samples_per_second": 2.066, "eval_steps_per_second": 0.129, "step": 200 }, { "epoch": 0.16129032258064516, "grad_norm": 15.415759086608887, "learning_rate": 1.9793869131684884e-05, "logits/chosen": -0.08272367715835571, "logits/rejected": -0.4305300712585449, "logps/chosen": -196.86305236816406, "logps/rejected": -178.54037475585938, "loss": 0.7078, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.5516935586929321, "rewards/margins": 0.21864008903503418, "rewards/rejected": 0.33305343985557556, "step": 205 }, { "epoch": 0.16522423288749016, "grad_norm": 28.38641929626465, "learning_rate": 1.9765185741569126e-05, "logits/chosen": -0.14836929738521576, "logits/rejected": -0.4139153063297272, "logps/chosen": -215.8746795654297, "logps/rejected": -190.37954711914062, "loss": 0.7474, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.6253814697265625, "rewards/margins": 0.11703801155090332, "rewards/rejected": 0.5083434581756592, "step": 210 }, { "epoch": 0.16915814319433517, "grad_norm": 23.663591384887695, "learning_rate": 1.9734658096466774e-05, "logits/chosen": 0.011041751131415367, "logits/rejected": -0.4074042737483978, "logps/chosen": -209.1394500732422, "logps/rejected": -178.0277099609375, "loss": 0.6711, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.3468799591064453, "rewards/margins": 0.2509341835975647, "rewards/rejected": 0.09594579041004181, "step": 215 }, { "epoch": 0.17309205350118018, "grad_norm": 32.677852630615234, "learning_rate": 1.970229196183516e-05, "logits/chosen": -0.020372604951262474, "logits/rejected": -0.37563034892082214, "logps/chosen": -209.47402954101562, "logps/rejected": -177.0091094970703, "loss": 0.6983, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.594801127910614, "rewards/margins": 0.25161081552505493, "rewards/rejected": 0.34319034218788147, "step": 220 }, { "epoch": 0.17702596380802518, "grad_norm": 22.306182861328125, "learning_rate": 1.9668093450349125e-05, "logits/chosen": -0.1756196916103363, "logits/rejected": -0.5201798677444458, "logps/chosen": -217.6730499267578, "logps/rejected": -185.24819946289062, "loss": 0.6923, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.9194382429122925, "rewards/margins": 0.3321036696434021, "rewards/rejected": 0.5873345136642456, "step": 225 }, { "epoch": 0.1809598741148702, "grad_norm": 31.994035720825195, "learning_rate": 1.9632069020746574e-05, "logits/chosen": -0.3013627529144287, "logits/rejected": -0.7145218849182129, "logps/chosen": -206.0642547607422, "logps/rejected": -178.27896118164062, "loss": 0.6459, "rewards/accuracies": 0.6875, "rewards/chosen": 0.8781298398971558, "rewards/margins": 0.5241779088973999, "rewards/rejected": 0.353952020406723, "step": 230 }, { "epoch": 0.1848937844217152, "grad_norm": 29.714988708496094, "learning_rate": 1.959422547660869e-05, "logits/chosen": -0.2492908537387848, "logits/rejected": -0.779377818107605, "logps/chosen": -198.94345092773438, "logps/rejected": -169.714599609375, "loss": 0.6366, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.5711004734039307, "rewards/margins": 0.413928359746933, "rewards/rejected": 0.15717211365699768, "step": 235 }, { "epoch": 0.1888276947285602, "grad_norm": 24.506587982177734, "learning_rate": 1.955456996507499e-05, "logits/chosen": -0.019927600398659706, "logits/rejected": -0.43524104356765747, "logps/chosen": -197.2928009033203, "logps/rejected": -168.06382751464844, "loss": 0.6361, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.787044107913971, "rewards/margins": 0.3754611909389496, "rewards/rejected": 0.4115828573703766, "step": 240 }, { "epoch": 0.19276160503540518, "grad_norm": 24.652503967285156, "learning_rate": 1.9513109975493553e-05, "logits/chosen": -0.30659085512161255, "logits/rejected": -0.6158447265625, "logps/chosen": -207.3615264892578, "logps/rejected": -198.04635620117188, "loss": 0.6338, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.8901578783988953, "rewards/margins": 0.45508089661598206, "rewards/rejected": 0.4350770115852356, "step": 245 }, { "epoch": 0.1966955153422502, "grad_norm": 22.106698989868164, "learning_rate": 1.9469853338006515e-05, "logits/chosen": -0.07243610918521881, "logits/rejected": -0.2781897187232971, "logps/chosen": -203.30215454101562, "logps/rejected": -188.57080078125, "loss": 0.7046, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.7840886116027832, "rewards/margins": 0.2757692337036133, "rewards/rejected": 0.5083193778991699, "step": 250 }, { "epoch": 0.2006294256490952, "grad_norm": 17.76561164855957, "learning_rate": 1.9424808222071337e-05, "logits/chosen": -0.1372375786304474, "logits/rejected": -0.4728778898715973, "logps/chosen": -218.58462524414062, "logps/rejected": -192.29983520507812, "loss": 0.622, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.8517538905143738, "rewards/margins": 0.47115468978881836, "rewards/rejected": 0.3805992603302002, "step": 255 }, { "epoch": 0.2045633359559402, "grad_norm": 21.741724014282227, "learning_rate": 1.9377983134917868e-05, "logits/chosen": -0.42930954694747925, "logits/rejected": -0.6508566737174988, "logps/chosen": -196.40382385253906, "logps/rejected": -180.81784057617188, "loss": 0.6814, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.42375677824020386, "rewards/margins": 0.3472265601158142, "rewards/rejected": 0.07653021067380905, "step": 260 }, { "epoch": 0.2084972462627852, "grad_norm": 25.856201171875, "learning_rate": 1.9329386919941694e-05, "logits/chosen": -0.5100887417793274, "logits/rejected": -0.896782398223877, "logps/chosen": -200.4944610595703, "logps/rejected": -168.5055694580078, "loss": 0.631, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.5903893709182739, "rewards/margins": 0.45923447608947754, "rewards/rejected": 0.13115492463111877, "step": 265 }, { "epoch": 0.21243115656963021, "grad_norm": 21.10732078552246, "learning_rate": 1.927902875503397e-05, "logits/chosen": -0.2257436066865921, "logits/rejected": -0.6618258953094482, "logps/chosen": -216.7244415283203, "logps/rejected": -172.3234405517578, "loss": 0.6436, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.9558561444282532, "rewards/margins": 0.5136295557022095, "rewards/rejected": 0.44222649931907654, "step": 270 }, { "epoch": 0.21636506687647522, "grad_norm": 21.297080993652344, "learning_rate": 1.9226918150848067e-05, "logits/chosen": -0.325428307056427, "logits/rejected": -0.6309774518013, "logps/chosen": -190.9318389892578, "logps/rejected": -179.4983673095703, "loss": 0.6904, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.1083195209503174, "rewards/margins": 0.31002935767173767, "rewards/rejected": 0.7982901334762573, "step": 275 }, { "epoch": 0.22029897718332023, "grad_norm": 21.540422439575195, "learning_rate": 1.9173064949003408e-05, "logits/chosen": -0.05009857565164566, "logits/rejected": -0.3596547245979309, "logps/chosen": -200.29823303222656, "logps/rejected": -180.3629150390625, "loss": 0.6645, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.4049633741378784, "rewards/margins": 0.5104038119316101, "rewards/rejected": 0.8945595026016235, "step": 280 }, { "epoch": 0.22423288749016523, "grad_norm": NaN, "learning_rate": 1.9128734540932494e-05, "logits/chosen": -0.3485383987426758, "logits/rejected": -0.5194178223609924, "logps/chosen": -197.75784301757812, "logps/rejected": -181.0018768310547, "loss": 0.7351, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.2131370306015015, "rewards/margins": 0.22653412818908691, "rewards/rejected": 0.9866029620170593, "step": 285 }, { "epoch": 0.22816679779701024, "grad_norm": 24.915868759155273, "learning_rate": 1.9071770513468988e-05, "logits/chosen": -0.17852464318275452, "logits/rejected": -0.35372194647789, "logps/chosen": -193.89865112304688, "logps/rejected": -187.19973754882812, "loss": 0.7047, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.9371153116226196, "rewards/margins": 0.20712292194366455, "rewards/rejected": 0.7299925088882446, "step": 290 }, { "epoch": 0.23210070810385522, "grad_norm": 19.513757705688477, "learning_rate": 1.901309318956141e-05, "logits/chosen": -0.4217872619628906, "logits/rejected": -0.7518173456192017, "logps/chosen": -194.53421020507812, "logps/rejected": -168.0951385498047, "loss": 0.7308, "rewards/accuracies": 0.5625, "rewards/chosen": 0.6975895166397095, "rewards/margins": 0.23306536674499512, "rewards/rejected": 0.46452417969703674, "step": 295 }, { "epoch": 0.23603461841070023, "grad_norm": 18.220582962036133, "learning_rate": 1.8952713651021227e-05, "logits/chosen": -0.14223751425743103, "logits/rejected": -0.4979272484779358, "logps/chosen": -199.91549682617188, "logps/rejected": -177.2222900390625, "loss": 0.6827, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.819624125957489, "rewards/margins": 0.39503517746925354, "rewards/rejected": 0.42458897829055786, "step": 300 }, { "epoch": 0.23603461841070023, "eval_logits/chosen": 1.2563122510910034, "eval_logits/rejected": 1.0339769124984741, "eval_logps/chosen": -206.1991424560547, "eval_logps/rejected": -179.33786010742188, "eval_loss": 0.7167426347732544, "eval_rewards/accuracies": 0.6171875, "eval_rewards/chosen": 0.708461582660675, "eval_rewards/margins": 0.2904762327671051, "eval_rewards/rejected": 0.41798537969589233, "eval_runtime": 284.7459, "eval_samples_per_second": 2.248, "eval_steps_per_second": 0.14, "step": 300 }, { "epoch": 0.23996852871754523, "grad_norm": 23.576587677001953, "learning_rate": 1.8890643301140487e-05, "logits/chosen": -0.5384713411331177, "logits/rejected": -0.8448705673217773, "logps/chosen": -197.2958526611328, "logps/rejected": -165.64370727539062, "loss": 0.6409, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.6531845331192017, "rewards/margins": 0.39299410581588745, "rewards/rejected": 0.2601904273033142, "step": 305 }, { "epoch": 0.24390243902439024, "grad_norm": 18.40612030029297, "learning_rate": 1.8826893862538233e-05, "logits/chosen": -0.3022890090942383, "logits/rejected": -0.5158249735832214, "logps/chosen": -207.9346160888672, "logps/rejected": -193.0900115966797, "loss": 0.7895, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.7925726771354675, "rewards/margins": 0.08937112987041473, "rewards/rejected": 0.7032015919685364, "step": 310 }, { "epoch": 0.24783634933123525, "grad_norm": 18.7589168548584, "learning_rate": 1.8761477374946548e-05, "logits/chosen": -0.12031130492687225, "logits/rejected": -0.4747944474220276, "logps/chosen": -211.0299530029297, "logps/rejected": -186.3873291015625, "loss": 0.6952, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.9918599128723145, "rewards/margins": 0.28354746103286743, "rewards/rejected": 0.7083123922348022, "step": 315 }, { "epoch": 0.25177025963808025, "grad_norm": 20.57366180419922, "learning_rate": 1.869440619293672e-05, "logits/chosen": 0.015002071857452393, "logits/rejected": -0.4523535668849945, "logps/chosen": -215.18704223632812, "logps/rejected": -179.958984375, "loss": 0.6336, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.9286333918571472, "rewards/margins": 0.4492555558681488, "rewards/rejected": 0.4793778359889984, "step": 320 }, { "epoch": 0.25570416994492523, "grad_norm": 24.69734001159668, "learning_rate": 1.8625692983585976e-05, "logits/chosen": -0.3278903663158417, "logits/rejected": -0.9296085238456726, "logps/chosen": -212.3651580810547, "logps/rejected": -168.00753784179688, "loss": 0.6633, "rewards/accuracies": 0.6875, "rewards/chosen": 0.7741891741752625, "rewards/margins": 0.3930490016937256, "rewards/rejected": 0.38114017248153687, "step": 325 }, { "epoch": 0.25963808025177026, "grad_norm": 27.854631423950195, "learning_rate": 1.855535072408516e-05, "logits/chosen": -0.4728453755378723, "logits/rejected": -0.6778625249862671, "logps/chosen": -211.9385528564453, "logps/rejected": -193.85667419433594, "loss": 0.6953, "rewards/accuracies": 0.625, "rewards/chosen": 0.9169828295707703, "rewards/margins": 0.32869625091552734, "rewards/rejected": 0.5882865786552429, "step": 330 }, { "epoch": 0.26357199055861524, "grad_norm": 18.423259735107422, "learning_rate": 1.8483392699287858e-05, "logits/chosen": -0.05396045371890068, "logits/rejected": -0.5624040365219116, "logps/chosen": -222.1643524169922, "logps/rejected": -177.35289001464844, "loss": 0.6206, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 1.1117911338806152, "rewards/margins": 0.5459399223327637, "rewards/rejected": 0.5658511519432068, "step": 335 }, { "epoch": 0.2675059008654603, "grad_norm": 23.744850158691406, "learning_rate": 1.840983249920143e-05, "logits/chosen": -0.3244122564792633, "logits/rejected": -0.5297374725341797, "logps/chosen": -196.14691162109375, "logps/rejected": -188.9138946533203, "loss": 0.7056, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.8795296549797058, "rewards/margins": 0.3909408748149872, "rewards/rejected": 0.488588809967041, "step": 340 }, { "epoch": 0.27143981117230526, "grad_norm": 18.513778686523438, "learning_rate": 1.8334684016420383e-05, "logits/chosen": -0.08137266337871552, "logits/rejected": -0.5458197593688965, "logps/chosen": -232.447509765625, "logps/rejected": -191.580078125, "loss": 0.6264, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.163153052330017, "rewards/margins": 0.4738085865974426, "rewards/rejected": 0.6893445253372192, "step": 345 }, { "epoch": 0.2753737214791503, "grad_norm": 15.827184677124023, "learning_rate": 1.8257961443502626e-05, "logits/chosen": -0.30110448598861694, "logits/rejected": -0.6258831024169922, "logps/chosen": -190.89808654785156, "logps/rejected": -173.31884765625, "loss": 0.6519, "rewards/accuracies": 0.625, "rewards/chosen": 0.946982204914093, "rewards/margins": 0.39443182945251465, "rewards/rejected": 0.5525503754615784, "step": 350 }, { "epoch": 0.27930763178599527, "grad_norm": 19.0930118560791, "learning_rate": 1.8179679270289048e-05, "logits/chosen": -0.2574307322502136, "logits/rejected": -0.7561649680137634, "logps/chosen": -201.4808349609375, "logps/rejected": -172.31173706054688, "loss": 0.6453, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 1.1022285223007202, "rewards/margins": 0.5637288689613342, "rewards/rejected": 0.5384997129440308, "step": 355 }, { "epoch": 0.2832415420928403, "grad_norm": 22.383216857910156, "learning_rate": 1.8099852281166974e-05, "logits/chosen": -0.2120940238237381, "logits/rejected": -0.7636501789093018, "logps/chosen": -209.04806518554688, "logps/rejected": -166.7012481689453, "loss": 0.6576, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.3941724300384521, "rewards/margins": 0.5536119341850281, "rewards/rejected": 0.8405605554580688, "step": 360 }, { "epoch": 0.2871754523996853, "grad_norm": 18.3509578704834, "learning_rate": 1.8018495552277987e-05, "logits/chosen": 0.07260416448116302, "logits/rejected": -0.2597780227661133, "logps/chosen": -208.8731689453125, "logps/rejected": -187.85023498535156, "loss": 0.6275, "rewards/accuracies": 0.75, "rewards/chosen": 1.4085180759429932, "rewards/margins": 0.5379746556282043, "rewards/rejected": 0.8705434799194336, "step": 365 }, { "epoch": 0.2911093627065303, "grad_norm": 21.863872528076172, "learning_rate": 1.7935624448670625e-05, "logits/chosen": -0.4248635172843933, "logits/rejected": -0.4336097836494446, "logps/chosen": -179.680908203125, "logps/rejected": -173.14013671875, "loss": 0.75, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.0196665525436401, "rewards/margins": 0.20690293610095978, "rewards/rejected": 0.8127636909484863, "step": 370 }, { "epoch": 0.2950432730133753, "grad_norm": 26.93684196472168, "learning_rate": 1.785125462139855e-05, "logits/chosen": -0.16947659850120544, "logits/rejected": -0.451927125453949, "logps/chosen": -198.48106384277344, "logps/rejected": -174.99111938476562, "loss": 0.7696, "rewards/accuracies": 0.625, "rewards/chosen": 1.1577861309051514, "rewards/margins": 0.21412566304206848, "rewards/rejected": 0.9436607360839844, "step": 375 }, { "epoch": 0.2989771833202203, "grad_norm": 15.670443534851074, "learning_rate": 1.7765402004564687e-05, "logits/chosen": -0.1878432035446167, "logits/rejected": -0.5365083813667297, "logps/chosen": -204.27255249023438, "logps/rejected": -175.6739959716797, "loss": 0.6793, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 1.1427654027938843, "rewards/margins": 0.44054698944091797, "rewards/rejected": 0.7022184133529663, "step": 380 }, { "epoch": 0.3029110936270653, "grad_norm": 20.738510131835938, "learning_rate": 1.76780828123119e-05, "logits/chosen": -0.22227105498313904, "logits/rejected": -0.4939172863960266, "logps/chosen": -204.56930541992188, "logps/rejected": -187.81863403320312, "loss": 0.6359, "rewards/accuracies": 0.6875, "rewards/chosen": 1.086004376411438, "rewards/margins": 0.5049671530723572, "rewards/rejected": 0.5810372233390808, "step": 385 }, { "epoch": 0.3068450039339103, "grad_norm": 15.985719680786133, "learning_rate": 1.7589313535760787e-05, "logits/chosen": -0.33505499362945557, "logits/rejected": -0.5057377219200134, "logps/chosen": -203.09201049804688, "logps/rejected": -186.1582489013672, "loss": 0.728, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.9475752115249634, "rewards/margins": 0.21062707901000977, "rewards/rejected": 0.7369481325149536, "step": 390 }, { "epoch": 0.3107789142407553, "grad_norm": 15.00536823272705, "learning_rate": 1.7499110939895162e-05, "logits/chosen": -0.2682803273200989, "logits/rejected": -0.6644273400306702, "logps/chosen": -197.18655395507812, "logps/rejected": -184.64974975585938, "loss": 0.7331, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.4725784361362457, "rewards/margins": 0.17313337326049805, "rewards/rejected": 0.29944509267807007, "step": 395 }, { "epoch": 0.3147128245476003, "grad_norm": 18.541942596435547, "learning_rate": 1.7407492060395835e-05, "logits/chosen": -0.3485754132270813, "logits/rejected": -0.6408174633979797, "logps/chosen": -196.4596710205078, "logps/rejected": -178.34701538085938, "loss": 0.6891, "rewards/accuracies": 0.5625, "rewards/chosen": 0.33864206075668335, "rewards/margins": 0.22059743106365204, "rewards/rejected": 0.11804463714361191, "step": 400 }, { "epoch": 0.3147128245476003, "eval_logits/chosen": 1.2971076965332031, "eval_logits/rejected": 1.0804717540740967, "eval_logps/chosen": -207.33456420898438, "eval_logps/rejected": -180.31930541992188, "eval_loss": 0.7093836069107056, "eval_rewards/accuracies": 0.598437488079071, "eval_rewards/chosen": 0.2542892098426819, "eval_rewards/margins": 0.22887463867664337, "eval_rewards/rejected": 0.025414561852812767, "eval_runtime": 301.2073, "eval_samples_per_second": 2.125, "eval_steps_per_second": 0.133, "step": 400 }, { "epoch": 0.31864673485444533, "grad_norm": 22.79604148864746, "learning_rate": 1.731447420042321e-05, "logits/chosen": -0.33927303552627563, "logits/rejected": -0.5682342052459717, "logps/chosen": -190.31930541992188, "logps/rejected": -173.07032775878906, "loss": 0.7979, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0728757381439209, "rewards/margins": 0.02971130609512329, "rewards/rejected": 0.0431644432246685, "step": 405 }, { "epoch": 0.3225806451612903, "grad_norm": 22.005783081054688, "learning_rate": 1.7220074927349452e-05, "logits/chosen": -0.3349539339542389, "logits/rejected": -0.6785364151000977, "logps/chosen": -205.6999969482422, "logps/rejected": -174.34982299804688, "loss": 0.6723, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.04839733988046646, "rewards/margins": 0.2823019027709961, "rewards/rejected": -0.23390455543994904, "step": 410 }, { "epoch": 0.32651455546813535, "grad_norm": 18.50445556640625, "learning_rate": 1.712431206944067e-05, "logits/chosen": -0.31676384806632996, "logits/rejected": -0.47476306557655334, "logps/chosen": -194.7633056640625, "logps/rejected": -185.64987182617188, "loss": 0.6637, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.2732119560241699, "rewards/margins": 0.3499985337257385, "rewards/rejected": -0.07678655534982681, "step": 415 }, { "epoch": 0.3304484657749803, "grad_norm": 21.16750144958496, "learning_rate": 1.7027203712489902e-05, "logits/chosen": -0.22730335593223572, "logits/rejected": -0.6324140429496765, "logps/chosen": -209.23678588867188, "logps/rejected": -177.7320098876953, "loss": 0.7066, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.41964513063430786, "rewards/margins": 0.263131707906723, "rewards/rejected": 0.15651337802410126, "step": 420 }, { "epoch": 0.33438237608182536, "grad_norm": 21.21584129333496, "learning_rate": 1.6928768196401403e-05, "logits/chosen": -0.19787462055683136, "logits/rejected": -0.5100497007369995, "logps/chosen": -213.1494140625, "logps/rejected": -194.2113800048828, "loss": 0.7113, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.434047132730484, "rewards/margins": 0.20316064357757568, "rewards/rejected": 0.2308865338563919, "step": 425 }, { "epoch": 0.33831628638867034, "grad_norm": 26.320444107055664, "learning_rate": 1.682902411172698e-05, "logits/chosen": -0.27940934896469116, "logits/rejected": -0.6819210052490234, "logps/chosen": -191.19189453125, "logps/rejected": -160.06234741210938, "loss": 0.672, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.6531416177749634, "rewards/margins": 0.32751747965812683, "rewards/rejected": 0.32562416791915894, "step": 430 }, { "epoch": 0.3422501966955153, "grad_norm": 16.507688522338867, "learning_rate": 1.6727990296154962e-05, "logits/chosen": -0.43093472719192505, "logits/rejected": -0.6659766435623169, "logps/chosen": -194.37916564941406, "logps/rejected": -175.87298583984375, "loss": 0.6782, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.9038249254226685, "rewards/margins": 0.3305993974208832, "rewards/rejected": 0.5732254385948181, "step": 435 }, { "epoch": 0.34618410700236035, "grad_norm": 15.00309944152832, "learning_rate": 1.6625685830952533e-05, "logits/chosen": -0.017139725387096405, "logits/rejected": -0.5116509199142456, "logps/chosen": -203.77554321289062, "logps/rejected": -166.87571716308594, "loss": 0.6715, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.8991168141365051, "rewards/margins": 0.4240299165248871, "rewards/rejected": 0.47508686780929565, "step": 440 }, { "epoch": 0.35011801730920533, "grad_norm": 22.238525390625, "learning_rate": 1.6522130037362018e-05, "logits/chosen": -0.4809524416923523, "logits/rejected": -0.77618408203125, "logps/chosen": -183.9463348388672, "logps/rejected": -168.94070434570312, "loss": 0.7005, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.9337043762207031, "rewards/margins": 0.2749274969100952, "rewards/rejected": 0.6587768197059631, "step": 445 }, { "epoch": 0.35405192761605037, "grad_norm": 17.745378494262695, "learning_rate": 1.641734247295189e-05, "logits/chosen": -0.4837673306465149, "logits/rejected": -0.8133207559585571, "logps/chosen": -187.5880126953125, "logps/rejected": -172.59933471679688, "loss": 0.6777, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.9450648427009583, "rewards/margins": 0.3392513394355774, "rewards/rejected": 0.6058135032653809, "step": 450 }, { "epoch": 0.35798583792289534, "grad_norm": 21.806243896484375, "learning_rate": 1.63113429279231e-05, "logits/chosen": -0.3670351207256317, "logits/rejected": -0.7418017387390137, "logps/chosen": -221.2038116455078, "logps/rejected": -184.3399200439453, "loss": 0.7212, "rewards/accuracies": 0.625, "rewards/chosen": 0.8858639001846313, "rewards/margins": 0.2686173915863037, "rewards/rejected": 0.6172465085983276, "step": 455 }, { "epoch": 0.3619197482297404, "grad_norm": 19.19058609008789, "learning_rate": 1.6204151421371504e-05, "logits/chosen": -0.5260201692581177, "logits/rejected": -0.887170672416687, "logps/chosen": -198.56930541992188, "logps/rejected": -170.34158325195312, "loss": 0.6642, "rewards/accuracies": 0.625, "rewards/chosen": 0.6595619320869446, "rewards/margins": 0.25892138481140137, "rewards/rejected": 0.4006405472755432, "step": 460 }, { "epoch": 0.36585365853658536, "grad_norm": 16.740882873535156, "learning_rate": 1.609578819750708e-05, "logits/chosen": -0.21146011352539062, "logits/rejected": -0.41337770223617554, "logps/chosen": -186.92779541015625, "logps/rejected": -183.7529754638672, "loss": 0.6911, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.568415641784668, "rewards/margins": 0.27034991979599, "rewards/rejected": 0.298065721988678, "step": 465 }, { "epoch": 0.3697875688434304, "grad_norm": 22.620988845825195, "learning_rate": 1.5986273721830557e-05, "logits/chosen": -0.17011170089244843, "logits/rejected": -0.5642642974853516, "logps/chosen": -206.16073608398438, "logps/rejected": -187.0243377685547, "loss": 0.73, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.44800883531570435, "rewards/margins": 0.19431404769420624, "rewards/rejected": 0.2536947727203369, "step": 470 }, { "epoch": 0.37372147915027537, "grad_norm": 19.39198112487793, "learning_rate": 1.587562867726832e-05, "logits/chosen": -0.18244773149490356, "logits/rejected": -0.5230101346969604, "logps/chosen": -223.02371215820312, "logps/rejected": -198.8177032470703, "loss": 0.6721, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.4324628710746765, "rewards/margins": 0.22442837059497833, "rewards/rejected": 0.20803451538085938, "step": 475 }, { "epoch": 0.3776553894571204, "grad_norm": 19.32149314880371, "learning_rate": 1.5763873960266236e-05, "logits/chosen": -0.29324209690093994, "logits/rejected": -0.5279776453971863, "logps/chosen": -206.15469360351562, "logps/rejected": -188.80137634277344, "loss": 0.6942, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.4868395924568176, "rewards/margins": 0.33774086833000183, "rewards/rejected": 0.14909867942333221, "step": 480 }, { "epoch": 0.3815892997639654, "grad_norm": 19.483469009399414, "learning_rate": 1.5673685398812467e-05, "logits/chosen": -0.1828387826681137, "logits/rejected": -0.41064882278442383, "logps/chosen": -217.49295043945312, "logps/rejected": -198.88177490234375, "loss": 0.7507, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 1.0268093347549438, "rewards/margins": 0.21613208949565887, "rewards/rejected": 0.810677170753479, "step": 485 }, { "epoch": 0.38552321007081036, "grad_norm": 28.394817352294922, "learning_rate": 1.555998659687541e-05, "logits/chosen": -0.49702200293540955, "logits/rejected": -1.0014259815216064, "logps/chosen": -197.88128662109375, "logps/rejected": -160.67999267578125, "loss": 0.6519, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.9069635272026062, "rewards/margins": 0.40647339820861816, "rewards/rejected": 0.5004900693893433, "step": 490 }, { "epoch": 0.3894571203776554, "grad_norm": 20.914031982421875, "learning_rate": 1.544523773472669e-05, "logits/chosen": 0.02130720391869545, "logits/rejected": -0.4486933648586273, "logps/chosen": -211.362060546875, "logps/rejected": -175.72430419921875, "loss": 0.685, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.1584622859954834, "rewards/margins": 0.3790398836135864, "rewards/rejected": 0.7794222831726074, "step": 495 }, { "epoch": 0.3933910306845004, "grad_norm": 14.320610046386719, "learning_rate": 1.532946048386001e-05, "logits/chosen": -0.010864943265914917, "logits/rejected": -0.5150319337844849, "logps/chosen": -207.92333984375, "logps/rejected": -178.11700439453125, "loss": 0.6224, "rewards/accuracies": 0.6875, "rewards/chosen": 1.1484225988388062, "rewards/margins": 0.4593985676765442, "rewards/rejected": 0.6890240907669067, "step": 500 }, { "epoch": 0.3933910306845004, "eval_logits/chosen": 1.327344536781311, "eval_logits/rejected": 1.1055529117584229, "eval_logps/chosen": -205.45755004882812, "eval_logps/rejected": -178.61904907226562, "eval_loss": 0.7026852369308472, "eval_rewards/accuracies": 0.620312511920929, "eval_rewards/chosen": 1.0051077604293823, "eval_rewards/margins": 0.29959002137184143, "eval_rewards/rejected": 0.7055177688598633, "eval_runtime": 297.7987, "eval_samples_per_second": 2.149, "eval_steps_per_second": 0.134, "step": 500 }, { "epoch": 0.3973249409913454, "grad_norm": 17.606443405151367, "learning_rate": 1.5212676709990762e-05, "logits/chosen": 0.12024303525686264, "logits/rejected": -0.33552008867263794, "logps/chosen": -205.59109497070312, "logps/rejected": -181.02566528320312, "loss": 0.6522, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.9811790585517883, "rewards/margins": 0.43993645906448364, "rewards/rejected": 0.5412425994873047, "step": 505 }, { "epoch": 0.4012588512981904, "grad_norm": 23.3114070892334, "learning_rate": 1.509490846892649e-05, "logits/chosen": 0.01656034216284752, "logits/rejected": -0.5744299292564392, "logps/chosen": -211.2788543701172, "logps/rejected": -167.57276916503906, "loss": 0.6138, "rewards/accuracies": 0.6875, "rewards/chosen": 0.8017475008964539, "rewards/margins": 0.5002428293228149, "rewards/rejected": 0.3015046715736389, "step": 510 }, { "epoch": 0.4051927616050354, "grad_norm": 14.10328197479248, "learning_rate": 1.4976178002401408e-05, "logits/chosen": -0.3282383978366852, "logits/rejected": -0.48758015036582947, "logps/chosen": -200.8679962158203, "logps/rejected": -179.44241333007812, "loss": 0.6479, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.6766945719718933, "rewards/margins": 0.3457737863063812, "rewards/rejected": 0.33092084527015686, "step": 515 }, { "epoch": 0.4091266719118804, "grad_norm": 26.593978881835938, "learning_rate": 1.4856507733875837e-05, "logits/chosen": -0.1160442978143692, "logits/rejected": -0.4207191467285156, "logps/chosen": -190.7376708984375, "logps/rejected": -169.13816833496094, "loss": 0.7379, "rewards/accuracies": 0.5625, "rewards/chosen": 0.7750043869018555, "rewards/margins": 0.34026703238487244, "rewards/rejected": 0.43473726511001587, "step": 520 }, { "epoch": 0.41306058221872544, "grad_norm": 17.67402458190918, "learning_rate": 1.4735920264301288e-05, "logits/chosen": -0.17023354768753052, "logits/rejected": -0.5197206735610962, "logps/chosen": -207.9748077392578, "logps/rejected": -182.002197265625, "loss": 0.7135, "rewards/accuracies": 0.5625, "rewards/chosen": 0.5062464475631714, "rewards/margins": 0.19488921761512756, "rewards/rejected": 0.31135720014572144, "step": 525 }, { "epoch": 0.4169944925255704, "grad_norm": 16.364791870117188, "learning_rate": 1.4614438367852056e-05, "logits/chosen": -0.35339441895484924, "logits/rejected": -0.6959262490272522, "logps/chosen": -202.8052215576172, "logps/rejected": -167.2289276123047, "loss": 0.6573, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.89134281873703, "rewards/margins": 0.39820951223373413, "rewards/rejected": 0.4931332468986511, "step": 530 }, { "epoch": 0.4209284028324154, "grad_norm": 19.59364891052246, "learning_rate": 1.4492084987624071e-05, "logits/chosen": -0.1122426763176918, "logits/rejected": -0.44985610246658325, "logps/chosen": -204.77981567382812, "logps/rejected": -181.18716430664062, "loss": 0.6709, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.9437880516052246, "rewards/margins": 0.4574647545814514, "rewards/rejected": 0.48632335662841797, "step": 535 }, { "epoch": 0.42486231313926043, "grad_norm": 17.59402084350586, "learning_rate": 1.4368883231301885e-05, "logits/chosen": -0.17638197541236877, "logits/rejected": -0.5632339715957642, "logps/chosen": -201.26885986328125, "logps/rejected": -170.08328247070312, "loss": 0.6228, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 1.184136986732483, "rewards/margins": 0.756480872631073, "rewards/rejected": 0.42765602469444275, "step": 540 }, { "epoch": 0.4287962234461054, "grad_norm": 27.206796646118164, "learning_rate": 1.4244856366794517e-05, "logits/chosen": -0.057549990713596344, "logits/rejected": -0.4487794041633606, "logps/chosen": -205.1177215576172, "logps/rejected": -177.13014221191406, "loss": 0.6294, "rewards/accuracies": 0.6875, "rewards/chosen": 1.0669742822647095, "rewards/margins": 0.5120534896850586, "rewards/rejected": 0.5549208521842957, "step": 545 }, { "epoch": 0.43273013375295044, "grad_norm": 16.399995803833008, "learning_rate": 1.4120027817841098e-05, "logits/chosen": -0.133390873670578, "logits/rejected": -0.47696390748023987, "logps/chosen": -214.5057373046875, "logps/rejected": -193.0947265625, "loss": 0.808, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.8214758038520813, "rewards/margins": 0.04125159978866577, "rewards/rejected": 0.7802242040634155, "step": 550 }, { "epoch": 0.4366640440597954, "grad_norm": 18.979785919189453, "learning_rate": 1.399442115958704e-05, "logits/chosen": -0.569675862789154, "logits/rejected": -0.8924716711044312, "logps/chosen": -211.4713897705078, "logps/rejected": -183.01220703125, "loss": 0.6587, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.8996235132217407, "rewards/margins": 0.45010414719581604, "rewards/rejected": 0.4495193362236023, "step": 555 }, { "epoch": 0.44059795436664045, "grad_norm": 21.638757705688477, "learning_rate": 1.3868060114131644e-05, "logits/chosen": -0.22702725231647491, "logits/rejected": -0.5234431028366089, "logps/chosen": -210.87393188476562, "logps/rejected": -195.6029052734375, "loss": 0.738, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 1.0586285591125488, "rewards/margins": 0.27768781781196594, "rewards/rejected": 0.7809406518936157, "step": 560 }, { "epoch": 0.44453186467348543, "grad_norm": 23.013927459716797, "learning_rate": 1.3740968546047935e-05, "logits/chosen": -0.17697608470916748, "logits/rejected": -0.4483562409877777, "logps/chosen": -211.2060089111328, "logps/rejected": -197.86001586914062, "loss": 0.7594, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.9211471676826477, "rewards/margins": 0.0961461290717125, "rewards/rejected": 0.825001060962677, "step": 565 }, { "epoch": 0.44846577498033047, "grad_norm": 20.101484298706055, "learning_rate": 1.3613170457875579e-05, "logits/chosen": -0.22834663093090057, "logits/rejected": -0.6228377223014832, "logps/chosen": -207.5561065673828, "logps/rejected": -182.3037567138672, "loss": 0.6097, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 1.1122691631317139, "rewards/margins": 0.5503975749015808, "rewards/rejected": 0.5618715882301331, "step": 570 }, { "epoch": 0.45239968528717545, "grad_norm": 26.358943939208984, "learning_rate": 1.348468998558779e-05, "logits/chosen": -0.13707995414733887, "logits/rejected": -0.44805946946144104, "logps/chosen": -220.7776641845703, "logps/rejected": -201.1964874267578, "loss": 0.713, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.9859493374824524, "rewards/margins": 0.3383699953556061, "rewards/rejected": 0.6475793123245239, "step": 575 }, { "epoch": 0.4563335955940205, "grad_norm": 16.33328628540039, "learning_rate": 1.3355551394032968e-05, "logits/chosen": -0.31562569737434387, "logits/rejected": -0.6708458065986633, "logps/chosen": -203.0553436279297, "logps/rejected": -176.8132781982422, "loss": 0.6889, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.7495515942573547, "rewards/margins": 0.3594974875450134, "rewards/rejected": 0.39005404710769653, "step": 580 }, { "epoch": 0.46026750590086546, "grad_norm": 29.162113189697266, "learning_rate": 1.3225779072352066e-05, "logits/chosen": -0.32384806871414185, "logits/rejected": -0.6729586124420166, "logps/chosen": -214.14102172851562, "logps/rejected": -184.0008087158203, "loss": 0.6698, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.8371032476425171, "rewards/margins": 0.3700554370880127, "rewards/rejected": 0.4670478403568268, "step": 585 }, { "epoch": 0.46420141620771044, "grad_norm": 25.16128921508789, "learning_rate": 1.309539752937243e-05, "logits/chosen": -0.256720632314682, "logits/rejected": -0.4291699528694153, "logps/chosen": -191.2805938720703, "logps/rejected": -184.6292266845703, "loss": 0.6755, "rewards/accuracies": 0.625, "rewards/chosen": 0.6577237248420715, "rewards/margins": 0.28180426359176636, "rewards/rejected": 0.3759194016456604, "step": 590 }, { "epoch": 0.46813532651455547, "grad_norm": 20.09102439880371, "learning_rate": 1.2964431388979075e-05, "logits/chosen": -0.3570843040943146, "logits/rejected": -0.8670114278793335, "logps/chosen": -203.76992797851562, "logps/rejected": -163.80783081054688, "loss": 0.6412, "rewards/accuracies": 0.625, "rewards/chosen": 0.7835728526115417, "rewards/margins": 0.5176677703857422, "rewards/rejected": 0.2659050524234772, "step": 595 }, { "epoch": 0.47206923682140045, "grad_norm": 22.330236434936523, "learning_rate": 1.2832905385464193e-05, "logits/chosen": -0.3153493404388428, "logits/rejected": -0.6954606771469116, "logps/chosen": -199.0489501953125, "logps/rejected": -172.42919921875, "loss": 0.6764, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.7396122217178345, "rewards/margins": 0.3455941677093506, "rewards/rejected": 0.39401811361312866, "step": 600 }, { "epoch": 0.47206923682140045, "eval_logits/chosen": 1.3154770135879517, "eval_logits/rejected": 1.0959367752075195, "eval_logps/chosen": -205.95361328125, "eval_logps/rejected": -179.14404296875, "eval_loss": 0.688846230506897, "eval_rewards/accuracies": 0.6234375238418579, "eval_rewards/chosen": 0.8066827058792114, "eval_rewards/margins": 0.3111591935157776, "eval_rewards/rejected": 0.49552351236343384, "eval_runtime": 282.0013, "eval_samples_per_second": 2.269, "eval_steps_per_second": 0.142, "step": 600 }, { "epoch": 0.4760031471282455, "grad_norm": 13.301490783691406, "learning_rate": 1.2700844358855853e-05, "logits/chosen": -0.2941150367259979, "logits/rejected": -0.7340162992477417, "logps/chosen": -194.4886932373047, "logps/rejected": -159.5877227783203, "loss": 0.6895, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.846507728099823, "rewards/margins": 0.3602963089942932, "rewards/rejected": 0.48621147871017456, "step": 605 }, { "epoch": 0.47993705743509046, "grad_norm": 19.667444229125977, "learning_rate": 1.2568273250226681e-05, "logits/chosen": -0.2455168217420578, "logits/rejected": -0.608180820941925, "logps/chosen": -225.4668426513672, "logps/rejected": -192.55905151367188, "loss": 0.6672, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 1.017348289489746, "rewards/margins": 0.38524192571640015, "rewards/rejected": 0.632106363773346, "step": 610 }, { "epoch": 0.4838709677419355, "grad_norm": 24.933828353881836, "learning_rate": 1.243521709698351e-05, "logits/chosen": -0.28044039011001587, "logits/rejected": -0.5124521255493164, "logps/chosen": -199.1013641357422, "logps/rejected": -195.05728149414062, "loss": 0.6967, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.9729631543159485, "rewards/margins": 0.31783193349838257, "rewards/rejected": 0.6551311016082764, "step": 615 }, { "epoch": 0.4878048780487805, "grad_norm": 21.9912109375, "learning_rate": 1.230170102813879e-05, "logits/chosen": -0.6046349406242371, "logits/rejected": -0.8912727236747742, "logps/chosen": -193.95303344726562, "logps/rejected": -169.863037109375, "loss": 0.6994, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.9892646670341492, "rewards/margins": 0.35485339164733887, "rewards/rejected": 0.6344112753868103, "step": 620 }, { "epoch": 0.4917387883556255, "grad_norm": 14.393425941467285, "learning_rate": 1.2167750259564733e-05, "logits/chosen": -0.21057292819023132, "logits/rejected": -0.6453763246536255, "logps/chosen": -197.05722045898438, "logps/rejected": -194.5146942138672, "loss": 0.6655, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.8438342809677124, "rewards/margins": 0.3562160134315491, "rewards/rejected": 0.4876182973384857, "step": 625 }, { "epoch": 0.4956726986624705, "grad_norm": 27.751855850219727, "learning_rate": 1.203339008923103e-05, "logits/chosen": -0.08632899820804596, "logits/rejected": -0.5858111381530762, "logps/chosen": -210.37890625, "logps/rejected": -181.04751586914062, "loss": 0.7106, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 1.0096272230148315, "rewards/margins": 0.4222971796989441, "rewards/rejected": 0.587330162525177, "step": 630 }, { "epoch": 0.4996066089693155, "grad_norm": 21.017240524291992, "learning_rate": 1.1898645892427064e-05, "logits/chosen": -0.48605161905288696, "logits/rejected": -0.6945669651031494, "logps/chosen": -182.28805541992188, "logps/rejected": -169.93661499023438, "loss": 0.7755, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.5226560831069946, "rewards/margins": 0.05550839379429817, "rewards/rejected": 0.46714773774147034, "step": 635 }, { "epoch": 0.5035405192761605, "grad_norm": 20.2221622467041, "learning_rate": 1.1763543116969549e-05, "logits/chosen": -0.10474424064159393, "logits/rejected": -0.5913185477256775, "logps/chosen": -209.303466796875, "logps/rejected": -173.1480255126953, "loss": 0.6692, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.6786335706710815, "rewards/margins": 0.3942939341068268, "rewards/rejected": 0.28433966636657715, "step": 640 }, { "epoch": 0.5074744295830055, "grad_norm": 15.26221752166748, "learning_rate": 1.1628107278396432e-05, "logits/chosen": -0.06124790757894516, "logits/rejected": -0.3360343873500824, "logps/chosen": -202.93270874023438, "logps/rejected": -184.75259399414062, "loss": 0.6547, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.39857378602027893, "rewards/margins": 0.2742787301540375, "rewards/rejected": 0.12429501861333847, "step": 645 }, { "epoch": 0.5114083398898505, "grad_norm": 18.45632553100586, "learning_rate": 1.1492363955148023e-05, "logits/chosen": -0.1759663075208664, "logits/rejected": -0.6530739665031433, "logps/chosen": -218.36123657226562, "logps/rejected": -199.7471160888672, "loss": 0.653, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.5292393565177917, "rewards/margins": 0.3620988726615906, "rewards/rejected": 0.16714049875736237, "step": 650 }, { "epoch": 0.5153422501966956, "grad_norm": 16.891386032104492, "learning_rate": 1.1356338783736256e-05, "logits/chosen": -0.4392605721950531, "logits/rejected": -0.7525895237922668, "logps/chosen": -194.24301147460938, "logps/rejected": -182.4429473876953, "loss": 0.6259, "rewards/accuracies": 0.625, "rewards/chosen": 0.4986444115638733, "rewards/margins": 0.49716418981552124, "rewards/rejected": 0.0014802322257310152, "step": 655 }, { "epoch": 0.5192761605035405, "grad_norm": 18.568416595458984, "learning_rate": 1.1220057453902973e-05, "logits/chosen": -0.2285362035036087, "logits/rejected": -0.6583995223045349, "logps/chosen": -219.6389617919922, "logps/rejected": -176.62965393066406, "loss": 0.6604, "rewards/accuracies": 0.6875, "rewards/chosen": 0.6904179453849792, "rewards/margins": 0.3659079670906067, "rewards/rejected": 0.32451000809669495, "step": 660 }, { "epoch": 0.5232100708103855, "grad_norm": 16.81451416015625, "learning_rate": 1.1083545703768137e-05, "logits/chosen": -0.3168891370296478, "logits/rejected": -0.5861741304397583, "logps/chosen": -198.4099578857422, "logps/rejected": -181.83871459960938, "loss": 0.736, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.6643240451812744, "rewards/margins": 0.17423763871192932, "rewards/rejected": 0.4900864064693451, "step": 665 }, { "epoch": 0.5271439811172305, "grad_norm": 20.030567169189453, "learning_rate": 1.0946829314968936e-05, "logits/chosen": -0.22313520312309265, "logits/rejected": -0.6608983874320984, "logps/chosen": -206.3205108642578, "logps/rejected": -178.14974975585938, "loss": 0.6314, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.784034252166748, "rewards/margins": 0.45540714263916016, "rewards/rejected": 0.3286270797252655, "step": 670 }, { "epoch": 0.5310778914240756, "grad_norm": 12.727190017700195, "learning_rate": 1.0809934107790675e-05, "logits/chosen": -0.1376127302646637, "logits/rejected": -0.5582663416862488, "logps/chosen": -207.121337890625, "logps/rejected": -189.23037719726562, "loss": 0.5616, "rewards/accuracies": 0.75, "rewards/chosen": 1.1340868473052979, "rewards/margins": 0.6862513422966003, "rewards/rejected": 0.4478355050086975, "step": 675 }, { "epoch": 0.5350118017309206, "grad_norm": 15.704160690307617, "learning_rate": 1.0672885936290316e-05, "logits/chosen": -0.11958789825439453, "logits/rejected": -0.41796404123306274, "logps/chosen": -200.3405303955078, "logps/rejected": -185.74917602539062, "loss": 0.7025, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 1.0963573455810547, "rewards/margins": 0.3328610956668854, "rewards/rejected": 0.7634962797164917, "step": 680 }, { "epoch": 0.5389457120377655, "grad_norm": 16.583145141601562, "learning_rate": 1.05357106834137e-05, "logits/chosen": -0.035154812037944794, "logits/rejected": -0.6018010377883911, "logps/chosen": -214.5799102783203, "logps/rejected": -181.4016571044922, "loss": 0.6904, "rewards/accuracies": 0.625, "rewards/chosen": 0.9387739300727844, "rewards/margins": 0.34907636046409607, "rewards/rejected": 0.5896975994110107, "step": 685 }, { "epoch": 0.5428796223446105, "grad_norm": 15.397040367126465, "learning_rate": 1.0398434256107291e-05, "logits/chosen": -0.3040166199207306, "logits/rejected": -0.6104984283447266, "logps/chosen": -190.73818969726562, "logps/rejected": -172.9613037109375, "loss": 0.6723, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.8516994714736938, "rewards/margins": 0.3647121787071228, "rewards/rejected": 0.48698729276657104, "step": 690 }, { "epoch": 0.5468135326514555, "grad_norm": 17.214340209960938, "learning_rate": 1.0261082580425366e-05, "logits/chosen": -0.25491005182266235, "logits/rejected": -0.7748223543167114, "logps/chosen": -205.028564453125, "logps/rejected": -169.1365966796875, "loss": 0.6359, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.8545015454292297, "rewards/margins": 0.4432094693183899, "rewards/rejected": 0.41129201650619507, "step": 695 }, { "epoch": 0.5507474429583006, "grad_norm": 18.72207260131836, "learning_rate": 1.012368159663363e-05, "logits/chosen": -0.43465644121170044, "logits/rejected": -0.6075267195701599, "logps/chosen": -198.85336303710938, "logps/rejected": -185.84034729003906, "loss": 0.6205, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.8257676959037781, "rewards/margins": 0.442889541387558, "rewards/rejected": 0.38287803530693054, "step": 700 }, { "epoch": 0.5507474429583006, "eval_logits/chosen": 1.289400339126587, "eval_logits/rejected": 1.06741201877594, "eval_logps/chosen": -206.27685546875, "eval_logps/rejected": -179.56541442871094, "eval_loss": 0.6758726835250854, "eval_rewards/accuracies": 0.6343749761581421, "eval_rewards/chosen": 0.6773768067359924, "eval_rewards/margins": 0.3504090905189514, "eval_rewards/rejected": 0.32696765661239624, "eval_runtime": 264.1292, "eval_samples_per_second": 2.423, "eval_steps_per_second": 0.151, "step": 700 }, { "epoch": 0.5546813532651456, "grad_norm": 20.8519344329834, "learning_rate": 9.98625725431013e-06, "logits/chosen": -0.020856428891420364, "logits/rejected": -0.20043806731700897, "logps/chosen": -193.96920776367188, "logps/rejected": -172.1241912841797, "loss": 0.7039, "rewards/accuracies": 0.5625, "rewards/chosen": 0.5150532722473145, "rewards/margins": 0.1648593544960022, "rewards/rejected": 0.35019388794898987, "step": 705 }, { "epoch": 0.5586152635719905, "grad_norm": 18.23834800720215, "learning_rate": 9.848835507444405e-06, "logits/chosen": -0.17138266563415527, "logits/rejected": -0.5400444269180298, "logps/chosen": -213.20947265625, "logps/rejected": -179.41683959960938, "loss": 0.5993, "rewards/accuracies": 0.6875, "rewards/chosen": 0.878060519695282, "rewards/margins": 0.5326789617538452, "rewards/rejected": 0.34538155794143677, "step": 710 }, { "epoch": 0.5625491738788355, "grad_norm": 17.19778060913086, "learning_rate": 9.71144230953582e-06, "logits/chosen": -0.15033751726150513, "logits/rejected": -0.6573851108551025, "logps/chosen": -209.91763305664062, "logps/rejected": -173.20547485351562, "loss": 0.637, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.7313550710678101, "rewards/margins": 0.45394793152809143, "rewards/rejected": 0.27740710973739624, "step": 715 }, { "epoch": 0.5664830841856806, "grad_norm": 17.859058380126953, "learning_rate": 9.574103608691974e-06, "logits/chosen": -0.1018882766366005, "logits/rejected": -0.3827294111251831, "logps/chosen": -217.5899658203125, "logps/rejected": -190.86546325683594, "loss": 0.7034, "rewards/accuracies": 0.5, "rewards/chosen": 0.7803667187690735, "rewards/margins": 0.14793583750724792, "rewards/rejected": 0.632430911064148, "step": 720 }, { "epoch": 0.5704169944925256, "grad_norm": 17.891475677490234, "learning_rate": 9.436845342728142e-06, "logits/chosen": -0.23665161430835724, "logits/rejected": -0.6916168928146362, "logps/chosen": -198.93873596191406, "logps/rejected": -166.03292846679688, "loss": 0.6421, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.801128089427948, "rewards/margins": 0.4237571656703949, "rewards/rejected": 0.3773708939552307, "step": 725 }, { "epoch": 0.5743509047993706, "grad_norm": 17.744354248046875, "learning_rate": 9.299693434268653e-06, "logits/chosen": -0.01328353863209486, "logits/rejected": -0.2819923758506775, "logps/chosen": -207.9522705078125, "logps/rejected": -188.49993896484375, "loss": 0.6878, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.904397189617157, "rewards/margins": 0.36915481090545654, "rewards/rejected": 0.5352423787117004, "step": 730 }, { "epoch": 0.5782848151062155, "grad_norm": 18.68268394470215, "learning_rate": 9.162673785851131e-06, "logits/chosen": -0.39516356587409973, "logits/rejected": -0.7670010328292847, "logps/chosen": -204.0966796875, "logps/rejected": -170.11227416992188, "loss": 0.6341, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.8424245715141296, "rewards/margins": 0.40797433257102966, "rewards/rejected": 0.43445029854774475, "step": 735 }, { "epoch": 0.5822187254130606, "grad_norm": 14.530721664428711, "learning_rate": 9.025812275034541e-06, "logits/chosen": -0.14751622080802917, "logits/rejected": -0.5135005116462708, "logps/chosen": -225.6256866455078, "logps/rejected": -200.2797393798828, "loss": 0.621, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 1.029211401939392, "rewards/margins": 0.5424867868423462, "rewards/rejected": 0.48672476410865784, "step": 740 }, { "epoch": 0.5861526357199056, "grad_norm": 18.743927001953125, "learning_rate": 8.889134749511956e-06, "logits/chosen": -0.11462094634771347, "logits/rejected": -0.38805294036865234, "logps/chosen": -207.6776123046875, "logps/rejected": -181.88101196289062, "loss": 0.7368, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.7982211709022522, "rewards/margins": 0.21776151657104492, "rewards/rejected": 0.580459713935852, "step": 745 }, { "epoch": 0.5900865460267506, "grad_norm": 14.667529106140137, "learning_rate": 8.752667022228936e-06, "logits/chosen": -0.022926175966858864, "logits/rejected": -0.4718795418739319, "logps/chosen": -216.82284545898438, "logps/rejected": -186.5943603515625, "loss": 0.622, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.8725186586380005, "rewards/margins": 0.6078484058380127, "rewards/rejected": 0.264670193195343, "step": 750 }, { "epoch": 0.5940204563335956, "grad_norm": 20.248031616210938, "learning_rate": 8.616434866508519e-06, "logits/chosen": -0.15943610668182373, "logits/rejected": -0.6148089170455933, "logps/chosen": -209.1900177001953, "logps/rejected": -184.60047912597656, "loss": 0.6446, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.7891548871994019, "rewards/margins": 0.48758840560913086, "rewards/rejected": 0.30156660079956055, "step": 755 }, { "epoch": 0.5979543666404405, "grad_norm": 16.850963592529297, "learning_rate": 8.480464011183631e-06, "logits/chosen": -0.2673138678073883, "logits/rejected": -0.6848293542861938, "logps/chosen": -201.9542999267578, "logps/rejected": -168.80638122558594, "loss": 0.6669, "rewards/accuracies": 0.625, "rewards/chosen": 0.6175512671470642, "rewards/margins": 0.30348506569862366, "rewards/rejected": 0.31406617164611816, "step": 760 }, { "epoch": 0.6018882769472856, "grad_norm": 18.8007755279541, "learning_rate": 8.344780135737962e-06, "logits/chosen": -0.31253287196159363, "logits/rejected": -0.8586766123771667, "logps/chosen": -212.3469696044922, "logps/rejected": -163.8748321533203, "loss": 0.6595, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.8451669812202454, "rewards/margins": 0.5855604410171509, "rewards/rejected": 0.2596065402030945, "step": 765 }, { "epoch": 0.6058221872541306, "grad_norm": 13.551706314086914, "learning_rate": 8.209408865456127e-06, "logits/chosen": -0.13036459684371948, "logits/rejected": -0.4954930245876312, "logps/chosen": -213.2278289794922, "logps/rejected": -188.24514770507812, "loss": 0.662, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.8455514907836914, "rewards/margins": 0.34862059354782104, "rewards/rejected": 0.49693092703819275, "step": 770 }, { "epoch": 0.6097560975609756, "grad_norm": 17.73063087463379, "learning_rate": 8.074375766584053e-06, "logits/chosen": 0.0039010108448565006, "logits/rejected": -0.5214850306510925, "logps/chosen": -213.3166046142578, "logps/rejected": -174.0699005126953, "loss": 0.717, "rewards/accuracies": 0.5625, "rewards/chosen": 0.6939308643341064, "rewards/margins": 0.32082659006118774, "rewards/rejected": 0.3731042742729187, "step": 775 }, { "epoch": 0.6136900078678206, "grad_norm": 14.620991706848145, "learning_rate": 7.939706341500555e-06, "logits/chosen": -0.04872986674308777, "logits/rejected": -0.4084659516811371, "logps/chosen": -194.51834106445312, "logps/rejected": -185.00225830078125, "loss": 0.5966, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.8033088445663452, "rewards/margins": 0.5693622827529907, "rewards/rejected": 0.23394668102264404, "step": 780 }, { "epoch": 0.6176239181746657, "grad_norm": 13.0098876953125, "learning_rate": 7.805426023900938e-06, "logits/chosen": -0.4255433976650238, "logits/rejected": -0.7939322590827942, "logps/chosen": -190.10177612304688, "logps/rejected": -162.91436767578125, "loss": 0.7034, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.6892917156219482, "rewards/margins": 0.3028218150138855, "rewards/rejected": 0.38646987080574036, "step": 785 }, { "epoch": 0.6215578284815106, "grad_norm": 22.03873634338379, "learning_rate": 7.671560173993588e-06, "logits/chosen": -0.08852169662714005, "logits/rejected": -0.4719138741493225, "logps/chosen": -199.76376342773438, "logps/rejected": -182.2493896484375, "loss": 0.6744, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.7909868359565735, "rewards/margins": 0.3397650420665741, "rewards/rejected": 0.4512217938899994, "step": 790 }, { "epoch": 0.6254917387883556, "grad_norm": 18.647151947021484, "learning_rate": 7.538134073710437e-06, "logits/chosen": -0.38996896147727966, "logits/rejected": -0.6869844198226929, "logps/chosen": -198.90866088867188, "logps/rejected": -178.61019897460938, "loss": 0.7028, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.7868278622627258, "rewards/margins": 0.44276612997055054, "rewards/rejected": 0.3440617322921753, "step": 795 }, { "epoch": 0.6294256490952006, "grad_norm": 17.837268829345703, "learning_rate": 7.405172921932214e-06, "logits/chosen": -0.09680289775133133, "logits/rejected": -0.4570208191871643, "logps/chosen": -196.43899536132812, "logps/rejected": -173.35025024414062, "loss": 0.6309, "rewards/accuracies": 0.625, "rewards/chosen": 0.7571867108345032, "rewards/margins": 0.43233370780944824, "rewards/rejected": 0.32485300302505493, "step": 800 }, { "epoch": 0.6294256490952006, "eval_logits/chosen": 1.2894115447998047, "eval_logits/rejected": 1.0707098245620728, "eval_logps/chosen": -206.11080932617188, "eval_logps/rejected": -179.48574829101562, "eval_loss": 0.6793522834777832, "eval_rewards/accuracies": 0.6265624761581421, "eval_rewards/chosen": 0.7437959313392639, "eval_rewards/margins": 0.384955495595932, "eval_rewards/rejected": 0.3588404655456543, "eval_runtime": 298.0621, "eval_samples_per_second": 2.147, "eval_steps_per_second": 0.134, "step": 800 }, { "epoch": 0.6333595594020456, "grad_norm": 23.481149673461914, "learning_rate": 7.272701829729378e-06, "logits/chosen": -0.09348127245903015, "logits/rejected": -0.39429792761802673, "logps/chosen": -222.31369018554688, "logps/rejected": -189.89024353027344, "loss": 0.7434, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.7474627494812012, "rewards/margins": 0.24622318148612976, "rewards/rejected": 0.5012395977973938, "step": 805 }, { "epoch": 0.6372934697088907, "grad_norm": 18.71939468383789, "learning_rate": 7.140745815619632e-06, "logits/chosen": -0.09522039443254471, "logits/rejected": -0.4288865923881531, "logps/chosen": -198.81405639648438, "logps/rejected": -192.83120727539062, "loss": 0.6662, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.610317587852478, "rewards/margins": 0.3116861879825592, "rewards/rejected": 0.2986314296722412, "step": 810 }, { "epoch": 0.6412273800157356, "grad_norm": 18.34478759765625, "learning_rate": 7.009329800842929e-06, "logits/chosen": 0.017814218997955322, "logits/rejected": -0.3244866132736206, "logps/chosen": -229.75381469726562, "logps/rejected": -199.60000610351562, "loss": 0.7092, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.5720285177230835, "rewards/margins": 0.19818969070911407, "rewards/rejected": 0.3738388121128082, "step": 815 }, { "epoch": 0.6451612903225806, "grad_norm": 16.03777313232422, "learning_rate": 6.878478604654835e-06, "logits/chosen": -0.284344345331192, "logits/rejected": -0.6540359258651733, "logps/chosen": -195.71812438964844, "logps/rejected": -176.70550537109375, "loss": 0.5904, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.6928594708442688, "rewards/margins": 0.6011512875556946, "rewards/rejected": 0.09170810133218765, "step": 820 }, { "epoch": 0.6490952006294256, "grad_norm": 22.05975914001465, "learning_rate": 6.748216939639158e-06, "logits/chosen": 0.07760115712881088, "logits/rejected": -0.4913705885410309, "logps/chosen": -190.44102478027344, "logps/rejected": -163.40457153320312, "loss": 0.6636, "rewards/accuracies": 0.75, "rewards/chosen": 0.5673459768295288, "rewards/margins": 0.46832141280174255, "rewards/rejected": 0.09902457147836685, "step": 825 }, { "epoch": 0.6530291109362707, "grad_norm": 19.04427146911621, "learning_rate": 6.618569407040736e-06, "logits/chosen": -0.2564006745815277, "logits/rejected": -0.621497392654419, "logps/chosen": -198.78524780273438, "logps/rejected": -172.7997283935547, "loss": 0.6624, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.6063997149467468, "rewards/margins": 0.4061097204685211, "rewards/rejected": 0.2002900391817093, "step": 830 }, { "epoch": 0.6569630212431157, "grad_norm": 13.502724647521973, "learning_rate": 6.489560492119225e-06, "logits/chosen": 0.06354556977748871, "logits/rejected": -0.4314854145050049, "logps/chosen": -215.6816864013672, "logps/rejected": -183.03579711914062, "loss": 0.6743, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.43139171600341797, "rewards/margins": 0.3207935392856598, "rewards/rejected": 0.11059819161891937, "step": 835 }, { "epoch": 0.6608969315499607, "grad_norm": 15.181354522705078, "learning_rate": 6.361214559524817e-06, "logits/chosen": -0.3440548777580261, "logits/rejected": -0.6467902660369873, "logps/chosen": -194.0684814453125, "logps/rejected": -180.21780395507812, "loss": 0.615, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.5782068967819214, "rewards/margins": 0.4565269947052002, "rewards/rejected": 0.12167992442846298, "step": 840 }, { "epoch": 0.6648308418568056, "grad_norm": 79.10075378417969, "learning_rate": 6.233555848696724e-06, "logits/chosen": -0.293182373046875, "logits/rejected": -0.5915425419807434, "logps/chosen": -208.3809356689453, "logps/rejected": -191.13064575195312, "loss": 0.7247, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.5359665155410767, "rewards/margins": 0.28759217262268066, "rewards/rejected": 0.24837426841259003, "step": 845 }, { "epoch": 0.6687647521636507, "grad_norm": 18.02682113647461, "learning_rate": 6.1066084692853224e-06, "logits/chosen": -0.03417937830090523, "logits/rejected": -0.43492475152015686, "logps/chosen": -212.67398071289062, "logps/rejected": -183.54196166992188, "loss": 0.6832, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3686201870441437, "rewards/margins": 0.2786737084388733, "rewards/rejected": 0.08994650840759277, "step": 850 }, { "epoch": 0.6726986624704957, "grad_norm": 17.677215576171875, "learning_rate": 5.980396396598777e-06, "logits/chosen": -0.2180563509464264, "logits/rejected": -0.3799629211425781, "logps/chosen": -192.2188720703125, "logps/rejected": -187.93289184570312, "loss": 0.6909, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.4506203234195709, "rewards/margins": 0.31998997926712036, "rewards/rejected": 0.13063031435012817, "step": 855 }, { "epoch": 0.6766325727773407, "grad_norm": 13.698114395141602, "learning_rate": 5.854943467075087e-06, "logits/chosen": -0.22957925498485565, "logits/rejected": -0.5203697085380554, "logps/chosen": -198.90037536621094, "logps/rejected": -180.50279235839844, "loss": 0.6282, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.4433286786079407, "rewards/margins": 0.4702211916446686, "rewards/rejected": -0.026892513036727905, "step": 860 }, { "epoch": 0.6805664830841857, "grad_norm": 16.75077247619629, "learning_rate": 5.730273373780309e-06, "logits/chosen": -0.3643267750740051, "logits/rejected": -0.7527881860733032, "logps/chosen": -193.90756225585938, "logps/rejected": -173.71755981445312, "loss": 0.6902, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.46958428621292114, "rewards/margins": 0.43391847610473633, "rewards/rejected": 0.03566574305295944, "step": 865 }, { "epoch": 0.6845003933910306, "grad_norm": 21.622961044311523, "learning_rate": 5.606409661933889e-06, "logits/chosen": -0.023716717958450317, "logits/rejected": -0.3822089731693268, "logps/chosen": -221.4508056640625, "logps/rejected": -188.75930786132812, "loss": 0.7406, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.6389329433441162, "rewards/margins": 0.3061096668243408, "rewards/rejected": 0.332823246717453, "step": 870 }, { "epoch": 0.6884343036978757, "grad_norm": 19.141998291015625, "learning_rate": 5.483375724461918e-06, "logits/chosen": -0.36916786432266235, "logits/rejected": -0.8393670320510864, "logps/chosen": -201.64920043945312, "logps/rejected": -163.6253662109375, "loss": 0.6788, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.36916905641555786, "rewards/margins": 0.3855450749397278, "rewards/rejected": -0.016376061365008354, "step": 875 }, { "epoch": 0.6923682140047207, "grad_norm": 20.840383529663086, "learning_rate": 5.361194797579108e-06, "logits/chosen": -0.27600985765457153, "logits/rejected": -0.7273412346839905, "logps/chosen": -213.323486328125, "logps/rejected": -172.2437286376953, "loss": 0.7035, "rewards/accuracies": 0.625, "rewards/chosen": 0.7029854655265808, "rewards/margins": 0.3768990635871887, "rewards/rejected": 0.3260864317417145, "step": 880 }, { "epoch": 0.6963021243115657, "grad_norm": 20.24435806274414, "learning_rate": 5.239889956400435e-06, "logits/chosen": 0.13340488076210022, "logits/rejected": -0.46101540327072144, "logps/chosen": -217.2809295654297, "logps/rejected": -176.47802734375, "loss": 0.6408, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.5957245230674744, "rewards/margins": 0.416795551776886, "rewards/rejected": 0.17892900109291077, "step": 885 }, { "epoch": 0.7002360346184107, "grad_norm": 18.37978172302246, "learning_rate": 5.119484110583135e-06, "logits/chosen": -0.4709344506263733, "logits/rejected": -0.7668399810791016, "logps/chosen": -200.41390991210938, "logps/rejected": -169.01779174804688, "loss": 0.6936, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.39610370993614197, "rewards/margins": 0.3159303665161133, "rewards/rejected": 0.08017335832118988, "step": 890 }, { "epoch": 0.7041699449252558, "grad_norm": 14.384517669677734, "learning_rate": 5.000000000000003e-06, "logits/chosen": -0.2237352579832077, "logits/rejected": -0.7978562116622925, "logps/chosen": -200.4236297607422, "logps/rejected": -168.09664916992188, "loss": 0.6054, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.5839098691940308, "rewards/margins": 0.5763157606124878, "rewards/rejected": 0.007594155613332987, "step": 895 }, { "epoch": 0.7081038552321007, "grad_norm": 23.844955444335938, "learning_rate": 4.881460190444726e-06, "logits/chosen": -0.57319176197052, "logits/rejected": -0.7391110062599182, "logps/chosen": -205.91015625, "logps/rejected": -186.86459350585938, "loss": 0.6875, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5059628486633301, "rewards/margins": 0.3120475113391876, "rewards/rejected": 0.19391539692878723, "step": 900 }, { "epoch": 0.7081038552321007, "eval_logits/chosen": 1.2854810953140259, "eval_logits/rejected": 1.0660665035247803, "eval_logps/chosen": -206.6718292236328, "eval_logps/rejected": -179.9932861328125, "eval_loss": 0.6678879857063293, "eval_rewards/accuracies": 0.6265624761581421, "eval_rewards/chosen": 0.519389808177948, "eval_rewards/margins": 0.3635701537132263, "eval_rewards/rejected": 0.15581969916820526, "eval_runtime": 296.5851, "eval_samples_per_second": 2.158, "eval_steps_per_second": 0.135, "step": 900 }, { "epoch": 0.7120377655389457, "grad_norm": 14.835896492004395, "learning_rate": 4.763887069370107e-06, "logits/chosen": -0.1812276542186737, "logits/rejected": -0.5340962409973145, "logps/chosen": -184.94485473632812, "logps/rejected": -169.592041015625, "loss": 0.6794, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.4734250605106354, "rewards/margins": 0.3993573486804962, "rewards/rejected": 0.07406774908304214, "step": 905 }, { "epoch": 0.7159716758457907, "grad_norm": 24.3856143951416, "learning_rate": 4.64730284165996e-06, "logits/chosen": -0.04929916188120842, "logits/rejected": -0.5009157061576843, "logps/chosen": -225.0531768798828, "logps/rejected": -193.8749237060547, "loss": 0.6584, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.7070298194885254, "rewards/margins": 0.4039214551448822, "rewards/rejected": 0.3031083941459656, "step": 910 }, { "epoch": 0.7199055861526357, "grad_norm": 22.8303279876709, "learning_rate": 4.531729525435501e-06, "logits/chosen": 0.0025139451026916504, "logits/rejected": -0.6012422442436218, "logps/chosen": -205.25, "logps/rejected": -166.71438598632812, "loss": 0.639, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.6045829653739929, "rewards/margins": 0.42883044481277466, "rewards/rejected": 0.17575259506702423, "step": 915 }, { "epoch": 0.7238394964594808, "grad_norm": 14.778836250305176, "learning_rate": 4.417188947896983e-06, "logits/chosen": -0.30647343397140503, "logits/rejected": -0.6068025827407837, "logps/chosen": -185.31884765625, "logps/rejected": -171.61390686035156, "loss": 0.6358, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.5817626118659973, "rewards/margins": 0.4069378972053528, "rewards/rejected": 0.17482469975948334, "step": 920 }, { "epoch": 0.7277734067663257, "grad_norm": 14.139073371887207, "learning_rate": 4.303702741201431e-06, "logits/chosen": -0.5711551904678345, "logits/rejected": -0.8691667318344116, "logps/chosen": -192.8331298828125, "logps/rejected": -175.0562286376953, "loss": 0.6808, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.38311484456062317, "rewards/margins": 0.3181079924106598, "rewards/rejected": 0.0650068148970604, "step": 925 }, { "epoch": 0.7317073170731707, "grad_norm": 15.895308494567871, "learning_rate": 4.1912923383771685e-06, "logits/chosen": -0.36842986941337585, "logits/rejected": -0.7152490019798279, "logps/chosen": -211.0810089111328, "logps/rejected": -196.27755737304688, "loss": 0.6735, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.5359424948692322, "rewards/margins": 0.3752870559692383, "rewards/rejected": 0.16065548360347748, "step": 930 }, { "epoch": 0.7356412273800157, "grad_norm": 14.658058166503906, "learning_rate": 4.079978969275984e-06, "logits/chosen": -0.5706170797348022, "logits/rejected": -0.852310299873352, "logps/chosen": -176.20578002929688, "logps/rejected": -158.9827423095703, "loss": 0.7049, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.29048237204551697, "rewards/margins": 0.3135663866996765, "rewards/rejected": -0.023084009066224098, "step": 935 }, { "epoch": 0.7395751376868608, "grad_norm": 14.667938232421875, "learning_rate": 3.9697836565636484e-06, "logits/chosen": -0.0873163565993309, "logits/rejected": -0.4978067874908447, "logps/chosen": -219.19210815429688, "logps/rejected": -186.28640747070312, "loss": 0.6177, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.48243242502212524, "rewards/margins": 0.47774791717529297, "rewards/rejected": 0.004684485495090485, "step": 940 }, { "epoch": 0.7435090479937058, "grad_norm": 20.62685775756836, "learning_rate": 3.860727211749572e-06, "logits/chosen": -0.3459232449531555, "logits/rejected": -0.6185725927352905, "logps/chosen": -204.01295471191406, "logps/rejected": -179.14883422851562, "loss": 0.6996, "rewards/accuracies": 0.625, "rewards/chosen": 0.7561392188072205, "rewards/margins": 0.43435636162757874, "rewards/rejected": 0.3217828571796417, "step": 945 }, { "epoch": 0.7474429583005507, "grad_norm": 17.138633728027344, "learning_rate": 3.7528302312563447e-06, "logits/chosen": -0.21280460059642792, "logits/rejected": -0.6648741960525513, "logps/chosen": -207.45266723632812, "logps/rejected": -169.14617919921875, "loss": 0.7068, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.38267606496810913, "rewards/margins": 0.17945952713489532, "rewards/rejected": 0.2032165229320526, "step": 950 }, { "epoch": 0.7513768686073957, "grad_norm": 58.220947265625, "learning_rate": 3.646113092529878e-06, "logits/chosen": -0.21766535937786102, "logits/rejected": -0.6996904611587524, "logps/chosen": -225.0487060546875, "logps/rejected": -184.19442749023438, "loss": 0.7056, "rewards/accuracies": 0.625, "rewards/chosen": 0.6735895872116089, "rewards/margins": 0.4389261305332184, "rewards/rejected": 0.2346634566783905, "step": 955 }, { "epoch": 0.7553107789142408, "grad_norm": 20.379343032836914, "learning_rate": 3.5405959501909313e-06, "logits/chosen": -0.18848784267902374, "logits/rejected": -0.5305780172348022, "logps/chosen": -212.13162231445312, "logps/rejected": -186.52542114257812, "loss": 0.6872, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.3702337145805359, "rewards/margins": 0.29016590118408203, "rewards/rejected": 0.08006780594587326, "step": 960 }, { "epoch": 0.7592446892210858, "grad_norm": 17.178056716918945, "learning_rate": 3.436298732228699e-06, "logits/chosen": -0.21896116435527802, "logits/rejected": -0.6624099612236023, "logps/chosen": -205.2207794189453, "logps/rejected": -170.05699157714844, "loss": 0.6446, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.4240780472755432, "rewards/margins": 0.37573105096817017, "rewards/rejected": 0.04834695905447006, "step": 965 }, { "epoch": 0.7631785995279308, "grad_norm": 19.06415557861328, "learning_rate": 3.3332411362372063e-06, "logits/chosen": -0.15206289291381836, "logits/rejected": -0.4406839907169342, "logps/chosen": -186.83627319335938, "logps/rejected": -164.04739379882812, "loss": 0.6972, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.5690515041351318, "rewards/margins": 0.338064044713974, "rewards/rejected": 0.23098750412464142, "step": 970 }, { "epoch": 0.7671125098347757, "grad_norm": 19.997249603271484, "learning_rate": 3.231442625695217e-06, "logits/chosen": -0.4492325186729431, "logits/rejected": -0.6821542978286743, "logps/chosen": -192.6551971435547, "logps/rejected": -174.02772521972656, "loss": 0.6523, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.41465169191360474, "rewards/margins": 0.4233173727989197, "rewards/rejected": -0.008665725588798523, "step": 975 }, { "epoch": 0.7710464201416207, "grad_norm": 21.12126350402832, "learning_rate": 3.1309224262903614e-06, "logits/chosen": -0.0248140636831522, "logits/rejected": -0.2627066373825073, "logps/chosen": -214.6104278564453, "logps/rejected": -192.9540557861328, "loss": 0.6733, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.5803993940353394, "rewards/margins": 0.2992710471153259, "rewards/rejected": 0.28112831711769104, "step": 980 }, { "epoch": 0.7749803304484658, "grad_norm": 12.457499504089355, "learning_rate": 3.0316995222881584e-06, "logits/chosen": -0.40065187215805054, "logits/rejected": -0.8357529640197754, "logps/chosen": -192.20655822753906, "logps/rejected": -164.68626403808594, "loss": 0.6292, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.5738979578018188, "rewards/margins": 0.4073941111564636, "rewards/rejected": 0.1665038764476776, "step": 985 }, { "epoch": 0.7789142407553108, "grad_norm": 12.965932846069336, "learning_rate": 2.9337926529466578e-06, "logits/chosen": -0.5754062533378601, "logits/rejected": -0.9457462430000305, "logps/chosen": -189.44522094726562, "logps/rejected": -169.0963897705078, "loss": 0.6242, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.49839162826538086, "rewards/margins": 0.4758077561855316, "rewards/rejected": 0.02258378639817238, "step": 990 }, { "epoch": 0.7828481510621558, "grad_norm": 14.567062377929688, "learning_rate": 2.83722030897733e-06, "logits/chosen": 0.24449042975902557, "logits/rejected": -0.30078762769699097, "logps/chosen": -205.9731903076172, "logps/rejected": -173.31008911132812, "loss": 0.5947, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.5201369524002075, "rewards/margins": 0.5564968585968018, "rewards/rejected": -0.036359887570142746, "step": 995 }, { "epoch": 0.7867820613690008, "grad_norm": 18.595260620117188, "learning_rate": 2.7420007290529118e-06, "logits/chosen": -0.1308153122663498, "logits/rejected": -0.6352800726890564, "logps/chosen": -224.5437469482422, "logps/rejected": -178.47549438476562, "loss": 0.6361, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.6366292238235474, "rewards/margins": 0.4467080235481262, "rewards/rejected": 0.18992114067077637, "step": 1000 }, { "epoch": 0.7867820613690008, "eval_logits/chosen": 1.278507113456726, "eval_logits/rejected": 1.058009147644043, "eval_logps/chosen": -206.54354858398438, "eval_logps/rejected": -179.86978149414062, "eval_loss": 0.6649525165557861, "eval_rewards/accuracies": 0.625, "eval_rewards/chosen": 0.5706965923309326, "eval_rewards/margins": 0.3654647767543793, "eval_rewards/rejected": 0.20523183047771454, "eval_runtime": 301.4428, "eval_samples_per_second": 2.123, "eval_steps_per_second": 0.133, "step": 1000 }, { "epoch": 0.7907159716758458, "grad_norm": 19.2440242767334, "learning_rate": 2.6481518963628383e-06, "logits/chosen": -0.11340751498937607, "logits/rejected": -0.31099405884742737, "logps/chosen": -212.424072265625, "logps/rejected": -195.0722198486328, "loss": 0.6193, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.5051929354667664, "rewards/margins": 0.5108169317245483, "rewards/rejected": -0.00562392920255661, "step": 1005 }, { "epoch": 0.7946498819826908, "grad_norm": 17.27981185913086, "learning_rate": 2.555691535216944e-06, "logits/chosen": -0.2921395003795624, "logits/rejected": -0.7080395817756653, "logps/chosen": -208.31747436523438, "logps/rejected": -180.02212524414062, "loss": 0.676, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.4920189380645752, "rewards/margins": 0.2944754660129547, "rewards/rejected": 0.19754347205162048, "step": 1010 }, { "epoch": 0.7985837922895358, "grad_norm": 12.276522636413574, "learning_rate": 2.464637107698046e-06, "logits/chosen": -0.3768986165523529, "logits/rejected": -0.9090649485588074, "logps/chosen": -195.61764526367188, "logps/rejected": -158.5428466796875, "loss": 0.676, "rewards/accuracies": 0.625, "rewards/chosen": 0.35970592498779297, "rewards/margins": 0.34655410051345825, "rewards/rejected": 0.013151821680366993, "step": 1015 }, { "epoch": 0.8025177025963808, "grad_norm": 12.748953819274902, "learning_rate": 2.3750058103640427e-06, "logits/chosen": -0.3452379107475281, "logits/rejected": -0.8985518217086792, "logps/chosen": -209.6136016845703, "logps/rejected": -173.26414489746094, "loss": 0.6122, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.5482198596000671, "rewards/margins": 0.4972603917121887, "rewards/rejected": 0.05095947906374931, "step": 1020 }, { "epoch": 0.8064516129032258, "grad_norm": 15.02308177947998, "learning_rate": 2.286814571000171e-06, "logits/chosen": -0.2370177060365677, "logits/rejected": -0.6736031770706177, "logps/chosen": -194.3092041015625, "logps/rejected": -164.15817260742188, "loss": 0.6226, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.5286887288093567, "rewards/margins": 0.4581621289253235, "rewards/rejected": 0.0705266147851944, "step": 1025 }, { "epoch": 0.8103855232100708, "grad_norm": 15.919551849365234, "learning_rate": 2.2000800454220285e-06, "logits/chosen": -0.04363623261451721, "logits/rejected": -0.4236673414707184, "logps/chosen": -209.69235229492188, "logps/rejected": -175.1033935546875, "loss": 0.6664, "rewards/accuracies": 0.75, "rewards/chosen": 0.5712024569511414, "rewards/margins": 0.4244155287742615, "rewards/rejected": 0.1467868834733963, "step": 1030 }, { "epoch": 0.8143194335169158, "grad_norm": 16.688159942626953, "learning_rate": 2.114818614329945e-06, "logits/chosen": -0.18427999317646027, "logits/rejected": -0.4734131693840027, "logps/chosen": -200.3739776611328, "logps/rejected": -180.11984252929688, "loss": 0.6447, "rewards/accuracies": 0.6875, "rewards/chosen": 0.6805271506309509, "rewards/margins": 0.4109037518501282, "rewards/rejected": 0.26962336897850037, "step": 1035 }, { "epoch": 0.8182533438237608, "grad_norm": 15.36899471282959, "learning_rate": 2.031046380215327e-06, "logits/chosen": -0.5546427965164185, "logits/rejected": -0.8263591527938843, "logps/chosen": -180.88345336914062, "logps/rejected": -167.74163818359375, "loss": 0.6247, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.5958175659179688, "rewards/margins": 0.5058714151382446, "rewards/rejected": 0.08994609862565994, "step": 1040 }, { "epoch": 0.8221872541306058, "grad_norm": 24.130155563354492, "learning_rate": 1.9487791643195276e-06, "logits/chosen": -0.3917720317840576, "logits/rejected": -0.7242711782455444, "logps/chosen": -206.98135375976562, "logps/rejected": -183.48318481445312, "loss": 0.7598, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.24247002601623535, "rewards/margins": 0.1793862134218216, "rewards/rejected": 0.06308381259441376, "step": 1045 }, { "epoch": 0.8261211644374509, "grad_norm": 16.544754028320312, "learning_rate": 1.8680325036458535e-06, "logits/chosen": -0.16317354142665863, "logits/rejected": -0.5910676717758179, "logps/chosen": -204.1961669921875, "logps/rejected": -173.1997833251953, "loss": 0.706, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.40617918968200684, "rewards/margins": 0.3612835705280304, "rewards/rejected": 0.04489566385746002, "step": 1050 }, { "epoch": 0.8300550747442959, "grad_norm": 28.87963104248047, "learning_rate": 1.788821648025242e-06, "logits/chosen": -0.46491608023643494, "logits/rejected": -0.5262236595153809, "logps/chosen": -198.90652465820312, "logps/rejected": -188.23049926757812, "loss": 0.7507, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.4018153250217438, "rewards/margins": 0.1714794784784317, "rewards/rejected": 0.23033586144447327, "step": 1055 }, { "epoch": 0.8339889850511408, "grad_norm": 21.89056968688965, "learning_rate": 1.7111615572361628e-06, "logits/chosen": -0.1197819709777832, "logits/rejected": -0.40464717149734497, "logps/chosen": -211.6194305419922, "logps/rejected": -192.4689483642578, "loss": 0.7031, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.44010597467422485, "rewards/margins": 0.23361381888389587, "rewards/rejected": 0.20649214088916779, "step": 1060 }, { "epoch": 0.8379228953579858, "grad_norm": 25.025197982788086, "learning_rate": 1.6350668981793304e-06, "logits/chosen": -0.21810774505138397, "logits/rejected": -0.536165714263916, "logps/chosen": -195.72702026367188, "logps/rejected": -185.3990478515625, "loss": 0.6838, "rewards/accuracies": 0.625, "rewards/chosen": 0.22649447619915009, "rewards/margins": 0.3016238212585449, "rewards/rejected": -0.07512933015823364, "step": 1065 }, { "epoch": 0.8418568056648308, "grad_norm": 15.117574691772461, "learning_rate": 1.5605520421076969e-06, "logits/chosen": -0.34034574031829834, "logits/rejected": -0.5113102793693542, "logps/chosen": -195.9296417236328, "logps/rejected": -186.29287719726562, "loss": 0.6485, "rewards/accuracies": 0.6875, "rewards/chosen": 0.23202356696128845, "rewards/margins": 0.3296849727630615, "rewards/rejected": -0.09766140580177307, "step": 1070 }, { "epoch": 0.8457907159716759, "grad_norm": 16.99416732788086, "learning_rate": 1.487631061912298e-06, "logits/chosen": -0.5572665929794312, "logits/rejected": -0.8171085119247437, "logps/chosen": -193.6608123779297, "logps/rejected": -176.0238800048828, "loss": 0.6605, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.35814136266708374, "rewards/margins": 0.37609419226646423, "rewards/rejected": -0.017952853813767433, "step": 1075 }, { "epoch": 0.8497246262785209, "grad_norm": 14.536643981933594, "learning_rate": 1.4163177294644438e-06, "logits/chosen": -0.2895492613315582, "logits/rejected": -0.48721733689308167, "logps/chosen": -198.87753295898438, "logps/rejected": -183.21096801757812, "loss": 0.6838, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.22944995760917664, "rewards/margins": 0.3464585840702057, "rewards/rejected": -0.11700858920812607, "step": 1080 }, { "epoch": 0.8536585365853658, "grad_norm": 25.793216705322266, "learning_rate": 1.3466255130147622e-06, "logits/chosen": -0.36471131443977356, "logits/rejected": -0.5930619239807129, "logps/chosen": -187.9856719970703, "logps/rejected": -175.9360809326172, "loss": 0.683, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.1035921722650528, "rewards/margins": 0.23180215060710907, "rewards/rejected": -0.12820999324321747, "step": 1085 }, { "epoch": 0.8575924468922108, "grad_norm": 20.578927993774414, "learning_rate": 1.2785675746495752e-06, "logits/chosen": -0.24610432982444763, "logits/rejected": -0.7905102968215942, "logps/chosen": -188.7552032470703, "logps/rejected": -163.68289184570312, "loss": 0.6594, "rewards/accuracies": 0.6875, "rewards/chosen": 0.19958016276359558, "rewards/margins": 0.35812973976135254, "rewards/rejected": -0.15854960680007935, "step": 1090 }, { "epoch": 0.8615263571990559, "grad_norm": 17.24201011657715, "learning_rate": 1.212156767805115e-06, "logits/chosen": -0.3163990080356598, "logits/rejected": -0.8110219240188599, "logps/chosen": -175.55859375, "logps/rejected": -141.836181640625, "loss": 0.6571, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.13926038146018982, "rewards/margins": 0.2834976315498352, "rewards/rejected": -0.1442372053861618, "step": 1095 }, { "epoch": 0.8654602675059009, "grad_norm": 13.562137603759766, "learning_rate": 1.1474056348400141e-06, "logits/chosen": -0.25132131576538086, "logits/rejected": -0.5677313804626465, "logps/chosen": -192.50961303710938, "logps/rejected": -171.54611206054688, "loss": 0.6721, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.3157256245613098, "rewards/margins": 0.3246195316314697, "rewards/rejected": -0.008893907070159912, "step": 1100 }, { "epoch": 0.8654602675059009, "eval_logits/chosen": 1.2642682790756226, "eval_logits/rejected": 1.043653964996338, "eval_logps/chosen": -207.01547241210938, "eval_logps/rejected": -180.30709838867188, "eval_loss": 0.6631070971488953, "eval_rewards/accuracies": 0.6265624761581421, "eval_rewards/chosen": 0.38192370533943176, "eval_rewards/margins": 0.3516288101673126, "eval_rewards/rejected": 0.03029490076005459, "eval_runtime": 300.501, "eval_samples_per_second": 2.13, "eval_steps_per_second": 0.133, "step": 1100 }, { "epoch": 0.8693941778127459, "grad_norm": 21.122217178344727, "learning_rate": 1.0843264046665558e-06, "logits/chosen": -0.5116424560546875, "logits/rejected": -0.6911696195602417, "logps/chosen": -184.15603637695312, "logps/rejected": -175.35256958007812, "loss": 0.757, "rewards/accuracies": 0.5625, "rewards/chosen": 0.15398895740509033, "rewards/margins": 0.07341472804546356, "rewards/rejected": 0.08057420700788498, "step": 1105 }, { "epoch": 0.8733280881195908, "grad_norm": 12.891325950622559, "learning_rate": 1.0229309904411178e-06, "logits/chosen": -0.5018507838249207, "logits/rejected": -0.8595576286315918, "logps/chosen": -195.92578125, "logps/rejected": -172.9355010986328, "loss": 0.6866, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.32908183336257935, "rewards/margins": 0.38747507333755493, "rewards/rejected": -0.05839322879910469, "step": 1110 }, { "epoch": 0.8772619984264359, "grad_norm": 17.477975845336914, "learning_rate": 9.63230987314251e-07, "logits/chosen": -0.27941471338272095, "logits/rejected": -0.5305674076080322, "logps/chosen": -191.43380737304688, "logps/rejected": -170.3942108154297, "loss": 0.6999, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.39622825384140015, "rewards/margins": 0.23104743659496307, "rewards/rejected": 0.16518081724643707, "step": 1115 }, { "epoch": 0.8811959087332809, "grad_norm": 19.63365936279297, "learning_rate": 9.052376702408206e-07, "logits/chosen": -0.4624987542629242, "logits/rejected": -0.5762002468109131, "logps/chosen": -187.87295532226562, "logps/rejected": -193.58670043945312, "loss": 0.7027, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.4038239121437073, "rewards/margins": 0.24877241253852844, "rewards/rejected": 0.15505146980285645, "step": 1120 }, { "epoch": 0.8851298190401259, "grad_norm": 20.428455352783203, "learning_rate": 8.489619918506098e-07, "logits/chosen": -0.23860251903533936, "logits/rejected": -0.6500253677368164, "logps/chosen": -212.96658325195312, "logps/rejected": -179.9956512451172, "loss": 0.7073, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.24726350605487823, "rewards/margins": 0.2762225866317749, "rewards/rejected": -0.028959061950445175, "step": 1125 }, { "epoch": 0.8890637293469709, "grad_norm": 19.340242385864258, "learning_rate": 7.944145803798064e-07, "logits/chosen": -0.23527947068214417, "logits/rejected": -0.59322589635849, "logps/chosen": -203.28225708007812, "logps/rejected": -180.2418670654297, "loss": 0.681, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.3411061465740204, "rewards/margins": 0.18520446121692657, "rewards/rejected": 0.155901700258255, "step": 1130 }, { "epoch": 0.8929976396538158, "grad_norm": 79.16990661621094, "learning_rate": 7.416057376637543e-07, "logits/chosen": -0.3579210638999939, "logits/rejected": -0.6960107088088989, "logps/chosen": -200.02012634277344, "logps/rejected": -180.67965698242188, "loss": 0.6985, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.4765182435512543, "rewards/margins": 0.23227711021900177, "rewards/rejected": 0.2442411184310913, "step": 1135 }, { "epoch": 0.8969315499606609, "grad_norm": 17.410009384155273, "learning_rate": 6.905454371913467e-07, "logits/chosen": -0.1638367921113968, "logits/rejected": -0.5099595189094543, "logps/chosen": -195.05340576171875, "logps/rejected": -173.5426788330078, "loss": 0.586, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.4057907164096832, "rewards/margins": 0.6160932183265686, "rewards/rejected": -0.21030254662036896, "step": 1140 }, { "epoch": 0.9008654602675059, "grad_norm": 22.917627334594727, "learning_rate": 6.412433222214265e-07, "logits/chosen": -0.2664688527584076, "logits/rejected": -0.6332502365112305, "logps/chosen": -216.44711303710938, "logps/rejected": -192.39352416992188, "loss": 0.6699, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.47267699241638184, "rewards/margins": 0.3270387649536133, "rewards/rejected": 0.14563825726509094, "step": 1145 }, { "epoch": 0.9047993705743509, "grad_norm": 17.761707305908203, "learning_rate": 5.937087039615619e-07, "logits/chosen": 0.004246175289154053, "logits/rejected": -0.3583109974861145, "logps/chosen": -208.1468963623047, "logps/rejected": -186.03244018554688, "loss": 0.647, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.37165918946266174, "rewards/margins": 0.42197996377944946, "rewards/rejected": -0.05032079294323921, "step": 1150 }, { "epoch": 0.9087332808811959, "grad_norm": 26.38233184814453, "learning_rate": 5.479505598095292e-07, "logits/chosen": -0.12539446353912354, "logits/rejected": -0.085462287068367, "logps/chosen": -205.96804809570312, "logps/rejected": -210.92672729492188, "loss": 0.7508, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.34662169218063354, "rewards/margins": 0.055609725415706635, "rewards/rejected": 0.2910119593143463, "step": 1155 }, { "epoch": 0.912667191188041, "grad_norm": 25.847694396972656, "learning_rate": 5.03977531657841e-07, "logits/chosen": -0.023742878809571266, "logits/rejected": -0.445591539144516, "logps/chosen": -206.13525390625, "logps/rejected": -183.71890258789062, "loss": 0.6829, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.3747442364692688, "rewards/margins": 0.33868470788002014, "rewards/rejected": 0.036059536039829254, "step": 1160 }, { "epoch": 0.9166011014948859, "grad_norm": 14.531253814697266, "learning_rate": 4.6179792426163107e-07, "logits/chosen": -0.13202346861362457, "logits/rejected": -0.539734423160553, "logps/chosen": -192.2351531982422, "logps/rejected": -167.59829711914062, "loss": 0.6574, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.4072590470314026, "rewards/margins": 0.43480420112609863, "rewards/rejected": -0.027545183897018433, "step": 1165 }, { "epoch": 0.9205350118017309, "grad_norm": 18.72174835205078, "learning_rate": 4.214197036702239e-07, "logits/chosen": 0.10880019515752792, "logits/rejected": -0.2607296109199524, "logps/chosen": -215.71939086914062, "logps/rejected": -196.47320556640625, "loss": 0.657, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.5132928490638733, "rewards/margins": 0.36024293303489685, "rewards/rejected": 0.15304993093013763, "step": 1170 }, { "epoch": 0.9244689221085759, "grad_norm": 16.053632736206055, "learning_rate": 3.82850495722662e-07, "logits/chosen": -0.07127988338470459, "logits/rejected": -0.5435328483581543, "logps/chosen": -210.20547485351562, "logps/rejected": -173.3409881591797, "loss": 0.6586, "rewards/accuracies": 0.625, "rewards/chosen": 0.4504272937774658, "rewards/margins": 0.3692251741886139, "rewards/rejected": 0.08120210468769073, "step": 1175 }, { "epoch": 0.9284028324154209, "grad_norm": 20.035791397094727, "learning_rate": 3.4609758460748656e-07, "logits/chosen": -0.1992299109697342, "logits/rejected": -0.43638792634010315, "logps/chosen": -196.8170623779297, "logps/rejected": -181.31607055664062, "loss": 0.6511, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.29465168714523315, "rewards/margins": 0.3416779041290283, "rewards/rejected": -0.04702623561024666, "step": 1180 }, { "epoch": 0.932336742722266, "grad_norm": 14.104338645935059, "learning_rate": 3.1116791148704584e-07, "logits/chosen": -0.5095082521438599, "logits/rejected": -0.933671772480011, "logps/chosen": -181.0245819091797, "logps/rejected": -145.5948944091797, "loss": 0.6582, "rewards/accuracies": 0.625, "rewards/chosen": 0.405670702457428, "rewards/margins": 0.39661210775375366, "rewards/rejected": 0.009058552794158459, "step": 1185 }, { "epoch": 0.9362706530291109, "grad_norm": 24.393505096435547, "learning_rate": 2.78068073186587e-07, "logits/chosen": -0.07540292292833328, "logits/rejected": -0.5439732670783997, "logps/chosen": -220.9651336669922, "logps/rejected": -198.6578826904297, "loss": 0.662, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.47583404183387756, "rewards/margins": 0.4002237915992737, "rewards/rejected": 0.07561029493808746, "step": 1190 }, { "epoch": 0.9402045633359559, "grad_norm": 22.48759651184082, "learning_rate": 2.4680432094837394e-07, "logits/chosen": -0.030767759308218956, "logits/rejected": -0.40518251061439514, "logps/chosen": -192.23971557617188, "logps/rejected": -165.24063110351562, "loss": 0.6944, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.32550159096717834, "rewards/margins": 0.2751082479953766, "rewards/rejected": 0.05039336532354355, "step": 1195 }, { "epoch": 0.9441384736428009, "grad_norm": 16.363256454467773, "learning_rate": 2.1738255925108253e-07, "logits/chosen": -0.5227106809616089, "logits/rejected": -0.7640475034713745, "logps/chosen": -218.41708374023438, "logps/rejected": -194.16444396972656, "loss": 0.668, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.38994377851486206, "rewards/margins": 0.3536146283149719, "rewards/rejected": 0.03632917255163193, "step": 1200 }, { "epoch": 0.9441384736428009, "eval_logits/chosen": 1.2622064352035522, "eval_logits/rejected": 1.0416359901428223, "eval_logps/chosen": -207.0163116455078, "eval_logps/rejected": -180.30044555664062, "eval_loss": 0.6637659072875977, "eval_rewards/accuracies": 0.6390625238418579, "eval_rewards/chosen": 0.38159698247909546, "eval_rewards/margins": 0.34863370656967163, "eval_rewards/rejected": 0.03296329826116562, "eval_runtime": 307.2933, "eval_samples_per_second": 2.083, "eval_steps_per_second": 0.13, "step": 1200 }, { "epoch": 0.948072383949646, "grad_norm": 15.801830291748047, "learning_rate": 1.8980834469467523e-07, "logits/chosen": 0.028558891266584396, "logits/rejected": -0.36049187183380127, "logps/chosen": -225.1962127685547, "logps/rejected": -196.6998748779297, "loss": 0.7157, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.30342918634414673, "rewards/margins": 0.18997251987457275, "rewards/rejected": 0.11345665156841278, "step": 1205 }, { "epoch": 0.952006294256491, "grad_norm": 21.53165054321289, "learning_rate": 1.6408688495098134e-07, "logits/chosen": -0.09858529269695282, "logits/rejected": -0.52873694896698, "logps/chosen": -208.2776336669922, "logps/rejected": -179.6067657470703, "loss": 0.701, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.3496444821357727, "rewards/margins": 0.24224546551704407, "rewards/rejected": 0.10739902406930923, "step": 1210 }, { "epoch": 0.955940204563336, "grad_norm": 15.417522430419922, "learning_rate": 1.402230377801761e-07, "logits/chosen": -0.12817321717739105, "logits/rejected": -0.5611924529075623, "logps/chosen": -223.1984405517578, "logps/rejected": -191.31808471679688, "loss": 0.673, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.5014004707336426, "rewards/margins": 0.3005516231060028, "rewards/rejected": 0.20084881782531738, "step": 1215 }, { "epoch": 0.9598741148701809, "grad_norm": 21.139495849609375, "learning_rate": 1.1822131011334003e-07, "logits/chosen": -0.330310583114624, "logits/rejected": -0.6778287887573242, "logps/chosen": -206.1497802734375, "logps/rejected": -175.0183563232422, "loss": 0.6634, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.40439096093177795, "rewards/margins": 0.39146164059638977, "rewards/rejected": 0.012929338030517101, "step": 1220 }, { "epoch": 0.963808025177026, "grad_norm": 15.441524505615234, "learning_rate": 9.80858572012866e-08, "logits/chosen": -0.10460350662469864, "logits/rejected": -0.46022725105285645, "logps/chosen": -223.1492156982422, "logps/rejected": -193.82369995117188, "loss": 0.6415, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.3919476568698883, "rewards/margins": 0.37521207332611084, "rewards/rejected": 0.016735553741455078, "step": 1225 }, { "epoch": 0.967741935483871, "grad_norm": 19.24515724182129, "learning_rate": 7.982048182978985e-08, "logits/chosen": -0.3437039256095886, "logits/rejected": -0.7036724090576172, "logps/chosen": -210.358642578125, "logps/rejected": -189.95278930664062, "loss": 0.676, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.3740697503089905, "rewards/margins": 0.34888529777526855, "rewards/rejected": 0.02518446370959282, "step": 1230 }, { "epoch": 0.971675845790716, "grad_norm": 16.690387725830078, "learning_rate": 6.342863360139672e-08, "logits/chosen": -0.29954901337623596, "logits/rejected": -0.7138617634773254, "logps/chosen": -181.06094360351562, "logps/rejected": -157.12701416015625, "loss": 0.6961, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.3025146424770355, "rewards/margins": 0.25972747802734375, "rewards/rejected": 0.04278718680143356, "step": 1235 }, { "epoch": 0.975609756097561, "grad_norm": 17.477008819580078, "learning_rate": 4.8913408283934874e-08, "logits/chosen": -0.19394654035568237, "logits/rejected": -0.5592636466026306, "logps/chosen": -211.7626495361328, "logps/rejected": -190.55416870117188, "loss": 0.6955, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.39831337332725525, "rewards/margins": 0.3408041000366211, "rewards/rejected": 0.05750928074121475, "step": 1240 }, { "epoch": 0.9795436664044059, "grad_norm": 14.256926536560059, "learning_rate": 3.627754722584031e-08, "logits/chosen": -0.15048038959503174, "logits/rejected": -0.5208483934402466, "logps/chosen": -223.10110473632812, "logps/rejected": -190.59140014648438, "loss": 0.6593, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.49489039182662964, "rewards/margins": 0.3687785863876343, "rewards/rejected": 0.12611182034015656, "step": 1245 }, { "epoch": 0.983477576711251, "grad_norm": 29.595378875732422, "learning_rate": 2.5523436838430503e-08, "logits/chosen": -0.3160143494606018, "logits/rejected": -0.6430375576019287, "logps/chosen": -196.36361694335938, "logps/rejected": -166.49758911132812, "loss": 0.6625, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.36816468834877014, "rewards/margins": 0.36051416397094727, "rewards/rejected": 0.007650518324226141, "step": 1250 }, { "epoch": 0.987411487018096, "grad_norm": 13.689908027648926, "learning_rate": 1.665310814520482e-08, "logits/chosen": -0.6328016519546509, "logits/rejected": -0.9240643382072449, "logps/chosen": -188.88470458984375, "logps/rejected": -166.7686767578125, "loss": 0.6975, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.09804626554250717, "rewards/margins": 0.26263147592544556, "rewards/rejected": -0.1645852029323578, "step": 1255 }, { "epoch": 0.991345397324941, "grad_norm": 17.047653198242188, "learning_rate": 9.668236398262532e-09, "logits/chosen": -0.35158300399780273, "logits/rejected": -0.6125014424324036, "logps/chosen": -203.73788452148438, "logps/rejected": -189.255126953125, "loss": 0.6549, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.1564028114080429, "rewards/margins": 0.3770085275173187, "rewards/rejected": -0.22060570120811462, "step": 1260 }, { "epoch": 0.995279307631786, "grad_norm": 21.35641098022461, "learning_rate": 4.570140761918085e-09, "logits/chosen": -0.744472861289978, "logits/rejected": -0.9415663480758667, "logps/chosen": -186.4073028564453, "logps/rejected": -176.38418579101562, "loss": 0.6604, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.32688194513320923, "rewards/margins": 0.42695555090904236, "rewards/rejected": -0.10007365047931671, "step": 1265 }, { "epoch": 0.999213217938631, "grad_norm": 14.891934394836426, "learning_rate": 1.3597840635615201e-09, "logits/chosen": -0.14978916943073273, "logits/rejected": -0.6677058935165405, "logps/chosen": -209.85635375976562, "logps/rejected": -172.8124237060547, "loss": 0.6708, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.3781249225139618, "rewards/margins": 0.32455307245254517, "rewards/rejected": 0.05357181280851364, "step": 1270 }, { "epoch": 1.0, "step": 1271, "total_flos": 0.0, "train_loss": 0.675776368140424, "train_runtime": 24039.6181, "train_samples_per_second": 0.846, "train_steps_per_second": 0.053 } ], "logging_steps": 5, "max_steps": 1271, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }