{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984301412872841, "eval_steps": 50, "global_step": 477, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "dpo_lambda": 0.949999988079071, "epoch": 0.0020931449502878076, "grad_norm": 2.5238513624201033, "learning_rate": 1.0416666666666666e-08, "logits/chosen": -1.0492467880249023, "logits/rejected": -1.2254173755645752, "logps/chosen": -398.8612060546875, "logps/rejected": -402.64599609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.004186289900575615, "grad_norm": 2.742132195148399, "learning_rate": 2.083333333333333e-08, "logits/chosen": -1.4932599067687988, "logits/rejected": -1.1972451210021973, "logps/chosen": -511.3546142578125, "logps/rejected": -351.1621398925781, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.006279434850863423, "grad_norm": 2.5536641296038747, "learning_rate": 3.125e-08, "logits/chosen": -1.5133744478225708, "logits/rejected": -1.2765884399414062, "logps/chosen": -336.5682373046875, "logps/rejected": -284.9754943847656, "loss": 0.6925, "rewards/accuracies": 0.625, "rewards/chosen": -0.0002040768158622086, "rewards/margins": 0.0020699857268482447, "rewards/rejected": -0.0022740624845027924, "step": 3 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.00837257980115123, "grad_norm": 2.4662597233729597, "learning_rate": 4.166666666666666e-08, "logits/chosen": -1.606925368309021, "logits/rejected": -1.6972960233688354, "logps/chosen": -305.9817199707031, "logps/rejected": -254.50814819335938, "loss": 0.6936, "rewards/accuracies": 0.375, "rewards/chosen": -0.0009585857624188066, "rewards/margins": -0.002909567439928651, "rewards/rejected": 0.0019509815610945225, "step": 4 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.010465724751439037, "grad_norm": 2.237134442060232, "learning_rate": 5.208333333333333e-08, "logits/chosen": -1.2802187204360962, "logits/rejected": -1.3293852806091309, "logps/chosen": -203.30377197265625, "logps/rejected": -284.1756591796875, "loss": 0.6932, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0017435550689697266, "rewards/margins": 0.0016065265517681837, "rewards/rejected": -0.0033500816207379103, "step": 5 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.012558869701726845, "grad_norm": 2.48607063725916, "learning_rate": 6.25e-08, "logits/chosen": -1.6082518100738525, "logits/rejected": -1.6324546337127686, "logps/chosen": -288.4521484375, "logps/rejected": -290.5629577636719, "loss": 0.6934, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0007926106918603182, "rewards/margins": 0.0007213997887447476, "rewards/rejected": 7.121087401174009e-05, "step": 6 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.014652014652014652, "grad_norm": 2.6264118416704525, "learning_rate": 7.291666666666667e-08, "logits/chosen": -0.9061921238899231, "logits/rejected": -0.8409873247146606, "logps/chosen": -455.4169921875, "logps/rejected": -404.4984436035156, "loss": 0.6936, "rewards/accuracies": 0.4375, "rewards/chosen": -0.001248917542397976, "rewards/margins": -0.003766875248402357, "rewards/rejected": 0.002517957706004381, "step": 7 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.01674515960230246, "grad_norm": 2.5329041687637877, "learning_rate": 8.333333333333333e-08, "logits/chosen": -1.2976975440979004, "logits/rejected": -1.6363146305084229, "logps/chosen": -342.306396484375, "logps/rejected": -334.14825439453125, "loss": 0.6933, "rewards/accuracies": 0.4375, "rewards/chosen": -0.00023791065905243158, "rewards/margins": -0.0005231140530668199, "rewards/rejected": 0.0002852033940143883, "step": 8 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.018838304552590265, "grad_norm": 2.48624076650776, "learning_rate": 9.375e-08, "logits/chosen": -0.7640044689178467, "logits/rejected": -1.282034158706665, "logps/chosen": -296.31939697265625, "logps/rejected": -227.2830047607422, "loss": 0.6933, "rewards/accuracies": 0.5, "rewards/chosen": 0.00014542334247380495, "rewards/margins": 0.000283215194940567, "rewards/rejected": -0.0001377916196361184, "step": 9 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.020931449502878074, "grad_norm": 2.5804165628205107, "learning_rate": 1.0416666666666667e-07, "logits/chosen": -1.0372041463851929, "logits/rejected": -1.1912529468536377, "logps/chosen": -342.556640625, "logps/rejected": -409.69842529296875, "loss": 0.693, "rewards/accuracies": 0.5, "rewards/chosen": 0.00046624901005998254, "rewards/margins": 0.0008997083059512079, "rewards/rejected": -0.00043345920857973397, "step": 10 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.023024594453165882, "grad_norm": 2.432016153966977, "learning_rate": 1.1458333333333332e-07, "logits/chosen": -0.9638723134994507, "logits/rejected": -1.2894995212554932, "logps/chosen": -395.68768310546875, "logps/rejected": -408.8056945800781, "loss": 0.6926, "rewards/accuracies": 0.75, "rewards/chosen": -0.001209955313242972, "rewards/margins": 0.0013160513481125236, "rewards/rejected": -0.0025260066613554955, "step": 11 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.02511773940345369, "grad_norm": 2.631979245281971, "learning_rate": 1.25e-07, "logits/chosen": -0.9251354932785034, "logits/rejected": -1.3486714363098145, "logps/chosen": -372.6667175292969, "logps/rejected": -291.44354248046875, "loss": 0.693, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0003882289747707546, "rewards/margins": 0.0005017256480641663, "rewards/rejected": -0.00011349667329341173, "step": 12 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.027210884353741496, "grad_norm": 2.4624928918234925, "learning_rate": 1.3541666666666666e-07, "logits/chosen": -1.2540111541748047, "logits/rejected": -1.9516901969909668, "logps/chosen": -265.29266357421875, "logps/rejected": -226.957275390625, "loss": 0.6934, "rewards/accuracies": 0.625, "rewards/chosen": 0.004002328030765057, "rewards/margins": 0.002240309491753578, "rewards/rejected": 0.0017620180733501911, "step": 13 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.029304029304029304, "grad_norm": 2.6544325323599, "learning_rate": 1.4583333333333335e-07, "logits/chosen": -1.2324177026748657, "logits/rejected": -1.1027512550354004, "logps/chosen": -311.18511962890625, "logps/rejected": -347.5983581542969, "loss": 0.6928, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0023771976120769978, "rewards/margins": 0.0007142757531255484, "rewards/rejected": 0.0016629218589514494, "step": 14 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.03139717425431711, "grad_norm": 2.658666069776631, "learning_rate": 1.5624999999999999e-07, "logits/chosen": -1.5901809930801392, "logits/rejected": -1.7792041301727295, "logps/chosen": -338.5160827636719, "logps/rejected": -298.5901184082031, "loss": 0.693, "rewards/accuracies": 0.4375, "rewards/chosen": 0.0016016530571505427, "rewards/margins": 0.0003989767865277827, "rewards/rejected": 0.001202676328830421, "step": 15 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.03349031920460492, "grad_norm": 2.487653751085319, "learning_rate": 1.6666666666666665e-07, "logits/chosen": -0.47793060541152954, "logits/rejected": -0.8347984552383423, "logps/chosen": -399.69171142578125, "logps/rejected": -368.736328125, "loss": 0.6937, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0015305518172681332, "rewards/margins": -0.0006346654845401645, "rewards/rejected": 0.002165217185392976, "step": 16 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.035583464154892726, "grad_norm": 2.6516902928755517, "learning_rate": 1.7708333333333334e-07, "logits/chosen": -0.9618392586708069, "logits/rejected": -1.2820124626159668, "logps/chosen": -438.302978515625, "logps/rejected": -305.95556640625, "loss": 0.6933, "rewards/accuracies": 0.3125, "rewards/chosen": 0.0026514935307204723, "rewards/margins": 0.0005700254114344716, "rewards/rejected": 0.002081468002870679, "step": 17 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.03767660910518053, "grad_norm": 2.5880555384469495, "learning_rate": 1.875e-07, "logits/chosen": -1.3747632503509521, "logits/rejected": -1.6497913599014282, "logps/chosen": -283.3655090332031, "logps/rejected": -223.08218383789062, "loss": 0.6936, "rewards/accuracies": 0.25, "rewards/chosen": 0.00047120568342506886, "rewards/margins": -0.0030409740284085274, "rewards/rejected": 0.00351217994466424, "step": 18 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.03976975405546834, "grad_norm": 2.575005579223518, "learning_rate": 1.9791666666666664e-07, "logits/chosen": -1.158360242843628, "logits/rejected": -1.185401439666748, "logps/chosen": -408.72882080078125, "logps/rejected": -405.0758056640625, "loss": 0.6933, "rewards/accuracies": 0.4375, "rewards/chosen": 0.003389589721336961, "rewards/margins": -0.0005582189187407494, "rewards/rejected": 0.0039478084072470665, "step": 19 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.04186289900575615, "grad_norm": 2.3899451294364593, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -1.0450775623321533, "logits/rejected": -1.160659670829773, "logps/chosen": -309.60723876953125, "logps/rejected": -342.3398132324219, "loss": 0.6933, "rewards/accuracies": 0.5625, "rewards/chosen": 0.001832823734730482, "rewards/margins": -0.0003225851687602699, "rewards/rejected": 0.0021554087288677692, "step": 20 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.04395604395604396, "grad_norm": 2.2029968156881012, "learning_rate": 2.1875e-07, "logits/chosen": -1.145942211151123, "logits/rejected": -1.4895652532577515, "logps/chosen": -213.55941772460938, "logps/rejected": -202.06842041015625, "loss": 0.6932, "rewards/accuracies": 0.875, "rewards/chosen": 0.005665008910000324, "rewards/margins": 0.0035952446050941944, "rewards/rejected": 0.002069764072075486, "step": 21 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.046049188906331764, "grad_norm": 2.4251783934111484, "learning_rate": 2.2916666666666663e-07, "logits/chosen": -1.0779845714569092, "logits/rejected": -1.2184988260269165, "logps/chosen": -338.2264404296875, "logps/rejected": -249.79437255859375, "loss": 0.6927, "rewards/accuracies": 0.3125, "rewards/chosen": 0.005221719853579998, "rewards/margins": -0.0003454876132309437, "rewards/rejected": 0.005567207001149654, "step": 22 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.04814233385661957, "grad_norm": 2.454645800842138, "learning_rate": 2.3958333333333335e-07, "logits/chosen": -1.6221305131912231, "logits/rejected": -0.9964793920516968, "logps/chosen": -309.6587829589844, "logps/rejected": -316.56451416015625, "loss": 0.6925, "rewards/accuracies": 0.5, "rewards/chosen": 0.007448410615324974, "rewards/margins": -0.0005814814940094948, "rewards/rejected": 0.008029892109334469, "step": 23 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.05023547880690738, "grad_norm": 2.5676939258617435, "learning_rate": 2.5e-07, "logits/chosen": -0.8644750118255615, "logits/rejected": -1.421642780303955, "logps/chosen": -329.37164306640625, "logps/rejected": -276.9499816894531, "loss": 0.6932, "rewards/accuracies": 0.6875, "rewards/chosen": 0.010316381230950356, "rewards/margins": 0.0005483555723913014, "rewards/rejected": 0.009768025949597359, "step": 24 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.052328623757195186, "grad_norm": 2.3474432877365867, "learning_rate": 2.604166666666667e-07, "logits/chosen": -1.5667359828948975, "logits/rejected": -1.7593765258789062, "logps/chosen": -222.93890380859375, "logps/rejected": -192.01153564453125, "loss": 0.6922, "rewards/accuracies": 0.6875, "rewards/chosen": 0.010832302272319794, "rewards/margins": 0.0036077809054404497, "rewards/rejected": 0.007224521599709988, "step": 25 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.05442176870748299, "grad_norm": 2.5154000780567984, "learning_rate": 2.708333333333333e-07, "logits/chosen": -1.4954286813735962, "logits/rejected": -1.8402884006500244, "logps/chosen": -219.48851013183594, "logps/rejected": -180.8162841796875, "loss": 0.6918, "rewards/accuracies": 0.5, "rewards/chosen": 0.008859949186444283, "rewards/margins": 0.00044920702930539846, "rewards/rejected": 0.00841074250638485, "step": 26 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.0565149136577708, "grad_norm": 2.613765998608133, "learning_rate": 2.8125e-07, "logits/chosen": -0.9153610467910767, "logits/rejected": -1.4024361371994019, "logps/chosen": -329.25323486328125, "logps/rejected": -292.6675109863281, "loss": 0.6923, "rewards/accuracies": 0.4375, "rewards/chosen": 0.009662103839218616, "rewards/margins": -0.0011162518057972193, "rewards/rejected": 0.010778354480862617, "step": 27 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.05860805860805861, "grad_norm": 2.5821485865074214, "learning_rate": 2.916666666666667e-07, "logits/chosen": -0.8812850713729858, "logits/rejected": -0.9202166199684143, "logps/chosen": -393.4171142578125, "logps/rejected": -364.45269775390625, "loss": 0.6919, "rewards/accuracies": 0.4375, "rewards/chosen": 0.0167025625705719, "rewards/margins": 0.0006670024013146758, "rewards/rejected": 0.01603556051850319, "step": 28 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.06070120355834641, "grad_norm": 2.7922308821388424, "learning_rate": 3.020833333333333e-07, "logits/chosen": -0.9133128523826599, "logits/rejected": -0.8920246958732605, "logps/chosen": -496.87103271484375, "logps/rejected": -368.1391906738281, "loss": 0.6924, "rewards/accuracies": 0.375, "rewards/chosen": 0.019071899354457855, "rewards/margins": -0.001497082645073533, "rewards/rejected": 0.020568981766700745, "step": 29 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.06279434850863422, "grad_norm": 2.6933943577997894, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -1.0931180715560913, "logits/rejected": -1.156693696975708, "logps/chosen": -414.7959289550781, "logps/rejected": -414.8038024902344, "loss": 0.691, "rewards/accuracies": 0.6875, "rewards/chosen": 0.024503473192453384, "rewards/margins": 0.003209207206964493, "rewards/rejected": 0.021294264122843742, "step": 30 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.06488749345892203, "grad_norm": 2.502045558923208, "learning_rate": 3.2291666666666666e-07, "logits/chosen": -1.5044941902160645, "logits/rejected": -1.5712063312530518, "logps/chosen": -251.3910675048828, "logps/rejected": -307.28924560546875, "loss": 0.6917, "rewards/accuracies": 0.5, "rewards/chosen": 0.022805027663707733, "rewards/margins": 0.0006459974683821201, "rewards/rejected": 0.02215902879834175, "step": 31 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.06698063840920984, "grad_norm": 2.794919274724495, "learning_rate": 3.333333333333333e-07, "logits/chosen": -1.4800829887390137, "logits/rejected": -1.7993519306182861, "logps/chosen": -277.0565185546875, "logps/rejected": -245.9532470703125, "loss": 0.6909, "rewards/accuracies": 0.4375, "rewards/chosen": 0.026073835790157318, "rewards/margins": 0.0017528488533571362, "rewards/rejected": 0.02432098798453808, "step": 32 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.06907378335949764, "grad_norm": 2.561970041658725, "learning_rate": 3.4375e-07, "logits/chosen": -0.9976758360862732, "logits/rejected": -1.0321834087371826, "logps/chosen": -409.8870544433594, "logps/rejected": -308.4429931640625, "loss": 0.6905, "rewards/accuracies": 0.75, "rewards/chosen": 0.03413209319114685, "rewards/margins": 0.004116056486964226, "rewards/rejected": 0.030016038566827774, "step": 33 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.07116692830978545, "grad_norm": 2.6489098486833362, "learning_rate": 3.541666666666667e-07, "logits/chosen": -1.744815468788147, "logits/rejected": -0.8233209252357483, "logps/chosen": -301.051025390625, "logps/rejected": -439.6226806640625, "loss": 0.691, "rewards/accuracies": 0.5625, "rewards/chosen": 0.035877130925655365, "rewards/margins": -0.0004040311323478818, "rewards/rejected": 0.03628116101026535, "step": 34 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.07326007326007326, "grad_norm": 2.4069143442596057, "learning_rate": 3.645833333333333e-07, "logits/chosen": -1.8779923915863037, "logits/rejected": -2.1322505474090576, "logps/chosen": -225.31105041503906, "logps/rejected": -248.81930541992188, "loss": 0.6903, "rewards/accuracies": 0.75, "rewards/chosen": 0.04101023077964783, "rewards/margins": 0.003965466283261776, "rewards/rejected": 0.03704476356506348, "step": 35 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.07535321821036106, "grad_norm": 2.4803837171852194, "learning_rate": 3.75e-07, "logits/chosen": -1.0292996168136597, "logits/rejected": -1.4985079765319824, "logps/chosen": -297.11798095703125, "logps/rejected": -239.1429443359375, "loss": 0.6903, "rewards/accuracies": 0.75, "rewards/chosen": 0.047872141003608704, "rewards/margins": 0.0073857903480529785, "rewards/rejected": 0.040486350655555725, "step": 36 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.07744636316064887, "grad_norm": 2.5109082995582077, "learning_rate": 3.8541666666666665e-07, "logits/chosen": -1.147574782371521, "logits/rejected": -1.1821694374084473, "logps/chosen": -300.9302978515625, "logps/rejected": -264.7428283691406, "loss": 0.6891, "rewards/accuracies": 0.75, "rewards/chosen": 0.050175584852695465, "rewards/margins": 0.006107117980718613, "rewards/rejected": 0.04406846687197685, "step": 37 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.07953950811093669, "grad_norm": 2.438841409823368, "learning_rate": 3.958333333333333e-07, "logits/chosen": -1.1249700784683228, "logits/rejected": -1.2837371826171875, "logps/chosen": -333.0791320800781, "logps/rejected": -347.0002746582031, "loss": 0.6895, "rewards/accuracies": 0.4375, "rewards/chosen": 0.05678965896368027, "rewards/margins": 0.005165199749171734, "rewards/rejected": 0.05162446200847626, "step": 38 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.08163265306122448, "grad_norm": 2.6749953700325104, "learning_rate": 4.0625e-07, "logits/chosen": -0.5424120426177979, "logits/rejected": -1.086322546005249, "logps/chosen": -350.36871337890625, "logps/rejected": -257.62347412109375, "loss": 0.688, "rewards/accuracies": 0.6875, "rewards/chosen": 0.06144564598798752, "rewards/margins": 0.005785645917057991, "rewards/rejected": 0.055660001933574677, "step": 39 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.0837257980115123, "grad_norm": 2.335200112165688, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -0.969620943069458, "logits/rejected": -1.5480098724365234, "logps/chosen": -239.6829071044922, "logps/rejected": -319.958984375, "loss": 0.6892, "rewards/accuracies": 0.6875, "rewards/chosen": 0.061700835824012756, "rewards/margins": 0.009912976995110512, "rewards/rejected": 0.0517878532409668, "step": 40 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.08581894296180011, "grad_norm": 2.4342697087205174, "learning_rate": 4.270833333333333e-07, "logits/chosen": -1.1205010414123535, "logits/rejected": -1.153897762298584, "logps/chosen": -308.97320556640625, "logps/rejected": -315.3182373046875, "loss": 0.6887, "rewards/accuracies": 0.5, "rewards/chosen": 0.06559581309556961, "rewards/margins": -0.0006550836842507124, "rewards/rejected": 0.06625089049339294, "step": 41 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.08791208791208792, "grad_norm": 2.4453505411023384, "learning_rate": 4.375e-07, "logits/chosen": -1.4840342998504639, "logits/rejected": -1.4749723672866821, "logps/chosen": -233.6084747314453, "logps/rejected": -243.11952209472656, "loss": 0.6891, "rewards/accuracies": 0.5, "rewards/chosen": 0.07008497416973114, "rewards/margins": -0.0027707151602953672, "rewards/rejected": 0.07285569608211517, "step": 42 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.09000523286237572, "grad_norm": 2.66038221048752, "learning_rate": 4.479166666666667e-07, "logits/chosen": -1.1283433437347412, "logits/rejected": -1.317920446395874, "logps/chosen": -435.1509094238281, "logps/rejected": -388.8012390136719, "loss": 0.6876, "rewards/accuracies": 0.375, "rewards/chosen": 0.08165865391492844, "rewards/margins": 0.003668295219540596, "rewards/rejected": 0.07799036800861359, "step": 43 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.09209837781266353, "grad_norm": 2.491819048187407, "learning_rate": 4.5833333333333327e-07, "logits/chosen": -1.5543797016143799, "logits/rejected": -1.9867057800292969, "logps/chosen": -443.44775390625, "logps/rejected": -274.569580078125, "loss": 0.6874, "rewards/accuracies": 0.8125, "rewards/chosen": 0.1015908345580101, "rewards/margins": 0.020656811073422432, "rewards/rejected": 0.08093402534723282, "step": 44 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.09419152276295134, "grad_norm": 2.2515727477117577, "learning_rate": 4.6874999999999996e-07, "logits/chosen": -1.1949578523635864, "logits/rejected": -1.4127506017684937, "logps/chosen": -305.904052734375, "logps/rejected": -317.99383544921875, "loss": 0.6878, "rewards/accuracies": 0.8125, "rewards/chosen": 0.09946791082620621, "rewards/margins": 0.008398786187171936, "rewards/rejected": 0.09106913208961487, "step": 45 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.09628466771323914, "grad_norm": 2.563007510040732, "learning_rate": 4.791666666666667e-07, "logits/chosen": -1.0176115036010742, "logits/rejected": -1.4347546100616455, "logps/chosen": -434.220703125, "logps/rejected": -276.7042236328125, "loss": 0.6835, "rewards/accuracies": 0.8125, "rewards/chosen": 0.11302684247493744, "rewards/margins": 0.02043282985687256, "rewards/rejected": 0.09259402006864548, "step": 46 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.09837781266352695, "grad_norm": 2.488072566028342, "learning_rate": 4.895833333333333e-07, "logits/chosen": -1.6166744232177734, "logits/rejected": -1.9656682014465332, "logps/chosen": -262.8787841796875, "logps/rejected": -294.89422607421875, "loss": 0.6844, "rewards/accuracies": 0.5625, "rewards/chosen": 0.11034242808818817, "rewards/margins": 0.00747002474963665, "rewards/rejected": 0.10287239402532578, "step": 47 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.10047095761381476, "grad_norm": 2.494175901434857, "learning_rate": 5e-07, "logits/chosen": -1.0948954820632935, "logits/rejected": -1.6069471836090088, "logps/chosen": -318.8594970703125, "logps/rejected": -300.58294677734375, "loss": 0.6824, "rewards/accuracies": 0.875, "rewards/chosen": 0.1277778297662735, "rewards/margins": 0.027094997465610504, "rewards/rejected": 0.10068284720182419, "step": 48 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.10256410256410256, "grad_norm": 2.5429857075760847, "learning_rate": 4.999932966293553e-07, "logits/chosen": -1.101062536239624, "logits/rejected": -1.3739486932754517, "logps/chosen": -362.21197509765625, "logps/rejected": -348.89141845703125, "loss": 0.6837, "rewards/accuracies": 0.625, "rewards/chosen": 0.12920302152633667, "rewards/margins": 0.004603213630616665, "rewards/rejected": 0.12459981441497803, "step": 49 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.10465724751439037, "grad_norm": 2.5754295890771832, "learning_rate": 4.999731868769026e-07, "logits/chosen": -1.6606638431549072, "logits/rejected": -1.2856189012527466, "logps/chosen": -303.62518310546875, "logps/rejected": -354.71533203125, "loss": 0.6842, "rewards/accuracies": 0.625, "rewards/chosen": 0.14312568306922913, "rewards/margins": 0.00830678828060627, "rewards/rejected": 0.134818896651268, "step": 50 }, { "epoch": 0.10465724751439037, "eval_dpo_lambda": 0.9500001072883606, "eval_logits/chosen": -1.1642831563949585, "eval_logits/rejected": -1.288604974746704, "eval_logps/chosen": -318.35150146484375, "eval_logps/rejected": -293.2452087402344, "eval_loss": 0.6827022433280945, "eval_rewards/accuracies": 0.6865079402923584, "eval_rewards/chosen": 0.1495998352766037, "eval_rewards/margins": 0.014687073417007923, "eval_rewards/rejected": 0.13491272926330566, "eval_runtime": 126.9492, "eval_samples_per_second": 15.754, "eval_steps_per_second": 0.496, "step": 50 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.10675039246467818, "grad_norm": 2.2755919202453905, "learning_rate": 4.99939671821067e-07, "logits/chosen": -1.7552989721298218, "logits/rejected": -1.7865089178085327, "logps/chosen": -250.38571166992188, "logps/rejected": -189.3590850830078, "loss": 0.6849, "rewards/accuracies": 0.5625, "rewards/chosen": 0.13150912523269653, "rewards/margins": 0.0071302494034171104, "rewards/rejected": 0.12437885999679565, "step": 51 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.10884353741496598, "grad_norm": 2.5241111013026742, "learning_rate": 4.998927532591591e-07, "logits/chosen": -1.0593992471694946, "logits/rejected": -1.3912749290466309, "logps/chosen": -234.35882568359375, "logps/rejected": -244.15481567382812, "loss": 0.6828, "rewards/accuracies": 0.5, "rewards/chosen": 0.15121498703956604, "rewards/margins": 0.007220970466732979, "rewards/rejected": 0.14399400353431702, "step": 52 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.1109366823652538, "grad_norm": 2.3651386610482694, "learning_rate": 4.998324337072792e-07, "logits/chosen": -1.445833683013916, "logits/rejected": -1.6016114950180054, "logps/chosen": -253.54568481445312, "logps/rejected": -269.97259521484375, "loss": 0.6813, "rewards/accuracies": 0.75, "rewards/chosen": 0.17443205416202545, "rewards/margins": 0.02932567521929741, "rewards/rejected": 0.14510637521743774, "step": 53 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.1130298273155416, "grad_norm": 2.703233183840897, "learning_rate": 4.997587164001815e-07, "logits/chosen": -1.385809302330017, "logits/rejected": -1.3108282089233398, "logps/chosen": -419.46820068359375, "logps/rejected": -317.28076171875, "loss": 0.6795, "rewards/accuracies": 0.75, "rewards/chosen": 0.18837417662143707, "rewards/margins": 0.04271288216114044, "rewards/rejected": 0.14566129446029663, "step": 54 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.1151229722658294, "grad_norm": 2.387486767898641, "learning_rate": 4.996716052911017e-07, "logits/chosen": -1.2117292881011963, "logits/rejected": -1.3327128887176514, "logps/chosen": -290.7606506347656, "logps/rejected": -261.4772033691406, "loss": 0.6811, "rewards/accuracies": 0.75, "rewards/chosen": 0.18960295617580414, "rewards/margins": 0.028018314391374588, "rewards/rejected": 0.16158463060855865, "step": 55 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.11721611721611722, "grad_norm": 2.407751602441618, "learning_rate": 4.99571105051544e-07, "logits/chosen": -0.9387331008911133, "logits/rejected": -1.1321182250976562, "logps/chosen": -212.40528869628906, "logps/rejected": -306.0078430175781, "loss": 0.6795, "rewards/accuracies": 0.8125, "rewards/chosen": 0.1804485023021698, "rewards/margins": 0.019616497680544853, "rewards/rejected": 0.1608319878578186, "step": 56 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.11930926216640503, "grad_norm": 2.471362383542089, "learning_rate": 4.994572210710314e-07, "logits/chosen": -0.7424502968788147, "logits/rejected": -1.0819528102874756, "logps/chosen": -317.90777587890625, "logps/rejected": -341.05621337890625, "loss": 0.6778, "rewards/accuracies": 0.75, "rewards/chosen": 0.1922038495540619, "rewards/margins": 0.01743231900036335, "rewards/rejected": 0.1747715324163437, "step": 57 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.12140240711669283, "grad_norm": 2.3567792826807823, "learning_rate": 4.993299594568162e-07, "logits/chosen": -1.3764880895614624, "logits/rejected": -1.314342737197876, "logps/chosen": -249.5397186279297, "logps/rejected": -308.6777038574219, "loss": 0.6806, "rewards/accuracies": 0.5625, "rewards/chosen": 0.18895545601844788, "rewards/margins": 0.014720916748046875, "rewards/rejected": 0.174234539270401, "step": 58 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.12349555206698064, "grad_norm": 2.3705955200417015, "learning_rate": 4.991893270335525e-07, "logits/chosen": -1.2738173007965088, "logits/rejected": -1.7423033714294434, "logps/chosen": -264.46142578125, "logps/rejected": -222.62757873535156, "loss": 0.6804, "rewards/accuracies": 0.5625, "rewards/chosen": 0.1837330311536789, "rewards/margins": 0.014330385252833366, "rewards/rejected": 0.16940264403820038, "step": 59 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.12558869701726844, "grad_norm": 2.509489247058352, "learning_rate": 4.990353313429303e-07, "logits/chosen": -1.0909311771392822, "logits/rejected": -1.2675325870513916, "logps/chosen": -263.276123046875, "logps/rejected": -256.9141540527344, "loss": 0.675, "rewards/accuracies": 0.625, "rewards/chosen": 0.2111552208662033, "rewards/margins": 0.011078471317887306, "rewards/rejected": 0.20007675886154175, "step": 60 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.12768184196755625, "grad_norm": 2.538626284349186, "learning_rate": 4.988679806432711e-07, "logits/chosen": -0.7756716012954712, "logits/rejected": -0.7832010984420776, "logps/chosen": -311.29156494140625, "logps/rejected": -234.34881591796875, "loss": 0.6763, "rewards/accuracies": 0.5625, "rewards/chosen": 0.22171315550804138, "rewards/margins": 0.016394145786762238, "rewards/rejected": 0.20531903207302094, "step": 61 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.12977498691784406, "grad_norm": 2.4458796989311526, "learning_rate": 4.986872839090852e-07, "logits/chosen": -1.1334611177444458, "logits/rejected": -1.0728856325149536, "logps/chosen": -320.83917236328125, "logps/rejected": -342.316650390625, "loss": 0.6766, "rewards/accuracies": 0.625, "rewards/chosen": 0.2102716714143753, "rewards/margins": 0.014481215737760067, "rewards/rejected": 0.19579046964645386, "step": 62 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.13186813186813187, "grad_norm": 2.4813077942842017, "learning_rate": 4.9849325083059e-07, "logits/chosen": -1.075955867767334, "logits/rejected": -1.6124709844589233, "logps/chosen": -285.3878173828125, "logps/rejected": -221.7014923095703, "loss": 0.674, "rewards/accuracies": 0.8125, "rewards/chosen": 0.2351919710636139, "rewards/margins": 0.0502181202173233, "rewards/rejected": 0.18497386574745178, "step": 63 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.13396127681841968, "grad_norm": 2.422025438782533, "learning_rate": 4.982858918131906e-07, "logits/chosen": -1.4893652200698853, "logits/rejected": -1.3398679494857788, "logps/chosen": -219.7025146484375, "logps/rejected": -294.2558898925781, "loss": 0.6779, "rewards/accuracies": 0.6875, "rewards/chosen": 0.21946364641189575, "rewards/margins": 0.024202071130275726, "rewards/rejected": 0.19526156783103943, "step": 64 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.1360544217687075, "grad_norm": 2.4417083579137477, "learning_rate": 4.980652179769217e-07, "logits/chosen": -0.9350445866584778, "logits/rejected": -1.1353799104690552, "logps/chosen": -446.11871337890625, "logps/rejected": -393.4355773925781, "loss": 0.6743, "rewards/accuracies": 0.8125, "rewards/chosen": 0.2512449622154236, "rewards/margins": 0.046813588589429855, "rewards/rejected": 0.20443135499954224, "step": 65 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.13814756671899528, "grad_norm": 2.4322102177386333, "learning_rate": 4.978312411558517e-07, "logits/chosen": -1.480696439743042, "logits/rejected": -1.2343032360076904, "logps/chosen": -279.2574768066406, "logps/rejected": -331.2752990722656, "loss": 0.6765, "rewards/accuracies": 0.5, "rewards/chosen": 0.23720219731330872, "rewards/margins": 0.009973518550395966, "rewards/rejected": 0.22722868621349335, "step": 66 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.1402407116692831, "grad_norm": 2.6810415228869657, "learning_rate": 4.975839738974473e-07, "logits/chosen": -1.1023014783859253, "logits/rejected": -1.6509053707122803, "logps/chosen": -380.06024169921875, "logps/rejected": -354.8814392089844, "loss": 0.6658, "rewards/accuracies": 0.6875, "rewards/chosen": 0.2437136471271515, "rewards/margins": 0.041057467460632324, "rewards/rejected": 0.20265617966651917, "step": 67 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.1423338566195709, "grad_norm": 2.3982747269440803, "learning_rate": 4.97323429461901e-07, "logits/chosen": -1.525477409362793, "logits/rejected": -1.7293528318405151, "logps/chosen": -379.8116149902344, "logps/rejected": -298.1125183105469, "loss": 0.6752, "rewards/accuracies": 0.6875, "rewards/chosen": 0.24697479605674744, "rewards/margins": 0.032709792256355286, "rewards/rejected": 0.21426498889923096, "step": 68 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.14442700156985872, "grad_norm": 2.3015778908634346, "learning_rate": 4.970496218214204e-07, "logits/chosen": -0.8262905478477478, "logits/rejected": -0.9529343247413635, "logps/chosen": -267.7677307128906, "logps/rejected": -325.0674743652344, "loss": 0.6783, "rewards/accuracies": 0.75, "rewards/chosen": 0.28256407380104065, "rewards/margins": 0.04105500504374504, "rewards/rejected": 0.2415090650320053, "step": 69 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.14652014652014653, "grad_norm": 2.359233905861832, "learning_rate": 4.967625656594781e-07, "logits/chosen": -1.408897876739502, "logits/rejected": -1.4837861061096191, "logps/chosen": -291.5557861328125, "logps/rejected": -257.3231201171875, "loss": 0.675, "rewards/accuracies": 0.5625, "rewards/chosen": 0.2662292420864105, "rewards/margins": 0.031066056340932846, "rewards/rejected": 0.23516318202018738, "step": 70 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.14861329147043434, "grad_norm": 2.4110820557204247, "learning_rate": 4.964622763700252e-07, "logits/chosen": -1.3170831203460693, "logits/rejected": -1.5105832815170288, "logps/chosen": -369.9690246582031, "logps/rejected": -337.8123474121094, "loss": 0.6746, "rewards/accuracies": 0.6875, "rewards/chosen": 0.24139149487018585, "rewards/margins": 0.018225520849227905, "rewards/rejected": 0.22316598892211914, "step": 71 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.15070643642072212, "grad_norm": 2.659457876838649, "learning_rate": 4.961487700566646e-07, "logits/chosen": -1.0732438564300537, "logits/rejected": -1.5925201177597046, "logps/chosen": -379.78302001953125, "logps/rejected": -208.7226104736328, "loss": 0.6689, "rewards/accuracies": 0.875, "rewards/chosen": 0.25341007113456726, "rewards/margins": 0.041209153831005096, "rewards/rejected": 0.21220090985298157, "step": 72 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.15279958137100993, "grad_norm": 2.452657343665281, "learning_rate": 4.958220635317885e-07, "logits/chosen": -1.4009957313537598, "logits/rejected": -1.502071738243103, "logps/chosen": -238.7895965576172, "logps/rejected": -204.4416961669922, "loss": 0.6705, "rewards/accuracies": 0.625, "rewards/chosen": 0.2505919635295868, "rewards/margins": 0.004445520229637623, "rewards/rejected": 0.24614644050598145, "step": 73 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.15489272632129775, "grad_norm": 2.3431867233973627, "learning_rate": 4.954821743156767e-07, "logits/chosen": -0.8862349390983582, "logits/rejected": -1.2672829627990723, "logps/chosen": -351.131103515625, "logps/rejected": -284.5081787109375, "loss": 0.6745, "rewards/accuracies": 0.75, "rewards/chosen": 0.2530231475830078, "rewards/margins": 0.05167631432414055, "rewards/rejected": 0.20134682953357697, "step": 74 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.15698587127158556, "grad_norm": 2.3662321901836467, "learning_rate": 4.951291206355559e-07, "logits/chosen": -0.9427766799926758, "logits/rejected": -1.3873231410980225, "logps/chosen": -362.88018798828125, "logps/rejected": -303.9943542480469, "loss": 0.672, "rewards/accuracies": 0.625, "rewards/chosen": 0.2761828303337097, "rewards/margins": 0.039097219705581665, "rewards/rejected": 0.23708559572696686, "step": 75 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.15907901622187337, "grad_norm": 2.3278898461643873, "learning_rate": 4.947629214246236e-07, "logits/chosen": -0.7770520448684692, "logits/rejected": -1.1578378677368164, "logps/chosen": -317.7205810546875, "logps/rejected": -320.40130615234375, "loss": 0.6681, "rewards/accuracies": 0.875, "rewards/chosen": 0.292828768491745, "rewards/margins": 0.05452661216259003, "rewards/rejected": 0.23830217123031616, "step": 76 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.16117216117216118, "grad_norm": 2.4077245906573403, "learning_rate": 4.943835963210323e-07, "logits/chosen": -1.0316717624664307, "logits/rejected": -1.4393993616104126, "logps/chosen": -400.1679382324219, "logps/rejected": -370.2728271484375, "loss": 0.669, "rewards/accuracies": 0.8125, "rewards/chosen": 0.27847668528556824, "rewards/margins": 0.04721963405609131, "rewards/rejected": 0.23125705122947693, "step": 77 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.16326530612244897, "grad_norm": 2.319398889378967, "learning_rate": 4.939911656668361e-07, "logits/chosen": -0.9531311392784119, "logits/rejected": -1.6043527126312256, "logps/chosen": -251.8982696533203, "logps/rejected": -169.3506622314453, "loss": 0.6696, "rewards/accuracies": 0.75, "rewards/chosen": 0.29433679580688477, "rewards/margins": 0.06263977289199829, "rewards/rejected": 0.23169703781604767, "step": 78 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.16535845107273678, "grad_norm": 2.502186202156719, "learning_rate": 4.935856505068998e-07, "logits/chosen": -0.9636015295982361, "logits/rejected": -1.5627679824829102, "logps/chosen": -417.8866882324219, "logps/rejected": -279.7188720703125, "loss": 0.6624, "rewards/accuracies": 0.8125, "rewards/chosen": 0.2873650789260864, "rewards/margins": 0.06544859707355499, "rewards/rejected": 0.22191648185253143, "step": 79 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.1674515960230246, "grad_norm": 2.4765042114384612, "learning_rate": 4.93167072587771e-07, "logits/chosen": -1.333372950553894, "logits/rejected": -1.3615721464157104, "logps/chosen": -326.4842224121094, "logps/rejected": -212.3470916748047, "loss": 0.6726, "rewards/accuracies": 0.375, "rewards/chosen": 0.22611269354820251, "rewards/margins": 0.004515619948506355, "rewards/rejected": 0.2215970754623413, "step": 80 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.1695447409733124, "grad_norm": 2.3140410729726786, "learning_rate": 4.92735454356513e-07, "logits/chosen": -0.6755834817886353, "logits/rejected": -1.6798317432403564, "logps/chosen": -314.6034851074219, "logps/rejected": -218.89540100097656, "loss": 0.6682, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2700071334838867, "rewards/margins": 0.08321366459131241, "rewards/rejected": 0.1867934763431549, "step": 81 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.17163788592360021, "grad_norm": 2.23070971313636, "learning_rate": 4.922908189595017e-07, "logits/chosen": -1.5057995319366455, "logits/rejected": -1.8168041706085205, "logps/chosen": -231.22900390625, "logps/rejected": -181.6092529296875, "loss": 0.6684, "rewards/accuracies": 0.625, "rewards/chosen": 0.24196691811084747, "rewards/margins": 0.01709696091711521, "rewards/rejected": 0.224869966506958, "step": 82 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.17373103087388803, "grad_norm": 2.3018305762716467, "learning_rate": 4.918331902411841e-07, "logits/chosen": -1.2091577053070068, "logits/rejected": -1.2855414152145386, "logps/chosen": -307.2161865234375, "logps/rejected": -257.3757629394531, "loss": 0.6744, "rewards/accuracies": 0.625, "rewards/chosen": 0.2892816662788391, "rewards/margins": 0.02718321792781353, "rewards/rejected": 0.26209843158721924, "step": 83 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.17582417582417584, "grad_norm": 2.6021802988798224, "learning_rate": 4.913625927427995e-07, "logits/chosen": -1.691704273223877, "logits/rejected": -1.265133261680603, "logps/chosen": -305.3105163574219, "logps/rejected": -514.9328002929688, "loss": 0.6629, "rewards/accuracies": 0.8125, "rewards/chosen": 0.25614750385284424, "rewards/margins": 0.07186707854270935, "rewards/rejected": 0.1842804253101349, "step": 84 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.17791732077446362, "grad_norm": 2.3315605448977745, "learning_rate": 4.908790517010636e-07, "logits/chosen": -1.2815343141555786, "logits/rejected": -1.1240736246109009, "logps/chosen": -267.77294921875, "logps/rejected": -306.5283203125, "loss": 0.6624, "rewards/accuracies": 0.75, "rewards/chosen": 0.2773919105529785, "rewards/margins": 0.04133237525820732, "rewards/rejected": 0.2360595166683197, "step": 85 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.18001046572475143, "grad_norm": 2.3097278826632848, "learning_rate": 4.903825930468148e-07, "logits/chosen": -1.178104043006897, "logits/rejected": -1.3639379739761353, "logps/chosen": -171.203857421875, "logps/rejected": -216.42581176757812, "loss": 0.6639, "rewards/accuracies": 0.875, "rewards/chosen": 0.26765158772468567, "rewards/margins": 0.055917322635650635, "rewards/rejected": 0.21173425018787384, "step": 86 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.18210361067503925, "grad_norm": 2.494294430472006, "learning_rate": 4.898732434036243e-07, "logits/chosen": -1.4836080074310303, "logits/rejected": -1.3039590120315552, "logps/chosen": -205.9970703125, "logps/rejected": -255.0936279296875, "loss": 0.6608, "rewards/accuracies": 0.6875, "rewards/chosen": 0.26746511459350586, "rewards/margins": 0.06307379901409149, "rewards/rejected": 0.20439133048057556, "step": 87 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.18419675562532706, "grad_norm": 2.3079874024889686, "learning_rate": 4.893510300863676e-07, "logits/chosen": -1.2986102104187012, "logits/rejected": -1.1857050657272339, "logps/chosen": -220.7415313720703, "logps/rejected": -292.52154541015625, "loss": 0.6729, "rewards/accuracies": 0.5, "rewards/chosen": 0.24720710515975952, "rewards/margins": -0.005298815667629242, "rewards/rejected": 0.252505898475647, "step": 88 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.18628990057561487, "grad_norm": 2.6081794033554093, "learning_rate": 4.8881598109976e-07, "logits/chosen": -1.0027302503585815, "logits/rejected": -1.5816898345947266, "logps/chosen": -335.0837097167969, "logps/rejected": -247.4977264404297, "loss": 0.6646, "rewards/accuracies": 0.5, "rewards/chosen": 0.2669665217399597, "rewards/margins": 0.02164977416396141, "rewards/rejected": 0.24531671404838562, "step": 89 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.18838304552590268, "grad_norm": 2.5024824214624437, "learning_rate": 4.882681251368548e-07, "logits/chosen": -0.9626286625862122, "logits/rejected": -1.3093549013137817, "logps/chosen": -326.1711730957031, "logps/rejected": -287.4186096191406, "loss": 0.6607, "rewards/accuracies": 0.75, "rewards/chosen": 0.2841154932975769, "rewards/margins": 0.0759100466966629, "rewards/rejected": 0.2082054615020752, "step": 90 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.19047619047619047, "grad_norm": 2.3873158579663727, "learning_rate": 4.877074915775048e-07, "logits/chosen": -1.2809832096099854, "logits/rejected": -1.6485763788223267, "logps/chosen": -266.6936950683594, "logps/rejected": -263.4827880859375, "loss": 0.6605, "rewards/accuracies": 0.875, "rewards/chosen": 0.28310662508010864, "rewards/margins": 0.0795968696475029, "rewards/rejected": 0.20350974798202515, "step": 91 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.19256933542647828, "grad_norm": 2.3456276483682648, "learning_rate": 4.871341104867864e-07, "logits/chosen": -0.9808674454689026, "logits/rejected": -1.1323598623275757, "logps/chosen": -352.28302001953125, "logps/rejected": -310.7407531738281, "loss": 0.6578, "rewards/accuracies": 0.75, "rewards/chosen": 0.2945118248462677, "rewards/margins": 0.06869323551654816, "rewards/rejected": 0.22581861913204193, "step": 92 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.1946624803767661, "grad_norm": 2.2881721876295416, "learning_rate": 4.865480126133871e-07, "logits/chosen": -1.4138919115066528, "logits/rejected": -1.9016234874725342, "logps/chosen": -261.8511657714844, "logps/rejected": -421.49346923828125, "loss": 0.6646, "rewards/accuracies": 0.8125, "rewards/chosen": 0.24475426971912384, "rewards/margins": 0.07296748459339142, "rewards/rejected": 0.17178678512573242, "step": 93 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.1967556253270539, "grad_norm": 2.2591077469788265, "learning_rate": 4.859492293879573e-07, "logits/chosen": -1.1044683456420898, "logits/rejected": -1.3629188537597656, "logps/chosen": -256.7251281738281, "logps/rejected": -279.8775634765625, "loss": 0.6697, "rewards/accuracies": 0.6875, "rewards/chosen": 0.27829205989837646, "rewards/margins": 0.0658087432384491, "rewards/rejected": 0.21248331665992737, "step": 94 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.1988487702773417, "grad_norm": 2.4662075844547044, "learning_rate": 4.853377929214243e-07, "logits/chosen": -1.8464876413345337, "logits/rejected": -1.8711469173431396, "logps/chosen": -260.9811096191406, "logps/rejected": -407.55670166015625, "loss": 0.6569, "rewards/accuracies": 0.75, "rewards/chosen": 0.23133748769760132, "rewards/margins": 0.034351229667663574, "rewards/rejected": 0.19698627293109894, "step": 95 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.20094191522762953, "grad_norm": 2.262705714521302, "learning_rate": 4.847137360032699e-07, "logits/chosen": -1.4402207136154175, "logits/rejected": -1.8564977645874023, "logps/chosen": -412.8888244628906, "logps/rejected": -250.38375854492188, "loss": 0.6625, "rewards/accuracies": 0.625, "rewards/chosen": 0.27641361951828003, "rewards/margins": 0.05554671213030815, "rewards/rejected": 0.22086691856384277, "step": 96 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.2030350601779173, "grad_norm": 2.4783581737631573, "learning_rate": 4.84077092099773e-07, "logits/chosen": -1.1330995559692383, "logits/rejected": -1.1069601774215698, "logps/chosen": -360.162109375, "logps/rejected": -297.1282958984375, "loss": 0.6609, "rewards/accuracies": 0.5625, "rewards/chosen": 0.2751573324203491, "rewards/margins": 0.05376603081822395, "rewards/rejected": 0.22139132022857666, "step": 97 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.20512820512820512, "grad_norm": 2.349824687720894, "learning_rate": 4.834278953522137e-07, "logits/chosen": -1.313166618347168, "logits/rejected": -1.7200367450714111, "logps/chosen": -360.7608337402344, "logps/rejected": -249.40931701660156, "loss": 0.6543, "rewards/accuracies": 0.625, "rewards/chosen": 0.3034079074859619, "rewards/margins": 0.06569239497184753, "rewards/rejected": 0.23771551251411438, "step": 98 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.20722135007849293, "grad_norm": 2.298908111063349, "learning_rate": 4.827661805750437e-07, "logits/chosen": -1.3135740756988525, "logits/rejected": -1.38303804397583, "logps/chosen": -283.65325927734375, "logps/rejected": -289.5970764160156, "loss": 0.6728, "rewards/accuracies": 0.75, "rewards/chosen": 0.2879467308521271, "rewards/margins": 0.08636203408241272, "rewards/rejected": 0.20158466696739197, "step": 99 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.20931449502878074, "grad_norm": 2.315541836258674, "learning_rate": 4.820919832540181e-07, "logits/chosen": -0.4371851682662964, "logits/rejected": -1.3404946327209473, "logps/chosen": -381.5962829589844, "logps/rejected": -260.8759460449219, "loss": 0.6498, "rewards/accuracies": 0.8125, "rewards/chosen": 0.30900368094444275, "rewards/margins": 0.07341316342353821, "rewards/rejected": 0.23559051752090454, "step": 100 }, { "epoch": 0.20931449502878074, "eval_dpo_lambda": 0.9500001072883606, "eval_logits/chosen": -1.1265989542007446, "eval_logits/rejected": -1.2588810920715332, "eval_logps/chosen": -305.62188720703125, "eval_logps/rejected": -285.2926330566406, "eval_loss": 0.6609352827072144, "eval_rewards/accuracies": 0.738095223903656, "eval_rewards/chosen": 0.2768958806991577, "eval_rewards/margins": 0.06245749443769455, "eval_rewards/rejected": 0.21443839371204376, "eval_runtime": 126.6427, "eval_samples_per_second": 15.792, "eval_steps_per_second": 0.497, "step": 100 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.21140763997906856, "grad_norm": 2.3838005412460843, "learning_rate": 4.814053395442932e-07, "logits/chosen": -1.4000431299209595, "logits/rejected": -1.2939010858535767, "logps/chosen": -384.4804992675781, "logps/rejected": -389.9180908203125, "loss": 0.6686, "rewards/accuracies": 0.8125, "rewards/chosen": 0.29500657320022583, "rewards/margins": 0.09199095517396927, "rewards/rejected": 0.20301562547683716, "step": 101 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.21350078492935637, "grad_norm": 2.3319261331966565, "learning_rate": 4.807062862684873e-07, "logits/chosen": -1.3374634981155396, "logits/rejected": -1.5403110980987549, "logps/chosen": -327.0498046875, "logps/rejected": -333.315673828125, "loss": 0.6613, "rewards/accuracies": 0.5625, "rewards/chosen": 0.25264033675193787, "rewards/margins": 0.04336639866232872, "rewards/rejected": 0.20927396416664124, "step": 102 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.21559392987964415, "grad_norm": 2.490799881399673, "learning_rate": 4.799948609147061e-07, "logits/chosen": -1.1582235097885132, "logits/rejected": -1.3536336421966553, "logps/chosen": -266.0186767578125, "logps/rejected": -256.7572326660156, "loss": 0.6706, "rewards/accuracies": 0.75, "rewards/chosen": 0.24074122309684753, "rewards/margins": 0.03719187527894974, "rewards/rejected": 0.2035493403673172, "step": 103 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.21768707482993196, "grad_norm": 2.2500067216893966, "learning_rate": 4.792711016345321e-07, "logits/chosen": -1.1320271492004395, "logits/rejected": -1.2357622385025024, "logps/chosen": -260.5439453125, "logps/rejected": -299.4569091796875, "loss": 0.6585, "rewards/accuracies": 0.6875, "rewards/chosen": 0.24176481366157532, "rewards/margins": 0.015657618641853333, "rewards/rejected": 0.22610719501972198, "step": 104 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.21978021978021978, "grad_norm": 2.502891906343648, "learning_rate": 4.785350472409791e-07, "logits/chosen": -0.8266074657440186, "logits/rejected": -0.7922312021255493, "logps/chosen": -340.3038330078125, "logps/rejected": -343.33843994140625, "loss": 0.6503, "rewards/accuracies": 0.5625, "rewards/chosen": 0.26149868965148926, "rewards/margins": 0.03895071893930435, "rewards/rejected": 0.2225479781627655, "step": 105 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.2218733647305076, "grad_norm": 2.469506461939846, "learning_rate": 4.777867372064105e-07, "logits/chosen": -1.0342568159103394, "logits/rejected": -1.3202353715896606, "logps/chosen": -336.59136962890625, "logps/rejected": -227.89590454101562, "loss": 0.661, "rewards/accuracies": 0.625, "rewards/chosen": 0.29561781883239746, "rewards/margins": 0.0590677335858345, "rewards/rejected": 0.23655004799365997, "step": 106 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.2239665096807954, "grad_norm": 2.3707764867612102, "learning_rate": 4.770262116604223e-07, "logits/chosen": -1.3570079803466797, "logits/rejected": -1.8119703531265259, "logps/chosen": -299.6976623535156, "logps/rejected": -279.8818664550781, "loss": 0.6583, "rewards/accuracies": 0.875, "rewards/chosen": 0.29008060693740845, "rewards/margins": 0.13339433073997498, "rewards/rejected": 0.15668627619743347, "step": 107 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.2260596546310832, "grad_norm": 2.4158455704933592, "learning_rate": 4.7625351138769166e-07, "logits/chosen": -0.9670537114143372, "logits/rejected": -1.2495864629745483, "logps/chosen": -281.28314208984375, "logps/rejected": -245.93357849121094, "loss": 0.6585, "rewards/accuracies": 0.8125, "rewards/chosen": 0.29050394892692566, "rewards/margins": 0.13617663085460663, "rewards/rejected": 0.15432730317115784, "step": 108 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.228152799581371, "grad_norm": 2.494975824105596, "learning_rate": 4.75468677825789e-07, "logits/chosen": -1.3777551651000977, "logits/rejected": -1.5846264362335205, "logps/chosen": -272.94580078125, "logps/rejected": -242.5831298828125, "loss": 0.6588, "rewards/accuracies": 0.5625, "rewards/chosen": 0.2587592601776123, "rewards/margins": 0.05067931488156319, "rewards/rejected": 0.2080799639225006, "step": 109 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.2302459445316588, "grad_norm": 2.2267114497187044, "learning_rate": 4.7467175306295647e-07, "logits/chosen": -0.8978460431098938, "logits/rejected": -1.0895323753356934, "logps/chosen": -289.9827880859375, "logps/rejected": -303.84320068359375, "loss": 0.6608, "rewards/accuracies": 0.625, "rewards/chosen": 0.2326616495847702, "rewards/margins": 0.03727094829082489, "rewards/rejected": 0.1953907012939453, "step": 110 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.23233908948194662, "grad_norm": 2.370743200246439, "learning_rate": 4.7386277983585053e-07, "logits/chosen": -0.9505612850189209, "logits/rejected": -1.1513551473617554, "logps/chosen": -278.4796142578125, "logps/rejected": -278.3269958496094, "loss": 0.6554, "rewards/accuracies": 0.5625, "rewards/chosen": 0.24911299347877502, "rewards/margins": 0.05647106096148491, "rewards/rejected": 0.192641943693161, "step": 111 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.23443223443223443, "grad_norm": 2.57992654786571, "learning_rate": 4.7304180152725024e-07, "logits/chosen": -1.5378581285476685, "logits/rejected": -1.3804616928100586, "logps/chosen": -281.45062255859375, "logps/rejected": -272.4478454589844, "loss": 0.6462, "rewards/accuracies": 0.4375, "rewards/chosen": 0.2044842541217804, "rewards/margins": 0.009916345588862896, "rewards/rejected": 0.19456791877746582, "step": 112 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.23652537938252224, "grad_norm": 2.644035980552638, "learning_rate": 4.7220886216373085e-07, "logits/chosen": -0.45617127418518066, "logits/rejected": -0.99241042137146, "logps/chosen": -219.23446655273438, "logps/rejected": -196.12559509277344, "loss": 0.6568, "rewards/accuracies": 0.75, "rewards/chosen": 0.2768452763557434, "rewards/margins": 0.07626468688249588, "rewards/rejected": 0.20058056712150574, "step": 113 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.23861852433281006, "grad_norm": 2.4049724162731065, "learning_rate": 4.7136400641330245e-07, "logits/chosen": -1.4702264070510864, "logits/rejected": -1.5111284255981445, "logps/chosen": -323.161865234375, "logps/rejected": -319.70111083984375, "loss": 0.6527, "rewards/accuracies": 0.75, "rewards/chosen": 0.2813788652420044, "rewards/margins": 0.13999086618423462, "rewards/rejected": 0.14138802886009216, "step": 114 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.24071166928309787, "grad_norm": 2.330153763370056, "learning_rate": 4.70507279583015e-07, "logits/chosen": -1.0421326160430908, "logits/rejected": -0.9258497953414917, "logps/chosen": -315.66888427734375, "logps/rejected": -361.380859375, "loss": 0.6571, "rewards/accuracies": 0.75, "rewards/chosen": 0.24884723126888275, "rewards/margins": 0.09158321470022202, "rewards/rejected": 0.15726402401924133, "step": 115 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.24280481423338565, "grad_norm": 2.5853412379320404, "learning_rate": 4.6963872761652834e-07, "logits/chosen": -0.9899653196334839, "logits/rejected": -1.3739241361618042, "logps/chosen": -253.53387451171875, "logps/rejected": -225.81756591796875, "loss": 0.6475, "rewards/accuracies": 0.8125, "rewards/chosen": 0.29086723923683167, "rewards/margins": 0.07563789188861847, "rewards/rejected": 0.215229332447052, "step": 116 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.24489795918367346, "grad_norm": 2.5215705552062753, "learning_rate": 4.687583970916486e-07, "logits/chosen": -1.1951545476913452, "logits/rejected": -1.3386662006378174, "logps/chosen": -305.3542175292969, "logps/rejected": -357.0126037597656, "loss": 0.6434, "rewards/accuracies": 0.75, "rewards/chosen": 0.24293264746665955, "rewards/margins": 0.09169358015060425, "rewards/rejected": 0.1512390524148941, "step": 117 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.24699110413396128, "grad_norm": 2.6628565563232223, "learning_rate": 4.6786633521783005e-07, "logits/chosen": -0.9409959316253662, "logits/rejected": -0.9255667328834534, "logps/chosen": -296.095703125, "logps/rejected": -307.32318115234375, "loss": 0.6516, "rewards/accuracies": 0.6875, "rewards/chosen": 0.25823795795440674, "rewards/margins": 0.09044396877288818, "rewards/rejected": 0.16779397428035736, "step": 118 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.2490842490842491, "grad_norm": 2.490876946112336, "learning_rate": 4.669625898336438e-07, "logits/chosen": -1.454582929611206, "logits/rejected": -1.5617461204528809, "logps/chosen": -258.7835693359375, "logps/rejected": -211.126708984375, "loss": 0.6456, "rewards/accuracies": 0.75, "rewards/chosen": 0.27489981055259705, "rewards/margins": 0.11937963962554932, "rewards/rejected": 0.15552015602588654, "step": 119 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.25117739403453687, "grad_norm": 2.376732854984331, "learning_rate": 4.6604720940421207e-07, "logits/chosen": -1.0676320791244507, "logits/rejected": -1.1190202236175537, "logps/chosen": -278.9148864746094, "logps/rejected": -311.2773132324219, "loss": 0.6498, "rewards/accuracies": 0.75, "rewards/chosen": 0.2749730348587036, "rewards/margins": 0.10118619352579117, "rewards/rejected": 0.17378687858581543, "step": 120 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.2532705389848247, "grad_norm": 2.3567986477743896, "learning_rate": 4.651202430186092e-07, "logits/chosen": -1.188884973526001, "logits/rejected": -1.2482266426086426, "logps/chosen": -296.7980651855469, "logps/rejected": -276.63214111328125, "loss": 0.639, "rewards/accuracies": 0.6875, "rewards/chosen": 0.2930307388305664, "rewards/margins": 0.07296772301197052, "rewards/rejected": 0.22006304562091827, "step": 121 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.2553636839351125, "grad_norm": 2.3986749706384987, "learning_rate": 4.6418174038722924e-07, "logits/chosen": -1.2444785833358765, "logits/rejected": -1.5000934600830078, "logps/chosen": -294.5802001953125, "logps/rejected": -267.584228515625, "loss": 0.6444, "rewards/accuracies": 0.5625, "rewards/chosen": 0.23259134590625763, "rewards/margins": 0.077959805727005, "rewards/rejected": 0.15463152527809143, "step": 122 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.25745682888540034, "grad_norm": 2.4111092259135343, "learning_rate": 4.6323175183912023e-07, "logits/chosen": -0.9954867959022522, "logits/rejected": -1.14194917678833, "logps/chosen": -310.08319091796875, "logps/rejected": -324.5462646484375, "loss": 0.6427, "rewards/accuracies": 0.75, "rewards/chosen": 0.25345727801322937, "rewards/margins": 0.13480302691459656, "rewards/rejected": 0.11865423619747162, "step": 123 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.2595499738356881, "grad_norm": 2.3248698753457586, "learning_rate": 4.6227032831928483e-07, "logits/chosen": -1.2530393600463867, "logits/rejected": -1.4184015989303589, "logps/chosen": -270.5785827636719, "logps/rejected": -311.03411865234375, "loss": 0.6368, "rewards/accuracies": 0.75, "rewards/chosen": 0.25522518157958984, "rewards/margins": 0.14147831499576569, "rewards/rejected": 0.11374684423208237, "step": 124 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.2616431187859759, "grad_norm": 2.4702571089201424, "learning_rate": 4.612975213859487e-07, "logits/chosen": -1.4616117477416992, "logits/rejected": -1.436432957649231, "logps/chosen": -183.26138305664062, "logps/rejected": -239.16583251953125, "loss": 0.6582, "rewards/accuracies": 0.8125, "rewards/chosen": 0.25017082691192627, "rewards/margins": 0.066016785800457, "rewards/rejected": 0.18415404856204987, "step": 125 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.26373626373626374, "grad_norm": 2.2976446977491367, "learning_rate": 4.603133832077953e-07, "logits/chosen": -1.3302080631256104, "logits/rejected": -1.759768009185791, "logps/chosen": -407.9803466796875, "logps/rejected": -284.9658203125, "loss": 0.6445, "rewards/accuracies": 0.625, "rewards/chosen": 0.22962862253189087, "rewards/margins": 0.06049675494432449, "rewards/rejected": 0.16913188993930817, "step": 126 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.2658294086865515, "grad_norm": 2.200648391725821, "learning_rate": 4.5931796656116837e-07, "logits/chosen": -1.087525725364685, "logits/rejected": -1.4507644176483154, "logps/chosen": -244.12631225585938, "logps/rejected": -241.78172302246094, "loss": 0.652, "rewards/accuracies": 0.75, "rewards/chosen": 0.2512468993663788, "rewards/margins": 0.06589895486831665, "rewards/rejected": 0.18534794449806213, "step": 127 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.26792255363683937, "grad_norm": 2.4708337306614396, "learning_rate": 4.5831132482724193e-07, "logits/chosen": -0.7926342487335205, "logits/rejected": -1.3278822898864746, "logps/chosen": -269.0376892089844, "logps/rejected": -281.436767578125, "loss": 0.6568, "rewards/accuracies": 0.4375, "rewards/chosen": 0.2080076038837433, "rewards/margins": 0.00329400971531868, "rewards/rejected": 0.2047135978937149, "step": 128 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.27001569858712715, "grad_norm": 2.3360260360958494, "learning_rate": 4.5729351198915705e-07, "logits/chosen": -1.1431214809417725, "logits/rejected": -0.8985929489135742, "logps/chosen": -317.6500549316406, "logps/rejected": -340.1954650878906, "loss": 0.6494, "rewards/accuracies": 0.625, "rewards/chosen": 0.24971501529216766, "rewards/margins": 0.03427686542272568, "rewards/rejected": 0.21543815732002258, "step": 129 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.272108843537415, "grad_norm": 2.5415575509242676, "learning_rate": 4.5626458262912735e-07, "logits/chosen": -1.1145471334457397, "logits/rejected": -1.562002420425415, "logps/chosen": -309.1630859375, "logps/rejected": -317.02032470703125, "loss": 0.6297, "rewards/accuracies": 0.75, "rewards/chosen": 0.25996366143226624, "rewards/margins": 0.1552920788526535, "rewards/rejected": 0.10467158257961273, "step": 130 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.2742019884877028, "grad_norm": 2.453882753749538, "learning_rate": 4.5522459192551166e-07, "logits/chosen": -1.6377232074737549, "logits/rejected": -1.5837520360946655, "logps/chosen": -228.57135009765625, "logps/rejected": -217.5687255859375, "loss": 0.6382, "rewards/accuracies": 0.8125, "rewards/chosen": 0.2963574528694153, "rewards/margins": 0.11947888135910034, "rewards/rejected": 0.17687860131263733, "step": 131 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.27629513343799056, "grad_norm": 2.45875519985061, "learning_rate": 4.541735956498554e-07, "logits/chosen": -0.8745557069778442, "logits/rejected": -1.3967058658599854, "logps/chosen": -440.8940734863281, "logps/rejected": -310.1091613769531, "loss": 0.6433, "rewards/accuracies": 0.75, "rewards/chosen": 0.3009231686592102, "rewards/margins": 0.17215877771377563, "rewards/rejected": 0.12876439094543457, "step": 132 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.2783882783882784, "grad_norm": 2.322562054069078, "learning_rate": 4.5311165016389914e-07, "logits/chosen": -1.6143832206726074, "logits/rejected": -1.7220548391342163, "logps/chosen": -365.3206481933594, "logps/rejected": -241.51495361328125, "loss": 0.6345, "rewards/accuracies": 0.75, "rewards/chosen": 0.30499067902565, "rewards/margins": 0.16423507034778595, "rewards/rejected": 0.14075560867786407, "step": 133 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.2804814233385662, "grad_norm": 2.37181643116754, "learning_rate": 4.520388124165564e-07, "logits/chosen": -1.4498357772827148, "logits/rejected": -1.2864556312561035, "logps/chosen": -310.04376220703125, "logps/rejected": -379.285400390625, "loss": 0.6462, "rewards/accuracies": 0.625, "rewards/chosen": 0.24614813923835754, "rewards/margins": 0.08764280378818512, "rewards/rejected": 0.15850532054901123, "step": 134 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.282574568288854, "grad_norm": 2.4993868839860345, "learning_rate": 4.5095513994085974e-07, "logits/chosen": -1.4276573657989502, "logits/rejected": -1.4376674890518188, "logps/chosen": -264.44757080078125, "logps/rejected": -254.091796875, "loss": 0.6392, "rewards/accuracies": 0.75, "rewards/chosen": 0.24881112575531006, "rewards/margins": 0.05410250276327133, "rewards/rejected": 0.19470861554145813, "step": 135 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.2846677132391418, "grad_norm": 2.52873456694845, "learning_rate": 4.498606908508753e-07, "logits/chosen": -0.9768342971801758, "logits/rejected": -1.3321075439453125, "logps/chosen": -326.61639404296875, "logps/rejected": -302.7270202636719, "loss": 0.6499, "rewards/accuracies": 0.8125, "rewards/chosen": 0.2503766119480133, "rewards/margins": 0.09211332350969315, "rewards/rejected": 0.15826329588890076, "step": 136 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.2867608581894296, "grad_norm": 2.4040923854595864, "learning_rate": 4.487555238385862e-07, "logits/chosen": -1.172602653503418, "logits/rejected": -1.7622894048690796, "logps/chosen": -326.3885498046875, "logps/rejected": -281.468017578125, "loss": 0.6467, "rewards/accuracies": 0.6875, "rewards/chosen": 0.25907692313194275, "rewards/margins": 0.13277941942214966, "rewards/rejected": 0.1262975037097931, "step": 137 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.28885400313971743, "grad_norm": 2.4992029532949926, "learning_rate": 4.476396981707453e-07, "logits/chosen": -1.3722294569015503, "logits/rejected": -1.5105780363082886, "logps/chosen": -337.4222412109375, "logps/rejected": -269.143310546875, "loss": 0.6605, "rewards/accuracies": 0.75, "rewards/chosen": 0.28751471638679504, "rewards/margins": 0.1320674866437912, "rewards/rejected": 0.15544724464416504, "step": 138 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.2909471480900052, "grad_norm": 2.4597405686148006, "learning_rate": 4.4651327368569684e-07, "logits/chosen": -1.015929102897644, "logits/rejected": -0.9359642863273621, "logps/chosen": -406.0782470703125, "logps/rejected": -408.199462890625, "loss": 0.6456, "rewards/accuracies": 0.875, "rewards/chosen": 0.3100045323371887, "rewards/margins": 0.2035290002822876, "rewards/rejected": 0.10647552460432053, "step": 139 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.29304029304029305, "grad_norm": 2.4794799080355534, "learning_rate": 4.453763107901675e-07, "logits/chosen": -1.5453729629516602, "logits/rejected": -1.9060596227645874, "logps/chosen": -298.16558837890625, "logps/rejected": -240.1239013671875, "loss": 0.6448, "rewards/accuracies": 0.8125, "rewards/chosen": 0.23826953768730164, "rewards/margins": 0.10319440811872482, "rewards/rejected": 0.1350751519203186, "step": 140 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.29513343799058084, "grad_norm": 2.489194152910731, "learning_rate": 4.4422887045602674e-07, "logits/chosen": -1.1712546348571777, "logits/rejected": -1.6510064601898193, "logps/chosen": -367.8188171386719, "logps/rejected": -284.88934326171875, "loss": 0.6535, "rewards/accuracies": 0.8125, "rewards/chosen": 0.2720351219177246, "rewards/margins": 0.16033916175365448, "rewards/rejected": 0.11169596016407013, "step": 141 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.2972265829408687, "grad_norm": 2.329995050962624, "learning_rate": 4.4307101421701755e-07, "logits/chosen": -1.1322382688522339, "logits/rejected": -1.4809411764144897, "logps/chosen": -290.8354187011719, "logps/rejected": -217.35595703125, "loss": 0.6304, "rewards/accuracies": 0.75, "rewards/chosen": 0.2578047811985016, "rewards/margins": 0.10935106873512268, "rewards/rejected": 0.1484537124633789, "step": 142 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.29931972789115646, "grad_norm": 2.5221661492130085, "learning_rate": 4.419028041654559e-07, "logits/chosen": -0.9347215890884399, "logits/rejected": -0.9855947494506836, "logps/chosen": -414.4169921875, "logps/rejected": -398.33258056640625, "loss": 0.6366, "rewards/accuracies": 0.8125, "rewards/chosen": 0.22949610650539398, "rewards/margins": 0.16940245032310486, "rewards/rejected": 0.06009366363286972, "step": 143 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.30141287284144425, "grad_norm": 2.417517962913556, "learning_rate": 4.4072430294890166e-07, "logits/chosen": -1.2026840448379517, "logits/rejected": -1.4032888412475586, "logps/chosen": -316.7716979980469, "logps/rejected": -294.29437255859375, "loss": 0.6321, "rewards/accuracies": 0.875, "rewards/chosen": 0.199890598654747, "rewards/margins": 0.12671639025211334, "rewards/rejected": 0.07317420840263367, "step": 144 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.3035060177917321, "grad_norm": 2.4017999135848136, "learning_rate": 4.395355737667985e-07, "logits/chosen": -1.2982053756713867, "logits/rejected": -1.4728295803070068, "logps/chosen": -306.82061767578125, "logps/rejected": -223.6586151123047, "loss": 0.6364, "rewards/accuracies": 0.875, "rewards/chosen": 0.15790940821170807, "rewards/margins": 0.11503908038139343, "rewards/rejected": 0.042870327830314636, "step": 145 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.30559916274201987, "grad_norm": 2.416193914372365, "learning_rate": 4.3833668036708483e-07, "logits/chosen": -1.1176211833953857, "logits/rejected": -1.4571778774261475, "logps/chosen": -226.58258056640625, "logps/rejected": -233.7766571044922, "loss": 0.6477, "rewards/accuracies": 0.75, "rewards/chosen": 0.15076148509979248, "rewards/margins": 0.08108443021774292, "rewards/rejected": 0.06967706978321075, "step": 146 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.3076923076923077, "grad_norm": 2.527339535679976, "learning_rate": 4.3712768704277524e-07, "logits/chosen": -1.1586792469024658, "logits/rejected": -0.8416894674301147, "logps/chosen": -261.54608154296875, "logps/rejected": -267.8458251953125, "loss": 0.6487, "rewards/accuracies": 0.6875, "rewards/chosen": 0.2358892858028412, "rewards/margins": 0.13921299576759338, "rewards/rejected": 0.0966762974858284, "step": 147 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.3097854526425955, "grad_norm": 2.3658902583087422, "learning_rate": 4.3590865862851263e-07, "logits/chosen": -1.1472032070159912, "logits/rejected": -1.450994610786438, "logps/chosen": -244.56910705566406, "logps/rejected": -252.7799835205078, "loss": 0.6437, "rewards/accuracies": 0.875, "rewards/chosen": 0.19444799423217773, "rewards/margins": 0.07008150219917297, "rewards/rejected": 0.12436649948358536, "step": 148 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.31187859759288333, "grad_norm": 2.392160022432871, "learning_rate": 4.346796604970912e-07, "logits/chosen": -0.8277313709259033, "logits/rejected": -0.8548777103424072, "logps/chosen": -330.10260009765625, "logps/rejected": -341.2257080078125, "loss": 0.6554, "rewards/accuracies": 0.625, "rewards/chosen": 0.23878023028373718, "rewards/margins": 0.10449697077274323, "rewards/rejected": 0.13428327441215515, "step": 149 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.3139717425431711, "grad_norm": 2.353651077758689, "learning_rate": 4.3344075855595097e-07, "logits/chosen": -1.2566170692443848, "logits/rejected": -1.317298173904419, "logps/chosen": -284.64349365234375, "logps/rejected": -294.55126953125, "loss": 0.6549, "rewards/accuracies": 0.6875, "rewards/chosen": 0.27726876735687256, "rewards/margins": 0.10337522625923157, "rewards/rejected": 0.1738935112953186, "step": 150 }, { "epoch": 0.3139717425431711, "eval_dpo_lambda": 0.9500001072883606, "eval_logits/chosen": -1.1845933198928833, "eval_logits/rejected": -1.3147549629211426, "eval_logps/chosen": -310.4278564453125, "eval_logps/rejected": -296.91943359375, "eval_loss": 0.6407532691955566, "eval_rewards/accuracies": 0.7341269850730896, "eval_rewards/chosen": 0.22883661091327667, "eval_rewards/margins": 0.13066653907299042, "eval_rewards/rejected": 0.09817008674144745, "eval_runtime": 124.6584, "eval_samples_per_second": 16.044, "eval_steps_per_second": 0.505, "step": 150 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.3160648874934589, "grad_norm": 2.353126654043626, "learning_rate": 4.3219201924364323e-07, "logits/chosen": -1.4839799404144287, "logits/rejected": -1.771735429763794, "logps/chosen": -250.8814239501953, "logps/rejected": -228.6520233154297, "loss": 0.6401, "rewards/accuracies": 0.75, "rewards/chosen": 0.23518478870391846, "rewards/margins": 0.10509979724884033, "rewards/rejected": 0.13008499145507812, "step": 151 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.31815803244374674, "grad_norm": 2.468980592028843, "learning_rate": 4.309335095262675e-07, "logits/chosen": -1.1111326217651367, "logits/rejected": -1.8293967247009277, "logps/chosen": -255.92601013183594, "logps/rejected": -293.20635986328125, "loss": 0.636, "rewards/accuracies": 0.6875, "rewards/chosen": 0.19598445296287537, "rewards/margins": 0.14701193571090698, "rewards/rejected": 0.04897250980138779, "step": 152 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.3202511773940345, "grad_norm": 2.541502271082875, "learning_rate": 4.2966529689388064e-07, "logits/chosen": -1.031123399734497, "logits/rejected": -1.492027997970581, "logps/chosen": -264.6174621582031, "logps/rejected": -214.0040740966797, "loss": 0.6307, "rewards/accuracies": 0.75, "rewards/chosen": 0.21471568942070007, "rewards/margins": 0.10869686305522919, "rewards/rejected": 0.10601884126663208, "step": 153 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.32234432234432236, "grad_norm": 2.513030721557436, "learning_rate": 4.2838744935687716e-07, "logits/chosen": -1.02005934715271, "logits/rejected": -1.3629133701324463, "logps/chosen": -273.4529113769531, "logps/rejected": -186.1577911376953, "loss": 0.6381, "rewards/accuracies": 0.5625, "rewards/chosen": 0.21922677755355835, "rewards/margins": 0.13480353355407715, "rewards/rejected": 0.08442322164773941, "step": 154 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.32443746729461015, "grad_norm": 2.4439411071152213, "learning_rate": 4.271000354423425e-07, "logits/chosen": -0.8973273038864136, "logits/rejected": -1.772719383239746, "logps/chosen": -491.8912353515625, "logps/rejected": -326.322998046875, "loss": 0.6297, "rewards/accuracies": 0.875, "rewards/chosen": 0.33490049839019775, "rewards/margins": 0.27585843205451965, "rewards/rejected": 0.059042058885097504, "step": 155 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.32653061224489793, "grad_norm": 2.3998127334630195, "learning_rate": 4.258031241903777e-07, "logits/chosen": -1.4651778936386108, "logits/rejected": -1.3838015794754028, "logps/chosen": -206.5081024169922, "logps/rejected": -370.8124084472656, "loss": 0.6491, "rewards/accuracies": 0.8125, "rewards/chosen": 0.14312505722045898, "rewards/margins": 0.15322436392307281, "rewards/rejected": -0.010099297389388084, "step": 156 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.3286237571951858, "grad_norm": 2.366663987284687, "learning_rate": 4.2449678515039743e-07, "logits/chosen": -1.1987701654434204, "logits/rejected": -1.3515450954437256, "logps/chosen": -261.3342590332031, "logps/rejected": -214.8706817626953, "loss": 0.6466, "rewards/accuracies": 0.6875, "rewards/chosen": 0.18163400888442993, "rewards/margins": 0.07753065228462219, "rewards/rejected": 0.10410335659980774, "step": 157 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.33071690214547356, "grad_norm": 2.4738084145585777, "learning_rate": 4.2318108837739986e-07, "logits/chosen": -0.7750232219696045, "logits/rejected": -0.9522998332977295, "logps/chosen": -305.97943115234375, "logps/rejected": -340.639404296875, "loss": 0.6352, "rewards/accuracies": 0.625, "rewards/chosen": 0.24369773268699646, "rewards/margins": 0.08186392486095428, "rewards/rejected": 0.16183380782604218, "step": 158 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.3328100470957614, "grad_norm": 2.3559676981853683, "learning_rate": 4.218561044282098e-07, "logits/chosen": -1.0587737560272217, "logits/rejected": -1.6721540689468384, "logps/chosen": -466.4519348144531, "logps/rejected": -440.5288391113281, "loss": 0.6381, "rewards/accuracies": 0.75, "rewards/chosen": 0.2164725959300995, "rewards/margins": 0.18982842564582825, "rewards/rejected": 0.02664417028427124, "step": 159 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.3349031920460492, "grad_norm": 2.3058515915756916, "learning_rate": 4.2052190435769554e-07, "logits/chosen": -1.2652602195739746, "logits/rejected": -1.5281144380569458, "logps/chosen": -241.17904663085938, "logps/rejected": -274.4759826660156, "loss": 0.6401, "rewards/accuracies": 0.6875, "rewards/chosen": 0.2575688362121582, "rewards/margins": 0.11499206721782684, "rewards/rejected": 0.14257675409317017, "step": 160 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.336996336996337, "grad_norm": 2.542317739443924, "learning_rate": 4.1917855971495763e-07, "logits/chosen": -1.1756772994995117, "logits/rejected": -1.183746576309204, "logps/chosen": -331.9105224609375, "logps/rejected": -283.5142822265625, "loss": 0.6394, "rewards/accuracies": 0.8125, "rewards/chosen": 0.23874206840991974, "rewards/margins": 0.18135549128055573, "rewards/rejected": 0.0573866069316864, "step": 161 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.3390894819466248, "grad_norm": 2.3965031898002085, "learning_rate": 4.1782614253949255e-07, "logits/chosen": -0.690435528755188, "logits/rejected": -0.8953478336334229, "logps/chosen": -298.3455810546875, "logps/rejected": -300.263916015625, "loss": 0.6284, "rewards/accuracies": 0.875, "rewards/chosen": 0.19689339399337769, "rewards/margins": 0.1865723729133606, "rewards/rejected": 0.010321006178855896, "step": 162 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.3411826268969126, "grad_norm": 2.295549922329293, "learning_rate": 4.164647253573289e-07, "logits/chosen": -1.5574886798858643, "logits/rejected": -1.7043709754943848, "logps/chosen": -228.50660705566406, "logps/rejected": -239.03619384765625, "loss": 0.629, "rewards/accuracies": 0.6875, "rewards/chosen": 0.23374566435813904, "rewards/margins": 0.12954719364643097, "rewards/rejected": 0.10419845581054688, "step": 163 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.34327577184720043, "grad_norm": 2.302117077284407, "learning_rate": 4.1509438117713863e-07, "logits/chosen": -1.501077651977539, "logits/rejected": -1.3392484188079834, "logps/chosen": -309.2160339355469, "logps/rejected": -321.8545227050781, "loss": 0.6254, "rewards/accuracies": 0.75, "rewards/chosen": 0.2376839518547058, "rewards/margins": 0.06350281834602356, "rewards/rejected": 0.17418113350868225, "step": 164 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.3453689167974882, "grad_norm": 2.4913035199653493, "learning_rate": 4.137151834863213e-07, "logits/chosen": -1.1273077726364136, "logits/rejected": -1.613063097000122, "logps/chosen": -256.123291015625, "logps/rejected": -258.82415771484375, "loss": 0.6475, "rewards/accuracies": 0.625, "rewards/chosen": 0.14060387015342712, "rewards/margins": 0.07706978917121887, "rewards/rejected": 0.06353408843278885, "step": 165 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.34746206174777605, "grad_norm": 2.3855642300092486, "learning_rate": 4.123272062470633e-07, "logits/chosen": -1.273988962173462, "logits/rejected": -1.4383682012557983, "logps/chosen": -263.6395568847656, "logps/rejected": -235.447509765625, "loss": 0.6366, "rewards/accuracies": 0.75, "rewards/chosen": 0.239268496632576, "rewards/margins": 0.15390436351299286, "rewards/rejected": 0.08536411076784134, "step": 166 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.34955520669806384, "grad_norm": 2.4567659168208453, "learning_rate": 4.1093052389237174e-07, "logits/chosen": -1.398369550704956, "logits/rejected": -1.7410180568695068, "logps/chosen": -395.40313720703125, "logps/rejected": -327.74468994140625, "loss": 0.6408, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2275480180978775, "rewards/margins": 0.14268644154071808, "rewards/rejected": 0.08486156910657883, "step": 167 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.3516483516483517, "grad_norm": 2.7301169001089467, "learning_rate": 4.0952521132208267e-07, "logits/chosen": -1.5941585302352905, "logits/rejected": -1.6910505294799805, "logps/chosen": -143.9154052734375, "logps/rejected": -180.61599731445312, "loss": 0.6436, "rewards/accuracies": 0.4375, "rewards/chosen": 0.1531486213207245, "rewards/margins": 0.04333385080099106, "rewards/rejected": 0.10981477051973343, "step": 168 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.35374149659863946, "grad_norm": 2.4822309155172695, "learning_rate": 4.081113438988443e-07, "logits/chosen": -1.209855556488037, "logits/rejected": -1.0931593179702759, "logps/chosen": -273.060791015625, "logps/rejected": -269.7195739746094, "loss": 0.6369, "rewards/accuracies": 0.5625, "rewards/chosen": 0.06676235795021057, "rewards/margins": 0.04176097363233566, "rewards/rejected": 0.02500138059258461, "step": 169 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.35583464154892724, "grad_norm": 2.6368099928248325, "learning_rate": 4.0668899744407567e-07, "logits/chosen": -1.6101369857788086, "logits/rejected": -1.4242777824401855, "logps/chosen": -325.4241943359375, "logps/rejected": -278.8458251953125, "loss": 0.6258, "rewards/accuracies": 0.875, "rewards/chosen": 0.2411346286535263, "rewards/margins": 0.24983572959899902, "rewards/rejected": -0.008701073937118053, "step": 170 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.3579277864992151, "grad_norm": 2.4705991469732016, "learning_rate": 4.0525824823390043e-07, "logits/chosen": -0.91307532787323, "logits/rejected": -1.0882445573806763, "logps/chosen": -254.78794860839844, "logps/rejected": -211.02835083007812, "loss": 0.6206, "rewards/accuracies": 0.75, "rewards/chosen": 0.20832128822803497, "rewards/margins": 0.16422270238399506, "rewards/rejected": 0.044098611921072006, "step": 171 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.36002093144950287, "grad_norm": 2.520783142361627, "learning_rate": 4.0381917299505686e-07, "logits/chosen": -0.9361591339111328, "logits/rejected": -1.5039886236190796, "logps/chosen": -311.29095458984375, "logps/rejected": -252.6978759765625, "loss": 0.626, "rewards/accuracies": 0.4375, "rewards/chosen": 0.1798720508813858, "rewards/margins": 0.09926888346672058, "rewards/rejected": 0.08060317486524582, "step": 172 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.3621140763997907, "grad_norm": 2.4963868354953935, "learning_rate": 4.0237184890078243e-07, "logits/chosen": -1.451729655265808, "logits/rejected": -1.4041798114776611, "logps/chosen": -284.7095642089844, "logps/rejected": -354.4183654785156, "loss": 0.6238, "rewards/accuracies": 0.6875, "rewards/chosen": 0.25954151153564453, "rewards/margins": 0.21889463067054749, "rewards/rejected": 0.04064687713980675, "step": 173 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.3642072213500785, "grad_norm": 2.485738178602117, "learning_rate": 4.00916353566676e-07, "logits/chosen": -1.2032397985458374, "logits/rejected": -1.206512689590454, "logps/chosen": -259.771240234375, "logps/rejected": -276.5047302246094, "loss": 0.6329, "rewards/accuracies": 0.5625, "rewards/chosen": 0.10867345333099365, "rewards/margins": 0.05639608949422836, "rewards/rejected": 0.052277371287345886, "step": 174 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.3663003663003663, "grad_norm": 2.557616912095427, "learning_rate": 3.994527650465352e-07, "logits/chosen": -1.6746225357055664, "logits/rejected": -1.5756711959838867, "logps/chosen": -360.1378173828125, "logps/rejected": -276.6419677734375, "loss": 0.6314, "rewards/accuracies": 0.75, "rewards/chosen": 0.1260715276002884, "rewards/margins": 0.12275249511003494, "rewards/rejected": 0.003319043666124344, "step": 175 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.3683935112506541, "grad_norm": 2.4557591851995975, "learning_rate": 3.979811618281705e-07, "logits/chosen": -1.0650980472564697, "logits/rejected": -1.21378755569458, "logps/chosen": -422.3215637207031, "logps/rejected": -372.7249755859375, "loss": 0.6135, "rewards/accuracies": 0.75, "rewards/chosen": 0.29210084676742554, "rewards/margins": 0.2728201746940613, "rewards/rejected": 0.01928069442510605, "step": 176 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.3704866562009419, "grad_norm": 2.5981844306568904, "learning_rate": 3.9650162282919654e-07, "logits/chosen": -0.9138455986976624, "logits/rejected": -1.111466884613037, "logps/chosen": -348.7144775390625, "logps/rejected": -309.4266357421875, "loss": 0.6185, "rewards/accuracies": 0.8125, "rewards/chosen": 0.19288626313209534, "rewards/margins": 0.1864483803510666, "rewards/rejected": 0.0064378827810287476, "step": 177 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.37257980115122974, "grad_norm": 2.48725938073683, "learning_rate": 3.9501422739279953e-07, "logits/chosen": -1.353095531463623, "logits/rejected": -1.6012011766433716, "logps/chosen": -212.70111083984375, "logps/rejected": -229.73806762695312, "loss": 0.6153, "rewards/accuracies": 0.875, "rewards/chosen": 0.10757704079151154, "rewards/margins": 0.14108923077583313, "rewards/rejected": -0.033512186259031296, "step": 178 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.3746729461015175, "grad_norm": 2.5554851132775327, "learning_rate": 3.935190552834828e-07, "logits/chosen": -1.155402660369873, "logits/rejected": -1.3392162322998047, "logps/chosen": -401.35491943359375, "logps/rejected": -399.68157958984375, "loss": 0.6134, "rewards/accuracies": 0.75, "rewards/chosen": 0.18621841073036194, "rewards/margins": 0.12476975470781326, "rewards/rejected": 0.06144864857196808, "step": 179 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.37676609105180536, "grad_norm": 2.413990982041016, "learning_rate": 3.920161866827889e-07, "logits/chosen": -1.6539878845214844, "logits/rejected": -1.4273403882980347, "logps/chosen": -126.3817367553711, "logps/rejected": -185.66796875, "loss": 0.6219, "rewards/accuracies": 0.75, "rewards/chosen": 0.19708161056041718, "rewards/margins": 0.14128416776657104, "rewards/rejected": 0.05579744279384613, "step": 180 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.37885923600209315, "grad_norm": 2.5134248003143935, "learning_rate": 3.90505702185e-07, "logits/chosen": -1.0900201797485352, "logits/rejected": -1.378363013267517, "logps/chosen": -362.9962158203125, "logps/rejected": -325.4007263183594, "loss": 0.6403, "rewards/accuracies": 0.8125, "rewards/chosen": 0.1960827112197876, "rewards/margins": 0.2827332615852356, "rewards/rejected": -0.0866505429148674, "step": 181 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.38095238095238093, "grad_norm": 2.514910535341247, "learning_rate": 3.889876827928156e-07, "logits/chosen": -1.486838698387146, "logits/rejected": -1.1954071521759033, "logps/chosen": -236.7154541015625, "logps/rejected": -331.57928466796875, "loss": 0.6219, "rewards/accuracies": 0.625, "rewards/chosen": 0.16283904016017914, "rewards/margins": 0.17473725974559784, "rewards/rejected": -0.011898223310709, "step": 182 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.38304552590266877, "grad_norm": 2.4027358303017627, "learning_rate": 3.874622099130087e-07, "logits/chosen": -1.1960852146148682, "logits/rejected": -1.3151079416275024, "logps/chosen": -407.71429443359375, "logps/rejected": -318.0823669433594, "loss": 0.6053, "rewards/accuracies": 0.5625, "rewards/chosen": 0.16753815114498138, "rewards/margins": 0.11795823276042938, "rewards/rejected": 0.049579918384552, "step": 183 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.38513867085295656, "grad_norm": 2.380664808056985, "learning_rate": 3.859293653520604e-07, "logits/chosen": -1.3267618417739868, "logits/rejected": -1.1700211763381958, "logps/chosen": -203.7779541015625, "logps/rejected": -258.7023620605469, "loss": 0.6231, "rewards/accuracies": 0.5, "rewards/chosen": 0.19549718499183655, "rewards/margins": 0.08279408514499664, "rewards/rejected": 0.1127031221985817, "step": 184 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.3872318158032444, "grad_norm": 2.3722500966489584, "learning_rate": 3.8438923131177237e-07, "logits/chosen": -1.1541537046432495, "logits/rejected": -1.3800581693649292, "logps/chosen": -234.66744995117188, "logps/rejected": -204.023681640625, "loss": 0.6345, "rewards/accuracies": 0.625, "rewards/chosen": 0.1438305377960205, "rewards/margins": 0.046937644481658936, "rewards/rejected": 0.09689289331436157, "step": 185 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.3893249607535322, "grad_norm": 2.5566993504033237, "learning_rate": 3.828418903848593e-07, "logits/chosen": -1.1675281524658203, "logits/rejected": -1.4031877517700195, "logps/chosen": -291.6648254394531, "logps/rejected": -301.2039794921875, "loss": 0.6225, "rewards/accuracies": 0.6875, "rewards/chosen": 0.19316133856773376, "rewards/margins": 0.15984830260276794, "rewards/rejected": 0.03331303596496582, "step": 186 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.39141810570381996, "grad_norm": 2.4978256342110883, "learning_rate": 3.812874255505191e-07, "logits/chosen": -1.1284247636795044, "logits/rejected": -1.0235272645950317, "logps/chosen": -386.06756591796875, "logps/rejected": -461.76727294921875, "loss": 0.6261, "rewards/accuracies": 0.5625, "rewards/chosen": 0.17677512764930725, "rewards/margins": 0.06438468396663666, "rewards/rejected": 0.1123904436826706, "step": 187 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.3935112506541078, "grad_norm": 2.5559341324286007, "learning_rate": 3.797259201699833e-07, "logits/chosen": -1.1615216732025146, "logits/rejected": -1.4762074947357178, "logps/chosen": -270.22637939453125, "logps/rejected": -250.18788146972656, "loss": 0.6567, "rewards/accuracies": 0.625, "rewards/chosen": 0.11873458325862885, "rewards/margins": 0.06219414249062538, "rewards/rejected": 0.05654044821858406, "step": 188 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.3956043956043956, "grad_norm": 2.2634746963653054, "learning_rate": 3.781574579820464e-07, "logits/chosen": -1.7563698291778564, "logits/rejected": -1.6744518280029297, "logps/chosen": -154.44241333007812, "logps/rejected": -154.7679901123047, "loss": 0.6498, "rewards/accuracies": 0.5, "rewards/chosen": 0.15934938192367554, "rewards/margins": 0.07699678838253021, "rewards/rejected": 0.08235260844230652, "step": 189 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.3976975405546834, "grad_norm": 2.586990973722058, "learning_rate": 3.765821230985757e-07, "logits/chosen": -1.51203453540802, "logits/rejected": -1.3744680881500244, "logps/chosen": -338.53253173828125, "logps/rejected": -342.0909118652344, "loss": 0.6007, "rewards/accuracies": 0.75, "rewards/chosen": 0.2170889973640442, "rewards/margins": 0.17495933175086975, "rewards/rejected": 0.04212965816259384, "step": 190 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.3997906855049712, "grad_norm": 2.6622488216185474, "learning_rate": 3.75e-07, "logits/chosen": -0.7891209125518799, "logits/rejected": -0.8951573371887207, "logps/chosen": -300.98089599609375, "logps/rejected": -344.1754455566406, "loss": 0.6434, "rewards/accuracies": 0.5625, "rewards/chosen": 0.12258611619472504, "rewards/margins": 0.04959294572472572, "rewards/rejected": 0.07299317419528961, "step": 191 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.40188383045525905, "grad_norm": 2.511548752259852, "learning_rate": 3.734111735307796e-07, "logits/chosen": -0.8865792155265808, "logits/rejected": -1.0144256353378296, "logps/chosen": -289.1127624511719, "logps/rejected": -309.16217041015625, "loss": 0.6268, "rewards/accuracies": 0.8125, "rewards/chosen": 0.22665242850780487, "rewards/margins": 0.18135380744934082, "rewards/rejected": 0.04529860243201256, "step": 192 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.40397697540554683, "grad_norm": 2.5684016112199948, "learning_rate": 3.7181572889485623e-07, "logits/chosen": -1.4993069171905518, "logits/rejected": -1.373354196548462, "logps/chosen": -251.82711791992188, "logps/rejected": -288.63079833984375, "loss": 0.6331, "rewards/accuracies": 0.8125, "rewards/chosen": 0.17966584861278534, "rewards/margins": 0.1652246117591858, "rewards/rejected": 0.014441237784922123, "step": 193 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.4060701203558346, "grad_norm": 2.398092380490866, "learning_rate": 3.7021375165108377e-07, "logits/chosen": -1.3061937093734741, "logits/rejected": -1.2535077333450317, "logps/chosen": -233.1542510986328, "logps/rejected": -235.06668090820312, "loss": 0.5981, "rewards/accuracies": 0.6875, "rewards/chosen": 0.23160725831985474, "rewards/margins": 0.24850605428218842, "rewards/rejected": -0.01689879409968853, "step": 194 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.40816326530612246, "grad_norm": 2.5681234460352984, "learning_rate": 3.6860532770864005e-07, "logits/chosen": -0.7629050016403198, "logits/rejected": -1.1765587329864502, "logps/chosen": -246.88026428222656, "logps/rejected": -223.17343139648438, "loss": 0.634, "rewards/accuracies": 0.6875, "rewards/chosen": 0.09434913843870163, "rewards/margins": 0.1590423285961151, "rewards/rejected": -0.06469318270683289, "step": 195 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.41025641025641024, "grad_norm": 2.4570466790512815, "learning_rate": 3.6699054332241985e-07, "logits/chosen": -1.5074723958969116, "logits/rejected": -1.427249550819397, "logps/chosen": -280.9584045410156, "logps/rejected": -274.966552734375, "loss": 0.6164, "rewards/accuracies": 0.6875, "rewards/chosen": 0.24049101769924164, "rewards/margins": 0.2488221824169159, "rewards/rejected": -0.008331160992383957, "step": 196 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.4123495552066981, "grad_norm": 2.6883351181828594, "learning_rate": 3.653694850884091e-07, "logits/chosen": -1.4056396484375, "logits/rejected": -1.2442378997802734, "logps/chosen": -267.7893981933594, "logps/rejected": -401.9318542480469, "loss": 0.6336, "rewards/accuracies": 0.4375, "rewards/chosen": 0.04651251435279846, "rewards/margins": -0.026873305439949036, "rewards/rejected": 0.0733858197927475, "step": 197 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.41444270015698587, "grad_norm": 2.597237160653232, "learning_rate": 3.6374223993904124e-07, "logits/chosen": -1.4499425888061523, "logits/rejected": -1.3498481512069702, "logps/chosen": -233.3629150390625, "logps/rejected": -344.94720458984375, "loss": 0.6305, "rewards/accuracies": 0.6875, "rewards/chosen": 0.1677456498146057, "rewards/margins": 0.17840783298015594, "rewards/rejected": -0.010662171989679337, "step": 198 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.4165358451072737, "grad_norm": 2.4499534166628623, "learning_rate": 3.621088951385353e-07, "logits/chosen": -1.1960229873657227, "logits/rejected": -1.245772361755371, "logps/chosen": -373.84033203125, "logps/rejected": -462.5028381347656, "loss": 0.6289, "rewards/accuracies": 0.6875, "rewards/chosen": 0.14031030237674713, "rewards/margins": 0.07471412420272827, "rewards/rejected": 0.06559617817401886, "step": 199 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.4186289900575615, "grad_norm": 3.183595036065602, "learning_rate": 3.604695382782159e-07, "logits/chosen": -1.11525297164917, "logits/rejected": -1.4690768718719482, "logps/chosen": -311.82623291015625, "logps/rejected": -220.60018920898438, "loss": 0.6413, "rewards/accuracies": 0.625, "rewards/chosen": 0.08638463914394379, "rewards/margins": 0.09558765590190887, "rewards/rejected": -0.009203016757965088, "step": 200 }, { "epoch": 0.4186289900575615, "eval_dpo_lambda": 0.9500001072883606, "eval_logits/chosen": -1.176053762435913, "eval_logits/rejected": -1.2955509424209595, "eval_logps/chosen": -317.1191711425781, "eval_logps/rejected": -309.9195251464844, "eval_loss": 0.6249718070030212, "eval_rewards/accuracies": 0.738095223903656, "eval_rewards/chosen": 0.1619236171245575, "eval_rewards/margins": 0.19375446438789368, "eval_rewards/rejected": -0.031830862164497375, "eval_runtime": 117.5843, "eval_samples_per_second": 17.009, "eval_steps_per_second": 0.536, "step": 200 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.4207221350078493, "grad_norm": 2.5645678898874182, "learning_rate": 3.588242572718162e-07, "logits/chosen": -1.3985207080841064, "logits/rejected": -1.5649539232254028, "logps/chosen": -338.6553039550781, "logps/rejected": -320.3418273925781, "loss": 0.6237, "rewards/accuracies": 0.625, "rewards/chosen": 0.16122041642665863, "rewards/margins": 0.18632397055625916, "rewards/rejected": -0.025103561580181122, "step": 201 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.4228152799581371, "grad_norm": 2.450495768335415, "learning_rate": 3.571731403507635e-07, "logits/chosen": -1.1729366779327393, "logits/rejected": -1.6279866695404053, "logps/chosen": -335.41973876953125, "logps/rejected": -387.6841735839844, "loss": 0.6323, "rewards/accuracies": 0.8125, "rewards/chosen": 0.22152478992938995, "rewards/margins": 0.2871818244457245, "rewards/rejected": -0.06565702706575394, "step": 202 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.4249084249084249, "grad_norm": 2.534411446907058, "learning_rate": 3.5551627605944746e-07, "logits/chosen": -1.3499069213867188, "logits/rejected": -1.6095776557922363, "logps/chosen": -308.20465087890625, "logps/rejected": -254.54412841796875, "loss": 0.6236, "rewards/accuracies": 0.75, "rewards/chosen": 0.05744122341275215, "rewards/margins": 0.19424757361412048, "rewards/rejected": -0.13680635392665863, "step": 203 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.42700156985871274, "grad_norm": 2.443318249469058, "learning_rate": 3.5385375325047163e-07, "logits/chosen": -1.2373605966567993, "logits/rejected": -1.3634225130081177, "logps/chosen": -294.68231201171875, "logps/rejected": -260.337158203125, "loss": 0.6438, "rewards/accuracies": 0.6875, "rewards/chosen": 0.01622108370065689, "rewards/margins": 0.013739176094532013, "rewards/rejected": 0.002481900155544281, "step": 204 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.4290947148090005, "grad_norm": 2.6119624868855724, "learning_rate": 3.5218566107988867e-07, "logits/chosen": -1.155225396156311, "logits/rejected": -1.1389286518096924, "logps/chosen": -334.7388000488281, "logps/rejected": -310.7633972167969, "loss": 0.6234, "rewards/accuracies": 1.0, "rewards/chosen": 0.20221556723117828, "rewards/margins": 0.34638532996177673, "rewards/rejected": -0.14416977763175964, "step": 205 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.4311878597592883, "grad_norm": 2.619920217500606, "learning_rate": 3.505120890024195e-07, "logits/chosen": -1.5947747230529785, "logits/rejected": -1.5108340978622437, "logps/chosen": -268.2635192871094, "logps/rejected": -277.8184814453125, "loss": 0.6484, "rewards/accuracies": 0.6875, "rewards/chosen": 0.07959023863077164, "rewards/margins": 0.09483392536640167, "rewards/rejected": -0.015243688598275185, "step": 206 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.43328100470957615, "grad_norm": 2.5236262975153863, "learning_rate": 3.4883312676665534e-07, "logits/chosen": -1.4129953384399414, "logits/rejected": -1.478919506072998, "logps/chosen": -356.1133117675781, "logps/rejected": -371.2951354980469, "loss": 0.6574, "rewards/accuracies": 0.6875, "rewards/chosen": 0.14316225051879883, "rewards/margins": 0.09954243153333664, "rewards/rejected": 0.04361981898546219, "step": 207 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.43537414965986393, "grad_norm": 2.5512982574064, "learning_rate": 3.4714886441024573e-07, "logits/chosen": -0.959884524345398, "logits/rejected": -1.4933385848999023, "logps/chosen": -349.9438781738281, "logps/rejected": -256.1654968261719, "loss": 0.621, "rewards/accuracies": 0.625, "rewards/chosen": 0.1415381133556366, "rewards/margins": 0.17355072498321533, "rewards/rejected": -0.03201258182525635, "step": 208 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.43746729461015177, "grad_norm": 2.491632430381176, "learning_rate": 3.454593922550693e-07, "logits/chosen": -0.8306704759597778, "logits/rejected": -1.3567924499511719, "logps/chosen": -403.4012451171875, "logps/rejected": -364.3705749511719, "loss": 0.6359, "rewards/accuracies": 0.6875, "rewards/chosen": 0.2507641017436981, "rewards/margins": 0.25432345271110535, "rewards/rejected": -0.003559347242116928, "step": 209 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.43956043956043955, "grad_norm": 2.588647708334434, "learning_rate": 3.4376480090239047e-07, "logits/chosen": -0.6872680187225342, "logits/rejected": -1.453586459159851, "logps/chosen": -435.5264587402344, "logps/rejected": -292.83709716796875, "loss": 0.6089, "rewards/accuracies": 0.6875, "rewards/chosen": 0.13921087980270386, "rewards/margins": 0.22478558123111725, "rewards/rejected": -0.08557470142841339, "step": 210 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.4416535845107274, "grad_norm": 2.519253275048204, "learning_rate": 3.4206518122800055e-07, "logits/chosen": -1.4051663875579834, "logits/rejected": -1.9473254680633545, "logps/chosen": -278.544189453125, "logps/rejected": -257.1572265625, "loss": 0.6323, "rewards/accuracies": 0.8125, "rewards/chosen": 0.12254437059164047, "rewards/margins": 0.22737647593021393, "rewards/rejected": -0.10483211278915405, "step": 211 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.4437467294610152, "grad_norm": 2.375297108610387, "learning_rate": 3.403606243773448e-07, "logits/chosen": -1.268899917602539, "logits/rejected": -1.3240399360656738, "logps/chosen": -246.43350219726562, "logps/rejected": -242.434814453125, "loss": 0.6428, "rewards/accuracies": 0.4375, "rewards/chosen": -0.025858985260128975, "rewards/margins": -0.03879370540380478, "rewards/rejected": 0.012934722006320953, "step": 212 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.44583987441130296, "grad_norm": 2.662540724084386, "learning_rate": 3.3865122176063385e-07, "logits/chosen": -1.2308666706085205, "logits/rejected": -1.2196345329284668, "logps/chosen": -274.77569580078125, "logps/rejected": -347.3487548828125, "loss": 0.6221, "rewards/accuracies": 0.6875, "rewards/chosen": 0.03683917224407196, "rewards/margins": 0.18827465176582336, "rewards/rejected": -0.1514354646205902, "step": 213 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.4479330193615908, "grad_norm": 2.7413521383683155, "learning_rate": 3.3693706504794243e-07, "logits/chosen": -1.5986062288284302, "logits/rejected": -1.5727263689041138, "logps/chosen": -254.8701934814453, "logps/rejected": -306.6053466796875, "loss": 0.623, "rewards/accuracies": 0.75, "rewards/chosen": 0.1537170708179474, "rewards/margins": 0.21143585443496704, "rewards/rejected": -0.05771879851818085, "step": 214 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.4500261643118786, "grad_norm": 2.605976892359232, "learning_rate": 3.3521824616429284e-07, "logits/chosen": -1.2375333309173584, "logits/rejected": -1.381260633468628, "logps/chosen": -221.16017150878906, "logps/rejected": -286.5414123535156, "loss": 0.629, "rewards/accuracies": 0.6875, "rewards/chosen": 0.14493045210838318, "rewards/margins": 0.20082828402519226, "rewards/rejected": -0.05589780956506729, "step": 215 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.4521193092621664, "grad_norm": 2.51553883265307, "learning_rate": 3.334948572847253e-07, "logits/chosen": -1.422798991203308, "logits/rejected": -1.1554739475250244, "logps/chosen": -152.12472534179688, "logps/rejected": -257.127685546875, "loss": 0.6139, "rewards/accuracies": 0.8125, "rewards/chosen": 0.11727073043584824, "rewards/margins": 0.2626047432422638, "rewards/rejected": -0.14533402025699615, "step": 216 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.4542124542124542, "grad_norm": 2.7470902061009475, "learning_rate": 3.317669908293554e-07, "logits/chosen": -1.5382193326950073, "logits/rejected": -1.4770201444625854, "logps/chosen": -283.92138671875, "logps/rejected": -317.8141784667969, "loss": 0.639, "rewards/accuracies": 0.6875, "rewards/chosen": 0.1118953675031662, "rewards/margins": 0.19783765077590942, "rewards/rejected": -0.08594229072332382, "step": 217 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.456305599162742, "grad_norm": 2.511724616075412, "learning_rate": 3.300347394584172e-07, "logits/chosen": -0.9787873029708862, "logits/rejected": -1.4981741905212402, "logps/chosen": -323.4879455566406, "logps/rejected": -268.7342529296875, "loss": 0.6121, "rewards/accuracies": 0.6875, "rewards/chosen": 0.1377992331981659, "rewards/margins": 0.1823887676000595, "rewards/rejected": -0.044589538127183914, "step": 218 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.45839874411302983, "grad_norm": 2.5737048299378884, "learning_rate": 3.2829819606729477e-07, "logits/chosen": -0.9151563048362732, "logits/rejected": -1.1868966817855835, "logps/chosen": -296.09576416015625, "logps/rejected": -298.5234375, "loss": 0.6138, "rewards/accuracies": 0.5625, "rewards/chosen": 0.19875109195709229, "rewards/margins": 0.1635432243347168, "rewards/rejected": 0.035207852721214294, "step": 219 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.4604918890633176, "grad_norm": 2.5562361806695275, "learning_rate": 3.265574537815398e-07, "logits/chosen": -1.242297649383545, "logits/rejected": -1.205316185951233, "logps/chosen": -263.7002258300781, "logps/rejected": -261.51409912109375, "loss": 0.6106, "rewards/accuracies": 0.75, "rewards/chosen": 0.07027896493673325, "rewards/margins": 0.12756158411502838, "rewards/rejected": -0.05728260427713394, "step": 220 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.46258503401360546, "grad_norm": 2.6750213795635456, "learning_rate": 3.248126059518784e-07, "logits/chosen": -0.9967190027236938, "logits/rejected": -1.4355674982070923, "logps/chosen": -358.86138916015625, "logps/rejected": -283.48583984375, "loss": 0.6057, "rewards/accuracies": 0.75, "rewards/chosen": 0.19496701657772064, "rewards/margins": 0.297015905380249, "rewards/rejected": -0.10204888880252838, "step": 221 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.46467817896389324, "grad_norm": 2.56988855896997, "learning_rate": 3.230637461492043e-07, "logits/chosen": -0.8802845478057861, "logits/rejected": -0.8851996064186096, "logps/chosen": -303.05133056640625, "logps/rejected": -308.7895202636719, "loss": 0.6171, "rewards/accuracies": 0.75, "rewards/chosen": 0.15049469470977783, "rewards/margins": 0.1633467972278595, "rewards/rejected": -0.012852095067501068, "step": 222 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.4667713239141811, "grad_norm": 2.659155993552341, "learning_rate": 3.213109681595612e-07, "logits/chosen": -0.9984061121940613, "logits/rejected": -1.2433918714523315, "logps/chosen": -315.7237548828125, "logps/rejected": -270.2972106933594, "loss": 0.6166, "rewards/accuracies": 0.6875, "rewards/chosen": 0.12128496915102005, "rewards/margins": 0.13776874542236328, "rewards/rejected": -0.016483768820762634, "step": 223 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.46886446886446886, "grad_norm": 2.82483921889064, "learning_rate": 3.1955436597911315e-07, "logits/chosen": -1.338395595550537, "logits/rejected": -1.5943886041641235, "logps/chosen": -250.51109313964844, "logps/rejected": -235.90579223632812, "loss": 0.6225, "rewards/accuracies": 0.875, "rewards/chosen": 0.18126514554023743, "rewards/margins": 0.2962171137332916, "rewards/rejected": -0.1149519681930542, "step": 224 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.47095761381475665, "grad_norm": 2.6127462809638438, "learning_rate": 3.1779403380910425e-07, "logits/chosen": -1.6047933101654053, "logits/rejected": -1.3916809558868408, "logps/chosen": -290.03045654296875, "logps/rejected": -414.2421875, "loss": 0.6153, "rewards/accuracies": 0.75, "rewards/chosen": 0.12157540023326874, "rewards/margins": 0.19476866722106934, "rewards/rejected": -0.0731932744383812, "step": 225 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.4730507587650445, "grad_norm": 2.508987719131239, "learning_rate": 3.160300660508064e-07, "logits/chosen": -1.369289755821228, "logits/rejected": -1.884090542793274, "logps/chosen": -159.00723266601562, "logps/rejected": -139.6253662109375, "loss": 0.6173, "rewards/accuracies": 0.75, "rewards/chosen": 0.1623269021511078, "rewards/margins": 0.19411006569862366, "rewards/rejected": -0.03178314119577408, "step": 226 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.47514390371533227, "grad_norm": 2.7778339672385144, "learning_rate": 3.1426255730045695e-07, "logits/chosen": -1.2404892444610596, "logits/rejected": -1.1408684253692627, "logps/chosen": -218.57252502441406, "logps/rejected": -379.2306823730469, "loss": 0.6343, "rewards/accuracies": 0.75, "rewards/chosen": 0.03034341335296631, "rewards/margins": 0.3198496401309967, "rewards/rejected": -0.289506196975708, "step": 227 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.4772370486656201, "grad_norm": 2.820416590294311, "learning_rate": 3.1249160234418644e-07, "logits/chosen": -1.0147160291671753, "logits/rejected": -0.9541725516319275, "logps/chosen": -241.65599060058594, "logps/rejected": -248.71047973632812, "loss": 0.6467, "rewards/accuracies": 0.875, "rewards/chosen": 0.14148034155368805, "rewards/margins": 0.2784053385257721, "rewards/rejected": -0.13692499697208405, "step": 228 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.4793301936159079, "grad_norm": 2.64118754198273, "learning_rate": 3.1071729615293424e-07, "logits/chosen": -0.9893019199371338, "logits/rejected": -1.5328551530838013, "logps/chosen": -294.81671142578125, "logps/rejected": -290.1551208496094, "loss": 0.6203, "rewards/accuracies": 0.8125, "rewards/chosen": 0.12497810274362564, "rewards/margins": 0.24076685309410095, "rewards/rejected": -0.11578874289989471, "step": 229 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.48142333856619574, "grad_norm": 2.6274753613455455, "learning_rate": 3.0893973387735683e-07, "logits/chosen": -1.0491061210632324, "logits/rejected": -1.4888513088226318, "logps/chosen": -379.806640625, "logps/rejected": -281.98065185546875, "loss": 0.607, "rewards/accuracies": 0.75, "rewards/chosen": 0.1855032742023468, "rewards/margins": 0.30984121561050415, "rewards/rejected": -0.12433796375989914, "step": 230 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.4835164835164835, "grad_norm": 2.576068591056347, "learning_rate": 3.071590108427243e-07, "logits/chosen": -1.8507444858551025, "logits/rejected": -1.5297131538391113, "logps/chosen": -264.462646484375, "logps/rejected": -315.08648681640625, "loss": 0.6038, "rewards/accuracies": 0.5625, "rewards/chosen": 0.10969758033752441, "rewards/margins": 0.21770010888576508, "rewards/rejected": -0.10800251364707947, "step": 231 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.4856096284667713, "grad_norm": 2.6536226852247102, "learning_rate": 3.05375222543809e-07, "logits/chosen": -1.2449592351913452, "logits/rejected": -1.0192017555236816, "logps/chosen": -313.2269592285156, "logps/rejected": -319.3896179199219, "loss": 0.6136, "rewards/accuracies": 0.625, "rewards/chosen": 0.135199174284935, "rewards/margins": 0.07153313606977463, "rewards/rejected": 0.06366601586341858, "step": 232 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.48770277341705914, "grad_norm": 2.7705407455118753, "learning_rate": 3.035884646397637e-07, "logits/chosen": -1.5836330652236938, "logits/rejected": -1.54354989528656, "logps/chosen": -294.09136962890625, "logps/rejected": -314.0255126953125, "loss": 0.6184, "rewards/accuracies": 0.5, "rewards/chosen": 0.11818283796310425, "rewards/margins": 0.1459788680076599, "rewards/rejected": -0.027796020731329918, "step": 233 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.4897959183673469, "grad_norm": 2.5715791482732904, "learning_rate": 3.017988329489923e-07, "logits/chosen": -1.0522046089172363, "logits/rejected": -1.4236711263656616, "logps/chosen": -391.730224609375, "logps/rejected": -310.12298583984375, "loss": 0.5724, "rewards/accuracies": 0.75, "rewards/chosen": 0.1908198595046997, "rewards/margins": 0.2755054533481598, "rewards/rejected": -0.08468559384346008, "step": 234 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.49188906331763477, "grad_norm": 2.70477263683191, "learning_rate": 3.000064234440111e-07, "logits/chosen": -1.526976466178894, "logits/rejected": -1.944162130355835, "logps/chosen": -252.0861358642578, "logps/rejected": -227.17788696289062, "loss": 0.619, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0542568564414978, "rewards/margins": 0.07527622580528259, "rewards/rejected": -0.02101937308907509, "step": 235 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.49398220826792255, "grad_norm": 2.7067887412292913, "learning_rate": 2.9821133224630223e-07, "logits/chosen": -0.8284770250320435, "logits/rejected": -1.35467529296875, "logps/chosen": -366.1927185058594, "logps/rejected": -352.7197265625, "loss": 0.6171, "rewards/accuracies": 0.75, "rewards/chosen": 0.1724318563938141, "rewards/margins": 0.2694428563117981, "rewards/rejected": -0.09701099991798401, "step": 236 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.49607535321821034, "grad_norm": 2.703282002872591, "learning_rate": 2.964136556211588e-07, "logits/chosen": -1.6709789037704468, "logits/rejected": -1.792007565498352, "logps/chosen": -274.9388427734375, "logps/rejected": -350.2506103515625, "loss": 0.6046, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0776035413146019, "rewards/margins": 0.2291201502084732, "rewards/rejected": -0.1515166014432907, "step": 237 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.4981684981684982, "grad_norm": 2.779484898259592, "learning_rate": 2.946134899725226e-07, "logits/chosen": -0.9175939559936523, "logits/rejected": -1.2100294828414917, "logps/chosen": -321.0260314941406, "logps/rejected": -246.68536376953125, "loss": 0.626, "rewards/accuracies": 0.75, "rewards/chosen": 0.15614421665668488, "rewards/margins": 0.28551387786865234, "rewards/rejected": -0.12936967611312866, "step": 238 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.500261643118786, "grad_norm": 2.3506266334154304, "learning_rate": 2.9281093183781403e-07, "logits/chosen": -1.3832676410675049, "logits/rejected": -1.5731139183044434, "logps/chosen": -275.0372314453125, "logps/rejected": -305.38031005859375, "loss": 0.6146, "rewards/accuracies": 0.6875, "rewards/chosen": 0.07807444036006927, "rewards/margins": 0.17999014258384705, "rewards/rejected": -0.10191572457551956, "step": 239 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.5023547880690737, "grad_norm": 2.617602632105344, "learning_rate": 2.910060778827554e-07, "logits/chosen": -1.0989478826522827, "logits/rejected": -0.8966367244720459, "logps/chosen": -313.17144775390625, "logps/rejected": -380.89862060546875, "loss": 0.6018, "rewards/accuracies": 0.5625, "rewards/chosen": 0.05261870473623276, "rewards/margins": 0.11077626049518585, "rewards/rejected": -0.05815756320953369, "step": 240 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.5044479330193616, "grad_norm": 2.749255452007427, "learning_rate": 2.891990248961871e-07, "logits/chosen": -1.3736119270324707, "logits/rejected": -1.7840209007263184, "logps/chosen": -372.358154296875, "logps/rejected": -350.45721435546875, "loss": 0.615, "rewards/accuracies": 0.8125, "rewards/chosen": 0.10940291732549667, "rewards/margins": 0.2012423574924469, "rewards/rejected": -0.09183944016695023, "step": 241 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.5065410779696494, "grad_norm": 2.68400217873059, "learning_rate": 2.873898697848762e-07, "logits/chosen": -1.549246072769165, "logits/rejected": -1.7415339946746826, "logps/chosen": -311.4115295410156, "logps/rejected": -254.94070434570312, "loss": 0.5929, "rewards/accuracies": 0.6875, "rewards/chosen": 0.03989606350660324, "rewards/margins": 0.19907906651496887, "rewards/rejected": -0.15918299555778503, "step": 242 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.5086342229199372, "grad_norm": 2.7483526806775527, "learning_rate": 2.8557870956832133e-07, "logits/chosen": -1.340730905532837, "logits/rejected": -1.5479342937469482, "logps/chosen": -370.501708984375, "logps/rejected": -399.2262878417969, "loss": 0.629, "rewards/accuracies": 0.8125, "rewards/chosen": 0.16452787816524506, "rewards/margins": 0.27565494179725647, "rewards/rejected": -0.11112706363201141, "step": 243 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.510727367870225, "grad_norm": 2.6816250672192696, "learning_rate": 2.837656413735479e-07, "logits/chosen": -1.2477527856826782, "logits/rejected": -1.3629240989685059, "logps/chosen": -444.814208984375, "logps/rejected": -408.3481140136719, "loss": 0.5894, "rewards/accuracies": 0.875, "rewards/chosen": 0.18180187046527863, "rewards/margins": 0.3771238923072815, "rewards/rejected": -0.19532200694084167, "step": 244 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.5128205128205128, "grad_norm": 2.5526969338481993, "learning_rate": 2.8195076242990116e-07, "logits/chosen": -1.1564173698425293, "logits/rejected": -1.7547028064727783, "logps/chosen": -270.09161376953125, "logps/rejected": -209.7252197265625, "loss": 0.6067, "rewards/accuracies": 0.6875, "rewards/chosen": 0.08461175113916397, "rewards/margins": 0.2116965353488922, "rewards/rejected": -0.12708479166030884, "step": 245 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.5149136577708007, "grad_norm": 2.7172180459730098, "learning_rate": 2.801341700638307e-07, "logits/chosen": -1.208418369293213, "logits/rejected": -1.0779743194580078, "logps/chosen": -327.49298095703125, "logps/rejected": -318.8200988769531, "loss": 0.623, "rewards/accuracies": 0.6875, "rewards/chosen": 0.07853143662214279, "rewards/margins": 0.35405662655830383, "rewards/rejected": -0.27552518248558044, "step": 246 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.5170068027210885, "grad_norm": 3.081546760428069, "learning_rate": 2.7831596169367227e-07, "logits/chosen": -1.0276368856430054, "logits/rejected": -1.2528905868530273, "logps/chosen": -281.01898193359375, "logps/rejected": -274.3391418457031, "loss": 0.6071, "rewards/accuracies": 0.8125, "rewards/chosen": 0.21969196200370789, "rewards/margins": 0.24800124764442444, "rewards/rejected": -0.02830929309129715, "step": 247 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.5190999476713762, "grad_norm": 2.7093653613142186, "learning_rate": 2.7649623482442274e-07, "logits/chosen": -1.0049794912338257, "logits/rejected": -1.492990493774414, "logps/chosen": -237.46295166015625, "logps/rejected": -295.78631591796875, "loss": 0.6186, "rewards/accuracies": 0.75, "rewards/chosen": 0.05765185505151749, "rewards/margins": 0.3115951418876648, "rewards/rejected": -0.2539432644844055, "step": 248 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.521193092621664, "grad_norm": 2.5983707886004126, "learning_rate": 2.7467508704251135e-07, "logits/chosen": -1.2649043798446655, "logits/rejected": -1.2697545289993286, "logps/chosen": -264.38970947265625, "logps/rejected": -288.0372009277344, "loss": 0.6052, "rewards/accuracies": 0.625, "rewards/chosen": 0.11182260513305664, "rewards/margins": 0.28603774309158325, "rewards/rejected": -0.17421512305736542, "step": 249 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.5232862375719518, "grad_norm": 2.75621820288515, "learning_rate": 2.7285261601056697e-07, "logits/chosen": -0.9653730392456055, "logits/rejected": -1.2730536460876465, "logps/chosen": -259.1565856933594, "logps/rejected": -230.4204559326172, "loss": 0.6069, "rewards/accuracies": 0.625, "rewards/chosen": 0.05514091998338699, "rewards/margins": 0.22837483882904053, "rewards/rejected": -0.17323392629623413, "step": 250 }, { "epoch": 0.5232862375719518, "eval_dpo_lambda": 0.9500001072883606, "eval_logits/chosen": -1.169466495513916, "eval_logits/rejected": -1.282702922821045, "eval_logps/chosen": -324.45379638671875, "eval_logps/rejected": -323.57830810546875, "eval_loss": 0.6113972067832947, "eval_rewards/accuracies": 0.7301587462425232, "eval_rewards/chosen": 0.08857683092355728, "eval_rewards/margins": 0.2569958567619324, "eval_rewards/rejected": -0.1684190183877945, "eval_runtime": 125.8598, "eval_samples_per_second": 15.891, "eval_steps_per_second": 0.501, "step": 250 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.5253793825222397, "grad_norm": 2.7469100605175525, "learning_rate": 2.7102891946217994e-07, "logits/chosen": -1.287118673324585, "logits/rejected": -1.341564416885376, "logps/chosen": -256.66265869140625, "logps/rejected": -301.3120422363281, "loss": 0.5911, "rewards/accuracies": 0.6875, "rewards/chosen": 0.08034676313400269, "rewards/margins": 0.29956135153770447, "rewards/rejected": -0.21921458840370178, "step": 251 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.5274725274725275, "grad_norm": 2.658381111233549, "learning_rate": 2.692040951966617e-07, "logits/chosen": -1.7827293872833252, "logits/rejected": -1.713936448097229, "logps/chosen": -196.93814086914062, "logps/rejected": -303.04632568359375, "loss": 0.6207, "rewards/accuracies": 0.6875, "rewards/chosen": 0.1290343701839447, "rewards/margins": 0.12903884053230286, "rewards/rejected": -4.453584551811218e-06, "step": 252 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.5295656724228153, "grad_norm": 2.7678225167412713, "learning_rate": 2.6737824107379947e-07, "logits/chosen": -0.9162348508834839, "logits/rejected": -1.192211627960205, "logps/chosen": -355.7130432128906, "logps/rejected": -290.8765563964844, "loss": 0.5958, "rewards/accuracies": 0.8125, "rewards/chosen": 0.11847563087940216, "rewards/margins": 0.340948224067688, "rewards/rejected": -0.22247257828712463, "step": 253 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.531658817373103, "grad_norm": 2.695595205481752, "learning_rate": 2.655514550086086e-07, "logits/chosen": -1.2251895666122437, "logits/rejected": -1.0986230373382568, "logps/chosen": -378.72503662109375, "logps/rejected": -344.495361328125, "loss": 0.627, "rewards/accuracies": 0.6875, "rewards/chosen": 0.19855576753616333, "rewards/margins": 0.36231300234794617, "rewards/rejected": -0.16375723481178284, "step": 254 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.533751962323391, "grad_norm": 2.744946156645301, "learning_rate": 2.6372383496608186e-07, "logits/chosen": -1.0925184488296509, "logits/rejected": -1.293321132659912, "logps/chosen": -235.58290100097656, "logps/rejected": -231.56219482421875, "loss": 0.6047, "rewards/accuracies": 0.625, "rewards/chosen": 0.050735145807266235, "rewards/margins": 0.10960999131202698, "rewards/rejected": -0.05887485295534134, "step": 255 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.5358451072736787, "grad_norm": 2.695403293665629, "learning_rate": 2.618954789559356e-07, "logits/chosen": -1.2986500263214111, "logits/rejected": -1.5851117372512817, "logps/chosen": -374.7220458984375, "logps/rejected": -282.9933166503906, "loss": 0.6167, "rewards/accuracies": 0.5625, "rewards/chosen": 0.136151984333992, "rewards/margins": 0.2535018026828766, "rewards/rejected": -0.11734981834888458, "step": 256 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.5379382522239665, "grad_norm": 3.1144507427199626, "learning_rate": 2.600664850273538e-07, "logits/chosen": -1.25803542137146, "logits/rejected": -1.72632896900177, "logps/chosen": -253.3446807861328, "logps/rejected": -258.0625, "loss": 0.5856, "rewards/accuracies": 0.75, "rewards/chosen": 0.12793627381324768, "rewards/margins": 0.3228214979171753, "rewards/rejected": -0.19488520920276642, "step": 257 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.5400313971742543, "grad_norm": 2.8924788660472847, "learning_rate": 2.582369512637302e-07, "logits/chosen": -1.4959533214569092, "logits/rejected": -1.4483211040496826, "logps/chosen": -230.9194793701172, "logps/rejected": -300.3072509765625, "loss": 0.608, "rewards/accuracies": 0.5625, "rewards/chosen": -0.20188772678375244, "rewards/margins": 0.07079000771045685, "rewards/rejected": -0.2726777195930481, "step": 258 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.5421245421245421, "grad_norm": 2.8732053840757095, "learning_rate": 2.5640697577740815e-07, "logits/chosen": -1.3412814140319824, "logits/rejected": -1.3204525709152222, "logps/chosen": -337.7987060546875, "logps/rejected": -348.4739990234375, "loss": 0.6232, "rewards/accuracies": 0.75, "rewards/chosen": 0.13963356614112854, "rewards/margins": 0.37431785464286804, "rewards/rejected": -0.2346842736005783, "step": 259 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.54421768707483, "grad_norm": 2.7123468222412828, "learning_rate": 2.5457665670441937e-07, "logits/chosen": -1.274739384651184, "logits/rejected": -1.1421024799346924, "logps/chosen": -333.603271484375, "logps/rejected": -337.48095703125, "loss": 0.6189, "rewards/accuracies": 0.5625, "rewards/chosen": 0.01722700148820877, "rewards/margins": 0.0855383574962616, "rewards/rejected": -0.06831135600805283, "step": 260 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.5463108320251178, "grad_norm": 2.8380337146390517, "learning_rate": 2.527460921992209e-07, "logits/chosen": -1.3387187719345093, "logits/rejected": -1.3726869821548462, "logps/chosen": -351.8864440917969, "logps/rejected": -425.67852783203125, "loss": 0.604, "rewards/accuracies": 0.75, "rewards/chosen": 0.19371047616004944, "rewards/margins": 0.3303050398826599, "rewards/rejected": -0.13659457862377167, "step": 261 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.5484039769754055, "grad_norm": 3.0890215918439363, "learning_rate": 2.509153804294318e-07, "logits/chosen": -1.0819432735443115, "logits/rejected": -1.134494662284851, "logps/chosen": -398.4555969238281, "logps/rejected": -353.40777587890625, "loss": 0.6021, "rewards/accuracies": 0.75, "rewards/chosen": 0.08512923866510391, "rewards/margins": 0.3029916286468506, "rewards/rejected": -0.21786239743232727, "step": 262 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.5504971219256933, "grad_norm": 2.7873478294965506, "learning_rate": 2.4908461957056825e-07, "logits/chosen": -0.8926244974136353, "logits/rejected": -1.3792859315872192, "logps/chosen": -258.2203063964844, "logps/rejected": -252.16165161132812, "loss": 0.59, "rewards/accuracies": 0.9375, "rewards/chosen": 0.102707639336586, "rewards/margins": 0.44128042459487915, "rewards/rejected": -0.33857277035713196, "step": 263 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.5525902668759811, "grad_norm": 3.025318744241963, "learning_rate": 2.4725390780077905e-07, "logits/chosen": -1.0822410583496094, "logits/rejected": -1.074449062347412, "logps/chosen": -311.35546875, "logps/rejected": -332.9521179199219, "loss": 0.6107, "rewards/accuracies": 0.625, "rewards/chosen": 0.09198248386383057, "rewards/margins": 0.1788451075553894, "rewards/rejected": -0.08686259388923645, "step": 264 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.554683411826269, "grad_norm": 2.6259736891973837, "learning_rate": 2.454233432955807e-07, "logits/chosen": -1.623409628868103, "logits/rejected": -1.8341429233551025, "logps/chosen": -297.4691162109375, "logps/rejected": -362.90460205078125, "loss": 0.6095, "rewards/accuracies": 0.875, "rewards/chosen": 0.2575242519378662, "rewards/margins": 0.49057164788246155, "rewards/rejected": -0.23304738104343414, "step": 265 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.5567765567765568, "grad_norm": 2.826290781251233, "learning_rate": 2.435930242225919e-07, "logits/chosen": -1.2750158309936523, "logits/rejected": -1.349815011024475, "logps/chosen": -336.7392272949219, "logps/rejected": -353.5858459472656, "loss": 0.5984, "rewards/accuracies": 0.5625, "rewards/chosen": 0.08481253683567047, "rewards/margins": 0.1936895102262497, "rewards/rejected": -0.10887696593999863, "step": 266 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.5588697017268446, "grad_norm": 2.7627596385187836, "learning_rate": 2.4176304873626984e-07, "logits/chosen": -1.0803136825561523, "logits/rejected": -1.0669090747833252, "logps/chosen": -366.21331787109375, "logps/rejected": -350.44940185546875, "loss": 0.6056, "rewards/accuracies": 0.75, "rewards/chosen": 0.19493043422698975, "rewards/margins": 0.39818888902664185, "rewards/rejected": -0.2032584697008133, "step": 267 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.5609628466771324, "grad_norm": 2.9810489343910285, "learning_rate": 2.399335149726463e-07, "logits/chosen": -1.6511884927749634, "logits/rejected": -1.6679778099060059, "logps/chosen": -284.0511779785156, "logps/rejected": -264.1990966796875, "loss": 0.6205, "rewards/accuracies": 0.625, "rewards/chosen": -0.011040575802326202, "rewards/margins": 0.15094222128391266, "rewards/rejected": -0.16198278963565826, "step": 268 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.5630559916274201, "grad_norm": 2.8319824934223767, "learning_rate": 2.381045210440644e-07, "logits/chosen": -1.3782899379730225, "logits/rejected": -1.557348608970642, "logps/chosen": -331.06158447265625, "logps/rejected": -259.7497863769531, "loss": 0.629, "rewards/accuracies": 0.75, "rewards/chosen": -0.0025225766003131866, "rewards/margins": 0.24256059527397156, "rewards/rejected": -0.24508318305015564, "step": 269 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.565149136577708, "grad_norm": 2.7808751179941713, "learning_rate": 2.3627616503391812e-07, "logits/chosen": -1.1129744052886963, "logits/rejected": -1.4809553623199463, "logps/chosen": -286.2938537597656, "logps/rejected": -333.546875, "loss": 0.5827, "rewards/accuracies": 0.625, "rewards/chosen": 0.062207940965890884, "rewards/margins": 0.2534541189670563, "rewards/rejected": -0.1912461817264557, "step": 270 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.5672422815279958, "grad_norm": 2.6085078034044966, "learning_rate": 2.344485449913914e-07, "logits/chosen": -0.8759198784828186, "logits/rejected": -1.1389330625534058, "logps/chosen": -319.16741943359375, "logps/rejected": -206.53955078125, "loss": 0.5949, "rewards/accuracies": 0.6875, "rewards/chosen": 0.04166262969374657, "rewards/margins": 0.20365414023399353, "rewards/rejected": -0.16199150681495667, "step": 271 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.5693354264782836, "grad_norm": 2.899432127704415, "learning_rate": 2.3262175892620062e-07, "logits/chosen": -1.5146594047546387, "logits/rejected": -1.6545236110687256, "logps/chosen": -344.39935302734375, "logps/rejected": -291.72625732421875, "loss": 0.6171, "rewards/accuracies": 0.75, "rewards/chosen": -0.05655979365110397, "rewards/margins": 0.16514280438423157, "rewards/rejected": -0.22170260548591614, "step": 272 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.5714285714285714, "grad_norm": 2.8469607608820082, "learning_rate": 2.3079590480333827e-07, "logits/chosen": -1.249172568321228, "logits/rejected": -1.7167844772338867, "logps/chosen": -327.19403076171875, "logps/rejected": -247.78785705566406, "loss": 0.6344, "rewards/accuracies": 0.5625, "rewards/chosen": -0.12325453013181686, "rewards/margins": -0.02075212448835373, "rewards/rejected": -0.10250239074230194, "step": 273 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.5735217163788592, "grad_norm": 2.854286222852487, "learning_rate": 2.2897108053782e-07, "logits/chosen": -1.3627867698669434, "logits/rejected": -1.4201370477676392, "logps/chosen": -363.587890625, "logps/rejected": -409.68084716796875, "loss": 0.6287, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0064238253980875015, "rewards/margins": 0.2309945523738861, "rewards/rejected": -0.22457075119018555, "step": 274 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.5756148613291471, "grad_norm": 2.8301369031771424, "learning_rate": 2.2714738398943308e-07, "logits/chosen": -1.2416610717773438, "logits/rejected": -1.6580586433410645, "logps/chosen": -312.631103515625, "logps/rejected": -316.0438232421875, "loss": 0.5801, "rewards/accuracies": 0.6875, "rewards/chosen": 0.06473555415868759, "rewards/margins": 0.3742232322692871, "rewards/rejected": -0.3094877004623413, "step": 275 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.5777080062794349, "grad_norm": 3.0753262987958507, "learning_rate": 2.2532491295748865e-07, "logits/chosen": -2.070075035095215, "logits/rejected": -2.3812270164489746, "logps/chosen": -204.93057250976562, "logps/rejected": -280.10430908203125, "loss": 0.6287, "rewards/accuracies": 0.75, "rewards/chosen": 0.045294389128685, "rewards/margins": 0.29399359226226807, "rewards/rejected": -0.24869920313358307, "step": 276 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.5798011512297226, "grad_norm": 3.141489921964011, "learning_rate": 2.2350376517557726e-07, "logits/chosen": -1.4304625988006592, "logits/rejected": -1.3248302936553955, "logps/chosen": -146.6204376220703, "logps/rejected": -236.08685302734375, "loss": 0.6233, "rewards/accuracies": 0.875, "rewards/chosen": 0.15948548913002014, "rewards/margins": 0.3558812141418457, "rewards/rejected": -0.19639569520950317, "step": 277 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.5818942961800104, "grad_norm": 2.7453332021121577, "learning_rate": 2.2168403830632769e-07, "logits/chosen": -1.6977075338363647, "logits/rejected": -1.807907223701477, "logps/chosen": -273.19189453125, "logps/rejected": -271.668701171875, "loss": 0.5969, "rewards/accuracies": 0.875, "rewards/chosen": 0.08268200606107712, "rewards/margins": 0.1804083287715912, "rewards/rejected": -0.09772632271051407, "step": 278 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.5839874411302983, "grad_norm": 2.9886130638331165, "learning_rate": 2.1986582993616925e-07, "logits/chosen": -1.3371131420135498, "logits/rejected": -1.3413515090942383, "logps/chosen": -290.02288818359375, "logps/rejected": -360.6324462890625, "loss": 0.591, "rewards/accuracies": 0.6875, "rewards/chosen": -0.04492471367120743, "rewards/margins": 0.27273786067962646, "rewards/rejected": -0.3176625370979309, "step": 279 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.5860805860805861, "grad_norm": 2.9902852437434064, "learning_rate": 2.1804923757009882e-07, "logits/chosen": -1.3496345281600952, "logits/rejected": -1.67328679561615, "logps/chosen": -355.2178649902344, "logps/rejected": -335.02655029296875, "loss": 0.6125, "rewards/accuracies": 0.5, "rewards/chosen": -0.0014897063374519348, "rewards/margins": 0.1599361002445221, "rewards/rejected": -0.16142579913139343, "step": 280 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.5881737310308739, "grad_norm": 3.1395920164918665, "learning_rate": 2.1623435862645205e-07, "logits/chosen": -1.8589756488800049, "logits/rejected": -1.5944573879241943, "logps/chosen": -180.1637420654297, "logps/rejected": -375.59124755859375, "loss": 0.5855, "rewards/accuracies": 0.6875, "rewards/chosen": -0.03851357847452164, "rewards/margins": 0.24672767519950867, "rewards/rejected": -0.2852412760257721, "step": 281 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.5902668759811617, "grad_norm": 3.3402645198134984, "learning_rate": 2.1442129043167873e-07, "logits/chosen": -1.610151767730713, "logits/rejected": -1.7864381074905396, "logps/chosen": -331.7604064941406, "logps/rejected": -370.580078125, "loss": 0.6236, "rewards/accuracies": 0.625, "rewards/chosen": -0.1412406712770462, "rewards/margins": 0.17880569398403168, "rewards/rejected": -0.3200463652610779, "step": 282 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.5923600209314495, "grad_norm": 2.722309366353571, "learning_rate": 2.1261013021512378e-07, "logits/chosen": -1.1649731397628784, "logits/rejected": -1.010484218597412, "logps/chosen": -349.72430419921875, "logps/rejected": -354.2286071777344, "loss": 0.5918, "rewards/accuracies": 0.75, "rewards/chosen": -0.012704163789749146, "rewards/margins": 0.2764449119567871, "rewards/rejected": -0.28914907574653625, "step": 283 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.5944531658817374, "grad_norm": 2.8449741657109002, "learning_rate": 2.1080097510381294e-07, "logits/chosen": -1.2498743534088135, "logits/rejected": -1.2238713502883911, "logps/chosen": -277.5393371582031, "logps/rejected": -279.5948791503906, "loss": 0.6112, "rewards/accuracies": 0.8125, "rewards/chosen": 0.015373140573501587, "rewards/margins": 0.2826661765575409, "rewards/rejected": -0.2672930359840393, "step": 284 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.5965463108320251, "grad_norm": 2.8208418524999197, "learning_rate": 2.089939221172446e-07, "logits/chosen": -1.0174002647399902, "logits/rejected": -1.0846703052520752, "logps/chosen": -313.46234130859375, "logps/rejected": -400.6974792480469, "loss": 0.5848, "rewards/accuracies": 0.75, "rewards/chosen": -0.07635616511106491, "rewards/margins": 0.2194177806377411, "rewards/rejected": -0.2957739233970642, "step": 285 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.5986394557823129, "grad_norm": 2.705469772045495, "learning_rate": 2.0718906816218595e-07, "logits/chosen": -1.3362562656402588, "logits/rejected": -1.3673946857452393, "logps/chosen": -402.3995361328125, "logps/rejected": -380.6618957519531, "loss": 0.6021, "rewards/accuracies": 0.75, "rewards/chosen": -0.04868704080581665, "rewards/margins": 0.4125064015388489, "rewards/rejected": -0.46119338274002075, "step": 286 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.6007326007326007, "grad_norm": 2.99299747238389, "learning_rate": 2.053865100274774e-07, "logits/chosen": -1.6918283700942993, "logits/rejected": -1.925981044769287, "logps/chosen": -276.82427978515625, "logps/rejected": -269.3271484375, "loss": 0.6288, "rewards/accuracies": 0.5625, "rewards/chosen": -0.05672285333275795, "rewards/margins": 0.17042046785354614, "rewards/rejected": -0.227143332362175, "step": 287 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.6028257456828885, "grad_norm": 2.6811459172502508, "learning_rate": 2.035863443788411e-07, "logits/chosen": -1.1499416828155518, "logits/rejected": -1.2810485363006592, "logps/chosen": -238.072021484375, "logps/rejected": -267.8752746582031, "loss": 0.6161, "rewards/accuracies": 0.75, "rewards/chosen": 0.09462117403745651, "rewards/margins": 0.25331178307533264, "rewards/rejected": -0.15869060158729553, "step": 288 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.6049188906331764, "grad_norm": 2.641139696208472, "learning_rate": 2.0178866775369774e-07, "logits/chosen": -1.4141920804977417, "logits/rejected": -1.1955763101577759, "logps/chosen": -243.6517333984375, "logps/rejected": -287.361328125, "loss": 0.6039, "rewards/accuracies": 0.6875, "rewards/chosen": 0.06543895602226257, "rewards/margins": 0.38033464550971985, "rewards/rejected": -0.31489571928977966, "step": 289 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.6070120355834642, "grad_norm": 2.8067530729260857, "learning_rate": 1.9999357655598891e-07, "logits/chosen": -1.253098487854004, "logits/rejected": -1.4915664196014404, "logps/chosen": -298.73675537109375, "logps/rejected": -300.498779296875, "loss": 0.5972, "rewards/accuracies": 0.6875, "rewards/chosen": -0.13263273239135742, "rewards/margins": 0.25007081031799316, "rewards/rejected": -0.3827035427093506, "step": 290 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.609105180533752, "grad_norm": 3.594810229074242, "learning_rate": 1.9820116705100775e-07, "logits/chosen": -1.220096230506897, "logits/rejected": -1.1954095363616943, "logps/chosen": -329.1642761230469, "logps/rejected": -495.6220703125, "loss": 0.6363, "rewards/accuracies": 0.625, "rewards/chosen": 0.03573284298181534, "rewards/margins": 0.4117221236228943, "rewards/rejected": -0.37598925828933716, "step": 291 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.6111983254840397, "grad_norm": 2.6731071220497977, "learning_rate": 1.9641153536023642e-07, "logits/chosen": -1.0972908735275269, "logits/rejected": -1.3106157779693604, "logps/chosen": -412.78729248046875, "logps/rejected": -295.85455322265625, "loss": 0.5822, "rewards/accuracies": 0.8125, "rewards/chosen": 0.038140662014484406, "rewards/margins": 0.3702796995639801, "rewards/rejected": -0.3321390450000763, "step": 292 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.6132914704343275, "grad_norm": 3.532209979637062, "learning_rate": 1.9462477745619106e-07, "logits/chosen": -0.5958943963050842, "logits/rejected": -1.06789231300354, "logps/chosen": -304.1565246582031, "logps/rejected": -257.5810546875, "loss": 0.6235, "rewards/accuracies": 0.6875, "rewards/chosen": -0.016200721263885498, "rewards/margins": 0.29386353492736816, "rewards/rejected": -0.3100642263889313, "step": 293 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.6153846153846154, "grad_norm": 2.7461060577103895, "learning_rate": 1.928409891572757e-07, "logits/chosen": -0.7405527234077454, "logits/rejected": -0.8169790506362915, "logps/chosen": -329.9715270996094, "logps/rejected": -314.2995300292969, "loss": 0.5843, "rewards/accuracies": 0.5, "rewards/chosen": -0.12127472460269928, "rewards/margins": -0.017086341977119446, "rewards/rejected": -0.10418836772441864, "step": 294 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.6174777603349032, "grad_norm": 2.9011465583382994, "learning_rate": 1.9106026612264315e-07, "logits/chosen": -1.1302580833435059, "logits/rejected": -1.486553430557251, "logps/chosen": -335.17742919921875, "logps/rejected": -345.6240539550781, "loss": 0.574, "rewards/accuracies": 0.75, "rewards/chosen": 0.09762419760227203, "rewards/margins": 0.2514112591743469, "rewards/rejected": -0.1537870615720749, "step": 295 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.619570905285191, "grad_norm": 2.7184068214213366, "learning_rate": 1.8928270384706582e-07, "logits/chosen": -1.1963120698928833, "logits/rejected": -1.528740406036377, "logps/chosen": -418.1468505859375, "logps/rejected": -288.8190612792969, "loss": 0.5858, "rewards/accuracies": 0.8125, "rewards/chosen": 0.08226844668388367, "rewards/margins": 0.1892382949590683, "rewards/rejected": -0.10696983337402344, "step": 296 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.6216640502354788, "grad_norm": 3.1508205071635413, "learning_rate": 1.875083976558136e-07, "logits/chosen": -1.1826684474945068, "logits/rejected": -1.3524236679077148, "logps/chosen": -576.4324951171875, "logps/rejected": -490.62554931640625, "loss": 0.5841, "rewards/accuracies": 0.75, "rewards/chosen": 0.15535779297351837, "rewards/margins": 0.3767828047275543, "rewards/rejected": -0.22142501175403595, "step": 297 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.6237571951857667, "grad_norm": 2.8231107793562815, "learning_rate": 1.8573744269954297e-07, "logits/chosen": -1.0781053304672241, "logits/rejected": -1.183738112449646, "logps/chosen": -417.0195007324219, "logps/rejected": -281.6366882324219, "loss": 0.5963, "rewards/accuracies": 0.6875, "rewards/chosen": -0.14981061220169067, "rewards/margins": 0.07426027953624725, "rewards/rejected": -0.22407089173793793, "step": 298 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.6258503401360545, "grad_norm": 2.8417522958492984, "learning_rate": 1.839699339491937e-07, "logits/chosen": -1.7358417510986328, "logits/rejected": -1.751924991607666, "logps/chosen": -188.27989196777344, "logps/rejected": -272.8227233886719, "loss": 0.5768, "rewards/accuracies": 0.75, "rewards/chosen": 0.1288813352584839, "rewards/margins": 0.37426143884658813, "rewards/rejected": -0.24538010358810425, "step": 299 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.6279434850863422, "grad_norm": 2.8672182875485452, "learning_rate": 1.8220596619089573e-07, "logits/chosen": -1.4606785774230957, "logits/rejected": -1.4873846769332886, "logps/chosen": -348.63787841796875, "logps/rejected": -360.87652587890625, "loss": 0.611, "rewards/accuracies": 0.75, "rewards/chosen": -0.0835147500038147, "rewards/margins": 0.1348082721233368, "rewards/rejected": -0.2183230221271515, "step": 300 }, { "epoch": 0.6279434850863422, "eval_dpo_lambda": 0.9500001072883606, "eval_logits/chosen": -1.152755618095398, "eval_logits/rejected": -1.2574565410614014, "eval_logps/chosen": -328.69921875, "eval_logps/rejected": -333.47650146484375, "eval_loss": 0.5997101068496704, "eval_rewards/accuracies": 0.738095223903656, "eval_rewards/chosen": 0.046122703701257706, "eval_rewards/margins": 0.31352293491363525, "eval_rewards/rejected": -0.26740026473999023, "eval_runtime": 126.7419, "eval_samples_per_second": 15.78, "eval_steps_per_second": 0.497, "step": 300 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.63003663003663, "grad_norm": 2.9684923641316083, "learning_rate": 1.8044563402088682e-07, "logits/chosen": -1.5144453048706055, "logits/rejected": -1.726236343383789, "logps/chosen": -303.8741760253906, "logps/rejected": -304.7886962890625, "loss": 0.5996, "rewards/accuracies": 0.75, "rewards/chosen": 0.07779493927955627, "rewards/margins": 0.42407867312431335, "rewards/rejected": -0.3462837040424347, "step": 301 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.6321297749869178, "grad_norm": 2.994746440247753, "learning_rate": 1.7868903184043885e-07, "logits/chosen": -1.382077932357788, "logits/rejected": -1.3923065662384033, "logps/chosen": -340.5814208984375, "logps/rejected": -291.1869201660156, "loss": 0.5971, "rewards/accuracies": 0.625, "rewards/chosen": -0.05908205360174179, "rewards/margins": 0.06250374764204025, "rewards/rejected": -0.12158581614494324, "step": 302 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.6342229199372057, "grad_norm": 2.6201377144589255, "learning_rate": 1.7693625385079574e-07, "logits/chosen": -1.1162067651748657, "logits/rejected": -1.0164040327072144, "logps/chosen": -367.78936767578125, "logps/rejected": -382.03289794921875, "loss": 0.5739, "rewards/accuracies": 0.875, "rewards/chosen": 0.09106776863336563, "rewards/margins": 0.40245896577835083, "rewards/rejected": -0.3113911747932434, "step": 303 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.6363160648874935, "grad_norm": 3.1976093467898297, "learning_rate": 1.7518739404812155e-07, "logits/chosen": -1.211500883102417, "logits/rejected": -1.4269013404846191, "logps/chosen": -273.0542907714844, "logps/rejected": -375.3648681640625, "loss": 0.6029, "rewards/accuracies": 0.8125, "rewards/chosen": -0.02511598914861679, "rewards/margins": 0.4733920991420746, "rewards/rejected": -0.498508095741272, "step": 304 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.6384092098377813, "grad_norm": 2.8859631896603193, "learning_rate": 1.7344254621846017e-07, "logits/chosen": -1.7794303894042969, "logits/rejected": -1.7412896156311035, "logps/chosen": -398.88739013671875, "logps/rejected": -343.1975402832031, "loss": 0.5874, "rewards/accuracies": 0.8125, "rewards/chosen": 0.12792496383190155, "rewards/margins": 0.3402012586593628, "rewards/rejected": -0.21227629482746124, "step": 305 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.640502354788069, "grad_norm": 2.9539779554898438, "learning_rate": 1.717018039327053e-07, "logits/chosen": -1.190926432609558, "logits/rejected": -1.8115710020065308, "logps/chosen": -468.99993896484375, "logps/rejected": -334.0068359375, "loss": 0.585, "rewards/accuracies": 0.8125, "rewards/chosen": -0.14913079142570496, "rewards/margins": 0.41096633672714233, "rewards/rejected": -0.5600970983505249, "step": 306 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.6425954997383568, "grad_norm": 2.7885368689838175, "learning_rate": 1.699652605415828e-07, "logits/chosen": -1.1208600997924805, "logits/rejected": -1.4495139122009277, "logps/chosen": -375.7610778808594, "logps/rejected": -396.4300842285156, "loss": 0.5791, "rewards/accuracies": 0.875, "rewards/chosen": 0.10813463479280472, "rewards/margins": 0.5855265855789185, "rewards/rejected": -0.47739195823669434, "step": 307 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.6446886446886447, "grad_norm": 3.1913187684163526, "learning_rate": 1.6823300917064458e-07, "logits/chosen": -1.1223787069320679, "logits/rejected": -1.3091034889221191, "logps/chosen": -350.25543212890625, "logps/rejected": -287.968505859375, "loss": 0.5829, "rewards/accuracies": 0.875, "rewards/chosen": 0.09806571900844574, "rewards/margins": 0.2386128306388855, "rewards/rejected": -0.14054711163043976, "step": 308 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.6467817896389325, "grad_norm": 2.816863658491057, "learning_rate": 1.6650514271527465e-07, "logits/chosen": -1.1969443559646606, "logits/rejected": -0.9648417234420776, "logps/chosen": -308.03387451171875, "logps/rejected": -252.2939910888672, "loss": 0.5875, "rewards/accuracies": 0.6875, "rewards/chosen": 0.04692830145359039, "rewards/margins": 0.2519851624965668, "rewards/rejected": -0.20505686104297638, "step": 309 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.6488749345892203, "grad_norm": 2.958271999615368, "learning_rate": 1.647817538357072e-07, "logits/chosen": -1.7227213382720947, "logits/rejected": -1.7207832336425781, "logps/chosen": -214.9093475341797, "logps/rejected": -266.1515197753906, "loss": 0.6045, "rewards/accuracies": 0.625, "rewards/chosen": 0.00013159960508346558, "rewards/margins": 0.3577786684036255, "rewards/rejected": -0.3576470613479614, "step": 310 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.6509680795395081, "grad_norm": 3.1592624947340764, "learning_rate": 1.6306293495205755e-07, "logits/chosen": -1.292384386062622, "logits/rejected": -1.1098520755767822, "logps/chosen": -261.2619323730469, "logps/rejected": -336.4939270019531, "loss": 0.5852, "rewards/accuracies": 0.625, "rewards/chosen": -0.052145615220069885, "rewards/margins": 0.28515321016311646, "rewards/rejected": -0.33729884028434753, "step": 311 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.6530612244897959, "grad_norm": 2.846608771550514, "learning_rate": 1.6134877823936607e-07, "logits/chosen": -0.6564756631851196, "logits/rejected": -1.0646233558654785, "logps/chosen": -398.82086181640625, "logps/rejected": -372.4912414550781, "loss": 0.5764, "rewards/accuracies": 0.6875, "rewards/chosen": -0.04534997418522835, "rewards/margins": 0.3533475399017334, "rewards/rejected": -0.39869752526283264, "step": 312 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.6551543694400838, "grad_norm": 2.8931520114645446, "learning_rate": 1.5963937562265522e-07, "logits/chosen": -1.343094825744629, "logits/rejected": -1.7467936277389526, "logps/chosen": -283.416259765625, "logps/rejected": -280.51373291015625, "loss": 0.6108, "rewards/accuracies": 0.6875, "rewards/chosen": 0.09352143853902817, "rewards/margins": 0.22399359941482544, "rewards/rejected": -0.13047216832637787, "step": 313 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.6572475143903715, "grad_norm": 2.815700882329053, "learning_rate": 1.5793481877199943e-07, "logits/chosen": -1.43204665184021, "logits/rejected": -1.5756096839904785, "logps/chosen": -266.8113098144531, "logps/rejected": -224.4196319580078, "loss": 0.5872, "rewards/accuracies": 0.75, "rewards/chosen": -0.07680583000183105, "rewards/margins": 0.16660752892494202, "rewards/rejected": -0.24341335892677307, "step": 314 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.6593406593406593, "grad_norm": 2.8137310444663886, "learning_rate": 1.562351990976095e-07, "logits/chosen": -1.5739721059799194, "logits/rejected": -1.6129150390625, "logps/chosen": -348.204833984375, "logps/rejected": -329.01361083984375, "loss": 0.5505, "rewards/accuracies": 0.75, "rewards/chosen": 0.18597671389579773, "rewards/margins": 0.5959646701812744, "rewards/rejected": -0.4099879264831543, "step": 315 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.6614338042909471, "grad_norm": 2.8423884378355906, "learning_rate": 1.5454060774493065e-07, "logits/chosen": -1.7593684196472168, "logits/rejected": -1.6146714687347412, "logps/chosen": -261.1969909667969, "logps/rejected": -272.06109619140625, "loss": 0.5893, "rewards/accuracies": 0.75, "rewards/chosen": 0.018922820687294006, "rewards/margins": 0.21677649021148682, "rewards/rejected": -0.19785365462303162, "step": 316 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.663526949241235, "grad_norm": 3.02095288518032, "learning_rate": 1.5285113558975427e-07, "logits/chosen": -1.049725890159607, "logits/rejected": -1.217036485671997, "logps/chosen": -261.72369384765625, "logps/rejected": -317.48736572265625, "loss": 0.5584, "rewards/accuracies": 0.8125, "rewards/chosen": 0.12496969103813171, "rewards/margins": 0.36013174057006836, "rewards/rejected": -0.23516204953193665, "step": 317 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.6656200941915228, "grad_norm": 2.7694515215588504, "learning_rate": 1.5116687323334464e-07, "logits/chosen": -1.550876259803772, "logits/rejected": -1.8359020948410034, "logps/chosen": -255.31036376953125, "logps/rejected": -248.61073303222656, "loss": 0.5888, "rewards/accuracies": 0.75, "rewards/chosen": -0.006704147905111313, "rewards/margins": 0.4305347800254822, "rewards/rejected": -0.4372389316558838, "step": 318 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.6677132391418106, "grad_norm": 2.8401299419755692, "learning_rate": 1.4948791099758052e-07, "logits/chosen": -0.9036725759506226, "logits/rejected": -1.0863186120986938, "logps/chosen": -289.23028564453125, "logps/rejected": -284.5397644042969, "loss": 0.5875, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07022331655025482, "rewards/margins": 0.1702737659215927, "rewards/rejected": -0.24049708247184753, "step": 319 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.6698063840920984, "grad_norm": 3.0419948559376038, "learning_rate": 1.478143389201113e-07, "logits/chosen": -1.6008977890014648, "logits/rejected": -1.4907864332199097, "logps/chosen": -272.605224609375, "logps/rejected": -276.8568115234375, "loss": 0.6043, "rewards/accuracies": 0.6875, "rewards/chosen": 0.07701703906059265, "rewards/margins": 0.39531823992729187, "rewards/rejected": -0.3183012008666992, "step": 320 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.6718995290423861, "grad_norm": 3.4464173500507322, "learning_rate": 1.461462467495284e-07, "logits/chosen": -1.0648694038391113, "logits/rejected": -1.018904209136963, "logps/chosen": -263.15997314453125, "logps/rejected": -349.84442138671875, "loss": 0.6275, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0101262666285038, "rewards/margins": 0.2943682372570038, "rewards/rejected": -0.3044945001602173, "step": 321 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.673992673992674, "grad_norm": 2.5726572380197337, "learning_rate": 1.4448372394055246e-07, "logits/chosen": -1.5901743173599243, "logits/rejected": -1.3653056621551514, "logps/chosen": -322.6933898925781, "logps/rejected": -332.12811279296875, "loss": 0.563, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1433330625295639, "rewards/margins": 0.556541383266449, "rewards/rejected": -0.4132083058357239, "step": 322 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.6760858189429618, "grad_norm": 3.3548746086979895, "learning_rate": 1.428268596492364e-07, "logits/chosen": -1.249694585800171, "logits/rejected": -1.3499314785003662, "logps/chosen": -394.60455322265625, "logps/rejected": -375.867919921875, "loss": 0.592, "rewards/accuracies": 0.5, "rewards/chosen": -0.11615871638059616, "rewards/margins": 0.10562621802091599, "rewards/rejected": -0.22178493440151215, "step": 323 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.6781789638932496, "grad_norm": 2.830104860730658, "learning_rate": 1.4117574272818386e-07, "logits/chosen": -1.574323058128357, "logits/rejected": -1.415414571762085, "logps/chosen": -301.65411376953125, "logps/rejected": -347.116943359375, "loss": 0.6055, "rewards/accuracies": 0.75, "rewards/chosen": -0.019321933388710022, "rewards/margins": 0.27965104579925537, "rewards/rejected": -0.2989729940891266, "step": 324 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.6802721088435374, "grad_norm": 3.1596951802874425, "learning_rate": 1.3953046172178413e-07, "logits/chosen": -1.5781238079071045, "logits/rejected": -1.589447021484375, "logps/chosen": -354.6593933105469, "logps/rejected": -375.35992431640625, "loss": 0.6144, "rewards/accuracies": 0.625, "rewards/chosen": 0.03557761013507843, "rewards/margins": 0.2532786428928375, "rewards/rejected": -0.2177010327577591, "step": 325 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.6823652537938252, "grad_norm": 2.9579170305081757, "learning_rate": 1.3789110486146468e-07, "logits/chosen": -1.355236530303955, "logits/rejected": -1.258398175239563, "logps/chosen": -289.5960388183594, "logps/rejected": -310.9838562011719, "loss": 0.5912, "rewards/accuracies": 0.75, "rewards/chosen": -0.07027177512645721, "rewards/margins": 0.30167698860168457, "rewards/rejected": -0.3719487190246582, "step": 326 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.6844583987441131, "grad_norm": 2.93799646935141, "learning_rate": 1.362577600609588e-07, "logits/chosen": -0.7065570950508118, "logits/rejected": -0.9963769912719727, "logps/chosen": -222.33880615234375, "logps/rejected": -208.6898956298828, "loss": 0.5805, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1881701946258545, "rewards/margins": 0.3975644111633301, "rewards/rejected": -0.2093942016363144, "step": 327 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.6865515436944009, "grad_norm": 2.890628326187476, "learning_rate": 1.3463051491159093e-07, "logits/chosen": -1.1522157192230225, "logits/rejected": -0.8082643747329712, "logps/chosen": -292.56854248046875, "logps/rejected": -353.1051940917969, "loss": 0.5994, "rewards/accuracies": 0.4375, "rewards/chosen": -0.08080998808145523, "rewards/margins": 0.15119798481464386, "rewards/rejected": -0.2320079654455185, "step": 328 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.6886446886446886, "grad_norm": 2.9602713951640425, "learning_rate": 1.3300945667758012e-07, "logits/chosen": -0.9483600854873657, "logits/rejected": -1.2900683879852295, "logps/chosen": -441.95599365234375, "logps/rejected": -421.157470703125, "loss": 0.5633, "rewards/accuracies": 0.75, "rewards/chosen": 0.11120367050170898, "rewards/margins": 0.4193558096885681, "rewards/rejected": -0.30815210938453674, "step": 329 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.6907378335949764, "grad_norm": 2.734455320135366, "learning_rate": 1.3139467229135998e-07, "logits/chosen": -1.2660586833953857, "logits/rejected": -1.2434581518173218, "logps/chosen": -302.03558349609375, "logps/rejected": -320.6789245605469, "loss": 0.601, "rewards/accuracies": 0.5625, "rewards/chosen": -0.12139102816581726, "rewards/margins": 0.26467812061309814, "rewards/rejected": -0.3860691487789154, "step": 330 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.6928309785452642, "grad_norm": 2.927024897268758, "learning_rate": 1.2978624834891626e-07, "logits/chosen": -1.1598354578018188, "logits/rejected": -1.1631252765655518, "logps/chosen": -332.1982421875, "logps/rejected": -371.22344970703125, "loss": 0.5676, "rewards/accuracies": 0.75, "rewards/chosen": 0.0077375248074531555, "rewards/margins": 0.235450878739357, "rewards/rejected": -0.22771333158016205, "step": 331 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.6949241234955521, "grad_norm": 3.1174651474465653, "learning_rate": 1.281842711051438e-07, "logits/chosen": -1.5273184776306152, "logits/rejected": -1.6041035652160645, "logps/chosen": -277.0570983886719, "logps/rejected": -353.633056640625, "loss": 0.5648, "rewards/accuracies": 0.8125, "rewards/chosen": 0.0700652152299881, "rewards/margins": 0.46601444482803345, "rewards/rejected": -0.39594924449920654, "step": 332 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.6970172684458399, "grad_norm": 2.8402929332460727, "learning_rate": 1.2658882646922033e-07, "logits/chosen": -0.7204049229621887, "logits/rejected": -0.5981565713882446, "logps/chosen": -291.55438232421875, "logps/rejected": -271.3363037109375, "loss": 0.6055, "rewards/accuracies": 0.5, "rewards/chosen": -0.035089973360300064, "rewards/margins": 0.027638748288154602, "rewards/rejected": -0.06272870302200317, "step": 333 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.6991104133961277, "grad_norm": 2.7594831712195287, "learning_rate": 1.2500000000000005e-07, "logits/chosen": -0.974805474281311, "logits/rejected": -1.3674122095108032, "logps/chosen": -457.0871276855469, "logps/rejected": -287.9721374511719, "loss": 0.5807, "rewards/accuracies": 0.8125, "rewards/chosen": 0.19807353615760803, "rewards/margins": 0.4476739466190338, "rewards/rejected": -0.2496003955602646, "step": 334 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.7012035583464155, "grad_norm": 3.0859411459906023, "learning_rate": 1.2341787690142435e-07, "logits/chosen": -1.0456016063690186, "logits/rejected": -1.455796241760254, "logps/chosen": -283.3838195800781, "logps/rejected": -314.14996337890625, "loss": 0.6059, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07535441219806671, "rewards/margins": 0.3652963638305664, "rewards/rejected": -0.4406507611274719, "step": 335 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.7032967032967034, "grad_norm": 3.2976194013630216, "learning_rate": 1.2184254201795363e-07, "logits/chosen": -0.9428431391716003, "logits/rejected": -0.9520983099937439, "logps/chosen": -290.36871337890625, "logps/rejected": -285.92987060546875, "loss": 0.5698, "rewards/accuracies": 0.875, "rewards/chosen": 0.08222910761833191, "rewards/margins": 0.2786232829093933, "rewards/rejected": -0.1963942050933838, "step": 336 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.7053898482469911, "grad_norm": 3.0116971606259506, "learning_rate": 1.202740798300168e-07, "logits/chosen": -0.8543267250061035, "logits/rejected": -0.773929238319397, "logps/chosen": -415.064208984375, "logps/rejected": -442.967529296875, "loss": 0.5812, "rewards/accuracies": 0.625, "rewards/chosen": 0.10885193943977356, "rewards/margins": 0.39388900995254517, "rewards/rejected": -0.2850370407104492, "step": 337 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.7074829931972789, "grad_norm": 2.9971139948790033, "learning_rate": 1.1871257444948096e-07, "logits/chosen": -1.0442243814468384, "logits/rejected": -1.1720740795135498, "logps/chosen": -243.80303955078125, "logps/rejected": -253.8277130126953, "loss": 0.5999, "rewards/accuracies": 0.5, "rewards/chosen": -0.08283738791942596, "rewards/margins": 0.14333190023899078, "rewards/rejected": -0.22616928815841675, "step": 338 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.7095761381475667, "grad_norm": 3.2736504577600094, "learning_rate": 1.1715810961514072e-07, "logits/chosen": -1.2990602254867554, "logits/rejected": -1.9961154460906982, "logps/chosen": -404.4445495605469, "logps/rejected": -320.1275329589844, "loss": 0.5726, "rewards/accuracies": 0.6875, "rewards/chosen": 0.017597146332263947, "rewards/margins": 0.3037232458591461, "rewards/rejected": -0.2861260771751404, "step": 339 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.7116692830978545, "grad_norm": 3.1899370932652005, "learning_rate": 1.1561076868822755e-07, "logits/chosen": -1.4109141826629639, "logits/rejected": -1.431431770324707, "logps/chosen": -388.3424072265625, "logps/rejected": -421.73809814453125, "loss": 0.6312, "rewards/accuracies": 0.5, "rewards/chosen": 0.024379979819059372, "rewards/margins": 0.21260623633861542, "rewards/rejected": -0.18822626769542694, "step": 340 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.7137624280481424, "grad_norm": 2.724321146744929, "learning_rate": 1.1407063464793965e-07, "logits/chosen": -0.7314552068710327, "logits/rejected": -1.1736316680908203, "logps/chosen": -284.68572998046875, "logps/rejected": -273.8853759765625, "loss": 0.5605, "rewards/accuracies": 0.9375, "rewards/chosen": 0.13478504121303558, "rewards/margins": 0.5126200914382935, "rewards/rejected": -0.3778350353240967, "step": 341 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.7158555729984302, "grad_norm": 2.926353442254869, "learning_rate": 1.125377900869913e-07, "logits/chosen": -0.8707342743873596, "logits/rejected": -1.5625667572021484, "logps/chosen": -362.37713623046875, "logps/rejected": -266.0124816894531, "loss": 0.5733, "rewards/accuracies": 0.5, "rewards/chosen": -0.0838310644030571, "rewards/margins": 0.08862137794494629, "rewards/rejected": -0.1724524050951004, "step": 342 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.717948717948718, "grad_norm": 3.3401465090762916, "learning_rate": 1.110123172071844e-07, "logits/chosen": -1.5237928628921509, "logits/rejected": -1.6122753620147705, "logps/chosen": -358.39996337890625, "logps/rejected": -357.63043212890625, "loss": 0.5811, "rewards/accuracies": 0.75, "rewards/chosen": 0.1199035570025444, "rewards/margins": 0.46440428495407104, "rewards/rejected": -0.34450072050094604, "step": 343 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.7200418628990057, "grad_norm": 2.94627878527879, "learning_rate": 1.09494297815e-07, "logits/chosen": -1.265371561050415, "logits/rejected": -1.1523489952087402, "logps/chosen": -286.3548583984375, "logps/rejected": -364.54205322265625, "loss": 0.5586, "rewards/accuracies": 0.625, "rewards/chosen": 0.017269540578126907, "rewards/margins": 0.3244696259498596, "rewards/rejected": -0.3072000741958618, "step": 344 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.7221350078492935, "grad_norm": 3.102266671071275, "learning_rate": 1.0798381331721107e-07, "logits/chosen": -0.9287211894989014, "logits/rejected": -1.1919511556625366, "logps/chosen": -200.88504028320312, "logps/rejected": -264.4483642578125, "loss": 0.5481, "rewards/accuracies": 0.9375, "rewards/chosen": 0.14602060616016388, "rewards/margins": 0.4962640702724457, "rewards/rejected": -0.350243479013443, "step": 345 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.7242281527995814, "grad_norm": 2.9670538954056935, "learning_rate": 1.0648094471651722e-07, "logits/chosen": -1.5825669765472412, "logits/rejected": -1.7299472093582153, "logps/chosen": -296.21014404296875, "logps/rejected": -275.9136657714844, "loss": 0.5796, "rewards/accuracies": 0.8125, "rewards/chosen": 0.20572277903556824, "rewards/margins": 0.5951564311981201, "rewards/rejected": -0.38943371176719666, "step": 346 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.7263212977498692, "grad_norm": 3.230731139876855, "learning_rate": 1.0498577260720048e-07, "logits/chosen": -0.8463734984397888, "logits/rejected": -1.0593740940093994, "logps/chosen": -272.32049560546875, "logps/rejected": -305.7054138183594, "loss": 0.5863, "rewards/accuracies": 0.625, "rewards/chosen": -0.14406979084014893, "rewards/margins": 0.17186206579208374, "rewards/rejected": -0.31593188643455505, "step": 347 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.728414442700157, "grad_norm": 3.0126681234768675, "learning_rate": 1.0349837717080347e-07, "logits/chosen": -1.0127146244049072, "logits/rejected": -1.4931713342666626, "logps/chosen": -306.0135803222656, "logps/rejected": -277.12689208984375, "loss": 0.5413, "rewards/accuracies": 0.5625, "rewards/chosen": -0.04544328153133392, "rewards/margins": 0.2818772494792938, "rewards/rejected": -0.32732054591178894, "step": 348 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.7305075876504448, "grad_norm": 2.7460177566999073, "learning_rate": 1.0201883817182949e-07, "logits/chosen": -1.3901773691177368, "logits/rejected": -1.1678118705749512, "logps/chosen": -245.79476928710938, "logps/rejected": -295.84991455078125, "loss": 0.5713, "rewards/accuracies": 0.5, "rewards/chosen": -0.08364185690879822, "rewards/margins": 0.2024136334657669, "rewards/rejected": -0.28605547547340393, "step": 349 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.7326007326007326, "grad_norm": 3.601672079724932, "learning_rate": 1.0054723495346482e-07, "logits/chosen": -1.5284239053726196, "logits/rejected": -1.7634990215301514, "logps/chosen": -356.5145263671875, "logps/rejected": -332.6150207519531, "loss": 0.6151, "rewards/accuracies": 0.875, "rewards/chosen": -0.0661069005727768, "rewards/margins": 0.3310953974723816, "rewards/rejected": -0.39720234274864197, "step": 350 }, { "epoch": 0.7326007326007326, "eval_dpo_lambda": 0.9500001072883606, "eval_logits/chosen": -1.1370099782943726, "eval_logits/rejected": -1.2391026020050049, "eval_logps/chosen": -333.4674377441406, "eval_logps/rejected": -342.59625244140625, "eval_loss": 0.5924096703529358, "eval_rewards/accuracies": 0.7222222089767456, "eval_rewards/chosen": -0.0015593047719448805, "eval_rewards/margins": 0.357039213180542, "eval_rewards/rejected": -0.358598530292511, "eval_runtime": 127.5615, "eval_samples_per_second": 15.679, "eval_steps_per_second": 0.494, "step": 350 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.7346938775510204, "grad_norm": 2.959485970341728, "learning_rate": 9.908364643332398e-08, "logits/chosen": -0.8477253913879395, "logits/rejected": -1.174777626991272, "logps/chosen": -294.0464172363281, "logps/rejected": -259.5352783203125, "loss": 0.6071, "rewards/accuracies": 0.8125, "rewards/chosen": -0.028058402240276337, "rewards/margins": 0.3106793761253357, "rewards/rejected": -0.33873772621154785, "step": 351 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.7367870225013082, "grad_norm": 3.0855930202356547, "learning_rate": 9.76281510992176e-08, "logits/chosen": -1.4485905170440674, "logits/rejected": -1.6518748998641968, "logps/chosen": -314.1768798828125, "logps/rejected": -392.55963134765625, "loss": 0.5941, "rewards/accuracies": 0.8125, "rewards/chosen": -0.1556646078824997, "rewards/margins": 0.37559986114501953, "rewards/rejected": -0.5312644839286804, "step": 352 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.738880167451596, "grad_norm": 3.143268801871789, "learning_rate": 9.618082700494318e-08, "logits/chosen": -1.4012107849121094, "logits/rejected": -1.4628479480743408, "logps/chosen": -255.58563232421875, "logps/rejected": -260.3592224121094, "loss": 0.6288, "rewards/accuracies": 0.625, "rewards/chosen": -0.15864990651607513, "rewards/margins": 0.19354340434074402, "rewards/rejected": -0.35219329595565796, "step": 353 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.7409733124018838, "grad_norm": 2.9613904486760343, "learning_rate": 9.474175176609956e-08, "logits/chosen": -1.2831871509552002, "logits/rejected": -1.442847490310669, "logps/chosen": -350.13262939453125, "logps/rejected": -341.5611877441406, "loss": 0.5951, "rewards/accuracies": 0.75, "rewards/chosen": -0.045420967042446136, "rewards/margins": 0.32893097400665283, "rewards/rejected": -0.3743519186973572, "step": 354 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.7430664573521716, "grad_norm": 3.294434829045258, "learning_rate": 9.331100255592436e-08, "logits/chosen": -0.8240109086036682, "logits/rejected": -0.968763530254364, "logps/chosen": -462.10662841796875, "logps/rejected": -372.427978515625, "loss": 0.6241, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0016819555312395096, "rewards/margins": 0.3995983898639679, "rewards/rejected": -0.40128037333488464, "step": 355 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.7451596023024595, "grad_norm": 2.8360000204682287, "learning_rate": 9.18886561011557e-08, "logits/chosen": -1.345785140991211, "logits/rejected": -1.7952075004577637, "logps/chosen": -404.19061279296875, "logps/rejected": -323.08868408203125, "loss": 0.5641, "rewards/accuracies": 0.75, "rewards/chosen": -0.038200005888938904, "rewards/margins": 0.46547362208366394, "rewards/rejected": -0.5036736130714417, "step": 356 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.7472527472527473, "grad_norm": 2.7415981815075203, "learning_rate": 9.047478867791731e-08, "logits/chosen": -0.8407701253890991, "logits/rejected": -0.9676626920700073, "logps/chosen": -338.549072265625, "logps/rejected": -383.54010009765625, "loss": 0.5447, "rewards/accuracies": 0.625, "rewards/chosen": -0.10126753151416779, "rewards/margins": 0.3071839511394501, "rewards/rejected": -0.40845149755477905, "step": 357 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.749345892203035, "grad_norm": 3.0355599256567722, "learning_rate": 8.906947610762825e-08, "logits/chosen": -1.2541128396987915, "logits/rejected": -1.4691364765167236, "logps/chosen": -455.7889099121094, "logps/rejected": -415.0899658203125, "loss": 0.5859, "rewards/accuracies": 0.875, "rewards/chosen": 0.14887842535972595, "rewards/margins": 0.5138112902641296, "rewards/rejected": -0.3649328649044037, "step": 358 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.7514390371533228, "grad_norm": 3.1123223229231605, "learning_rate": 8.76727937529367e-08, "logits/chosen": -0.8968989849090576, "logits/rejected": -1.2970577478408813, "logps/chosen": -384.78045654296875, "logps/rejected": -429.00042724609375, "loss": 0.6348, "rewards/accuracies": 0.6875, "rewards/chosen": 0.015154330059885979, "rewards/margins": 0.28424006700515747, "rewards/rejected": -0.26908573508262634, "step": 359 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.7535321821036107, "grad_norm": 2.990216717907498, "learning_rate": 8.628481651367875e-08, "logits/chosen": -0.9355572462081909, "logits/rejected": -1.1124552488327026, "logps/chosen": -343.1297607421875, "logps/rejected": -326.9549560546875, "loss": 0.603, "rewards/accuracies": 0.75, "rewards/chosen": 0.2483363002538681, "rewards/margins": 0.41506317257881165, "rewards/rejected": -0.16672685742378235, "step": 360 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.7556253270538985, "grad_norm": 3.2623136470492677, "learning_rate": 8.490561882286135e-08, "logits/chosen": -0.8635209798812866, "logits/rejected": -1.2157515287399292, "logps/chosen": -363.4804382324219, "logps/rejected": -361.91351318359375, "loss": 0.5953, "rewards/accuracies": 0.6875, "rewards/chosen": -0.06303808093070984, "rewards/margins": 0.3833729028701782, "rewards/rejected": -0.44641098380088806, "step": 361 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.7577184720041863, "grad_norm": 3.0834474979003432, "learning_rate": 8.353527464267104e-08, "logits/chosen": -1.454489827156067, "logits/rejected": -1.5765470266342163, "logps/chosen": -295.3980407714844, "logps/rejected": -335.7327880859375, "loss": 0.5964, "rewards/accuracies": 0.75, "rewards/chosen": 0.08737202733755112, "rewards/margins": 0.4309237003326416, "rewards/rejected": -0.3435516953468323, "step": 362 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.7598116169544741, "grad_norm": 2.914118109602609, "learning_rate": 8.217385746050742e-08, "logits/chosen": -1.238272786140442, "logits/rejected": -1.5411715507507324, "logps/chosen": -292.2579345703125, "logps/rejected": -238.81320190429688, "loss": 0.5812, "rewards/accuracies": 0.6875, "rewards/chosen": -0.10383353382349014, "rewards/margins": 0.18802590668201447, "rewards/rejected": -0.2918594479560852, "step": 363 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.7619047619047619, "grad_norm": 2.9854885451359023, "learning_rate": 8.082144028504231e-08, "logits/chosen": -1.367719292640686, "logits/rejected": -1.2657394409179688, "logps/chosen": -234.74166870117188, "logps/rejected": -281.1946716308594, "loss": 0.5687, "rewards/accuracies": 0.6875, "rewards/chosen": 0.12704059481620789, "rewards/margins": 0.34460991621017456, "rewards/rejected": -0.2175692915916443, "step": 364 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.7639979068550498, "grad_norm": 2.953300558672877, "learning_rate": 7.947809564230445e-08, "logits/chosen": -1.4098320007324219, "logits/rejected": -1.2868678569793701, "logps/chosen": -243.4346923828125, "logps/rejected": -298.5202941894531, "loss": 0.6015, "rewards/accuracies": 0.6875, "rewards/chosen": -0.04837167263031006, "rewards/margins": 0.057322531938552856, "rewards/rejected": -0.1056942418217659, "step": 365 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.7660910518053375, "grad_norm": 2.99850368919061, "learning_rate": 7.814389557179016e-08, "logits/chosen": -1.124545931816101, "logits/rejected": -1.3188456296920776, "logps/chosen": -386.4886474609375, "logps/rejected": -329.67877197265625, "loss": 0.594, "rewards/accuracies": 0.9375, "rewards/chosen": 0.12546221911907196, "rewards/margins": 0.4888818860054016, "rewards/rejected": -0.36341968178749084, "step": 366 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.7681841967556253, "grad_norm": 3.458726258188203, "learning_rate": 7.681891162260015e-08, "logits/chosen": -1.5179108381271362, "logits/rejected": -1.812450885772705, "logps/chosen": -365.3643798828125, "logps/rejected": -297.4730224609375, "loss": 0.58, "rewards/accuracies": 0.5625, "rewards/chosen": -0.05222572013735771, "rewards/margins": 0.15194657444953918, "rewards/rejected": -0.2041723132133484, "step": 367 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.7702773417059131, "grad_norm": 3.072591063072517, "learning_rate": 7.550321484960251e-08, "logits/chosen": -1.018741488456726, "logits/rejected": -1.2473299503326416, "logps/chosen": -311.4349670410156, "logps/rejected": -314.35394287109375, "loss": 0.5688, "rewards/accuracies": 0.8125, "rewards/chosen": 0.07459983229637146, "rewards/margins": 0.4309018850326538, "rewards/rejected": -0.35630208253860474, "step": 368 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.7723704866562009, "grad_norm": 3.5372423378988334, "learning_rate": 7.419687580962222e-08, "logits/chosen": -1.212430477142334, "logits/rejected": -1.426560878753662, "logps/chosen": -228.5093994140625, "logps/rejected": -264.28936767578125, "loss": 0.6119, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08344899117946625, "rewards/margins": 0.19705668091773987, "rewards/rejected": -0.2805056869983673, "step": 369 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.7744636316064888, "grad_norm": 2.997232147010516, "learning_rate": 7.289996455765748e-08, "logits/chosen": -1.6446086168289185, "logits/rejected": -1.6737974882125854, "logps/chosen": -331.7569580078125, "logps/rejected": -291.8965148925781, "loss": 0.5893, "rewards/accuracies": 0.5, "rewards/chosen": -0.13146553933620453, "rewards/margins": 0.2519133985042572, "rewards/rejected": -0.38337892293930054, "step": 370 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.7765567765567766, "grad_norm": 3.080590463086197, "learning_rate": 7.161255064312283e-08, "logits/chosen": -1.1136746406555176, "logits/rejected": -1.0231876373291016, "logps/chosen": -309.6198425292969, "logps/rejected": -363.610107421875, "loss": 0.5674, "rewards/accuracies": 0.6875, "rewards/chosen": -0.17655643820762634, "rewards/margins": 0.4049268364906311, "rewards/rejected": -0.5814833045005798, "step": 371 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.7786499215070644, "grad_norm": 3.6244516998780507, "learning_rate": 7.033470310611945e-08, "logits/chosen": -1.1016966104507446, "logits/rejected": -1.245896339416504, "logps/chosen": -359.70452880859375, "logps/rejected": -433.61834716796875, "loss": 0.5889, "rewards/accuracies": 0.9375, "rewards/chosen": 0.15750715136528015, "rewards/margins": 0.6576098203659058, "rewards/rejected": -0.500102698802948, "step": 372 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.7807430664573521, "grad_norm": 2.7943459104008737, "learning_rate": 6.906649047373245e-08, "logits/chosen": -1.476548194885254, "logits/rejected": -1.5324046611785889, "logps/chosen": -221.1163330078125, "logps/rejected": -242.81161499023438, "loss": 0.5805, "rewards/accuracies": 0.4375, "rewards/chosen": -0.03714430332183838, "rewards/margins": 0.029976069927215576, "rewards/rejected": -0.06712038815021515, "step": 373 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.7828362114076399, "grad_norm": 2.8322297474527742, "learning_rate": 6.780798075635675e-08, "logits/chosen": -1.3379898071289062, "logits/rejected": -1.118496060371399, "logps/chosen": -291.7900390625, "logps/rejected": -374.0988464355469, "loss": 0.5751, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06792306900024414, "rewards/margins": 0.5735393762588501, "rewards/rejected": -0.6414624452590942, "step": 374 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.7849293563579278, "grad_norm": 2.87697157039399, "learning_rate": 6.655924144404906e-08, "logits/chosen": -1.1238576173782349, "logits/rejected": -1.7565028667449951, "logps/chosen": -286.9996032714844, "logps/rejected": -336.0424499511719, "loss": 0.5835, "rewards/accuracies": 0.625, "rewards/chosen": -0.016137562692165375, "rewards/margins": 0.43295276165008545, "rewards/rejected": -0.4490903317928314, "step": 375 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.7870225013082156, "grad_norm": 3.2179619916163467, "learning_rate": 6.532033950290885e-08, "logits/chosen": -1.3389768600463867, "logits/rejected": -1.0774224996566772, "logps/chosen": -287.64239501953125, "logps/rejected": -413.46868896484375, "loss": 0.5848, "rewards/accuracies": 0.875, "rewards/chosen": -0.0896245688199997, "rewards/margins": 0.5254437923431396, "rewards/rejected": -0.6150683164596558, "step": 376 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.7891156462585034, "grad_norm": 2.8855153079889786, "learning_rate": 6.409134137148736e-08, "logits/chosen": -1.022530436515808, "logits/rejected": -1.1851997375488281, "logps/chosen": -298.86297607421875, "logps/rejected": -332.5123596191406, "loss": 0.5875, "rewards/accuracies": 0.8125, "rewards/chosen": -0.17468523979187012, "rewards/margins": 0.29647061228752136, "rewards/rejected": -0.4711558222770691, "step": 377 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.7912087912087912, "grad_norm": 2.990469807350208, "learning_rate": 6.28723129572247e-08, "logits/chosen": -1.473344326019287, "logits/rejected": -1.1325485706329346, "logps/chosen": -318.8504638671875, "logps/rejected": -449.6240234375, "loss": 0.5859, "rewards/accuracies": 0.8125, "rewards/chosen": 0.08505594730377197, "rewards/margins": 0.6390045285224915, "rewards/rejected": -0.5539485812187195, "step": 378 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.7933019361590791, "grad_norm": 2.9595003490020297, "learning_rate": 6.166331963291519e-08, "logits/chosen": -1.7838646173477173, "logits/rejected": -1.8269150257110596, "logps/chosen": -205.31951904296875, "logps/rejected": -268.8065490722656, "loss": 0.5982, "rewards/accuracies": 0.6875, "rewards/chosen": -0.03895656019449234, "rewards/margins": 0.24119150638580322, "rewards/rejected": -0.28014805912971497, "step": 379 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.7953950811093669, "grad_norm": 2.8571350864096328, "learning_rate": 6.046442623320145e-08, "logits/chosen": -0.8389091491699219, "logits/rejected": -1.2211157083511353, "logps/chosen": -327.31793212890625, "logps/rejected": -304.3401794433594, "loss": 0.5665, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08584833145141602, "rewards/margins": 0.35426634550094604, "rewards/rejected": -0.4401146173477173, "step": 380 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.7974882260596546, "grad_norm": 2.8827076766905586, "learning_rate": 5.9275697051098275e-08, "logits/chosen": -0.630349338054657, "logits/rejected": -1.0219539403915405, "logps/chosen": -325.81036376953125, "logps/rejected": -359.8531494140625, "loss": 0.5677, "rewards/accuracies": 0.6875, "rewards/chosen": 0.01764664426445961, "rewards/margins": 0.3589470386505127, "rewards/rejected": -0.341300368309021, "step": 381 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.7995813710099424, "grad_norm": 3.2022395778240407, "learning_rate": 5.809719583454414e-08, "logits/chosen": -1.1647589206695557, "logits/rejected": -0.7380653619766235, "logps/chosen": -367.7460632324219, "logps/rejected": -562.48095703125, "loss": 0.5729, "rewards/accuracies": 0.625, "rewards/chosen": -0.16796761751174927, "rewards/margins": 0.28384852409362793, "rewards/rejected": -0.4518161416053772, "step": 382 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.8016745159602302, "grad_norm": 3.465982225966641, "learning_rate": 5.6928985782982524e-08, "logits/chosen": -0.7446600198745728, "logits/rejected": -0.9810519218444824, "logps/chosen": -433.0188293457031, "logps/rejected": -405.549072265625, "loss": 0.6244, "rewards/accuracies": 0.625, "rewards/chosen": 0.03277536854147911, "rewards/margins": 0.3812035918235779, "rewards/rejected": -0.34842824935913086, "step": 383 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.8037676609105181, "grad_norm": 3.186555695359678, "learning_rate": 5.57711295439732e-08, "logits/chosen": -1.3340767621994019, "logits/rejected": -1.232506275177002, "logps/chosen": -329.8331298828125, "logps/rejected": -369.17431640625, "loss": 0.5912, "rewards/accuracies": 0.5, "rewards/chosen": -0.09295055270195007, "rewards/margins": -0.020929299294948578, "rewards/rejected": -0.07202126085758209, "step": 384 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.8058608058608059, "grad_norm": 3.19391991333365, "learning_rate": 5.4623689209832484e-08, "logits/chosen": -1.453857183456421, "logits/rejected": -1.464058756828308, "logps/chosen": -466.036376953125, "logps/rejected": -397.72869873046875, "loss": 0.5873, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3399375081062317, "rewards/margins": 0.14604422450065613, "rewards/rejected": -0.4859817624092102, "step": 385 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.8079539508110937, "grad_norm": 2.9654351748544996, "learning_rate": 5.3486726314303175e-08, "logits/chosen": -0.9499026536941528, "logits/rejected": -1.1998071670532227, "logps/chosen": -375.9937744140625, "logps/rejected": -317.107421875, "loss": 0.584, "rewards/accuracies": 0.875, "rewards/chosen": 0.10635887831449509, "rewards/margins": 0.4747195541858673, "rewards/rejected": -0.3683606684207916, "step": 386 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.8100470957613815, "grad_norm": 3.0395158088403447, "learning_rate": 5.2360301829254745e-08, "logits/chosen": -1.4061027765274048, "logits/rejected": -1.604775309562683, "logps/chosen": -277.3642883300781, "logps/rejected": -257.7698974609375, "loss": 0.5704, "rewards/accuracies": 0.875, "rewards/chosen": 0.21056102216243744, "rewards/margins": 0.4900778532028198, "rewards/rejected": -0.2795168161392212, "step": 387 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.8121402407116692, "grad_norm": 3.118682569326712, "learning_rate": 5.1244476161413806e-08, "logits/chosen": -1.060973048210144, "logits/rejected": -1.234295129776001, "logps/chosen": -402.9338684082031, "logps/rejected": -350.74969482421875, "loss": 0.5943, "rewards/accuracies": 0.75, "rewards/chosen": 0.01770045980811119, "rewards/margins": 0.397371768951416, "rewards/rejected": -0.3796713352203369, "step": 388 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.8142333856619571, "grad_norm": 3.060926896783828, "learning_rate": 5.013930914912476e-08, "logits/chosen": -0.6037985682487488, "logits/rejected": -0.8761445879936218, "logps/chosen": -398.3765869140625, "logps/rejected": -471.43902587890625, "loss": 0.5633, "rewards/accuracies": 0.6875, "rewards/chosen": 0.025280356407165527, "rewards/margins": 0.6326404809951782, "rewards/rejected": -0.6073601245880127, "step": 389 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.8163265306122449, "grad_norm": 3.3398020114588656, "learning_rate": 4.904486005914027e-08, "logits/chosen": -1.122170329093933, "logits/rejected": -1.0950374603271484, "logps/chosen": -409.85699462890625, "logps/rejected": -400.9709777832031, "loss": 0.5907, "rewards/accuracies": 0.8125, "rewards/chosen": -0.011219769716262817, "rewards/margins": 0.568664014339447, "rewards/rejected": -0.5798838138580322, "step": 390 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.8184196755625327, "grad_norm": 3.140797781571759, "learning_rate": 4.796118758344353e-08, "logits/chosen": -1.632859706878662, "logits/rejected": -1.7060823440551758, "logps/chosen": -302.7551574707031, "logps/rejected": -326.54541015625, "loss": 0.581, "rewards/accuracies": 0.875, "rewards/chosen": -0.03222353383898735, "rewards/margins": 0.37952807545661926, "rewards/rejected": -0.4117515981197357, "step": 391 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.8205128205128205, "grad_norm": 3.34918640654809, "learning_rate": 4.688834983610082e-08, "logits/chosen": -1.001910924911499, "logits/rejected": -0.7797136902809143, "logps/chosen": -287.0443420410156, "logps/rejected": -267.46661376953125, "loss": 0.6002, "rewards/accuracies": 0.75, "rewards/chosen": 0.1146119013428688, "rewards/margins": 0.2659221291542053, "rewards/rejected": -0.15131022036075592, "step": 392 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.8226059654631083, "grad_norm": 3.2952941450953923, "learning_rate": 4.582640435014459e-08, "logits/chosen": -0.8455122709274292, "logits/rejected": -1.2038440704345703, "logps/chosen": -253.87689208984375, "logps/rejected": -310.3250732421875, "loss": 0.6078, "rewards/accuracies": 0.6875, "rewards/chosen": 0.002484044060111046, "rewards/margins": 0.4840199947357178, "rewards/rejected": -0.48153597116470337, "step": 393 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.8246991104133962, "grad_norm": 3.43188935769205, "learning_rate": 4.477540807448832e-08, "logits/chosen": -1.1995909214019775, "logits/rejected": -1.4026349782943726, "logps/chosen": -347.7198181152344, "logps/rejected": -333.0040588378906, "loss": 0.5796, "rewards/accuracies": 0.6875, "rewards/chosen": 0.05503479391336441, "rewards/margins": 0.4271388649940491, "rewards/rejected": -0.3721040189266205, "step": 394 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.826792255363684, "grad_norm": 3.146582803944609, "learning_rate": 4.373541737087263e-08, "logits/chosen": -1.0510830879211426, "logits/rejected": -1.3736388683319092, "logps/chosen": -341.2471618652344, "logps/rejected": -329.8455505371094, "loss": 0.5999, "rewards/accuracies": 0.625, "rewards/chosen": -0.04440370947122574, "rewards/margins": 0.39059868454933167, "rewards/rejected": -0.4350023865699768, "step": 395 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.8288854003139717, "grad_norm": 2.905169603428801, "learning_rate": 4.270648801084295e-08, "logits/chosen": -1.3081679344177246, "logits/rejected": -1.6274116039276123, "logps/chosen": -285.16815185546875, "logps/rejected": -277.05126953125, "loss": 0.5866, "rewards/accuracies": 0.8125, "rewards/chosen": 0.11289995908737183, "rewards/margins": 0.370134562253952, "rewards/rejected": -0.2572346031665802, "step": 396 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.8309785452642595, "grad_norm": 3.2725027286355357, "learning_rate": 4.168867517275806e-08, "logits/chosen": -0.8182617425918579, "logits/rejected": -1.325834035873413, "logps/chosen": -283.5909729003906, "logps/rejected": -330.6864318847656, "loss": 0.5854, "rewards/accuracies": 0.8125, "rewards/chosen": 0.12649551033973694, "rewards/margins": 0.5855914950370789, "rewards/rejected": -0.4590959846973419, "step": 397 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.8330716902145474, "grad_norm": 2.8784732910375643, "learning_rate": 4.0682033438831584e-08, "logits/chosen": -1.058720350265503, "logits/rejected": -1.2680238485336304, "logps/chosen": -299.87347412109375, "logps/rejected": -311.7471008300781, "loss": 0.5773, "rewards/accuracies": 0.75, "rewards/chosen": 0.07535728812217712, "rewards/margins": 0.40021175146102905, "rewards/rejected": -0.32485443353652954, "step": 398 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.8351648351648352, "grad_norm": 3.17128860637903, "learning_rate": 3.968661679220467e-08, "logits/chosen": -1.4112520217895508, "logits/rejected": -1.595922827720642, "logps/chosen": -353.90716552734375, "logps/rejected": -380.09527587890625, "loss": 0.5626, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07107022404670715, "rewards/margins": 0.29196488857269287, "rewards/rejected": -0.3630351126194, "step": 399 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.837257980115123, "grad_norm": 3.226152151057545, "learning_rate": 3.8702478614051345e-08, "logits/chosen": -0.859539806842804, "logits/rejected": -0.854383647441864, "logps/chosen": -243.15301513671875, "logps/rejected": -282.77581787109375, "loss": 0.5997, "rewards/accuracies": 0.375, "rewards/chosen": -0.25701195001602173, "rewards/margins": 0.16655617952346802, "rewards/rejected": -0.42356812953948975, "step": 400 }, { "epoch": 0.837257980115123, "eval_dpo_lambda": 0.9500001072883606, "eval_logits/chosen": -1.1255731582641602, "eval_logits/rejected": -1.2248106002807617, "eval_logps/chosen": -334.57720947265625, "eval_logps/rejected": -345.5813293457031, "eval_loss": 0.5897952914237976, "eval_rewards/accuracies": 0.7222222089767456, "eval_rewards/chosen": -0.012656522914767265, "eval_rewards/margins": 0.37579214572906494, "eval_rewards/rejected": -0.38844865560531616, "eval_runtime": 129.2013, "eval_samples_per_second": 15.48, "eval_steps_per_second": 0.488, "step": 400 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.8393511250654108, "grad_norm": 2.936415752619779, "learning_rate": 3.772967168071517e-08, "logits/chosen": -1.0231287479400635, "logits/rejected": -1.6366195678710938, "logps/chosen": -403.4915771484375, "logps/rejected": -270.6023254394531, "loss": 0.6034, "rewards/accuracies": 0.875, "rewards/chosen": -0.012652808800339699, "rewards/margins": 0.4569967985153198, "rewards/rejected": -0.46964961290359497, "step": 401 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.8414442700156985, "grad_norm": 2.9813767996331677, "learning_rate": 3.676824816087978e-08, "logits/chosen": -1.4093921184539795, "logits/rejected": -1.3950053453445435, "logps/chosen": -451.0011901855469, "logps/rejected": -364.6950378417969, "loss": 0.562, "rewards/accuracies": 0.8125, "rewards/chosen": 0.08801425248384476, "rewards/margins": 0.47349080443382263, "rewards/rejected": -0.38547655940055847, "step": 402 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.8435374149659864, "grad_norm": 3.6126682587682852, "learning_rate": 3.581825961277074e-08, "logits/chosen": -1.1978733539581299, "logits/rejected": -1.3086341619491577, "logps/chosen": -357.6504211425781, "logps/rejected": -486.1517639160156, "loss": 0.6004, "rewards/accuracies": 0.5, "rewards/chosen": -0.26101547479629517, "rewards/margins": -0.004345163702964783, "rewards/rejected": -0.2566703259944916, "step": 403 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.8456305599162742, "grad_norm": 3.0882048614335775, "learning_rate": 3.487975698139084e-08, "logits/chosen": -0.846460223197937, "logits/rejected": -1.512641429901123, "logps/chosen": -342.6937561035156, "logps/rejected": -300.4619445800781, "loss": 0.6109, "rewards/accuracies": 0.8125, "rewards/chosen": 0.0065054334700107574, "rewards/margins": 0.382173091173172, "rewards/rejected": -0.37566763162612915, "step": 404 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.847723704866562, "grad_norm": 2.9888302049354785, "learning_rate": 3.3952790595787986e-08, "logits/chosen": -0.8430641889572144, "logits/rejected": -1.3741079568862915, "logps/chosen": -346.3929138183594, "logps/rejected": -302.869873046875, "loss": 0.5848, "rewards/accuracies": 0.6875, "rewards/chosen": -0.13285376131534576, "rewards/margins": 0.2974400818347931, "rewards/rejected": -0.43029382824897766, "step": 405 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.8498168498168498, "grad_norm": 3.180748510247276, "learning_rate": 3.303741016635614e-08, "logits/chosen": -0.8760824203491211, "logits/rejected": -0.9473685026168823, "logps/chosen": -305.699951171875, "logps/rejected": -347.35443115234375, "loss": 0.5632, "rewards/accuracies": 0.75, "rewards/chosen": 0.1274721324443817, "rewards/margins": 0.526026725769043, "rewards/rejected": -0.39855456352233887, "step": 406 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.8519099947671376, "grad_norm": 3.766907275683148, "learning_rate": 3.2133664782169944e-08, "logits/chosen": -0.9324281811714172, "logits/rejected": -1.0408509969711304, "logps/chosen": -346.48260498046875, "logps/rejected": -303.20654296875, "loss": 0.5665, "rewards/accuracies": 0.8125, "rewards/chosen": -0.10819912701845169, "rewards/margins": 0.45785000920295715, "rewards/rejected": -0.5660492181777954, "step": 407 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.8540031397174255, "grad_norm": 3.3977504026452174, "learning_rate": 3.12416029083514e-08, "logits/chosen": -1.5843658447265625, "logits/rejected": -1.6134032011032104, "logps/chosen": -291.4720764160156, "logps/rejected": -357.4535827636719, "loss": 0.6164, "rewards/accuracies": 0.8125, "rewards/chosen": -0.12114998698234558, "rewards/margins": 0.38358548283576965, "rewards/rejected": -0.5047354698181152, "step": 408 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.8560962846677133, "grad_norm": 4.534029627178779, "learning_rate": 3.036127238347164e-08, "logits/chosen": -1.1062606573104858, "logits/rejected": -1.036970615386963, "logps/chosen": -325.1647033691406, "logps/rejected": -382.50390625, "loss": 0.6088, "rewards/accuracies": 0.75, "rewards/chosen": 0.13931109011173248, "rewards/margins": 0.46783575415611267, "rewards/rejected": -0.3285245895385742, "step": 409 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.858189429618001, "grad_norm": 3.3300745632307716, "learning_rate": 2.9492720416985e-08, "logits/chosen": -1.2156195640563965, "logits/rejected": -1.2286605834960938, "logps/chosen": -344.3565979003906, "logps/rejected": -343.79779052734375, "loss": 0.6044, "rewards/accuracies": 0.75, "rewards/chosen": 0.23161619901657104, "rewards/margins": 0.40767335891723633, "rewards/rejected": -0.17605718970298767, "step": 410 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.8602825745682888, "grad_norm": 3.660218660909633, "learning_rate": 2.863599358669755e-08, "logits/chosen": -1.2867145538330078, "logits/rejected": -1.0989384651184082, "logps/chosen": -399.7301330566406, "logps/rejected": -442.936767578125, "loss": 0.6191, "rewards/accuracies": 0.625, "rewards/chosen": -0.1534915417432785, "rewards/margins": 0.18057143688201904, "rewards/rejected": -0.33406299352645874, "step": 411 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.8623757195185766, "grad_norm": 4.056033506432057, "learning_rate": 2.7791137836269158e-08, "logits/chosen": -1.018944263458252, "logits/rejected": -1.320552110671997, "logps/chosen": -281.3681640625, "logps/rejected": -265.7995300292969, "loss": 0.5576, "rewards/accuracies": 0.75, "rewards/chosen": -0.014759466052055359, "rewards/margins": 0.383958637714386, "rewards/rejected": -0.39871811866760254, "step": 412 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.8644688644688645, "grad_norm": 3.0801302791514718, "learning_rate": 2.6958198472749717e-08, "logits/chosen": -1.572458267211914, "logits/rejected": -1.7468321323394775, "logps/chosen": -298.75628662109375, "logps/rejected": -416.5832824707031, "loss": 0.5984, "rewards/accuracies": 0.8125, "rewards/chosen": -0.03307090327143669, "rewards/margins": 0.45480668544769287, "rewards/rejected": -0.48787763714790344, "step": 413 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.8665620094191523, "grad_norm": 3.1139897027943566, "learning_rate": 2.613722016414943e-08, "logits/chosen": -1.1548796892166138, "logits/rejected": -1.3051416873931885, "logps/chosen": -227.35589599609375, "logps/rejected": -281.3026428222656, "loss": 0.5572, "rewards/accuracies": 0.6875, "rewards/chosen": 0.05905963480472565, "rewards/margins": 0.3928758502006531, "rewards/rejected": -0.3338162302970886, "step": 414 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.8686551543694401, "grad_norm": 3.168746220886711, "learning_rate": 2.5328246937043525e-08, "logits/chosen": -1.2107877731323242, "logits/rejected": -1.0023467540740967, "logps/chosen": -174.06605529785156, "logps/rejected": -278.3609619140625, "loss": 0.5916, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06353476643562317, "rewards/margins": 0.3717324137687683, "rewards/rejected": -0.43526721000671387, "step": 415 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.8707482993197279, "grad_norm": 3.0670289018488326, "learning_rate": 2.4531322174210973e-08, "logits/chosen": -0.46390390396118164, "logits/rejected": -1.0033340454101562, "logps/chosen": -332.8857421875, "logps/rejected": -306.626708984375, "loss": 0.5844, "rewards/accuracies": 0.5625, "rewards/chosen": -0.07684755325317383, "rewards/margins": 0.27887627482414246, "rewards/rejected": -0.3557237982749939, "step": 416 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.8728414442700158, "grad_norm": 3.174254257608862, "learning_rate": 2.3746488612308295e-08, "logits/chosen": -0.6462736129760742, "logits/rejected": -1.1193785667419434, "logps/chosen": -321.9003601074219, "logps/rejected": -307.1663818359375, "loss": 0.5795, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3036060929298401, "rewards/margins": 0.2366805523633957, "rewards/rejected": -0.540286660194397, "step": 417 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.8749345892203035, "grad_norm": 3.2392614562475, "learning_rate": 2.297378833957761e-08, "logits/chosen": -1.1161251068115234, "logits/rejected": -0.8608018159866333, "logps/chosen": -383.7115478515625, "logps/rejected": -468.5533752441406, "loss": 0.5658, "rewards/accuracies": 0.875, "rewards/chosen": -0.20031166076660156, "rewards/margins": 0.43779081106185913, "rewards/rejected": -0.6381024718284607, "step": 418 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.8770277341705913, "grad_norm": 3.2670292840527875, "learning_rate": 2.2213262793589482e-08, "logits/chosen": -1.5432257652282715, "logits/rejected": -1.149251937866211, "logps/chosen": -285.0873718261719, "logps/rejected": -347.3460693359375, "loss": 0.5622, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0543360635638237, "rewards/margins": 0.1881767064332962, "rewards/rejected": -0.2425127923488617, "step": 419 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.8791208791208791, "grad_norm": 3.107710111688003, "learning_rate": 2.1464952759020856e-08, "logits/chosen": -1.4026098251342773, "logits/rejected": -1.5206897258758545, "logps/chosen": -413.05377197265625, "logps/rejected": -396.6631774902344, "loss": 0.5597, "rewards/accuracies": 0.5, "rewards/chosen": -0.04296074062585831, "rewards/margins": 0.40356194972991943, "rewards/rejected": -0.44652271270751953, "step": 420 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.8812140240711669, "grad_norm": 2.9813330717937876, "learning_rate": 2.07288983654679e-08, "logits/chosen": -1.0447331666946411, "logits/rejected": -1.171353816986084, "logps/chosen": -313.85870361328125, "logps/rejected": -311.4779357910156, "loss": 0.5641, "rewards/accuracies": 0.6875, "rewards/chosen": -0.05653198063373566, "rewards/margins": 0.3648712635040283, "rewards/rejected": -0.4214032292366028, "step": 421 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.8833071690214548, "grad_norm": 3.829897013260754, "learning_rate": 2.0005139085293942e-08, "logits/chosen": -1.1140737533569336, "logits/rejected": -1.3278985023498535, "logps/chosen": -332.94598388671875, "logps/rejected": -305.7933349609375, "loss": 0.6232, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0229061096906662, "rewards/margins": 0.3411436975002289, "rewards/rejected": -0.3640497922897339, "step": 422 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.8854003139717426, "grad_norm": 3.1063375475092387, "learning_rate": 1.9293713731512673e-08, "logits/chosen": -1.15253746509552, "logits/rejected": -1.4765141010284424, "logps/chosen": -327.35003662109375, "logps/rejected": -371.173828125, "loss": 0.5705, "rewards/accuracies": 0.75, "rewards/chosen": 0.08236168324947357, "rewards/margins": 0.5512325763702393, "rewards/rejected": -0.4688709080219269, "step": 423 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.8874934589220304, "grad_norm": 2.9582859361130462, "learning_rate": 1.8594660455706763e-08, "logits/chosen": -1.4676004648208618, "logits/rejected": -1.5918136835098267, "logps/chosen": -282.777587890625, "logps/rejected": -282.3938903808594, "loss": 0.5526, "rewards/accuracies": 0.5625, "rewards/chosen": -0.1971668154001236, "rewards/margins": 0.3437334895133972, "rewards/rejected": -0.5409002900123596, "step": 424 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.8895866038723181, "grad_norm": 3.123824295200849, "learning_rate": 1.7908016745981856e-08, "logits/chosen": -1.6513912677764893, "logits/rejected": -1.5601446628570557, "logps/chosen": -366.196533203125, "logps/rejected": -438.4700622558594, "loss": 0.5651, "rewards/accuracies": 0.625, "rewards/chosen": -0.04288112372159958, "rewards/margins": 0.3245621919631958, "rewards/rejected": -0.367443323135376, "step": 425 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.8916797488226059, "grad_norm": 3.6190591164465684, "learning_rate": 1.7233819424956247e-08, "logits/chosen": -0.8792276382446289, "logits/rejected": -1.1027016639709473, "logps/chosen": -553.1680908203125, "logps/rejected": -415.5176086425781, "loss": 0.5813, "rewards/accuracies": 0.8125, "rewards/chosen": 0.014700181782245636, "rewards/margins": 0.5792965888977051, "rewards/rejected": -0.56459641456604, "step": 426 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.8937728937728938, "grad_norm": 3.8316201082451653, "learning_rate": 1.6572104647786245e-08, "logits/chosen": -1.6309349536895752, "logits/rejected": -1.6912909746170044, "logps/chosen": -348.33795166015625, "logps/rejected": -412.5161437988281, "loss": 0.5992, "rewards/accuracies": 0.5625, "rewards/chosen": -0.13494598865509033, "rewards/margins": 0.125399649143219, "rewards/rejected": -0.2603456377983093, "step": 427 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.8958660387231816, "grad_norm": 3.0319617686204863, "learning_rate": 1.5922907900227017e-08, "logits/chosen": -1.1016099452972412, "logits/rejected": -1.5737460851669312, "logps/chosen": -346.3306579589844, "logps/rejected": -378.94708251953125, "loss": 0.5662, "rewards/accuracies": 0.625, "rewards/chosen": -0.12931616604328156, "rewards/margins": 0.3607315421104431, "rewards/rejected": -0.4900476932525635, "step": 428 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.8979591836734694, "grad_norm": 2.9495925841167154, "learning_rate": 1.5286263996730026e-08, "logits/chosen": -0.9166581630706787, "logits/rejected": -0.8029398322105408, "logps/chosen": -265.19793701171875, "logps/rejected": -290.7116394042969, "loss": 0.549, "rewards/accuracies": 0.8125, "rewards/chosen": 0.03339245170354843, "rewards/margins": 0.2928437888622284, "rewards/rejected": -0.25945132970809937, "step": 429 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.9000523286237572, "grad_norm": 3.013944432077002, "learning_rate": 1.4662207078575684e-08, "logits/chosen": -0.7334145307540894, "logits/rejected": -1.2314000129699707, "logps/chosen": -358.03668212890625, "logps/rejected": -297.3995361328125, "loss": 0.6003, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0846734493970871, "rewards/margins": 0.20193952322006226, "rewards/rejected": -0.28661295771598816, "step": 430 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.902145473574045, "grad_norm": 3.0458813501428983, "learning_rate": 1.40507706120426e-08, "logits/chosen": -1.4258354902267456, "logits/rejected": -1.7249447107315063, "logps/chosen": -276.9074401855469, "logps/rejected": -226.23248291015625, "loss": 0.5757, "rewards/accuracies": 0.5625, "rewards/chosen": -0.13859999179840088, "rewards/margins": 0.33903825283050537, "rewards/rejected": -0.47763827443122864, "step": 431 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.9042386185243328, "grad_norm": 3.4122488730429135, "learning_rate": 1.345198738661285e-08, "logits/chosen": -0.7685383558273315, "logits/rejected": -0.9603262543678284, "logps/chosen": -325.9361572265625, "logps/rejected": -296.58868408203125, "loss": 0.5995, "rewards/accuracies": 0.6875, "rewards/chosen": 0.18472544848918915, "rewards/margins": 0.3669314980506897, "rewards/rejected": -0.18220606446266174, "step": 432 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.9063317634746206, "grad_norm": 3.052961452668848, "learning_rate": 1.2865889513213628e-08, "logits/chosen": -1.2096422910690308, "logits/rejected": -1.1499203443527222, "logps/chosen": -197.70555114746094, "logps/rejected": -279.8352966308594, "loss": 0.5713, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1538601666688919, "rewards/margins": 0.2419377565383911, "rewards/rejected": -0.3957979083061218, "step": 433 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.9084249084249084, "grad_norm": 3.3334398534353684, "learning_rate": 1.2292508422495157e-08, "logits/chosen": -0.9015167355537415, "logits/rejected": -1.1574257612228394, "logps/chosen": -327.7990417480469, "logps/rejected": -282.1093444824219, "loss": 0.5872, "rewards/accuracies": 0.625, "rewards/chosen": -0.004913732409477234, "rewards/margins": 0.27218225598335266, "rewards/rejected": -0.2770960032939911, "step": 434 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.9105180533751962, "grad_norm": 3.3602331750584034, "learning_rate": 1.1731874863145142e-08, "logits/chosen": -1.058158040046692, "logits/rejected": -1.2722753286361694, "logps/chosen": -335.21978759765625, "logps/rejected": -309.15728759765625, "loss": 0.5988, "rewards/accuracies": 0.8125, "rewards/chosen": -0.10927972942590714, "rewards/margins": 0.2762323021888733, "rewards/rejected": -0.3855120539665222, "step": 435 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.912611198325484, "grad_norm": 3.2059083882946102, "learning_rate": 1.118401890024001e-08, "logits/chosen": -1.0633395910263062, "logits/rejected": -1.2969701290130615, "logps/chosen": -252.75567626953125, "logps/rejected": -263.0828552246094, "loss": 0.5918, "rewards/accuracies": 0.8125, "rewards/chosen": 0.11592428386211395, "rewards/margins": 0.48797333240509033, "rewards/rejected": -0.3720490336418152, "step": 436 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.9147043432757719, "grad_norm": 3.0727835096570133, "learning_rate": 1.06489699136324e-08, "logits/chosen": -1.2614545822143555, "logits/rejected": -1.331321358680725, "logps/chosen": -442.27093505859375, "logps/rejected": -383.12646484375, "loss": 0.6014, "rewards/accuracies": 0.5, "rewards/chosen": -0.21684342622756958, "rewards/margins": 0.20827914774417877, "rewards/rejected": -0.42512252926826477, "step": 437 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.9167974882260597, "grad_norm": 3.0074696050405874, "learning_rate": 1.0126756596375685e-08, "logits/chosen": -1.2663462162017822, "logits/rejected": -1.4684832096099854, "logps/chosen": -268.9501953125, "logps/rejected": -265.1226501464844, "loss": 0.5861, "rewards/accuracies": 0.5625, "rewards/chosen": -0.017824526876211166, "rewards/margins": 0.1599656641483307, "rewards/rejected": -0.17779019474983215, "step": 438 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.9188906331763474, "grad_norm": 3.032757366747028, "learning_rate": 9.617406953185136e-09, "logits/chosen": -1.1892074346542358, "logits/rejected": -1.1546742916107178, "logps/chosen": -252.57630920410156, "logps/rejected": -278.7446594238281, "loss": 0.5204, "rewards/accuracies": 0.6875, "rewards/chosen": 0.09857797622680664, "rewards/margins": 0.32015323638916016, "rewards/rejected": -0.22157524526119232, "step": 439 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.9209837781266352, "grad_norm": 3.2065844426513066, "learning_rate": 9.12094829893642e-09, "logits/chosen": -1.0980162620544434, "logits/rejected": -1.3246887922286987, "logps/chosen": -391.4311828613281, "logps/rejected": -320.11346435546875, "loss": 0.5693, "rewards/accuracies": 0.5625, "rewards/chosen": -0.15473929047584534, "rewards/margins": 0.0732923373579979, "rewards/rejected": -0.22803160548210144, "step": 440 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.9230769230769231, "grad_norm": 2.9717261942927773, "learning_rate": 8.637407257200496e-09, "logits/chosen": -0.7083653807640076, "logits/rejected": -0.7791855931282043, "logps/chosen": -221.68829345703125, "logps/rejected": -293.51824951171875, "loss": 0.5788, "rewards/accuracies": 0.75, "rewards/chosen": 0.04367516189813614, "rewards/margins": 0.2928394377231598, "rewards/rejected": -0.24916428327560425, "step": 441 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.9251700680272109, "grad_norm": 3.93913530280895, "learning_rate": 8.166809758815895e-09, "logits/chosen": -1.2606542110443115, "logits/rejected": -1.228628158569336, "logps/chosen": -337.81787109375, "logps/rejected": -348.7516784667969, "loss": 0.5886, "rewards/accuracies": 0.75, "rewards/chosen": 0.04720412939786911, "rewards/margins": 0.5092675685882568, "rewards/rejected": -0.46206343173980713, "step": 442 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.9272632129774987, "grad_norm": 3.1007919933088597, "learning_rate": 7.709181040498253e-09, "logits/chosen": -0.8253033757209778, "logits/rejected": -1.166407585144043, "logps/chosen": -338.40093994140625, "logps/rejected": -292.34283447265625, "loss": 0.5771, "rewards/accuracies": 0.8125, "rewards/chosen": -0.027752190828323364, "rewards/margins": 0.38949188590049744, "rewards/rejected": -0.4172440469264984, "step": 443 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.9293563579277865, "grad_norm": 3.382649823038388, "learning_rate": 7.2645456434869965e-09, "logits/chosen": -1.1709818840026855, "logits/rejected": -1.425568699836731, "logps/chosen": -239.03799438476562, "logps/rejected": -259.7547607421875, "loss": 0.6266, "rewards/accuracies": 0.625, "rewards/chosen": -0.10446663200855255, "rewards/margins": 0.1513005495071411, "rewards/rejected": -0.25576722621917725, "step": 444 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.9314495028780743, "grad_norm": 3.3088688536000777, "learning_rate": 6.832927412229017e-09, "logits/chosen": -1.4180254936218262, "logits/rejected": -1.7165518999099731, "logps/chosen": -300.8059387207031, "logps/rejected": -288.5361633300781, "loss": 0.5706, "rewards/accuracies": 0.6875, "rewards/chosen": -0.000718429684638977, "rewards/margins": 0.37574297189712524, "rewards/rejected": -0.3764614164829254, "step": 445 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.9335426478283622, "grad_norm": 3.696588197945613, "learning_rate": 6.414349493100129e-09, "logits/chosen": -0.6548866033554077, "logits/rejected": -1.1064519882202148, "logps/chosen": -332.73876953125, "logps/rejected": -320.54595947265625, "loss": 0.5724, "rewards/accuracies": 0.6875, "rewards/chosen": 0.004259917885065079, "rewards/margins": 0.3434597849845886, "rewards/rejected": -0.33919987082481384, "step": 446 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.9356357927786499, "grad_norm": 3.0966349568375136, "learning_rate": 6.0088343331638756e-09, "logits/chosen": -0.9749784469604492, "logits/rejected": -0.7469868659973145, "logps/chosen": -442.6228942871094, "logps/rejected": -515.1798095703125, "loss": 0.5793, "rewards/accuracies": 0.75, "rewards/chosen": 0.03440067917108536, "rewards/margins": 0.2795267105102539, "rewards/rejected": -0.24512605369091034, "step": 447 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.9377289377289377, "grad_norm": 3.6920485858982923, "learning_rate": 5.616403678967624e-09, "logits/chosen": -0.899390459060669, "logits/rejected": -1.196982502937317, "logps/chosen": -337.94671630859375, "logps/rejected": -391.2649230957031, "loss": 0.5788, "rewards/accuracies": 0.6875, "rewards/chosen": -0.18063896894454956, "rewards/margins": 0.30657681822776794, "rewards/rejected": -0.4872157573699951, "step": 448 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.9398220826792255, "grad_norm": 3.2036956055572725, "learning_rate": 5.2370785753763356e-09, "logits/chosen": -1.1814134120941162, "logits/rejected": -1.5648890733718872, "logps/chosen": -319.4839782714844, "logps/rejected": -373.0506591796875, "loss": 0.5976, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2478373944759369, "rewards/margins": 0.33236271142959595, "rewards/rejected": -0.5802000761032104, "step": 449 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.9419152276295133, "grad_norm": 2.976277059857243, "learning_rate": 4.8708793644441086e-09, "logits/chosen": -1.677223801612854, "logits/rejected": -1.7529618740081787, "logps/chosen": -225.3381805419922, "logps/rejected": -285.229736328125, "loss": 0.5708, "rewards/accuracies": 0.75, "rewards/chosen": -0.16200092434883118, "rewards/margins": 0.4109874963760376, "rewards/rejected": -0.5729883909225464, "step": 450 }, { "epoch": 0.9419152276295133, "eval_dpo_lambda": 0.9500001072883606, "eval_logits/chosen": -1.1199915409088135, "eval_logits/rejected": -1.219022274017334, "eval_logps/chosen": -335.0126647949219, "eval_logps/rejected": -346.4959411621094, "eval_loss": 0.5889970064163208, "eval_rewards/accuracies": 0.7301587462425232, "eval_rewards/chosen": -0.017011789605021477, "eval_rewards/margins": 0.3805830180644989, "eval_rewards/rejected": -0.39759477972984314, "eval_runtime": 127.3398, "eval_samples_per_second": 15.706, "eval_steps_per_second": 0.495, "step": 450 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.9440083725798012, "grad_norm": 3.0129948197979006, "learning_rate": 4.517825684323323e-09, "logits/chosen": -1.1246261596679688, "logits/rejected": -1.126707911491394, "logps/chosen": -285.481201171875, "logps/rejected": -258.5925598144531, "loss": 0.5787, "rewards/accuracies": 0.5625, "rewards/chosen": -0.25798118114471436, "rewards/margins": 0.14677190780639648, "rewards/rejected": -0.40475308895111084, "step": 451 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.946101517530089, "grad_norm": 3.711628885499933, "learning_rate": 4.1779364682113794e-09, "logits/chosen": -0.7498070001602173, "logits/rejected": -1.0126219987869263, "logps/chosen": -370.9879455566406, "logps/rejected": -308.6629333496094, "loss": 0.6146, "rewards/accuracies": 0.75, "rewards/chosen": -0.043602943420410156, "rewards/margins": 0.2808014154434204, "rewards/rejected": -0.32440435886383057, "step": 452 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.9481946624803768, "grad_norm": 3.0508873950949806, "learning_rate": 3.851229943335393e-09, "logits/chosen": -1.4863890409469604, "logits/rejected": -1.6739808320999146, "logps/chosen": -310.1466979980469, "logps/rejected": -320.50848388671875, "loss": 0.5529, "rewards/accuracies": 0.8125, "rewards/chosen": 0.07640773802995682, "rewards/margins": 0.43768224120140076, "rewards/rejected": -0.36127451062202454, "step": 453 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.9502878074306645, "grad_norm": 3.2360238286339733, "learning_rate": 3.5377236299748147e-09, "logits/chosen": -1.3143385648727417, "logits/rejected": -1.3345000743865967, "logps/chosen": -261.9361877441406, "logps/rejected": -252.34402465820312, "loss": 0.6018, "rewards/accuracies": 0.625, "rewards/chosen": -0.10826227068901062, "rewards/margins": 0.18725652992725372, "rewards/rejected": -0.29551878571510315, "step": 454 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.9523809523809523, "grad_norm": 3.310678587578471, "learning_rate": 3.2374343405217884e-09, "logits/chosen": -1.5579487085342407, "logits/rejected": -1.8721368312835693, "logps/chosen": -260.0130920410156, "logps/rejected": -280.2405090332031, "loss": 0.5831, "rewards/accuracies": 0.8125, "rewards/chosen": 0.0800006315112114, "rewards/margins": 0.44366368651390076, "rewards/rejected": -0.36366304755210876, "step": 455 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.9544740973312402, "grad_norm": 3.0990226430114403, "learning_rate": 2.9503781785795713e-09, "logits/chosen": -1.097597360610962, "logits/rejected": -1.1685129404067993, "logps/chosen": -258.849365234375, "logps/rejected": -236.74749755859375, "loss": 0.5663, "rewards/accuracies": 0.875, "rewards/chosen": 0.08834816515445709, "rewards/margins": 0.6104041934013367, "rewards/rejected": -0.5220560431480408, "step": 456 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.956567242281528, "grad_norm": 2.851250278871114, "learning_rate": 2.6765705380989432e-09, "logits/chosen": -1.72226881980896, "logits/rejected": -1.3489344120025635, "logps/chosen": -283.7491455078125, "logps/rejected": -375.00238037109375, "loss": 0.573, "rewards/accuracies": 0.75, "rewards/chosen": 0.009540770202875137, "rewards/margins": 0.33223041892051697, "rewards/rejected": -0.32268965244293213, "step": 457 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.9586603872318158, "grad_norm": 3.6873687759683325, "learning_rate": 2.416026102552732e-09, "logits/chosen": -0.7740952968597412, "logits/rejected": -0.7481369972229004, "logps/chosen": -443.49639892578125, "logps/rejected": -392.90948486328125, "loss": 0.6096, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1254246085882187, "rewards/margins": 0.3135533630847931, "rewards/rejected": -0.438977986574173, "step": 458 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.9607535321821036, "grad_norm": 3.414694617859811, "learning_rate": 2.168758844148272e-09, "logits/chosen": -1.2945045232772827, "logits/rejected": -1.5964264869689941, "logps/chosen": -382.6551513671875, "logps/rejected": -384.6936340332031, "loss": 0.6073, "rewards/accuracies": 0.75, "rewards/chosen": -0.12786780297756195, "rewards/margins": 0.31756332516670227, "rewards/rejected": -0.445431113243103, "step": 459 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.9628466771323915, "grad_norm": 3.6142106378581635, "learning_rate": 1.9347820230782295e-09, "logits/chosen": -1.1631758213043213, "logits/rejected": -1.4611421823501587, "logps/chosen": -375.6534423828125, "logps/rejected": -348.94989013671875, "loss": 0.5701, "rewards/accuracies": 0.875, "rewards/chosen": 0.11676093935966492, "rewards/margins": 0.6981862783432007, "rewards/rejected": -0.5814253687858582, "step": 460 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.9649398220826793, "grad_norm": 3.1889923293777067, "learning_rate": 1.7141081868094209e-09, "logits/chosen": -1.1662390232086182, "logits/rejected": -1.4079844951629639, "logps/chosen": -389.94677734375, "logps/rejected": -363.14068603515625, "loss": 0.5545, "rewards/accuracies": 0.6875, "rewards/chosen": 0.05117765814065933, "rewards/margins": 0.3404976725578308, "rewards/rejected": -0.2893199920654297, "step": 461 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.967032967032967, "grad_norm": 3.2969225856170588, "learning_rate": 1.5067491694100153e-09, "logits/chosen": -1.0681743621826172, "logits/rejected": -0.9535655975341797, "logps/chosen": -432.03955078125, "logps/rejected": -350.5995788574219, "loss": 0.581, "rewards/accuracies": 0.5625, "rewards/chosen": -0.14307783544063568, "rewards/margins": 0.07278827577829361, "rewards/rejected": -0.2158661186695099, "step": 462 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.9691261119832548, "grad_norm": 3.9337765441509074, "learning_rate": 1.3127160909147672e-09, "logits/chosen": -1.1827539205551147, "logits/rejected": -1.6625728607177734, "logps/chosen": -394.8363342285156, "logps/rejected": -335.9792175292969, "loss": 0.5706, "rewards/accuracies": 0.625, "rewards/chosen": 0.008750967681407928, "rewards/margins": 0.18456429243087769, "rewards/rejected": -0.17581330239772797, "step": 463 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.9712192569335426, "grad_norm": 3.14559889273045, "learning_rate": 1.1320193567288527e-09, "logits/chosen": -1.052103877067566, "logits/rejected": -1.4543917179107666, "logps/chosen": -409.89727783203125, "logps/rejected": -260.9820251464844, "loss": 0.5906, "rewards/accuracies": 0.6875, "rewards/chosen": -0.02242114394903183, "rewards/margins": 0.2970905900001526, "rewards/rejected": -0.3195117712020874, "step": 464 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.9733124018838305, "grad_norm": 3.1699162214344208, "learning_rate": 9.64668657069706e-10, "logits/chosen": -1.118654727935791, "logits/rejected": -1.4434138536453247, "logps/chosen": -313.4532470703125, "logps/rejected": -298.4122314453125, "loss": 0.5826, "rewards/accuracies": 0.625, "rewards/chosen": -0.16541258990764618, "rewards/margins": 0.2686760723590851, "rewards/rejected": -0.43408867716789246, "step": 465 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.9754055468341183, "grad_norm": 3.0687237296661203, "learning_rate": 8.106729664475176e-10, "logits/chosen": -1.155794382095337, "logits/rejected": -1.5298821926116943, "logps/chosen": -223.0940704345703, "logps/rejected": -304.4292907714844, "loss": 0.5819, "rewards/accuracies": 0.875, "rewards/chosen": 0.128111332654953, "rewards/margins": 0.444845587015152, "rewards/rejected": -0.316734254360199, "step": 466 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.9774986917844061, "grad_norm": 2.9086673211471137, "learning_rate": 6.700405431837585e-10, "logits/chosen": -0.8144406080245972, "logits/rejected": -0.9702533483505249, "logps/chosen": -254.71255493164062, "logps/rejected": -284.7623596191406, "loss": 0.5529, "rewards/accuracies": 0.6875, "rewards/chosen": 0.08053113520145416, "rewards/margins": 0.31915003061294556, "rewards/rejected": -0.2386188507080078, "step": 467 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.9795918367346939, "grad_norm": 2.853138826629461, "learning_rate": 5.427789289685347e-10, "logits/chosen": -1.1961573362350464, "logits/rejected": -1.179906964302063, "logps/chosen": -331.8739318847656, "logps/rejected": -373.998291015625, "loss": 0.5423, "rewards/accuracies": 0.875, "rewards/chosen": 0.0800902470946312, "rewards/margins": 0.45181548595428467, "rewards/rejected": -0.3717252314090729, "step": 468 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.9816849816849816, "grad_norm": 2.9114452300427613, "learning_rate": 4.288949484559934e-10, "logits/chosen": -1.0562822818756104, "logits/rejected": -0.7359100580215454, "logps/chosen": -277.19854736328125, "logps/rejected": -455.24652099609375, "loss": 0.5797, "rewards/accuracies": 0.625, "rewards/chosen": -0.11454301327466965, "rewards/margins": 0.3727719485759735, "rewards/rejected": -0.48731493949890137, "step": 469 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.9837781266352695, "grad_norm": 3.196059083815297, "learning_rate": 3.2839470889836627e-10, "logits/chosen": -1.0566866397857666, "logits/rejected": -0.8914412260055542, "logps/chosen": -373.61700439453125, "logps/rejected": -465.02532958984375, "loss": 0.6069, "rewards/accuracies": 0.5625, "rewards/chosen": -0.1425824612379074, "rewards/margins": 0.3075590133666992, "rewards/rejected": -0.4501414895057678, "step": 470 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.9858712715855573, "grad_norm": 3.6368119106222454, "learning_rate": 2.412835998185092e-10, "logits/chosen": -1.4378622770309448, "logits/rejected": -1.6878738403320312, "logps/chosen": -582.3114013671875, "logps/rejected": -437.59552001953125, "loss": 0.5562, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1123276948928833, "rewards/margins": 0.23780681192874908, "rewards/rejected": -0.3501345217227936, "step": 471 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.9879644165358451, "grad_norm": 3.6113790961285006, "learning_rate": 1.6756629272085544e-10, "logits/chosen": -0.9051448702812195, "logits/rejected": -1.0294338464736938, "logps/chosen": -447.9708557128906, "logps/rejected": -438.98529052734375, "loss": 0.6559, "rewards/accuracies": 0.625, "rewards/chosen": -0.022281933575868607, "rewards/margins": 0.09695281833410263, "rewards/rejected": -0.11923475563526154, "step": 472 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.9900575614861329, "grad_norm": 3.0035036696890627, "learning_rate": 1.072467408408384e-10, "logits/chosen": -0.9700950384140015, "logits/rejected": -1.111785888671875, "logps/chosen": -310.5140686035156, "logps/rejected": -339.4414978027344, "loss": 0.579, "rewards/accuracies": 0.6875, "rewards/chosen": -0.17494910955429077, "rewards/margins": 0.36721929907798767, "rewards/rejected": -0.5421684384346008, "step": 473 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.9921507064364207, "grad_norm": 2.7028457840053792, "learning_rate": 6.032817893297793e-11, "logits/chosen": -1.0764236450195312, "logits/rejected": -1.064743995666504, "logps/chosen": -351.79107666015625, "logps/rejected": -359.93548583984375, "loss": 0.5393, "rewards/accuracies": 0.875, "rewards/chosen": 0.11068565398454666, "rewards/margins": 0.5748571157455444, "rewards/rejected": -0.46417149901390076, "step": 474 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.9942438513867086, "grad_norm": 3.1892615454695314, "learning_rate": 2.6813123097352287e-11, "logits/chosen": -0.9911459684371948, "logits/rejected": -1.4481531381607056, "logps/chosen": -230.3305206298828, "logps/rejected": -256.53192138671875, "loss": 0.5923, "rewards/accuracies": 0.625, "rewards/chosen": 0.07679829001426697, "rewards/margins": 0.34001997113227844, "rewards/rejected": -0.2632216811180115, "step": 475 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.9963369963369964, "grad_norm": 2.958470567512489, "learning_rate": 6.7033706447061635e-12, "logits/chosen": -1.9690698385238647, "logits/rejected": -1.7968908548355103, "logps/chosen": -252.61404418945312, "logps/rejected": -352.68341064453125, "loss": 0.5904, "rewards/accuracies": 0.625, "rewards/chosen": -0.18716391921043396, "rewards/margins": 0.16536082327365875, "rewards/rejected": -0.3525247275829315, "step": 476 }, { "dpo_lambda": 0.949999988079071, "epoch": 0.9984301412872841, "grad_norm": 2.9694086844971213, "learning_rate": 0.0, "logits/chosen": -1.4441354274749756, "logits/rejected": -1.4043502807617188, "logps/chosen": -575.747314453125, "logps/rejected": -454.4081726074219, "loss": 0.5683, "rewards/accuracies": 0.8125, "rewards/chosen": 0.13661468029022217, "rewards/margins": 0.673367977142334, "rewards/rejected": -0.5367532968521118, "step": 477 }, { "epoch": 0.9984301412872841, "step": 477, "total_flos": 0.0, "train_loss": 0.6220506337203819, "train_runtime": 15789.8628, "train_samples_per_second": 3.872, "train_steps_per_second": 0.03 } ], "logging_steps": 1, "max_steps": 477, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }