{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.6468797564687976, "eval_steps": 50, "global_step": 850, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0076103500761035, "grad_norm": 0.058339186012744904, "learning_rate": 4.999451708687114e-06, "logits/chosen": 14.268467903137207, "logits/rejected": 14.600369453430176, "logps/chosen": -0.2669850289821625, "logps/rejected": -0.3412467837333679, "loss": 0.9049, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4004775583744049, "rewards/margins": 0.11139259487390518, "rewards/rejected": -0.5118702054023743, "step": 10 }, { "epoch": 0.015220700152207, "grad_norm": 0.049545690417289734, "learning_rate": 4.997807075247147e-06, "logits/chosen": 14.14539623260498, "logits/rejected": 15.191584587097168, "logps/chosen": -0.25579872727394104, "logps/rejected": -0.3931494653224945, "loss": 0.8989, "rewards/accuracies": 0.625, "rewards/chosen": -0.38369807600975037, "rewards/margins": 0.2060261219739914, "rewards/rejected": -0.5897241830825806, "step": 20 }, { "epoch": 0.0228310502283105, "grad_norm": 0.061699289828538895, "learning_rate": 4.9950668210706795e-06, "logits/chosen": 14.284139633178711, "logits/rejected": 15.006326675415039, "logps/chosen": -0.275672048330307, "logps/rejected": -0.3603581488132477, "loss": 0.9004, "rewards/accuracies": 0.625, "rewards/chosen": -0.4135080873966217, "rewards/margins": 0.12702910602092743, "rewards/rejected": -0.5405372381210327, "step": 30 }, { "epoch": 0.030441400304414, "grad_norm": 0.05706426501274109, "learning_rate": 4.9912321481237616e-06, "logits/chosen": 14.275796890258789, "logits/rejected": 14.935521125793457, "logps/chosen": -0.2802076041698456, "logps/rejected": -0.38278770446777344, "loss": 0.9138, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.42031145095825195, "rewards/margins": 0.15387018024921417, "rewards/rejected": -0.5741815567016602, "step": 40 }, { "epoch": 0.0380517503805175, "grad_norm": 0.05318514257669449, "learning_rate": 4.986304738420684e-06, "logits/chosen": 14.433627128601074, "logits/rejected": 15.458297729492188, "logps/chosen": -0.2581387162208557, "logps/rejected": -0.38208404183387756, "loss": 0.914, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.38720807433128357, "rewards/margins": 0.18591801822185516, "rewards/rejected": -0.5731261372566223, "step": 50 }, { "epoch": 0.0380517503805175, "eval_logits/chosen": 14.396967887878418, "eval_logits/rejected": 15.221076965332031, "eval_logps/chosen": -0.27519574761390686, "eval_logps/rejected": -0.3709692656993866, "eval_loss": 0.9084128141403198, "eval_rewards/accuracies": 0.5981308221817017, "eval_rewards/chosen": -0.4127936065196991, "eval_rewards/margins": 0.14366032183170319, "eval_rewards/rejected": -0.5564539432525635, "eval_runtime": 30.773, "eval_samples_per_second": 27.622, "eval_steps_per_second": 3.477, "step": 50 }, { "epoch": 0.045662100456621, "grad_norm": 0.06310460716485977, "learning_rate": 4.980286753286196e-06, "logits/chosen": 14.548416137695312, "logits/rejected": 15.526041030883789, "logps/chosen": -0.29403647780418396, "logps/rejected": -0.40682005882263184, "loss": 0.9082, "rewards/accuracies": 0.625, "rewards/chosen": -0.44105473160743713, "rewards/margins": 0.1691754311323166, "rewards/rejected": -0.6102300882339478, "step": 60 }, { "epoch": 0.0532724505327245, "grad_norm": 0.1258806735277176, "learning_rate": 4.973180832407471e-06, "logits/chosen": 14.390210151672363, "logits/rejected": 14.817584037780762, "logps/chosen": -0.25258123874664307, "logps/rejected": -0.36392712593078613, "loss": 0.896, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.3788718581199646, "rewards/margins": 0.1670188158750534, "rewards/rejected": -0.5458906888961792, "step": 70 }, { "epoch": 0.060882800608828, "grad_norm": 0.09006265550851822, "learning_rate": 4.964990092676263e-06, "logits/chosen": 13.844560623168945, "logits/rejected": 14.811120986938477, "logps/chosen": -0.2630843222141266, "logps/rejected": -0.3794577717781067, "loss": 0.8977, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3946264684200287, "rewards/margins": 0.17456015944480896, "rewards/rejected": -0.5691865682601929, "step": 80 }, { "epoch": 0.0684931506849315, "grad_norm": 0.07123688608407974, "learning_rate": 4.9557181268217225e-06, "logits/chosen": 13.927327156066895, "logits/rejected": 14.746416091918945, "logps/chosen": -0.25282323360443115, "logps/rejected": -0.3279832601547241, "loss": 0.9092, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.37923485040664673, "rewards/margins": 0.11274002492427826, "rewards/rejected": -0.4919748902320862, "step": 90 }, { "epoch": 0.076103500761035, "grad_norm": 0.08333446085453033, "learning_rate": 4.9453690018345144e-06, "logits/chosen": 14.406118392944336, "logits/rejected": 14.770090103149414, "logps/chosen": -0.28569403290748596, "logps/rejected": -0.3596845269203186, "loss": 0.8932, "rewards/accuracies": 0.5625, "rewards/chosen": -0.42854103446006775, "rewards/margins": 0.11098580062389374, "rewards/rejected": -0.5395268201828003, "step": 100 }, { "epoch": 0.076103500761035, "eval_logits/chosen": 13.925265312194824, "eval_logits/rejected": 14.808513641357422, "eval_logps/chosen": -0.2667020559310913, "eval_logps/rejected": -0.3739235997200012, "eval_loss": 0.8984279036521912, "eval_rewards/accuracies": 0.5981308221817017, "eval_rewards/chosen": -0.40005311369895935, "eval_rewards/margins": 0.16083234548568726, "eval_rewards/rejected": -0.5608854293823242, "eval_runtime": 30.7791, "eval_samples_per_second": 27.616, "eval_steps_per_second": 3.476, "step": 100 }, { "epoch": 0.0837138508371385, "grad_norm": 0.08474570512771606, "learning_rate": 4.933947257182901e-06, "logits/chosen": 13.641456604003906, "logits/rejected": 14.799921035766602, "logps/chosen": -0.2721528708934784, "logps/rejected": -0.38378894329071045, "loss": 0.8995, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.40822935104370117, "rewards/margins": 0.1674540936946869, "rewards/rejected": -0.5756834149360657, "step": 110 }, { "epoch": 0.091324200913242, "grad_norm": 0.1004580408334732, "learning_rate": 4.921457902821578e-06, "logits/chosen": 13.835454940795898, "logits/rejected": 14.882522583007812, "logps/chosen": -0.28507837653160095, "logps/rejected": -0.39737468957901, "loss": 0.8795, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.42761754989624023, "rewards/margins": 0.16844449937343597, "rewards/rejected": -0.5960620641708374, "step": 120 }, { "epoch": 0.0989345509893455, "grad_norm": 0.09537151455879211, "learning_rate": 4.907906416994146e-06, "logits/chosen": 13.607874870300293, "logits/rejected": 14.091131210327148, "logps/chosen": -0.2739318013191223, "logps/rejected": -0.36800479888916016, "loss": 0.8912, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.4108976721763611, "rewards/margins": 0.14110951125621796, "rewards/rejected": -0.5520071983337402, "step": 130 }, { "epoch": 0.106544901065449, "grad_norm": 0.10281535238027573, "learning_rate": 4.893298743830168e-06, "logits/chosen": 12.017224311828613, "logits/rejected": 13.04835319519043, "logps/chosen": -0.24072685837745667, "logps/rejected": -0.36906492710113525, "loss": 0.8908, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.3610902428627014, "rewards/margins": 0.19250717759132385, "rewards/rejected": -0.5535974502563477, "step": 140 }, { "epoch": 0.1141552511415525, "grad_norm": 0.707987368106842, "learning_rate": 4.8776412907378845e-06, "logits/chosen": 12.522550582885742, "logits/rejected": 13.272679328918457, "logps/chosen": -0.2583540081977844, "logps/rejected": -0.3796755075454712, "loss": 0.8867, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.38753098249435425, "rewards/margins": 0.18198221921920776, "rewards/rejected": -0.569513201713562, "step": 150 }, { "epoch": 0.1141552511415525, "eval_logits/chosen": 11.989100456237793, "eval_logits/rejected": 12.92872142791748, "eval_logps/chosen": -0.27158522605895996, "eval_logps/rejected": -0.40521273016929626, "eval_loss": 0.8765817284584045, "eval_rewards/accuracies": 0.5981308221817017, "eval_rewards/chosen": -0.40737783908843994, "eval_rewards/margins": 0.20044119656085968, "eval_rewards/rejected": -0.6078190803527832, "eval_runtime": 30.7739, "eval_samples_per_second": 27.621, "eval_steps_per_second": 3.477, "step": 150 }, { "epoch": 0.121765601217656, "grad_norm": 0.19342070817947388, "learning_rate": 4.860940925593703e-06, "logits/chosen": 11.095940589904785, "logits/rejected": 12.351040840148926, "logps/chosen": -0.24749942123889923, "logps/rejected": -0.43422192335128784, "loss": 0.8762, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.37124913930892944, "rewards/margins": 0.2800838053226471, "rewards/rejected": -0.6513329744338989, "step": 160 }, { "epoch": 0.1293759512937595, "grad_norm": 0.19374576210975647, "learning_rate": 4.84320497372973e-06, "logits/chosen": 10.510068893432617, "logits/rejected": 11.507593154907227, "logps/chosen": -0.26223134994506836, "logps/rejected": -0.43635931611061096, "loss": 0.8581, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.39334696531295776, "rewards/margins": 0.2611919641494751, "rewards/rejected": -0.6545389294624329, "step": 170 }, { "epoch": 0.136986301369863, "grad_norm": 0.20330430567264557, "learning_rate": 4.824441214720629e-06, "logits/chosen": 9.89570140838623, "logits/rejected": 10.669364929199219, "logps/chosen": -0.3143860101699829, "logps/rejected": -0.46989941596984863, "loss": 0.8558, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.47157901525497437, "rewards/margins": 0.23327013850212097, "rewards/rejected": -0.704849123954773, "step": 180 }, { "epoch": 0.1445966514459665, "grad_norm": 0.22942212224006653, "learning_rate": 4.804657878971252e-06, "logits/chosen": 8.887057304382324, "logits/rejected": 9.542157173156738, "logps/chosen": -0.2906036972999573, "logps/rejected": -0.4810206890106201, "loss": 0.8554, "rewards/accuracies": 0.625, "rewards/chosen": -0.4359055459499359, "rewards/margins": 0.28562551736831665, "rewards/rejected": -0.7215310335159302, "step": 190 }, { "epoch": 0.15220700152207, "grad_norm": 0.29071903228759766, "learning_rate": 4.783863644106502e-06, "logits/chosen": 6.791537284851074, "logits/rejected": 7.366445064544678, "logps/chosen": -0.31382033228874207, "logps/rejected": -0.5417486429214478, "loss": 0.838, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.4707304835319519, "rewards/margins": 0.34189245104789734, "rewards/rejected": -0.8126228451728821, "step": 200 }, { "epoch": 0.15220700152207, "eval_logits/chosen": 7.050150394439697, "eval_logits/rejected": 7.516275405883789, "eval_logps/chosen": -0.3289315402507782, "eval_logps/rejected": -0.5481724143028259, "eval_loss": 0.813983678817749, "eval_rewards/accuracies": 0.6168224215507507, "eval_rewards/chosen": -0.4933973252773285, "eval_rewards/margins": 0.3288613557815552, "eval_rewards/rejected": -0.8222586512565613, "eval_runtime": 30.7734, "eval_samples_per_second": 27.621, "eval_steps_per_second": 3.477, "step": 200 }, { "epoch": 0.1598173515981735, "grad_norm": 0.23101097345352173, "learning_rate": 4.762067631165049e-06, "logits/chosen": 5.132790565490723, "logits/rejected": 5.848537445068359, "logps/chosen": -0.33372369408607483, "logps/rejected": -0.5993582010269165, "loss": 0.8212, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5005855560302734, "rewards/margins": 0.3984517455101013, "rewards/rejected": -0.8990373611450195, "step": 210 }, { "epoch": 0.167427701674277, "grad_norm": 0.5136363506317139, "learning_rate": 4.7392794005985324e-06, "logits/chosen": 3.807554244995117, "logits/rejected": 4.600871562957764, "logps/chosen": -0.32092416286468506, "logps/rejected": -0.651642918586731, "loss": 0.7851, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.4813862442970276, "rewards/margins": 0.4960783123970032, "rewards/rejected": -0.977464497089386, "step": 220 }, { "epoch": 0.1750380517503805, "grad_norm": 0.4106898009777069, "learning_rate": 4.715508948078037e-06, "logits/chosen": 2.760650396347046, "logits/rejected": 2.1608071327209473, "logps/chosen": -0.43665003776550293, "logps/rejected": -0.8352751731872559, "loss": 0.7685, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6549750566482544, "rewards/margins": 0.5979377627372742, "rewards/rejected": -1.2529128789901733, "step": 230 }, { "epoch": 0.182648401826484, "grad_norm": 0.4719419479370117, "learning_rate": 4.690766700109659e-06, "logits/chosen": 3.1216347217559814, "logits/rejected": 2.7202537059783936, "logps/chosen": -0.444007933139801, "logps/rejected": -0.7697597742080688, "loss": 0.7474, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6660118699073792, "rewards/margins": 0.4886276125907898, "rewards/rejected": -1.154639482498169, "step": 240 }, { "epoch": 0.1902587519025875, "grad_norm": 0.548523485660553, "learning_rate": 4.665063509461098e-06, "logits/chosen": 1.3678622245788574, "logits/rejected": 0.46835970878601074, "logps/chosen": -0.48227253556251526, "logps/rejected": -0.997289776802063, "loss": 0.7017, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7234088182449341, "rewards/margins": 0.7725256681442261, "rewards/rejected": -1.4959346055984497, "step": 250 }, { "epoch": 0.1902587519025875, "eval_logits/chosen": 2.1362831592559814, "eval_logits/rejected": 1.1932121515274048, "eval_logps/chosen": -0.500978946685791, "eval_logps/rejected": -1.0073517560958862, "eval_loss": 0.6914573907852173, "eval_rewards/accuracies": 0.6542056202888489, "eval_rewards/chosen": -0.7514683604240417, "eval_rewards/margins": 0.7595593929290771, "eval_rewards/rejected": -1.5110276937484741, "eval_runtime": 30.7706, "eval_samples_per_second": 27.624, "eval_steps_per_second": 3.477, "step": 250 }, { "epoch": 0.197869101978691, "grad_norm": 0.700670063495636, "learning_rate": 4.638410650401267e-06, "logits/chosen": 2.537666082382202, "logits/rejected": 1.3070740699768066, "logps/chosen": -0.59038907289505, "logps/rejected": -1.0600087642669678, "loss": 0.6908, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.8855836987495422, "rewards/margins": 0.7044296264648438, "rewards/rejected": -1.5900132656097412, "step": 260 }, { "epoch": 0.2054794520547945, "grad_norm": 0.6454456448554993, "learning_rate": 4.610819813755038e-06, "logits/chosen": 2.312289237976074, "logits/rejected": 1.6705052852630615, "logps/chosen": -0.601074755191803, "logps/rejected": -1.12887442111969, "loss": 0.6868, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.9016121029853821, "rewards/margins": 0.7916995286941528, "rewards/rejected": -1.6933116912841797, "step": 270 }, { "epoch": 0.213089802130898, "grad_norm": 0.8001136183738708, "learning_rate": 4.582303101775249e-06, "logits/chosen": 1.6213299036026, "logits/rejected": 0.9048928022384644, "logps/chosen": -0.6731385588645935, "logps/rejected": -1.3181935548782349, "loss": 0.632, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0097079277038574, "rewards/margins": 0.9675822257995605, "rewards/rejected": -1.977290153503418, "step": 280 }, { "epoch": 0.2207001522070015, "grad_norm": 0.45858490467071533, "learning_rate": 4.55287302283426e-06, "logits/chosen": 1.0463030338287354, "logits/rejected": 0.05798797681927681, "logps/chosen": -0.677167534828186, "logps/rejected": -1.4764039516448975, "loss": 0.6447, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0157512426376343, "rewards/margins": 1.1988548040390015, "rewards/rejected": -2.2146058082580566, "step": 290 }, { "epoch": 0.228310502283105, "grad_norm": 0.5778977870941162, "learning_rate": 4.522542485937369e-06, "logits/chosen": 2.3259291648864746, "logits/rejected": 1.6117414236068726, "logps/chosen": -0.7591919302940369, "logps/rejected": -1.5995824337005615, "loss": 0.5702, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.1387879848480225, "rewards/margins": 1.2605856657028198, "rewards/rejected": -2.3993735313415527, "step": 300 }, { "epoch": 0.228310502283105, "eval_logits/chosen": 1.9625831842422485, "eval_logits/rejected": 1.028193473815918, "eval_logps/chosen": -0.7516441941261292, "eval_logps/rejected": -1.771378517150879, "eval_loss": 0.5786539912223816, "eval_rewards/accuracies": 0.6915887594223022, "eval_rewards/chosen": -1.1274662017822266, "eval_rewards/margins": 1.5296014547348022, "eval_rewards/rejected": -2.6570677757263184, "eval_runtime": 30.7716, "eval_samples_per_second": 27.623, "eval_steps_per_second": 3.477, "step": 300 }, { "epoch": 0.2359208523592085, "grad_norm": 0.5383133292198181, "learning_rate": 4.491324795060491e-06, "logits/chosen": 1.2824015617370605, "logits/rejected": 0.7073851823806763, "logps/chosen": -0.8315173387527466, "logps/rejected": -1.9733762741088867, "loss": 0.587, "rewards/accuracies": 0.75, "rewards/chosen": -1.2472760677337646, "rewards/margins": 1.7127883434295654, "rewards/rejected": -2.96006441116333, "step": 310 }, { "epoch": 0.243531202435312, "grad_norm": 3.721909284591675, "learning_rate": 4.4592336433146e-06, "logits/chosen": 1.993947982788086, "logits/rejected": 1.192871332168579, "logps/chosen": -0.9074883460998535, "logps/rejected": -1.9389015436172485, "loss": 0.5194, "rewards/accuracies": 0.625, "rewards/chosen": -1.3612326383590698, "rewards/margins": 1.5471194982528687, "rewards/rejected": -2.9083518981933594, "step": 320 }, { "epoch": 0.2511415525114155, "grad_norm": 0.9611485004425049, "learning_rate": 4.426283106939474e-06, "logits/chosen": 0.607239305973053, "logits/rejected": 0.040740929543972015, "logps/chosen": -0.9696615934371948, "logps/rejected": -2.3865818977355957, "loss": 0.4715, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.4544923305511475, "rewards/margins": 2.1253809928894043, "rewards/rejected": -3.5798733234405518, "step": 330 }, { "epoch": 0.258751902587519, "grad_norm": 3.716665744781494, "learning_rate": 4.3924876391293915e-06, "logits/chosen": 1.486352801322937, "logits/rejected": 0.860406756401062, "logps/chosen": -0.9488881826400757, "logps/rejected": -2.771193027496338, "loss": 0.4584, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.4233323335647583, "rewards/margins": 2.733457088470459, "rewards/rejected": -4.156789302825928, "step": 340 }, { "epoch": 0.2663622526636225, "grad_norm": 2.496544361114502, "learning_rate": 4.357862063693486e-06, "logits/chosen": 2.1065332889556885, "logits/rejected": 1.4116215705871582, "logps/chosen": -0.9290377497673035, "logps/rejected": -2.717181444168091, "loss": 0.4126, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.393556833267212, "rewards/margins": 2.682215452194214, "rewards/rejected": -4.075772285461426, "step": 350 }, { "epoch": 0.2663622526636225, "eval_logits/chosen": 2.3063719272613525, "eval_logits/rejected": 1.7392665147781372, "eval_logps/chosen": -0.9553582072257996, "eval_logps/rejected": -2.8578038215637207, "eval_loss": 0.43925610184669495, "eval_rewards/accuracies": 0.7196261882781982, "eval_rewards/chosen": -1.433037281036377, "eval_rewards/margins": 2.853668212890625, "eval_rewards/rejected": -4.286705493927002, "eval_runtime": 30.7732, "eval_samples_per_second": 27.621, "eval_steps_per_second": 3.477, "step": 350 }, { "epoch": 0.273972602739726, "grad_norm": 1.0364434719085693, "learning_rate": 4.322421568553529e-06, "logits/chosen": 3.5145366191864014, "logits/rejected": 2.562318801879883, "logps/chosen": -0.9316509366035461, "logps/rejected": -2.7451562881469727, "loss": 0.4566, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3974764347076416, "rewards/margins": 2.7202582359313965, "rewards/rejected": -4.117734432220459, "step": 360 }, { "epoch": 0.2815829528158295, "grad_norm": 0.7246320843696594, "learning_rate": 4.286181699082008e-06, "logits/chosen": 1.6608537435531616, "logits/rejected": 1.27449631690979, "logps/chosen": -1.0797128677368164, "logps/rejected": -3.467390537261963, "loss": 0.4299, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6195694208145142, "rewards/margins": 3.5815162658691406, "rewards/rejected": -5.201085567474365, "step": 370 }, { "epoch": 0.289193302891933, "grad_norm": 0.942298173904419, "learning_rate": 4.249158351283414e-06, "logits/chosen": 2.1106579303741455, "logits/rejected": 1.5492799282073975, "logps/chosen": -1.2671682834625244, "logps/rejected": -3.201054811477661, "loss": 0.4114, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.900752305984497, "rewards/margins": 2.900829792022705, "rewards/rejected": -4.801582336425781, "step": 380 }, { "epoch": 0.2968036529680365, "grad_norm": 0.4278697371482849, "learning_rate": 4.211367764821722e-06, "logits/chosen": 3.2620933055877686, "logits/rejected": 2.7777600288391113, "logps/chosen": -1.0661684274673462, "logps/rejected": -3.025578022003174, "loss": 0.4259, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.599252462387085, "rewards/margins": 2.9391140937805176, "rewards/rejected": -4.53836727142334, "step": 390 }, { "epoch": 0.30441400304414, "grad_norm": 0.6019588708877563, "learning_rate": 4.172826515897146e-06, "logits/chosen": 3.057295560836792, "logits/rejected": 2.397916078567505, "logps/chosen": -1.0584070682525635, "logps/rejected": -3.479670286178589, "loss": 0.4167, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5876106023788452, "rewards/margins": 3.631894588470459, "rewards/rejected": -5.219505310058594, "step": 400 }, { "epoch": 0.30441400304414, "eval_logits/chosen": 3.2906479835510254, "eval_logits/rejected": 2.9191884994506836, "eval_logps/chosen": -1.1546303033828735, "eval_logps/rejected": -3.499722957611084, "eval_loss": 0.4080003499984741, "eval_rewards/accuracies": 0.7102803587913513, "eval_rewards/chosen": -1.731945514678955, "eval_rewards/margins": 3.517639398574829, "eval_rewards/rejected": -5.249584674835205, "eval_runtime": 30.7819, "eval_samples_per_second": 27.614, "eval_steps_per_second": 3.476, "step": 400 }, { "epoch": 0.3120243531202435, "grad_norm": 0.610105037689209, "learning_rate": 4.133551509975264e-06, "logits/chosen": 2.7366433143615723, "logits/rejected": 2.350151538848877, "logps/chosen": -1.3425816297531128, "logps/rejected": -4.451743125915527, "loss": 0.3834, "rewards/accuracies": 0.8125, "rewards/chosen": -2.0138726234436035, "rewards/margins": 4.663742542266846, "rewards/rejected": -6.677615165710449, "step": 410 }, { "epoch": 0.319634703196347, "grad_norm": 0.9136129021644592, "learning_rate": 4.093559974371725e-06, "logits/chosen": 3.8271331787109375, "logits/rejected": 3.666091203689575, "logps/chosen": -1.3493579626083374, "logps/rejected": -3.8569908142089844, "loss": 0.397, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.0240368843078613, "rewards/margins": 3.7614493370056152, "rewards/rejected": -5.785486221313477, "step": 420 }, { "epoch": 0.3272450532724505, "grad_norm": 0.6076493859291077, "learning_rate": 4.052869450695776e-06, "logits/chosen": 3.027143955230713, "logits/rejected": 2.2761549949645996, "logps/chosen": -1.2890465259552002, "logps/rejected": -4.363173961639404, "loss": 0.3623, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.9335696697235107, "rewards/margins": 4.6111907958984375, "rewards/rejected": -6.544760704040527, "step": 430 }, { "epoch": 0.334855403348554, "grad_norm": 1.0694931745529175, "learning_rate": 4.011497787155938e-06, "logits/chosen": 3.989302158355713, "logits/rejected": 3.3767571449279785, "logps/chosen": -1.3799726963043213, "logps/rejected": -4.611227512359619, "loss": 0.3786, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.0699591636657715, "rewards/margins": 4.846882343292236, "rewards/rejected": -6.916840553283691, "step": 440 }, { "epoch": 0.3424657534246575, "grad_norm": 2.3523929119110107, "learning_rate": 3.969463130731183e-06, "logits/chosen": 3.046278953552246, "logits/rejected": 2.7509286403656006, "logps/chosen": -1.577859878540039, "logps/rejected": -4.554004669189453, "loss": 0.3948, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.3667898178100586, "rewards/margins": 4.464217185974121, "rewards/rejected": -6.831006050109863, "step": 450 }, { "epoch": 0.3424657534246575, "eval_logits/chosen": 3.555213451385498, "eval_logits/rejected": 3.359722375869751, "eval_logps/chosen": -1.6125141382217407, "eval_logps/rejected": -4.374329566955566, "eval_loss": 0.3748260736465454, "eval_rewards/accuracies": 0.7943925261497498, "eval_rewards/chosen": -2.418771266937256, "eval_rewards/margins": 4.142723560333252, "eval_rewards/rejected": -6.56149435043335, "eval_runtime": 30.7795, "eval_samples_per_second": 27.616, "eval_steps_per_second": 3.476, "step": 450 }, { "epoch": 0.350076103500761, "grad_norm": 1.113964557647705, "learning_rate": 3.92678391921108e-06, "logits/chosen": 3.61175274848938, "logits/rejected": 3.547903537750244, "logps/chosen": -1.7464786767959595, "logps/rejected": -5.045803070068359, "loss": 0.3717, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.619718074798584, "rewards/margins": 4.948986053466797, "rewards/rejected": -7.568705081939697, "step": 460 }, { "epoch": 0.3576864535768645, "grad_norm": 1.5195355415344238, "learning_rate": 3.88347887310836e-06, "logits/chosen": 3.0807926654815674, "logits/rejected": 3.012016773223877, "logps/chosen": -2.164515733718872, "logps/rejected": -5.039651393890381, "loss": 0.3507, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.2467732429504395, "rewards/margins": 4.312704086303711, "rewards/rejected": -7.55947732925415, "step": 470 }, { "epoch": 0.365296803652968, "grad_norm": 2.3880045413970947, "learning_rate": 3.839566987447492e-06, "logits/chosen": 2.4990105628967285, "logits/rejected": 2.5192058086395264, "logps/chosen": -2.5131685733795166, "logps/rejected": -5.811826705932617, "loss": 0.3326, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.7697532176971436, "rewards/margins": 4.947987079620361, "rewards/rejected": -8.717740058898926, "step": 480 }, { "epoch": 0.3729071537290715, "grad_norm": 4.61068868637085, "learning_rate": 3.795067523432826e-06, "logits/chosen": 2.1001622676849365, "logits/rejected": 2.0562539100646973, "logps/chosen": -2.7572569847106934, "logps/rejected": -6.228929042816162, "loss": 0.3227, "rewards/accuracies": 0.9375, "rewards/chosen": -4.135885715484619, "rewards/margins": 5.207508563995361, "rewards/rejected": -9.343393325805664, "step": 490 }, { "epoch": 0.380517503805175, "grad_norm": 8.403047561645508, "learning_rate": 3.7500000000000005e-06, "logits/chosen": 3.1137287616729736, "logits/rejected": 2.6646764278411865, "logps/chosen": -2.8061861991882324, "logps/rejected": -6.236757755279541, "loss": 0.3422, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -4.2092790603637695, "rewards/margins": 5.1458563804626465, "rewards/rejected": -9.355135917663574, "step": 500 }, { "epoch": 0.380517503805175, "eval_logits/chosen": 3.3083701133728027, "eval_logits/rejected": 3.13222336769104, "eval_logps/chosen": -2.6677865982055664, "eval_logps/rejected": -5.843282222747803, "eval_loss": 0.30595287680625916, "eval_rewards/accuracies": 0.8878504633903503, "eval_rewards/chosen": -4.001679420471191, "eval_rewards/margins": 4.763244152069092, "eval_rewards/rejected": -8.764924049377441, "eval_runtime": 30.7793, "eval_samples_per_second": 27.616, "eval_steps_per_second": 3.476, "step": 500 }, { "epoch": 0.3881278538812785, "grad_norm": 2.3582851886749268, "learning_rate": 3.7043841852542884e-06, "logits/chosen": 2.7116522789001465, "logits/rejected": 2.5776076316833496, "logps/chosen": -2.7367191314697266, "logps/rejected": -6.1324052810668945, "loss": 0.2649, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.10507869720459, "rewards/margins": 5.093530178070068, "rewards/rejected": -9.1986083984375, "step": 510 }, { "epoch": 0.395738203957382, "grad_norm": 2.8183226585388184, "learning_rate": 3.658240087799655e-06, "logits/chosen": 2.327544689178467, "logits/rejected": 2.3745343685150146, "logps/chosen": -2.6957223415374756, "logps/rejected": -6.291537284851074, "loss": 0.295, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.043583393096924, "rewards/margins": 5.393722057342529, "rewards/rejected": -9.437305450439453, "step": 520 }, { "epoch": 0.4033485540334855, "grad_norm": 1.8313360214233398, "learning_rate": 3.611587947962319e-06, "logits/chosen": 2.4468109607696533, "logits/rejected": 2.4551472663879395, "logps/chosen": -2.7839953899383545, "logps/rejected": -6.5379180908203125, "loss": 0.2618, "rewards/accuracies": 0.875, "rewards/chosen": -4.175992965698242, "rewards/margins": 5.63088321685791, "rewards/rejected": -9.806875228881836, "step": 530 }, { "epoch": 0.410958904109589, "grad_norm": 2.2132411003112793, "learning_rate": 3.564448228912682e-06, "logits/chosen": 3.125279664993286, "logits/rejected": 2.7795650959014893, "logps/chosen": -3.349208116531372, "logps/rejected": -6.923414707183838, "loss": 0.2592, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.023811340332031, "rewards/margins": 5.361310005187988, "rewards/rejected": -10.385122299194336, "step": 540 }, { "epoch": 0.4185692541856925, "grad_norm": 6.05848503112793, "learning_rate": 3.516841607689501e-06, "logits/chosen": 2.841399669647217, "logits/rejected": 2.997351884841919, "logps/chosen": -3.256176710128784, "logps/rejected": -7.098822593688965, "loss": 0.2603, "rewards/accuracies": 0.9375, "rewards/chosen": -4.884264945983887, "rewards/margins": 5.763968467712402, "rewards/rejected": -10.648235321044922, "step": 550 }, { "epoch": 0.4185692541856925, "eval_logits/chosen": 3.109469413757324, "eval_logits/rejected": 3.010756492614746, "eval_logps/chosen": -3.216036558151245, "eval_logps/rejected": -6.825747013092041, "eval_loss": 0.27887609601020813, "eval_rewards/accuracies": 0.8878504633903503, "eval_rewards/chosen": -4.824055194854736, "eval_rewards/margins": 5.414565563201904, "eval_rewards/rejected": -10.23862075805664, "eval_runtime": 30.773, "eval_samples_per_second": 27.622, "eval_steps_per_second": 3.477, "step": 550 }, { "epoch": 0.426179604261796, "grad_norm": 1.8734403848648071, "learning_rate": 3.4687889661302577e-06, "logits/chosen": 1.8899682760238647, "logits/rejected": 1.7766664028167725, "logps/chosen": -3.1907763481140137, "logps/rejected": -7.273028373718262, "loss": 0.2669, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.7861647605896, "rewards/margins": 6.123377323150635, "rewards/rejected": -10.909541130065918, "step": 560 }, { "epoch": 0.4337899543378995, "grad_norm": 3.2115261554718018, "learning_rate": 3.4203113817116955e-06, "logits/chosen": 2.563091278076172, "logits/rejected": 2.530696392059326, "logps/chosen": -3.620448589324951, "logps/rejected": -7.546849250793457, "loss": 0.2683, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.430673599243164, "rewards/margins": 5.8896002769470215, "rewards/rejected": -11.320273399353027, "step": 570 }, { "epoch": 0.441400304414003, "grad_norm": 3.684910297393799, "learning_rate": 3.3714301183045382e-06, "logits/chosen": 2.873882293701172, "logits/rejected": 3.193092107772827, "logps/chosen": -3.386859178543091, "logps/rejected": -7.514338493347168, "loss": 0.2715, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -5.080288887023926, "rewards/margins": 6.191219329833984, "rewards/rejected": -11.271509170532227, "step": 580 }, { "epoch": 0.4490106544901065, "grad_norm": 2.661367416381836, "learning_rate": 3.3221666168464584e-06, "logits/chosen": 2.5157277584075928, "logits/rejected": 2.5739080905914307, "logps/chosen": -3.658534526824951, "logps/rejected": -7.9988884925842285, "loss": 0.2454, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.487801551818848, "rewards/margins": 6.510530948638916, "rewards/rejected": -11.998331069946289, "step": 590 }, { "epoch": 0.45662100456621, "grad_norm": 1.8180292844772339, "learning_rate": 3.272542485937369e-06, "logits/chosen": 2.6391870975494385, "logits/rejected": 2.72003173828125, "logps/chosen": -3.382587432861328, "logps/rejected": -8.08546257019043, "loss": 0.2706, "rewards/accuracies": 0.875, "rewards/chosen": -5.073880672454834, "rewards/margins": 7.054312229156494, "rewards/rejected": -12.128194808959961, "step": 600 }, { "epoch": 0.45662100456621, "eval_logits/chosen": 3.235776901245117, "eval_logits/rejected": 3.2310192584991455, "eval_logps/chosen": -3.201641082763672, "eval_logps/rejected": -7.113856315612793, "eval_loss": 0.26078400015830994, "eval_rewards/accuracies": 0.8878504633903503, "eval_rewards/chosen": -4.802461624145508, "eval_rewards/margins": 5.8683247566223145, "eval_rewards/rejected": -10.67078685760498, "eval_runtime": 30.7777, "eval_samples_per_second": 27.617, "eval_steps_per_second": 3.477, "step": 600 }, { "epoch": 0.4642313546423135, "grad_norm": 3.1237454414367676, "learning_rate": 3.222579492361179e-06, "logits/chosen": 3.097729444503784, "logits/rejected": 2.8835232257843018, "logps/chosen": -3.2986862659454346, "logps/rejected": -7.824951171875, "loss": 0.2455, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.948029518127441, "rewards/margins": 6.789399147033691, "rewards/rejected": -11.7374267578125, "step": 610 }, { "epoch": 0.471841704718417, "grad_norm": 2.250023365020752, "learning_rate": 3.1722995515381644e-06, "logits/chosen": 3.303495407104492, "logits/rejected": 3.124060869216919, "logps/chosen": -2.9880855083465576, "logps/rejected": -6.581275939941406, "loss": 0.2754, "rewards/accuracies": 0.875, "rewards/chosen": -4.482128143310547, "rewards/margins": 5.389786243438721, "rewards/rejected": -9.87191390991211, "step": 620 }, { "epoch": 0.4794520547945205, "grad_norm": 4.364448547363281, "learning_rate": 3.121724717912138e-06, "logits/chosen": 2.8994319438934326, "logits/rejected": 2.593780755996704, "logps/chosen": -3.4450290203094482, "logps/rejected": -7.1797990798950195, "loss": 0.2316, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.167544364929199, "rewards/margins": 5.602154731750488, "rewards/rejected": -10.769698143005371, "step": 630 }, { "epoch": 0.487062404870624, "grad_norm": 3.65561580657959, "learning_rate": 3.0708771752766397e-06, "logits/chosen": 3.1075518131256104, "logits/rejected": 2.8703231811523438, "logps/chosen": -3.26599383354187, "logps/rejected": -7.536534786224365, "loss": 0.2549, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.898990631103516, "rewards/margins": 6.4058122634887695, "rewards/rejected": -11.304803848266602, "step": 640 }, { "epoch": 0.4946727549467275, "grad_norm": 3.1211891174316406, "learning_rate": 3.019779227044398e-06, "logits/chosen": 3.2364888191223145, "logits/rejected": 3.3938751220703125, "logps/chosen": -3.538849353790283, "logps/rejected": -7.827691555023193, "loss": 0.2571, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.308274269104004, "rewards/margins": 6.433261871337891, "rewards/rejected": -11.741537094116211, "step": 650 }, { "epoch": 0.4946727549467275, "eval_logits/chosen": 3.091616153717041, "eval_logits/rejected": 3.0459396839141846, "eval_logps/chosen": -3.361125946044922, "eval_logps/rejected": -7.390212535858154, "eval_loss": 0.2536354660987854, "eval_rewards/accuracies": 0.8971962332725525, "eval_rewards/chosen": -5.041689395904541, "eval_rewards/margins": 6.043630599975586, "eval_rewards/rejected": -11.085319519042969, "eval_runtime": 30.7751, "eval_samples_per_second": 27.62, "eval_steps_per_second": 3.477, "step": 650 }, { "epoch": 0.502283105022831, "grad_norm": 4.375415802001953, "learning_rate": 2.9684532864643123e-06, "logits/chosen": 1.8274192810058594, "logits/rejected": 1.9628839492797852, "logps/chosen": -3.083608627319336, "logps/rejected": -8.370513916015625, "loss": 0.2166, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.625412940979004, "rewards/margins": 7.930357456207275, "rewards/rejected": -12.555770874023438, "step": 660 }, { "epoch": 0.5098934550989346, "grad_norm": 3.6583638191223145, "learning_rate": 2.9169218667902562e-06, "logits/chosen": 2.5409281253814697, "logits/rejected": 2.46968150138855, "logps/chosen": -3.252990245819092, "logps/rejected": -7.6487884521484375, "loss": 0.2103, "rewards/accuracies": 0.9375, "rewards/chosen": -4.879485607147217, "rewards/margins": 6.593697547912598, "rewards/rejected": -11.473182678222656, "step": 670 }, { "epoch": 0.517503805175038, "grad_norm": 2.009876251220703, "learning_rate": 2.8652075714060296e-06, "logits/chosen": 3.553900957107544, "logits/rejected": 3.4104526042938232, "logps/chosen": -2.9901890754699707, "logps/rejected": -7.472288608551025, "loss": 0.2405, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.485283374786377, "rewards/margins": 6.723149299621582, "rewards/rejected": -11.208433151245117, "step": 680 }, { "epoch": 0.5251141552511416, "grad_norm": 2.9065611362457275, "learning_rate": 2.813333083910761e-06, "logits/chosen": 2.323111057281494, "logits/rejected": 2.0086140632629395, "logps/chosen": -3.430807590484619, "logps/rejected": -8.105338096618652, "loss": 0.2182, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.146210670471191, "rewards/margins": 7.011796474456787, "rewards/rejected": -12.15800666809082, "step": 690 }, { "epoch": 0.532724505327245, "grad_norm": 4.097139358520508, "learning_rate": 2.761321158169134e-06, "logits/chosen": 1.9292926788330078, "logits/rejected": 2.0105385780334473, "logps/chosen": -3.337139129638672, "logps/rejected": -7.7645721435546875, "loss": 0.245, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.005709171295166, "rewards/margins": 6.641148567199707, "rewards/rejected": -11.646858215332031, "step": 700 }, { "epoch": 0.532724505327245, "eval_logits/chosen": 2.946713924407959, "eval_logits/rejected": 2.912729501724243, "eval_logps/chosen": -3.3646414279937744, "eval_logps/rejected": -7.633481979370117, "eval_loss": 0.2412233203649521, "eval_rewards/accuracies": 0.9065420627593994, "eval_rewards/chosen": -5.046962261199951, "eval_rewards/margins": 6.403261184692383, "eval_rewards/rejected": -11.450223922729492, "eval_runtime": 30.7742, "eval_samples_per_second": 27.621, "eval_steps_per_second": 3.477, "step": 700 }, { "epoch": 0.5403348554033486, "grad_norm": 4.534071922302246, "learning_rate": 2.70919460833079e-06, "logits/chosen": 2.4406304359436035, "logits/rejected": 2.713369607925415, "logps/chosen": -3.4392433166503906, "logps/rejected": -8.228872299194336, "loss": 0.2245, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.158864498138428, "rewards/margins": 7.184444427490234, "rewards/rejected": -12.34330940246582, "step": 710 }, { "epoch": 0.547945205479452, "grad_norm": 1.5941667556762695, "learning_rate": 2.6569762988232838e-06, "logits/chosen": 3.3259758949279785, "logits/rejected": 3.3186354637145996, "logps/chosen": -3.7198212146759033, "logps/rejected": -7.6400909423828125, "loss": 0.2339, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.5797319412231445, "rewards/margins": 5.8804030418396, "rewards/rejected": -11.460134506225586, "step": 720 }, { "epoch": 0.5555555555555556, "grad_norm": 5.972750186920166, "learning_rate": 2.604689134322999e-06, "logits/chosen": 2.455244302749634, "logits/rejected": 2.5398240089416504, "logps/chosen": -3.892965316772461, "logps/rejected": -8.59666633605957, "loss": 0.2041, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.83944845199585, "rewards/margins": 7.055548667907715, "rewards/rejected": -12.894998550415039, "step": 730 }, { "epoch": 0.563165905631659, "grad_norm": 3.1441280841827393, "learning_rate": 2.5523560497083927e-06, "logits/chosen": 2.3029608726501465, "logits/rejected": 2.3662524223327637, "logps/chosen": -3.682513475418091, "logps/rejected": -8.340951919555664, "loss": 0.2723, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -5.523770332336426, "rewards/margins": 6.9876580238342285, "rewards/rejected": -12.511428833007812, "step": 740 }, { "epoch": 0.5707762557077626, "grad_norm": 2.30711030960083, "learning_rate": 2.5e-06, "logits/chosen": 1.8446356058120728, "logits/rejected": 1.9468234777450562, "logps/chosen": -3.808454990386963, "logps/rejected": -8.047523498535156, "loss": 0.1936, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.712681770324707, "rewards/margins": 6.3586015701293945, "rewards/rejected": -12.071284294128418, "step": 750 }, { "epoch": 0.5707762557077626, "eval_logits/chosen": 2.7250914573669434, "eval_logits/rejected": 2.7465131282806396, "eval_logps/chosen": -3.5594334602355957, "eval_logps/rejected": -7.899675369262695, "eval_loss": 0.2329329252243042, "eval_rewards/accuracies": 0.9345794320106506, "eval_rewards/chosen": -5.339150905609131, "eval_rewards/margins": 6.510361671447754, "eval_rewards/rejected": -11.84951114654541, "eval_runtime": 30.7783, "eval_samples_per_second": 27.617, "eval_steps_per_second": 3.476, "step": 750 }, { "epoch": 0.578386605783866, "grad_norm": 3.0719997882843018, "learning_rate": 2.447643950291608e-06, "logits/chosen": 2.3534064292907715, "logits/rejected": 2.401563882827759, "logps/chosen": -3.610807418823242, "logps/rejected": -7.985041618347168, "loss": 0.1847, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.416211128234863, "rewards/margins": 6.561351776123047, "rewards/rejected": -11.97756290435791, "step": 760 }, { "epoch": 0.5859969558599696, "grad_norm": 3.6076536178588867, "learning_rate": 2.3953108656770018e-06, "logits/chosen": 1.969351053237915, "logits/rejected": 2.2207655906677246, "logps/chosen": -2.9666709899902344, "logps/rejected": -8.2462797164917, "loss": 0.2211, "rewards/accuracies": 0.9375, "rewards/chosen": -4.450006484985352, "rewards/margins": 7.919413089752197, "rewards/rejected": -12.369420051574707, "step": 770 }, { "epoch": 0.593607305936073, "grad_norm": 3.9795055389404297, "learning_rate": 2.3430237011767166e-06, "logits/chosen": 2.6336114406585693, "logits/rejected": 2.438262462615967, "logps/chosen": -3.4108245372772217, "logps/rejected": -8.591341018676758, "loss": 0.2306, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.116236209869385, "rewards/margins": 7.770774841308594, "rewards/rejected": -12.887011528015137, "step": 780 }, { "epoch": 0.6012176560121766, "grad_norm": 3.288729667663574, "learning_rate": 2.290805391669212e-06, "logits/chosen": 1.6185804605484009, "logits/rejected": 1.6822645664215088, "logps/chosen": -3.2728798389434814, "logps/rejected": -7.791805267333984, "loss": 0.2329, "rewards/accuracies": 0.9375, "rewards/chosen": -4.909319877624512, "rewards/margins": 6.778387546539307, "rewards/rejected": -11.687707901000977, "step": 790 }, { "epoch": 0.60882800608828, "grad_norm": 2.315206289291382, "learning_rate": 2.238678841830867e-06, "logits/chosen": 2.018578052520752, "logits/rejected": 1.8465204238891602, "logps/chosen": -3.2794997692108154, "logps/rejected": -8.164708137512207, "loss": 0.2082, "rewards/accuracies": 0.9375, "rewards/chosen": -4.919250011444092, "rewards/margins": 7.327812194824219, "rewards/rejected": -12.247061729431152, "step": 800 }, { "epoch": 0.60882800608828, "eval_logits/chosen": 3.08111572265625, "eval_logits/rejected": 3.0503101348876953, "eval_logps/chosen": -3.4727284908294678, "eval_logps/rejected": -7.895880699157715, "eval_loss": 0.22325536608695984, "eval_rewards/accuracies": 0.9252336621284485, "eval_rewards/chosen": -5.209092617034912, "eval_rewards/margins": 6.634730339050293, "eval_rewards/rejected": -11.84382152557373, "eval_runtime": 30.7753, "eval_samples_per_second": 27.62, "eval_steps_per_second": 3.477, "step": 800 }, { "epoch": 0.6164383561643836, "grad_norm": 2.4823191165924072, "learning_rate": 2.186666916089239e-06, "logits/chosen": 2.6345906257629395, "logits/rejected": 2.5206990242004395, "logps/chosen": -3.123882532119751, "logps/rejected": -7.772116184234619, "loss": 0.1931, "rewards/accuracies": 0.9375, "rewards/chosen": -4.685823440551758, "rewards/margins": 6.972352027893066, "rewards/rejected": -11.658174514770508, "step": 810 }, { "epoch": 0.624048706240487, "grad_norm": 1.3935630321502686, "learning_rate": 2.134792428593971e-06, "logits/chosen": 2.4640917778015137, "logits/rejected": 2.531430721282959, "logps/chosen": -3.5584664344787598, "logps/rejected": -8.64016342163086, "loss": 0.205, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.337700366973877, "rewards/margins": 7.622546195983887, "rewards/rejected": -12.960246086120605, "step": 820 }, { "epoch": 0.6316590563165906, "grad_norm": 2.7727415561676025, "learning_rate": 2.0830781332097446e-06, "logits/chosen": 3.5152816772460938, "logits/rejected": 3.3117728233337402, "logps/chosen": -3.820496082305908, "logps/rejected": -8.241179466247559, "loss": 0.2315, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.730743408203125, "rewards/margins": 6.631025791168213, "rewards/rejected": -12.361770629882812, "step": 830 }, { "epoch": 0.639269406392694, "grad_norm": 3.5436055660247803, "learning_rate": 2.031546713535688e-06, "logits/chosen": 2.813575506210327, "logits/rejected": 2.650791645050049, "logps/chosen": -3.552276134490967, "logps/rejected": -8.587823867797852, "loss": 0.1964, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.328413963317871, "rewards/margins": 7.553321838378906, "rewards/rejected": -12.881736755371094, "step": 840 }, { "epoch": 0.6468797564687976, "grad_norm": 4.140048027038574, "learning_rate": 1.9802207729556023e-06, "logits/chosen": 2.695815324783325, "logits/rejected": 2.509413242340088, "logps/chosen": -3.706005573272705, "logps/rejected": -8.673744201660156, "loss": 0.1882, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.55900764465332, "rewards/margins": 7.451608180999756, "rewards/rejected": -13.01061725616455, "step": 850 }, { "epoch": 0.6468797564687976, "eval_logits/chosen": 3.0777766704559326, "eval_logits/rejected": 3.120222806930542, "eval_logps/chosen": -3.801321268081665, "eval_logps/rejected": -8.376675605773926, "eval_loss": 0.21778903901576996, "eval_rewards/accuracies": 0.9345794320106506, "eval_rewards/chosen": -5.701981067657471, "eval_rewards/margins": 6.863031387329102, "eval_rewards/rejected": -12.565014839172363, "eval_runtime": 30.7706, "eval_samples_per_second": 27.624, "eval_steps_per_second": 3.477, "step": 850 } ], "logging_steps": 10, "max_steps": 1500, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.9792225832827617e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }