{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1465, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00682681230532918, "grad_norm": 20.28844809527937, "learning_rate": 6.122448979591837e-08, "logits/chosen": 0.036548737436532974, "logits/rejected": 0.04153463989496231, "logps/chosen": -191.76834106445312, "logps/rejected": -189.42454528808594, "loss": 0.6921, "rewards/accuracies": 0.4296875298023224, "rewards/chosen": 0.00048236188013106585, "rewards/margins": 0.002409812528640032, "rewards/rejected": -0.0019274509977549314, "step": 10 }, { "epoch": 0.01365362461065836, "grad_norm": 21.758270816027, "learning_rate": 1.2925170068027211e-07, "logits/chosen": 0.045435529202222824, "logits/rejected": 0.05586998164653778, "logps/chosen": -187.8279571533203, "logps/rejected": -187.57005310058594, "loss": 0.6934, "rewards/accuracies": 0.47343751788139343, "rewards/chosen": 0.0018052328377962112, "rewards/margins": -0.00019586144480854273, "rewards/rejected": 0.0020010939333587885, "step": 20 }, { "epoch": 0.02048043691598754, "grad_norm": 20.292602508788892, "learning_rate": 1.9727891156462583e-07, "logits/chosen": 0.01841779053211212, "logits/rejected": 0.043382205069065094, "logps/chosen": -185.8760986328125, "logps/rejected": -189.0175323486328, "loss": 0.6933, "rewards/accuracies": 0.5015625357627869, "rewards/chosen": -0.0002999665157403797, "rewards/margins": 2.151366788893938e-05, "rewards/rejected": -0.0003214801545254886, "step": 30 }, { "epoch": 0.02730724922131672, "grad_norm": 19.887585625537866, "learning_rate": 2.653061224489796e-07, "logits/chosen": 0.030143991112709045, "logits/rejected": 0.031897470355033875, "logps/chosen": -193.6663360595703, "logps/rejected": -190.40435791015625, "loss": 0.6926, "rewards/accuracies": 0.5390625, "rewards/chosen": 0.0008976617245934904, "rewards/margins": 0.001484251581132412, "rewards/rejected": -0.0005865898565389216, "step": 40 }, { "epoch": 0.0341340615266459, "grad_norm": 21.04186313593167, "learning_rate": 3.333333333333333e-07, "logits/chosen": 0.049032919108867645, "logits/rejected": 0.06374948471784592, "logps/chosen": -188.4130401611328, "logps/rejected": -190.03564453125, "loss": 0.6953, "rewards/accuracies": 0.4468750059604645, "rewards/chosen": 0.0011686112266033888, "rewards/margins": -0.0039381845854222775, "rewards/rejected": 0.005106796510517597, "step": 50 }, { "epoch": 0.04096087383197508, "grad_norm": 20.494153976987644, "learning_rate": 4.0136054421768705e-07, "logits/chosen": 0.054087888449430466, "logits/rejected": 0.05269278585910797, "logps/chosen": -189.312744140625, "logps/rejected": -184.3561248779297, "loss": 0.6934, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.0027984948828816414, "rewards/margins": -5.139678250998259e-06, "rewards/rejected": 0.0028036346193403006, "step": 60 }, { "epoch": 0.04778768613730426, "grad_norm": 22.523707852506742, "learning_rate": 4.693877551020408e-07, "logits/chosen": 0.03845703601837158, "logits/rejected": 0.04080147296190262, "logps/chosen": -189.47398376464844, "logps/rejected": -190.44366455078125, "loss": 0.6923, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.008863605558872223, "rewards/margins": 0.002069632289931178, "rewards/rejected": 0.006793972570449114, "step": 70 }, { "epoch": 0.05461449844263344, "grad_norm": 19.61999116994199, "learning_rate": 5.374149659863945e-07, "logits/chosen": 0.02700674906373024, "logits/rejected": 0.014029408805072308, "logps/chosen": -189.84165954589844, "logps/rejected": -187.60842895507812, "loss": 0.688, "rewards/accuracies": 0.6015625, "rewards/chosen": 0.019321825355291367, "rewards/margins": 0.01094727497547865, "rewards/rejected": 0.008374550379812717, "step": 80 }, { "epoch": 0.06144131074796262, "grad_norm": 22.05785225177187, "learning_rate": 6.054421768707482e-07, "logits/chosen": 0.01984417997300625, "logits/rejected": 0.025053691118955612, "logps/chosen": -186.66159057617188, "logps/rejected": -189.27655029296875, "loss": 0.6881, "rewards/accuracies": 0.5859375, "rewards/chosen": 0.027713492512702942, "rewards/margins": 0.010677953250706196, "rewards/rejected": 0.01703553833067417, "step": 90 }, { "epoch": 0.0682681230532918, "grad_norm": 20.523376899329886, "learning_rate": 6.734693877551019e-07, "logits/chosen": 0.03126838803291321, "logits/rejected": 0.054031528532505035, "logps/chosen": -190.2693328857422, "logps/rejected": -189.81666564941406, "loss": 0.6859, "rewards/accuracies": 0.6015625, "rewards/chosen": 0.040415603667497635, "rewards/margins": 0.015625018626451492, "rewards/rejected": 0.024790585041046143, "step": 100 }, { "epoch": 0.07509493535862098, "grad_norm": 21.235392584783057, "learning_rate": 7.414965986394558e-07, "logits/chosen": -0.008218009024858475, "logits/rejected": -0.0177446398884058, "logps/chosen": -189.50607299804688, "logps/rejected": -192.59072875976562, "loss": 0.6827, "rewards/accuracies": 0.6078125238418579, "rewards/chosen": 0.05662111937999725, "rewards/margins": 0.022500621154904366, "rewards/rejected": 0.03412050008773804, "step": 110 }, { "epoch": 0.08192174766395016, "grad_norm": 19.963280285650864, "learning_rate": 8.095238095238095e-07, "logits/chosen": -0.048616923391819, "logits/rejected": -0.04890388250350952, "logps/chosen": -197.3944091796875, "logps/rejected": -192.91751098632812, "loss": 0.6817, "rewards/accuracies": 0.6078125238418579, "rewards/chosen": 0.06002511456608772, "rewards/margins": 0.02546420879662037, "rewards/rejected": 0.034560900181531906, "step": 120 }, { "epoch": 0.08874855996927934, "grad_norm": 19.85458405774497, "learning_rate": 8.775510204081632e-07, "logits/chosen": -0.02412007376551628, "logits/rejected": -0.02812131866812706, "logps/chosen": -192.2180938720703, "logps/rejected": -190.7085418701172, "loss": 0.6769, "rewards/accuracies": 0.6609375476837158, "rewards/chosen": 0.07226413488388062, "rewards/margins": 0.03687696158885956, "rewards/rejected": 0.03538716956973076, "step": 130 }, { "epoch": 0.09557537227460852, "grad_norm": 22.035739399905705, "learning_rate": 9.45578231292517e-07, "logits/chosen": -0.03001168556511402, "logits/rejected": -0.00422773277387023, "logps/chosen": -193.58248901367188, "logps/rejected": -189.52310180664062, "loss": 0.6712, "rewards/accuracies": 0.653124988079071, "rewards/chosen": 0.08945093303918839, "rewards/margins": 0.049683406949043274, "rewards/rejected": 0.03976753354072571, "step": 140 }, { "epoch": 0.1024021845799377, "grad_norm": 20.65980177560287, "learning_rate": 9.98482549317147e-07, "logits/chosen": -0.07848148047924042, "logits/rejected": -0.08446665108203888, "logps/chosen": -203.80819702148438, "logps/rejected": -202.5292510986328, "loss": 0.6659, "rewards/accuracies": 0.651562511920929, "rewards/chosen": 0.10370737314224243, "rewards/margins": 0.06292011588811874, "rewards/rejected": 0.04078725352883339, "step": 150 }, { "epoch": 0.10922899688526688, "grad_norm": 20.319118469990684, "learning_rate": 9.908952959028832e-07, "logits/chosen": -0.09329548478126526, "logits/rejected": -0.08749746531248093, "logps/chosen": -185.65835571289062, "logps/rejected": -186.53118896484375, "loss": 0.6661, "rewards/accuracies": 0.6578125357627869, "rewards/chosen": 0.08388800173997879, "rewards/margins": 0.06475642323493958, "rewards/rejected": 0.019131578505039215, "step": 160 }, { "epoch": 0.11605580919059606, "grad_norm": 20.49502936431809, "learning_rate": 9.833080424886191e-07, "logits/chosen": -0.08693637698888779, "logits/rejected": -0.05635486915707588, "logps/chosen": -188.38397216796875, "logps/rejected": -190.4040069580078, "loss": 0.6596, "rewards/accuracies": 0.6390625238418579, "rewards/chosen": 0.04457355663180351, "rewards/margins": 0.08550170809030533, "rewards/rejected": -0.040928155183792114, "step": 170 }, { "epoch": 0.12288262149592524, "grad_norm": 22.028249586292812, "learning_rate": 9.75720789074355e-07, "logits/chosen": -0.07805919647216797, "logits/rejected": -0.0718764141201973, "logps/chosen": -197.16851806640625, "logps/rejected": -197.43580627441406, "loss": 0.6533, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.04377557337284088, "rewards/margins": 0.10366812348365784, "rewards/rejected": -0.059892550110816956, "step": 180 }, { "epoch": 0.12970943380125444, "grad_norm": 22.318401877969084, "learning_rate": 9.68133535660091e-07, "logits/chosen": -0.07107028365135193, "logits/rejected": -0.04754173755645752, "logps/chosen": -190.40989685058594, "logps/rejected": -195.1139373779297, "loss": 0.6431, "rewards/accuracies": 0.671875, "rewards/chosen": 0.049717679619789124, "rewards/margins": 0.12626302242279053, "rewards/rejected": -0.0765453577041626, "step": 190 }, { "epoch": 0.1365362461065836, "grad_norm": 22.36881239648136, "learning_rate": 9.60546282245827e-07, "logits/chosen": -0.11804388463497162, "logits/rejected": -0.08306587487459183, "logps/chosen": -193.70574951171875, "logps/rejected": -196.11013793945312, "loss": 0.6244, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": 0.035508595407009125, "rewards/margins": 0.1746881902217865, "rewards/rejected": -0.13917961716651917, "step": 200 }, { "epoch": 0.1433630584119128, "grad_norm": 22.89524823533437, "learning_rate": 9.52959028831563e-07, "logits/chosen": -0.1726662516593933, "logits/rejected": -0.1550799012184143, "logps/chosen": -203.2538604736328, "logps/rejected": -200.24282836914062, "loss": 0.6277, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.02303643897175789, "rewards/margins": 0.18620665371418, "rewards/rejected": -0.20924308896064758, "step": 210 }, { "epoch": 0.15018987071724196, "grad_norm": 20.757150099669722, "learning_rate": 9.453717754172988e-07, "logits/chosen": -0.18080198764801025, "logits/rejected": -0.14751200377941132, "logps/chosen": -198.46144104003906, "logps/rejected": -200.69223022460938, "loss": 0.6021, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.039264436811208725, "rewards/margins": 0.2533213794231415, "rewards/rejected": -0.2925858199596405, "step": 220 }, { "epoch": 0.15701668302257116, "grad_norm": 21.58011887614948, "learning_rate": 9.377845220030348e-07, "logits/chosen": -0.20878151059150696, "logits/rejected": -0.18805599212646484, "logps/chosen": -197.68267822265625, "logps/rejected": -200.50274658203125, "loss": 0.6107, "rewards/accuracies": 0.671875, "rewards/chosen": -0.09467742592096329, "rewards/margins": 0.2487429678440094, "rewards/rejected": -0.3434203863143921, "step": 230 }, { "epoch": 0.16384349532790032, "grad_norm": 24.074058895354312, "learning_rate": 9.301972685887707e-07, "logits/chosen": -0.2260722517967224, "logits/rejected": -0.1964491903781891, "logps/chosen": -191.25946044921875, "logps/rejected": -196.70068359375, "loss": 0.6126, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.1551636904478073, "rewards/margins": 0.24429623782634735, "rewards/rejected": -0.39945995807647705, "step": 240 }, { "epoch": 0.17067030763322952, "grad_norm": 22.193469540092437, "learning_rate": 9.226100151745068e-07, "logits/chosen": -0.23559394478797913, "logits/rejected": -0.20932801067829132, "logps/chosen": -191.7481689453125, "logps/rejected": -197.9097900390625, "loss": 0.6207, "rewards/accuracies": 0.6578125357627869, "rewards/chosen": -0.23032304644584656, "rewards/margins": 0.262432336807251, "rewards/rejected": -0.4927554130554199, "step": 250 }, { "epoch": 0.17749711993855868, "grad_norm": 23.15984586558187, "learning_rate": 9.150227617602428e-07, "logits/chosen": -0.21696263551712036, "logits/rejected": -0.1862940490245819, "logps/chosen": -190.5767822265625, "logps/rejected": -195.84523010253906, "loss": 0.6057, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.25581681728363037, "rewards/margins": 0.3005717098712921, "rewards/rejected": -0.5563884973526001, "step": 260 }, { "epoch": 0.18432393224388788, "grad_norm": 23.590660195882563, "learning_rate": 9.074355083459787e-07, "logits/chosen": -0.21451206505298615, "logits/rejected": -0.19581295549869537, "logps/chosen": -194.9508056640625, "logps/rejected": -202.88421630859375, "loss": 0.5954, "rewards/accuracies": 0.7015625238418579, "rewards/chosen": -0.27721405029296875, "rewards/margins": 0.30862218141555786, "rewards/rejected": -0.5858362317085266, "step": 270 }, { "epoch": 0.19115074454921704, "grad_norm": 23.62922114673101, "learning_rate": 8.998482549317147e-07, "logits/chosen": -0.26997700333595276, "logits/rejected": -0.2415258288383484, "logps/chosen": -189.7841339111328, "logps/rejected": -194.67752075195312, "loss": 0.5888, "rewards/accuracies": 0.6984375715255737, "rewards/chosen": -0.2856101393699646, "rewards/margins": 0.3423476219177246, "rewards/rejected": -0.6279577016830444, "step": 280 }, { "epoch": 0.19797755685454624, "grad_norm": 26.534317555189272, "learning_rate": 8.922610015174506e-07, "logits/chosen": -0.2865559160709381, "logits/rejected": -0.268317312002182, "logps/chosen": -202.84178161621094, "logps/rejected": -207.90689086914062, "loss": 0.5911, "rewards/accuracies": 0.7015625238418579, "rewards/chosen": -0.33995726704597473, "rewards/margins": 0.3573826551437378, "rewards/rejected": -0.6973399519920349, "step": 290 }, { "epoch": 0.2048043691598754, "grad_norm": 23.626680063365733, "learning_rate": 8.846737481031866e-07, "logits/chosen": -0.2780352830886841, "logits/rejected": -0.242587149143219, "logps/chosen": -201.13905334472656, "logps/rejected": -203.72994995117188, "loss": 0.6114, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -0.3799774646759033, "rewards/margins": 0.32134854793548584, "rewards/rejected": -0.7013260126113892, "step": 300 }, { "epoch": 0.2116311814652046, "grad_norm": 21.614874156586975, "learning_rate": 8.770864946889226e-07, "logits/chosen": -0.2941948175430298, "logits/rejected": -0.2689184844493866, "logps/chosen": -204.522216796875, "logps/rejected": -214.1429443359375, "loss": 0.5795, "rewards/accuracies": 0.7187500596046448, "rewards/chosen": -0.36675944924354553, "rewards/margins": 0.4399191737174988, "rewards/rejected": -0.8066786527633667, "step": 310 }, { "epoch": 0.21845799377053376, "grad_norm": 25.28063384418063, "learning_rate": 8.694992412746586e-07, "logits/chosen": -0.2757799029350281, "logits/rejected": -0.27304551005363464, "logps/chosen": -198.39239501953125, "logps/rejected": -204.60415649414062, "loss": 0.574, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": -0.3858140707015991, "rewards/margins": 0.41097506880760193, "rewards/rejected": -0.7967891097068787, "step": 320 }, { "epoch": 0.22528480607586296, "grad_norm": 23.979378980602498, "learning_rate": 8.619119878603945e-07, "logits/chosen": -0.33364883065223694, "logits/rejected": -0.3188924193382263, "logps/chosen": -207.99539184570312, "logps/rejected": -212.8521270751953, "loss": 0.5916, "rewards/accuracies": 0.676562488079071, "rewards/chosen": -0.47858649492263794, "rewards/margins": 0.37764379382133484, "rewards/rejected": -0.8562303185462952, "step": 330 }, { "epoch": 0.23211161838119213, "grad_norm": 23.546759098667398, "learning_rate": 8.543247344461305e-07, "logits/chosen": -0.3054922819137573, "logits/rejected": -0.28177574276924133, "logps/chosen": -203.67938232421875, "logps/rejected": -211.8566131591797, "loss": 0.5591, "rewards/accuracies": 0.7312500476837158, "rewards/chosen": -0.3886514902114868, "rewards/margins": 0.5112159252166748, "rewards/rejected": -0.8998674154281616, "step": 340 }, { "epoch": 0.23893843068652132, "grad_norm": 23.169862441537628, "learning_rate": 8.467374810318663e-07, "logits/chosen": -0.3280317485332489, "logits/rejected": -0.28759223222732544, "logps/chosen": -204.1493682861328, "logps/rejected": -212.39410400390625, "loss": 0.5519, "rewards/accuracies": 0.7140624523162842, "rewards/chosen": -0.46998512744903564, "rewards/margins": 0.5246675610542297, "rewards/rejected": -0.9946527481079102, "step": 350 }, { "epoch": 0.24576524299185049, "grad_norm": 24.779086737609497, "learning_rate": 8.391502276176023e-07, "logits/chosen": -0.30415648221969604, "logits/rejected": -0.272957980632782, "logps/chosen": -196.55276489257812, "logps/rejected": -204.5069580078125, "loss": 0.5813, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.48177972435951233, "rewards/margins": 0.4307110607624054, "rewards/rejected": -0.9124907851219177, "step": 360 }, { "epoch": 0.25259205529717965, "grad_norm": 23.12378831072875, "learning_rate": 8.315629742033384e-07, "logits/chosen": -0.28424739837646484, "logits/rejected": -0.2549440562725067, "logps/chosen": -197.5590057373047, "logps/rejected": -210.90127563476562, "loss": 0.5664, "rewards/accuracies": 0.714062511920929, "rewards/chosen": -0.5475950837135315, "rewards/margins": 0.4803504943847656, "rewards/rejected": -1.027945637702942, "step": 370 }, { "epoch": 0.2594188676025089, "grad_norm": 21.651196996632308, "learning_rate": 8.239757207890743e-07, "logits/chosen": -0.30731576681137085, "logits/rejected": -0.28506577014923096, "logps/chosen": -204.0982208251953, "logps/rejected": -214.4062957763672, "loss": 0.5418, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5043364763259888, "rewards/margins": 0.5770630240440369, "rewards/rejected": -1.0813994407653809, "step": 380 }, { "epoch": 0.26624567990783804, "grad_norm": 22.751040205953682, "learning_rate": 8.163884673748103e-07, "logits/chosen": -0.2683367133140564, "logits/rejected": -0.2332780659198761, "logps/chosen": -206.3434600830078, "logps/rejected": -217.34039306640625, "loss": 0.532, "rewards/accuracies": 0.7359375357627869, "rewards/chosen": -0.47760826349258423, "rewards/margins": 0.6127501130104065, "rewards/rejected": -1.0903582572937012, "step": 390 }, { "epoch": 0.2730724922131672, "grad_norm": 24.910083016180884, "learning_rate": 8.088012139605462e-07, "logits/chosen": -0.28875064849853516, "logits/rejected": -0.24306923151016235, "logps/chosen": -202.8630828857422, "logps/rejected": -216.6767578125, "loss": 0.5276, "rewards/accuracies": 0.7406250238418579, "rewards/chosen": -0.5778933167457581, "rewards/margins": 0.6203677654266357, "rewards/rejected": -1.198261022567749, "step": 400 }, { "epoch": 0.2798993045184964, "grad_norm": 24.823111882995203, "learning_rate": 8.012139605462822e-07, "logits/chosen": -0.3603791296482086, "logits/rejected": -0.314382940530777, "logps/chosen": -202.1355438232422, "logps/rejected": -208.8005828857422, "loss": 0.5573, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.6083186864852905, "rewards/margins": 0.5520691275596619, "rewards/rejected": -1.1603877544403076, "step": 410 }, { "epoch": 0.2867261168238256, "grad_norm": 24.6727191743662, "learning_rate": 7.936267071320181e-07, "logits/chosen": -0.352464497089386, "logits/rejected": -0.31943339109420776, "logps/chosen": -207.77703857421875, "logps/rejected": -216.52903747558594, "loss": 0.5276, "rewards/accuracies": 0.75, "rewards/chosen": -0.6607990264892578, "rewards/margins": 0.6181461215019226, "rewards/rejected": -1.2789450883865356, "step": 420 }, { "epoch": 0.29355292912915476, "grad_norm": 23.595963095415875, "learning_rate": 7.860394537177542e-07, "logits/chosen": -0.35178378224372864, "logits/rejected": -0.3272789418697357, "logps/chosen": -211.35107421875, "logps/rejected": -227.26947021484375, "loss": 0.5216, "rewards/accuracies": 0.7437500357627869, "rewards/chosen": -0.7554113268852234, "rewards/margins": 0.7442721724510193, "rewards/rejected": -1.4996836185455322, "step": 430 }, { "epoch": 0.3003797414344839, "grad_norm": 21.126042694881086, "learning_rate": 7.784522003034901e-07, "logits/chosen": -0.3556375503540039, "logits/rejected": -0.3088908791542053, "logps/chosen": -203.177001953125, "logps/rejected": -212.28463745117188, "loss": 0.5038, "rewards/accuracies": 0.7843750715255737, "rewards/chosen": -0.6830729246139526, "rewards/margins": 0.719536304473877, "rewards/rejected": -1.4026092290878296, "step": 440 }, { "epoch": 0.3072065537398131, "grad_norm": 29.677764730937284, "learning_rate": 7.708649468892261e-07, "logits/chosen": -0.37930557131767273, "logits/rejected": -0.3498023450374603, "logps/chosen": -209.02462768554688, "logps/rejected": -225.4560089111328, "loss": 0.5226, "rewards/accuracies": 0.7484375238418579, "rewards/chosen": -0.7893091440200806, "rewards/margins": 0.6981508731842041, "rewards/rejected": -1.4874598979949951, "step": 450 }, { "epoch": 0.3140333660451423, "grad_norm": 22.623116660950316, "learning_rate": 7.632776934749621e-07, "logits/chosen": -0.4005587100982666, "logits/rejected": -0.37974676489830017, "logps/chosen": -208.33297729492188, "logps/rejected": -223.84945678710938, "loss": 0.5006, "rewards/accuracies": 0.7640625238418579, "rewards/chosen": -0.7449624538421631, "rewards/margins": 0.8266347646713257, "rewards/rejected": -1.5715970993041992, "step": 460 }, { "epoch": 0.3208601783504715, "grad_norm": 24.70720094293141, "learning_rate": 7.55690440060698e-07, "logits/chosen": -0.4142500162124634, "logits/rejected": -0.39995333552360535, "logps/chosen": -211.68191528320312, "logps/rejected": -222.69406127929688, "loss": 0.4891, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.7804237604141235, "rewards/margins": 0.8243077993392944, "rewards/rejected": -1.6047316789627075, "step": 470 }, { "epoch": 0.32768699065580065, "grad_norm": 25.568499628810063, "learning_rate": 7.481031866464339e-07, "logits/chosen": -0.446720689535141, "logits/rejected": -0.4042230248451233, "logps/chosen": -201.85269165039062, "logps/rejected": -216.66664123535156, "loss": 0.5178, "rewards/accuracies": 0.7406250238418579, "rewards/chosen": -0.8340361714363098, "rewards/margins": 0.7315189838409424, "rewards/rejected": -1.565555214881897, "step": 480 }, { "epoch": 0.3345138029611298, "grad_norm": 25.32075291618558, "learning_rate": 7.405159332321699e-07, "logits/chosen": -0.46066954731941223, "logits/rejected": -0.4040055274963379, "logps/chosen": -215.21507263183594, "logps/rejected": -234.66297912597656, "loss": 0.5151, "rewards/accuracies": 0.745312511920929, "rewards/chosen": -0.9270689487457275, "rewards/margins": 0.8053193688392639, "rewards/rejected": -1.7323882579803467, "step": 490 }, { "epoch": 0.34134061526645904, "grad_norm": 33.13945555979027, "learning_rate": 7.329286798179059e-07, "logits/chosen": -0.41453421115875244, "logits/rejected": -0.38940101861953735, "logps/chosen": -216.4798583984375, "logps/rejected": -225.99053955078125, "loss": 0.5269, "rewards/accuracies": 0.739062488079071, "rewards/chosen": -0.9324908256530762, "rewards/margins": 0.7752447724342346, "rewards/rejected": -1.707735538482666, "step": 500 }, { "epoch": 0.3481674275717882, "grad_norm": 29.438798964644228, "learning_rate": 7.253414264036418e-07, "logits/chosen": -0.45365840196609497, "logits/rejected": -0.4384625256061554, "logps/chosen": -210.4576873779297, "logps/rejected": -227.66439819335938, "loss": 0.4929, "rewards/accuracies": 0.7703125476837158, "rewards/chosen": -0.7669359445571899, "rewards/margins": 0.8243012428283691, "rewards/rejected": -1.5912370681762695, "step": 510 }, { "epoch": 0.35499423987711737, "grad_norm": 30.351745830656164, "learning_rate": 7.177541729893778e-07, "logits/chosen": -0.4614608883857727, "logits/rejected": -0.4203529953956604, "logps/chosen": -203.9699249267578, "logps/rejected": -225.49705505371094, "loss": 0.5141, "rewards/accuracies": 0.7515625357627869, "rewards/chosen": -0.7932397127151489, "rewards/margins": 0.7744376063346863, "rewards/rejected": -1.56767737865448, "step": 520 }, { "epoch": 0.36182105218244653, "grad_norm": 28.0826891393098, "learning_rate": 7.101669195751137e-07, "logits/chosen": -0.44373035430908203, "logits/rejected": -0.4143510162830353, "logps/chosen": -217.09344482421875, "logps/rejected": -232.4281768798828, "loss": 0.5047, "rewards/accuracies": 0.7500000596046448, "rewards/chosen": -0.9070903062820435, "rewards/margins": 0.8483283519744873, "rewards/rejected": -1.7554187774658203, "step": 530 }, { "epoch": 0.36864786448777576, "grad_norm": 24.372697015036685, "learning_rate": 7.025796661608497e-07, "logits/chosen": -0.4468221068382263, "logits/rejected": -0.39906221628189087, "logps/chosen": -199.6805877685547, "logps/rejected": -217.41481018066406, "loss": 0.5279, "rewards/accuracies": 0.75, "rewards/chosen": -0.7898293733596802, "rewards/margins": 0.7459580302238464, "rewards/rejected": -1.5357873439788818, "step": 540 }, { "epoch": 0.3754746767931049, "grad_norm": 23.655888347883682, "learning_rate": 6.949924127465857e-07, "logits/chosen": -0.424525648355484, "logits/rejected": -0.3915669322013855, "logps/chosen": -218.43475341796875, "logps/rejected": -230.16305541992188, "loss": 0.5019, "rewards/accuracies": 0.7703125476837158, "rewards/chosen": -0.7072125673294067, "rewards/margins": 0.8257580399513245, "rewards/rejected": -1.5329705476760864, "step": 550 }, { "epoch": 0.3823014890984341, "grad_norm": 23.724107854224396, "learning_rate": 6.874051593323217e-07, "logits/chosen": -0.4273075759410858, "logits/rejected": -0.39428552985191345, "logps/chosen": -208.09197998046875, "logps/rejected": -228.27288818359375, "loss": 0.4661, "rewards/accuracies": 0.776562511920929, "rewards/chosen": -0.8194674253463745, "rewards/margins": 0.9406121373176575, "rewards/rejected": -1.7600796222686768, "step": 560 }, { "epoch": 0.38912830140376325, "grad_norm": 27.455015118742324, "learning_rate": 6.798179059180577e-07, "logits/chosen": -0.44063428044319153, "logits/rejected": -0.3982015550136566, "logps/chosen": -208.05084228515625, "logps/rejected": -224.2151336669922, "loss": 0.5007, "rewards/accuracies": 0.7671874761581421, "rewards/chosen": -0.8383839726448059, "rewards/margins": 0.7879061698913574, "rewards/rejected": -1.6262900829315186, "step": 570 }, { "epoch": 0.3959551137090925, "grad_norm": 25.123237861481527, "learning_rate": 6.722306525037936e-07, "logits/chosen": -0.44060733914375305, "logits/rejected": -0.40178999304771423, "logps/chosen": -213.97142028808594, "logps/rejected": -234.66355895996094, "loss": 0.5009, "rewards/accuracies": 0.7640625238418579, "rewards/chosen": -0.8785637617111206, "rewards/margins": 0.8950086236000061, "rewards/rejected": -1.7735724449157715, "step": 580 }, { "epoch": 0.40278192601442164, "grad_norm": 23.64031892312566, "learning_rate": 6.646433990895296e-07, "logits/chosen": -0.4277493357658386, "logits/rejected": -0.3930940628051758, "logps/chosen": -210.43264770507812, "logps/rejected": -230.69744873046875, "loss": 0.4728, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.8971768617630005, "rewards/margins": 0.9437196850776672, "rewards/rejected": -1.840896487236023, "step": 590 }, { "epoch": 0.4096087383197508, "grad_norm": 26.066890027531336, "learning_rate": 6.570561456752655e-07, "logits/chosen": -0.4176095128059387, "logits/rejected": -0.3907008171081543, "logps/chosen": -212.6049346923828, "logps/rejected": -230.02601623535156, "loss": 0.4545, "rewards/accuracies": 0.784375011920929, "rewards/chosen": -0.915949821472168, "rewards/margins": 1.011177659034729, "rewards/rejected": -1.9271275997161865, "step": 600 }, { "epoch": 0.41643555062508, "grad_norm": 26.361787585647985, "learning_rate": 6.494688922610015e-07, "logits/chosen": -0.4795023202896118, "logits/rejected": -0.43696874380111694, "logps/chosen": -212.20419311523438, "logps/rejected": -234.36700439453125, "loss": 0.4559, "rewards/accuracies": 0.7796875238418579, "rewards/chosen": -1.0691964626312256, "rewards/margins": 1.075463056564331, "rewards/rejected": -2.1446595191955566, "step": 610 }, { "epoch": 0.4232623629304092, "grad_norm": 26.1477045167914, "learning_rate": 6.418816388467374e-07, "logits/chosen": -0.4670104384422302, "logits/rejected": -0.4489012360572815, "logps/chosen": -212.9263916015625, "logps/rejected": -230.10812377929688, "loss": 0.4773, "rewards/accuracies": 0.7906249761581421, "rewards/chosen": -1.1133571863174438, "rewards/margins": 0.960852324962616, "rewards/rejected": -2.074209451675415, "step": 620 }, { "epoch": 0.43008917523573836, "grad_norm": 27.292995243666663, "learning_rate": 6.342943854324734e-07, "logits/chosen": -0.5064845085144043, "logits/rejected": -0.45950955152511597, "logps/chosen": -210.99880981445312, "logps/rejected": -233.96551513671875, "loss": 0.4687, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.1231608390808105, "rewards/margins": 1.1056054830551147, "rewards/rejected": -2.2287662029266357, "step": 630 }, { "epoch": 0.43691598754106753, "grad_norm": 27.008219472309488, "learning_rate": 6.267071320182093e-07, "logits/chosen": -0.5144222378730774, "logits/rejected": -0.47659891843795776, "logps/chosen": -216.17208862304688, "logps/rejected": -241.91847229003906, "loss": 0.4634, "rewards/accuracies": 0.785937488079071, "rewards/chosen": -1.1570467948913574, "rewards/margins": 1.1083678007125854, "rewards/rejected": -2.2654144763946533, "step": 640 }, { "epoch": 0.4437427998463967, "grad_norm": 26.471519737054525, "learning_rate": 6.191198786039453e-07, "logits/chosen": -0.5065978765487671, "logits/rejected": -0.4691276550292969, "logps/chosen": -220.1791229248047, "logps/rejected": -241.1091766357422, "loss": 0.4658, "rewards/accuracies": 0.770312488079071, "rewards/chosen": -1.1591362953186035, "rewards/margins": 1.0631475448608398, "rewards/rejected": -2.2222838401794434, "step": 650 }, { "epoch": 0.4505696121517259, "grad_norm": 28.17275739709613, "learning_rate": 6.115326251896813e-07, "logits/chosen": -0.5238359570503235, "logits/rejected": -0.4979589581489563, "logps/chosen": -217.52597045898438, "logps/rejected": -234.4109344482422, "loss": 0.521, "rewards/accuracies": 0.7703125476837158, "rewards/chosen": -1.1146113872528076, "rewards/margins": 0.9384186863899231, "rewards/rejected": -2.053030252456665, "step": 660 }, { "epoch": 0.4573964244570551, "grad_norm": 31.93706252257983, "learning_rate": 6.039453717754173e-07, "logits/chosen": -0.49779045581817627, "logits/rejected": -0.4656790494918823, "logps/chosen": -206.24905395507812, "logps/rejected": -230.8712158203125, "loss": 0.4947, "rewards/accuracies": 0.7515624761581421, "rewards/chosen": -1.0656239986419678, "rewards/margins": 1.1138218641281128, "rewards/rejected": -2.179445743560791, "step": 670 }, { "epoch": 0.46422323676238425, "grad_norm": 24.781714808834078, "learning_rate": 5.963581183611533e-07, "logits/chosen": -0.5391644835472107, "logits/rejected": -0.5127192139625549, "logps/chosen": -219.6558074951172, "logps/rejected": -235.97950744628906, "loss": 0.4614, "rewards/accuracies": 0.7890625, "rewards/chosen": -0.971098780632019, "rewards/margins": 1.0510507822036743, "rewards/rejected": -2.0221495628356934, "step": 680 }, { "epoch": 0.47105004906771347, "grad_norm": 27.945074054287147, "learning_rate": 5.887708649468892e-07, "logits/chosen": -0.48153987526893616, "logits/rejected": -0.43504899740219116, "logps/chosen": -211.96145629882812, "logps/rejected": -234.5848388671875, "loss": 0.4352, "rewards/accuracies": 0.792187511920929, "rewards/chosen": -0.9944396018981934, "rewards/margins": 1.1353037357330322, "rewards/rejected": -2.1297435760498047, "step": 690 }, { "epoch": 0.47787686137304264, "grad_norm": 28.51572741404973, "learning_rate": 5.811836115326252e-07, "logits/chosen": -0.4900820851325989, "logits/rejected": -0.45390579104423523, "logps/chosen": -215.57627868652344, "logps/rejected": -239.19187927246094, "loss": 0.4538, "rewards/accuracies": 0.7781250476837158, "rewards/chosen": -1.0837225914001465, "rewards/margins": 1.1697932481765747, "rewards/rejected": -2.2535159587860107, "step": 700 }, { "epoch": 0.4847036736783718, "grad_norm": 32.171436824447824, "learning_rate": 5.735963581183611e-07, "logits/chosen": -0.49004998803138733, "logits/rejected": -0.44813936948776245, "logps/chosen": -211.11868286132812, "logps/rejected": -236.65904235839844, "loss": 0.4482, "rewards/accuracies": 0.784375011920929, "rewards/chosen": -1.0840590000152588, "rewards/margins": 1.1766505241394043, "rewards/rejected": -2.260709524154663, "step": 710 }, { "epoch": 0.49153048598370097, "grad_norm": 27.130561237438616, "learning_rate": 5.660091047040971e-07, "logits/chosen": -0.506287693977356, "logits/rejected": -0.44507527351379395, "logps/chosen": -205.77537536621094, "logps/rejected": -230.67068481445312, "loss": 0.4736, "rewards/accuracies": 0.7750000357627869, "rewards/chosen": -1.1519620418548584, "rewards/margins": 1.0746945142745972, "rewards/rejected": -2.226656675338745, "step": 720 }, { "epoch": 0.4983572982890302, "grad_norm": 23.54195977607552, "learning_rate": 5.584218512898331e-07, "logits/chosen": -0.47103822231292725, "logits/rejected": -0.42172738909721375, "logps/chosen": -214.85769653320312, "logps/rejected": -236.97178649902344, "loss": 0.4464, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1006401777267456, "rewards/margins": 1.1676136255264282, "rewards/rejected": -2.268253803253174, "step": 730 }, { "epoch": 0.5051841105943593, "grad_norm": 24.379091590009676, "learning_rate": 5.508345978755691e-07, "logits/chosen": -0.5123965740203857, "logits/rejected": -0.47382158041000366, "logps/chosen": -214.0547637939453, "logps/rejected": -236.20230102539062, "loss": 0.4352, "rewards/accuracies": 0.7984375357627869, "rewards/chosen": -1.0824706554412842, "rewards/margins": 1.1324328184127808, "rewards/rejected": -2.2149033546447754, "step": 740 }, { "epoch": 0.5120109228996885, "grad_norm": 26.994677156319458, "learning_rate": 5.432473444613049e-07, "logits/chosen": -0.47567224502563477, "logits/rejected": -0.4289511442184448, "logps/chosen": -215.22872924804688, "logps/rejected": -240.09747314453125, "loss": 0.4434, "rewards/accuracies": 0.8046875596046448, "rewards/chosen": -1.116477131843567, "rewards/margins": 1.183769941329956, "rewards/rejected": -2.3002471923828125, "step": 750 }, { "epoch": 0.5188377352050177, "grad_norm": 25.25194090429139, "learning_rate": 5.356600910470409e-07, "logits/chosen": -0.5224145650863647, "logits/rejected": -0.46407467126846313, "logps/chosen": -213.71389770507812, "logps/rejected": -237.69161987304688, "loss": 0.4235, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.0965174436569214, "rewards/margins": 1.2732737064361572, "rewards/rejected": -2.369791030883789, "step": 760 }, { "epoch": 0.5256645475103469, "grad_norm": 27.278156612784407, "learning_rate": 5.280728376327769e-07, "logits/chosen": -0.5006579160690308, "logits/rejected": -0.475443571805954, "logps/chosen": -209.89276123046875, "logps/rejected": -235.71739196777344, "loss": 0.4355, "rewards/accuracies": 0.8031250238418579, "rewards/chosen": -1.149773120880127, "rewards/margins": 1.2504949569702148, "rewards/rejected": -2.400268077850342, "step": 770 }, { "epoch": 0.5324913598156761, "grad_norm": 23.589718462559564, "learning_rate": 5.204855842185128e-07, "logits/chosen": -0.526742696762085, "logits/rejected": -0.47116631269454956, "logps/chosen": -217.98532104492188, "logps/rejected": -245.28933715820312, "loss": 0.4599, "rewards/accuracies": 0.7718750238418579, "rewards/chosen": -1.2864385843276978, "rewards/margins": 1.1668964624404907, "rewards/rejected": -2.4533350467681885, "step": 780 }, { "epoch": 0.5393181721210053, "grad_norm": 30.5610451636278, "learning_rate": 5.128983308042489e-07, "logits/chosen": -0.4980033040046692, "logits/rejected": -0.454012930393219, "logps/chosen": -213.7527618408203, "logps/rejected": -242.71051025390625, "loss": 0.4175, "rewards/accuracies": 0.8203125596046448, "rewards/chosen": -1.2648416757583618, "rewards/margins": 1.2589421272277832, "rewards/rejected": -2.5237839221954346, "step": 790 }, { "epoch": 0.5461449844263344, "grad_norm": 25.860792001570903, "learning_rate": 5.053110773899848e-07, "logits/chosen": -0.5009379982948303, "logits/rejected": -0.45161643624305725, "logps/chosen": -218.66497802734375, "logps/rejected": -247.26480102539062, "loss": 0.4239, "rewards/accuracies": 0.8046875, "rewards/chosen": -1.2370792627334595, "rewards/margins": 1.2767220735549927, "rewards/rejected": -2.513801097869873, "step": 800 }, { "epoch": 0.5529717967316636, "grad_norm": 26.913987383333986, "learning_rate": 4.977238239757208e-07, "logits/chosen": -0.4731639325618744, "logits/rejected": -0.4382101893424988, "logps/chosen": -219.00997924804688, "logps/rejected": -243.0546417236328, "loss": 0.4263, "rewards/accuracies": 0.8015625476837158, "rewards/chosen": -1.2992050647735596, "rewards/margins": 1.2243870496749878, "rewards/rejected": -2.523592233657837, "step": 810 }, { "epoch": 0.5597986090369927, "grad_norm": 28.53426642330379, "learning_rate": 4.901365705614567e-07, "logits/chosen": -0.4574064612388611, "logits/rejected": -0.4343743920326233, "logps/chosen": -220.09823608398438, "logps/rejected": -241.9452362060547, "loss": 0.455, "rewards/accuracies": 0.7968750596046448, "rewards/chosen": -1.3589019775390625, "rewards/margins": 1.2111233472824097, "rewards/rejected": -2.5700252056121826, "step": 820 }, { "epoch": 0.566625421342322, "grad_norm": 32.236623934330666, "learning_rate": 4.825493171471927e-07, "logits/chosen": -0.477167010307312, "logits/rejected": -0.4342673122882843, "logps/chosen": -210.48898315429688, "logps/rejected": -236.6256561279297, "loss": 0.4202, "rewards/accuracies": 0.8109375238418579, "rewards/chosen": -1.266889214515686, "rewards/margins": 1.3714545965194702, "rewards/rejected": -2.6383438110351562, "step": 830 }, { "epoch": 0.5734522336476512, "grad_norm": 22.52023880849803, "learning_rate": 4.7496206373292864e-07, "logits/chosen": -0.4688745141029358, "logits/rejected": -0.4309556484222412, "logps/chosen": -211.64566040039062, "logps/rejected": -246.43594360351562, "loss": 0.3935, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.1579673290252686, "rewards/margins": 1.452370524406433, "rewards/rejected": -2.610337972640991, "step": 840 }, { "epoch": 0.5802790459529803, "grad_norm": 22.86650334456815, "learning_rate": 4.673748103186646e-07, "logits/chosen": -0.45984596014022827, "logits/rejected": -0.4113742709159851, "logps/chosen": -221.2747039794922, "logps/rejected": -251.6842803955078, "loss": 0.3937, "rewards/accuracies": 0.8265625238418579, "rewards/chosen": -1.2885468006134033, "rewards/margins": 1.4187039136886597, "rewards/rejected": -2.7072505950927734, "step": 850 }, { "epoch": 0.5871058582583095, "grad_norm": 24.866961143566478, "learning_rate": 4.597875569044006e-07, "logits/chosen": -0.4843246042728424, "logits/rejected": -0.43403178453445435, "logps/chosen": -217.4392852783203, "logps/rejected": -241.3638153076172, "loss": 0.4269, "rewards/accuracies": 0.7937500476837158, "rewards/chosen": -1.3239818811416626, "rewards/margins": 1.322103500366211, "rewards/rejected": -2.646085500717163, "step": 860 }, { "epoch": 0.5939326705636387, "grad_norm": 27.26389649040254, "learning_rate": 4.5220030349013654e-07, "logits/chosen": -0.5012161135673523, "logits/rejected": -0.4667961299419403, "logps/chosen": -214.83123779296875, "logps/rejected": -237.57041931152344, "loss": 0.4311, "rewards/accuracies": 0.7953125238418579, "rewards/chosen": -1.337403655052185, "rewards/margins": 1.2597852945327759, "rewards/rejected": -2.597188711166382, "step": 870 }, { "epoch": 0.6007594828689679, "grad_norm": 27.440215221554137, "learning_rate": 4.446130500758725e-07, "logits/chosen": -0.5221506953239441, "logits/rejected": -0.47429159283638, "logps/chosen": -218.2428436279297, "logps/rejected": -249.35227966308594, "loss": 0.4219, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.3460763692855835, "rewards/margins": 1.4286969900131226, "rewards/rejected": -2.774773359298706, "step": 880 }, { "epoch": 0.6075862951742971, "grad_norm": 26.890967949171902, "learning_rate": 4.370257966616085e-07, "logits/chosen": -0.5067352056503296, "logits/rejected": -0.4783848822116852, "logps/chosen": -214.87808227539062, "logps/rejected": -240.6841583251953, "loss": 0.4601, "rewards/accuracies": 0.78125, "rewards/chosen": -1.44821298122406, "rewards/margins": 1.3052537441253662, "rewards/rejected": -2.7534666061401367, "step": 890 }, { "epoch": 0.6144131074796262, "grad_norm": 32.237693494920336, "learning_rate": 4.2943854324734444e-07, "logits/chosen": -0.4975440502166748, "logits/rejected": -0.4508504271507263, "logps/chosen": -218.56907653808594, "logps/rejected": -252.4253692626953, "loss": 0.4016, "rewards/accuracies": 0.828125, "rewards/chosen": -1.4073625802993774, "rewards/margins": 1.4989043474197388, "rewards/rejected": -2.9062671661376953, "step": 900 }, { "epoch": 0.6212399197849554, "grad_norm": 29.55632282027641, "learning_rate": 4.2185128983308036e-07, "logits/chosen": -0.5099817514419556, "logits/rejected": -0.45365262031555176, "logps/chosen": -221.89877319335938, "logps/rejected": -250.57550048828125, "loss": 0.4166, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.3778979778289795, "rewards/margins": 1.4088386297225952, "rewards/rejected": -2.7867367267608643, "step": 910 }, { "epoch": 0.6280667320902846, "grad_norm": 23.405437038158652, "learning_rate": 4.142640364188164e-07, "logits/chosen": -0.5026878118515015, "logits/rejected": -0.46861255168914795, "logps/chosen": -224.63916015625, "logps/rejected": -250.61224365234375, "loss": 0.4178, "rewards/accuracies": 0.815625011920929, "rewards/chosen": -1.4456019401550293, "rewards/margins": 1.366437554359436, "rewards/rejected": -2.812039613723755, "step": 920 }, { "epoch": 0.6348935443956137, "grad_norm": 26.317645366377434, "learning_rate": 4.0667678300455234e-07, "logits/chosen": -0.5089496374130249, "logits/rejected": -0.447975218296051, "logps/chosen": -219.5886688232422, "logps/rejected": -253.34963989257812, "loss": 0.3633, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.3742705583572388, "rewards/margins": 1.611161231994629, "rewards/rejected": -2.985431671142578, "step": 930 }, { "epoch": 0.641720356700943, "grad_norm": 21.42106734892436, "learning_rate": 3.990895295902883e-07, "logits/chosen": -0.5624995827674866, "logits/rejected": -0.5080554485321045, "logps/chosen": -222.9444580078125, "logps/rejected": -251.54403686523438, "loss": 0.3967, "rewards/accuracies": 0.8250000476837158, "rewards/chosen": -1.435417890548706, "rewards/margins": 1.4602770805358887, "rewards/rejected": -2.8956949710845947, "step": 940 }, { "epoch": 0.6485471690062722, "grad_norm": 21.661157767536693, "learning_rate": 3.915022761760243e-07, "logits/chosen": -0.5764337182044983, "logits/rejected": -0.5422189235687256, "logps/chosen": -209.9173126220703, "logps/rejected": -239.46363830566406, "loss": 0.4007, "rewards/accuracies": 0.8203125, "rewards/chosen": -1.5316213369369507, "rewards/margins": 1.480596899986267, "rewards/rejected": -3.012218475341797, "step": 950 }, { "epoch": 0.6553739813116013, "grad_norm": 23.414697757548318, "learning_rate": 3.8391502276176024e-07, "logits/chosen": -0.5713392496109009, "logits/rejected": -0.5199674963951111, "logps/chosen": -228.35032653808594, "logps/rejected": -261.5352478027344, "loss": 0.39, "rewards/accuracies": 0.817187488079071, "rewards/chosen": -1.6193687915802002, "rewards/margins": 1.6461690664291382, "rewards/rejected": -3.265537977218628, "step": 960 }, { "epoch": 0.6622007936169305, "grad_norm": 26.16469307265146, "learning_rate": 3.763277693474962e-07, "logits/chosen": -0.5215846300125122, "logits/rejected": -0.4677078127861023, "logps/chosen": -218.17672729492188, "logps/rejected": -251.90655517578125, "loss": 0.4195, "rewards/accuracies": 0.807812511920929, "rewards/chosen": -1.639040470123291, "rewards/margins": 1.440010666847229, "rewards/rejected": -3.0790510177612305, "step": 970 }, { "epoch": 0.6690276059222596, "grad_norm": 24.910240774548882, "learning_rate": 3.687405159332321e-07, "logits/chosen": -0.5399425029754639, "logits/rejected": -0.5003796219825745, "logps/chosen": -223.10562133789062, "logps/rejected": -255.92811584472656, "loss": 0.4148, "rewards/accuracies": 0.8125000596046448, "rewards/chosen": -1.7429447174072266, "rewards/margins": 1.4403008222579956, "rewards/rejected": -3.1832454204559326, "step": 980 }, { "epoch": 0.6758544182275888, "grad_norm": 25.952929350002567, "learning_rate": 3.611532625189681e-07, "logits/chosen": -0.5129621028900146, "logits/rejected": -0.4642283320426941, "logps/chosen": -229.56149291992188, "logps/rejected": -265.0505676269531, "loss": 0.401, "rewards/accuracies": 0.823437511920929, "rewards/chosen": -1.6355669498443604, "rewards/margins": 1.5693683624267578, "rewards/rejected": -3.204935312271118, "step": 990 }, { "epoch": 0.6826812305329181, "grad_norm": 30.276024499323412, "learning_rate": 3.5356600910470406e-07, "logits/chosen": -0.5371091365814209, "logits/rejected": -0.4973250925540924, "logps/chosen": -236.55892944335938, "logps/rejected": -263.94873046875, "loss": 0.3813, "rewards/accuracies": 0.8421875238418579, "rewards/chosen": -1.6107735633850098, "rewards/margins": 1.5443642139434814, "rewards/rejected": -3.155137538909912, "step": 1000 }, { "epoch": 0.6895080428382472, "grad_norm": 29.460156400060427, "learning_rate": 3.459787556904401e-07, "logits/chosen": -0.5438990592956543, "logits/rejected": -0.49257737398147583, "logps/chosen": -223.89715576171875, "logps/rejected": -254.1317138671875, "loss": 0.405, "rewards/accuracies": 0.823437511920929, "rewards/chosen": -1.5855486392974854, "rewards/margins": 1.5699981451034546, "rewards/rejected": -3.1555471420288086, "step": 1010 }, { "epoch": 0.6963348551435764, "grad_norm": 37.79554341081753, "learning_rate": 3.3839150227617604e-07, "logits/chosen": -0.5885217189788818, "logits/rejected": -0.5452876091003418, "logps/chosen": -228.7353515625, "logps/rejected": -262.69781494140625, "loss": 0.3902, "rewards/accuracies": 0.8484375476837158, "rewards/chosen": -1.5996198654174805, "rewards/margins": 1.5858361721038818, "rewards/rejected": -3.185455799102783, "step": 1020 }, { "epoch": 0.7031616674489056, "grad_norm": 32.4485056058962, "learning_rate": 3.30804248861912e-07, "logits/chosen": -0.6067803502082825, "logits/rejected": -0.5617104768753052, "logps/chosen": -224.02381896972656, "logps/rejected": -258.9015808105469, "loss": 0.3977, "rewards/accuracies": 0.8218750357627869, "rewards/chosen": -1.7180209159851074, "rewards/margins": 1.5710700750350952, "rewards/rejected": -3.289091110229492, "step": 1030 }, { "epoch": 0.7099884797542347, "grad_norm": 25.533353588922054, "learning_rate": 3.232169954476479e-07, "logits/chosen": -0.6116800308227539, "logits/rejected": -0.5805966854095459, "logps/chosen": -225.80697631835938, "logps/rejected": -264.60693359375, "loss": 0.3617, "rewards/accuracies": 0.8453124761581421, "rewards/chosen": -1.4660491943359375, "rewards/margins": 1.7193350791931152, "rewards/rejected": -3.1853842735290527, "step": 1040 }, { "epoch": 0.716815292059564, "grad_norm": 26.60381949350959, "learning_rate": 3.156297420333839e-07, "logits/chosen": -0.6242787837982178, "logits/rejected": -0.5573821067810059, "logps/chosen": -226.86032104492188, "logps/rejected": -264.05621337890625, "loss": 0.3629, "rewards/accuracies": 0.8421875238418579, "rewards/chosen": -1.66229248046875, "rewards/margins": 1.770555019378662, "rewards/rejected": -3.432847499847412, "step": 1050 }, { "epoch": 0.7236421043648931, "grad_norm": 23.579508860359553, "learning_rate": 3.0804248861911986e-07, "logits/chosen": -0.5435959696769714, "logits/rejected": -0.49748197197914124, "logps/chosen": -226.20513916015625, "logps/rejected": -257.2373046875, "loss": 0.3933, "rewards/accuracies": 0.8281250596046448, "rewards/chosen": -1.6908609867095947, "rewards/margins": 1.6625466346740723, "rewards/rejected": -3.353407144546509, "step": 1060 }, { "epoch": 0.7304689166702223, "grad_norm": 27.167904414342036, "learning_rate": 3.004552352048558e-07, "logits/chosen": -0.6101936101913452, "logits/rejected": -0.5515713691711426, "logps/chosen": -222.96014404296875, "logps/rejected": -254.43185424804688, "loss": 0.3563, "rewards/accuracies": 0.8421875238418579, "rewards/chosen": -1.3781498670578003, "rewards/margins": 1.5955610275268555, "rewards/rejected": -2.9737110137939453, "step": 1070 }, { "epoch": 0.7372957289755515, "grad_norm": 26.180827032107008, "learning_rate": 2.928679817905918e-07, "logits/chosen": -0.5770156383514404, "logits/rejected": -0.5399613380432129, "logps/chosen": -226.30825805664062, "logps/rejected": -257.7979736328125, "loss": 0.3894, "rewards/accuracies": 0.8312500715255737, "rewards/chosen": -1.5706027746200562, "rewards/margins": 1.551680326461792, "rewards/rejected": -3.1222832202911377, "step": 1080 }, { "epoch": 0.7441225412808806, "grad_norm": 28.76861593583347, "learning_rate": 2.8528072837632776e-07, "logits/chosen": -0.5627835392951965, "logits/rejected": -0.48850327730178833, "logps/chosen": -215.5068359375, "logps/rejected": -251.03070068359375, "loss": 0.3595, "rewards/accuracies": 0.8531249761581421, "rewards/chosen": -1.5191683769226074, "rewards/margins": 1.6994376182556152, "rewards/rejected": -3.2186059951782227, "step": 1090 }, { "epoch": 0.7509493535862098, "grad_norm": 28.410377070052604, "learning_rate": 2.776934749620637e-07, "logits/chosen": -0.5616481304168701, "logits/rejected": -0.5304921865463257, "logps/chosen": -213.49261474609375, "logps/rejected": -240.36477661132812, "loss": 0.4064, "rewards/accuracies": 0.828125, "rewards/chosen": -1.595428466796875, "rewards/margins": 1.5317962169647217, "rewards/rejected": -3.1272246837615967, "step": 1100 }, { "epoch": 0.7577761658915391, "grad_norm": 28.197822913401385, "learning_rate": 2.7010622154779964e-07, "logits/chosen": -0.5820162892341614, "logits/rejected": -0.5277166366577148, "logps/chosen": -224.64027404785156, "logps/rejected": -258.8291320800781, "loss": 0.4045, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5898746252059937, "rewards/margins": 1.5575275421142578, "rewards/rejected": -3.147402048110962, "step": 1110 }, { "epoch": 0.7646029781968682, "grad_norm": 19.603917980899062, "learning_rate": 2.6251896813353566e-07, "logits/chosen": -0.5747621059417725, "logits/rejected": -0.5307395458221436, "logps/chosen": -224.86712646484375, "logps/rejected": -255.05657958984375, "loss": 0.3845, "rewards/accuracies": 0.8203125, "rewards/chosen": -1.4760308265686035, "rewards/margins": 1.6863486766815186, "rewards/rejected": -3.162379264831543, "step": 1120 }, { "epoch": 0.7714297905021974, "grad_norm": 23.578968750795934, "learning_rate": 2.549317147192716e-07, "logits/chosen": -0.5622972846031189, "logits/rejected": -0.5020321607589722, "logps/chosen": -219.63861083984375, "logps/rejected": -253.4189453125, "loss": 0.3657, "rewards/accuracies": 0.84375, "rewards/chosen": -1.452361822128296, "rewards/margins": 1.6745182275772095, "rewards/rejected": -3.126879930496216, "step": 1130 }, { "epoch": 0.7782566028075265, "grad_norm": 28.72777820280462, "learning_rate": 2.473444613050076e-07, "logits/chosen": -0.6052004098892212, "logits/rejected": -0.5665544271469116, "logps/chosen": -216.50294494628906, "logps/rejected": -247.38058471679688, "loss": 0.3809, "rewards/accuracies": 0.8281250596046448, "rewards/chosen": -1.524512529373169, "rewards/margins": 1.5869126319885254, "rewards/rejected": -3.1114251613616943, "step": 1140 }, { "epoch": 0.7850834151128557, "grad_norm": 29.78459070126252, "learning_rate": 2.3975720789074356e-07, "logits/chosen": -0.6316035985946655, "logits/rejected": -0.5892209410667419, "logps/chosen": -231.2786407470703, "logps/rejected": -262.7926025390625, "loss": 0.3996, "rewards/accuracies": 0.8359375, "rewards/chosen": -1.5984147787094116, "rewards/margins": 1.644458532333374, "rewards/rejected": -3.242873430252075, "step": 1150 }, { "epoch": 0.791910227418185, "grad_norm": 25.317404664815193, "learning_rate": 2.321699544764795e-07, "logits/chosen": -0.5851012468338013, "logits/rejected": -0.5416604280471802, "logps/chosen": -212.95620727539062, "logps/rejected": -249.06094360351562, "loss": 0.3691, "rewards/accuracies": 0.8250000476837158, "rewards/chosen": -1.6382710933685303, "rewards/margins": 1.698676586151123, "rewards/rejected": -3.336947441101074, "step": 1160 }, { "epoch": 0.7987370397235141, "grad_norm": 30.536892352417116, "learning_rate": 2.2458270106221546e-07, "logits/chosen": -0.6020447611808777, "logits/rejected": -0.5497596263885498, "logps/chosen": -223.9817352294922, "logps/rejected": -254.70596313476562, "loss": 0.3743, "rewards/accuracies": 0.832812488079071, "rewards/chosen": -1.5852043628692627, "rewards/margins": 1.6272475719451904, "rewards/rejected": -3.212451934814453, "step": 1170 }, { "epoch": 0.8055638520288433, "grad_norm": 79.72745340543844, "learning_rate": 2.1699544764795143e-07, "logits/chosen": -0.6025545597076416, "logits/rejected": -0.5697463154792786, "logps/chosen": -222.6460418701172, "logps/rejected": -253.47068786621094, "loss": 0.4136, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.6384048461914062, "rewards/margins": 1.580573558807373, "rewards/rejected": -3.2189784049987793, "step": 1180 }, { "epoch": 0.8123906643341725, "grad_norm": 22.663782810990345, "learning_rate": 2.094081942336874e-07, "logits/chosen": -0.5814501047134399, "logits/rejected": -0.541379988193512, "logps/chosen": -221.52999877929688, "logps/rejected": -253.7140350341797, "loss": 0.3363, "rewards/accuracies": 0.8640625476837158, "rewards/chosen": -1.6155626773834229, "rewards/margins": 1.7298386096954346, "rewards/rejected": -3.3454012870788574, "step": 1190 }, { "epoch": 0.8192174766395016, "grad_norm": 20.696850459845496, "learning_rate": 2.0182094081942336e-07, "logits/chosen": -0.5838125944137573, "logits/rejected": -0.5411943793296814, "logps/chosen": -225.6829071044922, "logps/rejected": -253.77597045898438, "loss": 0.3816, "rewards/accuracies": 0.8343750238418579, "rewards/chosen": -1.6227580308914185, "rewards/margins": 1.729698896408081, "rewards/rejected": -3.352457046508789, "step": 1200 }, { "epoch": 0.8260442889448308, "grad_norm": 22.607979595724903, "learning_rate": 1.9423368740515933e-07, "logits/chosen": -0.5873730182647705, "logits/rejected": -0.5527704954147339, "logps/chosen": -236.8846435546875, "logps/rejected": -263.2967224121094, "loss": 0.366, "rewards/accuracies": 0.8531250357627869, "rewards/chosen": -1.619153618812561, "rewards/margins": 1.6637271642684937, "rewards/rejected": -3.2828805446624756, "step": 1210 }, { "epoch": 0.83287110125016, "grad_norm": 26.109280560076318, "learning_rate": 1.8664643399089527e-07, "logits/chosen": -0.5835367441177368, "logits/rejected": -0.5328267216682434, "logps/chosen": -220.54185485839844, "logps/rejected": -259.7679748535156, "loss": 0.3908, "rewards/accuracies": 0.828125, "rewards/chosen": -1.6561741828918457, "rewards/margins": 1.7342405319213867, "rewards/rejected": -3.3904144763946533, "step": 1220 }, { "epoch": 0.8396979135554892, "grad_norm": 32.97497497653311, "learning_rate": 1.7905918057663124e-07, "logits/chosen": -0.6125339269638062, "logits/rejected": -0.5747178792953491, "logps/chosen": -222.89346313476562, "logps/rejected": -259.0747985839844, "loss": 0.3757, "rewards/accuracies": 0.8484375476837158, "rewards/chosen": -1.631098747253418, "rewards/margins": 1.6798287630081177, "rewards/rejected": -3.310927391052246, "step": 1230 }, { "epoch": 0.8465247258608184, "grad_norm": 27.98596142342839, "learning_rate": 1.7147192716236723e-07, "logits/chosen": -0.5824239253997803, "logits/rejected": -0.5337764024734497, "logps/chosen": -224.97032165527344, "logps/rejected": -253.56494140625, "loss": 0.3747, "rewards/accuracies": 0.8359375, "rewards/chosen": -1.6330980062484741, "rewards/margins": 1.6164720058441162, "rewards/rejected": -3.249569892883301, "step": 1240 }, { "epoch": 0.8533515381661475, "grad_norm": 29.97641566967343, "learning_rate": 1.638846737481032e-07, "logits/chosen": -0.6242474913597107, "logits/rejected": -0.5719231963157654, "logps/chosen": -225.42031860351562, "logps/rejected": -250.8466796875, "loss": 0.3653, "rewards/accuracies": 0.8453124761581421, "rewards/chosen": -1.5972038507461548, "rewards/margins": 1.6274079084396362, "rewards/rejected": -3.224611759185791, "step": 1250 }, { "epoch": 0.8601783504714767, "grad_norm": 28.827180040399767, "learning_rate": 1.5629742033383914e-07, "logits/chosen": -0.5725576877593994, "logits/rejected": -0.5013697147369385, "logps/chosen": -229.0035400390625, "logps/rejected": -265.1317443847656, "loss": 0.348, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.63456392288208, "rewards/margins": 1.9875181913375854, "rewards/rejected": -3.622081995010376, "step": 1260 }, { "epoch": 0.867005162776806, "grad_norm": 25.80801920302886, "learning_rate": 1.487101669195751e-07, "logits/chosen": -0.5822853446006775, "logits/rejected": -0.5320168733596802, "logps/chosen": -225.67462158203125, "logps/rejected": -261.8921203613281, "loss": 0.3703, "rewards/accuracies": 0.8406250476837158, "rewards/chosen": -1.611586332321167, "rewards/margins": 1.8382863998413086, "rewards/rejected": -3.4498729705810547, "step": 1270 }, { "epoch": 0.8738319750821351, "grad_norm": 22.850573481872292, "learning_rate": 1.4112291350531107e-07, "logits/chosen": -0.5987452268600464, "logits/rejected": -0.555465817451477, "logps/chosen": -221.936767578125, "logps/rejected": -258.4222106933594, "loss": 0.3694, "rewards/accuracies": 0.8343750238418579, "rewards/chosen": -1.4851796627044678, "rewards/margins": 1.6478145122528076, "rewards/rejected": -3.1329944133758545, "step": 1280 }, { "epoch": 0.8806587873874643, "grad_norm": 31.59414494820462, "learning_rate": 1.3353566009104704e-07, "logits/chosen": -0.6345129013061523, "logits/rejected": -0.5726636052131653, "logps/chosen": -225.1095428466797, "logps/rejected": -260.2147216796875, "loss": 0.3516, "rewards/accuracies": 0.8609375357627869, "rewards/chosen": -1.6206376552581787, "rewards/margins": 1.7943859100341797, "rewards/rejected": -3.4150233268737793, "step": 1290 }, { "epoch": 0.8874855996927934, "grad_norm": 22.999874271408995, "learning_rate": 1.25948406676783e-07, "logits/chosen": -0.6340910196304321, "logits/rejected": -0.6022393703460693, "logps/chosen": -227.1848907470703, "logps/rejected": -259.4250183105469, "loss": 0.3606, "rewards/accuracies": 0.840624988079071, "rewards/chosen": -1.6741951704025269, "rewards/margins": 1.7158465385437012, "rewards/rejected": -3.3900415897369385, "step": 1300 }, { "epoch": 0.8943124119981226, "grad_norm": 22.57593696460777, "learning_rate": 1.1836115326251896e-07, "logits/chosen": -0.6423132419586182, "logits/rejected": -0.5848190188407898, "logps/chosen": -223.364990234375, "logps/rejected": -262.73785400390625, "loss": 0.3355, "rewards/accuracies": 0.8578125238418579, "rewards/chosen": -1.5638341903686523, "rewards/margins": 1.9036611318588257, "rewards/rejected": -3.4674954414367676, "step": 1310 }, { "epoch": 0.9011392243034518, "grad_norm": 37.54515881184046, "learning_rate": 1.1077389984825493e-07, "logits/chosen": -0.575082540512085, "logits/rejected": -0.5424289107322693, "logps/chosen": -237.77911376953125, "logps/rejected": -275.63555908203125, "loss": 0.3516, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.6724742650985718, "rewards/margins": 1.8654717206954956, "rewards/rejected": -3.5379459857940674, "step": 1320 }, { "epoch": 0.907966036608781, "grad_norm": 22.84612850953967, "learning_rate": 1.0318664643399089e-07, "logits/chosen": -0.5780067443847656, "logits/rejected": -0.5416074395179749, "logps/chosen": -221.38531494140625, "logps/rejected": -256.49200439453125, "loss": 0.3723, "rewards/accuracies": 0.8531250357627869, "rewards/chosen": -1.6589443683624268, "rewards/margins": 1.743546724319458, "rewards/rejected": -3.4024910926818848, "step": 1330 }, { "epoch": 0.9147928489141102, "grad_norm": 19.89021053617579, "learning_rate": 9.559939301972686e-08, "logits/chosen": -0.6442657709121704, "logits/rejected": -0.6099978685379028, "logps/chosen": -224.65768432617188, "logps/rejected": -256.4526062011719, "loss": 0.3785, "rewards/accuracies": 0.839062511920929, "rewards/chosen": -1.7409473657608032, "rewards/margins": 1.636985421180725, "rewards/rejected": -3.3779327869415283, "step": 1340 }, { "epoch": 0.9216196612194394, "grad_norm": 25.742427094875804, "learning_rate": 8.801213960546281e-08, "logits/chosen": -0.6490598320960999, "logits/rejected": -0.5897331833839417, "logps/chosen": -223.9561309814453, "logps/rejected": -259.2427062988281, "loss": 0.3468, "rewards/accuracies": 0.8546874523162842, "rewards/chosen": -1.6736507415771484, "rewards/margins": 1.7775483131408691, "rewards/rejected": -3.4511990547180176, "step": 1350 }, { "epoch": 0.9284464735247685, "grad_norm": 33.277457563263674, "learning_rate": 8.042488619119878e-08, "logits/chosen": -0.6450273990631104, "logits/rejected": -0.6108094453811646, "logps/chosen": -225.92788696289062, "logps/rejected": -257.3470458984375, "loss": 0.4063, "rewards/accuracies": 0.8234375715255737, "rewards/chosen": -1.7581716775894165, "rewards/margins": 1.6247668266296387, "rewards/rejected": -3.3829383850097656, "step": 1360 }, { "epoch": 0.9352732858300977, "grad_norm": 27.273007529128005, "learning_rate": 7.283763277693475e-08, "logits/chosen": -0.5727499723434448, "logits/rejected": -0.5322836637496948, "logps/chosen": -225.6246337890625, "logps/rejected": -255.45809936523438, "loss": 0.3476, "rewards/accuracies": 0.854687511920929, "rewards/chosen": -1.7323064804077148, "rewards/margins": 1.745940923690796, "rewards/rejected": -3.4782474040985107, "step": 1370 }, { "epoch": 0.9421000981354269, "grad_norm": 32.146995205900126, "learning_rate": 6.525037936267071e-08, "logits/chosen": -0.6203707456588745, "logits/rejected": -0.5694869756698608, "logps/chosen": -224.69058227539062, "logps/rejected": -264.454833984375, "loss": 0.3231, "rewards/accuracies": 0.8843750357627869, "rewards/chosen": -1.678446888923645, "rewards/margins": 1.9156006574630737, "rewards/rejected": -3.5940475463867188, "step": 1380 }, { "epoch": 0.948926910440756, "grad_norm": 30.60411185742407, "learning_rate": 5.766312594840667e-08, "logits/chosen": -0.6311684846878052, "logits/rejected": -0.586366593837738, "logps/chosen": -220.22238159179688, "logps/rejected": -253.02584838867188, "loss": 0.3841, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.6623611450195312, "rewards/margins": 1.5844680070877075, "rewards/rejected": -3.2468292713165283, "step": 1390 }, { "epoch": 0.9557537227460853, "grad_norm": 30.706698645138353, "learning_rate": 5.007587253414264e-08, "logits/chosen": -0.6784946918487549, "logits/rejected": -0.6356594562530518, "logps/chosen": -224.51947021484375, "logps/rejected": -258.38885498046875, "loss": 0.3436, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.5771867036819458, "rewards/margins": 1.9237372875213623, "rewards/rejected": -3.5009241104125977, "step": 1400 }, { "epoch": 0.9625805350514144, "grad_norm": 21.98143091385874, "learning_rate": 4.2488619119878606e-08, "logits/chosen": -0.6370885372161865, "logits/rejected": -0.5851677060127258, "logps/chosen": -227.73484802246094, "logps/rejected": -267.04486083984375, "loss": 0.3695, "rewards/accuracies": 0.8312500715255737, "rewards/chosen": -1.629777431488037, "rewards/margins": 1.7639508247375488, "rewards/rejected": -3.393728494644165, "step": 1410 }, { "epoch": 0.9694073473567436, "grad_norm": 30.869528987745515, "learning_rate": 3.4901365705614566e-08, "logits/chosen": -0.6314695477485657, "logits/rejected": -0.5921632647514343, "logps/chosen": -221.69427490234375, "logps/rejected": -254.48204040527344, "loss": 0.3673, "rewards/accuracies": 0.8250000476837158, "rewards/chosen": -1.5672862529754639, "rewards/margins": 1.6834831237792969, "rewards/rejected": -3.2507691383361816, "step": 1420 }, { "epoch": 0.9762341596620728, "grad_norm": 27.10327349430508, "learning_rate": 2.731411229135053e-08, "logits/chosen": -0.6630594730377197, "logits/rejected": -0.6296666264533997, "logps/chosen": -224.44802856445312, "logps/rejected": -263.31512451171875, "loss": 0.3689, "rewards/accuracies": 0.846875011920929, "rewards/chosen": -1.7772753238677979, "rewards/margins": 1.7530099153518677, "rewards/rejected": -3.530285358428955, "step": 1430 }, { "epoch": 0.9830609719674019, "grad_norm": 28.43487263604258, "learning_rate": 1.9726858877086493e-08, "logits/chosen": -0.6382923126220703, "logits/rejected": -0.5936453342437744, "logps/chosen": -225.35842895507812, "logps/rejected": -257.9464416503906, "loss": 0.3744, "rewards/accuracies": 0.8468750715255737, "rewards/chosen": -1.7416939735412598, "rewards/margins": 1.7625494003295898, "rewards/rejected": -3.5042431354522705, "step": 1440 }, { "epoch": 0.9898877842727312, "grad_norm": 29.892251091074566, "learning_rate": 1.2139605462822458e-08, "logits/chosen": -0.6021047830581665, "logits/rejected": -0.5681164860725403, "logps/chosen": -228.019287109375, "logps/rejected": -261.641357421875, "loss": 0.3924, "rewards/accuracies": 0.8343750238418579, "rewards/chosen": -1.7237948179244995, "rewards/margins": 1.5835388898849487, "rewards/rejected": -3.307333469390869, "step": 1450 }, { "epoch": 0.9967145965780604, "grad_norm": 36.52351236896022, "learning_rate": 4.552352048558422e-09, "logits/chosen": -0.6409857273101807, "logits/rejected": -0.6134127378463745, "logps/chosen": -229.7710418701172, "logps/rejected": -268.1390075683594, "loss": 0.338, "rewards/accuracies": 0.870312511920929, "rewards/chosen": -1.6543751955032349, "rewards/margins": 1.8962233066558838, "rewards/rejected": -3.550598621368408, "step": 1460 }, { "epoch": 1.0, "step": 1465, "total_flos": 161167907028992.0, "train_loss": 0.47729448792063744, "train_runtime": 14275.1765, "train_samples_per_second": 6.567, "train_steps_per_second": 0.103 } ], "logging_steps": 10, "max_steps": 1465, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 161167907028992.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }