{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 1465,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00682681230532918,
"grad_norm": 20.28844809527937,
"learning_rate": 6.122448979591837e-08,
"logits/chosen": 0.036548737436532974,
"logits/rejected": 0.04153463989496231,
"logps/chosen": -191.76834106445312,
"logps/rejected": -189.42454528808594,
"loss": 0.6921,
"rewards/accuracies": 0.4296875298023224,
"rewards/chosen": 0.00048236188013106585,
"rewards/margins": 0.002409812528640032,
"rewards/rejected": -0.0019274509977549314,
"step": 10
},
{
"epoch": 0.01365362461065836,
"grad_norm": 21.758270816027,
"learning_rate": 1.2925170068027211e-07,
"logits/chosen": 0.045435529202222824,
"logits/rejected": 0.05586998164653778,
"logps/chosen": -187.8279571533203,
"logps/rejected": -187.57005310058594,
"loss": 0.6934,
"rewards/accuracies": 0.47343751788139343,
"rewards/chosen": 0.0018052328377962112,
"rewards/margins": -0.00019586144480854273,
"rewards/rejected": 0.0020010939333587885,
"step": 20
},
{
"epoch": 0.02048043691598754,
"grad_norm": 20.292602508788892,
"learning_rate": 1.9727891156462583e-07,
"logits/chosen": 0.01841779053211212,
"logits/rejected": 0.043382205069065094,
"logps/chosen": -185.8760986328125,
"logps/rejected": -189.0175323486328,
"loss": 0.6933,
"rewards/accuracies": 0.5015625357627869,
"rewards/chosen": -0.0002999665157403797,
"rewards/margins": 2.151366788893938e-05,
"rewards/rejected": -0.0003214801545254886,
"step": 30
},
{
"epoch": 0.02730724922131672,
"grad_norm": 19.887585625537866,
"learning_rate": 2.653061224489796e-07,
"logits/chosen": 0.030143991112709045,
"logits/rejected": 0.031897470355033875,
"logps/chosen": -193.6663360595703,
"logps/rejected": -190.40435791015625,
"loss": 0.6926,
"rewards/accuracies": 0.5390625,
"rewards/chosen": 0.0008976617245934904,
"rewards/margins": 0.001484251581132412,
"rewards/rejected": -0.0005865898565389216,
"step": 40
},
{
"epoch": 0.0341340615266459,
"grad_norm": 21.04186313593167,
"learning_rate": 3.333333333333333e-07,
"logits/chosen": 0.049032919108867645,
"logits/rejected": 0.06374948471784592,
"logps/chosen": -188.4130401611328,
"logps/rejected": -190.03564453125,
"loss": 0.6953,
"rewards/accuracies": 0.4468750059604645,
"rewards/chosen": 0.0011686112266033888,
"rewards/margins": -0.0039381845854222775,
"rewards/rejected": 0.005106796510517597,
"step": 50
},
{
"epoch": 0.04096087383197508,
"grad_norm": 20.494153976987644,
"learning_rate": 4.0136054421768705e-07,
"logits/chosen": 0.054087888449430466,
"logits/rejected": 0.05269278585910797,
"logps/chosen": -189.312744140625,
"logps/rejected": -184.3561248779297,
"loss": 0.6934,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": 0.0027984948828816414,
"rewards/margins": -5.139678250998259e-06,
"rewards/rejected": 0.0028036346193403006,
"step": 60
},
{
"epoch": 0.04778768613730426,
"grad_norm": 22.523707852506742,
"learning_rate": 4.693877551020408e-07,
"logits/chosen": 0.03845703601837158,
"logits/rejected": 0.04080147296190262,
"logps/chosen": -189.47398376464844,
"logps/rejected": -190.44366455078125,
"loss": 0.6923,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": 0.008863605558872223,
"rewards/margins": 0.002069632289931178,
"rewards/rejected": 0.006793972570449114,
"step": 70
},
{
"epoch": 0.05461449844263344,
"grad_norm": 19.61999116994199,
"learning_rate": 5.374149659863945e-07,
"logits/chosen": 0.02700674906373024,
"logits/rejected": 0.014029408805072308,
"logps/chosen": -189.84165954589844,
"logps/rejected": -187.60842895507812,
"loss": 0.688,
"rewards/accuracies": 0.6015625,
"rewards/chosen": 0.019321825355291367,
"rewards/margins": 0.01094727497547865,
"rewards/rejected": 0.008374550379812717,
"step": 80
},
{
"epoch": 0.06144131074796262,
"grad_norm": 22.05785225177187,
"learning_rate": 6.054421768707482e-07,
"logits/chosen": 0.01984417997300625,
"logits/rejected": 0.025053691118955612,
"logps/chosen": -186.66159057617188,
"logps/rejected": -189.27655029296875,
"loss": 0.6881,
"rewards/accuracies": 0.5859375,
"rewards/chosen": 0.027713492512702942,
"rewards/margins": 0.010677953250706196,
"rewards/rejected": 0.01703553833067417,
"step": 90
},
{
"epoch": 0.0682681230532918,
"grad_norm": 20.523376899329886,
"learning_rate": 6.734693877551019e-07,
"logits/chosen": 0.03126838803291321,
"logits/rejected": 0.054031528532505035,
"logps/chosen": -190.2693328857422,
"logps/rejected": -189.81666564941406,
"loss": 0.6859,
"rewards/accuracies": 0.6015625,
"rewards/chosen": 0.040415603667497635,
"rewards/margins": 0.015625018626451492,
"rewards/rejected": 0.024790585041046143,
"step": 100
},
{
"epoch": 0.07509493535862098,
"grad_norm": 21.235392584783057,
"learning_rate": 7.414965986394558e-07,
"logits/chosen": -0.008218009024858475,
"logits/rejected": -0.0177446398884058,
"logps/chosen": -189.50607299804688,
"logps/rejected": -192.59072875976562,
"loss": 0.6827,
"rewards/accuracies": 0.6078125238418579,
"rewards/chosen": 0.05662111937999725,
"rewards/margins": 0.022500621154904366,
"rewards/rejected": 0.03412050008773804,
"step": 110
},
{
"epoch": 0.08192174766395016,
"grad_norm": 19.963280285650864,
"learning_rate": 8.095238095238095e-07,
"logits/chosen": -0.048616923391819,
"logits/rejected": -0.04890388250350952,
"logps/chosen": -197.3944091796875,
"logps/rejected": -192.91751098632812,
"loss": 0.6817,
"rewards/accuracies": 0.6078125238418579,
"rewards/chosen": 0.06002511456608772,
"rewards/margins": 0.02546420879662037,
"rewards/rejected": 0.034560900181531906,
"step": 120
},
{
"epoch": 0.08874855996927934,
"grad_norm": 19.85458405774497,
"learning_rate": 8.775510204081632e-07,
"logits/chosen": -0.02412007376551628,
"logits/rejected": -0.02812131866812706,
"logps/chosen": -192.2180938720703,
"logps/rejected": -190.7085418701172,
"loss": 0.6769,
"rewards/accuracies": 0.6609375476837158,
"rewards/chosen": 0.07226413488388062,
"rewards/margins": 0.03687696158885956,
"rewards/rejected": 0.03538716956973076,
"step": 130
},
{
"epoch": 0.09557537227460852,
"grad_norm": 22.035739399905705,
"learning_rate": 9.45578231292517e-07,
"logits/chosen": -0.03001168556511402,
"logits/rejected": -0.00422773277387023,
"logps/chosen": -193.58248901367188,
"logps/rejected": -189.52310180664062,
"loss": 0.6712,
"rewards/accuracies": 0.653124988079071,
"rewards/chosen": 0.08945093303918839,
"rewards/margins": 0.049683406949043274,
"rewards/rejected": 0.03976753354072571,
"step": 140
},
{
"epoch": 0.1024021845799377,
"grad_norm": 20.65980177560287,
"learning_rate": 9.98482549317147e-07,
"logits/chosen": -0.07848148047924042,
"logits/rejected": -0.08446665108203888,
"logps/chosen": -203.80819702148438,
"logps/rejected": -202.5292510986328,
"loss": 0.6659,
"rewards/accuracies": 0.651562511920929,
"rewards/chosen": 0.10370737314224243,
"rewards/margins": 0.06292011588811874,
"rewards/rejected": 0.04078725352883339,
"step": 150
},
{
"epoch": 0.10922899688526688,
"grad_norm": 20.319118469990684,
"learning_rate": 9.908952959028832e-07,
"logits/chosen": -0.09329548478126526,
"logits/rejected": -0.08749746531248093,
"logps/chosen": -185.65835571289062,
"logps/rejected": -186.53118896484375,
"loss": 0.6661,
"rewards/accuracies": 0.6578125357627869,
"rewards/chosen": 0.08388800173997879,
"rewards/margins": 0.06475642323493958,
"rewards/rejected": 0.019131578505039215,
"step": 160
},
{
"epoch": 0.11605580919059606,
"grad_norm": 20.49502936431809,
"learning_rate": 9.833080424886191e-07,
"logits/chosen": -0.08693637698888779,
"logits/rejected": -0.05635486915707588,
"logps/chosen": -188.38397216796875,
"logps/rejected": -190.4040069580078,
"loss": 0.6596,
"rewards/accuracies": 0.6390625238418579,
"rewards/chosen": 0.04457355663180351,
"rewards/margins": 0.08550170809030533,
"rewards/rejected": -0.040928155183792114,
"step": 170
},
{
"epoch": 0.12288262149592524,
"grad_norm": 22.028249586292812,
"learning_rate": 9.75720789074355e-07,
"logits/chosen": -0.07805919647216797,
"logits/rejected": -0.0718764141201973,
"logps/chosen": -197.16851806640625,
"logps/rejected": -197.43580627441406,
"loss": 0.6533,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.04377557337284088,
"rewards/margins": 0.10366812348365784,
"rewards/rejected": -0.059892550110816956,
"step": 180
},
{
"epoch": 0.12970943380125444,
"grad_norm": 22.318401877969084,
"learning_rate": 9.68133535660091e-07,
"logits/chosen": -0.07107028365135193,
"logits/rejected": -0.04754173755645752,
"logps/chosen": -190.40989685058594,
"logps/rejected": -195.1139373779297,
"loss": 0.6431,
"rewards/accuracies": 0.671875,
"rewards/chosen": 0.049717679619789124,
"rewards/margins": 0.12626302242279053,
"rewards/rejected": -0.0765453577041626,
"step": 190
},
{
"epoch": 0.1365362461065836,
"grad_norm": 22.36881239648136,
"learning_rate": 9.60546282245827e-07,
"logits/chosen": -0.11804388463497162,
"logits/rejected": -0.08306587487459183,
"logps/chosen": -193.70574951171875,
"logps/rejected": -196.11013793945312,
"loss": 0.6244,
"rewards/accuracies": 0.7093750238418579,
"rewards/chosen": 0.035508595407009125,
"rewards/margins": 0.1746881902217865,
"rewards/rejected": -0.13917961716651917,
"step": 200
},
{
"epoch": 0.1433630584119128,
"grad_norm": 22.89524823533437,
"learning_rate": 9.52959028831563e-07,
"logits/chosen": -0.1726662516593933,
"logits/rejected": -0.1550799012184143,
"logps/chosen": -203.2538604736328,
"logps/rejected": -200.24282836914062,
"loss": 0.6277,
"rewards/accuracies": 0.653124988079071,
"rewards/chosen": -0.02303643897175789,
"rewards/margins": 0.18620665371418,
"rewards/rejected": -0.20924308896064758,
"step": 210
},
{
"epoch": 0.15018987071724196,
"grad_norm": 20.757150099669722,
"learning_rate": 9.453717754172988e-07,
"logits/chosen": -0.18080198764801025,
"logits/rejected": -0.14751200377941132,
"logps/chosen": -198.46144104003906,
"logps/rejected": -200.69223022460938,
"loss": 0.6021,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.039264436811208725,
"rewards/margins": 0.2533213794231415,
"rewards/rejected": -0.2925858199596405,
"step": 220
},
{
"epoch": 0.15701668302257116,
"grad_norm": 21.58011887614948,
"learning_rate": 9.377845220030348e-07,
"logits/chosen": -0.20878151059150696,
"logits/rejected": -0.18805599212646484,
"logps/chosen": -197.68267822265625,
"logps/rejected": -200.50274658203125,
"loss": 0.6107,
"rewards/accuracies": 0.671875,
"rewards/chosen": -0.09467742592096329,
"rewards/margins": 0.2487429678440094,
"rewards/rejected": -0.3434203863143921,
"step": 230
},
{
"epoch": 0.16384349532790032,
"grad_norm": 24.074058895354312,
"learning_rate": 9.301972685887707e-07,
"logits/chosen": -0.2260722517967224,
"logits/rejected": -0.1964491903781891,
"logps/chosen": -191.25946044921875,
"logps/rejected": -196.70068359375,
"loss": 0.6126,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.1551636904478073,
"rewards/margins": 0.24429623782634735,
"rewards/rejected": -0.39945995807647705,
"step": 240
},
{
"epoch": 0.17067030763322952,
"grad_norm": 22.193469540092437,
"learning_rate": 9.226100151745068e-07,
"logits/chosen": -0.23559394478797913,
"logits/rejected": -0.20932801067829132,
"logps/chosen": -191.7481689453125,
"logps/rejected": -197.9097900390625,
"loss": 0.6207,
"rewards/accuracies": 0.6578125357627869,
"rewards/chosen": -0.23032304644584656,
"rewards/margins": 0.262432336807251,
"rewards/rejected": -0.4927554130554199,
"step": 250
},
{
"epoch": 0.17749711993855868,
"grad_norm": 23.15984586558187,
"learning_rate": 9.150227617602428e-07,
"logits/chosen": -0.21696263551712036,
"logits/rejected": -0.1862940490245819,
"logps/chosen": -190.5767822265625,
"logps/rejected": -195.84523010253906,
"loss": 0.6057,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -0.25581681728363037,
"rewards/margins": 0.3005717098712921,
"rewards/rejected": -0.5563884973526001,
"step": 260
},
{
"epoch": 0.18432393224388788,
"grad_norm": 23.590660195882563,
"learning_rate": 9.074355083459787e-07,
"logits/chosen": -0.21451206505298615,
"logits/rejected": -0.19581295549869537,
"logps/chosen": -194.9508056640625,
"logps/rejected": -202.88421630859375,
"loss": 0.5954,
"rewards/accuracies": 0.7015625238418579,
"rewards/chosen": -0.27721405029296875,
"rewards/margins": 0.30862218141555786,
"rewards/rejected": -0.5858362317085266,
"step": 270
},
{
"epoch": 0.19115074454921704,
"grad_norm": 23.62922114673101,
"learning_rate": 8.998482549317147e-07,
"logits/chosen": -0.26997700333595276,
"logits/rejected": -0.2415258288383484,
"logps/chosen": -189.7841339111328,
"logps/rejected": -194.67752075195312,
"loss": 0.5888,
"rewards/accuracies": 0.6984375715255737,
"rewards/chosen": -0.2856101393699646,
"rewards/margins": 0.3423476219177246,
"rewards/rejected": -0.6279577016830444,
"step": 280
},
{
"epoch": 0.19797755685454624,
"grad_norm": 26.534317555189272,
"learning_rate": 8.922610015174506e-07,
"logits/chosen": -0.2865559160709381,
"logits/rejected": -0.268317312002182,
"logps/chosen": -202.84178161621094,
"logps/rejected": -207.90689086914062,
"loss": 0.5911,
"rewards/accuracies": 0.7015625238418579,
"rewards/chosen": -0.33995726704597473,
"rewards/margins": 0.3573826551437378,
"rewards/rejected": -0.6973399519920349,
"step": 290
},
{
"epoch": 0.2048043691598754,
"grad_norm": 23.626680063365733,
"learning_rate": 8.846737481031866e-07,
"logits/chosen": -0.2780352830886841,
"logits/rejected": -0.242587149143219,
"logps/chosen": -201.13905334472656,
"logps/rejected": -203.72994995117188,
"loss": 0.6114,
"rewards/accuracies": 0.715624988079071,
"rewards/chosen": -0.3799774646759033,
"rewards/margins": 0.32134854793548584,
"rewards/rejected": -0.7013260126113892,
"step": 300
},
{
"epoch": 0.2116311814652046,
"grad_norm": 21.614874156586975,
"learning_rate": 8.770864946889226e-07,
"logits/chosen": -0.2941948175430298,
"logits/rejected": -0.2689184844493866,
"logps/chosen": -204.522216796875,
"logps/rejected": -214.1429443359375,
"loss": 0.5795,
"rewards/accuracies": 0.7187500596046448,
"rewards/chosen": -0.36675944924354553,
"rewards/margins": 0.4399191737174988,
"rewards/rejected": -0.8066786527633667,
"step": 310
},
{
"epoch": 0.21845799377053376,
"grad_norm": 25.28063384418063,
"learning_rate": 8.694992412746586e-07,
"logits/chosen": -0.2757799029350281,
"logits/rejected": -0.27304551005363464,
"logps/chosen": -198.39239501953125,
"logps/rejected": -204.60415649414062,
"loss": 0.574,
"rewards/accuracies": 0.7281249761581421,
"rewards/chosen": -0.3858140707015991,
"rewards/margins": 0.41097506880760193,
"rewards/rejected": -0.7967891097068787,
"step": 320
},
{
"epoch": 0.22528480607586296,
"grad_norm": 23.979378980602498,
"learning_rate": 8.619119878603945e-07,
"logits/chosen": -0.33364883065223694,
"logits/rejected": -0.3188924193382263,
"logps/chosen": -207.99539184570312,
"logps/rejected": -212.8521270751953,
"loss": 0.5916,
"rewards/accuracies": 0.676562488079071,
"rewards/chosen": -0.47858649492263794,
"rewards/margins": 0.37764379382133484,
"rewards/rejected": -0.8562303185462952,
"step": 330
},
{
"epoch": 0.23211161838119213,
"grad_norm": 23.546759098667398,
"learning_rate": 8.543247344461305e-07,
"logits/chosen": -0.3054922819137573,
"logits/rejected": -0.28177574276924133,
"logps/chosen": -203.67938232421875,
"logps/rejected": -211.8566131591797,
"loss": 0.5591,
"rewards/accuracies": 0.7312500476837158,
"rewards/chosen": -0.3886514902114868,
"rewards/margins": 0.5112159252166748,
"rewards/rejected": -0.8998674154281616,
"step": 340
},
{
"epoch": 0.23893843068652132,
"grad_norm": 23.169862441537628,
"learning_rate": 8.467374810318663e-07,
"logits/chosen": -0.3280317485332489,
"logits/rejected": -0.28759223222732544,
"logps/chosen": -204.1493682861328,
"logps/rejected": -212.39410400390625,
"loss": 0.5519,
"rewards/accuracies": 0.7140624523162842,
"rewards/chosen": -0.46998512744903564,
"rewards/margins": 0.5246675610542297,
"rewards/rejected": -0.9946527481079102,
"step": 350
},
{
"epoch": 0.24576524299185049,
"grad_norm": 24.779086737609497,
"learning_rate": 8.391502276176023e-07,
"logits/chosen": -0.30415648221969604,
"logits/rejected": -0.272957980632782,
"logps/chosen": -196.55276489257812,
"logps/rejected": -204.5069580078125,
"loss": 0.5813,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.48177972435951233,
"rewards/margins": 0.4307110607624054,
"rewards/rejected": -0.9124907851219177,
"step": 360
},
{
"epoch": 0.25259205529717965,
"grad_norm": 23.12378831072875,
"learning_rate": 8.315629742033384e-07,
"logits/chosen": -0.28424739837646484,
"logits/rejected": -0.2549440562725067,
"logps/chosen": -197.5590057373047,
"logps/rejected": -210.90127563476562,
"loss": 0.5664,
"rewards/accuracies": 0.714062511920929,
"rewards/chosen": -0.5475950837135315,
"rewards/margins": 0.4803504943847656,
"rewards/rejected": -1.027945637702942,
"step": 370
},
{
"epoch": 0.2594188676025089,
"grad_norm": 21.651196996632308,
"learning_rate": 8.239757207890743e-07,
"logits/chosen": -0.30731576681137085,
"logits/rejected": -0.28506577014923096,
"logps/chosen": -204.0982208251953,
"logps/rejected": -214.4062957763672,
"loss": 0.5418,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.5043364763259888,
"rewards/margins": 0.5770630240440369,
"rewards/rejected": -1.0813994407653809,
"step": 380
},
{
"epoch": 0.26624567990783804,
"grad_norm": 22.751040205953682,
"learning_rate": 8.163884673748103e-07,
"logits/chosen": -0.2683367133140564,
"logits/rejected": -0.2332780659198761,
"logps/chosen": -206.3434600830078,
"logps/rejected": -217.34039306640625,
"loss": 0.532,
"rewards/accuracies": 0.7359375357627869,
"rewards/chosen": -0.47760826349258423,
"rewards/margins": 0.6127501130104065,
"rewards/rejected": -1.0903582572937012,
"step": 390
},
{
"epoch": 0.2730724922131672,
"grad_norm": 24.910083016180884,
"learning_rate": 8.088012139605462e-07,
"logits/chosen": -0.28875064849853516,
"logits/rejected": -0.24306923151016235,
"logps/chosen": -202.8630828857422,
"logps/rejected": -216.6767578125,
"loss": 0.5276,
"rewards/accuracies": 0.7406250238418579,
"rewards/chosen": -0.5778933167457581,
"rewards/margins": 0.6203677654266357,
"rewards/rejected": -1.198261022567749,
"step": 400
},
{
"epoch": 0.2798993045184964,
"grad_norm": 24.823111882995203,
"learning_rate": 8.012139605462822e-07,
"logits/chosen": -0.3603791296482086,
"logits/rejected": -0.314382940530777,
"logps/chosen": -202.1355438232422,
"logps/rejected": -208.8005828857422,
"loss": 0.5573,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -0.6083186864852905,
"rewards/margins": 0.5520691275596619,
"rewards/rejected": -1.1603877544403076,
"step": 410
},
{
"epoch": 0.2867261168238256,
"grad_norm": 24.6727191743662,
"learning_rate": 7.936267071320181e-07,
"logits/chosen": -0.352464497089386,
"logits/rejected": -0.31943339109420776,
"logps/chosen": -207.77703857421875,
"logps/rejected": -216.52903747558594,
"loss": 0.5276,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.6607990264892578,
"rewards/margins": 0.6181461215019226,
"rewards/rejected": -1.2789450883865356,
"step": 420
},
{
"epoch": 0.29355292912915476,
"grad_norm": 23.595963095415875,
"learning_rate": 7.860394537177542e-07,
"logits/chosen": -0.35178378224372864,
"logits/rejected": -0.3272789418697357,
"logps/chosen": -211.35107421875,
"logps/rejected": -227.26947021484375,
"loss": 0.5216,
"rewards/accuracies": 0.7437500357627869,
"rewards/chosen": -0.7554113268852234,
"rewards/margins": 0.7442721724510193,
"rewards/rejected": -1.4996836185455322,
"step": 430
},
{
"epoch": 0.3003797414344839,
"grad_norm": 21.126042694881086,
"learning_rate": 7.784522003034901e-07,
"logits/chosen": -0.3556375503540039,
"logits/rejected": -0.3088908791542053,
"logps/chosen": -203.177001953125,
"logps/rejected": -212.28463745117188,
"loss": 0.5038,
"rewards/accuracies": 0.7843750715255737,
"rewards/chosen": -0.6830729246139526,
"rewards/margins": 0.719536304473877,
"rewards/rejected": -1.4026092290878296,
"step": 440
},
{
"epoch": 0.3072065537398131,
"grad_norm": 29.677764730937284,
"learning_rate": 7.708649468892261e-07,
"logits/chosen": -0.37930557131767273,
"logits/rejected": -0.3498023450374603,
"logps/chosen": -209.02462768554688,
"logps/rejected": -225.4560089111328,
"loss": 0.5226,
"rewards/accuracies": 0.7484375238418579,
"rewards/chosen": -0.7893091440200806,
"rewards/margins": 0.6981508731842041,
"rewards/rejected": -1.4874598979949951,
"step": 450
},
{
"epoch": 0.3140333660451423,
"grad_norm": 22.623116660950316,
"learning_rate": 7.632776934749621e-07,
"logits/chosen": -0.4005587100982666,
"logits/rejected": -0.37974676489830017,
"logps/chosen": -208.33297729492188,
"logps/rejected": -223.84945678710938,
"loss": 0.5006,
"rewards/accuracies": 0.7640625238418579,
"rewards/chosen": -0.7449624538421631,
"rewards/margins": 0.8266347646713257,
"rewards/rejected": -1.5715970993041992,
"step": 460
},
{
"epoch": 0.3208601783504715,
"grad_norm": 24.70720094293141,
"learning_rate": 7.55690440060698e-07,
"logits/chosen": -0.4142500162124634,
"logits/rejected": -0.39995333552360535,
"logps/chosen": -211.68191528320312,
"logps/rejected": -222.69406127929688,
"loss": 0.4891,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.7804237604141235,
"rewards/margins": 0.8243077993392944,
"rewards/rejected": -1.6047316789627075,
"step": 470
},
{
"epoch": 0.32768699065580065,
"grad_norm": 25.568499628810063,
"learning_rate": 7.481031866464339e-07,
"logits/chosen": -0.446720689535141,
"logits/rejected": -0.4042230248451233,
"logps/chosen": -201.85269165039062,
"logps/rejected": -216.66664123535156,
"loss": 0.5178,
"rewards/accuracies": 0.7406250238418579,
"rewards/chosen": -0.8340361714363098,
"rewards/margins": 0.7315189838409424,
"rewards/rejected": -1.565555214881897,
"step": 480
},
{
"epoch": 0.3345138029611298,
"grad_norm": 25.32075291618558,
"learning_rate": 7.405159332321699e-07,
"logits/chosen": -0.46066954731941223,
"logits/rejected": -0.4040055274963379,
"logps/chosen": -215.21507263183594,
"logps/rejected": -234.66297912597656,
"loss": 0.5151,
"rewards/accuracies": 0.745312511920929,
"rewards/chosen": -0.9270689487457275,
"rewards/margins": 0.8053193688392639,
"rewards/rejected": -1.7323882579803467,
"step": 490
},
{
"epoch": 0.34134061526645904,
"grad_norm": 33.13945555979027,
"learning_rate": 7.329286798179059e-07,
"logits/chosen": -0.41453421115875244,
"logits/rejected": -0.38940101861953735,
"logps/chosen": -216.4798583984375,
"logps/rejected": -225.99053955078125,
"loss": 0.5269,
"rewards/accuracies": 0.739062488079071,
"rewards/chosen": -0.9324908256530762,
"rewards/margins": 0.7752447724342346,
"rewards/rejected": -1.707735538482666,
"step": 500
},
{
"epoch": 0.3481674275717882,
"grad_norm": 29.438798964644228,
"learning_rate": 7.253414264036418e-07,
"logits/chosen": -0.45365840196609497,
"logits/rejected": -0.4384625256061554,
"logps/chosen": -210.4576873779297,
"logps/rejected": -227.66439819335938,
"loss": 0.4929,
"rewards/accuracies": 0.7703125476837158,
"rewards/chosen": -0.7669359445571899,
"rewards/margins": 0.8243012428283691,
"rewards/rejected": -1.5912370681762695,
"step": 510
},
{
"epoch": 0.35499423987711737,
"grad_norm": 30.351745830656164,
"learning_rate": 7.177541729893778e-07,
"logits/chosen": -0.4614608883857727,
"logits/rejected": -0.4203529953956604,
"logps/chosen": -203.9699249267578,
"logps/rejected": -225.49705505371094,
"loss": 0.5141,
"rewards/accuracies": 0.7515625357627869,
"rewards/chosen": -0.7932397127151489,
"rewards/margins": 0.7744376063346863,
"rewards/rejected": -1.56767737865448,
"step": 520
},
{
"epoch": 0.36182105218244653,
"grad_norm": 28.0826891393098,
"learning_rate": 7.101669195751137e-07,
"logits/chosen": -0.44373035430908203,
"logits/rejected": -0.4143510162830353,
"logps/chosen": -217.09344482421875,
"logps/rejected": -232.4281768798828,
"loss": 0.5047,
"rewards/accuracies": 0.7500000596046448,
"rewards/chosen": -0.9070903062820435,
"rewards/margins": 0.8483283519744873,
"rewards/rejected": -1.7554187774658203,
"step": 530
},
{
"epoch": 0.36864786448777576,
"grad_norm": 24.372697015036685,
"learning_rate": 7.025796661608497e-07,
"logits/chosen": -0.4468221068382263,
"logits/rejected": -0.39906221628189087,
"logps/chosen": -199.6805877685547,
"logps/rejected": -217.41481018066406,
"loss": 0.5279,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.7898293733596802,
"rewards/margins": 0.7459580302238464,
"rewards/rejected": -1.5357873439788818,
"step": 540
},
{
"epoch": 0.3754746767931049,
"grad_norm": 23.655888347883682,
"learning_rate": 6.949924127465857e-07,
"logits/chosen": -0.424525648355484,
"logits/rejected": -0.3915669322013855,
"logps/chosen": -218.43475341796875,
"logps/rejected": -230.16305541992188,
"loss": 0.5019,
"rewards/accuracies": 0.7703125476837158,
"rewards/chosen": -0.7072125673294067,
"rewards/margins": 0.8257580399513245,
"rewards/rejected": -1.5329705476760864,
"step": 550
},
{
"epoch": 0.3823014890984341,
"grad_norm": 23.724107854224396,
"learning_rate": 6.874051593323217e-07,
"logits/chosen": -0.4273075759410858,
"logits/rejected": -0.39428552985191345,
"logps/chosen": -208.09197998046875,
"logps/rejected": -228.27288818359375,
"loss": 0.4661,
"rewards/accuracies": 0.776562511920929,
"rewards/chosen": -0.8194674253463745,
"rewards/margins": 0.9406121373176575,
"rewards/rejected": -1.7600796222686768,
"step": 560
},
{
"epoch": 0.38912830140376325,
"grad_norm": 27.455015118742324,
"learning_rate": 6.798179059180577e-07,
"logits/chosen": -0.44063428044319153,
"logits/rejected": -0.3982015550136566,
"logps/chosen": -208.05084228515625,
"logps/rejected": -224.2151336669922,
"loss": 0.5007,
"rewards/accuracies": 0.7671874761581421,
"rewards/chosen": -0.8383839726448059,
"rewards/margins": 0.7879061698913574,
"rewards/rejected": -1.6262900829315186,
"step": 570
},
{
"epoch": 0.3959551137090925,
"grad_norm": 25.123237861481527,
"learning_rate": 6.722306525037936e-07,
"logits/chosen": -0.44060733914375305,
"logits/rejected": -0.40178999304771423,
"logps/chosen": -213.97142028808594,
"logps/rejected": -234.66355895996094,
"loss": 0.5009,
"rewards/accuracies": 0.7640625238418579,
"rewards/chosen": -0.8785637617111206,
"rewards/margins": 0.8950086236000061,
"rewards/rejected": -1.7735724449157715,
"step": 580
},
{
"epoch": 0.40278192601442164,
"grad_norm": 23.64031892312566,
"learning_rate": 6.646433990895296e-07,
"logits/chosen": -0.4277493357658386,
"logits/rejected": -0.3930940628051758,
"logps/chosen": -210.43264770507812,
"logps/rejected": -230.69744873046875,
"loss": 0.4728,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -0.8971768617630005,
"rewards/margins": 0.9437196850776672,
"rewards/rejected": -1.840896487236023,
"step": 590
},
{
"epoch": 0.4096087383197508,
"grad_norm": 26.066890027531336,
"learning_rate": 6.570561456752655e-07,
"logits/chosen": -0.4176095128059387,
"logits/rejected": -0.3907008171081543,
"logps/chosen": -212.6049346923828,
"logps/rejected": -230.02601623535156,
"loss": 0.4545,
"rewards/accuracies": 0.784375011920929,
"rewards/chosen": -0.915949821472168,
"rewards/margins": 1.011177659034729,
"rewards/rejected": -1.9271275997161865,
"step": 600
},
{
"epoch": 0.41643555062508,
"grad_norm": 26.361787585647985,
"learning_rate": 6.494688922610015e-07,
"logits/chosen": -0.4795023202896118,
"logits/rejected": -0.43696874380111694,
"logps/chosen": -212.20419311523438,
"logps/rejected": -234.36700439453125,
"loss": 0.4559,
"rewards/accuracies": 0.7796875238418579,
"rewards/chosen": -1.0691964626312256,
"rewards/margins": 1.075463056564331,
"rewards/rejected": -2.1446595191955566,
"step": 610
},
{
"epoch": 0.4232623629304092,
"grad_norm": 26.1477045167914,
"learning_rate": 6.418816388467374e-07,
"logits/chosen": -0.4670104384422302,
"logits/rejected": -0.4489012360572815,
"logps/chosen": -212.9263916015625,
"logps/rejected": -230.10812377929688,
"loss": 0.4773,
"rewards/accuracies": 0.7906249761581421,
"rewards/chosen": -1.1133571863174438,
"rewards/margins": 0.960852324962616,
"rewards/rejected": -2.074209451675415,
"step": 620
},
{
"epoch": 0.43008917523573836,
"grad_norm": 27.292995243666663,
"learning_rate": 6.342943854324734e-07,
"logits/chosen": -0.5064845085144043,
"logits/rejected": -0.45950955152511597,
"logps/chosen": -210.99880981445312,
"logps/rejected": -233.96551513671875,
"loss": 0.4687,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.1231608390808105,
"rewards/margins": 1.1056054830551147,
"rewards/rejected": -2.2287662029266357,
"step": 630
},
{
"epoch": 0.43691598754106753,
"grad_norm": 27.008219472309488,
"learning_rate": 6.267071320182093e-07,
"logits/chosen": -0.5144222378730774,
"logits/rejected": -0.47659891843795776,
"logps/chosen": -216.17208862304688,
"logps/rejected": -241.91847229003906,
"loss": 0.4634,
"rewards/accuracies": 0.785937488079071,
"rewards/chosen": -1.1570467948913574,
"rewards/margins": 1.1083678007125854,
"rewards/rejected": -2.2654144763946533,
"step": 640
},
{
"epoch": 0.4437427998463967,
"grad_norm": 26.471519737054525,
"learning_rate": 6.191198786039453e-07,
"logits/chosen": -0.5065978765487671,
"logits/rejected": -0.4691276550292969,
"logps/chosen": -220.1791229248047,
"logps/rejected": -241.1091766357422,
"loss": 0.4658,
"rewards/accuracies": 0.770312488079071,
"rewards/chosen": -1.1591362953186035,
"rewards/margins": 1.0631475448608398,
"rewards/rejected": -2.2222838401794434,
"step": 650
},
{
"epoch": 0.4505696121517259,
"grad_norm": 28.17275739709613,
"learning_rate": 6.115326251896813e-07,
"logits/chosen": -0.5238359570503235,
"logits/rejected": -0.4979589581489563,
"logps/chosen": -217.52597045898438,
"logps/rejected": -234.4109344482422,
"loss": 0.521,
"rewards/accuracies": 0.7703125476837158,
"rewards/chosen": -1.1146113872528076,
"rewards/margins": 0.9384186863899231,
"rewards/rejected": -2.053030252456665,
"step": 660
},
{
"epoch": 0.4573964244570551,
"grad_norm": 31.93706252257983,
"learning_rate": 6.039453717754173e-07,
"logits/chosen": -0.49779045581817627,
"logits/rejected": -0.4656790494918823,
"logps/chosen": -206.24905395507812,
"logps/rejected": -230.8712158203125,
"loss": 0.4947,
"rewards/accuracies": 0.7515624761581421,
"rewards/chosen": -1.0656239986419678,
"rewards/margins": 1.1138218641281128,
"rewards/rejected": -2.179445743560791,
"step": 670
},
{
"epoch": 0.46422323676238425,
"grad_norm": 24.781714808834078,
"learning_rate": 5.963581183611533e-07,
"logits/chosen": -0.5391644835472107,
"logits/rejected": -0.5127192139625549,
"logps/chosen": -219.6558074951172,
"logps/rejected": -235.97950744628906,
"loss": 0.4614,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -0.971098780632019,
"rewards/margins": 1.0510507822036743,
"rewards/rejected": -2.0221495628356934,
"step": 680
},
{
"epoch": 0.47105004906771347,
"grad_norm": 27.945074054287147,
"learning_rate": 5.887708649468892e-07,
"logits/chosen": -0.48153987526893616,
"logits/rejected": -0.43504899740219116,
"logps/chosen": -211.96145629882812,
"logps/rejected": -234.5848388671875,
"loss": 0.4352,
"rewards/accuracies": 0.792187511920929,
"rewards/chosen": -0.9944396018981934,
"rewards/margins": 1.1353037357330322,
"rewards/rejected": -2.1297435760498047,
"step": 690
},
{
"epoch": 0.47787686137304264,
"grad_norm": 28.51572741404973,
"learning_rate": 5.811836115326252e-07,
"logits/chosen": -0.4900820851325989,
"logits/rejected": -0.45390579104423523,
"logps/chosen": -215.57627868652344,
"logps/rejected": -239.19187927246094,
"loss": 0.4538,
"rewards/accuracies": 0.7781250476837158,
"rewards/chosen": -1.0837225914001465,
"rewards/margins": 1.1697932481765747,
"rewards/rejected": -2.2535159587860107,
"step": 700
},
{
"epoch": 0.4847036736783718,
"grad_norm": 32.171436824447824,
"learning_rate": 5.735963581183611e-07,
"logits/chosen": -0.49004998803138733,
"logits/rejected": -0.44813936948776245,
"logps/chosen": -211.11868286132812,
"logps/rejected": -236.65904235839844,
"loss": 0.4482,
"rewards/accuracies": 0.784375011920929,
"rewards/chosen": -1.0840590000152588,
"rewards/margins": 1.1766505241394043,
"rewards/rejected": -2.260709524154663,
"step": 710
},
{
"epoch": 0.49153048598370097,
"grad_norm": 27.130561237438616,
"learning_rate": 5.660091047040971e-07,
"logits/chosen": -0.506287693977356,
"logits/rejected": -0.44507527351379395,
"logps/chosen": -205.77537536621094,
"logps/rejected": -230.67068481445312,
"loss": 0.4736,
"rewards/accuracies": 0.7750000357627869,
"rewards/chosen": -1.1519620418548584,
"rewards/margins": 1.0746945142745972,
"rewards/rejected": -2.226656675338745,
"step": 720
},
{
"epoch": 0.4983572982890302,
"grad_norm": 23.54195977607552,
"learning_rate": 5.584218512898331e-07,
"logits/chosen": -0.47103822231292725,
"logits/rejected": -0.42172738909721375,
"logps/chosen": -214.85769653320312,
"logps/rejected": -236.97178649902344,
"loss": 0.4464,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.1006401777267456,
"rewards/margins": 1.1676136255264282,
"rewards/rejected": -2.268253803253174,
"step": 730
},
{
"epoch": 0.5051841105943593,
"grad_norm": 24.379091590009676,
"learning_rate": 5.508345978755691e-07,
"logits/chosen": -0.5123965740203857,
"logits/rejected": -0.47382158041000366,
"logps/chosen": -214.0547637939453,
"logps/rejected": -236.20230102539062,
"loss": 0.4352,
"rewards/accuracies": 0.7984375357627869,
"rewards/chosen": -1.0824706554412842,
"rewards/margins": 1.1324328184127808,
"rewards/rejected": -2.2149033546447754,
"step": 740
},
{
"epoch": 0.5120109228996885,
"grad_norm": 26.994677156319458,
"learning_rate": 5.432473444613049e-07,
"logits/chosen": -0.47567224502563477,
"logits/rejected": -0.4289511442184448,
"logps/chosen": -215.22872924804688,
"logps/rejected": -240.09747314453125,
"loss": 0.4434,
"rewards/accuracies": 0.8046875596046448,
"rewards/chosen": -1.116477131843567,
"rewards/margins": 1.183769941329956,
"rewards/rejected": -2.3002471923828125,
"step": 750
},
{
"epoch": 0.5188377352050177,
"grad_norm": 25.25194090429139,
"learning_rate": 5.356600910470409e-07,
"logits/chosen": -0.5224145650863647,
"logits/rejected": -0.46407467126846313,
"logps/chosen": -213.71389770507812,
"logps/rejected": -237.69161987304688,
"loss": 0.4235,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -1.0965174436569214,
"rewards/margins": 1.2732737064361572,
"rewards/rejected": -2.369791030883789,
"step": 760
},
{
"epoch": 0.5256645475103469,
"grad_norm": 27.278156612784407,
"learning_rate": 5.280728376327769e-07,
"logits/chosen": -0.5006579160690308,
"logits/rejected": -0.475443571805954,
"logps/chosen": -209.89276123046875,
"logps/rejected": -235.71739196777344,
"loss": 0.4355,
"rewards/accuracies": 0.8031250238418579,
"rewards/chosen": -1.149773120880127,
"rewards/margins": 1.2504949569702148,
"rewards/rejected": -2.400268077850342,
"step": 770
},
{
"epoch": 0.5324913598156761,
"grad_norm": 23.589718462559564,
"learning_rate": 5.204855842185128e-07,
"logits/chosen": -0.526742696762085,
"logits/rejected": -0.47116631269454956,
"logps/chosen": -217.98532104492188,
"logps/rejected": -245.28933715820312,
"loss": 0.4599,
"rewards/accuracies": 0.7718750238418579,
"rewards/chosen": -1.2864385843276978,
"rewards/margins": 1.1668964624404907,
"rewards/rejected": -2.4533350467681885,
"step": 780
},
{
"epoch": 0.5393181721210053,
"grad_norm": 30.5610451636278,
"learning_rate": 5.128983308042489e-07,
"logits/chosen": -0.4980033040046692,
"logits/rejected": -0.454012930393219,
"logps/chosen": -213.7527618408203,
"logps/rejected": -242.71051025390625,
"loss": 0.4175,
"rewards/accuracies": 0.8203125596046448,
"rewards/chosen": -1.2648416757583618,
"rewards/margins": 1.2589421272277832,
"rewards/rejected": -2.5237839221954346,
"step": 790
},
{
"epoch": 0.5461449844263344,
"grad_norm": 25.860792001570903,
"learning_rate": 5.053110773899848e-07,
"logits/chosen": -0.5009379982948303,
"logits/rejected": -0.45161643624305725,
"logps/chosen": -218.66497802734375,
"logps/rejected": -247.26480102539062,
"loss": 0.4239,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -1.2370792627334595,
"rewards/margins": 1.2767220735549927,
"rewards/rejected": -2.513801097869873,
"step": 800
},
{
"epoch": 0.5529717967316636,
"grad_norm": 26.913987383333986,
"learning_rate": 4.977238239757208e-07,
"logits/chosen": -0.4731639325618744,
"logits/rejected": -0.4382101893424988,
"logps/chosen": -219.00997924804688,
"logps/rejected": -243.0546417236328,
"loss": 0.4263,
"rewards/accuracies": 0.8015625476837158,
"rewards/chosen": -1.2992050647735596,
"rewards/margins": 1.2243870496749878,
"rewards/rejected": -2.523592233657837,
"step": 810
},
{
"epoch": 0.5597986090369927,
"grad_norm": 28.53426642330379,
"learning_rate": 4.901365705614567e-07,
"logits/chosen": -0.4574064612388611,
"logits/rejected": -0.4343743920326233,
"logps/chosen": -220.09823608398438,
"logps/rejected": -241.9452362060547,
"loss": 0.455,
"rewards/accuracies": 0.7968750596046448,
"rewards/chosen": -1.3589019775390625,
"rewards/margins": 1.2111233472824097,
"rewards/rejected": -2.5700252056121826,
"step": 820
},
{
"epoch": 0.566625421342322,
"grad_norm": 32.236623934330666,
"learning_rate": 4.825493171471927e-07,
"logits/chosen": -0.477167010307312,
"logits/rejected": -0.4342673122882843,
"logps/chosen": -210.48898315429688,
"logps/rejected": -236.6256561279297,
"loss": 0.4202,
"rewards/accuracies": 0.8109375238418579,
"rewards/chosen": -1.266889214515686,
"rewards/margins": 1.3714545965194702,
"rewards/rejected": -2.6383438110351562,
"step": 830
},
{
"epoch": 0.5734522336476512,
"grad_norm": 22.52023880849803,
"learning_rate": 4.7496206373292864e-07,
"logits/chosen": -0.4688745141029358,
"logits/rejected": -0.4309556484222412,
"logps/chosen": -211.64566040039062,
"logps/rejected": -246.43594360351562,
"loss": 0.3935,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -1.1579673290252686,
"rewards/margins": 1.452370524406433,
"rewards/rejected": -2.610337972640991,
"step": 840
},
{
"epoch": 0.5802790459529803,
"grad_norm": 22.86650334456815,
"learning_rate": 4.673748103186646e-07,
"logits/chosen": -0.45984596014022827,
"logits/rejected": -0.4113742709159851,
"logps/chosen": -221.2747039794922,
"logps/rejected": -251.6842803955078,
"loss": 0.3937,
"rewards/accuracies": 0.8265625238418579,
"rewards/chosen": -1.2885468006134033,
"rewards/margins": 1.4187039136886597,
"rewards/rejected": -2.7072505950927734,
"step": 850
},
{
"epoch": 0.5871058582583095,
"grad_norm": 24.866961143566478,
"learning_rate": 4.597875569044006e-07,
"logits/chosen": -0.4843246042728424,
"logits/rejected": -0.43403178453445435,
"logps/chosen": -217.4392852783203,
"logps/rejected": -241.3638153076172,
"loss": 0.4269,
"rewards/accuracies": 0.7937500476837158,
"rewards/chosen": -1.3239818811416626,
"rewards/margins": 1.322103500366211,
"rewards/rejected": -2.646085500717163,
"step": 860
},
{
"epoch": 0.5939326705636387,
"grad_norm": 27.26389649040254,
"learning_rate": 4.5220030349013654e-07,
"logits/chosen": -0.5012161135673523,
"logits/rejected": -0.4667961299419403,
"logps/chosen": -214.83123779296875,
"logps/rejected": -237.57041931152344,
"loss": 0.4311,
"rewards/accuracies": 0.7953125238418579,
"rewards/chosen": -1.337403655052185,
"rewards/margins": 1.2597852945327759,
"rewards/rejected": -2.597188711166382,
"step": 870
},
{
"epoch": 0.6007594828689679,
"grad_norm": 27.440215221554137,
"learning_rate": 4.446130500758725e-07,
"logits/chosen": -0.5221506953239441,
"logits/rejected": -0.47429159283638,
"logps/chosen": -218.2428436279297,
"logps/rejected": -249.35227966308594,
"loss": 0.4219,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -1.3460763692855835,
"rewards/margins": 1.4286969900131226,
"rewards/rejected": -2.774773359298706,
"step": 880
},
{
"epoch": 0.6075862951742971,
"grad_norm": 26.890967949171902,
"learning_rate": 4.370257966616085e-07,
"logits/chosen": -0.5067352056503296,
"logits/rejected": -0.4783848822116852,
"logps/chosen": -214.87808227539062,
"logps/rejected": -240.6841583251953,
"loss": 0.4601,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.44821298122406,
"rewards/margins": 1.3052537441253662,
"rewards/rejected": -2.7534666061401367,
"step": 890
},
{
"epoch": 0.6144131074796262,
"grad_norm": 32.237693494920336,
"learning_rate": 4.2943854324734444e-07,
"logits/chosen": -0.4975440502166748,
"logits/rejected": -0.4508504271507263,
"logps/chosen": -218.56907653808594,
"logps/rejected": -252.4253692626953,
"loss": 0.4016,
"rewards/accuracies": 0.828125,
"rewards/chosen": -1.4073625802993774,
"rewards/margins": 1.4989043474197388,
"rewards/rejected": -2.9062671661376953,
"step": 900
},
{
"epoch": 0.6212399197849554,
"grad_norm": 29.55632282027641,
"learning_rate": 4.2185128983308036e-07,
"logits/chosen": -0.5099817514419556,
"logits/rejected": -0.45365262031555176,
"logps/chosen": -221.89877319335938,
"logps/rejected": -250.57550048828125,
"loss": 0.4166,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.3778979778289795,
"rewards/margins": 1.4088386297225952,
"rewards/rejected": -2.7867367267608643,
"step": 910
},
{
"epoch": 0.6280667320902846,
"grad_norm": 23.405437038158652,
"learning_rate": 4.142640364188164e-07,
"logits/chosen": -0.5026878118515015,
"logits/rejected": -0.46861255168914795,
"logps/chosen": -224.63916015625,
"logps/rejected": -250.61224365234375,
"loss": 0.4178,
"rewards/accuracies": 0.815625011920929,
"rewards/chosen": -1.4456019401550293,
"rewards/margins": 1.366437554359436,
"rewards/rejected": -2.812039613723755,
"step": 920
},
{
"epoch": 0.6348935443956137,
"grad_norm": 26.317645366377434,
"learning_rate": 4.0667678300455234e-07,
"logits/chosen": -0.5089496374130249,
"logits/rejected": -0.447975218296051,
"logps/chosen": -219.5886688232422,
"logps/rejected": -253.34963989257812,
"loss": 0.3633,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -1.3742705583572388,
"rewards/margins": 1.611161231994629,
"rewards/rejected": -2.985431671142578,
"step": 930
},
{
"epoch": 0.641720356700943,
"grad_norm": 21.42106734892436,
"learning_rate": 3.990895295902883e-07,
"logits/chosen": -0.5624995827674866,
"logits/rejected": -0.5080554485321045,
"logps/chosen": -222.9444580078125,
"logps/rejected": -251.54403686523438,
"loss": 0.3967,
"rewards/accuracies": 0.8250000476837158,
"rewards/chosen": -1.435417890548706,
"rewards/margins": 1.4602770805358887,
"rewards/rejected": -2.8956949710845947,
"step": 940
},
{
"epoch": 0.6485471690062722,
"grad_norm": 21.661157767536693,
"learning_rate": 3.915022761760243e-07,
"logits/chosen": -0.5764337182044983,
"logits/rejected": -0.5422189235687256,
"logps/chosen": -209.9173126220703,
"logps/rejected": -239.46363830566406,
"loss": 0.4007,
"rewards/accuracies": 0.8203125,
"rewards/chosen": -1.5316213369369507,
"rewards/margins": 1.480596899986267,
"rewards/rejected": -3.012218475341797,
"step": 950
},
{
"epoch": 0.6553739813116013,
"grad_norm": 23.414697757548318,
"learning_rate": 3.8391502276176024e-07,
"logits/chosen": -0.5713392496109009,
"logits/rejected": -0.5199674963951111,
"logps/chosen": -228.35032653808594,
"logps/rejected": -261.5352478027344,
"loss": 0.39,
"rewards/accuracies": 0.817187488079071,
"rewards/chosen": -1.6193687915802002,
"rewards/margins": 1.6461690664291382,
"rewards/rejected": -3.265537977218628,
"step": 960
},
{
"epoch": 0.6622007936169305,
"grad_norm": 26.16469307265146,
"learning_rate": 3.763277693474962e-07,
"logits/chosen": -0.5215846300125122,
"logits/rejected": -0.4677078127861023,
"logps/chosen": -218.17672729492188,
"logps/rejected": -251.90655517578125,
"loss": 0.4195,
"rewards/accuracies": 0.807812511920929,
"rewards/chosen": -1.639040470123291,
"rewards/margins": 1.440010666847229,
"rewards/rejected": -3.0790510177612305,
"step": 970
},
{
"epoch": 0.6690276059222596,
"grad_norm": 24.910240774548882,
"learning_rate": 3.687405159332321e-07,
"logits/chosen": -0.5399425029754639,
"logits/rejected": -0.5003796219825745,
"logps/chosen": -223.10562133789062,
"logps/rejected": -255.92811584472656,
"loss": 0.4148,
"rewards/accuracies": 0.8125000596046448,
"rewards/chosen": -1.7429447174072266,
"rewards/margins": 1.4403008222579956,
"rewards/rejected": -3.1832454204559326,
"step": 980
},
{
"epoch": 0.6758544182275888,
"grad_norm": 25.952929350002567,
"learning_rate": 3.611532625189681e-07,
"logits/chosen": -0.5129621028900146,
"logits/rejected": -0.4642283320426941,
"logps/chosen": -229.56149291992188,
"logps/rejected": -265.0505676269531,
"loss": 0.401,
"rewards/accuracies": 0.823437511920929,
"rewards/chosen": -1.6355669498443604,
"rewards/margins": 1.5693683624267578,
"rewards/rejected": -3.204935312271118,
"step": 990
},
{
"epoch": 0.6826812305329181,
"grad_norm": 30.276024499323412,
"learning_rate": 3.5356600910470406e-07,
"logits/chosen": -0.5371091365814209,
"logits/rejected": -0.4973250925540924,
"logps/chosen": -236.55892944335938,
"logps/rejected": -263.94873046875,
"loss": 0.3813,
"rewards/accuracies": 0.8421875238418579,
"rewards/chosen": -1.6107735633850098,
"rewards/margins": 1.5443642139434814,
"rewards/rejected": -3.155137538909912,
"step": 1000
},
{
"epoch": 0.6895080428382472,
"grad_norm": 29.460156400060427,
"learning_rate": 3.459787556904401e-07,
"logits/chosen": -0.5438990592956543,
"logits/rejected": -0.49257737398147583,
"logps/chosen": -223.89715576171875,
"logps/rejected": -254.1317138671875,
"loss": 0.405,
"rewards/accuracies": 0.823437511920929,
"rewards/chosen": -1.5855486392974854,
"rewards/margins": 1.5699981451034546,
"rewards/rejected": -3.1555471420288086,
"step": 1010
},
{
"epoch": 0.6963348551435764,
"grad_norm": 37.79554341081753,
"learning_rate": 3.3839150227617604e-07,
"logits/chosen": -0.5885217189788818,
"logits/rejected": -0.5452876091003418,
"logps/chosen": -228.7353515625,
"logps/rejected": -262.69781494140625,
"loss": 0.3902,
"rewards/accuracies": 0.8484375476837158,
"rewards/chosen": -1.5996198654174805,
"rewards/margins": 1.5858361721038818,
"rewards/rejected": -3.185455799102783,
"step": 1020
},
{
"epoch": 0.7031616674489056,
"grad_norm": 32.4485056058962,
"learning_rate": 3.30804248861912e-07,
"logits/chosen": -0.6067803502082825,
"logits/rejected": -0.5617104768753052,
"logps/chosen": -224.02381896972656,
"logps/rejected": -258.9015808105469,
"loss": 0.3977,
"rewards/accuracies": 0.8218750357627869,
"rewards/chosen": -1.7180209159851074,
"rewards/margins": 1.5710700750350952,
"rewards/rejected": -3.289091110229492,
"step": 1030
},
{
"epoch": 0.7099884797542347,
"grad_norm": 25.533353588922054,
"learning_rate": 3.232169954476479e-07,
"logits/chosen": -0.6116800308227539,
"logits/rejected": -0.5805966854095459,
"logps/chosen": -225.80697631835938,
"logps/rejected": -264.60693359375,
"loss": 0.3617,
"rewards/accuracies": 0.8453124761581421,
"rewards/chosen": -1.4660491943359375,
"rewards/margins": 1.7193350791931152,
"rewards/rejected": -3.1853842735290527,
"step": 1040
},
{
"epoch": 0.716815292059564,
"grad_norm": 26.60381949350959,
"learning_rate": 3.156297420333839e-07,
"logits/chosen": -0.6242787837982178,
"logits/rejected": -0.5573821067810059,
"logps/chosen": -226.86032104492188,
"logps/rejected": -264.05621337890625,
"loss": 0.3629,
"rewards/accuracies": 0.8421875238418579,
"rewards/chosen": -1.66229248046875,
"rewards/margins": 1.770555019378662,
"rewards/rejected": -3.432847499847412,
"step": 1050
},
{
"epoch": 0.7236421043648931,
"grad_norm": 23.579508860359553,
"learning_rate": 3.0804248861911986e-07,
"logits/chosen": -0.5435959696769714,
"logits/rejected": -0.49748197197914124,
"logps/chosen": -226.20513916015625,
"logps/rejected": -257.2373046875,
"loss": 0.3933,
"rewards/accuracies": 0.8281250596046448,
"rewards/chosen": -1.6908609867095947,
"rewards/margins": 1.6625466346740723,
"rewards/rejected": -3.353407144546509,
"step": 1060
},
{
"epoch": 0.7304689166702223,
"grad_norm": 27.167904414342036,
"learning_rate": 3.004552352048558e-07,
"logits/chosen": -0.6101936101913452,
"logits/rejected": -0.5515713691711426,
"logps/chosen": -222.96014404296875,
"logps/rejected": -254.43185424804688,
"loss": 0.3563,
"rewards/accuracies": 0.8421875238418579,
"rewards/chosen": -1.3781498670578003,
"rewards/margins": 1.5955610275268555,
"rewards/rejected": -2.9737110137939453,
"step": 1070
},
{
"epoch": 0.7372957289755515,
"grad_norm": 26.180827032107008,
"learning_rate": 2.928679817905918e-07,
"logits/chosen": -0.5770156383514404,
"logits/rejected": -0.5399613380432129,
"logps/chosen": -226.30825805664062,
"logps/rejected": -257.7979736328125,
"loss": 0.3894,
"rewards/accuracies": 0.8312500715255737,
"rewards/chosen": -1.5706027746200562,
"rewards/margins": 1.551680326461792,
"rewards/rejected": -3.1222832202911377,
"step": 1080
},
{
"epoch": 0.7441225412808806,
"grad_norm": 28.76861593583347,
"learning_rate": 2.8528072837632776e-07,
"logits/chosen": -0.5627835392951965,
"logits/rejected": -0.48850327730178833,
"logps/chosen": -215.5068359375,
"logps/rejected": -251.03070068359375,
"loss": 0.3595,
"rewards/accuracies": 0.8531249761581421,
"rewards/chosen": -1.5191683769226074,
"rewards/margins": 1.6994376182556152,
"rewards/rejected": -3.2186059951782227,
"step": 1090
},
{
"epoch": 0.7509493535862098,
"grad_norm": 28.410377070052604,
"learning_rate": 2.776934749620637e-07,
"logits/chosen": -0.5616481304168701,
"logits/rejected": -0.5304921865463257,
"logps/chosen": -213.49261474609375,
"logps/rejected": -240.36477661132812,
"loss": 0.4064,
"rewards/accuracies": 0.828125,
"rewards/chosen": -1.595428466796875,
"rewards/margins": 1.5317962169647217,
"rewards/rejected": -3.1272246837615967,
"step": 1100
},
{
"epoch": 0.7577761658915391,
"grad_norm": 28.197822913401385,
"learning_rate": 2.7010622154779964e-07,
"logits/chosen": -0.5820162892341614,
"logits/rejected": -0.5277166366577148,
"logps/chosen": -224.64027404785156,
"logps/rejected": -258.8291320800781,
"loss": 0.4045,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.5898746252059937,
"rewards/margins": 1.5575275421142578,
"rewards/rejected": -3.147402048110962,
"step": 1110
},
{
"epoch": 0.7646029781968682,
"grad_norm": 19.603917980899062,
"learning_rate": 2.6251896813353566e-07,
"logits/chosen": -0.5747621059417725,
"logits/rejected": -0.5307395458221436,
"logps/chosen": -224.86712646484375,
"logps/rejected": -255.05657958984375,
"loss": 0.3845,
"rewards/accuracies": 0.8203125,
"rewards/chosen": -1.4760308265686035,
"rewards/margins": 1.6863486766815186,
"rewards/rejected": -3.162379264831543,
"step": 1120
},
{
"epoch": 0.7714297905021974,
"grad_norm": 23.578968750795934,
"learning_rate": 2.549317147192716e-07,
"logits/chosen": -0.5622972846031189,
"logits/rejected": -0.5020321607589722,
"logps/chosen": -219.63861083984375,
"logps/rejected": -253.4189453125,
"loss": 0.3657,
"rewards/accuracies": 0.84375,
"rewards/chosen": -1.452361822128296,
"rewards/margins": 1.6745182275772095,
"rewards/rejected": -3.126879930496216,
"step": 1130
},
{
"epoch": 0.7782566028075265,
"grad_norm": 28.72777820280462,
"learning_rate": 2.473444613050076e-07,
"logits/chosen": -0.6052004098892212,
"logits/rejected": -0.5665544271469116,
"logps/chosen": -216.50294494628906,
"logps/rejected": -247.38058471679688,
"loss": 0.3809,
"rewards/accuracies": 0.8281250596046448,
"rewards/chosen": -1.524512529373169,
"rewards/margins": 1.5869126319885254,
"rewards/rejected": -3.1114251613616943,
"step": 1140
},
{
"epoch": 0.7850834151128557,
"grad_norm": 29.78459070126252,
"learning_rate": 2.3975720789074356e-07,
"logits/chosen": -0.6316035985946655,
"logits/rejected": -0.5892209410667419,
"logps/chosen": -231.2786407470703,
"logps/rejected": -262.7926025390625,
"loss": 0.3996,
"rewards/accuracies": 0.8359375,
"rewards/chosen": -1.5984147787094116,
"rewards/margins": 1.644458532333374,
"rewards/rejected": -3.242873430252075,
"step": 1150
},
{
"epoch": 0.791910227418185,
"grad_norm": 25.317404664815193,
"learning_rate": 2.321699544764795e-07,
"logits/chosen": -0.5851012468338013,
"logits/rejected": -0.5416604280471802,
"logps/chosen": -212.95620727539062,
"logps/rejected": -249.06094360351562,
"loss": 0.3691,
"rewards/accuracies": 0.8250000476837158,
"rewards/chosen": -1.6382710933685303,
"rewards/margins": 1.698676586151123,
"rewards/rejected": -3.336947441101074,
"step": 1160
},
{
"epoch": 0.7987370397235141,
"grad_norm": 30.536892352417116,
"learning_rate": 2.2458270106221546e-07,
"logits/chosen": -0.6020447611808777,
"logits/rejected": -0.5497596263885498,
"logps/chosen": -223.9817352294922,
"logps/rejected": -254.70596313476562,
"loss": 0.3743,
"rewards/accuracies": 0.832812488079071,
"rewards/chosen": -1.5852043628692627,
"rewards/margins": 1.6272475719451904,
"rewards/rejected": -3.212451934814453,
"step": 1170
},
{
"epoch": 0.8055638520288433,
"grad_norm": 79.72745340543844,
"learning_rate": 2.1699544764795143e-07,
"logits/chosen": -0.6025545597076416,
"logits/rejected": -0.5697463154792786,
"logps/chosen": -222.6460418701172,
"logps/rejected": -253.47068786621094,
"loss": 0.4136,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -1.6384048461914062,
"rewards/margins": 1.580573558807373,
"rewards/rejected": -3.2189784049987793,
"step": 1180
},
{
"epoch": 0.8123906643341725,
"grad_norm": 22.663782810990345,
"learning_rate": 2.094081942336874e-07,
"logits/chosen": -0.5814501047134399,
"logits/rejected": -0.541379988193512,
"logps/chosen": -221.52999877929688,
"logps/rejected": -253.7140350341797,
"loss": 0.3363,
"rewards/accuracies": 0.8640625476837158,
"rewards/chosen": -1.6155626773834229,
"rewards/margins": 1.7298386096954346,
"rewards/rejected": -3.3454012870788574,
"step": 1190
},
{
"epoch": 0.8192174766395016,
"grad_norm": 20.696850459845496,
"learning_rate": 2.0182094081942336e-07,
"logits/chosen": -0.5838125944137573,
"logits/rejected": -0.5411943793296814,
"logps/chosen": -225.6829071044922,
"logps/rejected": -253.77597045898438,
"loss": 0.3816,
"rewards/accuracies": 0.8343750238418579,
"rewards/chosen": -1.6227580308914185,
"rewards/margins": 1.729698896408081,
"rewards/rejected": -3.352457046508789,
"step": 1200
},
{
"epoch": 0.8260442889448308,
"grad_norm": 22.607979595724903,
"learning_rate": 1.9423368740515933e-07,
"logits/chosen": -0.5873730182647705,
"logits/rejected": -0.5527704954147339,
"logps/chosen": -236.8846435546875,
"logps/rejected": -263.2967224121094,
"loss": 0.366,
"rewards/accuracies": 0.8531250357627869,
"rewards/chosen": -1.619153618812561,
"rewards/margins": 1.6637271642684937,
"rewards/rejected": -3.2828805446624756,
"step": 1210
},
{
"epoch": 0.83287110125016,
"grad_norm": 26.109280560076318,
"learning_rate": 1.8664643399089527e-07,
"logits/chosen": -0.5835367441177368,
"logits/rejected": -0.5328267216682434,
"logps/chosen": -220.54185485839844,
"logps/rejected": -259.7679748535156,
"loss": 0.3908,
"rewards/accuracies": 0.828125,
"rewards/chosen": -1.6561741828918457,
"rewards/margins": 1.7342405319213867,
"rewards/rejected": -3.3904144763946533,
"step": 1220
},
{
"epoch": 0.8396979135554892,
"grad_norm": 32.97497497653311,
"learning_rate": 1.7905918057663124e-07,
"logits/chosen": -0.6125339269638062,
"logits/rejected": -0.5747178792953491,
"logps/chosen": -222.89346313476562,
"logps/rejected": -259.0747985839844,
"loss": 0.3757,
"rewards/accuracies": 0.8484375476837158,
"rewards/chosen": -1.631098747253418,
"rewards/margins": 1.6798287630081177,
"rewards/rejected": -3.310927391052246,
"step": 1230
},
{
"epoch": 0.8465247258608184,
"grad_norm": 27.98596142342839,
"learning_rate": 1.7147192716236723e-07,
"logits/chosen": -0.5824239253997803,
"logits/rejected": -0.5337764024734497,
"logps/chosen": -224.97032165527344,
"logps/rejected": -253.56494140625,
"loss": 0.3747,
"rewards/accuracies": 0.8359375,
"rewards/chosen": -1.6330980062484741,
"rewards/margins": 1.6164720058441162,
"rewards/rejected": -3.249569892883301,
"step": 1240
},
{
"epoch": 0.8533515381661475,
"grad_norm": 29.97641566967343,
"learning_rate": 1.638846737481032e-07,
"logits/chosen": -0.6242474913597107,
"logits/rejected": -0.5719231963157654,
"logps/chosen": -225.42031860351562,
"logps/rejected": -250.8466796875,
"loss": 0.3653,
"rewards/accuracies": 0.8453124761581421,
"rewards/chosen": -1.5972038507461548,
"rewards/margins": 1.6274079084396362,
"rewards/rejected": -3.224611759185791,
"step": 1250
},
{
"epoch": 0.8601783504714767,
"grad_norm": 28.827180040399767,
"learning_rate": 1.5629742033383914e-07,
"logits/chosen": -0.5725576877593994,
"logits/rejected": -0.5013697147369385,
"logps/chosen": -229.0035400390625,
"logps/rejected": -265.1317443847656,
"loss": 0.348,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.63456392288208,
"rewards/margins": 1.9875181913375854,
"rewards/rejected": -3.622081995010376,
"step": 1260
},
{
"epoch": 0.867005162776806,
"grad_norm": 25.80801920302886,
"learning_rate": 1.487101669195751e-07,
"logits/chosen": -0.5822853446006775,
"logits/rejected": -0.5320168733596802,
"logps/chosen": -225.67462158203125,
"logps/rejected": -261.8921203613281,
"loss": 0.3703,
"rewards/accuracies": 0.8406250476837158,
"rewards/chosen": -1.611586332321167,
"rewards/margins": 1.8382863998413086,
"rewards/rejected": -3.4498729705810547,
"step": 1270
},
{
"epoch": 0.8738319750821351,
"grad_norm": 22.850573481872292,
"learning_rate": 1.4112291350531107e-07,
"logits/chosen": -0.5987452268600464,
"logits/rejected": -0.555465817451477,
"logps/chosen": -221.936767578125,
"logps/rejected": -258.4222106933594,
"loss": 0.3694,
"rewards/accuracies": 0.8343750238418579,
"rewards/chosen": -1.4851796627044678,
"rewards/margins": 1.6478145122528076,
"rewards/rejected": -3.1329944133758545,
"step": 1280
},
{
"epoch": 0.8806587873874643,
"grad_norm": 31.59414494820462,
"learning_rate": 1.3353566009104704e-07,
"logits/chosen": -0.6345129013061523,
"logits/rejected": -0.5726636052131653,
"logps/chosen": -225.1095428466797,
"logps/rejected": -260.2147216796875,
"loss": 0.3516,
"rewards/accuracies": 0.8609375357627869,
"rewards/chosen": -1.6206376552581787,
"rewards/margins": 1.7943859100341797,
"rewards/rejected": -3.4150233268737793,
"step": 1290
},
{
"epoch": 0.8874855996927934,
"grad_norm": 22.999874271408995,
"learning_rate": 1.25948406676783e-07,
"logits/chosen": -0.6340910196304321,
"logits/rejected": -0.6022393703460693,
"logps/chosen": -227.1848907470703,
"logps/rejected": -259.4250183105469,
"loss": 0.3606,
"rewards/accuracies": 0.840624988079071,
"rewards/chosen": -1.6741951704025269,
"rewards/margins": 1.7158465385437012,
"rewards/rejected": -3.3900415897369385,
"step": 1300
},
{
"epoch": 0.8943124119981226,
"grad_norm": 22.57593696460777,
"learning_rate": 1.1836115326251896e-07,
"logits/chosen": -0.6423132419586182,
"logits/rejected": -0.5848190188407898,
"logps/chosen": -223.364990234375,
"logps/rejected": -262.73785400390625,
"loss": 0.3355,
"rewards/accuracies": 0.8578125238418579,
"rewards/chosen": -1.5638341903686523,
"rewards/margins": 1.9036611318588257,
"rewards/rejected": -3.4674954414367676,
"step": 1310
},
{
"epoch": 0.9011392243034518,
"grad_norm": 37.54515881184046,
"learning_rate": 1.1077389984825493e-07,
"logits/chosen": -0.575082540512085,
"logits/rejected": -0.5424289107322693,
"logps/chosen": -237.77911376953125,
"logps/rejected": -275.63555908203125,
"loss": 0.3516,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -1.6724742650985718,
"rewards/margins": 1.8654717206954956,
"rewards/rejected": -3.5379459857940674,
"step": 1320
},
{
"epoch": 0.907966036608781,
"grad_norm": 22.84612850953967,
"learning_rate": 1.0318664643399089e-07,
"logits/chosen": -0.5780067443847656,
"logits/rejected": -0.5416074395179749,
"logps/chosen": -221.38531494140625,
"logps/rejected": -256.49200439453125,
"loss": 0.3723,
"rewards/accuracies": 0.8531250357627869,
"rewards/chosen": -1.6589443683624268,
"rewards/margins": 1.743546724319458,
"rewards/rejected": -3.4024910926818848,
"step": 1330
},
{
"epoch": 0.9147928489141102,
"grad_norm": 19.89021053617579,
"learning_rate": 9.559939301972686e-08,
"logits/chosen": -0.6442657709121704,
"logits/rejected": -0.6099978685379028,
"logps/chosen": -224.65768432617188,
"logps/rejected": -256.4526062011719,
"loss": 0.3785,
"rewards/accuracies": 0.839062511920929,
"rewards/chosen": -1.7409473657608032,
"rewards/margins": 1.636985421180725,
"rewards/rejected": -3.3779327869415283,
"step": 1340
},
{
"epoch": 0.9216196612194394,
"grad_norm": 25.742427094875804,
"learning_rate": 8.801213960546281e-08,
"logits/chosen": -0.6490598320960999,
"logits/rejected": -0.5897331833839417,
"logps/chosen": -223.9561309814453,
"logps/rejected": -259.2427062988281,
"loss": 0.3468,
"rewards/accuracies": 0.8546874523162842,
"rewards/chosen": -1.6736507415771484,
"rewards/margins": 1.7775483131408691,
"rewards/rejected": -3.4511990547180176,
"step": 1350
},
{
"epoch": 0.9284464735247685,
"grad_norm": 33.277457563263674,
"learning_rate": 8.042488619119878e-08,
"logits/chosen": -0.6450273990631104,
"logits/rejected": -0.6108094453811646,
"logps/chosen": -225.92788696289062,
"logps/rejected": -257.3470458984375,
"loss": 0.4063,
"rewards/accuracies": 0.8234375715255737,
"rewards/chosen": -1.7581716775894165,
"rewards/margins": 1.6247668266296387,
"rewards/rejected": -3.3829383850097656,
"step": 1360
},
{
"epoch": 0.9352732858300977,
"grad_norm": 27.273007529128005,
"learning_rate": 7.283763277693475e-08,
"logits/chosen": -0.5727499723434448,
"logits/rejected": -0.5322836637496948,
"logps/chosen": -225.6246337890625,
"logps/rejected": -255.45809936523438,
"loss": 0.3476,
"rewards/accuracies": 0.854687511920929,
"rewards/chosen": -1.7323064804077148,
"rewards/margins": 1.745940923690796,
"rewards/rejected": -3.4782474040985107,
"step": 1370
},
{
"epoch": 0.9421000981354269,
"grad_norm": 32.146995205900126,
"learning_rate": 6.525037936267071e-08,
"logits/chosen": -0.6203707456588745,
"logits/rejected": -0.5694869756698608,
"logps/chosen": -224.69058227539062,
"logps/rejected": -264.454833984375,
"loss": 0.3231,
"rewards/accuracies": 0.8843750357627869,
"rewards/chosen": -1.678446888923645,
"rewards/margins": 1.9156006574630737,
"rewards/rejected": -3.5940475463867188,
"step": 1380
},
{
"epoch": 0.948926910440756,
"grad_norm": 30.60411185742407,
"learning_rate": 5.766312594840667e-08,
"logits/chosen": -0.6311684846878052,
"logits/rejected": -0.586366593837738,
"logps/chosen": -220.22238159179688,
"logps/rejected": -253.02584838867188,
"loss": 0.3841,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -1.6623611450195312,
"rewards/margins": 1.5844680070877075,
"rewards/rejected": -3.2468292713165283,
"step": 1390
},
{
"epoch": 0.9557537227460853,
"grad_norm": 30.706698645138353,
"learning_rate": 5.007587253414264e-08,
"logits/chosen": -0.6784946918487549,
"logits/rejected": -0.6356594562530518,
"logps/chosen": -224.51947021484375,
"logps/rejected": -258.38885498046875,
"loss": 0.3436,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.5771867036819458,
"rewards/margins": 1.9237372875213623,
"rewards/rejected": -3.5009241104125977,
"step": 1400
},
{
"epoch": 0.9625805350514144,
"grad_norm": 21.98143091385874,
"learning_rate": 4.2488619119878606e-08,
"logits/chosen": -0.6370885372161865,
"logits/rejected": -0.5851677060127258,
"logps/chosen": -227.73484802246094,
"logps/rejected": -267.04486083984375,
"loss": 0.3695,
"rewards/accuracies": 0.8312500715255737,
"rewards/chosen": -1.629777431488037,
"rewards/margins": 1.7639508247375488,
"rewards/rejected": -3.393728494644165,
"step": 1410
},
{
"epoch": 0.9694073473567436,
"grad_norm": 30.869528987745515,
"learning_rate": 3.4901365705614566e-08,
"logits/chosen": -0.6314695477485657,
"logits/rejected": -0.5921632647514343,
"logps/chosen": -221.69427490234375,
"logps/rejected": -254.48204040527344,
"loss": 0.3673,
"rewards/accuracies": 0.8250000476837158,
"rewards/chosen": -1.5672862529754639,
"rewards/margins": 1.6834831237792969,
"rewards/rejected": -3.2507691383361816,
"step": 1420
},
{
"epoch": 0.9762341596620728,
"grad_norm": 27.10327349430508,
"learning_rate": 2.731411229135053e-08,
"logits/chosen": -0.6630594730377197,
"logits/rejected": -0.6296666264533997,
"logps/chosen": -224.44802856445312,
"logps/rejected": -263.31512451171875,
"loss": 0.3689,
"rewards/accuracies": 0.846875011920929,
"rewards/chosen": -1.7772753238677979,
"rewards/margins": 1.7530099153518677,
"rewards/rejected": -3.530285358428955,
"step": 1430
},
{
"epoch": 0.9830609719674019,
"grad_norm": 28.43487263604258,
"learning_rate": 1.9726858877086493e-08,
"logits/chosen": -0.6382923126220703,
"logits/rejected": -0.5936453342437744,
"logps/chosen": -225.35842895507812,
"logps/rejected": -257.9464416503906,
"loss": 0.3744,
"rewards/accuracies": 0.8468750715255737,
"rewards/chosen": -1.7416939735412598,
"rewards/margins": 1.7625494003295898,
"rewards/rejected": -3.5042431354522705,
"step": 1440
},
{
"epoch": 0.9898877842727312,
"grad_norm": 29.892251091074566,
"learning_rate": 1.2139605462822458e-08,
"logits/chosen": -0.6021047830581665,
"logits/rejected": -0.5681164860725403,
"logps/chosen": -228.019287109375,
"logps/rejected": -261.641357421875,
"loss": 0.3924,
"rewards/accuracies": 0.8343750238418579,
"rewards/chosen": -1.7237948179244995,
"rewards/margins": 1.5835388898849487,
"rewards/rejected": -3.307333469390869,
"step": 1450
},
{
"epoch": 0.9967145965780604,
"grad_norm": 36.52351236896022,
"learning_rate": 4.552352048558422e-09,
"logits/chosen": -0.6409857273101807,
"logits/rejected": -0.6134127378463745,
"logps/chosen": -229.7710418701172,
"logps/rejected": -268.1390075683594,
"loss": 0.338,
"rewards/accuracies": 0.870312511920929,
"rewards/chosen": -1.6543751955032349,
"rewards/margins": 1.8962233066558838,
"rewards/rejected": -3.550598621368408,
"step": 1460
},
{
"epoch": 1.0,
"step": 1465,
"total_flos": 161167907028992.0,
"train_loss": 0.47729448792063744,
"train_runtime": 14275.1765,
"train_samples_per_second": 6.567,
"train_steps_per_second": 0.103
}
],
"logging_steps": 10,
"max_steps": 1465,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 161167907028992.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}