{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 1467,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.006817503941369466,
"grad_norm": 20.121693308244087,
"learning_rate": 6.122448979591837e-08,
"logits/chosen": -0.013989130035042763,
"logits/rejected": 0.058542873710393906,
"logps/chosen": -190.215087890625,
"logps/rejected": -203.27479553222656,
"loss": 0.6922,
"rewards/accuracies": 0.4390625059604645,
"rewards/chosen": 0.0016301964642480016,
"rewards/margins": 0.0021729914005845785,
"rewards/rejected": -0.0005427949363365769,
"step": 10
},
{
"epoch": 0.013635007882738932,
"grad_norm": 20.76481447180183,
"learning_rate": 1.2925170068027211e-07,
"logits/chosen": -0.006685615051537752,
"logits/rejected": 0.06347016990184784,
"logps/chosen": -191.0093994140625,
"logps/rejected": -203.0770263671875,
"loss": 0.6933,
"rewards/accuracies": 0.5015624761581421,
"rewards/chosen": -0.0009165612864308059,
"rewards/margins": 8.891527249943465e-05,
"rewards/rejected": -0.0010054768063127995,
"step": 20
},
{
"epoch": 0.0204525118241084,
"grad_norm": 21.80914665550797,
"learning_rate": 1.9727891156462583e-07,
"logits/chosen": 0.02424698881804943,
"logits/rejected": 0.0787603035569191,
"logps/chosen": -188.7990264892578,
"logps/rejected": -198.6798095703125,
"loss": 0.6929,
"rewards/accuracies": 0.5093750357627869,
"rewards/chosen": 0.0011249443050473928,
"rewards/margins": 0.000932438881136477,
"rewards/rejected": 0.00019250542391091585,
"step": 30
},
{
"epoch": 0.027270015765477863,
"grad_norm": 18.7019723893629,
"learning_rate": 2.653061224489796e-07,
"logits/chosen": -0.006816861219704151,
"logits/rejected": 0.05330298840999603,
"logps/chosen": -183.76039123535156,
"logps/rejected": -199.3633575439453,
"loss": 0.6925,
"rewards/accuracies": 0.520312488079071,
"rewards/chosen": -0.0010295719839632511,
"rewards/margins": 0.0016696588136255741,
"rewards/rejected": -0.0026992305647581816,
"step": 40
},
{
"epoch": 0.03408751970684733,
"grad_norm": 20.831078202136776,
"learning_rate": 3.333333333333333e-07,
"logits/chosen": 0.03178512677550316,
"logits/rejected": 0.10444696992635727,
"logps/chosen": -182.8831329345703,
"logps/rejected": -193.09466552734375,
"loss": 0.6921,
"rewards/accuracies": 0.520312488079071,
"rewards/chosen": -0.0042633856646716595,
"rewards/margins": 0.0025998500641435385,
"rewards/rejected": -0.006863235495984554,
"step": 50
},
{
"epoch": 0.0409050236482168,
"grad_norm": 23.46573004490643,
"learning_rate": 4.0136054421768705e-07,
"logits/chosen": -0.0085222776979208,
"logits/rejected": 0.04688471555709839,
"logps/chosen": -187.2247314453125,
"logps/rejected": -197.925537109375,
"loss": 0.6919,
"rewards/accuracies": 0.5218749642372131,
"rewards/chosen": -0.009058980271220207,
"rewards/margins": 0.003005079925060272,
"rewards/rejected": -0.01206406019628048,
"step": 60
},
{
"epoch": 0.04772252758958626,
"grad_norm": 21.16825784724637,
"learning_rate": 4.693877551020408e-07,
"logits/chosen": 0.03674064576625824,
"logits/rejected": 0.10054312646389008,
"logps/chosen": -177.9295654296875,
"logps/rejected": -190.43357849121094,
"loss": 0.6919,
"rewards/accuracies": 0.5453125238418579,
"rewards/chosen": -0.014818010851740837,
"rewards/margins": 0.0029301545582711697,
"rewards/rejected": -0.017748164013028145,
"step": 70
},
{
"epoch": 0.05454003153095573,
"grad_norm": 19.91582657292646,
"learning_rate": 5.374149659863945e-07,
"logits/chosen": 0.03953830525279045,
"logits/rejected": 0.0943986028432846,
"logps/chosen": -174.0701446533203,
"logps/rejected": -185.5764617919922,
"loss": 0.6897,
"rewards/accuracies": 0.5453125238418579,
"rewards/chosen": -0.02375047467648983,
"rewards/margins": 0.007541469298303127,
"rewards/rejected": -0.03129194676876068,
"step": 80
},
{
"epoch": 0.0613575354723252,
"grad_norm": 19.184157083023965,
"learning_rate": 6.054421768707482e-07,
"logits/chosen": 0.041828252375125885,
"logits/rejected": 0.11053334176540375,
"logps/chosen": -180.23428344726562,
"logps/rejected": -192.7969207763672,
"loss": 0.6894,
"rewards/accuracies": 0.5656249523162842,
"rewards/chosen": -0.04171518608927727,
"rewards/margins": 0.00825162697583437,
"rewards/rejected": -0.04996681213378906,
"step": 90
},
{
"epoch": 0.06817503941369465,
"grad_norm": 21.70613257034061,
"learning_rate": 6.734693877551019e-07,
"logits/chosen": 0.03590967878699303,
"logits/rejected": 0.12106480449438095,
"logps/chosen": -187.4181365966797,
"logps/rejected": -196.91404724121094,
"loss": 0.6858,
"rewards/accuracies": 0.620312511920929,
"rewards/chosen": -0.06401355564594269,
"rewards/margins": 0.015702249482274055,
"rewards/rejected": -0.07971581071615219,
"step": 100
},
{
"epoch": 0.07499254335506413,
"grad_norm": 21.340307822546738,
"learning_rate": 7.414965986394558e-07,
"logits/chosen": 0.054249007254838943,
"logits/rejected": 0.11480608582496643,
"logps/chosen": -198.7044219970703,
"logps/rejected": -212.5745086669922,
"loss": 0.6826,
"rewards/accuracies": 0.598437488079071,
"rewards/chosen": -0.10510613769292831,
"rewards/margins": 0.02307462878525257,
"rewards/rejected": -0.12818075716495514,
"step": 110
},
{
"epoch": 0.0818100472964336,
"grad_norm": 19.224497336436897,
"learning_rate": 8.095238095238095e-07,
"logits/chosen": 0.09257032722234726,
"logits/rejected": 0.12628528475761414,
"logps/chosen": -186.2566680908203,
"logps/rejected": -198.2835235595703,
"loss": 0.6818,
"rewards/accuracies": 0.6015625596046448,
"rewards/chosen": -0.13667678833007812,
"rewards/margins": 0.02535596489906311,
"rewards/rejected": -0.16203275322914124,
"step": 120
},
{
"epoch": 0.08862755123780305,
"grad_norm": 21.13611235058464,
"learning_rate": 8.775510204081632e-07,
"logits/chosen": 0.09457506239414215,
"logits/rejected": 0.15074704587459564,
"logps/chosen": -194.18267822265625,
"logps/rejected": -206.7709503173828,
"loss": 0.6724,
"rewards/accuracies": 0.6031250357627869,
"rewards/chosen": -0.17589515447616577,
"rewards/margins": 0.04765651002526283,
"rewards/rejected": -0.2235516607761383,
"step": 130
},
{
"epoch": 0.09544505517917252,
"grad_norm": 19.76839160560583,
"learning_rate": 9.45578231292517e-07,
"logits/chosen": 0.11603380739688873,
"logits/rejected": 0.1540485918521881,
"logps/chosen": -194.67906188964844,
"logps/rejected": -201.3298797607422,
"loss": 0.6753,
"rewards/accuracies": 0.6421875357627869,
"rewards/chosen": -0.21486632525920868,
"rewards/margins": 0.04251245781779289,
"rewards/rejected": -0.25737878680229187,
"step": 140
},
{
"epoch": 0.102262559120542,
"grad_norm": 20.781161821934894,
"learning_rate": 9.984848484848486e-07,
"logits/chosen": 0.18178227543830872,
"logits/rejected": 0.20421989262104034,
"logps/chosen": -194.18841552734375,
"logps/rejected": -205.5372314453125,
"loss": 0.6693,
"rewards/accuracies": 0.6109374761581421,
"rewards/chosen": -0.24953892827033997,
"rewards/margins": 0.05746041238307953,
"rewards/rejected": -0.3069993257522583,
"step": 150
},
{
"epoch": 0.10908006306191145,
"grad_norm": 20.949425745434294,
"learning_rate": 9.909090909090909e-07,
"logits/chosen": 0.16577480733394623,
"logits/rejected": 0.22489143908023834,
"logps/chosen": -189.01882934570312,
"logps/rejected": -204.76612854003906,
"loss": 0.6553,
"rewards/accuracies": 0.6578125357627869,
"rewards/chosen": -0.2690110504627228,
"rewards/margins": 0.0968737006187439,
"rewards/rejected": -0.36588478088378906,
"step": 160
},
{
"epoch": 0.11589756700328092,
"grad_norm": 21.376727693673736,
"learning_rate": 9.833333333333332e-07,
"logits/chosen": 0.16099530458450317,
"logits/rejected": 0.20968888700008392,
"logps/chosen": -198.27276611328125,
"logps/rejected": -207.08128356933594,
"loss": 0.6546,
"rewards/accuracies": 0.6500000357627869,
"rewards/chosen": -0.31624093651771545,
"rewards/margins": 0.09791112691164017,
"rewards/rejected": -0.4141520857810974,
"step": 170
},
{
"epoch": 0.1227150709446504,
"grad_norm": 20.47632034356792,
"learning_rate": 9.757575757575757e-07,
"logits/chosen": 0.16175265610218048,
"logits/rejected": 0.24207058548927307,
"logps/chosen": -192.8699188232422,
"logps/rejected": -204.4312744140625,
"loss": 0.6469,
"rewards/accuracies": 0.6687500476837158,
"rewards/chosen": -0.3407444357872009,
"rewards/margins": 0.12092556804418564,
"rewards/rejected": -0.46167001128196716,
"step": 180
},
{
"epoch": 0.12953257488601985,
"grad_norm": 20.746940996761676,
"learning_rate": 9.681818181818182e-07,
"logits/chosen": 0.15175826847553253,
"logits/rejected": 0.21674920618534088,
"logps/chosen": -193.29212951660156,
"logps/rejected": -209.36143493652344,
"loss": 0.6389,
"rewards/accuracies": 0.6812500357627869,
"rewards/chosen": -0.3786366581916809,
"rewards/margins": 0.1404908001422882,
"rewards/rejected": -0.5191274285316467,
"step": 190
},
{
"epoch": 0.1363500788273893,
"grad_norm": 20.484642032996728,
"learning_rate": 9.606060606060605e-07,
"logits/chosen": 0.1607164442539215,
"logits/rejected": 0.22002199292182922,
"logps/chosen": -197.4151153564453,
"logps/rejected": -209.8327178955078,
"loss": 0.6291,
"rewards/accuracies": 0.6609375476837158,
"rewards/chosen": -0.41719570755958557,
"rewards/margins": 0.17708109319210052,
"rewards/rejected": -0.5942767858505249,
"step": 200
},
{
"epoch": 0.1431675827687588,
"grad_norm": 26.738984065984987,
"learning_rate": 9.53030303030303e-07,
"logits/chosen": 0.15654993057250977,
"logits/rejected": 0.2388145625591278,
"logps/chosen": -195.02975463867188,
"logps/rejected": -207.19190979003906,
"loss": 0.6342,
"rewards/accuracies": 0.6749999523162842,
"rewards/chosen": -0.4655718505382538,
"rewards/margins": 0.16476726531982422,
"rewards/rejected": -0.6303391456604004,
"step": 210
},
{
"epoch": 0.14998508671012825,
"grad_norm": 20.33866123420931,
"learning_rate": 9.454545454545454e-07,
"logits/chosen": 0.12783432006835938,
"logits/rejected": 0.1976049840450287,
"logps/chosen": -201.7896728515625,
"logps/rejected": -215.41249084472656,
"loss": 0.6291,
"rewards/accuracies": 0.6609375476837158,
"rewards/chosen": -0.5083937644958496,
"rewards/margins": 0.18992076814174652,
"rewards/rejected": -0.6983146071434021,
"step": 220
},
{
"epoch": 0.1568025906514977,
"grad_norm": 32.54405565292402,
"learning_rate": 9.378787878787879e-07,
"logits/chosen": 0.1527099907398224,
"logits/rejected": 0.22111022472381592,
"logps/chosen": -193.4207763671875,
"logps/rejected": -207.85169982910156,
"loss": 0.6212,
"rewards/accuracies": 0.690625011920929,
"rewards/chosen": -0.531213641166687,
"rewards/margins": 0.2235802412033081,
"rewards/rejected": -0.7547938823699951,
"step": 230
},
{
"epoch": 0.1636200945928672,
"grad_norm": 19.095273306756834,
"learning_rate": 9.303030303030303e-07,
"logits/chosen": 0.15274283289909363,
"logits/rejected": 0.21214556694030762,
"logps/chosen": -196.24371337890625,
"logps/rejected": -213.04237365722656,
"loss": 0.6085,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -0.5204161405563354,
"rewards/margins": 0.2579624056816101,
"rewards/rejected": -0.7783786058425903,
"step": 240
},
{
"epoch": 0.17043759853423665,
"grad_norm": 22.735594447037276,
"learning_rate": 9.227272727272727e-07,
"logits/chosen": 0.1225215271115303,
"logits/rejected": 0.18497014045715332,
"logps/chosen": -192.548095703125,
"logps/rejected": -207.9135284423828,
"loss": 0.6233,
"rewards/accuracies": 0.667187511920929,
"rewards/chosen": -0.5648759603500366,
"rewards/margins": 0.2269633412361145,
"rewards/rejected": -0.7918393611907959,
"step": 250
},
{
"epoch": 0.1772551024756061,
"grad_norm": 22.954060831129915,
"learning_rate": 9.151515151515152e-07,
"logits/chosen": 0.1713658571243286,
"logits/rejected": 0.25986677408218384,
"logps/chosen": -198.60391235351562,
"logps/rejected": -214.2535400390625,
"loss": 0.6071,
"rewards/accuracies": 0.6890625357627869,
"rewards/chosen": -0.6113271713256836,
"rewards/margins": 0.27784913778305054,
"rewards/rejected": -0.8891763091087341,
"step": 260
},
{
"epoch": 0.1840726064169756,
"grad_norm": 22.190753794677473,
"learning_rate": 9.075757575757576e-07,
"logits/chosen": 0.14579366147518158,
"logits/rejected": 0.20252245664596558,
"logps/chosen": -199.14405822753906,
"logps/rejected": -213.55294799804688,
"loss": 0.6017,
"rewards/accuracies": 0.6906250715255737,
"rewards/chosen": -0.5976826548576355,
"rewards/margins": 0.3174746632575989,
"rewards/rejected": -0.9151572585105896,
"step": 270
},
{
"epoch": 0.19089011035834505,
"grad_norm": 22.054250481854893,
"learning_rate": 9e-07,
"logits/chosen": 0.11682489514350891,
"logits/rejected": 0.18400567770004272,
"logps/chosen": -195.43438720703125,
"logps/rejected": -214.8118896484375,
"loss": 0.6076,
"rewards/accuracies": 0.6781250238418579,
"rewards/chosen": -0.600721001625061,
"rewards/margins": 0.29136893153190613,
"rewards/rejected": -0.89208984375,
"step": 280
},
{
"epoch": 0.1977076142997145,
"grad_norm": 24.117104152592596,
"learning_rate": 8.924242424242425e-07,
"logits/chosen": 0.08254396170377731,
"logits/rejected": 0.15450119972229004,
"logps/chosen": -202.29647827148438,
"logps/rejected": -221.5592498779297,
"loss": 0.5958,
"rewards/accuracies": 0.6859375238418579,
"rewards/chosen": -0.5900746583938599,
"rewards/margins": 0.34352385997772217,
"rewards/rejected": -0.9335983991622925,
"step": 290
},
{
"epoch": 0.204525118241084,
"grad_norm": 21.604244584329482,
"learning_rate": 8.848484848484849e-07,
"logits/chosen": 0.08819441497325897,
"logits/rejected": 0.17239636182785034,
"logps/chosen": -188.81192016601562,
"logps/rejected": -208.72073364257812,
"loss": 0.569,
"rewards/accuracies": 0.7328125238418579,
"rewards/chosen": -0.6129291653633118,
"rewards/margins": 0.4271809160709381,
"rewards/rejected": -1.0401101112365723,
"step": 300
},
{
"epoch": 0.21134262218245345,
"grad_norm": 22.020365760713695,
"learning_rate": 8.772727272727273e-07,
"logits/chosen": 0.06760307401418686,
"logits/rejected": 0.14344710111618042,
"logps/chosen": -195.82293701171875,
"logps/rejected": -212.59982299804688,
"loss": 0.5687,
"rewards/accuracies": 0.7140624523162842,
"rewards/chosen": -0.6414520740509033,
"rewards/margins": 0.4150450825691223,
"rewards/rejected": -1.0564970970153809,
"step": 310
},
{
"epoch": 0.2181601261238229,
"grad_norm": 19.161894841909444,
"learning_rate": 8.696969696969697e-07,
"logits/chosen": 0.11280106008052826,
"logits/rejected": 0.18791824579238892,
"logps/chosen": -209.43258666992188,
"logps/rejected": -236.70346069335938,
"loss": 0.5635,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -0.74039626121521,
"rewards/margins": 0.5291692018508911,
"rewards/rejected": -1.269565463066101,
"step": 320
},
{
"epoch": 0.2249776300651924,
"grad_norm": 22.3343774818719,
"learning_rate": 8.62121212121212e-07,
"logits/chosen": 0.09679359942674637,
"logits/rejected": 0.19822388887405396,
"logps/chosen": -208.64476013183594,
"logps/rejected": -229.81011962890625,
"loss": 0.5596,
"rewards/accuracies": 0.7343750596046448,
"rewards/chosen": -0.7948130965232849,
"rewards/margins": 0.5681655406951904,
"rewards/rejected": -1.3629785776138306,
"step": 330
},
{
"epoch": 0.23179513400656185,
"grad_norm": 20.045843226629753,
"learning_rate": 8.545454545454544e-07,
"logits/chosen": 0.03968825936317444,
"logits/rejected": 0.12135367095470428,
"logps/chosen": -207.46609497070312,
"logps/rejected": -224.89439392089844,
"loss": 0.5679,
"rewards/accuracies": 0.7046875357627869,
"rewards/chosen": -0.7988042831420898,
"rewards/margins": 0.4717750549316406,
"rewards/rejected": -1.2705793380737305,
"step": 340
},
{
"epoch": 0.2386126379479313,
"grad_norm": 30.125557475346305,
"learning_rate": 8.469696969696968e-07,
"logits/chosen": 0.10933436453342438,
"logits/rejected": 0.1516590416431427,
"logps/chosen": -201.27694702148438,
"logps/rejected": -218.70889282226562,
"loss": 0.5859,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -0.8309043645858765,
"rewards/margins": 0.49856728315353394,
"rewards/rejected": -1.3294715881347656,
"step": 350
},
{
"epoch": 0.2454301418893008,
"grad_norm": 22.893234913534002,
"learning_rate": 8.393939393939393e-07,
"logits/chosen": 0.062410831451416016,
"logits/rejected": 0.13367854058742523,
"logps/chosen": -194.7364959716797,
"logps/rejected": -215.7635040283203,
"loss": 0.5739,
"rewards/accuracies": 0.698437511920929,
"rewards/chosen": -0.7721937894821167,
"rewards/margins": 0.49258309602737427,
"rewards/rejected": -1.2647769451141357,
"step": 360
},
{
"epoch": 0.2522476458306702,
"grad_norm": 22.23618919017924,
"learning_rate": 8.318181818181817e-07,
"logits/chosen": 0.04097752273082733,
"logits/rejected": 0.11259806156158447,
"logps/chosen": -193.11825561523438,
"logps/rejected": -220.61949157714844,
"loss": 0.5378,
"rewards/accuracies": 0.7234375476837158,
"rewards/chosen": -0.7389846444129944,
"rewards/margins": 0.5330405831336975,
"rewards/rejected": -1.272025227546692,
"step": 370
},
{
"epoch": 0.2590651497720397,
"grad_norm": 21.834653461400006,
"learning_rate": 8.242424242424241e-07,
"logits/chosen": 0.05887192115187645,
"logits/rejected": 0.1328059434890747,
"logps/chosen": -202.1223602294922,
"logps/rejected": -221.07273864746094,
"loss": 0.5508,
"rewards/accuracies": 0.7296874523162842,
"rewards/chosen": -0.900775671005249,
"rewards/margins": 0.587921142578125,
"rewards/rejected": -1.488696813583374,
"step": 380
},
{
"epoch": 0.2658826537134092,
"grad_norm": 23.992274309591316,
"learning_rate": 8.166666666666666e-07,
"logits/chosen": 0.03438958153128624,
"logits/rejected": 0.12133367359638214,
"logps/chosen": -202.35487365722656,
"logps/rejected": -227.4732208251953,
"loss": 0.5618,
"rewards/accuracies": 0.715624988079071,
"rewards/chosen": -0.9070041179656982,
"rewards/margins": 0.6916414499282837,
"rewards/rejected": -1.598645567893982,
"step": 390
},
{
"epoch": 0.2727001576547786,
"grad_norm": 22.0860813112572,
"learning_rate": 8.09090909090909e-07,
"logits/chosen": 0.08829227089881897,
"logits/rejected": 0.1640704870223999,
"logps/chosen": -205.60989379882812,
"logps/rejected": -224.852294921875,
"loss": 0.5494,
"rewards/accuracies": 0.721875011920929,
"rewards/chosen": -0.9454193115234375,
"rewards/margins": 0.6362107396125793,
"rewards/rejected": -1.5816301107406616,
"step": 400
},
{
"epoch": 0.2795176615961481,
"grad_norm": 23.160379012581956,
"learning_rate": 8.015151515151514e-07,
"logits/chosen": 0.06336803734302521,
"logits/rejected": 0.13648778200149536,
"logps/chosen": -205.41561889648438,
"logps/rejected": -228.48057556152344,
"loss": 0.5462,
"rewards/accuracies": 0.7343750596046448,
"rewards/chosen": -0.9269916415214539,
"rewards/margins": 0.6442463994026184,
"rewards/rejected": -1.5712381601333618,
"step": 410
},
{
"epoch": 0.2863351655375176,
"grad_norm": 22.408557079555997,
"learning_rate": 7.939393939393939e-07,
"logits/chosen": 0.08505380898714066,
"logits/rejected": 0.18310996890068054,
"logps/chosen": -213.0338897705078,
"logps/rejected": -235.76596069335938,
"loss": 0.5532,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -1.0196318626403809,
"rewards/margins": 0.5743885040283203,
"rewards/rejected": -1.5940203666687012,
"step": 420
},
{
"epoch": 0.293152669478887,
"grad_norm": 25.16391126214167,
"learning_rate": 7.863636363636363e-07,
"logits/chosen": 0.10937841981649399,
"logits/rejected": 0.16763341426849365,
"logps/chosen": -207.480224609375,
"logps/rejected": -234.79544067382812,
"loss": 0.5562,
"rewards/accuracies": 0.703125,
"rewards/chosen": -0.9945791363716125,
"rewards/margins": 0.7221311330795288,
"rewards/rejected": -1.7167102098464966,
"step": 430
},
{
"epoch": 0.2999701734202565,
"grad_norm": 29.733679778009286,
"learning_rate": 7.787878787878787e-07,
"logits/chosen": 0.09990985691547394,
"logits/rejected": 0.19938966631889343,
"logps/chosen": -207.3507843017578,
"logps/rejected": -230.52630615234375,
"loss": 0.5384,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.9231570959091187,
"rewards/margins": 0.6046810746192932,
"rewards/rejected": -1.527838110923767,
"step": 440
},
{
"epoch": 0.306787677361626,
"grad_norm": 19.72755564195352,
"learning_rate": 7.712121212121212e-07,
"logits/chosen": 0.14025147259235382,
"logits/rejected": 0.1931421309709549,
"logps/chosen": -211.64739990234375,
"logps/rejected": -234.59742736816406,
"loss": 0.539,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.9828760623931885,
"rewards/margins": 0.6804162263870239,
"rewards/rejected": -1.6632922887802124,
"step": 450
},
{
"epoch": 0.3136051813029954,
"grad_norm": 23.978166468246467,
"learning_rate": 7.636363636363636e-07,
"logits/chosen": 0.0902441218495369,
"logits/rejected": 0.18330176174640656,
"logps/chosen": -212.70648193359375,
"logps/rejected": -237.41574096679688,
"loss": 0.5309,
"rewards/accuracies": 0.7468750476837158,
"rewards/chosen": -1.0690429210662842,
"rewards/margins": 0.6713231801986694,
"rewards/rejected": -1.740365982055664,
"step": 460
},
{
"epoch": 0.3204226852443649,
"grad_norm": 26.207432691612087,
"learning_rate": 7.56060606060606e-07,
"logits/chosen": 0.11341211199760437,
"logits/rejected": 0.1867765188217163,
"logps/chosen": -197.91021728515625,
"logps/rejected": -221.50146484375,
"loss": 0.5343,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.9720097780227661,
"rewards/margins": 0.6199135780334473,
"rewards/rejected": -1.5919233560562134,
"step": 470
},
{
"epoch": 0.3272401891857344,
"grad_norm": 21.846418443949897,
"learning_rate": 7.484848484848485e-07,
"logits/chosen": 0.11702318489551544,
"logits/rejected": 0.203329399228096,
"logps/chosen": -201.898193359375,
"logps/rejected": -221.0704803466797,
"loss": 0.5479,
"rewards/accuracies": 0.7437500357627869,
"rewards/chosen": -0.9688056111335754,
"rewards/margins": 0.7794600129127502,
"rewards/rejected": -1.7482655048370361,
"step": 480
},
{
"epoch": 0.3340576931271038,
"grad_norm": 23.690713241273283,
"learning_rate": 7.409090909090909e-07,
"logits/chosen": 0.10431469976902008,
"logits/rejected": 0.20410987734794617,
"logps/chosen": -212.93824768066406,
"logps/rejected": -240.54246520996094,
"loss": 0.5022,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.9758983850479126,
"rewards/margins": 0.8021982312202454,
"rewards/rejected": -1.7780965566635132,
"step": 490
},
{
"epoch": 0.3408751970684733,
"grad_norm": 23.75624324955974,
"learning_rate": 7.333333333333332e-07,
"logits/chosen": 0.0590752549469471,
"logits/rejected": 0.15686756372451782,
"logps/chosen": -204.53347778320312,
"logps/rejected": -232.50863647460938,
"loss": 0.4879,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.9903222322463989,
"rewards/margins": 0.8048741817474365,
"rewards/rejected": -1.7951964139938354,
"step": 500
},
{
"epoch": 0.3476927010098428,
"grad_norm": 24.670166077373448,
"learning_rate": 7.257575757575756e-07,
"logits/chosen": 0.04646120220422745,
"logits/rejected": 0.1389884203672409,
"logps/chosen": -216.92431640625,
"logps/rejected": -242.18240356445312,
"loss": 0.5303,
"rewards/accuracies": 0.7312500476837158,
"rewards/chosen": -1.1649525165557861,
"rewards/margins": 0.7863146066665649,
"rewards/rejected": -1.9512672424316406,
"step": 510
},
{
"epoch": 0.3545102049512122,
"grad_norm": 23.363016624074675,
"learning_rate": 7.181818181818181e-07,
"logits/chosen": -0.012276587076485157,
"logits/rejected": 0.07009466737508774,
"logps/chosen": -205.48934936523438,
"logps/rejected": -232.42538452148438,
"loss": 0.5015,
"rewards/accuracies": 0.7671874761581421,
"rewards/chosen": -1.0472114086151123,
"rewards/margins": 0.788646399974823,
"rewards/rejected": -1.83585786819458,
"step": 520
},
{
"epoch": 0.3613277088925817,
"grad_norm": 25.288611630037565,
"learning_rate": 7.106060606060605e-07,
"logits/chosen": -0.012474373914301395,
"logits/rejected": 0.05413222685456276,
"logps/chosen": -208.1520538330078,
"logps/rejected": -241.48385620117188,
"loss": 0.5252,
"rewards/accuracies": 0.746874988079071,
"rewards/chosen": -1.1651169061660767,
"rewards/margins": 1.0477391481399536,
"rewards/rejected": -2.2128560543060303,
"step": 530
},
{
"epoch": 0.3681452128339512,
"grad_norm": 20.178014539662176,
"learning_rate": 7.030303030303029e-07,
"logits/chosen": -0.04357679560780525,
"logits/rejected": 0.0625062957406044,
"logps/chosen": -210.49520874023438,
"logps/rejected": -241.66493225097656,
"loss": 0.4807,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -1.1151983737945557,
"rewards/margins": 0.9497561454772949,
"rewards/rejected": -2.0649547576904297,
"step": 540
},
{
"epoch": 0.3749627167753206,
"grad_norm": 21.944400967201336,
"learning_rate": 6.954545454545454e-07,
"logits/chosen": 0.004060904495418072,
"logits/rejected": 0.08822981268167496,
"logps/chosen": -207.4363250732422,
"logps/rejected": -234.25550842285156,
"loss": 0.501,
"rewards/accuracies": 0.765625,
"rewards/chosen": -1.2111350297927856,
"rewards/margins": 0.8183422684669495,
"rewards/rejected": -2.02947735786438,
"step": 550
},
{
"epoch": 0.3817802207166901,
"grad_norm": 26.850310787153724,
"learning_rate": 6.878787878787878e-07,
"logits/chosen": -0.017100585624575615,
"logits/rejected": 0.07575605064630508,
"logps/chosen": -213.30137634277344,
"logps/rejected": -243.35186767578125,
"loss": 0.4941,
"rewards/accuracies": 0.7593750357627869,
"rewards/chosen": -1.3406263589859009,
"rewards/margins": 0.9297415018081665,
"rewards/rejected": -2.2703678607940674,
"step": 560
},
{
"epoch": 0.3885977246580596,
"grad_norm": 25.982278946370197,
"learning_rate": 6.803030303030302e-07,
"logits/chosen": 0.019733965396881104,
"logits/rejected": 0.07434576749801636,
"logps/chosen": -220.54368591308594,
"logps/rejected": -245.9012451171875,
"loss": 0.5234,
"rewards/accuracies": 0.7406250238418579,
"rewards/chosen": -1.3345239162445068,
"rewards/margins": 0.882716953754425,
"rewards/rejected": -2.217240810394287,
"step": 570
},
{
"epoch": 0.395415228599429,
"grad_norm": 30.324292712841515,
"learning_rate": 6.727272727272727e-07,
"logits/chosen": -0.07648750394582748,
"logits/rejected": 0.013701358810067177,
"logps/chosen": -215.10986328125,
"logps/rejected": -241.52174377441406,
"loss": 0.4888,
"rewards/accuracies": 0.785937488079071,
"rewards/chosen": -1.2312657833099365,
"rewards/margins": 0.8834339380264282,
"rewards/rejected": -2.1146998405456543,
"step": 580
},
{
"epoch": 0.4022327325407985,
"grad_norm": 30.267248385209463,
"learning_rate": 6.651515151515151e-07,
"logits/chosen": -0.08248546719551086,
"logits/rejected": 0.012668056413531303,
"logps/chosen": -217.48597717285156,
"logps/rejected": -248.1539306640625,
"loss": 0.4718,
"rewards/accuracies": 0.770312488079071,
"rewards/chosen": -1.3211320638656616,
"rewards/margins": 1.034571886062622,
"rewards/rejected": -2.3557040691375732,
"step": 590
},
{
"epoch": 0.409050236482168,
"grad_norm": 33.7927835163272,
"learning_rate": 6.575757575757575e-07,
"logits/chosen": -0.10192164778709412,
"logits/rejected": -0.015221836045384407,
"logps/chosen": -217.5685577392578,
"logps/rejected": -255.0595703125,
"loss": 0.503,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.4013419151306152,
"rewards/margins": 1.1709883213043213,
"rewards/rejected": -2.5723299980163574,
"step": 600
},
{
"epoch": 0.4158677404235374,
"grad_norm": 26.97423161614756,
"learning_rate": 6.5e-07,
"logits/chosen": -0.09173352271318436,
"logits/rejected": -0.004754798021167517,
"logps/chosen": -214.8068084716797,
"logps/rejected": -247.6568145751953,
"loss": 0.4939,
"rewards/accuracies": 0.753125011920929,
"rewards/chosen": -1.280412197113037,
"rewards/margins": 1.0068598985671997,
"rewards/rejected": -2.2872722148895264,
"step": 610
},
{
"epoch": 0.4226852443649069,
"grad_norm": 23.353042033965973,
"learning_rate": 6.424242424242424e-07,
"logits/chosen": -0.08759984374046326,
"logits/rejected": -0.006561200134456158,
"logps/chosen": -209.99142456054688,
"logps/rejected": -240.59873962402344,
"loss": 0.5032,
"rewards/accuracies": 0.7640625238418579,
"rewards/chosen": -1.327502965927124,
"rewards/margins": 0.8928775191307068,
"rewards/rejected": -2.2203807830810547,
"step": 620
},
{
"epoch": 0.4295027483062764,
"grad_norm": 19.622319808534836,
"learning_rate": 6.348484848484848e-07,
"logits/chosen": -0.072292260825634,
"logits/rejected": 0.022778620943427086,
"logps/chosen": -208.22052001953125,
"logps/rejected": -243.78250122070312,
"loss": 0.4516,
"rewards/accuracies": 0.7953125238418579,
"rewards/chosen": -1.2082226276397705,
"rewards/margins": 1.1600842475891113,
"rewards/rejected": -2.368306875228882,
"step": 630
},
{
"epoch": 0.4363202522476458,
"grad_norm": 27.54001459206905,
"learning_rate": 6.272727272727273e-07,
"logits/chosen": -0.07350125908851624,
"logits/rejected": 0.0216854028403759,
"logps/chosen": -214.49398803710938,
"logps/rejected": -245.44357299804688,
"loss": 0.481,
"rewards/accuracies": 0.7671875357627869,
"rewards/chosen": -1.3779951333999634,
"rewards/margins": 1.1042989492416382,
"rewards/rejected": -2.4822940826416016,
"step": 640
},
{
"epoch": 0.4431377561890153,
"grad_norm": 22.005523605032646,
"learning_rate": 6.196969696969697e-07,
"logits/chosen": -0.08112622797489166,
"logits/rejected": 0.016506649553775787,
"logps/chosen": -207.86119079589844,
"logps/rejected": -243.9912109375,
"loss": 0.4669,
"rewards/accuracies": 0.776562511920929,
"rewards/chosen": -1.312855839729309,
"rewards/margins": 1.1270496845245361,
"rewards/rejected": -2.4399054050445557,
"step": 650
},
{
"epoch": 0.4499552601303848,
"grad_norm": 22.786232365472497,
"learning_rate": 6.12121212121212e-07,
"logits/chosen": -0.014887440949678421,
"logits/rejected": 0.07256890088319778,
"logps/chosen": -214.85989379882812,
"logps/rejected": -242.47958374023438,
"loss": 0.5139,
"rewards/accuracies": 0.7328125238418579,
"rewards/chosen": -1.3719854354858398,
"rewards/margins": 0.9033377766609192,
"rewards/rejected": -2.2753231525421143,
"step": 660
},
{
"epoch": 0.4567727640717542,
"grad_norm": 22.124194779218676,
"learning_rate": 6.045454545454545e-07,
"logits/chosen": -0.06873725354671478,
"logits/rejected": 0.01612996682524681,
"logps/chosen": -214.2239990234375,
"logps/rejected": -245.57972717285156,
"loss": 0.4842,
"rewards/accuracies": 0.7515625357627869,
"rewards/chosen": -1.374406099319458,
"rewards/margins": 1.0592212677001953,
"rewards/rejected": -2.4336276054382324,
"step": 670
},
{
"epoch": 0.4635902680131237,
"grad_norm": 25.208264673147376,
"learning_rate": 5.969696969696969e-07,
"logits/chosen": -0.08258620649576187,
"logits/rejected": 0.019038595259189606,
"logps/chosen": -212.36058044433594,
"logps/rejected": -247.52285766601562,
"loss": 0.4707,
"rewards/accuracies": 0.7796874642372131,
"rewards/chosen": -1.355445384979248,
"rewards/margins": 1.0607233047485352,
"rewards/rejected": -2.416168689727783,
"step": 680
},
{
"epoch": 0.4704077719544932,
"grad_norm": 23.492565569306137,
"learning_rate": 5.893939393939393e-07,
"logits/chosen": -0.048173777759075165,
"logits/rejected": 0.05654379725456238,
"logps/chosen": -203.7908935546875,
"logps/rejected": -235.70806884765625,
"loss": 0.463,
"rewards/accuracies": 0.7640625238418579,
"rewards/chosen": -1.3086433410644531,
"rewards/margins": 1.073492407798767,
"rewards/rejected": -2.3821358680725098,
"step": 690
},
{
"epoch": 0.4772252758958626,
"grad_norm": 23.525844296596986,
"learning_rate": 5.818181818181818e-07,
"logits/chosen": -0.06435231864452362,
"logits/rejected": 0.036002036184072495,
"logps/chosen": -215.8461456298828,
"logps/rejected": -253.51507568359375,
"loss": 0.4424,
"rewards/accuracies": 0.7937500476837158,
"rewards/chosen": -1.304274320602417,
"rewards/margins": 1.1241943836212158,
"rewards/rejected": -2.428468704223633,
"step": 700
},
{
"epoch": 0.4840427798372321,
"grad_norm": 24.839200055423007,
"learning_rate": 5.742424242424242e-07,
"logits/chosen": -0.1140328049659729,
"logits/rejected": -0.0182164516299963,
"logps/chosen": -219.0977020263672,
"logps/rejected": -245.15292358398438,
"loss": 0.4667,
"rewards/accuracies": 0.7796875238418579,
"rewards/chosen": -1.4004441499710083,
"rewards/margins": 1.1077167987823486,
"rewards/rejected": -2.5081608295440674,
"step": 710
},
{
"epoch": 0.4908602837786016,
"grad_norm": 21.081539741219093,
"learning_rate": 5.666666666666666e-07,
"logits/chosen": -0.12187488377094269,
"logits/rejected": -0.013158449903130531,
"logps/chosen": -209.8751220703125,
"logps/rejected": -246.4436492919922,
"loss": 0.4639,
"rewards/accuracies": 0.784375011920929,
"rewards/chosen": -1.422644853591919,
"rewards/margins": 1.1023151874542236,
"rewards/rejected": -2.5249602794647217,
"step": 720
},
{
"epoch": 0.497677787719971,
"grad_norm": 24.782821613085822,
"learning_rate": 5.590909090909091e-07,
"logits/chosen": -0.06877341866493225,
"logits/rejected": 0.035405777394771576,
"logps/chosen": -216.97332763671875,
"logps/rejected": -256.8809814453125,
"loss": 0.4337,
"rewards/accuracies": 0.8078125715255737,
"rewards/chosen": -1.4306436777114868,
"rewards/margins": 1.1720441579818726,
"rewards/rejected": -2.6026878356933594,
"step": 730
},
{
"epoch": 0.5044952916613404,
"grad_norm": 24.588552599891635,
"learning_rate": 5.515151515151515e-07,
"logits/chosen": -0.07467488199472427,
"logits/rejected": 0.017803018912672997,
"logps/chosen": -230.01266479492188,
"logps/rejected": -262.2432861328125,
"loss": 0.4669,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.632286548614502,
"rewards/margins": 1.2193667888641357,
"rewards/rejected": -2.8516533374786377,
"step": 740
},
{
"epoch": 0.5113127956027099,
"grad_norm": 20.475854944533108,
"learning_rate": 5.439393939393939e-07,
"logits/chosen": -0.07244399189949036,
"logits/rejected": 0.0048616742715239525,
"logps/chosen": -218.39337158203125,
"logps/rejected": -250.11416625976562,
"loss": 0.4467,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.5096694231033325,
"rewards/margins": 1.196234107017517,
"rewards/rejected": -2.7059032917022705,
"step": 750
},
{
"epoch": 0.5181302995440794,
"grad_norm": 24.853405341139933,
"learning_rate": 5.363636363636363e-07,
"logits/chosen": -0.0735924020409584,
"logits/rejected": 0.022622695192694664,
"logps/chosen": -216.3765411376953,
"logps/rejected": -250.67161560058594,
"loss": 0.4594,
"rewards/accuracies": 0.796875,
"rewards/chosen": -1.5826576948165894,
"rewards/margins": 1.228342890739441,
"rewards/rejected": -2.8110008239746094,
"step": 760
},
{
"epoch": 0.5249478034854489,
"grad_norm": 20.713474784942502,
"learning_rate": 5.287878787878788e-07,
"logits/chosen": -0.07298550754785538,
"logits/rejected": 0.015953145921230316,
"logps/chosen": -213.88311767578125,
"logps/rejected": -254.41146850585938,
"loss": 0.4376,
"rewards/accuracies": 0.8312500715255737,
"rewards/chosen": -1.4484457969665527,
"rewards/margins": 1.3493634462356567,
"rewards/rejected": -2.797809362411499,
"step": 770
},
{
"epoch": 0.5317653074268184,
"grad_norm": 25.104952077400377,
"learning_rate": 5.212121212121212e-07,
"logits/chosen": -0.040183987468481064,
"logits/rejected": 0.03515133634209633,
"logps/chosen": -217.26239013671875,
"logps/rejected": -248.25259399414062,
"loss": 0.4695,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -1.5951099395751953,
"rewards/margins": 1.2722889184951782,
"rewards/rejected": -2.867398738861084,
"step": 780
},
{
"epoch": 0.5385828113681879,
"grad_norm": 28.29480749737075,
"learning_rate": 5.136363636363636e-07,
"logits/chosen": -0.04418431594967842,
"logits/rejected": 0.05417613312602043,
"logps/chosen": -222.26820373535156,
"logps/rejected": -257.3962707519531,
"loss": 0.4579,
"rewards/accuracies": 0.7906250357627869,
"rewards/chosen": -1.570731520652771,
"rewards/margins": 1.1469626426696777,
"rewards/rejected": -2.717694044113159,
"step": 790
},
{
"epoch": 0.5454003153095572,
"grad_norm": 23.21073310542596,
"learning_rate": 5.060606060606061e-07,
"logits/chosen": -0.07153814285993576,
"logits/rejected": 0.021596048027276993,
"logps/chosen": -217.89630126953125,
"logps/rejected": -254.9208221435547,
"loss": 0.4123,
"rewards/accuracies": 0.8250000476837158,
"rewards/chosen": -1.5101759433746338,
"rewards/margins": 1.3654392957687378,
"rewards/rejected": -2.8756155967712402,
"step": 800
},
{
"epoch": 0.5522178192509267,
"grad_norm": 21.00129246925032,
"learning_rate": 4.984848484848485e-07,
"logits/chosen": -0.06583255529403687,
"logits/rejected": 0.04746149852871895,
"logps/chosen": -214.39453125,
"logps/rejected": -253.67242431640625,
"loss": 0.4335,
"rewards/accuracies": 0.785937488079071,
"rewards/chosen": -1.6045914888381958,
"rewards/margins": 1.34579598903656,
"rewards/rejected": -2.950387477874756,
"step": 810
},
{
"epoch": 0.5590353231922962,
"grad_norm": 25.238378218924026,
"learning_rate": 4.909090909090909e-07,
"logits/chosen": -0.06921117007732391,
"logits/rejected": 0.03174077346920967,
"logps/chosen": -213.57254028320312,
"logps/rejected": -248.80799865722656,
"loss": 0.4561,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.6258023977279663,
"rewards/margins": 1.308131217956543,
"rewards/rejected": -2.933933734893799,
"step": 820
},
{
"epoch": 0.5658528271336657,
"grad_norm": 22.691646670626245,
"learning_rate": 4.833333333333333e-07,
"logits/chosen": -0.08812057971954346,
"logits/rejected": 0.028292154893279076,
"logps/chosen": -217.73495483398438,
"logps/rejected": -258.3476867675781,
"loss": 0.4395,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -1.6993637084960938,
"rewards/margins": 1.3441307544708252,
"rewards/rejected": -3.043494462966919,
"step": 830
},
{
"epoch": 0.5726703310750352,
"grad_norm": 26.78354678353827,
"learning_rate": 4.7575757575757574e-07,
"logits/chosen": -0.08428293466567993,
"logits/rejected": 0.0038617942482233047,
"logps/chosen": -228.9470977783203,
"logps/rejected": -262.1487121582031,
"loss": 0.4585,
"rewards/accuracies": 0.7874999642372131,
"rewards/chosen": -1.7086464166641235,
"rewards/margins": 1.2672871351242065,
"rewards/rejected": -2.97593355178833,
"step": 840
},
{
"epoch": 0.5794878350164047,
"grad_norm": 21.82947420474725,
"learning_rate": 4.681818181818182e-07,
"logits/chosen": -0.07127973437309265,
"logits/rejected": 0.03848648816347122,
"logps/chosen": -214.94436645507812,
"logps/rejected": -252.94378662109375,
"loss": 0.4487,
"rewards/accuracies": 0.785937488079071,
"rewards/chosen": -1.5856503248214722,
"rewards/margins": 1.2746567726135254,
"rewards/rejected": -2.860306978225708,
"step": 850
},
{
"epoch": 0.586305338957774,
"grad_norm": 25.33636086854273,
"learning_rate": 4.606060606060606e-07,
"logits/chosen": -0.0755903422832489,
"logits/rejected": 0.007846422493457794,
"logps/chosen": -229.74365234375,
"logps/rejected": -266.0998229980469,
"loss": 0.4466,
"rewards/accuracies": 0.8109375238418579,
"rewards/chosen": -1.6116007566452026,
"rewards/margins": 1.3451875448226929,
"rewards/rejected": -2.9567883014678955,
"step": 860
},
{
"epoch": 0.5931228428991435,
"grad_norm": 23.430791371783034,
"learning_rate": 4.53030303030303e-07,
"logits/chosen": -0.08032269030809402,
"logits/rejected": 0.013129429891705513,
"logps/chosen": -211.66123962402344,
"logps/rejected": -249.7532501220703,
"loss": 0.4168,
"rewards/accuracies": 0.8296875357627869,
"rewards/chosen": -1.4253365993499756,
"rewards/margins": 1.4885873794555664,
"rewards/rejected": -2.913924217224121,
"step": 870
},
{
"epoch": 0.599940346840513,
"grad_norm": 25.6792868240925,
"learning_rate": 4.4545454545454544e-07,
"logits/chosen": -0.10410317778587341,
"logits/rejected": -0.02125217206776142,
"logps/chosen": -211.6175994873047,
"logps/rejected": -250.02481079101562,
"loss": 0.4419,
"rewards/accuracies": 0.776562511920929,
"rewards/chosen": -1.4521470069885254,
"rewards/margins": 1.4656074047088623,
"rewards/rejected": -2.9177544116973877,
"step": 880
},
{
"epoch": 0.6067578507818825,
"grad_norm": 24.336985726252383,
"learning_rate": 4.3787878787878784e-07,
"logits/chosen": -0.1017291247844696,
"logits/rejected": -0.006951052229851484,
"logps/chosen": -207.91339111328125,
"logps/rejected": -244.61329650878906,
"loss": 0.4338,
"rewards/accuracies": 0.7953125238418579,
"rewards/chosen": -1.3715894222259521,
"rewards/margins": 1.3720262050628662,
"rewards/rejected": -2.7436156272888184,
"step": 890
},
{
"epoch": 0.613575354723252,
"grad_norm": 28.717619953557637,
"learning_rate": 4.303030303030303e-07,
"logits/chosen": -0.12957513332366943,
"logits/rejected": -0.04137944057583809,
"logps/chosen": -212.1148223876953,
"logps/rejected": -250.58657836914062,
"loss": 0.4428,
"rewards/accuracies": 0.8062500357627869,
"rewards/chosen": -1.5294814109802246,
"rewards/margins": 1.3217490911483765,
"rewards/rejected": -2.8512306213378906,
"step": 900
},
{
"epoch": 0.6203928586646215,
"grad_norm": 24.64484800585095,
"learning_rate": 4.227272727272727e-07,
"logits/chosen": -0.1173202320933342,
"logits/rejected": -0.026215719059109688,
"logps/chosen": -219.25180053710938,
"logps/rejected": -253.3756561279297,
"loss": 0.4444,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -1.5156140327453613,
"rewards/margins": 1.2724617719650269,
"rewards/rejected": -2.7880756855010986,
"step": 910
},
{
"epoch": 0.6272103626059908,
"grad_norm": 21.577769353125333,
"learning_rate": 4.1515151515151513e-07,
"logits/chosen": -0.09302366524934769,
"logits/rejected": -0.017984673380851746,
"logps/chosen": -225.01498413085938,
"logps/rejected": -260.4620666503906,
"loss": 0.4204,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.5388312339782715,
"rewards/margins": 1.3805811405181885,
"rewards/rejected": -2.91941237449646,
"step": 920
},
{
"epoch": 0.6340278665473603,
"grad_norm": 21.851016424850197,
"learning_rate": 4.075757575757576e-07,
"logits/chosen": -0.12906233966350555,
"logits/rejected": -0.042714815586805344,
"logps/chosen": -224.21875,
"logps/rejected": -261.9176940917969,
"loss": 0.4137,
"rewards/accuracies": 0.8343750238418579,
"rewards/chosen": -1.6001354455947876,
"rewards/margins": 1.3610618114471436,
"rewards/rejected": -2.9611973762512207,
"step": 930
},
{
"epoch": 0.6408453704887298,
"grad_norm": 28.106139021916903,
"learning_rate": 4e-07,
"logits/chosen": -0.17769670486450195,
"logits/rejected": -0.07489217072725296,
"logps/chosen": -215.14065551757812,
"logps/rejected": -260.8359680175781,
"loss": 0.4011,
"rewards/accuracies": 0.8265625238418579,
"rewards/chosen": -1.5202898979187012,
"rewards/margins": 1.60263991355896,
"rewards/rejected": -3.122929811477661,
"step": 940
},
{
"epoch": 0.6476628744300993,
"grad_norm": 26.85542161068887,
"learning_rate": 3.924242424242424e-07,
"logits/chosen": -0.14841988682746887,
"logits/rejected": -0.06951985508203506,
"logps/chosen": -207.8277130126953,
"logps/rejected": -250.3575439453125,
"loss": 0.4295,
"rewards/accuracies": 0.796875,
"rewards/chosen": -1.5621274709701538,
"rewards/margins": 1.4103821516036987,
"rewards/rejected": -2.9725096225738525,
"step": 950
},
{
"epoch": 0.6544803783714688,
"grad_norm": 30.007894138540276,
"learning_rate": 3.8484848484848483e-07,
"logits/chosen": -0.17307066917419434,
"logits/rejected": -0.07763750106096268,
"logps/chosen": -220.78807067871094,
"logps/rejected": -258.3966369628906,
"loss": 0.4118,
"rewards/accuracies": 0.7984375357627869,
"rewards/chosen": -1.657617449760437,
"rewards/margins": 1.4771149158477783,
"rewards/rejected": -3.134732723236084,
"step": 960
},
{
"epoch": 0.6612978823128383,
"grad_norm": 29.07536633448252,
"learning_rate": 3.7727272727272723e-07,
"logits/chosen": -0.15869039297103882,
"logits/rejected": -0.07338032126426697,
"logps/chosen": -211.30084228515625,
"logps/rejected": -255.395751953125,
"loss": 0.4357,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -1.6164686679840088,
"rewards/margins": 1.3737252950668335,
"rewards/rejected": -2.9901938438415527,
"step": 970
},
{
"epoch": 0.6681153862542076,
"grad_norm": 28.531124567566387,
"learning_rate": 3.696969696969697e-07,
"logits/chosen": -0.13030777871608734,
"logits/rejected": -0.046453818678855896,
"logps/chosen": -216.94631958007812,
"logps/rejected": -267.0611572265625,
"loss": 0.4098,
"rewards/accuracies": 0.8203125596046448,
"rewards/chosen": -1.650040626525879,
"rewards/margins": 1.7019206285476685,
"rewards/rejected": -3.351961135864258,
"step": 980
},
{
"epoch": 0.6749328901955771,
"grad_norm": 24.268025730195088,
"learning_rate": 3.6212121212121213e-07,
"logits/chosen": -0.15982282161712646,
"logits/rejected": -0.0780140832066536,
"logps/chosen": -215.56704711914062,
"logps/rejected": -260.79736328125,
"loss": 0.397,
"rewards/accuracies": 0.828125,
"rewards/chosen": -1.6652125120162964,
"rewards/margins": 1.7730036973953247,
"rewards/rejected": -3.438216209411621,
"step": 990
},
{
"epoch": 0.6817503941369466,
"grad_norm": 24.14951347534636,
"learning_rate": 3.545454545454545e-07,
"logits/chosen": -0.16655105352401733,
"logits/rejected": -0.06034347787499428,
"logps/chosen": -214.05752563476562,
"logps/rejected": -254.98703002929688,
"loss": 0.4264,
"rewards/accuracies": 0.8031250238418579,
"rewards/chosen": -1.8080203533172607,
"rewards/margins": 1.4334427118301392,
"rewards/rejected": -3.2414629459381104,
"step": 1000
},
{
"epoch": 0.6885678980783161,
"grad_norm": 28.566305760445395,
"learning_rate": 3.46969696969697e-07,
"logits/chosen": -0.19522453844547272,
"logits/rejected": -0.08679309487342834,
"logps/chosen": -227.52655029296875,
"logps/rejected": -274.0975646972656,
"loss": 0.3803,
"rewards/accuracies": 0.8328125476837158,
"rewards/chosen": -1.7928123474121094,
"rewards/margins": 1.5590283870697021,
"rewards/rejected": -3.3518409729003906,
"step": 1010
},
{
"epoch": 0.6953854020196856,
"grad_norm": 47.85219425522013,
"learning_rate": 3.393939393939394e-07,
"logits/chosen": -0.1567739099264145,
"logits/rejected": -0.06111231818795204,
"logps/chosen": -228.47779846191406,
"logps/rejected": -274.33428955078125,
"loss": 0.4016,
"rewards/accuracies": 0.8218750357627869,
"rewards/chosen": -1.8795576095581055,
"rewards/margins": 1.6040751934051514,
"rewards/rejected": -3.4836325645446777,
"step": 1020
},
{
"epoch": 0.7022029059610551,
"grad_norm": 27.480180578165882,
"learning_rate": 3.318181818181818e-07,
"logits/chosen": -0.14685329794883728,
"logits/rejected": -0.04708694666624069,
"logps/chosen": -229.77883911132812,
"logps/rejected": -269.2246398925781,
"loss": 0.434,
"rewards/accuracies": 0.8156250715255737,
"rewards/chosen": -1.9993985891342163,
"rewards/margins": 1.541892647743225,
"rewards/rejected": -3.5412912368774414,
"step": 1030
},
{
"epoch": 0.7090204099024244,
"grad_norm": 28.437076758644576,
"learning_rate": 3.242424242424242e-07,
"logits/chosen": -0.11219906061887741,
"logits/rejected": 0.005472442135214806,
"logps/chosen": -223.41433715820312,
"logps/rejected": -263.777099609375,
"loss": 0.4147,
"rewards/accuracies": 0.8203125,
"rewards/chosen": -1.9533647298812866,
"rewards/margins": 1.5760959386825562,
"rewards/rejected": -3.5294606685638428,
"step": 1040
},
{
"epoch": 0.7158379138437939,
"grad_norm": 24.780730794756213,
"learning_rate": 3.166666666666666e-07,
"logits/chosen": -0.10113102942705154,
"logits/rejected": -0.0247341338545084,
"logps/chosen": -224.37001037597656,
"logps/rejected": -268.121337890625,
"loss": 0.4301,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.9565200805664062,
"rewards/margins": 1.5767617225646973,
"rewards/rejected": -3.5332815647125244,
"step": 1050
},
{
"epoch": 0.7226554177851634,
"grad_norm": 24.01977768454246,
"learning_rate": 3.0909090909090907e-07,
"logits/chosen": -0.16017019748687744,
"logits/rejected": -0.038362376391887665,
"logps/chosen": -221.4423828125,
"logps/rejected": -269.96832275390625,
"loss": 0.3642,
"rewards/accuracies": 0.8578125238418579,
"rewards/chosen": -1.7076702117919922,
"rewards/margins": 1.7623119354248047,
"rewards/rejected": -3.469982147216797,
"step": 1060
},
{
"epoch": 0.7294729217265329,
"grad_norm": 23.607998459504426,
"learning_rate": 3.015151515151515e-07,
"logits/chosen": -0.10493813455104828,
"logits/rejected": -0.015611783601343632,
"logps/chosen": -225.48379516601562,
"logps/rejected": -265.3271789550781,
"loss": 0.3929,
"rewards/accuracies": 0.8359375,
"rewards/chosen": -1.8727744817733765,
"rewards/margins": 1.5633747577667236,
"rewards/rejected": -3.4361491203308105,
"step": 1070
},
{
"epoch": 0.7362904256679024,
"grad_norm": 27.095218567106365,
"learning_rate": 2.939393939393939e-07,
"logits/chosen": -0.17973893880844116,
"logits/rejected": -0.0618341825902462,
"logps/chosen": -216.41917419433594,
"logps/rejected": -259.7709655761719,
"loss": 0.4123,
"rewards/accuracies": 0.8218750357627869,
"rewards/chosen": -1.995064377784729,
"rewards/margins": 1.4902169704437256,
"rewards/rejected": -3.485281467437744,
"step": 1080
},
{
"epoch": 0.7431079296092719,
"grad_norm": 25.027731308583423,
"learning_rate": 2.8636363636363637e-07,
"logits/chosen": -0.15540730953216553,
"logits/rejected": -0.03777886554598808,
"logps/chosen": -228.9147186279297,
"logps/rejected": -270.1064758300781,
"loss": 0.3986,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -2.030261993408203,
"rewards/margins": 1.6086986064910889,
"rewards/rejected": -3.638960599899292,
"step": 1090
},
{
"epoch": 0.7499254335506412,
"grad_norm": 32.01602660729836,
"learning_rate": 2.787878787878788e-07,
"logits/chosen": -0.11517558991909027,
"logits/rejected": -0.03462303429841995,
"logps/chosen": -224.9439697265625,
"logps/rejected": -259.7923889160156,
"loss": 0.4219,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.9747884273529053,
"rewards/margins": 1.3838858604431152,
"rewards/rejected": -3.3586747646331787,
"step": 1100
},
{
"epoch": 0.7567429374920107,
"grad_norm": 31.517350763988723,
"learning_rate": 2.712121212121212e-07,
"logits/chosen": -0.12141910940408707,
"logits/rejected": -0.01053343154489994,
"logps/chosen": -220.55685424804688,
"logps/rejected": -259.1381530761719,
"loss": 0.4178,
"rewards/accuracies": 0.8109375238418579,
"rewards/chosen": -1.887320637702942,
"rewards/margins": 1.4266173839569092,
"rewards/rejected": -3.3139376640319824,
"step": 1110
},
{
"epoch": 0.7635604414333802,
"grad_norm": 26.127671898389973,
"learning_rate": 2.636363636363636e-07,
"logits/chosen": -0.08841400593519211,
"logits/rejected": -0.0024342993274331093,
"logps/chosen": -226.21588134765625,
"logps/rejected": -265.4579772949219,
"loss": 0.4114,
"rewards/accuracies": 0.8328125476837158,
"rewards/chosen": -1.9491462707519531,
"rewards/margins": 1.4079630374908447,
"rewards/rejected": -3.357109308242798,
"step": 1120
},
{
"epoch": 0.7703779453747497,
"grad_norm": 27.236714171841353,
"learning_rate": 2.56060606060606e-07,
"logits/chosen": -0.09334474056959152,
"logits/rejected": 0.006139551289379597,
"logps/chosen": -221.79632568359375,
"logps/rejected": -259.6671447753906,
"loss": 0.4146,
"rewards/accuracies": 0.8125000596046448,
"rewards/chosen": -1.8307971954345703,
"rewards/margins": 1.6026943922042847,
"rewards/rejected": -3.4334912300109863,
"step": 1130
},
{
"epoch": 0.7771954493161192,
"grad_norm": 23.66366648859467,
"learning_rate": 2.4848484848484846e-07,
"logits/chosen": -0.11060778051614761,
"logits/rejected": -0.020527532324194908,
"logps/chosen": -217.34706115722656,
"logps/rejected": -258.9071350097656,
"loss": 0.4164,
"rewards/accuracies": 0.8093750476837158,
"rewards/chosen": -1.7729765176773071,
"rewards/margins": 1.513333797454834,
"rewards/rejected": -3.2863101959228516,
"step": 1140
},
{
"epoch": 0.7840129532574887,
"grad_norm": 25.2882907135726,
"learning_rate": 2.409090909090909e-07,
"logits/chosen": -0.10374785959720612,
"logits/rejected": -0.018276991322636604,
"logps/chosen": -236.36245727539062,
"logps/rejected": -274.1772766113281,
"loss": 0.4001,
"rewards/accuracies": 0.828125,
"rewards/chosen": -1.8536320924758911,
"rewards/margins": 1.4811946153640747,
"rewards/rejected": -3.3348264694213867,
"step": 1150
},
{
"epoch": 0.790830457198858,
"grad_norm": 26.790523464833836,
"learning_rate": 2.3333333333333333e-07,
"logits/chosen": -0.13400709629058838,
"logits/rejected": -0.03729373216629028,
"logps/chosen": -220.23544311523438,
"logps/rejected": -264.3974304199219,
"loss": 0.3736,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -1.7529268264770508,
"rewards/margins": 1.6899100542068481,
"rewards/rejected": -3.4428367614746094,
"step": 1160
},
{
"epoch": 0.7976479611402275,
"grad_norm": 19.702274370294905,
"learning_rate": 2.2575757575757576e-07,
"logits/chosen": -0.17225009202957153,
"logits/rejected": -0.06531926244497299,
"logps/chosen": -217.31951904296875,
"logps/rejected": -262.68475341796875,
"loss": 0.3893,
"rewards/accuracies": 0.8296875357627869,
"rewards/chosen": -1.773924469947815,
"rewards/margins": 1.5378942489624023,
"rewards/rejected": -3.311818838119507,
"step": 1170
},
{
"epoch": 0.804465465081597,
"grad_norm": 23.425791089677848,
"learning_rate": 2.1818181818181815e-07,
"logits/chosen": -0.1890868991613388,
"logits/rejected": -0.09723814576864243,
"logps/chosen": -237.30435180664062,
"logps/rejected": -280.53155517578125,
"loss": 0.4049,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -1.9801266193389893,
"rewards/margins": 1.6612507104873657,
"rewards/rejected": -3.6413774490356445,
"step": 1180
},
{
"epoch": 0.8112829690229665,
"grad_norm": 24.870512384975044,
"learning_rate": 2.106060606060606e-07,
"logits/chosen": -0.15967592597007751,
"logits/rejected": -0.04680642858147621,
"logps/chosen": -226.22872924804688,
"logps/rejected": -259.76031494140625,
"loss": 0.4036,
"rewards/accuracies": 0.8218750357627869,
"rewards/chosen": -1.8575615882873535,
"rewards/margins": 1.4922484159469604,
"rewards/rejected": -3.3498101234436035,
"step": 1190
},
{
"epoch": 0.818100472964336,
"grad_norm": 24.491520644166894,
"learning_rate": 2.0303030303030303e-07,
"logits/chosen": -0.1476404070854187,
"logits/rejected": -0.056609444320201874,
"logps/chosen": -218.4747314453125,
"logps/rejected": -266.4804382324219,
"loss": 0.3761,
"rewards/accuracies": 0.839062511920929,
"rewards/chosen": -1.8029754161834717,
"rewards/margins": 1.7781448364257812,
"rewards/rejected": -3.581120014190674,
"step": 1200
},
{
"epoch": 0.8249179769057055,
"grad_norm": 25.43453775835723,
"learning_rate": 1.9545454545454545e-07,
"logits/chosen": -0.19677500426769257,
"logits/rejected": -0.1171552985906601,
"logps/chosen": -229.79348754882812,
"logps/rejected": -272.9264831542969,
"loss": 0.3763,
"rewards/accuracies": 0.8343750238418579,
"rewards/chosen": -1.9184120893478394,
"rewards/margins": 1.567906379699707,
"rewards/rejected": -3.486318349838257,
"step": 1210
},
{
"epoch": 0.8317354808470748,
"grad_norm": 28.16577093909573,
"learning_rate": 1.8787878787878785e-07,
"logits/chosen": -0.2017778754234314,
"logits/rejected": -0.09277643263339996,
"logps/chosen": -231.24696350097656,
"logps/rejected": -278.23516845703125,
"loss": 0.3691,
"rewards/accuracies": 0.854687511920929,
"rewards/chosen": -1.9549689292907715,
"rewards/margins": 1.7604336738586426,
"rewards/rejected": -3.715402603149414,
"step": 1220
},
{
"epoch": 0.8385529847884443,
"grad_norm": 21.84200034001996,
"learning_rate": 1.803030303030303e-07,
"logits/chosen": -0.21481652557849884,
"logits/rejected": -0.09980207681655884,
"logps/chosen": -233.2796630859375,
"logps/rejected": -282.91644287109375,
"loss": 0.3711,
"rewards/accuracies": 0.8609374761581421,
"rewards/chosen": -2.0337018966674805,
"rewards/margins": 1.896054983139038,
"rewards/rejected": -3.9297573566436768,
"step": 1230
},
{
"epoch": 0.8453704887298138,
"grad_norm": 22.694435946938032,
"learning_rate": 1.7272727272727272e-07,
"logits/chosen": -0.15690943598747253,
"logits/rejected": -0.06275378912687302,
"logps/chosen": -227.27455139160156,
"logps/rejected": -274.53790283203125,
"loss": 0.3913,
"rewards/accuracies": 0.846875011920929,
"rewards/chosen": -1.9616923332214355,
"rewards/margins": 1.6192635297775269,
"rewards/rejected": -3.580955982208252,
"step": 1240
},
{
"epoch": 0.8521879926711833,
"grad_norm": 26.094403500936398,
"learning_rate": 1.6515151515151515e-07,
"logits/chosen": -0.19916404783725739,
"logits/rejected": -0.08219482004642487,
"logps/chosen": -227.81277465820312,
"logps/rejected": -274.002685546875,
"loss": 0.3713,
"rewards/accuracies": 0.84375,
"rewards/chosen": -2.1085541248321533,
"rewards/margins": 1.6733149290084839,
"rewards/rejected": -3.7818689346313477,
"step": 1250
},
{
"epoch": 0.8590054966125528,
"grad_norm": 28.285289787916426,
"learning_rate": 1.5757575757575757e-07,
"logits/chosen": -0.20249146223068237,
"logits/rejected": -0.11430975049734116,
"logps/chosen": -224.1591796875,
"logps/rejected": -267.00555419921875,
"loss": 0.355,
"rewards/accuracies": 0.8531250357627869,
"rewards/chosen": -2.015260934829712,
"rewards/margins": 1.7478997707366943,
"rewards/rejected": -3.7631607055664062,
"step": 1260
},
{
"epoch": 0.8658230005539223,
"grad_norm": 28.62125408987568,
"learning_rate": 1.5e-07,
"logits/chosen": -0.23050257563591003,
"logits/rejected": -0.13684435188770294,
"logps/chosen": -226.26783752441406,
"logps/rejected": -269.7716979980469,
"loss": 0.3747,
"rewards/accuracies": 0.840624988079071,
"rewards/chosen": -1.9981603622436523,
"rewards/margins": 1.6271231174468994,
"rewards/rejected": -3.6252834796905518,
"step": 1270
},
{
"epoch": 0.8726405044952916,
"grad_norm": 28.239549647986106,
"learning_rate": 1.4242424242424242e-07,
"logits/chosen": -0.21466362476348877,
"logits/rejected": -0.11737212538719177,
"logps/chosen": -225.7066650390625,
"logps/rejected": -269.4527893066406,
"loss": 0.3437,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -2.0757107734680176,
"rewards/margins": 1.816016674041748,
"rewards/rejected": -3.8917269706726074,
"step": 1280
},
{
"epoch": 0.8794580084366611,
"grad_norm": 31.488998628971228,
"learning_rate": 1.3484848484848484e-07,
"logits/chosen": -0.2509796619415283,
"logits/rejected": -0.15716706216335297,
"logps/chosen": -227.94644165039062,
"logps/rejected": -273.2456970214844,
"loss": 0.3713,
"rewards/accuracies": 0.8312500715255737,
"rewards/chosen": -2.063997268676758,
"rewards/margins": 1.7757971286773682,
"rewards/rejected": -3.839794635772705,
"step": 1290
},
{
"epoch": 0.8862755123780306,
"grad_norm": 30.3120062725158,
"learning_rate": 1.2727272727272726e-07,
"logits/chosen": -0.22213181853294373,
"logits/rejected": -0.10703583061695099,
"logps/chosen": -230.4038848876953,
"logps/rejected": -277.89312744140625,
"loss": 0.3841,
"rewards/accuracies": 0.8343750238418579,
"rewards/chosen": -2.1271657943725586,
"rewards/margins": 1.8834253549575806,
"rewards/rejected": -4.01059103012085,
"step": 1300
},
{
"epoch": 0.8930930163194001,
"grad_norm": 20.44931799862506,
"learning_rate": 1.196969696969697e-07,
"logits/chosen": -0.19441619515419006,
"logits/rejected": -0.09727019816637039,
"logps/chosen": -221.7393035888672,
"logps/rejected": -264.74078369140625,
"loss": 0.3754,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -2.1274771690368652,
"rewards/margins": 1.6642124652862549,
"rewards/rejected": -3.791689395904541,
"step": 1310
},
{
"epoch": 0.8999105202607696,
"grad_norm": 29.43599693108452,
"learning_rate": 1.1212121212121211e-07,
"logits/chosen": -0.20746608078479767,
"logits/rejected": -0.10938036441802979,
"logps/chosen": -230.49411010742188,
"logps/rejected": -274.149169921875,
"loss": 0.4089,
"rewards/accuracies": 0.8140624761581421,
"rewards/chosen": -2.111417293548584,
"rewards/margins": 1.6498727798461914,
"rewards/rejected": -3.7612900733947754,
"step": 1320
},
{
"epoch": 0.9067280242021389,
"grad_norm": 32.13529497456355,
"learning_rate": 1.0454545454545454e-07,
"logits/chosen": -0.19128209352493286,
"logits/rejected": -0.09950501471757889,
"logps/chosen": -232.70655822753906,
"logps/rejected": -277.17218017578125,
"loss": 0.4075,
"rewards/accuracies": 0.8171875476837158,
"rewards/chosen": -2.2094011306762695,
"rewards/margins": 1.6824061870574951,
"rewards/rejected": -3.8918075561523438,
"step": 1330
},
{
"epoch": 0.9135455281435084,
"grad_norm": 24.495028721716935,
"learning_rate": 9.696969696969696e-08,
"logits/chosen": -0.2125847339630127,
"logits/rejected": -0.13061244785785675,
"logps/chosen": -238.3480682373047,
"logps/rejected": -278.2753601074219,
"loss": 0.3927,
"rewards/accuracies": 0.828125,
"rewards/chosen": -1.9954111576080322,
"rewards/margins": 1.6787245273590088,
"rewards/rejected": -3.674135208129883,
"step": 1340
},
{
"epoch": 0.9203630320848779,
"grad_norm": 18.494054905792385,
"learning_rate": 8.93939393939394e-08,
"logits/chosen": -0.22991694509983063,
"logits/rejected": -0.1375647336244583,
"logps/chosen": -232.1606903076172,
"logps/rejected": -277.8919677734375,
"loss": 0.3625,
"rewards/accuracies": 0.846875011920929,
"rewards/chosen": -1.9369205236434937,
"rewards/margins": 1.840739369392395,
"rewards/rejected": -3.7776598930358887,
"step": 1350
},
{
"epoch": 0.9271805360262474,
"grad_norm": 23.0952598889843,
"learning_rate": 8.181818181818182e-08,
"logits/chosen": -0.2471987009048462,
"logits/rejected": -0.15926134586334229,
"logps/chosen": -226.1959991455078,
"logps/rejected": -268.29144287109375,
"loss": 0.3859,
"rewards/accuracies": 0.828125,
"rewards/chosen": -1.9533106088638306,
"rewards/margins": 1.6697773933410645,
"rewards/rejected": -3.6230881214141846,
"step": 1360
},
{
"epoch": 0.9339980399676169,
"grad_norm": 31.57121472361453,
"learning_rate": 7.424242424242424e-08,
"logits/chosen": -0.21243150532245636,
"logits/rejected": -0.10772553086280823,
"logps/chosen": -229.5428924560547,
"logps/rejected": -267.2238464355469,
"loss": 0.41,
"rewards/accuracies": 0.8093750476837158,
"rewards/chosen": -2.041501998901367,
"rewards/margins": 1.6097761392593384,
"rewards/rejected": -3.651278257369995,
"step": 1370
},
{
"epoch": 0.9408155439089864,
"grad_norm": 24.660859941372323,
"learning_rate": 6.666666666666667e-08,
"logits/chosen": -0.2305571436882019,
"logits/rejected": -0.13984078168869019,
"logps/chosen": -235.77882385253906,
"logps/rejected": -275.8688659667969,
"loss": 0.3632,
"rewards/accuracies": 0.8515625,
"rewards/chosen": -1.9864195585250854,
"rewards/margins": 1.6625468730926514,
"rewards/rejected": -3.6489667892456055,
"step": 1380
},
{
"epoch": 0.9476330478503557,
"grad_norm": 21.4536097610763,
"learning_rate": 5.9090909090909085e-08,
"logits/chosen": -0.2120400369167328,
"logits/rejected": -0.12410594522953033,
"logps/chosen": -222.5196075439453,
"logps/rejected": -271.9889831542969,
"loss": 0.3386,
"rewards/accuracies": 0.8750000596046448,
"rewards/chosen": -1.9523049592971802,
"rewards/margins": 1.976583480834961,
"rewards/rejected": -3.9288883209228516,
"step": 1390
},
{
"epoch": 0.9544505517917252,
"grad_norm": 24.15173339303778,
"learning_rate": 5.151515151515151e-08,
"logits/chosen": -0.2382027804851532,
"logits/rejected": -0.15090808272361755,
"logps/chosen": -222.01071166992188,
"logps/rejected": -271.6986389160156,
"loss": 0.3614,
"rewards/accuracies": 0.8453124761581421,
"rewards/chosen": -2.006227731704712,
"rewards/margins": 1.7343711853027344,
"rewards/rejected": -3.740598678588867,
"step": 1400
},
{
"epoch": 0.9612680557330947,
"grad_norm": 24.550521505283424,
"learning_rate": 4.393939393939393e-08,
"logits/chosen": -0.17552296817302704,
"logits/rejected": -0.08083190023899078,
"logps/chosen": -228.05532836914062,
"logps/rejected": -272.7567443847656,
"loss": 0.4114,
"rewards/accuracies": 0.8218749761581421,
"rewards/chosen": -2.1707372665405273,
"rewards/margins": 1.5515494346618652,
"rewards/rejected": -3.7222867012023926,
"step": 1410
},
{
"epoch": 0.9680855596744642,
"grad_norm": 22.32856821239117,
"learning_rate": 3.636363636363636e-08,
"logits/chosen": -0.22575151920318604,
"logits/rejected": -0.12315725535154343,
"logps/chosen": -227.44094848632812,
"logps/rejected": -275.5225830078125,
"loss": 0.3557,
"rewards/accuracies": 0.846875011920929,
"rewards/chosen": -1.9577521085739136,
"rewards/margins": 1.8534328937530518,
"rewards/rejected": -3.811184883117676,
"step": 1420
},
{
"epoch": 0.9749030636158337,
"grad_norm": 29.28483492412063,
"learning_rate": 2.8787878787878787e-08,
"logits/chosen": -0.20699195563793182,
"logits/rejected": -0.11024124175310135,
"logps/chosen": -226.45755004882812,
"logps/rejected": -270.3711242675781,
"loss": 0.3872,
"rewards/accuracies": 0.8343750238418579,
"rewards/chosen": -2.00704288482666,
"rewards/margins": 1.5971609354019165,
"rewards/rejected": -3.604203939437866,
"step": 1430
},
{
"epoch": 0.9817205675572032,
"grad_norm": 25.452990040575536,
"learning_rate": 2.1212121212121214e-08,
"logits/chosen": -0.2176055610179901,
"logits/rejected": -0.09939160197973251,
"logps/chosen": -219.8345947265625,
"logps/rejected": -268.5072937011719,
"loss": 0.3557,
"rewards/accuracies": 0.859375,
"rewards/chosen": -2.036515235900879,
"rewards/margins": 1.7241967916488647,
"rewards/rejected": -3.760712146759033,
"step": 1440
},
{
"epoch": 0.9885380714985725,
"grad_norm": 21.433383519856523,
"learning_rate": 1.3636363636363635e-08,
"logits/chosen": -0.19624559581279755,
"logits/rejected": -0.10556697845458984,
"logps/chosen": -229.1326141357422,
"logps/rejected": -277.2064514160156,
"loss": 0.3724,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.0669782161712646,
"rewards/margins": 1.8541796207427979,
"rewards/rejected": -3.9211580753326416,
"step": 1450
},
{
"epoch": 0.995355575439942,
"grad_norm": 24.137434407958857,
"learning_rate": 6.06060606060606e-09,
"logits/chosen": -0.19412463903427124,
"logits/rejected": -0.08993732929229736,
"logps/chosen": -227.69815063476562,
"logps/rejected": -274.43865966796875,
"loss": 0.3698,
"rewards/accuracies": 0.848437488079071,
"rewards/chosen": -1.9875869750976562,
"rewards/margins": 1.7459800243377686,
"rewards/rejected": -3.7335667610168457,
"step": 1460
},
{
"epoch": 1.0,
"step": 1467,
"total_flos": 161507922542592.0,
"train_loss": 0.48762158089620206,
"train_runtime": 14310.7821,
"train_samples_per_second": 6.56,
"train_steps_per_second": 0.103
}
],
"logging_steps": 10,
"max_steps": 1467,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 161507922542592.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}