diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,3569 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9959925193694897, + "eval_steps": 100, + "global_step": 233, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.004274646005877639, + "grad_norm": 3.4727758395385346, + "learning_rate": 4.166666666666666e-08, + "logits/chosen": -0.9238853454589844, + "logits/rejected": -0.9009266495704651, + "logps/chosen": -211.83998107910156, + "logps/rejected": -194.95265197753906, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.008549292011755277, + "grad_norm": 3.5000648062483686, + "learning_rate": 8.333333333333333e-08, + "logits/chosen": -0.9474210739135742, + "logits/rejected": -0.9417086243629456, + "logps/chosen": -160.0943603515625, + "logps/rejected": -163.26644897460938, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2 + }, + { + "epoch": 0.012823938017632914, + "grad_norm": 3.8566721368935113, + "learning_rate": 1.25e-07, + "logits/chosen": -0.8552289009094238, + "logits/rejected": -0.9027292132377625, + "logps/chosen": -197.13523864746094, + "logps/rejected": -191.77366638183594, + "loss": 0.6932, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.00209163804538548, + "rewards/margins": 0.0021166829392313957, + "rewards/rejected": -2.5045330403372645e-05, + "step": 3 + }, + { + "epoch": 0.017098584023510555, + "grad_norm": 3.527297888533762, + "learning_rate": 1.6666666666666665e-07, + "logits/chosen": -0.9195055961608887, + "logits/rejected": -0.9506024122238159, + "logps/chosen": -175.96563720703125, + "logps/rejected": -177.187255859375, + "loss": 0.6931, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0005994887324050069, + "rewards/margins": 0.001228818204253912, + "rewards/rejected": -0.0006293297046795487, + "step": 4 + }, + { + "epoch": 0.02137323002938819, + "grad_norm": 3.274108961837268, + "learning_rate": 2.0833333333333333e-07, + "logits/chosen": -0.9131849408149719, + "logits/rejected": -0.9851359128952026, + "logps/chosen": -196.52279663085938, + "logps/rejected": -209.4899444580078, + "loss": 0.6936, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0014956831000745296, + "rewards/margins": -0.002281556138768792, + "rewards/rejected": 0.0007858729222789407, + "step": 5 + }, + { + "epoch": 0.02564787603526583, + "grad_norm": 3.4643988401861643, + "learning_rate": 2.5e-07, + "logits/chosen": -1.0323811769485474, + "logits/rejected": -1.0281962156295776, + "logps/chosen": -175.13864135742188, + "logps/rejected": -171.71237182617188, + "loss": 0.6934, + "rewards/accuracies": 0.34375, + "rewards/chosen": -0.0016992997843772173, + "rewards/margins": -0.0023347530514001846, + "rewards/rejected": 0.000635453499853611, + "step": 6 + }, + { + "epoch": 0.029922522041143467, + "grad_norm": 3.753822101296772, + "learning_rate": 2.916666666666667e-07, + "logits/chosen": -0.8140788078308105, + "logits/rejected": -0.8268399238586426, + "logps/chosen": -204.0390625, + "logps/rejected": -210.50558471679688, + "loss": 0.6926, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.0002044451393885538, + "rewards/margins": 0.0006307458970695734, + "rewards/rejected": -0.0004263008013367653, + "step": 7 + }, + { + "epoch": 0.03419716804702111, + "grad_norm": 3.1848827253835568, + "learning_rate": 3.333333333333333e-07, + "logits/chosen": -0.9922436475753784, + "logits/rejected": -0.9979274868965149, + "logps/chosen": -192.83494567871094, + "logps/rejected": -200.88128662109375, + "loss": 0.693, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.002161582000553608, + "rewards/margins": 0.0022183258552104235, + "rewards/rejected": -5.674359272234142e-05, + "step": 8 + }, + { + "epoch": 0.03847181405289874, + "grad_norm": 3.7147220039656568, + "learning_rate": 3.75e-07, + "logits/chosen": -0.9252921342849731, + "logits/rejected": -0.9685516357421875, + "logps/chosen": -175.70448303222656, + "logps/rejected": -180.89736938476562, + "loss": 0.6927, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.0024666767567396164, + "rewards/margins": 0.002756566507741809, + "rewards/rejected": -0.0002898902166634798, + "step": 9 + }, + { + "epoch": 0.04274646005877638, + "grad_norm": 3.553251668230928, + "learning_rate": 4.1666666666666667e-07, + "logits/chosen": -0.9595114588737488, + "logits/rejected": -0.9833444356918335, + "logps/chosen": -208.72735595703125, + "logps/rejected": -214.8730926513672, + "loss": 0.693, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.002427927916869521, + "rewards/margins": 0.001882559503428638, + "rewards/rejected": 0.0005453681806102395, + "step": 10 + }, + { + "epoch": 0.04702110606465402, + "grad_norm": 3.4673888891096216, + "learning_rate": 4.5833333333333327e-07, + "logits/chosen": -0.986074686050415, + "logits/rejected": -0.9903304576873779, + "logps/chosen": -138.227783203125, + "logps/rejected": -137.13824462890625, + "loss": 0.6931, + "rewards/accuracies": 0.40625, + "rewards/chosen": 0.0006541174370795488, + "rewards/margins": -0.0011124282609671354, + "rewards/rejected": 0.0017665456980466843, + "step": 11 + }, + { + "epoch": 0.05129575207053166, + "grad_norm": 3.412261478741586, + "learning_rate": 5e-07, + "logits/chosen": -0.896647036075592, + "logits/rejected": -0.9640191197395325, + "logps/chosen": -157.36685180664062, + "logps/rejected": -180.9624481201172, + "loss": 0.693, + "rewards/accuracies": 0.53125, + "rewards/chosen": 0.0007026732200756669, + "rewards/margins": 0.0007272702641785145, + "rewards/rejected": -2.459682582411915e-05, + "step": 12 + }, + { + "epoch": 0.055570398076409296, + "grad_norm": 3.5015942981464434, + "learning_rate": 5.416666666666666e-07, + "logits/chosen": -0.8603953123092651, + "logits/rejected": -0.8457555770874023, + "logps/chosen": -190.04727172851562, + "logps/rejected": -196.87872314453125, + "loss": 0.6918, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.0017774368170648813, + "rewards/margins": 0.002889451337978244, + "rewards/rejected": -0.0011120146373286843, + "step": 13 + }, + { + "epoch": 0.059845044082286934, + "grad_norm": 3.3564122983724283, + "learning_rate": 5.833333333333334e-07, + "logits/chosen": -0.9946928024291992, + "logits/rejected": -0.9674972295761108, + "logps/chosen": -173.98526000976562, + "logps/rejected": -167.90187072753906, + "loss": 0.6926, + "rewards/accuracies": 0.53125, + "rewards/chosen": 0.00402639526873827, + "rewards/margins": 0.00235772505402565, + "rewards/rejected": 0.0016686702147126198, + "step": 14 + }, + { + "epoch": 0.06411969008816458, + "grad_norm": 3.6183547057085903, + "learning_rate": 6.249999999999999e-07, + "logits/chosen": -0.9302492737770081, + "logits/rejected": -0.9131873846054077, + "logps/chosen": -172.501953125, + "logps/rejected": -165.2920684814453, + "loss": 0.6914, + "rewards/accuracies": 0.53125, + "rewards/chosen": 0.008232050575315952, + "rewards/margins": 0.0026909802109003067, + "rewards/rejected": 0.005541070364415646, + "step": 15 + }, + { + "epoch": 0.06839433609404222, + "grad_norm": 4.094403587057344, + "learning_rate": 6.666666666666666e-07, + "logits/chosen": -0.8987658023834229, + "logits/rejected": -0.918194591999054, + "logps/chosen": -182.8192901611328, + "logps/rejected": -188.6702423095703, + "loss": 0.6907, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0033055683597922325, + "rewards/margins": 0.006322154775261879, + "rewards/rejected": -0.003016585949808359, + "step": 16 + }, + { + "epoch": 0.07266898209991986, + "grad_norm": 3.7556102735295602, + "learning_rate": 7.083333333333334e-07, + "logits/chosen": -0.7985554933547974, + "logits/rejected": -0.8355307579040527, + "logps/chosen": -218.515869140625, + "logps/rejected": -218.05130004882812, + "loss": 0.6907, + "rewards/accuracies": 0.53125, + "rewards/chosen": 0.002777060493826866, + "rewards/margins": 0.0028633405454456806, + "rewards/rejected": -8.627981878817081e-05, + "step": 17 + }, + { + "epoch": 0.07694362810579748, + "grad_norm": 3.7267439469140835, + "learning_rate": 7.5e-07, + "logits/chosen": -1.0510368347167969, + "logits/rejected": -1.1025066375732422, + "logps/chosen": -187.49362182617188, + "logps/rejected": -213.5237274169922, + "loss": 0.6893, + "rewards/accuracies": 0.46875, + "rewards/chosen": 0.003624723292887211, + "rewards/margins": 0.0034801624715328217, + "rewards/rejected": 0.00014456117060035467, + "step": 18 + }, + { + "epoch": 0.08121827411167512, + "grad_norm": 3.6435544044761947, + "learning_rate": 7.916666666666666e-07, + "logits/chosen": -1.0699188709259033, + "logits/rejected": -1.0673398971557617, + "logps/chosen": -185.699951171875, + "logps/rejected": -175.41836547851562, + "loss": 0.6889, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.01173433754593134, + "rewards/margins": 0.009952141903340816, + "rewards/rejected": 0.001782197505235672, + "step": 19 + }, + { + "epoch": 0.08549292011755276, + "grad_norm": 3.56212142993403, + "learning_rate": 8.333333333333333e-07, + "logits/chosen": -0.9693958759307861, + "logits/rejected": -1.0447947978973389, + "logps/chosen": -160.5248260498047, + "logps/rejected": -177.9250030517578, + "loss": 0.6883, + "rewards/accuracies": 0.53125, + "rewards/chosen": 0.0071954806335270405, + "rewards/margins": 0.007437723223119974, + "rewards/rejected": -0.00024224258959293365, + "step": 20 + }, + { + "epoch": 0.0897675661234304, + "grad_norm": 3.599911110818667, + "learning_rate": 8.75e-07, + "logits/chosen": -0.8949970006942749, + "logits/rejected": -0.9538885951042175, + "logps/chosen": -155.24188232421875, + "logps/rejected": -175.83969116210938, + "loss": 0.6875, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0026197312399744987, + "rewards/margins": 0.013713551685214043, + "rewards/rejected": -0.016333281993865967, + "step": 21 + }, + { + "epoch": 0.09404221212930804, + "grad_norm": 3.809589692220953, + "learning_rate": 9.166666666666665e-07, + "logits/chosen": -0.8426035642623901, + "logits/rejected": -0.909124493598938, + "logps/chosen": -162.82546997070312, + "logps/rejected": -168.50677490234375, + "loss": 0.6853, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.014116955921053886, + "rewards/margins": 0.008742437697947025, + "rewards/rejected": 0.005374519154429436, + "step": 22 + }, + { + "epoch": 0.09831685813518568, + "grad_norm": 3.8481134168387325, + "learning_rate": 9.583333333333334e-07, + "logits/chosen": -0.9963463544845581, + "logits/rejected": -1.030158281326294, + "logps/chosen": -212.16732788085938, + "logps/rejected": -226.55050659179688, + "loss": 0.6831, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.005188916344195604, + "rewards/margins": 0.037282973527908325, + "rewards/rejected": -0.04247189313173294, + "step": 23 + }, + { + "epoch": 0.10259150414106331, + "grad_norm": 4.017050664188984, + "learning_rate": 1e-06, + "logits/chosen": -0.9375415444374084, + "logits/rejected": -0.9786323308944702, + "logps/chosen": -161.66697692871094, + "logps/rejected": -171.43328857421875, + "loss": 0.6793, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.0014034155756235123, + "rewards/margins": 0.029357939958572388, + "rewards/rejected": -0.03076135367155075, + "step": 24 + }, + { + "epoch": 0.10686615014694095, + "grad_norm": 4.219585279560121, + "learning_rate": 9.999435142363483e-07, + "logits/chosen": -0.9440574049949646, + "logits/rejected": -0.97591233253479, + "logps/chosen": -142.18214416503906, + "logps/rejected": -145.74217224121094, + "loss": 0.6753, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.006380043923854828, + "rewards/margins": 0.02844325453042984, + "rewards/rejected": -0.022063210606575012, + "step": 25 + }, + { + "epoch": 0.11114079615281859, + "grad_norm": 4.465336404650454, + "learning_rate": 9.997740697079592e-07, + "logits/chosen": -0.907569408416748, + "logits/rejected": -0.9431344270706177, + "logps/chosen": -186.16468811035156, + "logps/rejected": -188.70187377929688, + "loss": 0.6698, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.03334157541394234, + "rewards/margins": 0.04588525742292404, + "rewards/rejected": -0.07922682911157608, + "step": 26 + }, + { + "epoch": 0.11541544215869623, + "grad_norm": 4.035688150172887, + "learning_rate": 9.994917046996472e-07, + "logits/chosen": -0.9081155061721802, + "logits/rejected": -0.9375332593917847, + "logps/chosen": -196.47586059570312, + "logps/rejected": -210.2967071533203, + "loss": 0.6745, + "rewards/accuracies": 0.46875, + "rewards/chosen": -0.06558644771575928, + "rewards/margins": 0.024703964591026306, + "rewards/rejected": -0.09029041230678558, + "step": 27 + }, + { + "epoch": 0.11969008816457387, + "grad_norm": 4.589583975444085, + "learning_rate": 9.990964830098245e-07, + "logits/chosen": -0.9100086688995361, + "logits/rejected": -0.9473557472229004, + "logps/chosen": -183.28317260742188, + "logps/rejected": -191.90957641601562, + "loss": 0.6642, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.0952601209282875, + "rewards/margins": 0.06084320694208145, + "rewards/rejected": -0.15610332787036896, + "step": 28 + }, + { + "epoch": 0.12396473417045151, + "grad_norm": 4.479468138978008, + "learning_rate": 9.985884939360872e-07, + "logits/chosen": -1.1165940761566162, + "logits/rejected": -1.1295504570007324, + "logps/chosen": -166.12542724609375, + "logps/rejected": -165.3243408203125, + "loss": 0.6578, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.11186902225017548, + "rewards/margins": 0.050340794026851654, + "rewards/rejected": -0.16220980882644653, + "step": 29 + }, + { + "epoch": 0.12823938017632916, + "grad_norm": 4.703342289738615, + "learning_rate": 9.97967852255038e-07, + "logits/chosen": -0.9528751969337463, + "logits/rejected": -0.9631531238555908, + "logps/chosen": -254.89320373535156, + "logps/rejected": -258.4338073730469, + "loss": 0.6583, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.22431208193302155, + "rewards/margins": 0.050895195454359055, + "rewards/rejected": -0.2752072513103485, + "step": 30 + }, + { + "epoch": 0.13251402618220678, + "grad_norm": 4.869138630164683, + "learning_rate": 9.972346981963546e-07, + "logits/chosen": -1.059159755706787, + "logits/rejected": -1.1036772727966309, + "logps/chosen": -245.163330078125, + "logps/rejected": -268.007568359375, + "loss": 0.6513, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.28532153367996216, + "rewards/margins": 0.1269759237766266, + "rewards/rejected": -0.41229742765426636, + "step": 31 + }, + { + "epoch": 0.13678867218808444, + "grad_norm": 4.841549329203085, + "learning_rate": 9.96389197411104e-07, + "logits/chosen": -0.9731124043464661, + "logits/rejected": -1.025037169456482, + "logps/chosen": -209.7532958984375, + "logps/rejected": -234.02642822265625, + "loss": 0.6414, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.20661108195781708, + "rewards/margins": 0.18127745389938354, + "rewards/rejected": -0.38788852095603943, + "step": 32 + }, + { + "epoch": 0.14106331819396206, + "grad_norm": 4.527086707272363, + "learning_rate": 9.954315409343168e-07, + "logits/chosen": -0.9516006708145142, + "logits/rejected": -1.0085594654083252, + "logps/chosen": -233.74896240234375, + "logps/rejected": -257.0697937011719, + "loss": 0.6391, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.3886350095272064, + "rewards/margins": 0.19288921356201172, + "rewards/rejected": -0.5815242528915405, + "step": 33 + }, + { + "epoch": 0.14533796419983971, + "grad_norm": 5.675688535211087, + "learning_rate": 9.943619451418224e-07, + "logits/chosen": -0.9171434640884399, + "logits/rejected": -0.9520907998085022, + "logps/chosen": -232.1197967529297, + "logps/rejected": -252.1339874267578, + "loss": 0.6138, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.4231939911842346, + "rewards/margins": 0.2075667530298233, + "rewards/rejected": -0.6307607293128967, + "step": 34 + }, + { + "epoch": 0.14961261020571734, + "grad_norm": 4.555434497600014, + "learning_rate": 9.931806517013612e-07, + "logits/chosen": -0.9599072933197021, + "logits/rejected": -0.9873026013374329, + "logps/chosen": -235.87911987304688, + "logps/rejected": -277.68585205078125, + "loss": 0.6249, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.5353493690490723, + "rewards/margins": 0.26459625363349915, + "rewards/rejected": -0.799945592880249, + "step": 35 + }, + { + "epoch": 0.15388725621159496, + "grad_norm": 4.824318328500942, + "learning_rate": 9.918879275179817e-07, + "logits/chosen": -1.1668760776519775, + "logits/rejected": -1.1293714046478271, + "logps/chosen": -288.35406494140625, + "logps/rejected": -298.5234375, + "loss": 0.6005, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6688686609268188, + "rewards/margins": 0.22619600594043732, + "rewards/rejected": -0.8950645923614502, + "step": 36 + }, + { + "epoch": 0.15816190221747262, + "grad_norm": 4.401543973490666, + "learning_rate": 9.904840646737345e-07, + "logits/chosen": -0.9521760940551758, + "logits/rejected": -0.9997081756591797, + "logps/chosen": -282.0852355957031, + "logps/rejected": -336.65020751953125, + "loss": 0.6319, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7313822507858276, + "rewards/margins": 0.29729628562927246, + "rewards/rejected": -1.0286785364151, + "step": 37 + }, + { + "epoch": 0.16243654822335024, + "grad_norm": 5.15798330777588, + "learning_rate": 9.889693803616791e-07, + "logits/chosen": -1.0276933908462524, + "logits/rejected": -1.045649766921997, + "logps/chosen": -311.87677001953125, + "logps/rejected": -334.0548095703125, + "loss": 0.6, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.0708937644958496, + "rewards/margins": 0.2531777620315552, + "rewards/rejected": -1.3240714073181152, + "step": 38 + }, + { + "epoch": 0.1667111942292279, + "grad_norm": 4.8993868987417555, + "learning_rate": 9.873442168142157e-07, + "logits/chosen": -0.909888505935669, + "logits/rejected": -0.9343925714492798, + "logps/chosen": -254.18350219726562, + "logps/rejected": -285.18243408203125, + "loss": 0.5973, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9324233531951904, + "rewards/margins": 0.22102315723896027, + "rewards/rejected": -1.1534464359283447, + "step": 39 + }, + { + "epoch": 0.17098584023510552, + "grad_norm": 4.965329298176294, + "learning_rate": 9.856089412257604e-07, + "logits/chosen": -0.8430695533752441, + "logits/rejected": -0.8712520599365234, + "logps/chosen": -278.5356750488281, + "logps/rejected": -313.9254455566406, + "loss": 0.5892, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.097804069519043, + "rewards/margins": 0.29625624418258667, + "rewards/rejected": -1.3940601348876953, + "step": 40 + }, + { + "epoch": 0.17526048624098317, + "grad_norm": 4.964663460213464, + "learning_rate": 9.8376394566978e-07, + "logits/chosen": -0.9349880218505859, + "logits/rejected": -0.9195177555084229, + "logps/chosen": -353.0047607421875, + "logps/rejected": -375.0325927734375, + "loss": 0.5905, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4320145845413208, + "rewards/margins": 0.2865561842918396, + "rewards/rejected": -1.7185708284378052, + "step": 41 + }, + { + "epoch": 0.1795351322468608, + "grad_norm": 4.673829673061791, + "learning_rate": 9.818096470102066e-07, + "logits/chosen": -0.9460776448249817, + "logits/rejected": -1.0075451135635376, + "logps/chosen": -326.8829040527344, + "logps/rejected": -359.4068908691406, + "loss": 0.59, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3413481712341309, + "rewards/margins": 0.43105652928352356, + "rewards/rejected": -1.772404670715332, + "step": 42 + }, + { + "epoch": 0.18380977825273845, + "grad_norm": 4.756386075263894, + "learning_rate": 9.797464868072486e-07, + "logits/chosen": -0.8998066186904907, + "logits/rejected": -0.9348124265670776, + "logps/chosen": -347.86090087890625, + "logps/rejected": -441.64501953125, + "loss": 0.5674, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.69937264919281, + "rewards/margins": 0.7839919924736023, + "rewards/rejected": -2.4833645820617676, + "step": 43 + }, + { + "epoch": 0.18808442425861607, + "grad_norm": 5.666891206447458, + "learning_rate": 9.775749312176248e-07, + "logits/chosen": -0.8193731307983398, + "logits/rejected": -0.8275444507598877, + "logps/chosen": -334.3702697753906, + "logps/rejected": -402.1867370605469, + "loss": 0.592, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.655839204788208, + "rewards/margins": 0.6519087553024292, + "rewards/rejected": -2.3077480792999268, + "step": 44 + }, + { + "epoch": 0.19235907026449373, + "grad_norm": 5.611553010112512, + "learning_rate": 9.752954708892377e-07, + "logits/chosen": -0.8545299172401428, + "logits/rejected": -0.9027716517448425, + "logps/chosen": -371.7701721191406, + "logps/rejected": -439.71881103515625, + "loss": 0.5779, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.9063547849655151, + "rewards/margins": 0.5792344808578491, + "rewards/rejected": -2.4855895042419434, + "step": 45 + }, + { + "epoch": 0.19663371627037135, + "grad_norm": 5.062237682542423, + "learning_rate": 9.729086208503173e-07, + "logits/chosen": -0.9441611766815186, + "logits/rejected": -0.956858217716217, + "logps/chosen": -451.3914794921875, + "logps/rejected": -498.17999267578125, + "loss": 0.5592, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.6227638721466064, + "rewards/margins": 0.4997914731502533, + "rewards/rejected": -3.1225552558898926, + "step": 46 + }, + { + "epoch": 0.200908362276249, + "grad_norm": 5.547722907580694, + "learning_rate": 9.70414920393052e-07, + "logits/chosen": -0.8402402400970459, + "logits/rejected": -0.8305561542510986, + "logps/chosen": -410.6358642578125, + "logps/rejected": -456.8866882324219, + "loss": 0.5657, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.3196969032287598, + "rewards/margins": 0.5383195281028748, + "rewards/rejected": -2.8580164909362793, + "step": 47 + }, + { + "epoch": 0.20518300828212663, + "grad_norm": 5.843768728466239, + "learning_rate": 9.678149329517409e-07, + "logits/chosen": -0.9230031967163086, + "logits/rejected": -0.9459983706474304, + "logps/chosen": -421.91253662109375, + "logps/rejected": -464.15460205078125, + "loss": 0.5158, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.3388688564300537, + "rewards/margins": 0.5758055448532104, + "rewards/rejected": -2.9146745204925537, + "step": 48 + }, + { + "epoch": 0.20945765428800428, + "grad_norm": 6.600018288252386, + "learning_rate": 9.651092459754877e-07, + "logits/chosen": -0.7874542474746704, + "logits/rejected": -0.7807765007019043, + "logps/chosen": -553.3658447265625, + "logps/rejected": -578.9154052734375, + "loss": 0.5601, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.3889737129211426, + "rewards/margins": 0.2704327702522278, + "rewards/rejected": -3.6594066619873047, + "step": 49 + }, + { + "epoch": 0.2137323002938819, + "grad_norm": 6.555644686187637, + "learning_rate": 9.62298470795473e-07, + "logits/chosen": -0.7596021890640259, + "logits/rejected": -0.8105506896972656, + "logps/chosen": -396.783935546875, + "logps/rejected": -437.79541015625, + "loss": 0.5798, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.4116053581237793, + "rewards/margins": 0.3920546770095825, + "rewards/rejected": -2.8036601543426514, + "step": 50 + }, + { + "epoch": 0.21800694629975956, + "grad_norm": 6.744038117473701, + "learning_rate": 9.59383242486827e-07, + "logits/chosen": -0.8625648617744446, + "logits/rejected": -0.8875184059143066, + "logps/chosen": -505.1508483886719, + "logps/rejected": -608.79248046875, + "loss": 0.5389, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.0446105003356934, + "rewards/margins": 0.9900886416435242, + "rewards/rejected": -4.034698963165283, + "step": 51 + }, + { + "epoch": 0.22228159230563718, + "grad_norm": 5.565039920114929, + "learning_rate": 9.56364219725138e-07, + "logits/chosen": -0.8463042974472046, + "logits/rejected": -0.8962733745574951, + "logps/chosen": -499.99041748046875, + "logps/rejected": -634.49072265625, + "loss": 0.4915, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.071751832962036, + "rewards/margins": 1.300065279006958, + "rewards/rejected": -4.371817588806152, + "step": 52 + }, + { + "epoch": 0.22655623831151483, + "grad_norm": 9.993449327747468, + "learning_rate": 9.532420846376315e-07, + "logits/chosen": -0.7763329744338989, + "logits/rejected": -0.8177902698516846, + "logps/chosen": -433.4925842285156, + "logps/rejected": -530.1204223632812, + "loss": 0.6104, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.5958311557769775, + "rewards/margins": 0.9427847862243652, + "rewards/rejected": -3.5386157035827637, + "step": 53 + }, + { + "epoch": 0.23083088431739246, + "grad_norm": 6.914149355278302, + "learning_rate": 9.500175426490454e-07, + "logits/chosen": -0.7263307571411133, + "logits/rejected": -0.7950284481048584, + "logps/chosen": -590.033447265625, + "logps/rejected": -696.9810791015625, + "loss": 0.5291, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.79225492477417, + "rewards/margins": 0.9605345726013184, + "rewards/rejected": -4.75278902053833, + "step": 54 + }, + { + "epoch": 0.2351055303232701, + "grad_norm": 6.4385225722348425, + "learning_rate": 9.466913223222465e-07, + "logits/chosen": -0.73805832862854, + "logits/rejected": -0.8121139407157898, + "logps/chosen": -527.0991821289062, + "logps/rejected": -672.7268676757812, + "loss": 0.536, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.397312641143799, + "rewards/margins": 1.3639640808105469, + "rewards/rejected": -4.761276721954346, + "step": 55 + }, + { + "epoch": 0.23938017632914774, + "grad_norm": 7.766777578194787, + "learning_rate": 9.432641751936162e-07, + "logits/chosen": -0.8009728193283081, + "logits/rejected": -0.8259899020195007, + "logps/chosen": -421.18414306640625, + "logps/rejected": -515.0050659179688, + "loss": 0.5853, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.522728204727173, + "rewards/margins": 0.8710657954216003, + "rewards/rejected": -3.393793821334839, + "step": 56 + }, + { + "epoch": 0.2436548223350254, + "grad_norm": 6.5333907258413655, + "learning_rate": 9.397368756032444e-07, + "logits/chosen": -0.7609117031097412, + "logits/rejected": -0.7754147052764893, + "logps/chosen": -436.06427001953125, + "logps/rejected": -512.485107421875, + "loss": 0.5019, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.599217414855957, + "rewards/margins": 0.6933461427688599, + "rewards/rejected": -3.2925636768341064, + "step": 57 + }, + { + "epoch": 0.24792946834090301, + "grad_norm": 7.290942942753059, + "learning_rate": 9.36110220519976e-07, + "logits/chosen": -0.7123927474021912, + "logits/rejected": -0.7812705039978027, + "logps/chosen": -428.41351318359375, + "logps/rejected": -493.216552734375, + "loss": 0.5486, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.501229763031006, + "rewards/margins": 0.6336008906364441, + "rewards/rejected": -3.1348307132720947, + "step": 58 + }, + { + "epoch": 0.25220411434678064, + "grad_norm": 7.291539302989226, + "learning_rate": 9.323850293613379e-07, + "logits/chosen": -0.8743740916252136, + "logits/rejected": -0.8304850459098816, + "logps/chosen": -416.0941467285156, + "logps/rejected": -461.4928283691406, + "loss": 0.5248, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.469886064529419, + "rewards/margins": 0.6079959869384766, + "rewards/rejected": -3.0778818130493164, + "step": 59 + }, + { + "epoch": 0.2564787603526583, + "grad_norm": 6.444086326444115, + "learning_rate": 9.285621438083997e-07, + "logits/chosen": -0.7638828754425049, + "logits/rejected": -0.830043375492096, + "logps/chosen": -462.22723388671875, + "logps/rejected": -567.3082275390625, + "loss": 0.496, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.678903818130493, + "rewards/margins": 0.9149044752120972, + "rewards/rejected": -3.59380841255188, + "step": 60 + }, + { + "epoch": 0.26075340635853594, + "grad_norm": 6.239441558672264, + "learning_rate": 9.246424276156006e-07, + "logits/chosen": -0.7686220407485962, + "logits/rejected": -0.786496102809906, + "logps/chosen": -426.57977294921875, + "logps/rejected": -539.332763671875, + "loss": 0.4872, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.421875, + "rewards/margins": 1.0811634063720703, + "rewards/rejected": -3.5030384063720703, + "step": 61 + }, + { + "epoch": 0.26502805236441357, + "grad_norm": 6.955390195667037, + "learning_rate": 9.206267664155906e-07, + "logits/chosen": -0.8518524765968323, + "logits/rejected": -0.8896721005439758, + "logps/chosen": -490.898681640625, + "logps/rejected": -571.5889282226562, + "loss": 0.5328, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.7304701805114746, + "rewards/margins": 0.8642421364784241, + "rewards/rejected": -3.594712495803833, + "step": 62 + }, + { + "epoch": 0.2693026983702912, + "grad_norm": 6.71732914608112, + "learning_rate": 9.165160675191271e-07, + "logits/chosen": -0.7856395244598389, + "logits/rejected": -0.8273566961288452, + "logps/chosen": -406.04241943359375, + "logps/rejected": -512.580810546875, + "loss": 0.54, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.3751254081726074, + "rewards/margins": 1.008442759513855, + "rewards/rejected": -3.3835678100585938, + "step": 63 + }, + { + "epoch": 0.2735773443761689, + "grad_norm": 8.23163566342515, + "learning_rate": 9.123112597100757e-07, + "logits/chosen": -0.7550954818725586, + "logits/rejected": -0.7312250733375549, + "logps/chosen": -428.09210205078125, + "logps/rejected": -472.0053405761719, + "loss": 0.5628, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.460500717163086, + "rewards/margins": 0.6525804996490479, + "rewards/rejected": -3.1130809783935547, + "step": 64 + }, + { + "epoch": 0.2778519903820465, + "grad_norm": 6.484017150981526, + "learning_rate": 9.080132930355566e-07, + "logits/chosen": -0.7198902368545532, + "logits/rejected": -0.7333334684371948, + "logps/chosen": -447.80694580078125, + "logps/rejected": -543.547119140625, + "loss": 0.4951, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.3654160499572754, + "rewards/margins": 1.1327059268951416, + "rewards/rejected": -3.498121738433838, + "step": 65 + }, + { + "epoch": 0.2821266363879241, + "grad_norm": 7.88288469592664, + "learning_rate": 9.036231385912889e-07, + "logits/chosen": -0.787277102470398, + "logits/rejected": -0.8082758188247681, + "logps/chosen": -542.9320678710938, + "logps/rejected": -589.2691650390625, + "loss": 0.5554, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.2829174995422363, + "rewards/margins": 0.4686228036880493, + "rewards/rejected": -3.751540184020996, + "step": 66 + }, + { + "epoch": 0.28640128239380175, + "grad_norm": 6.87598900963406, + "learning_rate": 8.991417883021779e-07, + "logits/chosen": -0.7320197820663452, + "logits/rejected": -0.7914742231369019, + "logps/chosen": -322.4740295410156, + "logps/rejected": -403.00982666015625, + "loss": 0.489, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.867960810661316, + "rewards/margins": 0.7985786199569702, + "rewards/rejected": -2.666539430618286, + "step": 67 + }, + { + "epoch": 0.29067592839967943, + "grad_norm": 6.988332805689512, + "learning_rate": 8.945702546981968e-07, + "logits/chosen": -0.7299609780311584, + "logits/rejected": -0.7391811013221741, + "logps/chosen": -424.68255615234375, + "logps/rejected": -520.7315673828125, + "loss": 0.485, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.4903666973114014, + "rewards/margins": 0.9038018584251404, + "rewards/rejected": -3.3941686153411865, + "step": 68 + }, + { + "epoch": 0.29495057440555705, + "grad_norm": 7.25643319614823, + "learning_rate": 8.899095706856121e-07, + "logits/chosen": -0.8242793679237366, + "logits/rejected": -0.8567203879356384, + "logps/chosen": -416.467041015625, + "logps/rejected": -556.6646118164062, + "loss": 0.501, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.350522994995117, + "rewards/margins": 1.3620076179504395, + "rewards/rejected": -3.7125303745269775, + "step": 69 + }, + { + "epoch": 0.2992252204114347, + "grad_norm": 7.602549324125367, + "learning_rate": 8.851607893136064e-07, + "logits/chosen": -0.7457299828529358, + "logits/rejected": -0.7355296611785889, + "logps/chosen": -458.4794006347656, + "logps/rejected": -523.21484375, + "loss": 0.4974, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.8070144653320312, + "rewards/margins": 0.6739380955696106, + "rewards/rejected": -3.480952739715576, + "step": 70 + }, + { + "epoch": 0.3034998664173123, + "grad_norm": 7.842587956186825, + "learning_rate": 8.803249835363484e-07, + "logits/chosen": -0.7719243168830872, + "logits/rejected": -0.8175538778305054, + "logps/chosen": -391.3406982421875, + "logps/rejected": -472.3711242675781, + "loss": 0.5232, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.26741099357605, + "rewards/margins": 0.7575722336769104, + "rewards/rejected": -3.0249834060668945, + "step": 71 + }, + { + "epoch": 0.3077745124231899, + "grad_norm": 8.33328622593718, + "learning_rate": 8.754032459705671e-07, + "logits/chosen": -0.7375326752662659, + "logits/rejected": -0.7411423921585083, + "logps/chosen": -552.6005249023438, + "logps/rejected": -658.5523071289062, + "loss": 0.4689, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.2394614219665527, + "rewards/margins": 1.1419782638549805, + "rewards/rejected": -4.381440162658691, + "step": 72 + }, + { + "epoch": 0.3120491584290676, + "grad_norm": 8.00590533781815, + "learning_rate": 8.703966886486818e-07, + "logits/chosen": -0.7447977066040039, + "logits/rejected": -0.8021827340126038, + "logps/chosen": -528.2827758789062, + "logps/rejected": -663.02099609375, + "loss": 0.4719, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.100710868835449, + "rewards/margins": 1.4192044734954834, + "rewards/rejected": -4.519914627075195, + "step": 73 + }, + { + "epoch": 0.31632380443494523, + "grad_norm": 9.911560282455111, + "learning_rate": 8.653064427675469e-07, + "logits/chosen": -0.7718651294708252, + "logits/rejected": -0.7922145128250122, + "logps/chosen": -473.9974365234375, + "logps/rejected": -587.6644897460938, + "loss": 0.559, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.9299654960632324, + "rewards/margins": 1.1681456565856934, + "rewards/rejected": -4.098111152648926, + "step": 74 + }, + { + "epoch": 0.32059845044082286, + "grad_norm": 7.86727024374843, + "learning_rate": 8.601336584328658e-07, + "logits/chosen": -0.6917619705200195, + "logits/rejected": -0.6980517506599426, + "logps/chosen": -500.0274963378906, + "logps/rejected": -590.8597412109375, + "loss": 0.4719, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.1366689205169678, + "rewards/margins": 0.9159411191940308, + "rewards/rejected": -4.052610397338867, + "step": 75 + }, + { + "epoch": 0.3248730964467005, + "grad_norm": 9.289091160075673, + "learning_rate": 8.548795043993315e-07, + "logits/chosen": -0.7438817620277405, + "logits/rejected": -0.7294880747795105, + "logps/chosen": -521.81005859375, + "logps/rejected": -567.1892700195312, + "loss": 0.5389, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.433103322982788, + "rewards/margins": 0.4686957001686096, + "rewards/rejected": -3.901798725128174, + "step": 76 + }, + { + "epoch": 0.32914774245257816, + "grad_norm": 8.347011918030578, + "learning_rate": 8.495451678065561e-07, + "logits/chosen": -0.7081446647644043, + "logits/rejected": -0.7084572315216064, + "logps/chosen": -471.94879150390625, + "logps/rejected": -576.2655639648438, + "loss": 0.4923, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.9118268489837646, + "rewards/margins": 1.087062954902649, + "rewards/rejected": -3.9988901615142822, + "step": 77 + }, + { + "epoch": 0.3334223884584558, + "grad_norm": 9.178664809436675, + "learning_rate": 8.441318539108432e-07, + "logits/chosen": -0.672901451587677, + "logits/rejected": -0.6473367214202881, + "logps/chosen": -446.5679931640625, + "logps/rejected": -525.839599609375, + "loss": 0.4831, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.884944200515747, + "rewards/margins": 0.8302309513092041, + "rewards/rejected": -3.7151753902435303, + "step": 78 + }, + { + "epoch": 0.3376970344643334, + "grad_norm": 9.222651758068919, + "learning_rate": 8.386407858128706e-07, + "logits/chosen": -0.7438699007034302, + "logits/rejected": -0.7355214357376099, + "logps/chosen": -530.9581909179688, + "logps/rejected": -658.5861206054688, + "loss": 0.4838, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.4786720275878906, + "rewards/margins": 1.2334851026535034, + "rewards/rejected": -4.712156772613525, + "step": 79 + }, + { + "epoch": 0.34197168047021104, + "grad_norm": 8.799544386144294, + "learning_rate": 8.330732041813366e-07, + "logits/chosen": -0.5365869402885437, + "logits/rejected": -0.5661185383796692, + "logps/chosen": -488.5903015136719, + "logps/rejected": -571.9346923828125, + "loss": 0.4732, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.123514175415039, + "rewards/margins": 0.8028107285499573, + "rewards/rejected": -3.9263250827789307, + "step": 80 + }, + { + "epoch": 0.3462463264760887, + "grad_norm": 9.030244312954085, + "learning_rate": 8.274303669726426e-07, + "logits/chosen": -0.6187846660614014, + "logits/rejected": -0.6990691423416138, + "logps/chosen": -469.09161376953125, + "logps/rejected": -603.3698120117188, + "loss": 0.4698, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.157665729522705, + "rewards/margins": 1.1742490530014038, + "rewards/rejected": -4.331915378570557, + "step": 81 + }, + { + "epoch": 0.35052097248196634, + "grad_norm": 8.841699455559503, + "learning_rate": 8.217135491466636e-07, + "logits/chosen": -0.473153293132782, + "logits/rejected": -0.5449516177177429, + "logps/chosen": -491.49249267578125, + "logps/rejected": -654.7282104492188, + "loss": 0.4764, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.222175121307373, + "rewards/margins": 1.6088354587554932, + "rewards/rejected": -4.831010341644287, + "step": 82 + }, + { + "epoch": 0.35479561848784397, + "grad_norm": 11.098615894997318, + "learning_rate": 8.159240423786819e-07, + "logits/chosen": -0.6635532379150391, + "logits/rejected": -0.6708536148071289, + "logps/chosen": -529.8382568359375, + "logps/rejected": -615.276123046875, + "loss": 0.5068, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.458679437637329, + "rewards/margins": 0.8304582238197327, + "rewards/rejected": -4.289137840270996, + "step": 83 + }, + { + "epoch": 0.3590702644937216, + "grad_norm": 10.69217432485512, + "learning_rate": 8.100631547675416e-07, + "logits/chosen": -0.5764239430427551, + "logits/rejected": -0.6042333245277405, + "logps/chosen": -538.3191528320312, + "logps/rejected": -671.1778564453125, + "loss": 0.463, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.707031726837158, + "rewards/margins": 1.2638828754425049, + "rewards/rejected": -4.970914363861084, + "step": 84 + }, + { + "epoch": 0.36334491049959927, + "grad_norm": 10.389532135595351, + "learning_rate": 8.041322105400921e-07, + "logits/chosen": -0.5952804088592529, + "logits/rejected": -0.5918059349060059, + "logps/chosen": -468.830322265625, + "logps/rejected": -555.058349609375, + "loss": 0.4507, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.2899224758148193, + "rewards/margins": 0.8072735071182251, + "rewards/rejected": -4.097196102142334, + "step": 85 + }, + { + "epoch": 0.3676195565054769, + "grad_norm": 9.76247745990493, + "learning_rate": 7.981325497519891e-07, + "logits/chosen": -0.5011740922927856, + "logits/rejected": -0.5748673677444458, + "logps/chosen": -568.4143676757812, + "logps/rejected": -665.891357421875, + "loss": 0.473, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.9403202533721924, + "rewards/margins": 0.8801581859588623, + "rewards/rejected": -4.820478439331055, + "step": 86 + }, + { + "epoch": 0.3718942025113545, + "grad_norm": 9.276763138531336, + "learning_rate": 7.920655279849171e-07, + "logits/chosen": -0.6208050847053528, + "logits/rejected": -0.6661792993545532, + "logps/chosen": -454.78558349609375, + "logps/rejected": -583.1072387695312, + "loss": 0.439, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.0670785903930664, + "rewards/margins": 1.2415242195129395, + "rewards/rejected": -4.308602333068848, + "step": 87 + }, + { + "epoch": 0.37616884851723215, + "grad_norm": 8.47887809958896, + "learning_rate": 7.859325160403071e-07, + "logits/chosen": -0.5842097401618958, + "logits/rejected": -0.6111244559288025, + "logps/chosen": -513.4686279296875, + "logps/rejected": -631.78515625, + "loss": 0.4224, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.507704973220825, + "rewards/margins": 1.1260159015655518, + "rewards/rejected": -4.633721351623535, + "step": 88 + }, + { + "epoch": 0.3804434945231098, + "grad_norm": 9.042970665585285, + "learning_rate": 7.797348996296114e-07, + "logits/chosen": -0.594511091709137, + "logits/rejected": -0.5762075185775757, + "logps/chosen": -528.5706787109375, + "logps/rejected": -640.2254638671875, + "loss": 0.4195, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.5566911697387695, + "rewards/margins": 1.1777138710021973, + "rewards/rejected": -4.734404563903809, + "step": 89 + }, + { + "epoch": 0.38471814052898745, + "grad_norm": 9.591838712191619, + "learning_rate": 7.734740790612136e-07, + "logits/chosen": -0.5243846774101257, + "logits/rejected": -0.5419484376907349, + "logps/chosen": -597.5997924804688, + "logps/rejected": -721.8450927734375, + "loss": 0.4525, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.259735107421875, + "rewards/margins": 1.2303788661956787, + "rewards/rejected": -5.490115165710449, + "step": 90 + }, + { + "epoch": 0.3889927865348651, + "grad_norm": 9.197817870604572, + "learning_rate": 7.671514689240365e-07, + "logits/chosen": -0.5726766586303711, + "logits/rejected": -0.6172913312911987, + "logps/chosen": -557.262939453125, + "logps/rejected": -697.6727294921875, + "loss": 0.4701, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.7903552055358887, + "rewards/margins": 1.3101625442504883, + "rewards/rejected": -5.100517749786377, + "step": 91 + }, + { + "epoch": 0.3932674325407427, + "grad_norm": 11.581331107950847, + "learning_rate": 7.607684977679283e-07, + "logits/chosen": -0.6335964202880859, + "logits/rejected": -0.6610329747200012, + "logps/chosen": -519.31103515625, + "logps/rejected": -657.5548706054688, + "loss": 0.4294, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.4710004329681396, + "rewards/margins": 1.4494860172271729, + "rewards/rejected": -4.920486927032471, + "step": 92 + }, + { + "epoch": 0.3975420785466204, + "grad_norm": 10.631351767452296, + "learning_rate": 7.543266077808892e-07, + "logits/chosen": -0.427675724029541, + "logits/rejected": -0.45514771342277527, + "logps/chosen": -571.4942626953125, + "logps/rejected": -717.7467041015625, + "loss": 0.4636, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.948040723800659, + "rewards/margins": 1.4244039058685303, + "rewards/rejected": -5.372445106506348, + "step": 93 + }, + { + "epoch": 0.401816724552498, + "grad_norm": 12.057725080831089, + "learning_rate": 7.478272544632202e-07, + "logits/chosen": -0.5969647765159607, + "logits/rejected": -0.6751678586006165, + "logps/chosen": -643.1666870117188, + "logps/rejected": -773.894287109375, + "loss": 0.4507, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.552432537078857, + "rewards/margins": 1.3130940198898315, + "rewards/rejected": -5.8655266761779785, + "step": 94 + }, + { + "epoch": 0.40609137055837563, + "grad_norm": 10.936442473029212, + "learning_rate": 7.412719062986631e-07, + "logits/chosen": -0.4887186288833618, + "logits/rejected": -0.4894056022167206, + "logps/chosen": -555.25341796875, + "logps/rejected": -653.7760009765625, + "loss": 0.4518, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.9043784141540527, + "rewards/margins": 1.035258412361145, + "rewards/rejected": -4.939637184143066, + "step": 95 + }, + { + "epoch": 0.41036601656425326, + "grad_norm": 18.608338586356762, + "learning_rate": 7.346620444226059e-07, + "logits/chosen": -0.5932431221008301, + "logits/rejected": -0.6164640784263611, + "logps/chosen": -586.3929443359375, + "logps/rejected": -703.4130249023438, + "loss": 0.4449, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.206478118896484, + "rewards/margins": 1.083022117614746, + "rewards/rejected": -5.2895002365112305, + "step": 96 + }, + { + "epoch": 0.41464066257013094, + "grad_norm": 12.657787002066415, + "learning_rate": 7.279991622874318e-07, + "logits/chosen": -0.5697692632675171, + "logits/rejected": -0.6259853839874268, + "logps/chosen": -585.4468994140625, + "logps/rejected": -732.0520629882812, + "loss": 0.483, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.9570021629333496, + "rewards/margins": 1.447950839996338, + "rewards/rejected": -5.404953479766846, + "step": 97 + }, + { + "epoch": 0.41891530857600856, + "grad_norm": 13.367477990049776, + "learning_rate": 7.212847653250828e-07, + "logits/chosen": -0.6540141105651855, + "logits/rejected": -0.6565195322036743, + "logps/chosen": -731.0140380859375, + "logps/rejected": -848.3659057617188, + "loss": 0.4767, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.200318336486816, + "rewards/margins": 1.2101118564605713, + "rewards/rejected": -6.410430908203125, + "step": 98 + }, + { + "epoch": 0.4231899545818862, + "grad_norm": 10.139524171463488, + "learning_rate": 7.145203706069182e-07, + "logits/chosen": -0.7252554893493652, + "logits/rejected": -0.7746644616127014, + "logps/chosen": -669.2903442382812, + "logps/rejected": -828.5930786132812, + "loss": 0.3875, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.771048545837402, + "rewards/margins": 1.5108307600021362, + "rewards/rejected": -6.281879425048828, + "step": 99 + }, + { + "epoch": 0.4274646005877638, + "grad_norm": 12.356398801857143, + "learning_rate": 7.077075065009433e-07, + "logits/chosen": -0.557784914970398, + "logits/rejected": -0.5647093057632446, + "logps/chosen": -607.4837646484375, + "logps/rejected": -718.2990112304688, + "loss": 0.4439, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.555881500244141, + "rewards/margins": 1.080979585647583, + "rewards/rejected": -5.636861801147461, + "step": 100 + }, + { + "epoch": 0.4274646005877638, + "eval_logits/chosen": -0.5001155734062195, + "eval_logits/rejected": -0.5150425434112549, + "eval_logps/chosen": -704.6570434570312, + "eval_logps/rejected": -829.715087890625, + "eval_loss": 0.4167614281177521, + "eval_rewards/accuracies": 0.8145161271095276, + "eval_rewards/chosen": -4.996352672576904, + "eval_rewards/margins": 1.3122578859329224, + "eval_rewards/rejected": -6.308610916137695, + "eval_runtime": 165.659, + "eval_samples_per_second": 11.838, + "eval_steps_per_second": 0.374, + "step": 100 + }, + { + "epoch": 0.4317392465936415, + "grad_norm": 13.314006294995002, + "learning_rate": 7.008477123264847e-07, + "logits/chosen": -0.6598826050758362, + "logits/rejected": -0.6927035450935364, + "logps/chosen": -734.70556640625, + "logps/rejected": -900.852294921875, + "loss": 0.4118, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.161342620849609, + "rewards/margins": 1.7760246992111206, + "rewards/rejected": -6.9373674392700195, + "step": 101 + }, + { + "epoch": 0.4360138925995191, + "grad_norm": 13.821741873689282, + "learning_rate": 6.939425380063923e-07, + "logits/chosen": -0.6629341244697571, + "logits/rejected": -0.7558687925338745, + "logps/chosen": -699.57177734375, + "logps/rejected": -886.5426025390625, + "loss": 0.3874, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.201181888580322, + "rewards/margins": 1.6117980480194092, + "rewards/rejected": -6.8129801750183105, + "step": 102 + }, + { + "epoch": 0.44028853860539674, + "grad_norm": 14.52198562884218, + "learning_rate": 6.869935437168449e-07, + "logits/chosen": -0.4441612958908081, + "logits/rejected": -0.4517134428024292, + "logps/chosen": -648.8721313476562, + "logps/rejected": -743.1588745117188, + "loss": 0.4932, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.671912670135498, + "rewards/margins": 1.0545084476470947, + "rewards/rejected": -5.726420879364014, + "step": 103 + }, + { + "epoch": 0.44456318461127436, + "grad_norm": 15.339894722169653, + "learning_rate": 6.80002299534838e-07, + "logits/chosen": -0.719368577003479, + "logits/rejected": -0.7461254596710205, + "logps/chosen": -573.4705810546875, + "logps/rejected": -651.0980224609375, + "loss": 0.4402, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.8566808700561523, + "rewards/margins": 0.8586393594741821, + "rewards/rejected": -4.715320587158203, + "step": 104 + }, + { + "epoch": 0.448837830617152, + "grad_norm": 10.956142784913482, + "learning_rate": 6.72970385083438e-07, + "logits/chosen": -0.641654372215271, + "logits/rejected": -0.6621043682098389, + "logps/chosen": -592.4070434570312, + "logps/rejected": -721.1480102539062, + "loss": 0.4013, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.002495288848877, + "rewards/margins": 1.1794517040252686, + "rewards/rejected": -5.181946754455566, + "step": 105 + }, + { + "epoch": 0.45311247662302967, + "grad_norm": 14.08687818754259, + "learning_rate": 6.658993891748759e-07, + "logits/chosen": -0.6141338348388672, + "logits/rejected": -0.5712395310401917, + "logps/chosen": -525.6826171875, + "logps/rejected": -657.1926879882812, + "loss": 0.3788, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.3286538124084473, + "rewards/margins": 1.5682119131088257, + "rewards/rejected": -4.896864891052246, + "step": 106 + }, + { + "epoch": 0.4573871226289073, + "grad_norm": 12.007137757034995, + "learning_rate": 6.587909094515663e-07, + "logits/chosen": -0.6399226188659668, + "logits/rejected": -0.6818464994430542, + "logps/chosen": -515.7030639648438, + "logps/rejected": -624.790283203125, + "loss": 0.4432, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.5857155323028564, + "rewards/margins": 0.9131308794021606, + "rewards/rejected": -4.498846530914307, + "step": 107 + }, + { + "epoch": 0.4616617686347849, + "grad_norm": 11.626806758384587, + "learning_rate": 6.516465520251313e-07, + "logits/chosen": -0.6572325229644775, + "logits/rejected": -0.7261943221092224, + "logps/chosen": -557.6213989257812, + "logps/rejected": -685.3796997070312, + "loss": 0.4302, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.802943468093872, + "rewards/margins": 1.3063392639160156, + "rewards/rejected": -5.109282970428467, + "step": 108 + }, + { + "epoch": 0.46593641464066254, + "grad_norm": 11.769626267692969, + "learning_rate": 6.444679311135112e-07, + "logits/chosen": -0.6812455058097839, + "logits/rejected": -0.6769453287124634, + "logps/chosen": -545.5555419921875, + "logps/rejected": -670.9700317382812, + "loss": 0.4633, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.524083137512207, + "rewards/margins": 1.1972600221633911, + "rewards/rejected": -4.721343040466309, + "step": 109 + }, + { + "epoch": 0.4702110606465402, + "grad_norm": 11.834467781345984, + "learning_rate": 6.372566686762426e-07, + "logits/chosen": -0.6734607219696045, + "logits/rejected": -0.6938244104385376, + "logps/chosen": -631.7657470703125, + "logps/rejected": -778.4968872070312, + "loss": 0.3988, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.416792869567871, + "rewards/margins": 1.5345261096954346, + "rewards/rejected": -5.951319217681885, + "step": 110 + }, + { + "epoch": 0.47448570665241785, + "grad_norm": 10.600986700850507, + "learning_rate": 6.30014394047988e-07, + "logits/chosen": -0.7839672565460205, + "logits/rejected": -0.7656916379928589, + "logps/chosen": -520.810791015625, + "logps/rejected": -590.8253173828125, + "loss": 0.4064, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.642113447189331, + "rewards/margins": 0.7910320162773132, + "rewards/rejected": -4.433145999908447, + "step": 111 + }, + { + "epoch": 0.4787603526582955, + "grad_norm": 13.974810391572062, + "learning_rate": 6.227427435703995e-07, + "logits/chosen": -0.6362528204917908, + "logits/rejected": -0.7391636371612549, + "logps/chosen": -589.657470703125, + "logps/rejected": -778.1963500976562, + "loss": 0.397, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.09708309173584, + "rewards/margins": 1.6432350873947144, + "rewards/rejected": -5.7403178215026855, + "step": 112 + }, + { + "epoch": 0.4830349986641731, + "grad_norm": 12.06902931160219, + "learning_rate": 6.154433602223978e-07, + "logits/chosen": -0.7784813046455383, + "logits/rejected": -0.8440088033676147, + "logps/chosen": -634.3173828125, + "logps/rejected": -829.8695068359375, + "loss": 0.4383, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.257462024688721, + "rewards/margins": 1.7875339984893799, + "rewards/rejected": -6.0449957847595215, + "step": 113 + }, + { + "epoch": 0.4873096446700508, + "grad_norm": 13.358425807533337, + "learning_rate": 6.081178932489535e-07, + "logits/chosen": -0.7081687450408936, + "logits/rejected": -0.7073873281478882, + "logps/chosen": -569.8103637695312, + "logps/rejected": -694.3984375, + "loss": 0.4252, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.89652943611145, + "rewards/margins": 1.3301247358322144, + "rewards/rejected": -5.226654529571533, + "step": 114 + }, + { + "epoch": 0.4915842906759284, + "grad_norm": 11.586882789419233, + "learning_rate": 6.00767997788451e-07, + "logits/chosen": -0.5270929336547852, + "logits/rejected": -0.5626642107963562, + "logps/chosen": -693.6819458007812, + "logps/rejected": -889.004150390625, + "loss": 0.3575, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.817679405212402, + "rewards/margins": 1.964691162109375, + "rewards/rejected": -6.782370090484619, + "step": 115 + }, + { + "epoch": 0.49585893668180603, + "grad_norm": 12.667062697493666, + "learning_rate": 5.933953344987214e-07, + "logits/chosen": -0.6200395226478577, + "logits/rejected": -0.6530672311782837, + "logps/chosen": -617.082763671875, + "logps/rejected": -743.7801513671875, + "loss": 0.394, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.380313873291016, + "rewards/margins": 1.2766342163085938, + "rewards/rejected": -5.656947612762451, + "step": 116 + }, + { + "epoch": 0.5001335826876837, + "grad_norm": 13.06084918611884, + "learning_rate": 5.860015691818292e-07, + "logits/chosen": -0.5794460773468018, + "logits/rejected": -0.6392884850502014, + "logps/chosen": -523.0586547851562, + "logps/rejected": -706.2977294921875, + "loss": 0.3972, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.7082810401916504, + "rewards/margins": 1.7129367589950562, + "rewards/rejected": -5.421217918395996, + "step": 117 + }, + { + "epoch": 0.5044082286935613, + "grad_norm": 14.158925761660381, + "learning_rate": 5.78588372407695e-07, + "logits/chosen": -0.591346025466919, + "logits/rejected": -0.5808792114257812, + "logps/chosen": -661.6780395507812, + "logps/rejected": -753.6257934570312, + "loss": 0.3814, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.592419147491455, + "rewards/margins": 1.0543147325515747, + "rewards/rejected": -5.646734237670898, + "step": 118 + }, + { + "epoch": 0.508682874699439, + "grad_norm": 13.571207607427793, + "learning_rate": 5.711574191366427e-07, + "logits/chosen": -0.4889651834964752, + "logits/rejected": -0.44250980019569397, + "logps/chosen": -608.6267700195312, + "logps/rejected": -910.5017700195312, + "loss": 0.4381, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.152202606201172, + "rewards/margins": 0.7526392936706543, + "rewards/rejected": -4.904841423034668, + "step": 119 + }, + { + "epoch": 0.5129575207053166, + "grad_norm": 10.741928061872432, + "learning_rate": 5.637103883409525e-07, + "logits/chosen": -0.5629594922065735, + "logits/rejected": -0.6181632876396179, + "logps/chosen": -604.459228515625, + "logps/rejected": -852.540283203125, + "loss": 0.3589, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.461546421051025, + "rewards/margins": 2.3289871215820312, + "rewards/rejected": -6.790533542633057, + "step": 120 + }, + { + "epoch": 0.5172321667111942, + "grad_norm": 13.589230759795878, + "learning_rate": 5.562489626255103e-07, + "logits/chosen": -0.6361875534057617, + "logits/rejected": -0.6799750924110413, + "logps/chosen": -612.7998657226562, + "logps/rejected": -803.4393920898438, + "loss": 0.3612, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.2646894454956055, + "rewards/margins": 1.7221603393554688, + "rewards/rejected": -5.986849784851074, + "step": 121 + }, + { + "epoch": 0.5215068127170719, + "grad_norm": 13.130136917819023, + "learning_rate": 5.48774827847634e-07, + "logits/chosen": -0.6019195914268494, + "logits/rejected": -0.6733092665672302, + "logps/chosen": -578.5673828125, + "logps/rejected": -739.2437133789062, + "loss": 0.3972, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.263480186462402, + "rewards/margins": 1.4445068836212158, + "rewards/rejected": -5.707987308502197, + "step": 122 + }, + { + "epoch": 0.5257814587229495, + "grad_norm": 12.568726768016017, + "learning_rate": 5.412896727361662e-07, + "logits/chosen": -0.5387797951698303, + "logits/rejected": -0.6281207799911499, + "logps/chosen": -604.70703125, + "logps/rejected": -767.2006225585938, + "loss": 0.3866, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.197920322418213, + "rewards/margins": 1.5229953527450562, + "rewards/rejected": -5.720915794372559, + "step": 123 + }, + { + "epoch": 0.5300561047288271, + "grad_norm": 12.66743855092958, + "learning_rate": 5.337951885099166e-07, + "logits/chosen": -0.7120057940483093, + "logits/rejected": -0.6868148446083069, + "logps/chosen": -564.8189086914062, + "logps/rejected": -678.998779296875, + "loss": 0.4235, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.940401077270508, + "rewards/margins": 1.179326057434082, + "rewards/rejected": -5.11972713470459, + "step": 124 + }, + { + "epoch": 0.5343307507347048, + "grad_norm": 14.46489476063489, + "learning_rate": 5.262930684955438e-07, + "logits/chosen": -0.7230139970779419, + "logits/rejected": -0.7383438348770142, + "logps/chosen": -680.5816040039062, + "logps/rejected": -828.6448974609375, + "loss": 0.4321, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.831777572631836, + "rewards/margins": 1.4383575916290283, + "rewards/rejected": -6.270134925842285, + "step": 125 + }, + { + "epoch": 0.5386053967405824, + "grad_norm": 13.241409444585061, + "learning_rate": 5.187850077449603e-07, + "logits/chosen": -0.49940329790115356, + "logits/rejected": -0.5305464267730713, + "logps/chosen": -678.576416015625, + "logps/rejected": -828.8834838867188, + "loss": 0.3599, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.9777750968933105, + "rewards/margins": 1.5080969333648682, + "rewards/rejected": -6.485872268676758, + "step": 126 + }, + { + "epoch": 0.5428800427464601, + "grad_norm": 15.600684361137235, + "learning_rate": 5.11272702652346e-07, + "logits/chosen": -0.766007125377655, + "logits/rejected": -0.8170765042304993, + "logps/chosen": -783.025146484375, + "logps/rejected": -941.555908203125, + "loss": 0.3807, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.462682247161865, + "rewards/margins": 1.6811782121658325, + "rewards/rejected": -7.143860816955566, + "step": 127 + }, + { + "epoch": 0.5471546887523377, + "grad_norm": 12.46259382895567, + "learning_rate": 5.03757850570861e-07, + "logits/chosen": -0.6791242361068726, + "logits/rejected": -0.6803139448165894, + "logps/chosen": -693.43115234375, + "logps/rejected": -792.7020263671875, + "loss": 0.3952, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.814350128173828, + "rewards/margins": 1.0507954359054565, + "rewards/rejected": -5.865145206451416, + "step": 128 + }, + { + "epoch": 0.5514293347582153, + "grad_norm": 14.68281824566638, + "learning_rate": 4.962421494291391e-07, + "logits/chosen": -0.6624226570129395, + "logits/rejected": -0.8003214597702026, + "logps/chosen": -641.5405883789062, + "logps/rejected": -848.9283447265625, + "loss": 0.3979, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.510129928588867, + "rewards/margins": 1.8167277574539185, + "rewards/rejected": -6.326857566833496, + "step": 129 + }, + { + "epoch": 0.555703980764093, + "grad_norm": 14.156910665906969, + "learning_rate": 4.88727297347654e-07, + "logits/chosen": -0.6747015714645386, + "logits/rejected": -0.6304070353507996, + "logps/chosen": -673.6571655273438, + "logps/rejected": -818.0390625, + "loss": 0.3615, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.6810078620910645, + "rewards/margins": 1.7004587650299072, + "rewards/rejected": -6.381466388702393, + "step": 130 + }, + { + "epoch": 0.5599786267699706, + "grad_norm": 13.513462308218608, + "learning_rate": 4.812149922550397e-07, + "logits/chosen": -0.5138005614280701, + "logits/rejected": -0.5008392333984375, + "logps/chosen": -603.177490234375, + "logps/rejected": -718.6824951171875, + "loss": 0.4195, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.324260234832764, + "rewards/margins": 1.1721910238265991, + "rewards/rejected": -5.496450901031494, + "step": 131 + }, + { + "epoch": 0.5642532727758482, + "grad_norm": 13.859777475577925, + "learning_rate": 4.7370693150445615e-07, + "logits/chosen": -0.7230309247970581, + "logits/rejected": -0.7601820826530457, + "logps/chosen": -678.0418090820312, + "logps/rejected": -836.7296142578125, + "loss": 0.4123, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.642270565032959, + "rewards/margins": 1.615240216255188, + "rewards/rejected": -6.257511138916016, + "step": 132 + }, + { + "epoch": 0.5685279187817259, + "grad_norm": 12.304755311149645, + "learning_rate": 4.6620481149008364e-07, + "logits/chosen": -0.5858466029167175, + "logits/rejected": -0.5665376782417297, + "logps/chosen": -551.0853271484375, + "logps/rejected": -661.145263671875, + "loss": 0.3952, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.0769124031066895, + "rewards/margins": 1.150291919708252, + "rewards/rejected": -5.227204322814941, + "step": 133 + }, + { + "epoch": 0.5728025647876035, + "grad_norm": 13.318626812706627, + "learning_rate": 4.5871032726383385e-07, + "logits/chosen": -0.6011719703674316, + "logits/rejected": -0.6539227962493896, + "logps/chosen": -613.9602661132812, + "logps/rejected": -801.2767333984375, + "loss": 0.3028, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.4908127784729, + "rewards/margins": 1.866058588027954, + "rewards/rejected": -6.356871604919434, + "step": 134 + }, + { + "epoch": 0.5770772107934812, + "grad_norm": 15.02191791227977, + "learning_rate": 4.512251721523659e-07, + "logits/chosen": -0.5807833671569824, + "logits/rejected": -0.5801360607147217, + "logps/chosen": -585.0089111328125, + "logps/rejected": -690.991455078125, + "loss": 0.4568, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.436345100402832, + "rewards/margins": 0.9718096852302551, + "rewards/rejected": -5.40815544128418, + "step": 135 + }, + { + "epoch": 0.5813518567993589, + "grad_norm": 13.719743061532137, + "learning_rate": 4.4375103737448967e-07, + "logits/chosen": -0.6176421642303467, + "logits/rejected": -0.5977914333343506, + "logps/chosen": -647.6025390625, + "logps/rejected": -780.4196166992188, + "loss": 0.3489, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.731601715087891, + "rewards/margins": 1.3735716342926025, + "rewards/rejected": -6.105173110961914, + "step": 136 + }, + { + "epoch": 0.5856265028052364, + "grad_norm": 13.899011782478087, + "learning_rate": 4.362896116590475e-07, + "logits/chosen": -0.6031906604766846, + "logits/rejected": -0.6842055320739746, + "logps/chosen": -619.6199340820312, + "logps/rejected": -823.1647338867188, + "loss": 0.3829, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.415548324584961, + "rewards/margins": 1.8599358797073364, + "rewards/rejected": -6.275484085083008, + "step": 137 + }, + { + "epoch": 0.5899011488111141, + "grad_norm": 14.094976818965387, + "learning_rate": 4.2884258086335745e-07, + "logits/chosen": -0.5712490081787109, + "logits/rejected": -0.6076186299324036, + "logps/chosen": -635.4631958007812, + "logps/rejected": -767.619873046875, + "loss": 0.3952, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.872575759887695, + "rewards/margins": 1.2816861867904663, + "rewards/rejected": -6.154261589050293, + "step": 138 + }, + { + "epoch": 0.5941757948169917, + "grad_norm": 15.923719497368527, + "learning_rate": 4.2141162759230503e-07, + "logits/chosen": -0.4579673409461975, + "logits/rejected": -0.5088114738464355, + "logps/chosen": -541.15185546875, + "logps/rejected": -640.8451538085938, + "loss": 0.3694, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.0685858726501465, + "rewards/margins": 0.9016439914703369, + "rewards/rejected": -4.970229625701904, + "step": 139 + }, + { + "epoch": 0.5984504408228694, + "grad_norm": 15.626619862394913, + "learning_rate": 4.139984308181708e-07, + "logits/chosen": -0.6617997884750366, + "logits/rejected": -0.6638819575309753, + "logps/chosen": -747.4265747070312, + "logps/rejected": -863.458251953125, + "loss": 0.3971, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.498664379119873, + "rewards/margins": 1.1953853368759155, + "rewards/rejected": -6.694049835205078, + "step": 140 + }, + { + "epoch": 0.602725086828747, + "grad_norm": 16.11770967148462, + "learning_rate": 4.0660466550127853e-07, + "logits/chosen": -0.7728097438812256, + "logits/rejected": -0.8388174772262573, + "logps/chosen": -708.0802001953125, + "logps/rejected": -857.48486328125, + "loss": 0.4029, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.071959972381592, + "rewards/margins": 1.4134087562561035, + "rewards/rejected": -6.485368251800537, + "step": 141 + }, + { + "epoch": 0.6069997328346246, + "grad_norm": 13.358737574089647, + "learning_rate": 3.9923200221154914e-07, + "logits/chosen": -0.5902035236358643, + "logits/rejected": -0.600926399230957, + "logps/chosen": -655.1759033203125, + "logps/rejected": -776.4323120117188, + "loss": 0.4008, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.78019905090332, + "rewards/margins": 1.229543685913086, + "rewards/rejected": -6.009742736816406, + "step": 142 + }, + { + "epoch": 0.6112743788405023, + "grad_norm": 20.882100986314004, + "learning_rate": 3.918821067510464e-07, + "logits/chosen": -0.5608689188957214, + "logits/rejected": -0.5520298480987549, + "logps/chosen": -606.8939208984375, + "logps/rejected": -732.2677001953125, + "loss": 0.481, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.506969928741455, + "rewards/margins": 1.263184666633606, + "rewards/rejected": -5.77015495300293, + "step": 143 + }, + { + "epoch": 0.6155490248463799, + "grad_norm": 15.142641307206523, + "learning_rate": 3.845566397776021e-07, + "logits/chosen": -0.5451078414916992, + "logits/rejected": -0.5328483581542969, + "logps/chosen": -578.9205932617188, + "logps/rejected": -720.2644653320312, + "loss": 0.3981, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.13190221786499, + "rewards/margins": 1.3544728755950928, + "rewards/rejected": -5.486375331878662, + "step": 144 + }, + { + "epoch": 0.6198236708522575, + "grad_norm": 14.102739323418549, + "learning_rate": 3.772572564296004e-07, + "logits/chosen": -0.5815902948379517, + "logits/rejected": -0.6612125635147095, + "logps/chosen": -647.4329833984375, + "logps/rejected": -800.5177001953125, + "loss": 0.3771, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.570836067199707, + "rewards/margins": 1.5209659337997437, + "rewards/rejected": -6.091801643371582, + "step": 145 + }, + { + "epoch": 0.6240983168581352, + "grad_norm": 16.783278936540377, + "learning_rate": 3.699856059520118e-07, + "logits/chosen": -0.5741180777549744, + "logits/rejected": -0.6261047720909119, + "logps/chosen": -518.5027465820312, + "logps/rejected": -752.7945556640625, + "loss": 0.339, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.5159010887145996, + "rewards/margins": 2.189849615097046, + "rewards/rejected": -5.705749988555908, + "step": 146 + }, + { + "epoch": 0.6283729628640128, + "grad_norm": 16.241290219398206, + "learning_rate": 3.627433313237576e-07, + "logits/chosen": -0.6445101499557495, + "logits/rejected": -0.6295093297958374, + "logps/chosen": -611.40576171875, + "logps/rejected": -746.1281127929688, + "loss": 0.4584, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.344273090362549, + "rewards/margins": 1.294838309288025, + "rewards/rejected": -5.639111518859863, + "step": 147 + }, + { + "epoch": 0.6326476088698905, + "grad_norm": 16.58217058287404, + "learning_rate": 3.5553206888648885e-07, + "logits/chosen": -0.5924898386001587, + "logits/rejected": -0.6705700755119324, + "logps/chosen": -561.9385986328125, + "logps/rejected": -790.1613159179688, + "loss": 0.3589, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.8234567642211914, + "rewards/margins": 2.038635015487671, + "rewards/rejected": -5.862092018127441, + "step": 148 + }, + { + "epoch": 0.6369222548757681, + "grad_norm": 13.631809000580427, + "learning_rate": 3.483534479748688e-07, + "logits/chosen": -0.6043068170547485, + "logits/rejected": -0.6237097978591919, + "logps/chosen": -599.3536376953125, + "logps/rejected": -737.3873291015625, + "loss": 0.3333, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.288971424102783, + "rewards/margins": 1.361509919166565, + "rewards/rejected": -5.650481224060059, + "step": 149 + }, + { + "epoch": 0.6411969008816457, + "grad_norm": 13.48441747872976, + "learning_rate": 3.412090905484337e-07, + "logits/chosen": -0.5789849758148193, + "logits/rejected": -0.5934211015701294, + "logps/chosen": -637.4574584960938, + "logps/rejected": -799.4398193359375, + "loss": 0.3633, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.6277079582214355, + "rewards/margins": 1.6034901142120361, + "rewards/rejected": -6.231198310852051, + "step": 150 + }, + { + "epoch": 0.6454715468875234, + "grad_norm": 13.54705279065067, + "learning_rate": 3.3410061082512417e-07, + "logits/chosen": -0.7143419981002808, + "logits/rejected": -0.7396361231803894, + "logps/chosen": -644.5235595703125, + "logps/rejected": -819.0350341796875, + "loss": 0.3787, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.598511695861816, + "rewards/margins": 1.7460615634918213, + "rewards/rejected": -6.344573020935059, + "step": 151 + }, + { + "epoch": 0.649746192893401, + "grad_norm": 13.97282414379501, + "learning_rate": 3.270296149165619e-07, + "logits/chosen": -0.7898523807525635, + "logits/rejected": -0.78404700756073, + "logps/chosen": -746.5989990234375, + "logps/rejected": -925.236328125, + "loss": 0.3681, + "rewards/accuracies": 0.6875, + "rewards/chosen": -5.577144145965576, + "rewards/margins": 1.7709450721740723, + "rewards/rejected": -7.348089218139648, + "step": 152 + }, + { + "epoch": 0.6540208388992786, + "grad_norm": 15.30647255299837, + "learning_rate": 3.1999770046516194e-07, + "logits/chosen": -0.6549022197723389, + "logits/rejected": -0.6652963161468506, + "logps/chosen": -734.189453125, + "logps/rejected": -882.783203125, + "loss": 0.397, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.586746692657471, + "rewards/margins": 1.5474714040756226, + "rewards/rejected": -7.134217739105225, + "step": 153 + }, + { + "epoch": 0.6582954849051563, + "grad_norm": 15.480021561153041, + "learning_rate": 3.1300645628315526e-07, + "logits/chosen": -0.6595125794410706, + "logits/rejected": -0.684340238571167, + "logps/chosen": -692.1060180664062, + "logps/rejected": -861.9107055664062, + "loss": 0.3519, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.179555416107178, + "rewards/margins": 1.7508639097213745, + "rewards/rejected": -6.930419445037842, + "step": 154 + }, + { + "epoch": 0.6625701309110339, + "grad_norm": 14.370880370113698, + "learning_rate": 3.060574619936075e-07, + "logits/chosen": -0.6609420776367188, + "logits/rejected": -0.6882165670394897, + "logps/chosen": -755.37109375, + "logps/rejected": -923.8385620117188, + "loss": 0.3964, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.7476301193237305, + "rewards/margins": 1.664482831954956, + "rewards/rejected": -7.412113189697266, + "step": 155 + }, + { + "epoch": 0.6668447769169116, + "grad_norm": 15.740330800217551, + "learning_rate": 2.9915228767351535e-07, + "logits/chosen": -0.6638086438179016, + "logits/rejected": -0.6543954014778137, + "logps/chosen": -691.0468139648438, + "logps/rejected": -834.0437622070312, + "loss": 0.3507, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.920919895172119, + "rewards/margins": 1.5358891487121582, + "rewards/rejected": -6.4568095207214355, + "step": 156 + }, + { + "epoch": 0.6711194229227893, + "grad_norm": 16.901891917408413, + "learning_rate": 2.922924934990568e-07, + "logits/chosen": -0.7027104496955872, + "logits/rejected": -0.7690137624740601, + "logps/chosen": -740.62353515625, + "logps/rejected": -911.9879150390625, + "loss": 0.383, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.2726335525512695, + "rewards/margins": 1.703195571899414, + "rewards/rejected": -6.975828647613525, + "step": 157 + }, + { + "epoch": 0.6753940689286668, + "grad_norm": 14.502221464305004, + "learning_rate": 2.8547962939308186e-07, + "logits/chosen": -0.7240225672721863, + "logits/rejected": -0.743172287940979, + "logps/chosen": -638.63720703125, + "logps/rejected": -785.5480346679688, + "loss": 0.3907, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.341987133026123, + "rewards/margins": 1.3872699737548828, + "rewards/rejected": -5.729257583618164, + "step": 158 + }, + { + "epoch": 0.6796687149345445, + "grad_norm": 13.076811073919702, + "learning_rate": 2.7871523467491725e-07, + "logits/chosen": -0.5847674608230591, + "logits/rejected": -0.6150667667388916, + "logps/chosen": -558.647216796875, + "logps/rejected": -742.8478393554688, + "loss": 0.3777, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.8698740005493164, + "rewards/margins": 1.8015196323394775, + "rewards/rejected": -5.671393394470215, + "step": 159 + }, + { + "epoch": 0.6839433609404221, + "grad_norm": 13.435743880369799, + "learning_rate": 2.720008377125682e-07, + "logits/chosen": -0.7234424352645874, + "logits/rejected": -0.7753596305847168, + "logps/chosen": -599.20947265625, + "logps/rejected": -840.508544921875, + "loss": 0.3559, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.066052436828613, + "rewards/margins": 2.2670648097991943, + "rewards/rejected": -6.3331170082092285, + "step": 160 + }, + { + "epoch": 0.6882180069462998, + "grad_norm": 14.725846441531868, + "learning_rate": 2.6533795557739405e-07, + "logits/chosen": -0.5986216068267822, + "logits/rejected": -0.6119877099990845, + "logps/chosen": -587.1053466796875, + "logps/rejected": -751.2557983398438, + "loss": 0.3576, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.9571621417999268, + "rewards/margins": 1.737083077430725, + "rewards/rejected": -5.694245338439941, + "step": 161 + }, + { + "epoch": 0.6924926529521774, + "grad_norm": 18.854853550251143, + "learning_rate": 2.5872809370133704e-07, + "logits/chosen": -0.7047430872917175, + "logits/rejected": -0.7392921447753906, + "logps/chosen": -541.9876098632812, + "logps/rejected": -679.9630737304688, + "loss": 0.3713, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.7241249084472656, + "rewards/margins": 1.3988580703735352, + "rewards/rejected": -5.122982978820801, + "step": 162 + }, + { + "epoch": 0.696767298958055, + "grad_norm": 15.327216222305758, + "learning_rate": 2.521727455367797e-07, + "logits/chosen": -0.4683057963848114, + "logits/rejected": -0.4958358705043793, + "logps/chosen": -477.89453125, + "logps/rejected": -642.5132446289062, + "loss": 0.3217, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.4097371101379395, + "rewards/margins": 1.6171811819076538, + "rewards/rejected": -5.026918411254883, + "step": 163 + }, + { + "epoch": 0.7010419449639327, + "grad_norm": 15.107047920787036, + "learning_rate": 2.456733922191108e-07, + "logits/chosen": -0.6403992176055908, + "logits/rejected": -0.7081367373466492, + "logps/chosen": -535.940673828125, + "logps/rejected": -721.487060546875, + "loss": 0.3835, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.6437511444091797, + "rewards/margins": 1.7465698719024658, + "rewards/rejected": -5.390320777893066, + "step": 164 + }, + { + "epoch": 0.7053165909698104, + "grad_norm": 14.789037132111227, + "learning_rate": 2.3923150223207173e-07, + "logits/chosen": -0.6419979333877563, + "logits/rejected": -0.6663538217544556, + "logps/chosen": -611.316162109375, + "logps/rejected": -779.5244140625, + "loss": 0.357, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.9267449378967285, + "rewards/margins": 1.7568156719207764, + "rewards/rejected": -5.683561325073242, + "step": 165 + }, + { + "epoch": 0.7095912369756879, + "grad_norm": 11.678139693168967, + "learning_rate": 2.3284853107596347e-07, + "logits/chosen": -0.643075704574585, + "logits/rejected": -0.677239179611206, + "logps/chosen": -605.615478515625, + "logps/rejected": -788.3361206054688, + "loss": 0.3161, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.1840410232543945, + "rewards/margins": 1.828648567199707, + "rewards/rejected": -6.012689590454102, + "step": 166 + }, + { + "epoch": 0.7138658829815656, + "grad_norm": 14.838448866889486, + "learning_rate": 2.2652592093878665e-07, + "logits/chosen": -0.5908488631248474, + "logits/rejected": -0.6062439680099487, + "logps/chosen": -623.344482421875, + "logps/rejected": -775.658447265625, + "loss": 0.3693, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.541435241699219, + "rewards/margins": 1.4988759756088257, + "rewards/rejected": -6.040311336517334, + "step": 167 + }, + { + "epoch": 0.7181405289874432, + "grad_norm": 13.768660505175903, + "learning_rate": 2.202651003703885e-07, + "logits/chosen": -0.5698331594467163, + "logits/rejected": -0.5812557339668274, + "logps/chosen": -609.9556274414062, + "logps/rejected": -810.02587890625, + "loss": 0.3727, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.267837047576904, + "rewards/margins": 1.995699167251587, + "rewards/rejected": -6.2635369300842285, + "step": 168 + }, + { + "epoch": 0.7224151749933209, + "grad_norm": 12.956408292396839, + "learning_rate": 2.1406748395969305e-07, + "logits/chosen": -0.6224421858787537, + "logits/rejected": -0.6563930511474609, + "logps/chosen": -620.7723388671875, + "logps/rejected": -781.536865234375, + "loss": 0.3362, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.473085403442383, + "rewards/margins": 1.6361382007598877, + "rewards/rejected": -6.109223365783691, + "step": 169 + }, + { + "epoch": 0.7266898209991985, + "grad_norm": 15.331044331855917, + "learning_rate": 2.0793447201508286e-07, + "logits/chosen": -0.6418094635009766, + "logits/rejected": -0.6333540678024292, + "logps/chosen": -693.9434204101562, + "logps/rejected": -780.4866333007812, + "loss": 0.3273, + "rewards/accuracies": 0.6875, + "rewards/chosen": -5.1741743087768555, + "rewards/margins": 0.9175342321395874, + "rewards/rejected": -6.091708660125732, + "step": 170 + }, + { + "epoch": 0.7309644670050761, + "grad_norm": 14.142847544336131, + "learning_rate": 2.01867450248011e-07, + "logits/chosen": -0.6202086210250854, + "logits/rejected": -0.6713452935218811, + "logps/chosen": -733.3516235351562, + "logps/rejected": -912.0302734375, + "loss": 0.3551, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.687126636505127, + "rewards/margins": 1.6365104913711548, + "rewards/rejected": -7.32363748550415, + "step": 171 + }, + { + "epoch": 0.7352391130109538, + "grad_norm": 14.223351065221193, + "learning_rate": 1.9586778945990783e-07, + "logits/chosen": -0.5605691075325012, + "logits/rejected": -0.6431994438171387, + "logps/chosen": -720.2319946289062, + "logps/rejected": -907.8939819335938, + "loss": 0.3518, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.346857070922852, + "rewards/margins": 1.8520584106445312, + "rewards/rejected": -7.198915481567383, + "step": 172 + }, + { + "epoch": 0.7395137590168315, + "grad_norm": 14.283973424406769, + "learning_rate": 1.899368452324584e-07, + "logits/chosen": -0.8039106130599976, + "logits/rejected": -0.8154680728912354, + "logps/chosen": -702.9293823242188, + "logps/rejected": -874.7135009765625, + "loss": 0.369, + "rewards/accuracies": 0.71875, + "rewards/chosen": -5.166163444519043, + "rewards/margins": 1.7623482942581177, + "rewards/rejected": -6.928511142730713, + "step": 173 + }, + { + "epoch": 0.743788405022709, + "grad_norm": 13.648679506818656, + "learning_rate": 1.840759576213181e-07, + "logits/chosen": -0.5466803908348083, + "logits/rejected": -0.6139577031135559, + "logps/chosen": -643.9953002929688, + "logps/rejected": -851.5556640625, + "loss": 0.3272, + "rewards/accuracies": 0.96875, + "rewards/chosen": -4.4559197425842285, + "rewards/margins": 2.084817886352539, + "rewards/rejected": -6.540737152099609, + "step": 174 + }, + { + "epoch": 0.7480630510285867, + "grad_norm": 16.8663479372205, + "learning_rate": 1.7828645085333644e-07, + "logits/chosen": -0.6725043654441833, + "logits/rejected": -0.7195257544517517, + "logps/chosen": -706.1845703125, + "logps/rejected": -901.8247680664062, + "loss": 0.3902, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.220101356506348, + "rewards/margins": 1.968306541442871, + "rewards/rejected": -7.188408374786377, + "step": 175 + }, + { + "epoch": 0.7523376970344643, + "grad_norm": 13.612220492425323, + "learning_rate": 1.725696330273575e-07, + "logits/chosen": -0.7102064490318298, + "logits/rejected": -0.7330564260482788, + "logps/chosen": -644.0183715820312, + "logps/rejected": -833.1116943359375, + "loss": 0.3007, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.664500713348389, + "rewards/margins": 1.9048974514007568, + "rewards/rejected": -6.569398403167725, + "step": 176 + }, + { + "epoch": 0.756612343040342, + "grad_norm": 13.592118995239437, + "learning_rate": 1.6692679581866332e-07, + "logits/chosen": -0.5269302725791931, + "logits/rejected": -0.562321662902832, + "logps/chosen": -647.377685546875, + "logps/rejected": -876.7937622070312, + "loss": 0.3336, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.665207862854004, + "rewards/margins": 2.2923545837402344, + "rewards/rejected": -6.9575629234313965, + "step": 177 + }, + { + "epoch": 0.7608869890462197, + "grad_norm": 13.100123198154126, + "learning_rate": 1.6135921418712955e-07, + "logits/chosen": -0.6257606744766235, + "logits/rejected": -0.6605125069618225, + "logps/chosen": -589.7310791015625, + "logps/rejected": -762.8815307617188, + "loss": 0.3326, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.342212200164795, + "rewards/margins": 1.6928372383117676, + "rewards/rejected": -6.035048961639404, + "step": 178 + }, + { + "epoch": 0.7651616350520972, + "grad_norm": 24.95386459216583, + "learning_rate": 1.558681460891567e-07, + "logits/chosen": -0.6882709860801697, + "logits/rejected": -0.7164211273193359, + "logps/chosen": -729.4912719726562, + "logps/rejected": -960.5554809570312, + "loss": 0.3596, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.27896785736084, + "rewards/margins": 2.3151795864105225, + "rewards/rejected": -7.594147682189941, + "step": 179 + }, + { + "epoch": 0.7694362810579749, + "grad_norm": 14.508183670705439, + "learning_rate": 1.5045483219344385e-07, + "logits/chosen": -0.4706317186355591, + "logits/rejected": -0.4956177771091461, + "logps/chosen": -686.981201171875, + "logps/rejected": -892.425537109375, + "loss": 0.3878, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.139954090118408, + "rewards/margins": 1.9214184284210205, + "rewards/rejected": -7.061371803283691, + "step": 180 + }, + { + "epoch": 0.7737109270638525, + "grad_norm": 13.601053697523751, + "learning_rate": 1.4512049560066835e-07, + "logits/chosen": -0.5556597113609314, + "logits/rejected": -0.6318129301071167, + "logps/chosen": -582.2374877929688, + "logps/rejected": -793.1754760742188, + "loss": 0.3245, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.254069805145264, + "rewards/margins": 2.0026259422302246, + "rewards/rejected": -6.256695747375488, + "step": 181 + }, + { + "epoch": 0.7779855730697302, + "grad_norm": 18.93784031657066, + "learning_rate": 1.3986634156713417e-07, + "logits/chosen": -0.5807328224182129, + "logits/rejected": -0.5342915654182434, + "logps/chosen": -602.649658203125, + "logps/rejected": -747.00732421875, + "loss": 0.3884, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.532341957092285, + "rewards/margins": 1.6315253973007202, + "rewards/rejected": -6.163866996765137, + "step": 182 + }, + { + "epoch": 0.7822602190756078, + "grad_norm": 16.431500936058505, + "learning_rate": 1.34693557232453e-07, + "logits/chosen": -0.6352940797805786, + "logits/rejected": -0.6816412210464478, + "logps/chosen": -679.6211547851562, + "logps/rejected": -891.3534545898438, + "loss": 0.3745, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.584761619567871, + "rewards/margins": 2.096449375152588, + "rewards/rejected": -6.681210994720459, + "step": 183 + }, + { + "epoch": 0.7865348650814854, + "grad_norm": 17.829780427983952, + "learning_rate": 1.2960331135131823e-07, + "logits/chosen": -0.6611433029174805, + "logits/rejected": -0.6609802842140198, + "logps/chosen": -662.570068359375, + "logps/rejected": -845.5758666992188, + "loss": 0.3201, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.930952072143555, + "rewards/margins": 1.9189307689666748, + "rewards/rejected": -6.849882125854492, + "step": 184 + }, + { + "epoch": 0.7908095110873631, + "grad_norm": 14.013649032140211, + "learning_rate": 1.2459675402943288e-07, + "logits/chosen": -0.5998414754867554, + "logits/rejected": -0.6097269058227539, + "logps/chosen": -691.530517578125, + "logps/rejected": -900.010986328125, + "loss": 0.2947, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.21755838394165, + "rewards/margins": 2.078969955444336, + "rewards/rejected": -7.296527862548828, + "step": 185 + }, + { + "epoch": 0.7950841570932408, + "grad_norm": 13.880919593847631, + "learning_rate": 1.1967501646365146e-07, + "logits/chosen": -0.6878648996353149, + "logits/rejected": -0.7795136570930481, + "logps/chosen": -629.9570922851562, + "logps/rejected": -834.251708984375, + "loss": 0.3317, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.382741451263428, + "rewards/margins": 2.003836154937744, + "rewards/rejected": -6.386577606201172, + "step": 186 + }, + { + "epoch": 0.7993588030991183, + "grad_norm": 15.353298997567329, + "learning_rate": 1.1483921068639351e-07, + "logits/chosen": -0.6340612173080444, + "logits/rejected": -0.6507092714309692, + "logps/chosen": -723.2861938476562, + "logps/rejected": -939.2916259765625, + "loss": 0.3424, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.30019474029541, + "rewards/margins": 2.066894769668579, + "rewards/rejected": -7.36708927154541, + "step": 187 + }, + { + "epoch": 0.803633449104996, + "grad_norm": 14.566838990350824, + "learning_rate": 1.1009042931438783e-07, + "logits/chosen": -0.6575983762741089, + "logits/rejected": -0.699353814125061, + "logps/chosen": -693.1535034179688, + "logps/rejected": -887.3889770507812, + "loss": 0.3477, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.220640182495117, + "rewards/margins": 1.8395450115203857, + "rewards/rejected": -7.060185432434082, + "step": 188 + }, + { + "epoch": 0.8079080951108736, + "grad_norm": 17.580820182141434, + "learning_rate": 1.0542974530180327e-07, + "logits/chosen": -0.6650811433792114, + "logits/rejected": -0.7292627692222595, + "logps/chosen": -640.0955200195312, + "logps/rejected": -821.3978881835938, + "loss": 0.3859, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.4905171394348145, + "rewards/margins": 1.7442741394042969, + "rewards/rejected": -6.234791278839111, + "step": 189 + }, + { + "epoch": 0.8121827411167513, + "grad_norm": 14.138278292631155, + "learning_rate": 1.0085821169782199e-07, + "logits/chosen": -0.6633419394493103, + "logits/rejected": -0.7350410223007202, + "logps/chosen": -550.8514404296875, + "logps/rejected": -755.83056640625, + "loss": 0.3497, + "rewards/accuracies": 0.96875, + "rewards/chosen": -3.9601831436157227, + "rewards/margins": 1.8831403255462646, + "rewards/rejected": -5.843323707580566, + "step": 190 + }, + { + "epoch": 0.8164573871226289, + "grad_norm": 15.051759686903543, + "learning_rate": 9.637686140871121e-08, + "logits/chosen": -0.5633993148803711, + "logits/rejected": -0.5642579793930054, + "logps/chosen": -751.5147705078125, + "logps/rejected": -917.2871704101562, + "loss": 0.3641, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.359073162078857, + "rewards/margins": 1.6620866060256958, + "rewards/rejected": -7.021159648895264, + "step": 191 + }, + { + "epoch": 0.8207320331285065, + "grad_norm": 16.652659340140023, + "learning_rate": 9.198670696444338e-08, + "logits/chosen": -0.6166589260101318, + "logits/rejected": -0.6604666709899902, + "logps/chosen": -641.2590942382812, + "logps/rejected": -832.0524291992188, + "loss": 0.3808, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.821630954742432, + "rewards/margins": 1.7895194292068481, + "rewards/rejected": -6.611149787902832, + "step": 192 + }, + { + "epoch": 0.8250066791343842, + "grad_norm": 14.1280180622041, + "learning_rate": 8.768874028992429e-08, + "logits/chosen": -0.6036140322685242, + "logits/rejected": -0.6334025859832764, + "logps/chosen": -613.3704223632812, + "logps/rejected": -792.6769409179688, + "loss": 0.3289, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.494506359100342, + "rewards/margins": 1.7201225757598877, + "rewards/rejected": -6.21462869644165, + "step": 193 + }, + { + "epoch": 0.8292813251402619, + "grad_norm": 13.810909832189019, + "learning_rate": 8.348393248087287e-08, + "logits/chosen": -0.5372692346572876, + "logits/rejected": -0.5193148255348206, + "logps/chosen": -559.5550537109375, + "logps/rejected": -737.1176147460938, + "loss": 0.3486, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.210740089416504, + "rewards/margins": 1.7630614042282104, + "rewards/rejected": -5.973801612854004, + "step": 194 + }, + { + "epoch": 0.8335559711461394, + "grad_norm": 13.364037119659574, + "learning_rate": 7.937323358440934e-08, + "logits/chosen": -0.7003932595252991, + "logits/rejected": -0.6555891633033752, + "logps/chosen": -687.826171875, + "logps/rejected": -845.4681396484375, + "loss": 0.3359, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.774288654327393, + "rewards/margins": 1.7430295944213867, + "rewards/rejected": -6.517318248748779, + "step": 195 + }, + { + "epoch": 0.8378306171520171, + "grad_norm": 14.21133162771402, + "learning_rate": 7.535757238439938e-08, + "logits/chosen": -0.7010968923568726, + "logits/rejected": -0.7519139647483826, + "logps/chosen": -618.968017578125, + "logps/rejected": -890.8580322265625, + "loss": 0.2995, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.3997673988342285, + "rewards/margins": 2.6948697566986084, + "rewards/rejected": -7.094637393951416, + "step": 196 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 14.29704216155569, + "learning_rate": 7.143785619160026e-08, + "logits/chosen": -0.8667165637016296, + "logits/rejected": -0.9397881031036377, + "logps/chosen": -635.5562133789062, + "logps/rejected": -869.5856323242188, + "loss": 0.2775, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.424742221832275, + "rewards/margins": 2.1135940551757812, + "rewards/rejected": -6.538336277008057, + "step": 197 + }, + { + "epoch": 0.8463799091637724, + "grad_norm": 15.830532425162781, + "learning_rate": 6.761497063866206e-08, + "logits/chosen": -0.715203583240509, + "logits/rejected": -0.7201322317123413, + "logps/chosen": -675.7356567382812, + "logps/rejected": -823.666748046875, + "loss": 0.3961, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.689385890960693, + "rewards/margins": 1.436130404472351, + "rewards/rejected": -6.125515937805176, + "step": 198 + }, + { + "epoch": 0.85065455516965, + "grad_norm": 12.71855284005859, + "learning_rate": 6.388977948002406e-08, + "logits/chosen": -0.6863987445831299, + "logits/rejected": -0.7026057839393616, + "logps/chosen": -635.3793334960938, + "logps/rejected": -810.3753051757812, + "loss": 0.3249, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.71897554397583, + "rewards/margins": 1.7087393999099731, + "rewards/rejected": -6.427714824676514, + "step": 199 + }, + { + "epoch": 0.8549292011755276, + "grad_norm": 14.663842598726813, + "learning_rate": 6.026312439675551e-08, + "logits/chosen": -0.6016858220100403, + "logits/rejected": -0.6423814296722412, + "logps/chosen": -533.045654296875, + "logps/rejected": -688.97705078125, + "loss": 0.343, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.7861201763153076, + "rewards/margins": 1.5125634670257568, + "rewards/rejected": -5.298683166503906, + "step": 200 + }, + { + "epoch": 0.8549292011755276, + "eval_logits/chosen": -0.5621978044509888, + "eval_logits/rejected": -0.5776455998420715, + "eval_logps/chosen": -698.124755859375, + "eval_logps/rejected": -878.510498046875, + "eval_loss": 0.3298446834087372, + "eval_rewards/accuracies": 0.8951612710952759, + "eval_rewards/chosen": -4.931028842926025, + "eval_rewards/margins": 1.8655366897583008, + "eval_rewards/rejected": -6.796565055847168, + "eval_runtime": 148.1942, + "eval_samples_per_second": 13.233, + "eval_steps_per_second": 0.418, + "step": 200 + }, + { + "epoch": 0.8592038471814053, + "grad_norm": 13.178173716498932, + "learning_rate": 5.6735824806383945e-08, + "logits/chosen": -0.8137510418891907, + "logits/rejected": -0.8618326783180237, + "logps/chosen": -762.1471557617188, + "logps/rejected": -987.831787109375, + "loss": 0.3117, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.3300957679748535, + "rewards/margins": 2.290207862854004, + "rewards/rejected": -7.620304107666016, + "step": 201 + }, + { + "epoch": 0.863478493187283, + "grad_norm": 15.015075721065077, + "learning_rate": 5.3308677677753324e-08, + "logits/chosen": -0.6007865071296692, + "logits/rejected": -0.6010035872459412, + "logps/chosen": -607.5897827148438, + "logps/rejected": -783.7905883789062, + "loss": 0.3751, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.329671382904053, + "rewards/margins": 1.7175862789154053, + "rewards/rejected": -6.047257423400879, + "step": 202 + }, + { + "epoch": 0.8677531391931605, + "grad_norm": 17.440815026489904, + "learning_rate": 4.9982457350954576e-08, + "logits/chosen": -0.6124709844589233, + "logits/rejected": -0.5998551249504089, + "logps/chosen": -768.2071533203125, + "logps/rejected": -909.81982421875, + "loss": 0.337, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.643802642822266, + "rewards/margins": 1.5375871658325195, + "rewards/rejected": -7.181390285491943, + "step": 203 + }, + { + "epoch": 0.8720277851990382, + "grad_norm": 13.117492290397454, + "learning_rate": 4.675791536236856e-08, + "logits/chosen": -0.6630779504776001, + "logits/rejected": -0.7016023397445679, + "logps/chosen": -576.6640014648438, + "logps/rejected": -781.410888671875, + "loss": 0.3565, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.391974449157715, + "rewards/margins": 2.0928616523742676, + "rewards/rejected": -6.484836101531982, + "step": 204 + }, + { + "epoch": 0.8763024312049158, + "grad_norm": 16.38485979196371, + "learning_rate": 4.3635780274861864e-08, + "logits/chosen": -0.6497581005096436, + "logits/rejected": -0.7152493596076965, + "logps/chosen": -604.6185302734375, + "logps/rejected": -769.5409545898438, + "loss": 0.3998, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.433528900146484, + "rewards/margins": 1.5042006969451904, + "rewards/rejected": -5.937729835510254, + "step": 205 + }, + { + "epoch": 0.8805770772107935, + "grad_norm": 19.27882004478496, + "learning_rate": 4.0616757513173115e-08, + "logits/chosen": -0.5923482775688171, + "logits/rejected": -0.6674529910087585, + "logps/chosen": -730.7793579101562, + "logps/rejected": -982.1051025390625, + "loss": 0.3504, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.950905799865723, + "rewards/margins": 2.3350894451141357, + "rewards/rejected": -7.2859954833984375, + "step": 206 + }, + { + "epoch": 0.8848517232166712, + "grad_norm": 12.70289041030948, + "learning_rate": 3.7701529204526846e-08, + "logits/chosen": -0.5751956105232239, + "logits/rejected": -0.6102803349494934, + "logps/chosen": -638.7919311523438, + "logps/rejected": -788.8345336914062, + "loss": 0.3242, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.849457740783691, + "rewards/margins": 1.458791971206665, + "rewards/rejected": -6.308249473571777, + "step": 207 + }, + { + "epoch": 0.8891263692225487, + "grad_norm": 16.266039867395605, + "learning_rate": 3.4890754024512246e-08, + "logits/chosen": -0.6625305414199829, + "logits/rejected": -0.6873234510421753, + "logps/chosen": -717.0994873046875, + "logps/rejected": -884.3046875, + "loss": 0.3447, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.226888179779053, + "rewards/margins": 1.590306043624878, + "rewards/rejected": -6.81719446182251, + "step": 208 + }, + { + "epoch": 0.8934010152284264, + "grad_norm": 14.509559337608318, + "learning_rate": 3.218506704825924e-08, + "logits/chosen": -0.5810579061508179, + "logits/rejected": -0.6108168363571167, + "logps/chosen": -675.04150390625, + "logps/rejected": -836.3502197265625, + "loss": 0.3439, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.011352062225342, + "rewards/margins": 1.6481198072433472, + "rewards/rejected": -6.6594719886779785, + "step": 209 + }, + { + "epoch": 0.897675661234304, + "grad_norm": 17.634651177078286, + "learning_rate": 2.958507960694784e-08, + "logits/chosen": -0.5604880452156067, + "logits/rejected": -0.5632505416870117, + "logps/chosen": -718.5660400390625, + "logps/rejected": -901.1178588867188, + "loss": 0.3646, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.241810321807861, + "rewards/margins": 1.8300410509109497, + "rewards/rejected": -7.071850776672363, + "step": 210 + }, + { + "epoch": 0.9019503072401817, + "grad_norm": 14.379481683528487, + "learning_rate": 2.7091379149682682e-08, + "logits/chosen": -0.7952392101287842, + "logits/rejected": -0.8281516432762146, + "logps/chosen": -715.3723754882812, + "logps/rejected": -877.9434814453125, + "loss": 0.3452, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.230836868286133, + "rewards/margins": 1.5704870223999023, + "rewards/rejected": -6.801323413848877, + "step": 211 + }, + { + "epoch": 0.9062249532460593, + "grad_norm": 12.38379892339324, + "learning_rate": 2.470452911076226e-08, + "logits/chosen": -0.6212865114212036, + "logits/rejected": -0.7052218317985535, + "logps/chosen": -568.6085205078125, + "logps/rejected": -806.6718139648438, + "loss": 0.3225, + "rewards/accuracies": 0.96875, + "rewards/chosen": -4.2210798263549805, + "rewards/margins": 2.2609493732452393, + "rewards/rejected": -6.482028484344482, + "step": 212 + }, + { + "epoch": 0.9104995992519369, + "grad_norm": 13.671352372902698, + "learning_rate": 2.2425068782375378e-08, + "logits/chosen": -0.708694577217102, + "logits/rejected": -0.7406002283096313, + "logps/chosen": -631.50537109375, + "logps/rejected": -812.782958984375, + "loss": 0.307, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.614077091217041, + "rewards/margins": 1.823103427886963, + "rewards/rejected": -6.4371795654296875, + "step": 213 + }, + { + "epoch": 0.9147742452578146, + "grad_norm": 14.646295067927662, + "learning_rate": 2.025351319275137e-08, + "logits/chosen": -0.6601104736328125, + "logits/rejected": -0.6835007667541504, + "logps/chosen": -590.8656005859375, + "logps/rejected": -773.0665283203125, + "loss": 0.3485, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.47109317779541, + "rewards/margins": 1.788784384727478, + "rewards/rejected": -6.259877681732178, + "step": 214 + }, + { + "epoch": 0.9190488912636923, + "grad_norm": 13.735401688183746, + "learning_rate": 1.8190352989793322e-08, + "logits/chosen": -0.6832427978515625, + "logits/rejected": -0.7862576246261597, + "logps/chosen": -725.4074096679688, + "logps/rejected": -984.6351318359375, + "loss": 0.3231, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.303395748138428, + "rewards/margins": 2.5184988975524902, + "rewards/rejected": -7.821893692016602, + "step": 215 + }, + { + "epoch": 0.9233235372695698, + "grad_norm": 15.601926902374625, + "learning_rate": 1.623605433021985e-08, + "logits/chosen": -0.7613773345947266, + "logits/rejected": -0.8268823623657227, + "logps/chosen": -661.9906005859375, + "logps/rejected": -919.4451904296875, + "loss": 0.3372, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.652050018310547, + "rewards/margins": 2.513514995574951, + "rewards/rejected": -7.1655659675598145, + "step": 216 + }, + { + "epoch": 0.9275981832754475, + "grad_norm": 16.26278938580055, + "learning_rate": 1.4391058774239629e-08, + "logits/chosen": -0.6215861439704895, + "logits/rejected": -0.684840977191925, + "logps/chosen": -776.4395751953125, + "logps/rejected": -996.853271484375, + "loss": 0.326, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.608438491821289, + "rewards/margins": 1.9786306619644165, + "rewards/rejected": -7.587069511413574, + "step": 217 + }, + { + "epoch": 0.9318728292813251, + "grad_norm": 14.96580166748689, + "learning_rate": 1.2655783185784252e-08, + "logits/chosen": -0.5001079440116882, + "logits/rejected": -0.5907378792762756, + "logps/chosen": -636.527099609375, + "logps/rejected": -864.0882568359375, + "loss": 0.3191, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.651758670806885, + "rewards/margins": 2.046351432800293, + "rewards/rejected": -6.698110103607178, + "step": 218 + }, + { + "epoch": 0.9361474752872028, + "grad_norm": 16.265287798595743, + "learning_rate": 1.1030619638320804e-08, + "logits/chosen": -0.7028571963310242, + "logits/rejected": -0.7375434041023254, + "logps/chosen": -659.0259399414062, + "logps/rejected": -862.6585693359375, + "loss": 0.4146, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.674835205078125, + "rewards/margins": 1.9501476287841797, + "rewards/rejected": -6.624982833862305, + "step": 219 + }, + { + "epoch": 0.9404221212930804, + "grad_norm": 18.704093009678243, + "learning_rate": 9.515935326265378e-09, + "logits/chosen": -0.6694950461387634, + "logits/rejected": -0.7062525749206543, + "logps/chosen": -723.8511962890625, + "logps/rejected": -943.264892578125, + "loss": 0.3529, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.229786396026611, + "rewards/margins": 2.252092123031616, + "rewards/rejected": -7.481878757476807, + "step": 220 + }, + { + "epoch": 0.944696767298958, + "grad_norm": 14.334384768178085, + "learning_rate": 8.11207248201834e-09, + "logits/chosen": -0.5764753818511963, + "logits/rejected": -0.5958765745162964, + "logps/chosen": -652.0119018554688, + "logps/rejected": -832.4238891601562, + "loss": 0.3477, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.68829870223999, + "rewards/margins": 1.8251862525939941, + "rewards/rejected": -6.513484001159668, + "step": 221 + }, + { + "epoch": 0.9489714133048357, + "grad_norm": 13.331601354010285, + "learning_rate": 6.819348298638839e-09, + "logits/chosen": -0.5833301544189453, + "logits/rejected": -0.5820556879043579, + "logps/chosen": -636.1702270507812, + "logps/rejected": -775.4622192382812, + "loss": 0.3313, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.623978137969971, + "rewards/margins": 1.5225858688354492, + "rewards/rejected": -6.14656400680542, + "step": 222 + }, + { + "epoch": 0.9532460593107134, + "grad_norm": 14.100864285899743, + "learning_rate": 5.638054858177643e-09, + "logits/chosen": -0.6460739374160767, + "logits/rejected": -0.704484760761261, + "logps/chosen": -666.4550170898438, + "logps/rejected": -879.8217163085938, + "loss": 0.3125, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.722194194793701, + "rewards/margins": 2.0787832736968994, + "rewards/rejected": -6.8009772300720215, + "step": 223 + }, + { + "epoch": 0.957520705316591, + "grad_norm": 14.340696291340606, + "learning_rate": 4.568459065683205e-09, + "logits/chosen": -0.6707419753074646, + "logits/rejected": -0.6990107893943787, + "logps/chosen": -594.0106201171875, + "logps/rejected": -786.5427856445312, + "loss": 0.2979, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.234139442443848, + "rewards/margins": 1.8917232751846313, + "rewards/rejected": -6.1258625984191895, + "step": 224 + }, + { + "epoch": 0.9617953513224686, + "grad_norm": 16.87207011175805, + "learning_rate": 3.6108025888958447e-09, + "logits/chosen": -0.6689302921295166, + "logits/rejected": -0.7221664786338806, + "logps/chosen": -647.2908935546875, + "logps/rejected": -820.8074340820312, + "loss": 0.3808, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.4107489585876465, + "rewards/margins": 1.711435317993164, + "rewards/rejected": -6.1221842765808105, + "step": 225 + }, + { + "epoch": 0.9660699973283462, + "grad_norm": 15.439317380628985, + "learning_rate": 2.7653018036454256e-09, + "logits/chosen": -0.7938471436500549, + "logits/rejected": -0.8257592916488647, + "logps/chosen": -685.37255859375, + "logps/rejected": -856.272705078125, + "loss": 0.376, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.837329864501953, + "rewards/margins": 1.7641983032226562, + "rewards/rejected": -6.601528167724609, + "step": 226 + }, + { + "epoch": 0.9703446433342239, + "grad_norm": 15.341084234652751, + "learning_rate": 2.0321477449619096e-09, + "logits/chosen": -0.6373786330223083, + "logits/rejected": -0.6358063220977783, + "logps/chosen": -671.7559814453125, + "logps/rejected": -808.6800537109375, + "loss": 0.3511, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.954955101013184, + "rewards/margins": 1.4025709629058838, + "rewards/rejected": -6.357525825500488, + "step": 227 + }, + { + "epoch": 0.9746192893401016, + "grad_norm": 16.08851675396365, + "learning_rate": 1.4115060639128818e-09, + "logits/chosen": -0.7105420827865601, + "logits/rejected": -0.7698283195495605, + "logps/chosen": -771.7847900390625, + "logps/rejected": -1010.12255859375, + "loss": 0.3947, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.364959716796875, + "rewards/margins": 2.1707944869995117, + "rewards/rejected": -7.535754203796387, + "step": 228 + }, + { + "epoch": 0.9788939353459791, + "grad_norm": 16.317073121348123, + "learning_rate": 9.035169901754902e-10, + "logits/chosen": -0.6789236664772034, + "logits/rejected": -0.718908429145813, + "logps/chosen": -675.6837158203125, + "logps/rejected": -947.82958984375, + "loss": 0.3487, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.9360833168029785, + "rewards/margins": 2.77742600440979, + "rewards/rejected": -7.713509559631348, + "step": 229 + }, + { + "epoch": 0.9831685813518568, + "grad_norm": 14.913455884524495, + "learning_rate": 5.082953003528456e-10, + "logits/chosen": -0.7133156061172485, + "logits/rejected": -0.7598533630371094, + "logps/chosen": -662.4967651367188, + "logps/rejected": -898.2874145507812, + "loss": 0.3301, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.673389911651611, + "rewards/margins": 2.1788995265960693, + "rewards/rejected": -6.852289199829102, + "step": 230 + }, + { + "epoch": 0.9874432273577345, + "grad_norm": 13.34444116932449, + "learning_rate": 2.2593029204076574e-10, + "logits/chosen": -0.7370655536651611, + "logits/rejected": -0.7865728139877319, + "logps/chosen": -603.37109375, + "logps/rejected": -828.8916015625, + "loss": 0.3555, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.3032660484313965, + "rewards/margins": 2.1432766914367676, + "rewards/rejected": -6.446542739868164, + "step": 231 + }, + { + "epoch": 0.9917178733636121, + "grad_norm": 14.986063948498115, + "learning_rate": 5.648576365169244e-11, + "logits/chosen": -0.7017238140106201, + "logits/rejected": -0.7418109178543091, + "logps/chosen": -690.6285400390625, + "logps/rejected": -863.64208984375, + "loss": 0.352, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.160223960876465, + "rewards/margins": 1.6970188617706299, + "rewards/rejected": -6.857243061065674, + "step": 232 + }, + { + "epoch": 0.9959925193694897, + "grad_norm": 14.725841841963547, + "learning_rate": 0.0, + "logits/chosen": -0.540886402130127, + "logits/rejected": -0.5910319089889526, + "logps/chosen": -539.7564697265625, + "logps/rejected": -689.2047119140625, + "loss": 0.3329, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.104672431945801, + "rewards/margins": 1.427870750427246, + "rewards/rejected": -5.532542705535889, + "step": 233 + }, + { + "epoch": 0.9959925193694897, + "step": 233, + "total_flos": 0.0, + "train_loss": 0.45488236134655996, + "train_runtime": 10577.9564, + "train_samples_per_second": 5.66, + "train_steps_per_second": 0.022 + } + ], + "logging_steps": 1, + "max_steps": 233, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}