{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1465, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00682681230532918, "grad_norm": 20.491548678692148, "learning_rate": 6.122448979591837e-08, "logits/chosen": 0.03672148287296295, "logits/rejected": 0.041521187871694565, "logps/chosen": -191.74862670898438, "logps/rejected": -189.4052276611328, "loss": 0.6921, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": 0.0014678842853754759, "rewards/margins": 0.0024292597081512213, "rewards/rejected": -0.0009613755391910672, "step": 10 }, { "epoch": 0.01365362461065836, "grad_norm": 21.860852469835415, "learning_rate": 1.2925170068027211e-07, "logits/chosen": 0.04523754119873047, "logits/rejected": 0.05510401353240013, "logps/chosen": -187.8703155517578, "logps/rejected": -187.6009979248047, "loss": 0.6937, "rewards/accuracies": 0.4859375059604645, "rewards/chosen": -0.0003124059294350445, "rewards/margins": -0.0007655444787815213, "rewards/rejected": 0.0004531386948656291, "step": 20 }, { "epoch": 0.02048043691598754, "grad_norm": 20.278529512570657, "learning_rate": 1.9727891156462583e-07, "logits/chosen": 0.020983930677175522, "logits/rejected": 0.04532231390476227, "logps/chosen": -185.85728454589844, "logps/rejected": -188.9866180419922, "loss": 0.6936, "rewards/accuracies": 0.46875, "rewards/chosen": 0.00064073596149683, "rewards/margins": -0.0005829028668813407, "rewards/rejected": 0.001223638653755188, "step": 30 }, { "epoch": 0.02730724922131672, "grad_norm": 19.626379046619967, "learning_rate": 2.653061224489796e-07, "logits/chosen": 0.03043345920741558, "logits/rejected": 0.032446593046188354, "logps/chosen": -193.6338653564453, "logps/rejected": -190.4232635498047, "loss": 0.6913, "rewards/accuracies": 0.53125, "rewards/chosen": 0.002521326532587409, "rewards/margins": 0.004052319563925266, "rewards/rejected": -0.0015309930313378572, "step": 40 }, { "epoch": 0.0341340615266459, "grad_norm": 21.08295374738999, "learning_rate": 3.333333333333333e-07, "logits/chosen": 0.04947035759687424, "logits/rejected": 0.06372452527284622, "logps/chosen": -188.39315795898438, "logps/rejected": -190.05992126464844, "loss": 0.6942, "rewards/accuracies": 0.46406251192092896, "rewards/chosen": 0.0021625806111842394, "rewards/margins": -0.0017312343697994947, "rewards/rejected": 0.003893814980983734, "step": 50 }, { "epoch": 0.04096087383197508, "grad_norm": 20.25039554823623, "learning_rate": 4.0136054421768705e-07, "logits/chosen": 0.053825099021196365, "logits/rejected": 0.0521962009370327, "logps/chosen": -189.28480529785156, "logps/rejected": -184.31430053710938, "loss": 0.6937, "rewards/accuracies": 0.5015624761581421, "rewards/chosen": 0.004196351859718561, "rewards/margins": -0.0006979627651162446, "rewards/rejected": 0.0048943147994577885, "step": 60 }, { "epoch": 0.04778768613730426, "grad_norm": 22.505298366939336, "learning_rate": 4.693877551020408e-07, "logits/chosen": 0.03855639323592186, "logits/rejected": 0.041457682847976685, "logps/chosen": -189.49111938476562, "logps/rejected": -190.42034912109375, "loss": 0.6933, "rewards/accuracies": 0.4937499761581421, "rewards/chosen": 0.008006598800420761, "rewards/margins": 4.7756126150488853e-05, "rewards/rejected": 0.007958842441439629, "step": 70 }, { "epoch": 0.05461449844263344, "grad_norm": 19.99809543741437, "learning_rate": 5.374149659863945e-07, "logits/chosen": 0.026321567595005035, "logits/rejected": 0.013571225106716156, "logps/chosen": -189.8534393310547, "logps/rejected": -187.626708984375, "loss": 0.6878, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": 0.018732454627752304, "rewards/margins": 0.011271494440734386, "rewards/rejected": 0.007460957858711481, "step": 80 }, { "epoch": 0.06144131074796262, "grad_norm": 22.176568391543768, "learning_rate": 6.054421768707482e-07, "logits/chosen": 0.020383019000291824, "logits/rejected": 0.02592673897743225, "logps/chosen": -186.662841796875, "logps/rejected": -189.3004608154297, "loss": 0.6876, "rewards/accuracies": 0.582812488079071, "rewards/chosen": 0.027650414034724236, "rewards/margins": 0.011809633113443851, "rewards/rejected": 0.01584078185260296, "step": 90 }, { "epoch": 0.0682681230532918, "grad_norm": 20.53234701755388, "learning_rate": 6.734693877551019e-07, "logits/chosen": 0.02966993674635887, "logits/rejected": 0.05219441279768944, "logps/chosen": -190.25782775878906, "logps/rejected": -189.80935668945312, "loss": 0.6858, "rewards/accuracies": 0.5718749761581421, "rewards/chosen": 0.040990687906742096, "rewards/margins": 0.01583397202193737, "rewards/rejected": 0.025156717747449875, "step": 100 }, { "epoch": 0.07509493535862098, "grad_norm": 21.19602898096358, "learning_rate": 7.414965986394558e-07, "logits/chosen": -0.007384412921965122, "logits/rejected": -0.016086794435977936, "logps/chosen": -189.52395629882812, "logps/rejected": -192.64816284179688, "loss": 0.6817, "rewards/accuracies": 0.625, "rewards/chosen": 0.05572628974914551, "rewards/margins": 0.024477079510688782, "rewards/rejected": 0.031249215826392174, "step": 110 }, { "epoch": 0.08192174766395016, "grad_norm": 20.08862529877448, "learning_rate": 8.095238095238095e-07, "logits/chosen": -0.04889947175979614, "logits/rejected": -0.049361489713191986, "logps/chosen": -197.39492797851562, "logps/rejected": -192.8791046142578, "loss": 0.6828, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": 0.059998854994773865, "rewards/margins": 0.023517701774835587, "rewards/rejected": 0.03648114949464798, "step": 120 }, { "epoch": 0.08874855996927934, "grad_norm": 19.78186965312465, "learning_rate": 8.775510204081632e-07, "logits/chosen": -0.022162066772580147, "logits/rejected": -0.02603471651673317, "logps/chosen": -192.2538604736328, "logps/rejected": -190.6973876953125, "loss": 0.6782, "rewards/accuracies": 0.651562511920929, "rewards/chosen": 0.07047584652900696, "rewards/margins": 0.03453099727630615, "rewards/rejected": 0.035944852977991104, "step": 130 }, { "epoch": 0.09557537227460852, "grad_norm": 21.72668562860521, "learning_rate": 9.45578231292517e-07, "logits/chosen": -0.028122998774051666, "logits/rejected": -0.0023567965254187584, "logps/chosen": -193.58602905273438, "logps/rejected": -189.49517822265625, "loss": 0.6721, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.08927410840988159, "rewards/margins": 0.04811044782400131, "rewards/rejected": 0.04116365686058998, "step": 140 }, { "epoch": 0.1024021845799377, "grad_norm": 20.630914226397604, "learning_rate": 9.98482549317147e-07, "logits/chosen": -0.07732997089624405, "logits/rejected": -0.08366119861602783, "logps/chosen": -203.80441284179688, "logps/rejected": -202.51812744140625, "loss": 0.666, "rewards/accuracies": 0.6500000357627869, "rewards/chosen": 0.10389578342437744, "rewards/margins": 0.06255247443914413, "rewards/rejected": 0.04134330898523331, "step": 150 }, { "epoch": 0.10922899688526688, "grad_norm": 20.25669433495337, "learning_rate": 9.908952959028832e-07, "logits/chosen": -0.09441889822483063, "logits/rejected": -0.08870529383420944, "logps/chosen": -185.63307189941406, "logps/rejected": -186.53253173828125, "loss": 0.6654, "rewards/accuracies": 0.6500000357627869, "rewards/chosen": 0.08515263348817825, "rewards/margins": 0.0660884901881218, "rewards/rejected": 0.019064147025346756, "step": 160 }, { "epoch": 0.11605580919059606, "grad_norm": 20.384593980794733, "learning_rate": 9.833080424886191e-07, "logits/chosen": -0.08715031296014786, "logits/rejected": -0.05636933073401451, "logps/chosen": -188.3374481201172, "logps/rejected": -190.37437438964844, "loss": 0.659, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": 0.04690036177635193, "rewards/margins": 0.08634677529335022, "rewards/rejected": -0.03944641351699829, "step": 170 }, { "epoch": 0.12288262149592524, "grad_norm": 21.86056528276187, "learning_rate": 9.75720789074355e-07, "logits/chosen": -0.07912790030241013, "logits/rejected": -0.07271625846624374, "logps/chosen": -197.11959838867188, "logps/rejected": -197.41287231445312, "loss": 0.6528, "rewards/accuracies": 0.6749999523162842, "rewards/chosen": 0.04622086510062218, "rewards/margins": 0.10496747493743896, "rewards/rejected": -0.058746613562107086, "step": 180 }, { "epoch": 0.12970943380125444, "grad_norm": 22.24802422589698, "learning_rate": 9.68133535660091e-07, "logits/chosen": -0.07506565004587173, "logits/rejected": -0.05108420550823212, "logps/chosen": -190.35340881347656, "logps/rejected": -195.009521484375, "loss": 0.6441, "rewards/accuracies": 0.6812500357627869, "rewards/chosen": 0.052541881799697876, "rewards/margins": 0.12386594712734222, "rewards/rejected": -0.07132406532764435, "step": 190 }, { "epoch": 0.1365362461065836, "grad_norm": 22.419822765649933, "learning_rate": 9.60546282245827e-07, "logits/chosen": -0.11874101310968399, "logits/rejected": -0.08336825668811798, "logps/chosen": -193.62611389160156, "logps/rejected": -196.01084899902344, "loss": 0.6249, "rewards/accuracies": 0.7046875357627869, "rewards/chosen": 0.03949081152677536, "rewards/margins": 0.17370560765266418, "rewards/rejected": -0.13421478867530823, "step": 200 }, { "epoch": 0.1433630584119128, "grad_norm": 22.915739502006815, "learning_rate": 9.52959028831563e-07, "logits/chosen": -0.17365601658821106, "logits/rejected": -0.15520283579826355, "logps/chosen": -203.1890869140625, "logps/rejected": -200.14974975585938, "loss": 0.6287, "rewards/accuracies": 0.6687500476837158, "rewards/chosen": -0.01979774236679077, "rewards/margins": 0.18479280173778534, "rewards/rejected": -0.2045905441045761, "step": 210 }, { "epoch": 0.15018987071724196, "grad_norm": 20.769969852017695, "learning_rate": 9.453717754172988e-07, "logits/chosen": -0.1847243756055832, "logits/rejected": -0.15192236006259918, "logps/chosen": -198.33010864257812, "logps/rejected": -200.56228637695312, "loss": 0.6015, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.03269830346107483, "rewards/margins": 0.25339096784591675, "rewards/rejected": -0.2860892415046692, "step": 220 }, { "epoch": 0.15701668302257116, "grad_norm": 21.597574913870996, "learning_rate": 9.377845220030348e-07, "logits/chosen": -0.21274694800376892, "logits/rejected": -0.19206659495830536, "logps/chosen": -197.59228515625, "logps/rejected": -200.42283630371094, "loss": 0.611, "rewards/accuracies": 0.6734374761581421, "rewards/chosen": -0.09015801548957825, "rewards/margins": 0.24926723539829254, "rewards/rejected": -0.3394252359867096, "step": 230 }, { "epoch": 0.16384349532790032, "grad_norm": 24.09497342960952, "learning_rate": 9.301972685887707e-07, "logits/chosen": -0.2293986827135086, "logits/rejected": -0.19997453689575195, "logps/chosen": -191.1751251220703, "logps/rejected": -196.63511657714844, "loss": 0.6125, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -0.15094764530658722, "rewards/margins": 0.24523335695266724, "rewards/rejected": -0.39618098735809326, "step": 240 }, { "epoch": 0.17067030763322952, "grad_norm": 22.186402685803138, "learning_rate": 9.226100151745068e-07, "logits/chosen": -0.23599499464035034, "logits/rejected": -0.20987126231193542, "logps/chosen": -191.61639404296875, "logps/rejected": -197.80091857910156, "loss": 0.6205, "rewards/accuracies": 0.6546875238418579, "rewards/chosen": -0.22373469173908234, "rewards/margins": 0.2635762691497803, "rewards/rejected": -0.4873109459877014, "step": 250 }, { "epoch": 0.17749711993855868, "grad_norm": 23.30196457741843, "learning_rate": 9.150227617602428e-07, "logits/chosen": -0.2195354700088501, "logits/rejected": -0.19019638001918793, "logps/chosen": -190.50746154785156, "logps/rejected": -195.74331665039062, "loss": 0.6056, "rewards/accuracies": 0.7046875357627869, "rewards/chosen": -0.2523514926433563, "rewards/margins": 0.29894089698791504, "rewards/rejected": -0.5512923002243042, "step": 260 }, { "epoch": 0.18432393224388788, "grad_norm": 23.437160399579792, "learning_rate": 9.074355083459787e-07, "logits/chosen": -0.2144363671541214, "logits/rejected": -0.19538246095180511, "logps/chosen": -194.883056640625, "logps/rejected": -202.83575439453125, "loss": 0.595, "rewards/accuracies": 0.7078125476837158, "rewards/chosen": -0.27382633090019226, "rewards/margins": 0.3095867931842804, "rewards/rejected": -0.5834130644798279, "step": 270 }, { "epoch": 0.19115074454921704, "grad_norm": 23.67928529051871, "learning_rate": 8.998482549317147e-07, "logits/chosen": -0.2671777606010437, "logits/rejected": -0.23835715651512146, "logps/chosen": -189.7034912109375, "logps/rejected": -194.55117797851562, "loss": 0.589, "rewards/accuracies": 0.7000000476837158, "rewards/chosen": -0.2815781235694885, "rewards/margins": 0.34006255865097046, "rewards/rejected": -0.621640682220459, "step": 280 }, { "epoch": 0.19797755685454624, "grad_norm": 26.3785919721159, "learning_rate": 8.922610015174506e-07, "logits/chosen": -0.2851921319961548, "logits/rejected": -0.2668570280075073, "logps/chosen": -202.77801513671875, "logps/rejected": -207.8894805908203, "loss": 0.59, "rewards/accuracies": 0.7046875357627869, "rewards/chosen": -0.33676964044570923, "rewards/margins": 0.35969871282577515, "rewards/rejected": -0.6964683532714844, "step": 290 }, { "epoch": 0.2048043691598754, "grad_norm": 23.715391013722297, "learning_rate": 8.846737481031866e-07, "logits/chosen": -0.2776036262512207, "logits/rejected": -0.24332435429096222, "logps/chosen": -201.10296630859375, "logps/rejected": -203.72195434570312, "loss": 0.6111, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -0.3781723380088806, "rewards/margins": 0.3227519989013672, "rewards/rejected": -0.700924277305603, "step": 300 }, { "epoch": 0.2116311814652046, "grad_norm": 21.57268816738927, "learning_rate": 8.770864946889226e-07, "logits/chosen": -0.29242080450057983, "logits/rejected": -0.2669425308704376, "logps/chosen": -204.4817352294922, "logps/rejected": -214.0943603515625, "loss": 0.5794, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.3647349178791046, "rewards/margins": 0.4395143985748291, "rewards/rejected": -0.8042493462562561, "step": 310 }, { "epoch": 0.21845799377053376, "grad_norm": 25.227342019618998, "learning_rate": 8.694992412746586e-07, "logits/chosen": -0.27386438846588135, "logits/rejected": -0.2711098790168762, "logps/chosen": -198.40101623535156, "logps/rejected": -204.6220703125, "loss": 0.5727, "rewards/accuracies": 0.7281250357627869, "rewards/chosen": -0.3862449824810028, "rewards/margins": 0.41143903136253357, "rewards/rejected": -0.7976840734481812, "step": 320 }, { "epoch": 0.22528480607586296, "grad_norm": 24.00522520700325, "learning_rate": 8.619119878603945e-07, "logits/chosen": -0.3334537744522095, "logits/rejected": -0.3187546730041504, "logps/chosen": -208.01986694335938, "logps/rejected": -212.91488647460938, "loss": 0.5913, "rewards/accuracies": 0.6703125238418579, "rewards/chosen": -0.4798099398612976, "rewards/margins": 0.37955817580223083, "rewards/rejected": -0.8593681454658508, "step": 330 }, { "epoch": 0.23211161838119213, "grad_norm": 23.49360024665317, "learning_rate": 8.543247344461305e-07, "logits/chosen": -0.30438894033432007, "logits/rejected": -0.28073978424072266, "logps/chosen": -203.7110595703125, "logps/rejected": -211.83615112304688, "loss": 0.56, "rewards/accuracies": 0.7328125238418579, "rewards/chosen": -0.3902357518672943, "rewards/margins": 0.5086088180541992, "rewards/rejected": -0.8988445401191711, "step": 340 }, { "epoch": 0.23893843068652132, "grad_norm": 23.086500001623612, "learning_rate": 8.467374810318663e-07, "logits/chosen": -0.3257724940776825, "logits/rejected": -0.2853447198867798, "logps/chosen": -204.09765625, "logps/rejected": -212.38494873046875, "loss": 0.5515, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.4673992991447449, "rewards/margins": 0.5267953872680664, "rewards/rejected": -0.9941946864128113, "step": 350 }, { "epoch": 0.24576524299185049, "grad_norm": 24.60129579583855, "learning_rate": 8.391502276176023e-07, "logits/chosen": -0.3029869794845581, "logits/rejected": -0.2718327045440674, "logps/chosen": -196.5174560546875, "logps/rejected": -204.4929656982422, "loss": 0.5809, "rewards/accuracies": 0.7046875357627869, "rewards/chosen": -0.4800136089324951, "rewards/margins": 0.43177759647369385, "rewards/rejected": -0.9117912650108337, "step": 360 }, { "epoch": 0.25259205529717965, "grad_norm": 23.03353178121409, "learning_rate": 8.315629742033384e-07, "logits/chosen": -0.28175657987594604, "logits/rejected": -0.2525416612625122, "logps/chosen": -197.58517456054688, "logps/rejected": -210.83853149414062, "loss": 0.5675, "rewards/accuracies": 0.7234375476837158, "rewards/chosen": -0.5489044785499573, "rewards/margins": 0.4759043753147125, "rewards/rejected": -1.0248088836669922, "step": 370 }, { "epoch": 0.2594188676025089, "grad_norm": 21.702116754195792, "learning_rate": 8.239757207890743e-07, "logits/chosen": -0.3090224266052246, "logits/rejected": -0.2872709333896637, "logps/chosen": -204.044921875, "logps/rejected": -214.3769989013672, "loss": 0.5414, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.501671552658081, "rewards/margins": 0.5782625675201416, "rewards/rejected": -1.0799341201782227, "step": 380 }, { "epoch": 0.26624567990783804, "grad_norm": 22.690534272455945, "learning_rate": 8.163884673748103e-07, "logits/chosen": -0.2652078866958618, "logits/rejected": -0.22916777431964874, "logps/chosen": -206.28855895996094, "logps/rejected": -217.3023681640625, "loss": 0.532, "rewards/accuracies": 0.7343750596046448, "rewards/chosen": -0.47486239671707153, "rewards/margins": 0.6135950684547424, "rewards/rejected": -1.088457465171814, "step": 390 }, { "epoch": 0.2730724922131672, "grad_norm": 24.587498727216616, "learning_rate": 8.088012139605462e-07, "logits/chosen": -0.28489071130752563, "logits/rejected": -0.23875750601291656, "logps/chosen": -202.77565002441406, "logps/rejected": -216.6030731201172, "loss": 0.5272, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.5735213756561279, "rewards/margins": 0.6210550665855408, "rewards/rejected": -1.194576382637024, "step": 400 }, { "epoch": 0.2798993045184964, "grad_norm": 24.707605897401567, "learning_rate": 8.012139605462822e-07, "logits/chosen": -0.3593894839286804, "logits/rejected": -0.3138624429702759, "logps/chosen": -202.06204223632812, "logps/rejected": -208.73065185546875, "loss": 0.5575, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -0.604642391204834, "rewards/margins": 0.5522481203079224, "rewards/rejected": -1.156890630722046, "step": 410 }, { "epoch": 0.2867261168238256, "grad_norm": 24.754070000277498, "learning_rate": 7.936267071320181e-07, "logits/chosen": -0.3502323627471924, "logits/rejected": -0.3173756003379822, "logps/chosen": -207.6633758544922, "logps/rejected": -216.3917236328125, "loss": 0.5265, "rewards/accuracies": 0.7484375238418579, "rewards/chosen": -0.6551162004470825, "rewards/margins": 0.6169639229774475, "rewards/rejected": -1.2720801830291748, "step": 420 }, { "epoch": 0.29355292912915476, "grad_norm": 23.564476771066985, "learning_rate": 7.860394537177542e-07, "logits/chosen": -0.3500007092952728, "logits/rejected": -0.32545575499534607, "logps/chosen": -211.29928588867188, "logps/rejected": -227.12037658691406, "loss": 0.5223, "rewards/accuracies": 0.7421875596046448, "rewards/chosen": -0.7528213262557983, "rewards/margins": 0.739406943321228, "rewards/rejected": -1.492228388786316, "step": 430 }, { "epoch": 0.3003797414344839, "grad_norm": 21.091018091079327, "learning_rate": 7.784522003034901e-07, "logits/chosen": -0.35516998171806335, "logits/rejected": -0.3074837327003479, "logps/chosen": -203.1188507080078, "logps/rejected": -212.15496826171875, "loss": 0.5055, "rewards/accuracies": 0.7765625715255737, "rewards/chosen": -0.6801650524139404, "rewards/margins": 0.7159599661827087, "rewards/rejected": -1.396125078201294, "step": 440 }, { "epoch": 0.3072065537398131, "grad_norm": 30.178688833532316, "learning_rate": 7.708649468892261e-07, "logits/chosen": -0.3771928548812866, "logits/rejected": -0.34754854440689087, "logps/chosen": -208.95216369628906, "logps/rejected": -225.38938903808594, "loss": 0.5226, "rewards/accuracies": 0.746874988079071, "rewards/chosen": -0.7856850624084473, "rewards/margins": 0.6984450817108154, "rewards/rejected": -1.4841301441192627, "step": 450 }, { "epoch": 0.3140333660451423, "grad_norm": 22.73508892423378, "learning_rate": 7.632776934749621e-07, "logits/chosen": -0.40090760588645935, "logits/rejected": -0.3806273937225342, "logps/chosen": -208.29766845703125, "logps/rejected": -223.73020935058594, "loss": 0.5013, "rewards/accuracies": 0.765625, "rewards/chosen": -0.7431963086128235, "rewards/margins": 0.8224382400512695, "rewards/rejected": -1.5656344890594482, "step": 460 }, { "epoch": 0.3208601783504715, "grad_norm": 24.65367082247547, "learning_rate": 7.55690440060698e-07, "logits/chosen": -0.41392359137535095, "logits/rejected": -0.3990693688392639, "logps/chosen": -211.69845581054688, "logps/rejected": -222.681884765625, "loss": 0.4896, "rewards/accuracies": 0.7671874761581421, "rewards/chosen": -0.7812504768371582, "rewards/margins": 0.8228715062141418, "rewards/rejected": -1.6041220426559448, "step": 470 }, { "epoch": 0.32768699065580065, "grad_norm": 26.060565630616303, "learning_rate": 7.481031866464339e-07, "logits/chosen": -0.4470677673816681, "logits/rejected": -0.4043146073818207, "logps/chosen": -201.87158203125, "logps/rejected": -216.65240478515625, "loss": 0.5178, "rewards/accuracies": 0.7406250238418579, "rewards/chosen": -0.8349807858467102, "rewards/margins": 0.7298619151115417, "rewards/rejected": -1.564842700958252, "step": 480 }, { "epoch": 0.3345138029611298, "grad_norm": 24.867787006387463, "learning_rate": 7.405159332321699e-07, "logits/chosen": -0.4602758288383484, "logits/rejected": -0.4031441807746887, "logps/chosen": -215.20541381835938, "logps/rejected": -234.6583251953125, "loss": 0.5155, "rewards/accuracies": 0.7484375238418579, "rewards/chosen": -0.9265861511230469, "rewards/margins": 0.8055697679519653, "rewards/rejected": -1.7321559190750122, "step": 490 }, { "epoch": 0.34134061526645904, "grad_norm": 32.86790243336268, "learning_rate": 7.329286798179059e-07, "logits/chosen": -0.4144153594970703, "logits/rejected": -0.3892706036567688, "logps/chosen": -216.45887756347656, "logps/rejected": -225.97056579589844, "loss": 0.5274, "rewards/accuracies": 0.7328125238418579, "rewards/chosen": -0.9314414262771606, "rewards/margins": 0.7752954363822937, "rewards/rejected": -1.7067368030548096, "step": 500 }, { "epoch": 0.3481674275717882, "grad_norm": 29.0406209714796, "learning_rate": 7.253414264036418e-07, "logits/chosen": -0.4518946707248688, "logits/rejected": -0.4360005855560303, "logps/chosen": -210.40875244140625, "logps/rejected": -227.6586456298828, "loss": 0.4918, "rewards/accuracies": 0.7640624642372131, "rewards/chosen": -0.7644888162612915, "rewards/margins": 0.8264600038528442, "rewards/rejected": -1.5909489393234253, "step": 510 }, { "epoch": 0.35499423987711737, "grad_norm": 29.792037648827193, "learning_rate": 7.177541729893778e-07, "logits/chosen": -0.46055272221565247, "logits/rejected": -0.41955289244651794, "logps/chosen": -203.9451904296875, "logps/rejected": -225.48402404785156, "loss": 0.5137, "rewards/accuracies": 0.75, "rewards/chosen": -0.7920026779174805, "rewards/margins": 0.7750235795974731, "rewards/rejected": -1.5670262575149536, "step": 520 }, { "epoch": 0.36182105218244653, "grad_norm": 28.48324275582042, "learning_rate": 7.101669195751137e-07, "logits/chosen": -0.44266417622566223, "logits/rejected": -0.4136849045753479, "logps/chosen": -217.11045837402344, "logps/rejected": -232.384521484375, "loss": 0.5059, "rewards/accuracies": 0.7500000596046448, "rewards/chosen": -0.9079422950744629, "rewards/margins": 0.8452929258346558, "rewards/rejected": -1.7532353401184082, "step": 530 }, { "epoch": 0.36864786448777576, "grad_norm": 24.346858846505146, "learning_rate": 7.025796661608497e-07, "logits/chosen": -0.4453073740005493, "logits/rejected": -0.39773428440093994, "logps/chosen": -199.64686584472656, "logps/rejected": -217.36294555664062, "loss": 0.5282, "rewards/accuracies": 0.745312511920929, "rewards/chosen": -0.7881425023078918, "rewards/margins": 0.745051383972168, "rewards/rejected": -1.533193826675415, "step": 540 }, { "epoch": 0.3754746767931049, "grad_norm": 23.88017645464549, "learning_rate": 6.949924127465857e-07, "logits/chosen": -0.4227825701236725, "logits/rejected": -0.3899107873439789, "logps/chosen": -218.3785400390625, "logps/rejected": -230.14222717285156, "loss": 0.5021, "rewards/accuracies": 0.7703125476837158, "rewards/chosen": -0.704402506351471, "rewards/margins": 0.8275265693664551, "rewards/rejected": -1.5319291353225708, "step": 550 }, { "epoch": 0.3823014890984341, "grad_norm": 23.672046628232867, "learning_rate": 6.874051593323217e-07, "logits/chosen": -0.42757853865623474, "logits/rejected": -0.394180566072464, "logps/chosen": -208.079345703125, "logps/rejected": -228.22598266601562, "loss": 0.4667, "rewards/accuracies": 0.770312488079071, "rewards/chosen": -0.8188365697860718, "rewards/margins": 0.9388971328735352, "rewards/rejected": -1.7577338218688965, "step": 560 }, { "epoch": 0.38912830140376325, "grad_norm": 27.539677366232738, "learning_rate": 6.798179059180577e-07, "logits/chosen": -0.4404156506061554, "logits/rejected": -0.3975413739681244, "logps/chosen": -208.03125, "logps/rejected": -224.20956420898438, "loss": 0.5004, "rewards/accuracies": 0.7593750357627869, "rewards/chosen": -0.8374041318893433, "rewards/margins": 0.7886074781417847, "rewards/rejected": -1.6260114908218384, "step": 570 }, { "epoch": 0.3959551137090925, "grad_norm": 25.29375987198196, "learning_rate": 6.722306525037936e-07, "logits/chosen": -0.4404994249343872, "logits/rejected": -0.40123340487480164, "logps/chosen": -213.8634490966797, "logps/rejected": -234.7059326171875, "loss": 0.497, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.8731653094291687, "rewards/margins": 0.9025252461433411, "rewards/rejected": -1.7756905555725098, "step": 580 }, { "epoch": 0.40278192601442164, "grad_norm": 23.196272876570017, "learning_rate": 6.646433990895296e-07, "logits/chosen": -0.42805609107017517, "logits/rejected": -0.3933747410774231, "logps/chosen": -210.49766540527344, "logps/rejected": -230.8019256591797, "loss": 0.472, "rewards/accuracies": 0.7718750238418579, "rewards/chosen": -0.9004274606704712, "rewards/margins": 0.9456923604011536, "rewards/rejected": -1.84611976146698, "step": 590 }, { "epoch": 0.4096087383197508, "grad_norm": 26.112729497646914, "learning_rate": 6.570561456752655e-07, "logits/chosen": -0.419676810503006, "logits/rejected": -0.3932231068611145, "logps/chosen": -212.6820831298828, "logps/rejected": -230.1705322265625, "loss": 0.4551, "rewards/accuracies": 0.776562511920929, "rewards/chosen": -0.9198075532913208, "rewards/margins": 1.0145457983016968, "rewards/rejected": -1.9343533515930176, "step": 600 }, { "epoch": 0.41643555062508, "grad_norm": 26.539025702964505, "learning_rate": 6.494688922610015e-07, "logits/chosen": -0.47971057891845703, "logits/rejected": -0.43692541122436523, "logps/chosen": -212.31594848632812, "logps/rejected": -234.5380859375, "loss": 0.4563, "rewards/accuracies": 0.770312488079071, "rewards/chosen": -1.0747839212417603, "rewards/margins": 1.078429937362671, "rewards/rejected": -2.1532137393951416, "step": 610 }, { "epoch": 0.4232623629304092, "grad_norm": 26.2859842178028, "learning_rate": 6.418816388467374e-07, "logits/chosen": -0.4652007818222046, "logits/rejected": -0.4464990496635437, "logps/chosen": -212.9930419921875, "logps/rejected": -230.19207763671875, "loss": 0.4778, "rewards/accuracies": 0.7906250357627869, "rewards/chosen": -1.1166890859603882, "rewards/margins": 0.9617180228233337, "rewards/rejected": -2.0784072875976562, "step": 620 }, { "epoch": 0.43008917523573836, "grad_norm": 27.943160005363282, "learning_rate": 6.342943854324734e-07, "logits/chosen": -0.507358968257904, "logits/rejected": -0.46083295345306396, "logps/chosen": -211.0389404296875, "logps/rejected": -234.06576538085938, "loss": 0.4689, "rewards/accuracies": 0.78125, "rewards/chosen": -1.125166654586792, "rewards/margins": 1.1086124181747437, "rewards/rejected": -2.233778953552246, "step": 630 }, { "epoch": 0.43691598754106753, "grad_norm": 27.031702699703523, "learning_rate": 6.267071320182093e-07, "logits/chosen": -0.5109987854957581, "logits/rejected": -0.4727884531021118, "logps/chosen": -216.13302612304688, "logps/rejected": -241.88287353515625, "loss": 0.4635, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.1550945043563843, "rewards/margins": 1.1085400581359863, "rewards/rejected": -2.263634443283081, "step": 640 }, { "epoch": 0.4437427998463967, "grad_norm": 26.49416191451856, "learning_rate": 6.191198786039453e-07, "logits/chosen": -0.5049822330474854, "logits/rejected": -0.46804797649383545, "logps/chosen": -220.15802001953125, "logps/rejected": -241.11386108398438, "loss": 0.4646, "rewards/accuracies": 0.770312488079071, "rewards/chosen": -1.1580806970596313, "rewards/margins": 1.064436435699463, "rewards/rejected": -2.222517490386963, "step": 650 }, { "epoch": 0.4505696121517259, "grad_norm": 28.052993928802096, "learning_rate": 6.115326251896813e-07, "logits/chosen": -0.5224714875221252, "logits/rejected": -0.496852308511734, "logps/chosen": -217.48992919921875, "logps/rejected": -234.48318481445312, "loss": 0.5188, "rewards/accuracies": 0.7671874761581421, "rewards/chosen": -1.1128088235855103, "rewards/margins": 0.9438337087631226, "rewards/rejected": -2.056642532348633, "step": 660 }, { "epoch": 0.4573964244570551, "grad_norm": 32.11947138128127, "learning_rate": 6.039453717754173e-07, "logits/chosen": -0.4993141293525696, "logits/rejected": -0.4682856798171997, "logps/chosen": -206.40176391601562, "logps/rejected": -231.08042907714844, "loss": 0.4953, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.0732593536376953, "rewards/margins": 1.1166470050811768, "rewards/rejected": -2.189906358718872, "step": 670 }, { "epoch": 0.46422323676238425, "grad_norm": 24.595239877463356, "learning_rate": 5.963581183611533e-07, "logits/chosen": -0.5414324998855591, "logits/rejected": -0.5145028233528137, "logps/chosen": -219.66567993164062, "logps/rejected": -236.0765380859375, "loss": 0.4608, "rewards/accuracies": 0.7781250476837158, "rewards/chosen": -0.9715930819511414, "rewards/margins": 1.0554088354110718, "rewards/rejected": -2.0270018577575684, "step": 680 }, { "epoch": 0.47105004906771347, "grad_norm": 27.819824043736283, "learning_rate": 5.887708649468892e-07, "logits/chosen": -0.482106477022171, "logits/rejected": -0.43574321269989014, "logps/chosen": -211.92596435546875, "logps/rejected": -234.6639862060547, "loss": 0.4352, "rewards/accuracies": 0.792187511920929, "rewards/chosen": -0.9926649332046509, "rewards/margins": 1.141036033630371, "rewards/rejected": -2.1337008476257324, "step": 690 }, { "epoch": 0.47787686137304264, "grad_norm": 27.82950606174818, "learning_rate": 5.811836115326252e-07, "logits/chosen": -0.491192102432251, "logits/rejected": -0.45507892966270447, "logps/chosen": -215.52423095703125, "logps/rejected": -239.1810302734375, "loss": 0.4534, "rewards/accuracies": 0.7812500596046448, "rewards/chosen": -1.0811206102371216, "rewards/margins": 1.171852469444275, "rewards/rejected": -2.2529730796813965, "step": 700 }, { "epoch": 0.4847036736783718, "grad_norm": 32.40109215164061, "learning_rate": 5.735963581183611e-07, "logits/chosen": -0.48725226521492004, "logits/rejected": -0.4451846480369568, "logps/chosen": -211.22933959960938, "logps/rejected": -236.77740478515625, "loss": 0.4487, "rewards/accuracies": 0.7828124761581421, "rewards/chosen": -1.0895929336547852, "rewards/margins": 1.1770341396331787, "rewards/rejected": -2.2666268348693848, "step": 710 }, { "epoch": 0.49153048598370097, "grad_norm": 27.259651037643604, "learning_rate": 5.660091047040971e-07, "logits/chosen": -0.5053711533546448, "logits/rejected": -0.4444194436073303, "logps/chosen": -205.80319213867188, "logps/rejected": -230.7117919921875, "loss": 0.4743, "rewards/accuracies": 0.770312488079071, "rewards/chosen": -1.1533528566360474, "rewards/margins": 1.07535982131958, "rewards/rejected": -2.228712797164917, "step": 720 }, { "epoch": 0.4983572982890302, "grad_norm": 23.45407239305211, "learning_rate": 5.584218512898331e-07, "logits/chosen": -0.46755921840667725, "logits/rejected": -0.41828638315200806, "logps/chosen": -214.959716796875, "logps/rejected": -237.14413452148438, "loss": 0.4451, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1057405471801758, "rewards/margins": 1.1711297035217285, "rewards/rejected": -2.2768704891204834, "step": 730 }, { "epoch": 0.5051841105943593, "grad_norm": 24.513672931022274, "learning_rate": 5.508345978755691e-07, "logits/chosen": -0.5107758045196533, "logits/rejected": -0.47158223390579224, "logps/chosen": -214.1978759765625, "logps/rejected": -236.34100341796875, "loss": 0.4356, "rewards/accuracies": 0.796875, "rewards/chosen": -1.0896263122558594, "rewards/margins": 1.132210612297058, "rewards/rejected": -2.221837043762207, "step": 740 }, { "epoch": 0.5120109228996885, "grad_norm": 27.557361902005226, "learning_rate": 5.432473444613049e-07, "logits/chosen": -0.47495898604393005, "logits/rejected": -0.42891502380371094, "logps/chosen": -215.3628692626953, "logps/rejected": -240.29644775390625, "loss": 0.4433, "rewards/accuracies": 0.8046875596046448, "rewards/chosen": -1.1231842041015625, "rewards/margins": 1.1870129108428955, "rewards/rejected": -2.310196876525879, "step": 750 }, { "epoch": 0.5188377352050177, "grad_norm": 25.763088367806024, "learning_rate": 5.356600910470409e-07, "logits/chosen": -0.5234218835830688, "logits/rejected": -0.46476346254348755, "logps/chosen": -214.0421142578125, "logps/rejected": -238.0985565185547, "loss": 0.4236, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.1129274368286133, "rewards/margins": 1.2772108316421509, "rewards/rejected": -2.3901383876800537, "step": 760 }, { "epoch": 0.5256645475103469, "grad_norm": 27.345063991868273, "learning_rate": 5.280728376327769e-07, "logits/chosen": -0.5037857294082642, "logits/rejected": -0.4784386157989502, "logps/chosen": -210.1291046142578, "logps/rejected": -236.04969787597656, "loss": 0.4347, "rewards/accuracies": 0.8109375238418579, "rewards/chosen": -1.1615896224975586, "rewards/margins": 1.2552942037582397, "rewards/rejected": -2.416883945465088, "step": 770 }, { "epoch": 0.5324913598156761, "grad_norm": 23.559487104074414, "learning_rate": 5.204855842185128e-07, "logits/chosen": -0.5264319777488708, "logits/rejected": -0.47137507796287537, "logps/chosen": -218.16024780273438, "logps/rejected": -245.5438995361328, "loss": 0.4609, "rewards/accuracies": 0.7703125476837158, "rewards/chosen": -1.2951855659484863, "rewards/margins": 1.170878291130066, "rewards/rejected": -2.4660637378692627, "step": 780 }, { "epoch": 0.5393181721210053, "grad_norm": 30.437623350555043, "learning_rate": 5.128983308042489e-07, "logits/chosen": -0.4954899251461029, "logits/rejected": -0.45233067870140076, "logps/chosen": -213.85757446289062, "logps/rejected": -242.7041473388672, "loss": 0.4193, "rewards/accuracies": 0.8093750476837158, "rewards/chosen": -1.2700811624526978, "rewards/margins": 1.2533843517303467, "rewards/rejected": -2.523465394973755, "step": 790 }, { "epoch": 0.5461449844263344, "grad_norm": 25.96035380580991, "learning_rate": 5.053110773899848e-07, "logits/chosen": -0.49867063760757446, "logits/rejected": -0.44984591007232666, "logps/chosen": -218.67074584960938, "logps/rejected": -247.30982971191406, "loss": 0.424, "rewards/accuracies": 0.8046875, "rewards/chosen": -1.237367868423462, "rewards/margins": 1.278685212135315, "rewards/rejected": -2.5160531997680664, "step": 800 }, { "epoch": 0.5529717967316636, "grad_norm": 27.066709483078917, "learning_rate": 4.977238239757208e-07, "logits/chosen": -0.4714178144931793, "logits/rejected": -0.4372885823249817, "logps/chosen": -218.98892211914062, "logps/rejected": -242.98770141601562, "loss": 0.4266, "rewards/accuracies": 0.7984375357627869, "rewards/chosen": -1.298151969909668, "rewards/margins": 1.222092866897583, "rewards/rejected": -2.520244836807251, "step": 810 }, { "epoch": 0.5597986090369927, "grad_norm": 28.230804755745105, "learning_rate": 4.901365705614567e-07, "logits/chosen": -0.45390385389328003, "logits/rejected": -0.43030381202697754, "logps/chosen": -220.013427734375, "logps/rejected": -241.9390411376953, "loss": 0.4526, "rewards/accuracies": 0.796875, "rewards/chosen": -1.354661464691162, "rewards/margins": 1.215053677558899, "rewards/rejected": -2.5697154998779297, "step": 820 }, { "epoch": 0.566625421342322, "grad_norm": 32.13534664184047, "learning_rate": 4.825493171471927e-07, "logits/chosen": -0.475396066904068, "logits/rejected": -0.43329310417175293, "logps/chosen": -210.43185424804688, "logps/rejected": -236.67987060546875, "loss": 0.4189, "rewards/accuracies": 0.815625011920929, "rewards/chosen": -1.264033317565918, "rewards/margins": 1.377021074295044, "rewards/rejected": -2.641054153442383, "step": 830 }, { "epoch": 0.5734522336476512, "grad_norm": 22.262860568714245, "learning_rate": 4.7496206373292864e-07, "logits/chosen": -0.4692656993865967, "logits/rejected": -0.4306912422180176, "logps/chosen": -211.5372772216797, "logps/rejected": -246.39736938476562, "loss": 0.3916, "rewards/accuracies": 0.8375000357627869, "rewards/chosen": -1.1525495052337646, "rewards/margins": 1.4558607339859009, "rewards/rejected": -2.608410358428955, "step": 840 }, { "epoch": 0.5802790459529803, "grad_norm": 22.80617456340079, "learning_rate": 4.673748103186646e-07, "logits/chosen": -0.46342021226882935, "logits/rejected": -0.41512057185173035, "logps/chosen": -221.32496643066406, "logps/rejected": -251.7954864501953, "loss": 0.394, "rewards/accuracies": 0.8218750357627869, "rewards/chosen": -1.2910584211349487, "rewards/margins": 1.4217520952224731, "rewards/rejected": -2.712810516357422, "step": 850 }, { "epoch": 0.5871058582583095, "grad_norm": 24.868191575194487, "learning_rate": 4.597875569044006e-07, "logits/chosen": -0.48653626441955566, "logits/rejected": -0.4366312623023987, "logps/chosen": -217.47422790527344, "logps/rejected": -241.48968505859375, "loss": 0.4269, "rewards/accuracies": 0.801562488079071, "rewards/chosen": -1.3257293701171875, "rewards/margins": 1.3266490697860718, "rewards/rejected": -2.652378559112549, "step": 860 }, { "epoch": 0.5939326705636387, "grad_norm": 27.035059402616938, "learning_rate": 4.5220030349013654e-07, "logits/chosen": -0.5033361911773682, "logits/rejected": -0.4694429039955139, "logps/chosen": -214.79815673828125, "logps/rejected": -237.64102172851562, "loss": 0.4296, "rewards/accuracies": 0.7921874523162842, "rewards/chosen": -1.3357491493225098, "rewards/margins": 1.2649694681167603, "rewards/rejected": -2.6007187366485596, "step": 870 }, { "epoch": 0.6007594828689679, "grad_norm": 27.746278145893346, "learning_rate": 4.446130500758725e-07, "logits/chosen": -0.5227242708206177, "logits/rejected": -0.4751604497432709, "logps/chosen": -218.23658752441406, "logps/rejected": -249.3454132080078, "loss": 0.4233, "rewards/accuracies": 0.817187488079071, "rewards/chosen": -1.3457627296447754, "rewards/margins": 1.428666591644287, "rewards/rejected": -2.7744295597076416, "step": 880 }, { "epoch": 0.6075862951742971, "grad_norm": 26.892931653503698, "learning_rate": 4.370257966616085e-07, "logits/chosen": -0.5066260099411011, "logits/rejected": -0.47855502367019653, "logps/chosen": -214.84915161132812, "logps/rejected": -240.56436157226562, "loss": 0.4612, "rewards/accuracies": 0.7812500596046448, "rewards/chosen": -1.4467679262161255, "rewards/margins": 1.3007091283798218, "rewards/rejected": -2.7474770545959473, "step": 890 }, { "epoch": 0.6144131074796262, "grad_norm": 32.793455771900234, "learning_rate": 4.2943854324734444e-07, "logits/chosen": -0.4987248182296753, "logits/rejected": -0.4517776668071747, "logps/chosen": -218.49545288085938, "logps/rejected": -252.3199462890625, "loss": 0.4007, "rewards/accuracies": 0.828125, "rewards/chosen": -1.4036812782287598, "rewards/margins": 1.497314453125, "rewards/rejected": -2.9009957313537598, "step": 900 }, { "epoch": 0.6212399197849554, "grad_norm": 30.14977908240741, "learning_rate": 4.2185128983308036e-07, "logits/chosen": -0.5123909711837769, "logits/rejected": -0.456384539604187, "logps/chosen": -221.94183349609375, "logps/rejected": -250.5224151611328, "loss": 0.4185, "rewards/accuracies": 0.8265625238418579, "rewards/chosen": -1.3800506591796875, "rewards/margins": 1.4040327072143555, "rewards/rejected": -2.784083366394043, "step": 910 }, { "epoch": 0.6280667320902846, "grad_norm": 23.187149506889586, "learning_rate": 4.142640364188164e-07, "logits/chosen": -0.5007960200309753, "logits/rejected": -0.4656420350074768, "logps/chosen": -224.66000366210938, "logps/rejected": -250.5994873046875, "loss": 0.4194, "rewards/accuracies": 0.817187488079071, "rewards/chosen": -1.4466440677642822, "rewards/margins": 1.3647561073303223, "rewards/rejected": -2.8114004135131836, "step": 920 }, { "epoch": 0.6348935443956137, "grad_norm": 26.465496977643166, "learning_rate": 4.0667678300455234e-07, "logits/chosen": -0.5095345973968506, "logits/rejected": -0.44781219959259033, "logps/chosen": -219.541259765625, "logps/rejected": -253.14544677734375, "loss": 0.3631, "rewards/accuracies": 0.8765624761581421, "rewards/chosen": -1.3718998432159424, "rewards/margins": 1.6033210754394531, "rewards/rejected": -2.9752209186553955, "step": 930 }, { "epoch": 0.641720356700943, "grad_norm": 21.651167586614733, "learning_rate": 3.990895295902883e-07, "logits/chosen": -0.5611530542373657, "logits/rejected": -0.5065969824790955, "logps/chosen": -222.84457397460938, "logps/rejected": -251.35067749023438, "loss": 0.397, "rewards/accuracies": 0.8250000476837158, "rewards/chosen": -1.4304229021072388, "rewards/margins": 1.4556035995483398, "rewards/rejected": -2.886026620864868, "step": 940 }, { "epoch": 0.6485471690062722, "grad_norm": 21.56653990852637, "learning_rate": 3.915022761760243e-07, "logits/chosen": -0.575349748134613, "logits/rejected": -0.5415146350860596, "logps/chosen": -209.71266174316406, "logps/rejected": -239.22946166992188, "loss": 0.4001, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.521388053894043, "rewards/margins": 1.4791213274002075, "rewards/rejected": -3.000509262084961, "step": 950 }, { "epoch": 0.6553739813116013, "grad_norm": 23.31036794244746, "learning_rate": 3.8391502276176024e-07, "logits/chosen": -0.5698951482772827, "logits/rejected": -0.5178714394569397, "logps/chosen": -228.25030517578125, "logps/rejected": -261.415771484375, "loss": 0.3891, "rewards/accuracies": 0.817187488079071, "rewards/chosen": -1.6143665313720703, "rewards/margins": 1.645197868347168, "rewards/rejected": -3.2595643997192383, "step": 960 }, { "epoch": 0.6622007936169305, "grad_norm": 26.214223596010875, "learning_rate": 3.763277693474962e-07, "logits/chosen": -0.5214463472366333, "logits/rejected": -0.46749287843704224, "logps/chosen": -218.10549926757812, "logps/rejected": -251.87442016601562, "loss": 0.4196, "rewards/accuracies": 0.8125, "rewards/chosen": -1.63547945022583, "rewards/margins": 1.4419658184051514, "rewards/rejected": -3.0774452686309814, "step": 970 }, { "epoch": 0.6690276059222596, "grad_norm": 24.89349466924626, "learning_rate": 3.687405159332321e-07, "logits/chosen": -0.5419428944587708, "logits/rejected": -0.5022714734077454, "logps/chosen": -223.1068115234375, "logps/rejected": -255.94949340820312, "loss": 0.4144, "rewards/accuracies": 0.815625011920929, "rewards/chosen": -1.7430050373077393, "rewards/margins": 1.4413095712661743, "rewards/rejected": -3.184314489364624, "step": 980 }, { "epoch": 0.6758544182275888, "grad_norm": 25.914909518247867, "learning_rate": 3.611532625189681e-07, "logits/chosen": -0.5115488767623901, "logits/rejected": -0.4625004827976227, "logps/chosen": -229.49105834960938, "logps/rejected": -265.0625, "loss": 0.3983, "rewards/accuracies": 0.8218750357627869, "rewards/chosen": -1.6320453882217407, "rewards/margins": 1.5734854936599731, "rewards/rejected": -3.2055306434631348, "step": 990 }, { "epoch": 0.6826812305329181, "grad_norm": 31.456143694319483, "learning_rate": 3.5356600910470406e-07, "logits/chosen": -0.5371730327606201, "logits/rejected": -0.4974362850189209, "logps/chosen": -236.7477569580078, "logps/rejected": -264.0472106933594, "loss": 0.3827, "rewards/accuracies": 0.8437500596046448, "rewards/chosen": -1.620214819908142, "rewards/margins": 1.5398459434509277, "rewards/rejected": -3.1600606441497803, "step": 1000 }, { "epoch": 0.6895080428382472, "grad_norm": 29.661159656571126, "learning_rate": 3.459787556904401e-07, "logits/chosen": -0.5440015196800232, "logits/rejected": -0.49301889538764954, "logps/chosen": -224.03494262695312, "logps/rejected": -254.42193603515625, "loss": 0.4033, "rewards/accuracies": 0.8296875357627869, "rewards/chosen": -1.5924382209777832, "rewards/margins": 1.5776193141937256, "rewards/rejected": -3.170057773590088, "step": 1010 }, { "epoch": 0.6963348551435764, "grad_norm": 38.12069128333079, "learning_rate": 3.3839150227617604e-07, "logits/chosen": -0.5860447883605957, "logits/rejected": -0.543270468711853, "logps/chosen": -228.84930419921875, "logps/rejected": -262.8966064453125, "loss": 0.3898, "rewards/accuracies": 0.8406250476837158, "rewards/chosen": -1.6053173542022705, "rewards/margins": 1.590077519416809, "rewards/rejected": -3.19539475440979, "step": 1020 }, { "epoch": 0.7031616674489056, "grad_norm": 32.08364090632609, "learning_rate": 3.30804248861912e-07, "logits/chosen": -0.6051906943321228, "logits/rejected": -0.5597983598709106, "logps/chosen": -224.02899169921875, "logps/rejected": -258.93511962890625, "loss": 0.396, "rewards/accuracies": 0.8171875476837158, "rewards/chosen": -1.7182796001434326, "rewards/margins": 1.5724890232086182, "rewards/rejected": -3.290768623352051, "step": 1030 }, { "epoch": 0.7099884797542347, "grad_norm": 25.599680429412086, "learning_rate": 3.232169954476479e-07, "logits/chosen": -0.6112679243087769, "logits/rejected": -0.5801026821136475, "logps/chosen": -225.71258544921875, "logps/rejected": -264.3663330078125, "loss": 0.3637, "rewards/accuracies": 0.8421875238418579, "rewards/chosen": -1.4613301753997803, "rewards/margins": 1.712023138999939, "rewards/rejected": -3.1733531951904297, "step": 1040 }, { "epoch": 0.716815292059564, "grad_norm": 26.325121380352627, "learning_rate": 3.156297420333839e-07, "logits/chosen": -0.6216264963150024, "logits/rejected": -0.5548665523529053, "logps/chosen": -226.58059692382812, "logps/rejected": -263.7754821777344, "loss": 0.3636, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.6483052968978882, "rewards/margins": 1.7705044746398926, "rewards/rejected": -3.4188098907470703, "step": 1050 }, { "epoch": 0.7236421043648931, "grad_norm": 23.347203569226366, "learning_rate": 3.0804248861911986e-07, "logits/chosen": -0.5403355360031128, "logits/rejected": -0.49409806728363037, "logps/chosen": -225.88253784179688, "logps/rejected": -256.93182373046875, "loss": 0.393, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.6747300624847412, "rewards/margins": 1.6634035110473633, "rewards/rejected": -3.3381335735321045, "step": 1060 }, { "epoch": 0.7304689166702223, "grad_norm": 26.591582696664684, "learning_rate": 3.004552352048558e-07, "logits/chosen": -0.60378497838974, "logits/rejected": -0.5446761250495911, "logps/chosen": -222.86285400390625, "logps/rejected": -254.32901000976562, "loss": 0.3562, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.373286247253418, "rewards/margins": 1.5952813625335693, "rewards/rejected": -2.9685676097869873, "step": 1070 }, { "epoch": 0.7372957289755515, "grad_norm": 26.301256433411677, "learning_rate": 2.928679817905918e-07, "logits/chosen": -0.575655996799469, "logits/rejected": -0.5388238430023193, "logps/chosen": -226.25411987304688, "logps/rejected": -257.7029724121094, "loss": 0.3889, "rewards/accuracies": 0.832812488079071, "rewards/chosen": -1.5678967237472534, "rewards/margins": 1.5496362447738647, "rewards/rejected": -3.1175332069396973, "step": 1080 }, { "epoch": 0.7441225412808806, "grad_norm": 29.1969544488184, "learning_rate": 2.8528072837632776e-07, "logits/chosen": -0.563581109046936, "logits/rejected": -0.4889605939388275, "logps/chosen": -215.546630859375, "logps/rejected": -251.0224609375, "loss": 0.3594, "rewards/accuracies": 0.854687511920929, "rewards/chosen": -1.5211578607559204, "rewards/margins": 1.6970359086990356, "rewards/rejected": -3.218193531036377, "step": 1090 }, { "epoch": 0.7509493535862098, "grad_norm": 28.75255873182244, "learning_rate": 2.776934749620637e-07, "logits/chosen": -0.5607287883758545, "logits/rejected": -0.5297821760177612, "logps/chosen": -213.63365173339844, "logps/rejected": -240.619384765625, "loss": 0.4057, "rewards/accuracies": 0.8203125, "rewards/chosen": -1.6024796962738037, "rewards/margins": 1.537475347518921, "rewards/rejected": -3.1399548053741455, "step": 1100 }, { "epoch": 0.7577761658915391, "grad_norm": 28.027697277996715, "learning_rate": 2.7010622154779964e-07, "logits/chosen": -0.5775099992752075, "logits/rejected": -0.5231542587280273, "logps/chosen": -224.80667114257812, "logps/rejected": -259.0721435546875, "loss": 0.4044, "rewards/accuracies": 0.8140624761581421, "rewards/chosen": -1.598193883895874, "rewards/margins": 1.5613579750061035, "rewards/rejected": -3.1595516204833984, "step": 1110 }, { "epoch": 0.7646029781968682, "grad_norm": 19.772049611357087, "learning_rate": 2.6251896813353566e-07, "logits/chosen": -0.5745671987533569, "logits/rejected": -0.5299438834190369, "logps/chosen": -225.1347198486328, "logps/rejected": -255.4309539794922, "loss": 0.3858, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.4894109964370728, "rewards/margins": 1.691686987876892, "rewards/rejected": -3.181097984313965, "step": 1120 }, { "epoch": 0.7714297905021974, "grad_norm": 23.20450105175028, "learning_rate": 2.549317147192716e-07, "logits/chosen": -0.5600322484970093, "logits/rejected": -0.5002347230911255, "logps/chosen": -219.86434936523438, "logps/rejected": -253.78878784179688, "loss": 0.3663, "rewards/accuracies": 0.8453125357627869, "rewards/chosen": -1.4636483192443848, "rewards/margins": 1.6817249059677124, "rewards/rejected": -3.1453733444213867, "step": 1130 }, { "epoch": 0.7782566028075265, "grad_norm": 28.72150866508454, "learning_rate": 2.473444613050076e-07, "logits/chosen": -0.6041327118873596, "logits/rejected": -0.5645285844802856, "logps/chosen": -216.68939208984375, "logps/rejected": -247.66275024414062, "loss": 0.3806, "rewards/accuracies": 0.8328125476837158, "rewards/chosen": -1.5338340997695923, "rewards/margins": 1.5916988849639893, "rewards/rejected": -3.125532865524292, "step": 1140 }, { "epoch": 0.7850834151128557, "grad_norm": 29.858461214238897, "learning_rate": 2.3975720789074356e-07, "logits/chosen": -0.6299252510070801, "logits/rejected": -0.586955189704895, "logps/chosen": -231.401611328125, "logps/rejected": -263.02197265625, "loss": 0.3998, "rewards/accuracies": 0.8328125476837158, "rewards/chosen": -1.6045633554458618, "rewards/margins": 1.6497775316238403, "rewards/rejected": -3.2543411254882812, "step": 1150 }, { "epoch": 0.791910227418185, "grad_norm": 26.24413163476253, "learning_rate": 2.321699544764795e-07, "logits/chosen": -0.5830259919166565, "logits/rejected": -0.5397896766662598, "logps/chosen": -213.19375610351562, "logps/rejected": -249.24717712402344, "loss": 0.3717, "rewards/accuracies": 0.8250000476837158, "rewards/chosen": -1.6501479148864746, "rewards/margins": 1.6961115598678589, "rewards/rejected": -3.346259593963623, "step": 1160 }, { "epoch": 0.7987370397235141, "grad_norm": 31.016581977192125, "learning_rate": 2.2458270106221546e-07, "logits/chosen": -0.5983390808105469, "logits/rejected": -0.5455670952796936, "logps/chosen": -224.10618591308594, "logps/rejected": -254.94383239746094, "loss": 0.3732, "rewards/accuracies": 0.8296875357627869, "rewards/chosen": -1.5914267301559448, "rewards/margins": 1.632917046546936, "rewards/rejected": -3.2243435382843018, "step": 1170 }, { "epoch": 0.8055638520288433, "grad_norm": 82.84012389678055, "learning_rate": 2.1699544764795143e-07, "logits/chosen": -0.6019859910011292, "logits/rejected": -0.5678104758262634, "logps/chosen": -222.878662109375, "logps/rejected": -253.78060913085938, "loss": 0.4147, "rewards/accuracies": 0.828125, "rewards/chosen": -1.6500358581542969, "rewards/margins": 1.5844390392303467, "rewards/rejected": -3.2344746589660645, "step": 1180 }, { "epoch": 0.8123906643341725, "grad_norm": 22.55342908794488, "learning_rate": 2.094081942336874e-07, "logits/chosen": -0.5819066762924194, "logits/rejected": -0.5418481826782227, "logps/chosen": -221.70608520507812, "logps/rejected": -254.09922790527344, "loss": 0.3354, "rewards/accuracies": 0.8609375357627869, "rewards/chosen": -1.6243677139282227, "rewards/margins": 1.7402938604354858, "rewards/rejected": -3.364661455154419, "step": 1190 }, { "epoch": 0.8192174766395016, "grad_norm": 21.249823285036445, "learning_rate": 2.0182094081942336e-07, "logits/chosen": -0.5841631889343262, "logits/rejected": -0.5415323972702026, "logps/chosen": -225.88800048828125, "logps/rejected": -254.038818359375, "loss": 0.3821, "rewards/accuracies": 0.839062511920929, "rewards/chosen": -1.6330121755599976, "rewards/margins": 1.732587456703186, "rewards/rejected": -3.3655996322631836, "step": 1200 }, { "epoch": 0.8260442889448308, "grad_norm": 22.957761561567523, "learning_rate": 1.9423368740515933e-07, "logits/chosen": -0.5876274704933167, "logits/rejected": -0.5527446866035461, "logps/chosen": -237.04470825195312, "logps/rejected": -263.58868408203125, "loss": 0.3658, "rewards/accuracies": 0.8531250357627869, "rewards/chosen": -1.6271567344665527, "rewards/margins": 1.6703208684921265, "rewards/rejected": -3.297477960586548, "step": 1210 }, { "epoch": 0.83287110125016, "grad_norm": 26.3109466733547, "learning_rate": 1.8664643399089527e-07, "logits/chosen": -0.5855602622032166, "logits/rejected": -0.5348464846611023, "logps/chosen": -220.74581909179688, "logps/rejected": -259.97076416015625, "loss": 0.392, "rewards/accuracies": 0.8234375715255737, "rewards/chosen": -1.666372299194336, "rewards/margins": 1.7341811656951904, "rewards/rejected": -3.4005534648895264, "step": 1220 }, { "epoch": 0.8396979135554892, "grad_norm": 32.86005475979103, "learning_rate": 1.7905918057663124e-07, "logits/chosen": -0.6146824359893799, "logits/rejected": -0.5769205093383789, "logps/chosen": -223.04859924316406, "logps/rejected": -259.2931213378906, "loss": 0.3747, "rewards/accuracies": 0.8484375476837158, "rewards/chosen": -1.6388548612594604, "rewards/margins": 1.6829884052276611, "rewards/rejected": -3.321843147277832, "step": 1230 }, { "epoch": 0.8465247258608184, "grad_norm": 27.824013672905682, "learning_rate": 1.7147192716236723e-07, "logits/chosen": -0.5848041772842407, "logits/rejected": -0.5365484356880188, "logps/chosen": -224.9688262939453, "logps/rejected": -253.75857543945312, "loss": 0.374, "rewards/accuracies": 0.8343750238418579, "rewards/chosen": -1.633022427558899, "rewards/margins": 1.6262296438217163, "rewards/rejected": -3.2592520713806152, "step": 1240 }, { "epoch": 0.8533515381661475, "grad_norm": 28.870976428951412, "learning_rate": 1.638846737481032e-07, "logits/chosen": -0.6266176700592041, "logits/rejected": -0.5750494003295898, "logps/chosen": -225.53489685058594, "logps/rejected": -251.16812133789062, "loss": 0.3643, "rewards/accuracies": 0.8421875238418579, "rewards/chosen": -1.6029326915740967, "rewards/margins": 1.637751817703247, "rewards/rejected": -3.2406845092773438, "step": 1250 }, { "epoch": 0.8601783504714767, "grad_norm": 28.44671682958466, "learning_rate": 1.5629742033383914e-07, "logits/chosen": -0.5748768448829651, "logits/rejected": -0.5039246082305908, "logps/chosen": -229.083740234375, "logps/rejected": -265.5872802734375, "loss": 0.3464, "rewards/accuracies": 0.8515625, "rewards/chosen": -1.6385741233825684, "rewards/margins": 2.006284713745117, "rewards/rejected": -3.6448588371276855, "step": 1260 }, { "epoch": 0.867005162776806, "grad_norm": 26.03554320093484, "learning_rate": 1.487101669195751e-07, "logits/chosen": -0.580173671245575, "logits/rejected": -0.5294475555419922, "logps/chosen": -225.72938537597656, "logps/rejected": -262.03546142578125, "loss": 0.3718, "rewards/accuracies": 0.8359375, "rewards/chosen": -1.6143238544464111, "rewards/margins": 1.8427155017852783, "rewards/rejected": -3.4570393562316895, "step": 1270 }, { "epoch": 0.8738319750821351, "grad_norm": 22.97729500897279, "learning_rate": 1.4112291350531107e-07, "logits/chosen": -0.6003884673118591, "logits/rejected": -0.5561665296554565, "logps/chosen": -221.987548828125, "logps/rejected": -258.51727294921875, "loss": 0.3686, "rewards/accuracies": 0.8359375, "rewards/chosen": -1.4877190589904785, "rewards/margins": 1.6500287055969238, "rewards/rejected": -3.1377477645874023, "step": 1280 }, { "epoch": 0.8806587873874643, "grad_norm": 31.37447822214391, "learning_rate": 1.3353566009104704e-07, "logits/chosen": -0.6364210844039917, "logits/rejected": -0.575194239616394, "logps/chosen": -225.1094207763672, "logps/rejected": -260.13885498046875, "loss": 0.3534, "rewards/accuracies": 0.864062488079071, "rewards/chosen": -1.6206319332122803, "rewards/margins": 1.7905977964401245, "rewards/rejected": -3.4112298488616943, "step": 1290 }, { "epoch": 0.8874855996927934, "grad_norm": 22.936789815076953, "learning_rate": 1.25948406676783e-07, "logits/chosen": -0.6323338747024536, "logits/rejected": -0.6003640294075012, "logps/chosen": -227.20034790039062, "logps/rejected": -259.46502685546875, "loss": 0.3575, "rewards/accuracies": 0.8406250476837158, "rewards/chosen": -1.6749684810638428, "rewards/margins": 1.7170754671096802, "rewards/rejected": -3.3920438289642334, "step": 1300 }, { "epoch": 0.8943124119981226, "grad_norm": 22.489511604558004, "learning_rate": 1.1836115326251896e-07, "logits/chosen": -0.6401182413101196, "logits/rejected": -0.5833394527435303, "logps/chosen": -223.30029296875, "logps/rejected": -262.72998046875, "loss": 0.3353, "rewards/accuracies": 0.8593750596046448, "rewards/chosen": -1.560599446296692, "rewards/margins": 1.906503677368164, "rewards/rejected": -3.4671034812927246, "step": 1310 }, { "epoch": 0.9011392243034518, "grad_norm": 37.43162732034228, "learning_rate": 1.1077389984825493e-07, "logits/chosen": -0.5761069059371948, "logits/rejected": -0.5430048108100891, "logps/chosen": -237.7594757080078, "logps/rejected": -275.5934753417969, "loss": 0.3514, "rewards/accuracies": 0.859375, "rewards/chosen": -1.6714935302734375, "rewards/margins": 1.8643473386764526, "rewards/rejected": -3.5358407497406006, "step": 1320 }, { "epoch": 0.907966036608781, "grad_norm": 22.988879587386872, "learning_rate": 1.0318664643399089e-07, "logits/chosen": -0.5806565284729004, "logits/rejected": -0.5450279116630554, "logps/chosen": -221.33053588867188, "logps/rejected": -256.5147705078125, "loss": 0.3729, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.6562050580978394, "rewards/margins": 1.747424840927124, "rewards/rejected": -3.403630018234253, "step": 1330 }, { "epoch": 0.9147928489141102, "grad_norm": 19.80848176554877, "learning_rate": 9.559939301972686e-08, "logits/chosen": -0.6481366157531738, "logits/rejected": -0.6148696541786194, "logps/chosen": -224.6954803466797, "logps/rejected": -256.4845275878906, "loss": 0.3775, "rewards/accuracies": 0.8421875238418579, "rewards/chosen": -1.7428375482559204, "rewards/margins": 1.636692762374878, "rewards/rejected": -3.379530191421509, "step": 1340 }, { "epoch": 0.9216196612194394, "grad_norm": 25.8470434123946, "learning_rate": 8.801213960546281e-08, "logits/chosen": -0.6496397852897644, "logits/rejected": -0.5912147164344788, "logps/chosen": -223.9413299560547, "logps/rejected": -259.1372375488281, "loss": 0.3461, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.6729114055633545, "rewards/margins": 1.7730145454406738, "rewards/rejected": -3.445925712585449, "step": 1350 }, { "epoch": 0.9284464735247685, "grad_norm": 33.2201336722171, "learning_rate": 8.042488619119878e-08, "logits/chosen": -0.645717203617096, "logits/rejected": -0.6112032532691956, "logps/chosen": -225.99624633789062, "logps/rejected": -257.4811706542969, "loss": 0.4065, "rewards/accuracies": 0.8218750357627869, "rewards/chosen": -1.761589527130127, "rewards/margins": 1.6280558109283447, "rewards/rejected": -3.389645576477051, "step": 1360 }, { "epoch": 0.9352732858300977, "grad_norm": 27.005710517490183, "learning_rate": 7.283763277693475e-08, "logits/chosen": -0.573918342590332, "logits/rejected": -0.5335432291030884, "logps/chosen": -225.52552795410156, "logps/rejected": -255.49449157714844, "loss": 0.3465, "rewards/accuracies": 0.8531249761581421, "rewards/chosen": -1.7273519039154053, "rewards/margins": 1.7527152299880981, "rewards/rejected": -3.480067253112793, "step": 1370 }, { "epoch": 0.9421000981354269, "grad_norm": 32.140399259495645, "learning_rate": 6.525037936267071e-08, "logits/chosen": -0.6214314103126526, "logits/rejected": -0.570462167263031, "logps/chosen": -224.70672607421875, "logps/rejected": -264.4761962890625, "loss": 0.3218, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.6792542934417725, "rewards/margins": 1.915861964225769, "rewards/rejected": -3.595116138458252, "step": 1380 }, { "epoch": 0.948926910440756, "grad_norm": 31.250154294424732, "learning_rate": 5.766312594840667e-08, "logits/chosen": -0.6339004635810852, "logits/rejected": -0.5892723798751831, "logps/chosen": -220.26611328125, "logps/rejected": -252.96212768554688, "loss": 0.3864, "rewards/accuracies": 0.8312499523162842, "rewards/chosen": -1.6645467281341553, "rewards/margins": 1.5790960788726807, "rewards/rejected": -3.243642807006836, "step": 1390 }, { "epoch": 0.9557537227460853, "grad_norm": 30.068762957187783, "learning_rate": 5.007587253414264e-08, "logits/chosen": -0.678811252117157, "logits/rejected": -0.6359538435935974, "logps/chosen": -224.49069213867188, "logps/rejected": -258.3272705078125, "loss": 0.3447, "rewards/accuracies": 0.8531250357627869, "rewards/chosen": -1.575748085975647, "rewards/margins": 1.9220972061157227, "rewards/rejected": -3.49784517288208, "step": 1400 }, { "epoch": 0.9625805350514144, "grad_norm": 22.16371068962549, "learning_rate": 4.2488619119878606e-08, "logits/chosen": -0.6366287469863892, "logits/rejected": -0.5852836966514587, "logps/chosen": -227.71780395507812, "logps/rejected": -267.0358581542969, "loss": 0.3718, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.6289258003234863, "rewards/margins": 1.7643526792526245, "rewards/rejected": -3.3932785987854004, "step": 1410 }, { "epoch": 0.9694073473567436, "grad_norm": 31.032456565988113, "learning_rate": 3.4901365705614566e-08, "logits/chosen": -0.6306103467941284, "logits/rejected": -0.5921708345413208, "logps/chosen": -221.66065979003906, "logps/rejected": -254.41958618164062, "loss": 0.3678, "rewards/accuracies": 0.823437511920929, "rewards/chosen": -1.5656054019927979, "rewards/margins": 1.682039499282837, "rewards/rejected": -3.2476449012756348, "step": 1420 }, { "epoch": 0.9762341596620728, "grad_norm": 26.873435878225383, "learning_rate": 2.731411229135053e-08, "logits/chosen": -0.6624563336372375, "logits/rejected": -0.6294071078300476, "logps/chosen": -224.36407470703125, "logps/rejected": -263.2255859375, "loss": 0.3681, "rewards/accuracies": 0.8484375476837158, "rewards/chosen": -1.7730777263641357, "rewards/margins": 1.7527307271957397, "rewards/rejected": -3.525808334350586, "step": 1430 }, { "epoch": 0.9830609719674019, "grad_norm": 28.36352572432148, "learning_rate": 1.9726858877086493e-08, "logits/chosen": -0.6402366161346436, "logits/rejected": -0.5960521697998047, "logps/chosen": -225.24977111816406, "logps/rejected": -257.8275451660156, "loss": 0.3734, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.7362611293792725, "rewards/margins": 1.7620372772216797, "rewards/rejected": -3.498298168182373, "step": 1440 }, { "epoch": 0.9898877842727312, "grad_norm": 30.66526971215358, "learning_rate": 1.2139605462822458e-08, "logits/chosen": -0.6005350351333618, "logits/rejected": -0.5661831498146057, "logps/chosen": -227.962158203125, "logps/rejected": -261.6782531738281, "loss": 0.3924, "rewards/accuracies": 0.8328125476837158, "rewards/chosen": -1.720937728881836, "rewards/margins": 1.5882391929626465, "rewards/rejected": -3.3091769218444824, "step": 1450 }, { "epoch": 0.9967145965780604, "grad_norm": 36.64240487573334, "learning_rate": 4.552352048558422e-09, "logits/chosen": -0.6393886804580688, "logits/rejected": -0.6115251183509827, "logps/chosen": -229.70652770996094, "logps/rejected": -268.06982421875, "loss": 0.3379, "rewards/accuracies": 0.8734375238418579, "rewards/chosen": -1.651149034500122, "rewards/margins": 1.8959904909133911, "rewards/rejected": -3.5471396446228027, "step": 1460 }, { "epoch": 1.0, "step": 1465, "total_flos": 161167907028992.0, "train_loss": 0.47723283336431094, "train_runtime": 14257.9418, "train_samples_per_second": 6.575, "train_steps_per_second": 0.103 } ], "logging_steps": 10, "max_steps": 1465, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 161167907028992.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }