{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.982222222222222, "eval_steps": 1, "global_step": 336, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011851851851851851, "grad_norm": 44.10000740195015, "learning_rate": 1.4705882352941176e-08, "logits/chosen": -1.1635093688964844, "logits/rejected": -0.9440154433250427, "logps/chosen": -26.389511108398438, "logps/rejected": -42.156002044677734, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.023703703703703703, "grad_norm": 45.622821831639094, "learning_rate": 2.941176470588235e-08, "logits/chosen": -0.8899029493331909, "logits/rejected": -0.9265471696853638, "logps/chosen": -24.45637321472168, "logps/rejected": -38.72291564941406, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.035555555555555556, "grad_norm": 41.287867804704256, "learning_rate": 4.411764705882353e-08, "logits/chosen": -0.9218576550483704, "logits/rejected": -0.8510868549346924, "logps/chosen": -23.573394775390625, "logps/rejected": -31.830120086669922, "loss": 0.6917, "rewards/accuracies": 0.625, "rewards/chosen": 0.05094228684902191, "rewards/margins": 0.055795177817344666, "rewards/rejected": -0.004852890968322754, "step": 3 }, { "epoch": 0.047407407407407405, "grad_norm": 41.148615147033524, "learning_rate": 5.88235294117647e-08, "logits/chosen": -0.8889421820640564, "logits/rejected": -0.7832293510437012, "logps/chosen": -27.102622985839844, "logps/rejected": -32.83424377441406, "loss": 0.692, "rewards/accuracies": 0.5, "rewards/chosen": 0.005930736660957336, "rewards/margins": 0.013045087456703186, "rewards/rejected": -0.00711435079574585, "step": 4 }, { "epoch": 0.05925925925925926, "grad_norm": 41.57192528486562, "learning_rate": 7.352941176470588e-08, "logits/chosen": -0.8269144296646118, "logits/rejected": -0.8342342376708984, "logps/chosen": -26.83285903930664, "logps/rejected": -33.845359802246094, "loss": 0.7004, "rewards/accuracies": 0.5625, "rewards/chosen": 0.002873659133911133, "rewards/margins": 0.0599842369556427, "rewards/rejected": -0.05711057782173157, "step": 5 }, { "epoch": 0.07111111111111111, "grad_norm": 42.964145384550164, "learning_rate": 8.823529411764706e-08, "logits/chosen": -0.9288309216499329, "logits/rejected": -0.9066528677940369, "logps/chosen": -31.687969207763672, "logps/rejected": -35.163841247558594, "loss": 0.701, "rewards/accuracies": 0.5625, "rewards/chosen": 0.005395621061325073, "rewards/margins": -0.016778230667114258, "rewards/rejected": 0.02217385172843933, "step": 6 }, { "epoch": 0.08296296296296296, "grad_norm": 38.3846396537961, "learning_rate": 1.0294117647058822e-07, "logits/chosen": -0.9132620096206665, "logits/rejected": -0.7912867665290833, "logps/chosen": -24.47614860534668, "logps/rejected": -32.74094009399414, "loss": 0.6924, "rewards/accuracies": 0.5, "rewards/chosen": 0.02499394118785858, "rewards/margins": 0.010348424315452576, "rewards/rejected": 0.014645516872406006, "step": 7 }, { "epoch": 0.09481481481481481, "grad_norm": 43.34894792705672, "learning_rate": 1.176470588235294e-07, "logits/chosen": -0.8170281648635864, "logits/rejected": -0.8093118667602539, "logps/chosen": -21.367229461669922, "logps/rejected": -30.556249618530273, "loss": 0.6943, "rewards/accuracies": 0.625, "rewards/chosen": 0.021743685007095337, "rewards/margins": 0.05349762737751007, "rewards/rejected": -0.031753942370414734, "step": 8 }, { "epoch": 0.10666666666666667, "grad_norm": 44.768590418142296, "learning_rate": 1.3235294117647057e-07, "logits/chosen": -0.8906874656677246, "logits/rejected": -0.8358623385429382, "logps/chosen": -27.88587760925293, "logps/rejected": -30.677749633789062, "loss": 0.7014, "rewards/accuracies": 0.5, "rewards/chosen": 0.00475698709487915, "rewards/margins": 0.035931557416915894, "rewards/rejected": -0.031174570322036743, "step": 9 }, { "epoch": 0.11851851851851852, "grad_norm": 41.386325746824284, "learning_rate": 1.4705882352941175e-07, "logits/chosen": -1.0302842855453491, "logits/rejected": -0.8634576201438904, "logps/chosen": -28.216838836669922, "logps/rejected": -38.4200553894043, "loss": 0.6967, "rewards/accuracies": 0.5625, "rewards/chosen": 0.02494041621685028, "rewards/margins": 0.07226283848285675, "rewards/rejected": -0.04732242226600647, "step": 10 }, { "epoch": 0.13037037037037036, "grad_norm": 42.87170433913047, "learning_rate": 1.6176470588235293e-07, "logits/chosen": -0.8272039890289307, "logits/rejected": -0.8201614618301392, "logps/chosen": -24.542991638183594, "logps/rejected": -33.56885528564453, "loss": 0.7016, "rewards/accuracies": 0.4375, "rewards/chosen": 0.04099439084529877, "rewards/margins": 0.02981768548488617, "rewards/rejected": 0.011176705360412598, "step": 11 }, { "epoch": 0.14222222222222222, "grad_norm": 41.54515829050869, "learning_rate": 1.764705882352941e-07, "logits/chosen": -0.8868040442466736, "logits/rejected": -0.8360949158668518, "logps/chosen": -29.391693115234375, "logps/rejected": -39.35624694824219, "loss": 0.6989, "rewards/accuracies": 0.6875, "rewards/chosen": 0.020398467779159546, "rewards/margins": 0.034372299909591675, "rewards/rejected": -0.013973832130432129, "step": 12 }, { "epoch": 0.15407407407407409, "grad_norm": 40.960317074043914, "learning_rate": 1.9117647058823527e-07, "logits/chosen": -0.9931007623672485, "logits/rejected": -0.9051375985145569, "logps/chosen": -21.935997009277344, "logps/rejected": -29.908475875854492, "loss": 0.6973, "rewards/accuracies": 0.5625, "rewards/chosen": -0.006315797567367554, "rewards/margins": 0.032275840640068054, "rewards/rejected": -0.03859163820743561, "step": 13 }, { "epoch": 0.16592592592592592, "grad_norm": 46.37678646749312, "learning_rate": 2.0588235294117645e-07, "logits/chosen": -0.736880898475647, "logits/rejected": -0.6582351326942444, "logps/chosen": -28.070615768432617, "logps/rejected": -37.080623626708984, "loss": 0.6942, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0168197900056839, "rewards/margins": 0.020758137106895447, "rewards/rejected": -0.037577927112579346, "step": 14 }, { "epoch": 0.17777777777777778, "grad_norm": 38.418772787456916, "learning_rate": 2.2058823529411763e-07, "logits/chosen": -0.8958194851875305, "logits/rejected": -0.8823959827423096, "logps/chosen": -24.240140914916992, "logps/rejected": -36.511985778808594, "loss": 0.6853, "rewards/accuracies": 0.5, "rewards/chosen": -0.010584741830825806, "rewards/margins": 0.013375014066696167, "rewards/rejected": -0.023959755897521973, "step": 15 }, { "epoch": 0.18962962962962962, "grad_norm": 39.784578323473944, "learning_rate": 2.352941176470588e-07, "logits/chosen": -1.044739007949829, "logits/rejected": -0.9721382260322571, "logps/chosen": -24.203937530517578, "logps/rejected": -38.13182830810547, "loss": 0.6644, "rewards/accuracies": 0.625, "rewards/chosen": -0.008091084659099579, "rewards/margins": 0.03979543596506119, "rewards/rejected": -0.04788652062416077, "step": 16 }, { "epoch": 0.20148148148148148, "grad_norm": 38.68619452262893, "learning_rate": 2.5e-07, "logits/chosen": -0.9131325483322144, "logits/rejected": -0.9099739193916321, "logps/chosen": -23.27505874633789, "logps/rejected": -25.550016403198242, "loss": 0.6639, "rewards/accuracies": 0.6875, "rewards/chosen": 0.04493655264377594, "rewards/margins": 0.11525127291679382, "rewards/rejected": -0.07031472027301788, "step": 17 }, { "epoch": 0.21333333333333335, "grad_norm": 38.89353521239618, "learning_rate": 2.6470588235294114e-07, "logits/chosen": -1.1501476764678955, "logits/rejected": -1.0104213953018188, "logps/chosen": -28.398540496826172, "logps/rejected": -40.202754974365234, "loss": 0.6675, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0002931952476501465, "rewards/margins": 0.1416773796081543, "rewards/rejected": -0.14197057485580444, "step": 18 }, { "epoch": 0.22518518518518518, "grad_norm": 39.73809119940035, "learning_rate": 2.7941176470588235e-07, "logits/chosen": -0.6393623948097229, "logits/rejected": -0.5715636014938354, "logps/chosen": -23.02471160888672, "logps/rejected": -29.500215530395508, "loss": 0.6618, "rewards/accuracies": 0.6875, "rewards/chosen": 0.012419655919075012, "rewards/margins": 0.04500822722911835, "rewards/rejected": -0.032588571310043335, "step": 19 }, { "epoch": 0.23703703703703705, "grad_norm": 36.24445457135461, "learning_rate": 2.941176470588235e-07, "logits/chosen": -1.090634822845459, "logits/rejected": -1.0109808444976807, "logps/chosen": -22.518497467041016, "logps/rejected": -28.288860321044922, "loss": 0.6407, "rewards/accuracies": 0.6875, "rewards/chosen": 0.004169940948486328, "rewards/margins": 0.08663815259933472, "rewards/rejected": -0.08246821165084839, "step": 20 }, { "epoch": 0.24888888888888888, "grad_norm": 37.48843626542997, "learning_rate": 3.088235294117647e-07, "logits/chosen": -0.9397974610328674, "logits/rejected": -0.8281663060188293, "logps/chosen": -29.923145294189453, "logps/rejected": -37.80279541015625, "loss": 0.6361, "rewards/accuracies": 0.8125, "rewards/chosen": -0.010303795337677002, "rewards/margins": 0.1979476809501648, "rewards/rejected": -0.2082514762878418, "step": 21 }, { "epoch": 0.2607407407407407, "grad_norm": 37.17562909629751, "learning_rate": 3.2352941176470586e-07, "logits/chosen": -0.8852977752685547, "logits/rejected": -0.8319816589355469, "logps/chosen": -23.00829315185547, "logps/rejected": -28.55397605895996, "loss": 0.6446, "rewards/accuracies": 0.75, "rewards/chosen": -0.001584082841873169, "rewards/margins": 0.16339415311813354, "rewards/rejected": -0.1649782359600067, "step": 22 }, { "epoch": 0.2725925925925926, "grad_norm": 35.98779991504583, "learning_rate": 3.3823529411764707e-07, "logits/chosen": -0.7651995420455933, "logits/rejected": -0.7312899827957153, "logps/chosen": -31.04439926147461, "logps/rejected": -37.98454284667969, "loss": 0.6453, "rewards/accuracies": 0.5625, "rewards/chosen": -0.023012787103652954, "rewards/margins": 0.09663936495780945, "rewards/rejected": -0.1196521520614624, "step": 23 }, { "epoch": 0.28444444444444444, "grad_norm": 36.65037386431617, "learning_rate": 3.529411764705882e-07, "logits/chosen": -0.9652918577194214, "logits/rejected": -0.9185481071472168, "logps/chosen": -30.223522186279297, "logps/rejected": -34.86516189575195, "loss": 0.6319, "rewards/accuracies": 0.625, "rewards/chosen": -0.025013744831085205, "rewards/margins": 0.17298102378845215, "rewards/rejected": -0.19799476861953735, "step": 24 }, { "epoch": 0.2962962962962963, "grad_norm": 36.325213836190166, "learning_rate": 3.6764705882352943e-07, "logits/chosen": -0.8377700448036194, "logits/rejected": -0.7563367486000061, "logps/chosen": -19.788166046142578, "logps/rejected": -32.94764709472656, "loss": 0.603, "rewards/accuracies": 0.6875, "rewards/chosen": -0.00691574439406395, "rewards/margins": 0.249167799949646, "rewards/rejected": -0.25608354806900024, "step": 25 }, { "epoch": 0.30814814814814817, "grad_norm": 33.21927808914221, "learning_rate": 3.8235294117647053e-07, "logits/chosen": -0.9247075319290161, "logits/rejected": -0.9600427746772766, "logps/chosen": -22.75655746459961, "logps/rejected": -33.42902374267578, "loss": 0.5963, "rewards/accuracies": 0.6875, "rewards/chosen": 0.012206077575683594, "rewards/margins": 0.18068939447402954, "rewards/rejected": -0.16848331689834595, "step": 26 }, { "epoch": 0.32, "grad_norm": 33.919179281256405, "learning_rate": 3.9705882352941174e-07, "logits/chosen": -1.0090656280517578, "logits/rejected": -0.8680551052093506, "logps/chosen": -27.313983917236328, "logps/rejected": -32.803958892822266, "loss": 0.5868, "rewards/accuracies": 0.8125, "rewards/chosen": 0.006254285573959351, "rewards/margins": 0.3513309061527252, "rewards/rejected": -0.34507662057876587, "step": 27 }, { "epoch": 0.33185185185185184, "grad_norm": 32.88800631538997, "learning_rate": 4.117647058823529e-07, "logits/chosen": -0.7507399320602417, "logits/rejected": -0.6654347777366638, "logps/chosen": -33.17474365234375, "logps/rejected": -37.52992248535156, "loss": 0.5582, "rewards/accuracies": 0.875, "rewards/chosen": -0.018849045038223267, "rewards/margins": 0.5149893164634705, "rewards/rejected": -0.5338383913040161, "step": 28 }, { "epoch": 0.3437037037037037, "grad_norm": 32.187131672205425, "learning_rate": 4.264705882352941e-07, "logits/chosen": -0.9114011526107788, "logits/rejected": -0.7332407236099243, "logps/chosen": -27.552963256835938, "logps/rejected": -33.381103515625, "loss": 0.569, "rewards/accuracies": 0.75, "rewards/chosen": -0.034624576568603516, "rewards/margins": 0.2657691240310669, "rewards/rejected": -0.3003937005996704, "step": 29 }, { "epoch": 0.35555555555555557, "grad_norm": 31.43465207056781, "learning_rate": 4.4117647058823526e-07, "logits/chosen": -1.080330491065979, "logits/rejected": -1.018049716949463, "logps/chosen": -24.93523406982422, "logps/rejected": -33.0054817199707, "loss": 0.5787, "rewards/accuracies": 0.8125, "rewards/chosen": -0.08209644258022308, "rewards/margins": 0.2938240170478821, "rewards/rejected": -0.37592047452926636, "step": 30 }, { "epoch": 0.3674074074074074, "grad_norm": 29.8319071034986, "learning_rate": 4.5588235294117646e-07, "logits/chosen": -0.7354201078414917, "logits/rejected": -0.5976296663284302, "logps/chosen": -20.997676849365234, "logps/rejected": -32.08062744140625, "loss": 0.5421, "rewards/accuracies": 0.6875, "rewards/chosen": -0.06063076853752136, "rewards/margins": 0.5570548176765442, "rewards/rejected": -0.6176855564117432, "step": 31 }, { "epoch": 0.37925925925925924, "grad_norm": 32.37223816472854, "learning_rate": 4.705882352941176e-07, "logits/chosen": -0.9014286398887634, "logits/rejected": -0.868757963180542, "logps/chosen": -23.115407943725586, "logps/rejected": -39.159507751464844, "loss": 0.549, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0811956524848938, "rewards/margins": 0.4106258153915405, "rewards/rejected": -0.4918214678764343, "step": 32 }, { "epoch": 0.39111111111111113, "grad_norm": 30.829704180417103, "learning_rate": 4.852941176470588e-07, "logits/chosen": -0.8415942788124084, "logits/rejected": -0.826940655708313, "logps/chosen": -25.28696060180664, "logps/rejected": -36.247039794921875, "loss": 0.5377, "rewards/accuracies": 0.6875, "rewards/chosen": -0.11577820032835007, "rewards/margins": 0.5010173916816711, "rewards/rejected": -0.616795539855957, "step": 33 }, { "epoch": 0.40296296296296297, "grad_norm": 34.240433832755805, "learning_rate": 5e-07, "logits/chosen": -1.071217656135559, "logits/rejected": -0.8587817549705505, "logps/chosen": -23.079936981201172, "logps/rejected": -32.364227294921875, "loss": 0.554, "rewards/accuracies": 0.75, "rewards/chosen": -0.06819352507591248, "rewards/margins": 0.4207611680030823, "rewards/rejected": -0.48895469307899475, "step": 34 }, { "epoch": 0.4148148148148148, "grad_norm": 31.79158261280939, "learning_rate": 4.999864732969518e-07, "logits/chosen": -1.041569471359253, "logits/rejected": -0.9538137912750244, "logps/chosen": -29.438274383544922, "logps/rejected": -35.4671745300293, "loss": 0.5322, "rewards/accuracies": 0.75, "rewards/chosen": -0.10224419832229614, "rewards/margins": 0.5241090059280396, "rewards/rejected": -0.6263532042503357, "step": 35 }, { "epoch": 0.4266666666666667, "grad_norm": 28.443303374361268, "learning_rate": 4.999458946515807e-07, "logits/chosen": -1.1223492622375488, "logits/rejected": -1.040766954421997, "logps/chosen": -32.29949951171875, "logps/rejected": -41.46755599975586, "loss": 0.5017, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0577593594789505, "rewards/margins": 0.6482563018798828, "rewards/rejected": -0.7060155868530273, "step": 36 }, { "epoch": 0.43851851851851853, "grad_norm": 30.648927045340578, "learning_rate": 4.998782684550491e-07, "logits/chosen": -0.9065847992897034, "logits/rejected": -0.8718705177307129, "logps/chosen": -21.124893188476562, "logps/rejected": -39.29669952392578, "loss": 0.5147, "rewards/accuracies": 0.8125, "rewards/chosen": -0.09413473308086395, "rewards/margins": 0.6028537154197693, "rewards/rejected": -0.6969884634017944, "step": 37 }, { "epoch": 0.45037037037037037, "grad_norm": 29.437195830990852, "learning_rate": 4.997836020254328e-07, "logits/chosen": -0.9325073957443237, "logits/rejected": -0.8846120238304138, "logps/chosen": -27.168790817260742, "logps/rejected": -36.877262115478516, "loss": 0.5122, "rewards/accuracies": 0.75, "rewards/chosen": -0.15598426759243011, "rewards/margins": 0.6510501503944397, "rewards/rejected": -0.807034432888031, "step": 38 }, { "epoch": 0.4622222222222222, "grad_norm": 28.44428517855095, "learning_rate": 4.996619056069291e-07, "logits/chosen": -0.8960347771644592, "logits/rejected": -0.8378150463104248, "logps/chosen": -28.43727684020996, "logps/rejected": -40.62827682495117, "loss": 0.4705, "rewards/accuracies": 0.8125, "rewards/chosen": -0.17505469918251038, "rewards/margins": 0.8592283725738525, "rewards/rejected": -1.0342830419540405, "step": 39 }, { "epoch": 0.4740740740740741, "grad_norm": 29.94537092561941, "learning_rate": 4.995131923687487e-07, "logits/chosen": -0.9718501567840576, "logits/rejected": -0.8560028076171875, "logps/chosen": -29.755184173583984, "logps/rejected": -37.2801399230957, "loss": 0.4835, "rewards/accuracies": 0.8125, "rewards/chosen": -0.11063119769096375, "rewards/margins": 0.7615076899528503, "rewards/rejected": -0.8721388578414917, "step": 40 }, { "epoch": 0.48592592592592593, "grad_norm": 26.638351450808948, "learning_rate": 4.993374784036901e-07, "logits/chosen": -1.006788969039917, "logits/rejected": -0.8062241077423096, "logps/chosen": -27.824739456176758, "logps/rejected": -37.465415954589844, "loss": 0.4489, "rewards/accuracies": 0.75, "rewards/chosen": -0.16224287450313568, "rewards/margins": 0.9281247854232788, "rewards/rejected": -1.0903676748275757, "step": 41 }, { "epoch": 0.49777777777777776, "grad_norm": 29.703403664234436, "learning_rate": 4.991347827263982e-07, "logits/chosen": -1.0439155101776123, "logits/rejected": -0.8992699384689331, "logps/chosen": -28.927303314208984, "logps/rejected": -42.13187026977539, "loss": 0.488, "rewards/accuracies": 0.75, "rewards/chosen": -0.13541710376739502, "rewards/margins": 0.8769669532775879, "rewards/rejected": -1.0123839378356934, "step": 42 }, { "epoch": 0.5096296296296297, "grad_norm": 25.194805243065485, "learning_rate": 4.989051272713069e-07, "logits/chosen": -0.9479715824127197, "logits/rejected": -0.808491051197052, "logps/chosen": -30.748804092407227, "logps/rejected": -48.32786178588867, "loss": 0.4055, "rewards/accuracies": 0.875, "rewards/chosen": -0.0868428647518158, "rewards/margins": 1.7449877262115479, "rewards/rejected": -1.8318307399749756, "step": 43 }, { "epoch": 0.5214814814814814, "grad_norm": 28.50704779191256, "learning_rate": 4.986485368902656e-07, "logits/chosen": -1.003732681274414, "logits/rejected": -0.9534778594970703, "logps/chosen": -25.17104148864746, "logps/rejected": -36.80795669555664, "loss": 0.4687, "rewards/accuracies": 0.75, "rewards/chosen": -0.15720072388648987, "rewards/margins": 0.7120774984359741, "rewards/rejected": -0.8692781925201416, "step": 44 }, { "epoch": 0.5333333333333333, "grad_norm": 26.654378912528262, "learning_rate": 4.983650393498489e-07, "logits/chosen": -0.9796334505081177, "logits/rejected": -0.8810800313949585, "logps/chosen": -34.67963790893555, "logps/rejected": -37.48582077026367, "loss": 0.4059, "rewards/accuracies": 0.875, "rewards/chosen": -0.22126227617263794, "rewards/margins": 1.05548095703125, "rewards/rejected": -1.2767431735992432, "step": 45 }, { "epoch": 0.5451851851851852, "grad_norm": 25.91641243212481, "learning_rate": 4.980546653283537e-07, "logits/chosen": -1.1144230365753174, "logits/rejected": -0.9187833666801453, "logps/chosen": -27.469764709472656, "logps/rejected": -42.77268981933594, "loss": 0.4794, "rewards/accuracies": 0.75, "rewards/chosen": -0.20582953095436096, "rewards/margins": 1.8931379318237305, "rewards/rejected": -2.0989675521850586, "step": 46 }, { "epoch": 0.557037037037037, "grad_norm": 27.616713081396448, "learning_rate": 4.977174484124775e-07, "logits/chosen": -0.9438971877098083, "logits/rejected": -0.9460131525993347, "logps/chosen": -28.729183197021484, "logps/rejected": -30.642105102539062, "loss": 0.4464, "rewards/accuracies": 0.75, "rewards/chosen": -0.13018304109573364, "rewards/margins": 0.8073292970657349, "rewards/rejected": -0.9375122785568237, "step": 47 }, { "epoch": 0.5688888888888889, "grad_norm": 26.228638287015333, "learning_rate": 4.97353425093685e-07, "logits/chosen": -1.2007321119308472, "logits/rejected": -1.0530825853347778, "logps/chosen": -25.535133361816406, "logps/rejected": -35.96273422241211, "loss": 0.4261, "rewards/accuracies": 0.9375, "rewards/chosen": -0.14477074146270752, "rewards/margins": 1.4705314636230469, "rewards/rejected": -1.615302324295044, "step": 48 }, { "epoch": 0.5807407407407408, "grad_norm": 27.447706308710917, "learning_rate": 4.96962634764259e-07, "logits/chosen": -1.0324229001998901, "logits/rejected": -1.000633955001831, "logps/chosen": -31.232351303100586, "logps/rejected": -40.054874420166016, "loss": 0.4274, "rewards/accuracies": 0.75, "rewards/chosen": -0.37540578842163086, "rewards/margins": 0.9162301421165466, "rewards/rejected": -1.2916358709335327, "step": 49 }, { "epoch": 0.5925925925925926, "grad_norm": 27.537626334544292, "learning_rate": 4.965451197130372e-07, "logits/chosen": -1.0934017896652222, "logits/rejected": -0.9698958396911621, "logps/chosen": -25.604278564453125, "logps/rejected": -41.89402770996094, "loss": 0.4418, "rewards/accuracies": 0.875, "rewards/chosen": -0.11558225750923157, "rewards/margins": 1.3389551639556885, "rewards/rejected": -1.4545375108718872, "step": 50 }, { "epoch": 0.6044444444444445, "grad_norm": 26.396954082977054, "learning_rate": 4.961009251208367e-07, "logits/chosen": -1.071451187133789, "logits/rejected": -0.9166553616523743, "logps/chosen": -21.116607666015625, "logps/rejected": -34.15024948120117, "loss": 0.4173, "rewards/accuracies": 0.9375, "rewards/chosen": -0.06339044868946075, "rewards/margins": 1.8111618757247925, "rewards/rejected": -1.8745522499084473, "step": 51 }, { "epoch": 0.6162962962962963, "grad_norm": 24.23948883073191, "learning_rate": 4.956300990555643e-07, "logits/chosen": -1.0040934085845947, "logits/rejected": -0.8644249439239502, "logps/chosen": -24.51968002319336, "logps/rejected": -32.15287399291992, "loss": 0.3977, "rewards/accuracies": 0.8125, "rewards/chosen": -0.1651010513305664, "rewards/margins": 1.301413655281067, "rewards/rejected": -1.4665147066116333, "step": 52 }, { "epoch": 0.6281481481481481, "grad_norm": 29.178528579105812, "learning_rate": 4.951326924670147e-07, "logits/chosen": -0.8935304880142212, "logits/rejected": -0.9188090562820435, "logps/chosen": -29.823339462280273, "logps/rejected": -42.743675231933594, "loss": 0.4615, "rewards/accuracies": 0.8125, "rewards/chosen": -0.31552594900131226, "rewards/margins": 1.0024209022521973, "rewards/rejected": -1.3179469108581543, "step": 53 }, { "epoch": 0.64, "grad_norm": 24.40363992735679, "learning_rate": 4.94608759181358e-07, "logits/chosen": -0.9994638562202454, "logits/rejected": -0.8031306266784668, "logps/chosen": -32.72019577026367, "logps/rejected": -39.62814712524414, "loss": 0.3302, "rewards/accuracies": 0.9375, "rewards/chosen": -0.1551201343536377, "rewards/margins": 1.3950880765914917, "rewards/rejected": -1.5502082109451294, "step": 54 }, { "epoch": 0.6518518518518519, "grad_norm": 27.80698317557724, "learning_rate": 4.940583558953137e-07, "logits/chosen": -1.1568812131881714, "logits/rejected": -1.083202838897705, "logps/chosen": -28.588844299316406, "logps/rejected": -46.40166091918945, "loss": 0.4196, "rewards/accuracies": 0.8125, "rewards/chosen": -0.32859814167022705, "rewards/margins": 1.721780776977539, "rewards/rejected": -2.0503790378570557, "step": 55 }, { "epoch": 0.6637037037037037, "grad_norm": 27.03342498011367, "learning_rate": 4.934815421700164e-07, "logits/chosen": -0.9664996266365051, "logits/rejected": -0.9351974725723267, "logps/chosen": -25.929637908935547, "logps/rejected": -36.615997314453125, "loss": 0.4234, "rewards/accuracies": 0.875, "rewards/chosen": -0.14317776262760162, "rewards/margins": 1.6834478378295898, "rewards/rejected": -1.8266258239746094, "step": 56 }, { "epoch": 0.6755555555555556, "grad_norm": 27.024427262923552, "learning_rate": 4.928783804245699e-07, "logits/chosen": -0.8274962902069092, "logits/rejected": -0.745110273361206, "logps/chosen": -32.589447021484375, "logps/rejected": -34.72138977050781, "loss": 0.3984, "rewards/accuracies": 0.8125, "rewards/chosen": -0.42406025528907776, "rewards/margins": 0.8041820526123047, "rewards/rejected": -1.2282423973083496, "step": 57 }, { "epoch": 0.6874074074074074, "grad_norm": 24.14506468826234, "learning_rate": 4.922489359292927e-07, "logits/chosen": -0.920275866985321, "logits/rejected": -0.7754595279693604, "logps/chosen": -30.828351974487305, "logps/rejected": -49.377220153808594, "loss": 0.3514, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3007601797580719, "rewards/margins": 2.031721830368042, "rewards/rejected": -2.33248233795166, "step": 58 }, { "epoch": 0.6992592592592592, "grad_norm": 22.656374640286362, "learning_rate": 4.915932767986551e-07, "logits/chosen": -1.103749394416809, "logits/rejected": -1.0164357423782349, "logps/chosen": -26.017108917236328, "logps/rejected": -43.8387565612793, "loss": 0.3561, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2933482527732849, "rewards/margins": 1.7674319744110107, "rewards/rejected": -2.0607800483703613, "step": 59 }, { "epoch": 0.7111111111111111, "grad_norm": 24.99314823194104, "learning_rate": 4.909114739839079e-07, "logits/chosen": -0.9634025692939758, "logits/rejected": -0.9252867102622986, "logps/chosen": -23.952117919921875, "logps/rejected": -34.92929458618164, "loss": 0.3598, "rewards/accuracies": 0.875, "rewards/chosen": -0.1855652928352356, "rewards/margins": 1.803605079650879, "rewards/rejected": -1.9891700744628906, "step": 60 }, { "epoch": 0.7229629629629629, "grad_norm": 22.905046033248826, "learning_rate": 4.902036012654048e-07, "logits/chosen": -0.7937788963317871, "logits/rejected": -0.7061766982078552, "logps/chosen": -22.034412384033203, "logps/rejected": -33.86552047729492, "loss": 0.3401, "rewards/accuracies": 0.875, "rewards/chosen": -0.3443925678730011, "rewards/margins": 1.395371913909912, "rewards/rejected": -1.73976469039917, "step": 61 }, { "epoch": 0.7348148148148148, "grad_norm": 25.28725048216447, "learning_rate": 4.894697352446182e-07, "logits/chosen": -1.0165841579437256, "logits/rejected": -1.0237828493118286, "logps/chosen": -24.306283950805664, "logps/rejected": -39.6012077331543, "loss": 0.3453, "rewards/accuracies": 0.8125, "rewards/chosen": -0.09554791450500488, "rewards/margins": 1.746566891670227, "rewards/rejected": -1.8421146869659424, "step": 62 }, { "epoch": 0.7466666666666667, "grad_norm": 27.905008683571545, "learning_rate": 4.887099553358501e-07, "logits/chosen": -1.087665319442749, "logits/rejected": -0.9620079398155212, "logps/chosen": -29.117008209228516, "logps/rejected": -37.334896087646484, "loss": 0.3946, "rewards/accuracies": 0.6875, "rewards/chosen": -0.18033871054649353, "rewards/margins": 1.7729251384735107, "rewards/rejected": -1.953263759613037, "step": 63 }, { "epoch": 0.7585185185185185, "grad_norm": 26.563175740341975, "learning_rate": 4.879243437576383e-07, "logits/chosen": -1.0562440156936646, "logits/rejected": -0.8816579580307007, "logps/chosen": -23.48358726501465, "logps/rejected": -34.346927642822266, "loss": 0.369, "rewards/accuracies": 0.8125, "rewards/chosen": -0.34856918454170227, "rewards/margins": 1.5337965488433838, "rewards/rejected": -1.8823657035827637, "step": 64 }, { "epoch": 0.7703703703703704, "grad_norm": 28.950708662099014, "learning_rate": 4.871129855238588e-07, "logits/chosen": -1.031766653060913, "logits/rejected": -1.0294549465179443, "logps/chosen": -31.139263153076172, "logps/rejected": -41.21425247192383, "loss": 0.3715, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3050842881202698, "rewards/margins": 1.8005170822143555, "rewards/rejected": -2.1056013107299805, "step": 65 }, { "epoch": 0.7822222222222223, "grad_norm": 27.546216408337372, "learning_rate": 4.862759684345269e-07, "logits/chosen": -1.203002691268921, "logits/rejected": -1.0988627672195435, "logps/chosen": -29.396411895751953, "logps/rejected": -35.40150833129883, "loss": 0.3922, "rewards/accuracies": 0.875, "rewards/chosen": -0.12675023078918457, "rewards/margins": 2.0646703243255615, "rewards/rejected": -2.191420316696167, "step": 66 }, { "epoch": 0.794074074074074, "grad_norm": 23.894441975814534, "learning_rate": 4.854133830662955e-07, "logits/chosen": -0.9780765771865845, "logits/rejected": -0.8497614860534668, "logps/chosen": -28.06260871887207, "logps/rejected": -34.55665588378906, "loss": 0.3334, "rewards/accuracies": 1.0, "rewards/chosen": -0.6227935552597046, "rewards/margins": 2.2487592697143555, "rewards/rejected": -2.8715527057647705, "step": 67 }, { "epoch": 0.8059259259259259, "grad_norm": 30.617173652616593, "learning_rate": 4.845253227626536e-07, "logits/chosen": -1.0398799180984497, "logits/rejected": -0.907300591468811, "logps/chosen": -41.52682876586914, "logps/rejected": -43.311920166015625, "loss": 0.4022, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7630512714385986, "rewards/margins": 1.217781662940979, "rewards/rejected": -1.9808329343795776, "step": 68 }, { "epoch": 0.8177777777777778, "grad_norm": 24.025263203043526, "learning_rate": 4.836118836238252e-07, "logits/chosen": -1.1331119537353516, "logits/rejected": -1.0378354787826538, "logps/chosen": -27.220407485961914, "logps/rejected": -41.87384796142578, "loss": 0.3431, "rewards/accuracies": 0.9375, "rewards/chosen": -0.057599157094955444, "rewards/margins": 1.6851834058761597, "rewards/rejected": -1.7427825927734375, "step": 69 }, { "epoch": 0.8296296296296296, "grad_norm": 23.34599437673964, "learning_rate": 4.826731644963704e-07, "logits/chosen": -1.0917811393737793, "logits/rejected": -1.0149914026260376, "logps/chosen": -25.583330154418945, "logps/rejected": -33.85319900512695, "loss": 0.3162, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5846102237701416, "rewards/margins": 1.9573626518249512, "rewards/rejected": -2.5419728755950928, "step": 70 }, { "epoch": 0.8414814814814815, "grad_norm": 24.42006807604626, "learning_rate": 4.817092669624882e-07, "logits/chosen": -1.0650672912597656, "logits/rejected": -0.9445031881332397, "logps/chosen": -22.825862884521484, "logps/rejected": -33.60643768310547, "loss": 0.3745, "rewards/accuracies": 0.875, "rewards/chosen": -0.11327299475669861, "rewards/margins": 2.1697635650634766, "rewards/rejected": -2.283036708831787, "step": 71 }, { "epoch": 0.8533333333333334, "grad_norm": 24.54245031605526, "learning_rate": 4.807202953290243e-07, "logits/chosen": -1.1544904708862305, "logits/rejected": -0.9994347095489502, "logps/chosen": -23.641387939453125, "logps/rejected": -38.42119216918945, "loss": 0.3599, "rewards/accuracies": 0.875, "rewards/chosen": -0.23269107937812805, "rewards/margins": 2.1029093265533447, "rewards/rejected": -2.3356003761291504, "step": 72 }, { "epoch": 0.8651851851851852, "grad_norm": 25.210130682755583, "learning_rate": 4.797063566161834e-07, "logits/chosen": -0.9285881519317627, "logits/rejected": -0.8881164789199829, "logps/chosen": -31.189298629760742, "logps/rejected": -35.99159622192383, "loss": 0.3768, "rewards/accuracies": 0.75, "rewards/chosen": -0.41402971744537354, "rewards/margins": 1.2696895599365234, "rewards/rejected": -1.6837193965911865, "step": 73 }, { "epoch": 0.8770370370370371, "grad_norm": 22.99038510220094, "learning_rate": 4.786675605459487e-07, "logits/chosen": -1.1656837463378906, "logits/rejected": -1.1220611333847046, "logps/chosen": -28.37079620361328, "logps/rejected": -45.16815185546875, "loss": 0.3318, "rewards/accuracies": 0.75, "rewards/chosen": -0.2702009975910187, "rewards/margins": 2.232954978942871, "rewards/rejected": -2.5031557083129883, "step": 74 }, { "epoch": 0.8888888888888888, "grad_norm": 24.11796136324434, "learning_rate": 4.776040195302079e-07, "logits/chosen": -1.112859845161438, "logits/rejected": -0.9862438440322876, "logps/chosen": -22.272464752197266, "logps/rejected": -35.39492416381836, "loss": 0.3439, "rewards/accuracies": 0.8125, "rewards/chosen": -0.34517136216163635, "rewards/margins": 2.139002561569214, "rewards/rejected": -2.4841737747192383, "step": 75 }, { "epoch": 0.9007407407407407, "grad_norm": 29.8497129464844, "learning_rate": 4.76515848658589e-07, "logits/chosen": -1.182924747467041, "logits/rejected": -1.0297247171401978, "logps/chosen": -30.078699111938477, "logps/rejected": -39.582275390625, "loss": 0.3452, "rewards/accuracies": 0.875, "rewards/chosen": -0.5807650089263916, "rewards/margins": 2.0797762870788574, "rewards/rejected": -2.660541534423828, "step": 76 }, { "epoch": 0.9125925925925926, "grad_norm": 25.533689636810493, "learning_rate": 4.754031656860059e-07, "logits/chosen": -1.0601996183395386, "logits/rejected": -0.968002200126648, "logps/chosen": -25.98404312133789, "logps/rejected": -29.14290428161621, "loss": 0.3515, "rewards/accuracies": 0.875, "rewards/chosen": -0.17048078775405884, "rewards/margins": 1.8824352025985718, "rewards/rejected": -2.0529160499572754, "step": 77 }, { "epoch": 0.9244444444444444, "grad_norm": 21.394058422904486, "learning_rate": 4.74266091019916e-07, "logits/chosen": -1.1088751554489136, "logits/rejected": -0.9137270450592041, "logps/chosen": -28.85074806213379, "logps/rejected": -34.893470764160156, "loss": 0.2988, "rewards/accuracies": 0.875, "rewards/chosen": -0.05692651867866516, "rewards/margins": 1.6240626573562622, "rewards/rejected": -1.6809892654418945, "step": 78 }, { "epoch": 0.9362962962962963, "grad_norm": 25.697276730733257, "learning_rate": 4.7310474770728996e-07, "logits/chosen": -1.2263762950897217, "logits/rejected": -1.1397736072540283, "logps/chosen": -28.09562873840332, "logps/rejected": -35.75029754638672, "loss": 0.3664, "rewards/accuracies": 0.75, "rewards/chosen": -0.18649393320083618, "rewards/margins": 1.1695051193237305, "rewards/rejected": -1.3559989929199219, "step": 79 }, { "epoch": 0.9481481481481482, "grad_norm": 21.662832078683152, "learning_rate": 4.719192614212969e-07, "logits/chosen": -0.9513252377510071, "logits/rejected": -0.9007601141929626, "logps/chosen": -34.18433380126953, "logps/rejected": -53.043609619140625, "loss": 0.2814, "rewards/accuracies": 0.875, "rewards/chosen": -0.33905377984046936, "rewards/margins": 2.0920355319976807, "rewards/rejected": -2.431089401245117, "step": 80 }, { "epoch": 0.96, "grad_norm": 24.69839835625674, "learning_rate": 4.707097604477045e-07, "logits/chosen": -1.1311062574386597, "logits/rejected": -0.9999745488166809, "logps/chosen": -32.54650115966797, "logps/rejected": -34.888450622558594, "loss": 0.3278, "rewards/accuracies": 1.0, "rewards/chosen": -0.328436940908432, "rewards/margins": 1.7844316959381104, "rewards/rejected": -2.112868547439575, "step": 81 }, { "epoch": 0.9718518518518519, "grad_norm": 21.411242391551657, "learning_rate": 4.694763756709967e-07, "logits/chosen": -1.1982715129852295, "logits/rejected": -1.1674623489379883, "logps/chosen": -28.029937744140625, "logps/rejected": -37.19408416748047, "loss": 0.2882, "rewards/accuracies": 0.75, "rewards/chosen": -0.3920401930809021, "rewards/margins": 1.973564624786377, "rewards/rejected": -2.365604877471924, "step": 82 }, { "epoch": 0.9837037037037037, "grad_norm": 21.7744311573738, "learning_rate": 4.6821924056021053e-07, "logits/chosen": -1.0800765752792358, "logits/rejected": -0.9170486330986023, "logps/chosen": -22.360857009887695, "logps/rejected": -41.66752624511719, "loss": 0.3088, "rewards/accuracies": 0.875, "rewards/chosen": -0.21913698315620422, "rewards/margins": 2.08003830909729, "rewards/rejected": -2.299175262451172, "step": 83 }, { "epoch": 0.9955555555555555, "grad_norm": 24.355082987137063, "learning_rate": 4.669384911544926e-07, "logits/chosen": -1.06318199634552, "logits/rejected": -1.0848791599273682, "logps/chosen": -24.275285720825195, "logps/rejected": -37.596893310546875, "loss": 0.3674, "rewards/accuracies": 0.875, "rewards/chosen": -0.2712962031364441, "rewards/margins": 1.7089827060699463, "rewards/rejected": -1.9802789688110352, "step": 84 }, { "epoch": 1.0074074074074073, "grad_norm": 22.616093539594576, "learning_rate": 4.6563426604837817e-07, "logits/chosen": -1.2081141471862793, "logits/rejected": -0.9877020716667175, "logps/chosen": -34.070823669433594, "logps/rejected": -40.52888107299805, "loss": 0.2829, "rewards/accuracies": 1.0, "rewards/chosen": -0.10701459646224976, "rewards/margins": 3.043393611907959, "rewards/rejected": -3.1504077911376953, "step": 85 }, { "epoch": 1.0192592592592593, "grad_norm": 16.700104066458838, "learning_rate": 4.6430670637679294e-07, "logits/chosen": -1.0600411891937256, "logits/rejected": -0.8425652384757996, "logps/chosen": -22.52095603942871, "logps/rejected": -33.55463409423828, "loss": 0.2269, "rewards/accuracies": 1.0, "rewards/chosen": 0.14321041107177734, "rewards/margins": 2.309981346130371, "rewards/rejected": -2.1667709350585938, "step": 86 }, { "epoch": 1.031111111111111, "grad_norm": 15.240586856186553, "learning_rate": 4.629559557997804e-07, "logits/chosen": -1.3102786540985107, "logits/rejected": -1.143240213394165, "logps/chosen": -31.257415771484375, "logps/rejected": -47.26383590698242, "loss": 0.1831, "rewards/accuracies": 1.0, "rewards/chosen": -0.7230758666992188, "rewards/margins": 3.270418882369995, "rewards/rejected": -3.993495225906372, "step": 87 }, { "epoch": 1.0429629629629629, "grad_norm": 14.157542057104557, "learning_rate": 4.615821604869563e-07, "logits/chosen": -1.094043254852295, "logits/rejected": -0.8985757827758789, "logps/chosen": -28.409828186035156, "logps/rejected": -47.5828971862793, "loss": 0.1842, "rewards/accuracies": 0.9375, "rewards/chosen": -0.18681968748569489, "rewards/margins": 3.8075270652770996, "rewards/rejected": -3.9943466186523438, "step": 88 }, { "epoch": 1.0548148148148149, "grad_norm": 17.38420675108177, "learning_rate": 4.6018546910169067e-07, "logits/chosen": -1.0334746837615967, "logits/rejected": -0.9715449810028076, "logps/chosen": -25.995702743530273, "logps/rejected": -38.42037582397461, "loss": 0.2053, "rewards/accuracies": 0.875, "rewards/chosen": -0.36298614740371704, "rewards/margins": 2.626688241958618, "rewards/rejected": -2.9896743297576904, "step": 89 }, { "epoch": 1.0666666666666667, "grad_norm": 16.906629376553013, "learning_rate": 4.5876603278502027e-07, "logits/chosen": -1.0619425773620605, "logits/rejected": -0.9389445781707764, "logps/chosen": -28.09102439880371, "logps/rejected": -51.08159255981445, "loss": 0.2098, "rewards/accuracies": 0.9375, "rewards/chosen": 0.0606449693441391, "rewards/margins": 3.6463186740875244, "rewards/rejected": -3.5856735706329346, "step": 90 }, { "epoch": 1.0785185185185184, "grad_norm": 18.96732689014115, "learning_rate": 4.573240051392935e-07, "logits/chosen": -0.9454656839370728, "logits/rejected": -0.9307714700698853, "logps/chosen": -26.379640579223633, "logps/rejected": -37.363258361816406, "loss": 0.238, "rewards/accuracies": 0.9375, "rewards/chosen": -0.14031583070755005, "rewards/margins": 2.1791586875915527, "rewards/rejected": -2.319474458694458, "step": 91 }, { "epoch": 1.0903703703703704, "grad_norm": 16.671437504434632, "learning_rate": 4.5585954221154853e-07, "logits/chosen": -1.3018877506256104, "logits/rejected": -1.1478052139282227, "logps/chosen": -25.605445861816406, "logps/rejected": -44.80401611328125, "loss": 0.2076, "rewards/accuracies": 1.0, "rewards/chosen": -0.03381985425949097, "rewards/margins": 3.086803436279297, "rewards/rejected": -3.1206235885620117, "step": 92 }, { "epoch": 1.1022222222222222, "grad_norm": 16.654640941302485, "learning_rate": 4.5437280247662646e-07, "logits/chosen": -1.0023672580718994, "logits/rejected": -0.9070078134536743, "logps/chosen": -29.185150146484375, "logps/rejected": -37.990234375, "loss": 0.1961, "rewards/accuracies": 1.0, "rewards/chosen": -0.18216750025749207, "rewards/margins": 2.39959716796875, "rewards/rejected": -2.5817646980285645, "step": 93 }, { "epoch": 1.114074074074074, "grad_norm": 16.12699044310946, "learning_rate": 4.528639468200226e-07, "logits/chosen": -1.1345858573913574, "logits/rejected": -1.107000470161438, "logps/chosen": -28.13390350341797, "logps/rejected": -36.65238571166992, "loss": 0.204, "rewards/accuracies": 0.9375, "rewards/chosen": 0.10749045014381409, "rewards/margins": 2.2392215728759766, "rewards/rejected": -2.1317310333251953, "step": 94 }, { "epoch": 1.125925925925926, "grad_norm": 14.378767798932659, "learning_rate": 4.5133313852047613e-07, "logits/chosen": -1.058295726776123, "logits/rejected": -1.0083810091018677, "logps/chosen": -27.640762329101562, "logps/rejected": -42.5653076171875, "loss": 0.1812, "rewards/accuracies": 1.0, "rewards/chosen": 0.1484062671661377, "rewards/margins": 2.2676548957824707, "rewards/rejected": -2.119248390197754, "step": 95 }, { "epoch": 1.1377777777777778, "grad_norm": 20.808144652094654, "learning_rate": 4.4978054323230144e-07, "logits/chosen": -1.0242708921432495, "logits/rejected": -0.9334837198257446, "logps/chosen": -24.16075897216797, "logps/rejected": -34.90480041503906, "loss": 0.241, "rewards/accuracies": 0.9375, "rewards/chosen": 0.17029838263988495, "rewards/margins": 2.3052542209625244, "rewards/rejected": -2.134955406188965, "step": 96 }, { "epoch": 1.1496296296296296, "grad_norm": 14.579273235897853, "learning_rate": 4.482063289674618e-07, "logits/chosen": -1.0504794120788574, "logits/rejected": -0.9864072799682617, "logps/chosen": -25.85841178894043, "logps/rejected": -44.5855598449707, "loss": 0.1552, "rewards/accuracies": 1.0, "rewards/chosen": 0.2036604881286621, "rewards/margins": 3.2001941204071045, "rewards/rejected": -2.9965333938598633, "step": 97 }, { "epoch": 1.1614814814814816, "grad_norm": 14.479069724776132, "learning_rate": 4.466106660773884e-07, "logits/chosen": -1.2236568927764893, "logits/rejected": -1.0246343612670898, "logps/chosen": -30.013458251953125, "logps/rejected": -40.343631744384766, "loss": 0.176, "rewards/accuracies": 0.9375, "rewards/chosen": -0.13774560391902924, "rewards/margins": 2.9517884254455566, "rewards/rejected": -3.089534044265747, "step": 98 }, { "epoch": 1.1733333333333333, "grad_norm": 16.052170855559773, "learning_rate": 4.44993727234546e-07, "logits/chosen": -1.102075457572937, "logits/rejected": -0.9819889664649963, "logps/chosen": -30.00847816467285, "logps/rejected": -35.746273040771484, "loss": 0.1827, "rewards/accuracies": 1.0, "rewards/chosen": -0.23734648525714874, "rewards/margins": 2.4544928073883057, "rewards/rejected": -2.6918392181396484, "step": 99 }, { "epoch": 1.1851851851851851, "grad_norm": 14.71406650743676, "learning_rate": 4.4335568741374695e-07, "logits/chosen": -1.3955886363983154, "logits/rejected": -1.1072180271148682, "logps/chosen": -29.151214599609375, "logps/rejected": -35.26973342895508, "loss": 0.1753, "rewards/accuracies": 0.875, "rewards/chosen": 0.24000686407089233, "rewards/margins": 2.8170034885406494, "rewards/rejected": -2.576996326446533, "step": 100 }, { "epoch": 1.1970370370370371, "grad_norm": 15.185117866368294, "learning_rate": 4.4169672387321735e-07, "logits/chosen": -0.9774000644683838, "logits/rejected": -0.8965713977813721, "logps/chosen": -28.971498489379883, "logps/rejected": -42.8656120300293, "loss": 0.1719, "rewards/accuracies": 0.9375, "rewards/chosen": 0.21834176778793335, "rewards/margins": 3.638746738433838, "rewards/rejected": -3.4204049110412598, "step": 101 }, { "epoch": 1.208888888888889, "grad_norm": 19.818913364910017, "learning_rate": 4.4001701613541454e-07, "logits/chosen": -0.9378620982170105, "logits/rejected": -0.8033993244171143, "logps/chosen": -25.265066146850586, "logps/rejected": -34.76940155029297, "loss": 0.2147, "rewards/accuracies": 0.8125, "rewards/chosen": 0.2527243196964264, "rewards/margins": 2.660951852798462, "rewards/rejected": -2.4082274436950684, "step": 102 }, { "epoch": 1.2207407407407407, "grad_norm": 17.017386662283865, "learning_rate": 4.383167459676008e-07, "logits/chosen": -1.101958155632019, "logits/rejected": -1.0334186553955078, "logps/chosen": -27.581031799316406, "logps/rejected": -41.83063507080078, "loss": 0.2141, "rewards/accuracies": 0.9375, "rewards/chosen": 0.033310309052467346, "rewards/margins": 2.295804262161255, "rewards/rejected": -2.2624940872192383, "step": 103 }, { "epoch": 1.2325925925925927, "grad_norm": 15.01263977310487, "learning_rate": 4.365960973621734e-07, "logits/chosen": -1.261305570602417, "logits/rejected": -1.1650094985961914, "logps/chosen": -21.846336364746094, "logps/rejected": -38.35143280029297, "loss": 0.1664, "rewards/accuracies": 1.0, "rewards/chosen": 0.08976972103118896, "rewards/margins": 2.9284555912017822, "rewards/rejected": -2.838685989379883, "step": 104 }, { "epoch": 1.2444444444444445, "grad_norm": 15.499811043472015, "learning_rate": 4.348552565167542e-07, "logits/chosen": -0.9682034850120544, "logits/rejected": -0.8779630064964294, "logps/chosen": -26.32052993774414, "logps/rejected": -33.074302673339844, "loss": 0.1766, "rewards/accuracies": 1.0, "rewards/chosen": -0.027928471565246582, "rewards/margins": 2.689946413040161, "rewards/rejected": -2.717874765396118, "step": 105 }, { "epoch": 1.2562962962962962, "grad_norm": 16.751326465749557, "learning_rate": 4.330944118140406e-07, "logits/chosen": -0.9463189840316772, "logits/rejected": -0.8563187718391418, "logps/chosen": -29.297607421875, "logps/rejected": -38.705177307128906, "loss": 0.1839, "rewards/accuracies": 0.9375, "rewards/chosen": 0.17362913489341736, "rewards/margins": 2.7915725708007812, "rewards/rejected": -2.617943286895752, "step": 106 }, { "epoch": 1.268148148148148, "grad_norm": 13.213493074609195, "learning_rate": 4.313137538014198e-07, "logits/chosen": -1.0986582040786743, "logits/rejected": -0.9737260937690735, "logps/chosen": -25.97295570373535, "logps/rejected": -27.29983901977539, "loss": 0.1545, "rewards/accuracies": 1.0, "rewards/chosen": 0.4464994966983795, "rewards/margins": 2.5145790576934814, "rewards/rejected": -2.0680792331695557, "step": 107 }, { "epoch": 1.28, "grad_norm": 21.550277344518772, "learning_rate": 4.295134751703492e-07, "logits/chosen": -0.9147591590881348, "logits/rejected": -0.8136166334152222, "logps/chosen": -39.372562408447266, "logps/rejected": -40.19895935058594, "loss": 0.2066, "rewards/accuracies": 0.875, "rewards/chosen": -0.10953384637832642, "rewards/margins": 3.0302987098693848, "rewards/rejected": -3.1398324966430664, "step": 108 }, { "epoch": 1.2918518518518518, "grad_norm": 15.95008980481358, "learning_rate": 4.276937707355044e-07, "logits/chosen": -1.119678020477295, "logits/rejected": -0.9529648423194885, "logps/chosen": -29.550357818603516, "logps/rejected": -40.979732513427734, "loss": 0.1793, "rewards/accuracies": 1.0, "rewards/chosen": 0.23076438903808594, "rewards/margins": 3.9992775917053223, "rewards/rejected": -3.7685132026672363, "step": 109 }, { "epoch": 1.3037037037037038, "grad_norm": 14.896618310434517, "learning_rate": 4.2585483741369755e-07, "logits/chosen": -1.1377118825912476, "logits/rejected": -1.0649988651275635, "logps/chosen": -20.728757858276367, "logps/rejected": -42.846527099609375, "loss": 0.1515, "rewards/accuracies": 1.0, "rewards/chosen": -0.1395069807767868, "rewards/margins": 3.1843342781066895, "rewards/rejected": -3.3238413333892822, "step": 110 }, { "epoch": 1.3155555555555556, "grad_norm": 15.287898186475319, "learning_rate": 4.239968742025684e-07, "logits/chosen": -0.9551693797111511, "logits/rejected": -0.8516461253166199, "logps/chosen": -22.917587280273438, "logps/rejected": -43.595619201660156, "loss": 0.184, "rewards/accuracies": 0.9375, "rewards/chosen": -0.1407267451286316, "rewards/margins": 3.2762203216552734, "rewards/rejected": -3.4169468879699707, "step": 111 }, { "epoch": 1.3274074074074074, "grad_norm": 13.13930765742771, "learning_rate": 4.2212008215905e-07, "logits/chosen": -1.309780240058899, "logits/rejected": -1.1697163581848145, "logps/chosen": -23.579864501953125, "logps/rejected": -39.38568115234375, "loss": 0.1529, "rewards/accuracies": 1.0, "rewards/chosen": -0.12219972908496857, "rewards/margins": 2.721135139465332, "rewards/rejected": -2.843334913253784, "step": 112 }, { "epoch": 1.3392592592592591, "grad_norm": 16.93467958306283, "learning_rate": 4.2022466437761154e-07, "logits/chosen": -1.0195517539978027, "logits/rejected": -0.9710554480552673, "logps/chosen": -27.96396255493164, "logps/rejected": -39.36810302734375, "loss": 0.1946, "rewards/accuracies": 0.9375, "rewards/chosen": -0.13629719614982605, "rewards/margins": 1.8954023122787476, "rewards/rejected": -2.0316996574401855, "step": 113 }, { "epoch": 1.3511111111111112, "grad_norm": 16.185982425906115, "learning_rate": 4.18310825968281e-07, "logits/chosen": -1.085777997970581, "logits/rejected": -1.0098400115966797, "logps/chosen": -31.38774871826172, "logps/rejected": -44.18259811401367, "loss": 0.1856, "rewards/accuracies": 0.9375, "rewards/chosen": -0.33986663818359375, "rewards/margins": 3.2784461975097656, "rewards/rejected": -3.618312358856201, "step": 114 }, { "epoch": 1.362962962962963, "grad_norm": 14.370437677602862, "learning_rate": 4.1637877403444923e-07, "logits/chosen": -1.1370917558670044, "logits/rejected": -1.076406478881836, "logps/chosen": -21.368831634521484, "logps/rejected": -37.987247467041016, "loss": 0.1862, "rewards/accuracies": 1.0, "rewards/chosen": 0.27133771777153015, "rewards/margins": 3.750422716140747, "rewards/rejected": -3.4790849685668945, "step": 115 }, { "epoch": 1.374814814814815, "grad_norm": 14.315285669788084, "learning_rate": 4.144287176504582e-07, "logits/chosen": -1.0781633853912354, "logits/rejected": -0.9295682907104492, "logps/chosen": -27.247238159179688, "logps/rejected": -39.297607421875, "loss": 0.1807, "rewards/accuracies": 1.0, "rewards/chosen": 0.07360666990280151, "rewards/margins": 2.9343483448028564, "rewards/rejected": -2.860741376876831, "step": 116 }, { "epoch": 1.3866666666666667, "grad_norm": 13.224703617010858, "learning_rate": 4.1246086783897713e-07, "logits/chosen": -1.143677830696106, "logits/rejected": -1.035298228263855, "logps/chosen": -21.692089080810547, "logps/rejected": -39.77001953125, "loss": 0.1324, "rewards/accuracies": 1.0, "rewards/chosen": 0.11114715039730072, "rewards/margins": 4.035545825958252, "rewards/rejected": -3.924398422241211, "step": 117 }, { "epoch": 1.3985185185185185, "grad_norm": 13.386330467851073, "learning_rate": 4.104754375481664e-07, "logits/chosen": -1.1449244022369385, "logits/rejected": -1.0441653728485107, "logps/chosen": -24.610374450683594, "logps/rejected": -36.322635650634766, "loss": 0.148, "rewards/accuracies": 0.875, "rewards/chosen": -0.24931076169013977, "rewards/margins": 2.860081195831299, "rewards/rejected": -3.1093921661376953, "step": 118 }, { "epoch": 1.4103703703703703, "grad_norm": 17.903128810468665, "learning_rate": 4.084726416286337e-07, "logits/chosen": -1.1355631351470947, "logits/rejected": -1.0569454431533813, "logps/chosen": -22.172731399536133, "logps/rejected": -38.71437072753906, "loss": 0.1681, "rewards/accuracies": 1.0, "rewards/chosen": -0.004118114709854126, "rewards/margins": 3.1719160079956055, "rewards/rejected": -3.1760339736938477, "step": 119 }, { "epoch": 1.4222222222222223, "grad_norm": 14.325608299731273, "learning_rate": 4.0645269681018434e-07, "logits/chosen": -1.2059547901153564, "logits/rejected": -1.132045030593872, "logps/chosen": -24.006052017211914, "logps/rejected": -37.643314361572266, "loss": 0.1583, "rewards/accuracies": 1.0, "rewards/chosen": -0.3129858076572418, "rewards/margins": 2.9086873531341553, "rewards/rejected": -3.221672773361206, "step": 120 }, { "epoch": 1.434074074074074, "grad_norm": 13.002484277938684, "learning_rate": 4.044158216783684e-07, "logits/chosen": -1.369994044303894, "logits/rejected": -1.179801344871521, "logps/chosen": -28.838666915893555, "logps/rejected": -49.269287109375, "loss": 0.1372, "rewards/accuracies": 0.9375, "rewards/chosen": -0.21779999136924744, "rewards/margins": 4.504581928253174, "rewards/rejected": -4.722381591796875, "step": 121 }, { "epoch": 1.445925925925926, "grad_norm": 16.113792921785464, "learning_rate": 4.0236223665082605e-07, "logits/chosen": -1.1226955652236938, "logits/rejected": -1.0712882280349731, "logps/chosen": -21.75322151184082, "logps/rejected": -35.07586669921875, "loss": 0.1625, "rewards/accuracies": 1.0, "rewards/chosen": 0.1918860822916031, "rewards/margins": 3.3565304279327393, "rewards/rejected": -3.164644479751587, "step": 122 }, { "epoch": 1.4577777777777778, "grad_norm": 11.423804755471494, "learning_rate": 4.0029216395343617e-07, "logits/chosen": -1.0564236640930176, "logits/rejected": -0.9565566778182983, "logps/chosen": -27.292240142822266, "logps/rejected": -41.23828887939453, "loss": 0.1276, "rewards/accuracies": 0.9375, "rewards/chosen": -0.42787694931030273, "rewards/margins": 3.2124743461608887, "rewards/rejected": -3.6403515338897705, "step": 123 }, { "epoch": 1.4696296296296296, "grad_norm": 11.96487396864106, "learning_rate": 3.982058275962682e-07, "logits/chosen": -1.2627426385879517, "logits/rejected": -1.163001298904419, "logps/chosen": -20.64603614807129, "logps/rejected": -39.54261016845703, "loss": 0.1485, "rewards/accuracies": 0.9375, "rewards/chosen": 0.4101359248161316, "rewards/margins": 2.903512716293335, "rewards/rejected": -2.4933767318725586, "step": 124 }, { "epoch": 1.4814814814814814, "grad_norm": 13.800579072803204, "learning_rate": 3.9610345334934094e-07, "logits/chosen": -1.2117929458618164, "logits/rejected": -0.9392006993293762, "logps/chosen": -28.66204071044922, "logps/rejected": -40.63731002807617, "loss": 0.1596, "rewards/accuracies": 1.0, "rewards/chosen": 0.1593039333820343, "rewards/margins": 3.4954304695129395, "rewards/rejected": -3.3361263275146484, "step": 125 }, { "epoch": 1.4933333333333334, "grad_norm": 12.680404338446278, "learning_rate": 3.939852687181915e-07, "logits/chosen": -1.1634321212768555, "logits/rejected": -1.0764764547348022, "logps/chosen": -24.423765182495117, "logps/rejected": -45.39548873901367, "loss": 0.1324, "rewards/accuracies": 1.0, "rewards/chosen": 0.17568892240524292, "rewards/margins": 4.0248494148254395, "rewards/rejected": -3.8491601943969727, "step": 126 }, { "epoch": 1.5051851851851852, "grad_norm": 13.14161578490378, "learning_rate": 3.9185150291925585e-07, "logits/chosen": -1.0429072380065918, "logits/rejected": -1.0684268474578857, "logps/chosen": -26.456886291503906, "logps/rejected": -39.13412094116211, "loss": 0.1397, "rewards/accuracies": 0.9375, "rewards/chosen": -0.43806853890419006, "rewards/margins": 3.234588146209717, "rewards/rejected": -3.672656536102295, "step": 127 }, { "epoch": 1.5170370370370372, "grad_norm": 14.252517134892512, "learning_rate": 3.8970238685506486e-07, "logits/chosen": -1.0745394229888916, "logits/rejected": -1.0680888891220093, "logps/chosen": -26.106287002563477, "logps/rejected": -45.78963088989258, "loss": 0.1535, "rewards/accuracies": 0.9375, "rewards/chosen": 0.10547050833702087, "rewards/margins": 3.6777379512786865, "rewards/rejected": -3.5722672939300537, "step": 128 }, { "epoch": 1.528888888888889, "grad_norm": 13.410270453749325, "learning_rate": 3.8753815308925685e-07, "logits/chosen": -1.3084537982940674, "logits/rejected": -1.1879018545150757, "logps/chosen": -22.162595748901367, "logps/rejected": -42.90380096435547, "loss": 0.1354, "rewards/accuracies": 0.9375, "rewards/chosen": -0.29345619678497314, "rewards/margins": 3.8301730155944824, "rewards/rejected": -4.123629570007324, "step": 129 }, { "epoch": 1.5407407407407407, "grad_norm": 16.65901363698597, "learning_rate": 3.8535903582141184e-07, "logits/chosen": -1.1705418825149536, "logits/rejected": -1.053526520729065, "logps/chosen": -22.083023071289062, "logps/rejected": -43.40499496459961, "loss": 0.1819, "rewards/accuracies": 1.0, "rewards/chosen": 0.09943583607673645, "rewards/margins": 3.597656011581421, "rewards/rejected": -3.498220443725586, "step": 130 }, { "epoch": 1.5525925925925925, "grad_norm": 15.81048973784746, "learning_rate": 3.8316527086170727e-07, "logits/chosen": -1.1002339124679565, "logits/rejected": -0.9635283946990967, "logps/chosen": -22.6536865234375, "logps/rejected": -35.75001907348633, "loss": 0.1862, "rewards/accuracies": 0.9375, "rewards/chosen": 0.08621586859226227, "rewards/margins": 3.319308280944824, "rewards/rejected": -3.2330923080444336, "step": 131 }, { "epoch": 1.5644444444444443, "grad_norm": 13.934303626010081, "learning_rate": 3.809570956054003e-07, "logits/chosen": -1.2058043479919434, "logits/rejected": -1.1326546669006348, "logps/chosen": -20.698150634765625, "logps/rejected": -43.496559143066406, "loss": 0.1502, "rewards/accuracies": 0.875, "rewards/chosen": -0.10407552123069763, "rewards/margins": 4.241490364074707, "rewards/rejected": -4.3455657958984375, "step": 132 }, { "epoch": 1.5762962962962963, "grad_norm": 13.808397445470401, "learning_rate": 3.787347490071389e-07, "logits/chosen": -1.2017699480056763, "logits/rejected": -1.1394281387329102, "logps/chosen": -29.24155044555664, "logps/rejected": -45.46855163574219, "loss": 0.1565, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3030049204826355, "rewards/margins": 3.9124467372894287, "rewards/rejected": -4.215451240539551, "step": 133 }, { "epoch": 1.5881481481481483, "grad_norm": 13.152290267087837, "learning_rate": 3.764984715551031e-07, "logits/chosen": -1.1422480344772339, "logits/rejected": -1.053503155708313, "logps/chosen": -20.119190216064453, "logps/rejected": -41.04280090332031, "loss": 0.1632, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04641704261302948, "rewards/margins": 3.333278179168701, "rewards/rejected": -3.379695415496826, "step": 134 }, { "epoch": 1.6, "grad_norm": 12.384641280044091, "learning_rate": 3.7424850524498113e-07, "logits/chosen": -1.1235531568527222, "logits/rejected": -1.016575574874878, "logps/chosen": -23.927431106567383, "logps/rejected": -38.624183654785156, "loss": 0.1505, "rewards/accuracies": 0.9375, "rewards/chosen": -0.145728200674057, "rewards/margins": 3.4623892307281494, "rewards/rejected": -3.6081173419952393, "step": 135 }, { "epoch": 1.6118518518518519, "grad_norm": 13.297788267005293, "learning_rate": 3.7198509355378207e-07, "logits/chosen": -1.1904593706130981, "logits/rejected": -1.0650973320007324, "logps/chosen": -30.460954666137695, "logps/rejected": -35.29721450805664, "loss": 0.1623, "rewards/accuracies": 0.875, "rewards/chosen": -0.5998407602310181, "rewards/margins": 2.190915822982788, "rewards/rejected": -2.7907564640045166, "step": 136 }, { "epoch": 1.6237037037037036, "grad_norm": 17.654879145447634, "learning_rate": 3.6970848141348855e-07, "logits/chosen": -1.2997840642929077, "logits/rejected": -1.1812993288040161, "logps/chosen": -29.659500122070312, "logps/rejected": -39.244354248046875, "loss": 0.1878, "rewards/accuracies": 0.9375, "rewards/chosen": -0.24524670839309692, "rewards/margins": 3.048208713531494, "rewards/rejected": -3.2934556007385254, "step": 137 }, { "epoch": 1.6355555555555554, "grad_norm": 9.713259026639975, "learning_rate": 3.6741891518455146e-07, "logits/chosen": -1.0600968599319458, "logits/rejected": -0.9694119691848755, "logps/chosen": -26.941146850585938, "logps/rejected": -45.241539001464844, "loss": 0.099, "rewards/accuracies": 1.0, "rewards/chosen": -0.2543194591999054, "rewards/margins": 3.474762201309204, "rewards/rejected": -3.729081392288208, "step": 138 }, { "epoch": 1.6474074074074074, "grad_norm": 11.146298314879976, "learning_rate": 3.6511664262923094e-07, "logits/chosen": -1.1857203245162964, "logits/rejected": -1.1235812902450562, "logps/chosen": -20.542293548583984, "logps/rejected": -38.22064971923828, "loss": 0.1272, "rewards/accuracies": 0.9375, "rewards/chosen": -0.22452278435230255, "rewards/margins": 3.8128674030303955, "rewards/rejected": -4.037390232086182, "step": 139 }, { "epoch": 1.6592592592592592, "grad_norm": 11.77226347660767, "learning_rate": 3.6280191288478435e-07, "logits/chosen": -1.2729771137237549, "logits/rejected": -1.1265182495117188, "logps/chosen": -26.0278377532959, "logps/rejected": -44.57939147949219, "loss": 0.1158, "rewards/accuracies": 1.0, "rewards/chosen": -0.20267322659492493, "rewards/margins": 3.678438901901245, "rewards/rejected": -3.8811120986938477, "step": 140 }, { "epoch": 1.6711111111111112, "grad_norm": 12.442016266819769, "learning_rate": 3.604749764365069e-07, "logits/chosen": -1.1912599802017212, "logits/rejected": -1.084775686264038, "logps/chosen": -20.05962371826172, "logps/rejected": -39.900665283203125, "loss": 0.1196, "rewards/accuracies": 1.0, "rewards/chosen": -0.07022520899772644, "rewards/margins": 3.820122718811035, "rewards/rejected": -3.890347957611084, "step": 141 }, { "epoch": 1.682962962962963, "grad_norm": 14.443169294013128, "learning_rate": 3.5813608509062526e-07, "logits/chosen": -0.998296856880188, "logits/rejected": -1.11066472530365, "logps/chosen": -26.359149932861328, "logps/rejected": -48.0468635559082, "loss": 0.1386, "rewards/accuracies": 1.0, "rewards/chosen": -0.2483871877193451, "rewards/margins": 3.9444689750671387, "rewards/rejected": -4.192856311798096, "step": 142 }, { "epoch": 1.6948148148148148, "grad_norm": 12.88438627763912, "learning_rate": 3.557854919470491e-07, "logits/chosen": -1.1343494653701782, "logits/rejected": -1.1029855012893677, "logps/chosen": -32.05289077758789, "logps/rejected": -38.77518081665039, "loss": 0.1465, "rewards/accuracies": 1.0, "rewards/chosen": -0.08383223414421082, "rewards/margins": 2.9178643226623535, "rewards/rejected": -3.001697063446045, "step": 143 }, { "epoch": 1.7066666666666666, "grad_norm": 12.409012501344572, "learning_rate": 3.5342345137198206e-07, "logits/chosen": -1.0480347871780396, "logits/rejected": -0.9312314391136169, "logps/chosen": -30.324771881103516, "logps/rejected": -36.17607116699219, "loss": 0.1341, "rewards/accuracies": 0.875, "rewards/chosen": -0.2758581340312958, "rewards/margins": 2.6668765544891357, "rewards/rejected": -2.942734718322754, "step": 144 }, { "epoch": 1.7185185185185186, "grad_norm": 14.582949797718573, "learning_rate": 3.510502189703954e-07, "logits/chosen": -0.97275710105896, "logits/rejected": -0.7612693905830383, "logps/chosen": -28.907245635986328, "logps/rejected": -45.605037689208984, "loss": 0.1472, "rewards/accuracies": 1.0, "rewards/chosen": -0.35651320219039917, "rewards/margins": 4.817986011505127, "rewards/rejected": -5.17449951171875, "step": 145 }, { "epoch": 1.7303703703703703, "grad_norm": 13.66922326611715, "learning_rate": 3.486660515583691e-07, "logits/chosen": -1.1288774013519287, "logits/rejected": -1.1245758533477783, "logps/chosen": -23.699264526367188, "logps/rejected": -42.97127914428711, "loss": 0.1285, "rewards/accuracies": 1.0, "rewards/chosen": 0.11036735773086548, "rewards/margins": 4.373822212219238, "rewards/rejected": -4.263454914093018, "step": 146 }, { "epoch": 1.7422222222222223, "grad_norm": 13.037114765866198, "learning_rate": 3.4627120713529983e-07, "logits/chosen": -0.9598813056945801, "logits/rejected": -0.8330179452896118, "logps/chosen": -22.383928298950195, "logps/rejected": -45.0758171081543, "loss": 0.1429, "rewards/accuracies": 1.0, "rewards/chosen": -0.034085407853126526, "rewards/margins": 4.736968040466309, "rewards/rejected": -4.771053314208984, "step": 147 }, { "epoch": 1.7540740740740741, "grad_norm": 10.872543956486167, "learning_rate": 3.438659448559825e-07, "logits/chosen": -1.1963474750518799, "logits/rejected": -1.0486239194869995, "logps/chosen": -27.349458694458008, "logps/rejected": -48.23403549194336, "loss": 0.1038, "rewards/accuracies": 1.0, "rewards/chosen": -0.17232058942317963, "rewards/margins": 4.531591892242432, "rewards/rejected": -4.703912734985352, "step": 148 }, { "epoch": 1.765925925925926, "grad_norm": 10.7720279947233, "learning_rate": 3.414505250025659e-07, "logits/chosen": -0.9560255408287048, "logits/rejected": -1.0075461864471436, "logps/chosen": -30.97559928894043, "logps/rejected": -42.89778518676758, "loss": 0.1031, "rewards/accuracies": 1.0, "rewards/chosen": -0.011755384504795074, "rewards/margins": 3.706606864929199, "rewards/rejected": -3.718362331390381, "step": 149 }, { "epoch": 1.7777777777777777, "grad_norm": 14.01453220823484, "learning_rate": 3.390252089563867e-07, "logits/chosen": -1.167525291442871, "logits/rejected": -1.008201241493225, "logps/chosen": -24.03421401977539, "logps/rejected": -37.12451171875, "loss": 0.147, "rewards/accuracies": 1.0, "rewards/chosen": 0.05710184574127197, "rewards/margins": 3.54923939704895, "rewards/rejected": -3.492137908935547, "step": 150 }, { "epoch": 1.7896296296296297, "grad_norm": 18.40124537105695, "learning_rate": 3.3659025916968475e-07, "logits/chosen": -1.1562587022781372, "logits/rejected": -1.0596400499343872, "logps/chosen": -27.828075408935547, "logps/rejected": -50.78956985473633, "loss": 0.1666, "rewards/accuracies": 0.9375, "rewards/chosen": -0.4149998426437378, "rewards/margins": 4.103493690490723, "rewards/rejected": -4.51849365234375, "step": 151 }, { "epoch": 1.8014814814814815, "grad_norm": 15.187471450574751, "learning_rate": 3.3414593913720155e-07, "logits/chosen": -1.1149495840072632, "logits/rejected": -0.9014438986778259, "logps/chosen": -24.957393646240234, "logps/rejected": -38.273773193359375, "loss": 0.1572, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3299483358860016, "rewards/margins": 3.5365545749664307, "rewards/rejected": -3.206606388092041, "step": 152 }, { "epoch": 1.8133333333333335, "grad_norm": 11.786430269793136, "learning_rate": 3.3169251336766697e-07, "logits/chosen": -1.0765142440795898, "logits/rejected": -0.9713940620422363, "logps/chosen": -23.6178035736084, "logps/rejected": -36.39717102050781, "loss": 0.1303, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5628844499588013, "rewards/margins": 3.1841235160827637, "rewards/rejected": -3.7470080852508545, "step": 153 }, { "epoch": 1.8251851851851852, "grad_norm": 15.707535366344572, "learning_rate": 3.2923024735517567e-07, "logits/chosen": -1.2396905422210693, "logits/rejected": -1.13885498046875, "logps/chosen": -25.60649871826172, "logps/rejected": -41.11204147338867, "loss": 0.1377, "rewards/accuracies": 0.875, "rewards/chosen": -0.42897889018058777, "rewards/margins": 3.3137550354003906, "rewards/rejected": -3.742733955383301, "step": 154 }, { "epoch": 1.837037037037037, "grad_norm": 16.002573607260132, "learning_rate": 3.2675940755045713e-07, "logits/chosen": -1.1592830419540405, "logits/rejected": -1.069584846496582, "logps/chosen": -34.06727600097656, "logps/rejected": -54.026817321777344, "loss": 0.1949, "rewards/accuracies": 1.0, "rewards/chosen": -0.43196994066238403, "rewards/margins": 4.310949802398682, "rewards/rejected": -4.742919445037842, "step": 155 }, { "epoch": 1.8488888888888888, "grad_norm": 14.394511048135854, "learning_rate": 3.242802613320418e-07, "logits/chosen": -1.0737497806549072, "logits/rejected": -0.9672637581825256, "logps/chosen": -27.148597717285156, "logps/rejected": -41.859004974365234, "loss": 0.1554, "rewards/accuracies": 1.0, "rewards/chosen": -0.2204400897026062, "rewards/margins": 3.8589026927948, "rewards/rejected": -4.079343318939209, "step": 156 }, { "epoch": 1.8607407407407406, "grad_norm": 13.068510095436686, "learning_rate": 3.217930769773275e-07, "logits/chosen": -1.2130502462387085, "logits/rejected": -1.0399776697158813, "logps/chosen": -20.487337112426758, "logps/rejected": -35.530582427978516, "loss": 0.1261, "rewards/accuracies": 1.0, "rewards/chosen": 0.049159154295921326, "rewards/margins": 4.460110187530518, "rewards/rejected": -4.410951614379883, "step": 157 }, { "epoch": 1.8725925925925926, "grad_norm": 12.727841490377434, "learning_rate": 3.1929812363354764e-07, "logits/chosen": -1.1142170429229736, "logits/rejected": -0.979875385761261, "logps/chosen": -25.325483322143555, "logps/rejected": -46.20812225341797, "loss": 0.1047, "rewards/accuracies": 1.0, "rewards/chosen": -0.1650889664888382, "rewards/margins": 4.539978504180908, "rewards/rejected": -4.7050676345825195, "step": 158 }, { "epoch": 1.8844444444444446, "grad_norm": 13.783921189406176, "learning_rate": 3.167956712886463e-07, "logits/chosen": -1.0069048404693604, "logits/rejected": -0.9355603456497192, "logps/chosen": -29.581226348876953, "logps/rejected": -37.52265167236328, "loss": 0.1372, "rewards/accuracies": 1.0, "rewards/chosen": -0.499575138092041, "rewards/margins": 2.6125097274780273, "rewards/rejected": -3.1120848655700684, "step": 159 }, { "epoch": 1.8962962962962964, "grad_norm": 12.862775831490238, "learning_rate": 3.142859907420615e-07, "logits/chosen": -1.0252788066864014, "logits/rejected": -1.0804516077041626, "logps/chosen": -24.711009979248047, "logps/rejected": -42.78890609741211, "loss": 0.1256, "rewards/accuracies": 1.0, "rewards/chosen": -0.3273608684539795, "rewards/margins": 3.2098522186279297, "rewards/rejected": -3.5372135639190674, "step": 160 }, { "epoch": 1.9081481481481481, "grad_norm": 11.856116486125906, "learning_rate": 3.117693535754213e-07, "logits/chosen": -1.069286823272705, "logits/rejected": -0.9155316948890686, "logps/chosen": -23.146581649780273, "logps/rejected": -43.31779479980469, "loss": 0.1256, "rewards/accuracies": 1.0, "rewards/chosen": 0.038980498909950256, "rewards/margins": 4.636472702026367, "rewards/rejected": -4.597492218017578, "step": 161 }, { "epoch": 1.92, "grad_norm": 15.032149567521808, "learning_rate": 3.092460321231547e-07, "logits/chosen": -1.0839258432388306, "logits/rejected": -1.006733775138855, "logps/chosen": -24.381574630737305, "logps/rejected": -40.473060607910156, "loss": 0.1488, "rewards/accuracies": 1.0, "rewards/chosen": -0.02435511350631714, "rewards/margins": 4.535048007965088, "rewards/rejected": -4.559402942657471, "step": 162 }, { "epoch": 1.9318518518518517, "grad_norm": 14.8363884279284, "learning_rate": 3.0671629944302164e-07, "logits/chosen": -1.0501927137374878, "logits/rejected": -0.9243767261505127, "logps/chosen": -27.61357879638672, "logps/rejected": -36.362586975097656, "loss": 0.1177, "rewards/accuracies": 0.9375, "rewards/chosen": -0.23934195935726166, "rewards/margins": 3.6352920532226562, "rewards/rejected": -3.8746337890625, "step": 163 }, { "epoch": 1.9437037037037037, "grad_norm": 12.238985051757798, "learning_rate": 3.0418042928656415e-07, "logits/chosen": -1.1459879875183105, "logits/rejected": -0.9831377267837524, "logps/chosen": -23.33287811279297, "logps/rejected": -43.29710006713867, "loss": 0.1341, "rewards/accuracies": 0.9375, "rewards/chosen": -0.029119372367858887, "rewards/margins": 4.274390697479248, "rewards/rejected": -4.3035101890563965, "step": 164 }, { "epoch": 1.9555555555555557, "grad_norm": 16.045991654119778, "learning_rate": 3.016386960694827e-07, "logits/chosen": -1.0820094347000122, "logits/rejected": -0.9164285063743591, "logps/chosen": -29.36737823486328, "logps/rejected": -45.8538818359375, "loss": 0.1575, "rewards/accuracies": 0.875, "rewards/chosen": -0.5107632875442505, "rewards/margins": 3.8868861198425293, "rewards/rejected": -4.39764928817749, "step": 165 }, { "epoch": 1.9674074074074075, "grad_norm": 15.658417100599408, "learning_rate": 2.990913748419411e-07, "logits/chosen": -1.1057474613189697, "logits/rejected": -1.0400460958480835, "logps/chosen": -32.17692565917969, "logps/rejected": -43.858551025390625, "loss": 0.1491, "rewards/accuracies": 1.0, "rewards/chosen": 0.028441503643989563, "rewards/margins": 3.5364620685577393, "rewards/rejected": -3.5080206394195557, "step": 166 }, { "epoch": 1.9792592592592593, "grad_norm": 17.182247947721276, "learning_rate": 2.9653874125880167e-07, "logits/chosen": -1.1606206893920898, "logits/rejected": -1.0265402793884277, "logps/chosen": -24.273101806640625, "logps/rejected": -43.97246551513672, "loss": 0.1734, "rewards/accuracies": 1.0, "rewards/chosen": -0.012486815452575684, "rewards/margins": 3.4821486473083496, "rewards/rejected": -3.4946351051330566, "step": 167 }, { "epoch": 1.991111111111111, "grad_norm": 8.93976424369471, "learning_rate": 2.9398107154979634e-07, "logits/chosen": -1.1381988525390625, "logits/rejected": -1.03400456905365, "logps/chosen": -21.53853416442871, "logps/rejected": -48.0505256652832, "loss": 0.0872, "rewards/accuracies": 1.0, "rewards/chosen": 0.05716177821159363, "rewards/margins": 4.557176113128662, "rewards/rejected": -4.500014305114746, "step": 168 }, { "epoch": 2.002962962962963, "grad_norm": 11.949224405327886, "learning_rate": 2.9141864248963427e-07, "logits/chosen": -1.2692681550979614, "logits/rejected": -1.0146331787109375, "logps/chosen": -27.361726760864258, "logps/rejected": -35.84319305419922, "loss": 0.1362, "rewards/accuracies": 0.9375, "rewards/chosen": 0.12528757750988007, "rewards/margins": 4.429131984710693, "rewards/rejected": -4.303844451904297, "step": 169 }, { "epoch": 2.0148148148148146, "grad_norm": 7.858640781523143, "learning_rate": 2.8885173136805125e-07, "logits/chosen": -1.1425201892852783, "logits/rejected": -1.0211284160614014, "logps/chosen": -26.627113342285156, "logps/rejected": -51.298709869384766, "loss": 0.0958, "rewards/accuracies": 1.0, "rewards/chosen": -0.16640473902225494, "rewards/margins": 4.273306846618652, "rewards/rejected": -4.439712047576904, "step": 170 }, { "epoch": 2.026666666666667, "grad_norm": 5.791091337239758, "learning_rate": 2.862806159598032e-07, "logits/chosen": -1.246085286140442, "logits/rejected": -1.1816462278366089, "logps/chosen": -23.06086540222168, "logps/rejected": -39.5461540222168, "loss": 0.0582, "rewards/accuracies": 1.0, "rewards/chosen": 0.21991188824176788, "rewards/margins": 4.500458717346191, "rewards/rejected": -4.28054666519165, "step": 171 }, { "epoch": 2.0385185185185186, "grad_norm": 8.464583168455022, "learning_rate": 2.837055744946072e-07, "logits/chosen": -0.9950094819068909, "logits/rejected": -0.9867933392524719, "logps/chosen": -20.085613250732422, "logps/rejected": -39.374183654785156, "loss": 0.0846, "rewards/accuracies": 1.0, "rewards/chosen": 0.23747408390045166, "rewards/margins": 4.035274505615234, "rewards/rejected": -3.797800064086914, "step": 172 }, { "epoch": 2.0503703703703704, "grad_norm": 7.246388422688696, "learning_rate": 2.811268856270332e-07, "logits/chosen": -1.149637222290039, "logits/rejected": -1.1608506441116333, "logps/chosen": -22.0140380859375, "logps/rejected": -42.8390998840332, "loss": 0.081, "rewards/accuracies": 1.0, "rewards/chosen": 0.34959834814071655, "rewards/margins": 4.302677154541016, "rewards/rejected": -3.9530792236328125, "step": 173 }, { "epoch": 2.062222222222222, "grad_norm": 7.340518516395049, "learning_rate": 2.7854482840634965e-07, "logits/chosen": -1.2548686265945435, "logits/rejected": -1.127457618713379, "logps/chosen": -21.352310180664062, "logps/rejected": -43.30939483642578, "loss": 0.0859, "rewards/accuracies": 1.0, "rewards/chosen": 0.04353713244199753, "rewards/margins": 5.536983013153076, "rewards/rejected": -5.49344539642334, "step": 174 }, { "epoch": 2.074074074074074, "grad_norm": 9.753614692470563, "learning_rate": 2.759596822463267e-07, "logits/chosen": -1.1281955242156982, "logits/rejected": -0.9843631386756897, "logps/chosen": -28.948612213134766, "logps/rejected": -37.4376335144043, "loss": 0.0864, "rewards/accuracies": 1.0, "rewards/chosen": -0.19420504570007324, "rewards/margins": 3.627711772918701, "rewards/rejected": -3.8219170570373535, "step": 175 }, { "epoch": 2.0859259259259257, "grad_norm": 6.267240444464727, "learning_rate": 2.73371726895e-07, "logits/chosen": -1.1884928941726685, "logits/rejected": -1.0611791610717773, "logps/chosen": -29.869997024536133, "logps/rejected": -49.20811462402344, "loss": 0.0636, "rewards/accuracies": 1.0, "rewards/chosen": 0.011849135160446167, "rewards/margins": 4.665461540222168, "rewards/rejected": -4.6536126136779785, "step": 176 }, { "epoch": 2.097777777777778, "grad_norm": 6.022136138537939, "learning_rate": 2.7078124240439793e-07, "logits/chosen": -1.1008820533752441, "logits/rejected": -0.9790475368499756, "logps/chosen": -29.616289138793945, "logps/rejected": -57.20648193359375, "loss": 0.0594, "rewards/accuracies": 1.0, "rewards/chosen": -0.5932025909423828, "rewards/margins": 6.053226947784424, "rewards/rejected": -6.64642858505249, "step": 177 }, { "epoch": 2.1096296296296297, "grad_norm": 6.379960194971949, "learning_rate": 2.68188509100236e-07, "logits/chosen": -1.0663186311721802, "logits/rejected": -0.994686484336853, "logps/chosen": -26.227067947387695, "logps/rejected": -50.95429229736328, "loss": 0.0638, "rewards/accuracies": 1.0, "rewards/chosen": -0.13309167325496674, "rewards/margins": 4.429349422454834, "rewards/rejected": -4.562440872192383, "step": 178 }, { "epoch": 2.1214814814814815, "grad_norm": 7.642435740805011, "learning_rate": 2.6559380755158206e-07, "logits/chosen": -1.1984007358551025, "logits/rejected": -1.1312189102172852, "logps/chosen": -29.640098571777344, "logps/rejected": -48.15163040161133, "loss": 0.0936, "rewards/accuracies": 1.0, "rewards/chosen": -0.03109852969646454, "rewards/margins": 4.580999851226807, "rewards/rejected": -4.61209774017334, "step": 179 }, { "epoch": 2.1333333333333333, "grad_norm": 7.309302464370304, "learning_rate": 2.629974185404951e-07, "logits/chosen": -1.232039451599121, "logits/rejected": -1.1574738025665283, "logps/chosen": -24.592525482177734, "logps/rejected": -58.08824157714844, "loss": 0.0754, "rewards/accuracies": 1.0, "rewards/chosen": -0.607149600982666, "rewards/margins": 5.233615875244141, "rewards/rejected": -5.840765476226807, "step": 180 }, { "epoch": 2.145185185185185, "grad_norm": 7.918401262658898, "learning_rate": 2.603996230316402e-07, "logits/chosen": -1.1730706691741943, "logits/rejected": -1.1893783807754517, "logps/chosen": -20.52701187133789, "logps/rejected": -32.62423324584961, "loss": 0.093, "rewards/accuracies": 1.0, "rewards/chosen": 0.2257765233516693, "rewards/margins": 3.6342880725860596, "rewards/rejected": -3.4085114002227783, "step": 181 }, { "epoch": 2.157037037037037, "grad_norm": 8.076843746703107, "learning_rate": 2.5780070214188474e-07, "logits/chosen": -1.2444607019424438, "logits/rejected": -1.1096103191375732, "logps/chosen": -33.14277267456055, "logps/rejected": -46.21152114868164, "loss": 0.0751, "rewards/accuracies": 1.0, "rewards/chosen": -0.3877983093261719, "rewards/margins": 3.8209316730499268, "rewards/rejected": -4.2087297439575195, "step": 182 }, { "epoch": 2.168888888888889, "grad_norm": 7.21014521039241, "learning_rate": 2.552009371098778e-07, "logits/chosen": -1.132177472114563, "logits/rejected": -1.0657352209091187, "logps/chosen": -27.557518005371094, "logps/rejected": -44.8818473815918, "loss": 0.0694, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03888387978076935, "rewards/margins": 4.166874885559082, "rewards/rejected": -4.205758571624756, "step": 183 }, { "epoch": 2.180740740740741, "grad_norm": 7.31003315950285, "learning_rate": 2.5260060926561604e-07, "logits/chosen": -1.1547397375106812, "logits/rejected": -1.0553665161132812, "logps/chosen": -22.003814697265625, "logps/rejected": -42.98273849487305, "loss": 0.0753, "rewards/accuracies": 1.0, "rewards/chosen": -0.14262710511684418, "rewards/margins": 4.841116905212402, "rewards/rejected": -4.9837446212768555, "step": 184 }, { "epoch": 2.1925925925925926, "grad_norm": 8.27339627937372, "learning_rate": 2.5e-07, "logits/chosen": -1.2605483531951904, "logits/rejected": -1.0690468549728394, "logps/chosen": -28.908740997314453, "logps/rejected": -40.10096740722656, "loss": 0.0956, "rewards/accuracies": 0.9375, "rewards/chosen": -0.43739017844200134, "rewards/margins": 3.4248218536376953, "rewards/rejected": -3.8622121810913086, "step": 185 }, { "epoch": 2.2044444444444444, "grad_norm": 8.253216832927258, "learning_rate": 2.4739939073438393e-07, "logits/chosen": -1.3061436414718628, "logits/rejected": -1.1886006593704224, "logps/chosen": -33.44011688232422, "logps/rejected": -46.8795166015625, "loss": 0.0904, "rewards/accuracies": 1.0, "rewards/chosen": -0.5726553201675415, "rewards/margins": 3.8218576908111572, "rewards/rejected": -4.39451265335083, "step": 186 }, { "epoch": 2.216296296296296, "grad_norm": 7.807015119173489, "learning_rate": 2.4479906289012216e-07, "logits/chosen": -1.345091462135315, "logits/rejected": -1.0644184350967407, "logps/chosen": -25.767536163330078, "logps/rejected": -41.148502349853516, "loss": 0.0849, "rewards/accuracies": 1.0, "rewards/chosen": 0.5413724780082703, "rewards/margins": 4.547415256500244, "rewards/rejected": -4.006042957305908, "step": 187 }, { "epoch": 2.228148148148148, "grad_norm": 8.268473966183542, "learning_rate": 2.421992978581152e-07, "logits/chosen": -1.2509685754776, "logits/rejected": -1.1202762126922607, "logps/chosen": -26.480911254882812, "logps/rejected": -41.798858642578125, "loss": 0.0768, "rewards/accuracies": 1.0, "rewards/chosen": -0.4163511097431183, "rewards/margins": 4.190377235412598, "rewards/rejected": -4.606728553771973, "step": 188 }, { "epoch": 2.24, "grad_norm": 6.31545694362126, "learning_rate": 2.3960037696835987e-07, "logits/chosen": -0.9931889772415161, "logits/rejected": -0.9487002491950989, "logps/chosen": -23.28666877746582, "logps/rejected": -45.82819366455078, "loss": 0.0528, "rewards/accuracies": 1.0, "rewards/chosen": -0.22770199179649353, "rewards/margins": 5.227255344390869, "rewards/rejected": -5.454957008361816, "step": 189 }, { "epoch": 2.251851851851852, "grad_norm": 8.198173492670941, "learning_rate": 2.3700258145950493e-07, "logits/chosen": -1.2542146444320679, "logits/rejected": -1.296125888824463, "logps/chosen": -23.325332641601562, "logps/rejected": -42.396663665771484, "loss": 0.074, "rewards/accuracies": 1.0, "rewards/chosen": -0.37019920349121094, "rewards/margins": 4.538805961608887, "rewards/rejected": -4.909005165100098, "step": 190 }, { "epoch": 2.2637037037037038, "grad_norm": 6.252335723496194, "learning_rate": 2.3440619244841794e-07, "logits/chosen": -1.0998159646987915, "logits/rejected": -1.0990605354309082, "logps/chosen": -24.507465362548828, "logps/rejected": -36.9913330078125, "loss": 0.0693, "rewards/accuracies": 1.0, "rewards/chosen": 0.07138313353061676, "rewards/margins": 3.743443727493286, "rewards/rejected": -3.67206072807312, "step": 191 }, { "epoch": 2.2755555555555556, "grad_norm": 5.937599917562406, "learning_rate": 2.3181149089976404e-07, "logits/chosen": -1.1160556077957153, "logits/rejected": -0.9888994693756104, "logps/chosen": -25.562957763671875, "logps/rejected": -44.06254959106445, "loss": 0.0597, "rewards/accuracies": 1.0, "rewards/chosen": -0.11315414309501648, "rewards/margins": 5.40950345993042, "rewards/rejected": -5.52265739440918, "step": 192 }, { "epoch": 2.2874074074074073, "grad_norm": 8.140792637653023, "learning_rate": 2.2921875759560207e-07, "logits/chosen": -1.2146611213684082, "logits/rejected": -1.1461243629455566, "logps/chosen": -36.22383499145508, "logps/rejected": -46.22894287109375, "loss": 0.0893, "rewards/accuracies": 1.0, "rewards/chosen": -0.8142991065979004, "rewards/margins": 4.104118347167969, "rewards/rejected": -4.918417930603027, "step": 193 }, { "epoch": 2.299259259259259, "grad_norm": 7.224664725024332, "learning_rate": 2.2662827310499995e-07, "logits/chosen": -1.0874426364898682, "logits/rejected": -0.9829124212265015, "logps/chosen": -24.988603591918945, "logps/rejected": -42.57012939453125, "loss": 0.0656, "rewards/accuracies": 1.0, "rewards/chosen": 0.10310526937246323, "rewards/margins": 5.060862064361572, "rewards/rejected": -4.957756996154785, "step": 194 }, { "epoch": 2.311111111111111, "grad_norm": 7.027603500584767, "learning_rate": 2.2404031775367332e-07, "logits/chosen": -1.1362197399139404, "logits/rejected": -1.0883052349090576, "logps/chosen": -24.717567443847656, "logps/rejected": -43.55390167236328, "loss": 0.0582, "rewards/accuracies": 1.0, "rewards/chosen": 0.07100862264633179, "rewards/margins": 4.886796474456787, "rewards/rejected": -4.815788269042969, "step": 195 }, { "epoch": 2.322962962962963, "grad_norm": 6.3481105853123, "learning_rate": 2.2145517159365043e-07, "logits/chosen": -1.2440788745880127, "logits/rejected": -1.0895586013793945, "logps/chosen": -27.22349739074707, "logps/rejected": -39.78349304199219, "loss": 0.0609, "rewards/accuracies": 1.0, "rewards/chosen": 0.12459969520568848, "rewards/margins": 4.133920192718506, "rewards/rejected": -4.0093207359313965, "step": 196 }, { "epoch": 2.334814814814815, "grad_norm": 8.448014970739372, "learning_rate": 2.1887311437296684e-07, "logits/chosen": -1.2059340476989746, "logits/rejected": -1.1843221187591553, "logps/chosen": -22.853811264038086, "logps/rejected": -32.71154022216797, "loss": 0.0912, "rewards/accuracies": 1.0, "rewards/chosen": 0.47247427701950073, "rewards/margins": 3.9881787300109863, "rewards/rejected": -3.51570463180542, "step": 197 }, { "epoch": 2.3466666666666667, "grad_norm": 8.053586024276273, "learning_rate": 2.162944255053928e-07, "logits/chosen": -1.1554303169250488, "logits/rejected": -1.0401800870895386, "logps/chosen": -20.67418670654297, "logps/rejected": -37.24845504760742, "loss": 0.0809, "rewards/accuracies": 1.0, "rewards/chosen": 0.2763448655605316, "rewards/margins": 4.477565288543701, "rewards/rejected": -4.201220512390137, "step": 198 }, { "epoch": 2.3585185185185185, "grad_norm": 7.516398498619182, "learning_rate": 2.137193840401968e-07, "logits/chosen": -1.1824381351470947, "logits/rejected": -1.1074461936950684, "logps/chosen": -28.55365562438965, "logps/rejected": -41.09587478637695, "loss": 0.0734, "rewards/accuracies": 1.0, "rewards/chosen": 0.2404264211654663, "rewards/margins": 3.8725597858428955, "rewards/rejected": -3.6321334838867188, "step": 199 }, { "epoch": 2.3703703703703702, "grad_norm": 5.954177017572196, "learning_rate": 2.1114826863194878e-07, "logits/chosen": -1.24180269241333, "logits/rejected": -1.0925354957580566, "logps/chosen": -28.197025299072266, "logps/rejected": -46.81939697265625, "loss": 0.0549, "rewards/accuracies": 1.0, "rewards/chosen": -0.15689772367477417, "rewards/margins": 5.056156635284424, "rewards/rejected": -5.213054180145264, "step": 200 }, { "epoch": 2.3822222222222225, "grad_norm": 5.991252280343694, "learning_rate": 2.0858135751036568e-07, "logits/chosen": -1.222536325454712, "logits/rejected": -1.1197445392608643, "logps/chosen": -32.660709381103516, "logps/rejected": -46.89257049560547, "loss": 0.0509, "rewards/accuracies": 1.0, "rewards/chosen": 0.014814764261245728, "rewards/margins": 5.237975120544434, "rewards/rejected": -5.223160743713379, "step": 201 }, { "epoch": 2.3940740740740742, "grad_norm": 6.65615573416704, "learning_rate": 2.060189284502037e-07, "logits/chosen": -1.1877946853637695, "logits/rejected": -1.1109426021575928, "logps/chosen": -25.55805206298828, "logps/rejected": -44.239295959472656, "loss": 0.0634, "rewards/accuracies": 1.0, "rewards/chosen": 0.14644792675971985, "rewards/margins": 4.83575439453125, "rewards/rejected": -4.689306259155273, "step": 202 }, { "epoch": 2.405925925925926, "grad_norm": 6.275499946646439, "learning_rate": 2.0346125874119838e-07, "logits/chosen": -1.132055401802063, "logits/rejected": -1.0429214239120483, "logps/chosen": -24.973257064819336, "logps/rejected": -42.17146682739258, "loss": 0.071, "rewards/accuracies": 1.0, "rewards/chosen": -0.11066167056560516, "rewards/margins": 4.5910515785217285, "rewards/rejected": -4.7017130851745605, "step": 203 }, { "epoch": 2.417777777777778, "grad_norm": 7.65769891944596, "learning_rate": 2.0090862515805895e-07, "logits/chosen": -1.0738351345062256, "logits/rejected": -0.8972642421722412, "logps/chosen": -33.31107711791992, "logps/rejected": -41.709693908691406, "loss": 0.0813, "rewards/accuracies": 1.0, "rewards/chosen": -0.21268025040626526, "rewards/margins": 4.644548416137695, "rewards/rejected": -4.857229232788086, "step": 204 }, { "epoch": 2.4296296296296296, "grad_norm": 7.640686179230129, "learning_rate": 1.983613039305173e-07, "logits/chosen": -1.2996752262115479, "logits/rejected": -1.12294340133667, "logps/chosen": -18.794048309326172, "logps/rejected": -45.74852752685547, "loss": 0.0789, "rewards/accuracies": 1.0, "rewards/chosen": -0.16848334670066833, "rewards/margins": 4.915053367614746, "rewards/rejected": -5.0835371017456055, "step": 205 }, { "epoch": 2.4414814814814814, "grad_norm": 7.524471411959897, "learning_rate": 1.9581957071343588e-07, "logits/chosen": -1.0391274690628052, "logits/rejected": -0.9014835357666016, "logps/chosen": -33.915252685546875, "logps/rejected": -57.86189270019531, "loss": 0.0805, "rewards/accuracies": 1.0, "rewards/chosen": -0.4224894046783447, "rewards/margins": 4.6927666664123535, "rewards/rejected": -5.115255832672119, "step": 206 }, { "epoch": 2.453333333333333, "grad_norm": 6.9279059385356305, "learning_rate": 1.9328370055697832e-07, "logits/chosen": -1.1469345092773438, "logits/rejected": -0.9380808472633362, "logps/chosen": -24.10541343688965, "logps/rejected": -44.4921760559082, "loss": 0.0595, "rewards/accuracies": 1.0, "rewards/chosen": 0.29525160789489746, "rewards/margins": 4.89801549911499, "rewards/rejected": -4.602763652801514, "step": 207 }, { "epoch": 2.4651851851851854, "grad_norm": 6.54091678469529, "learning_rate": 1.907539678768453e-07, "logits/chosen": -1.1986242532730103, "logits/rejected": -1.1000490188598633, "logps/chosen": -22.64141273498535, "logps/rejected": -53.74283981323242, "loss": 0.068, "rewards/accuracies": 1.0, "rewards/chosen": -0.23207074403762817, "rewards/margins": 5.020073413848877, "rewards/rejected": -5.2521443367004395, "step": 208 }, { "epoch": 2.477037037037037, "grad_norm": 7.3835745720901365, "learning_rate": 1.8823064642457876e-07, "logits/chosen": -1.1322101354599, "logits/rejected": -1.0012404918670654, "logps/chosen": -25.564584732055664, "logps/rejected": -52.52565002441406, "loss": 0.0701, "rewards/accuracies": 1.0, "rewards/chosen": -0.2706539034843445, "rewards/margins": 4.544902801513672, "rewards/rejected": -4.815556526184082, "step": 209 }, { "epoch": 2.488888888888889, "grad_norm": 6.037126217772019, "learning_rate": 1.8571400925793852e-07, "logits/chosen": -1.32914137840271, "logits/rejected": -1.199539303779602, "logps/chosen": -27.011600494384766, "logps/rejected": -42.806114196777344, "loss": 0.0639, "rewards/accuracies": 1.0, "rewards/chosen": 0.05861341953277588, "rewards/margins": 4.113726615905762, "rewards/rejected": -4.055113315582275, "step": 210 }, { "epoch": 2.5007407407407407, "grad_norm": 6.792003028800643, "learning_rate": 1.8320432871135376e-07, "logits/chosen": -0.9643785357475281, "logits/rejected": -0.8642684817314148, "logps/chosen": -32.56407928466797, "logps/rejected": -48.981529235839844, "loss": 0.0659, "rewards/accuracies": 1.0, "rewards/chosen": -0.09449410438537598, "rewards/margins": 4.1956257820129395, "rewards/rejected": -4.2901201248168945, "step": 211 }, { "epoch": 2.5125925925925925, "grad_norm": 6.652434536599441, "learning_rate": 1.8070187636645237e-07, "logits/chosen": -1.1183323860168457, "logits/rejected": -1.0643121004104614, "logps/chosen": -23.476839065551758, "logps/rejected": -46.453697204589844, "loss": 0.0651, "rewards/accuracies": 1.0, "rewards/chosen": 0.05248948931694031, "rewards/margins": 4.479131698608398, "rewards/rejected": -4.426641941070557, "step": 212 }, { "epoch": 2.5244444444444447, "grad_norm": 6.873490871799767, "learning_rate": 1.782069230226725e-07, "logits/chosen": -0.9355219602584839, "logits/rejected": -0.8760642409324646, "logps/chosen": -26.840740203857422, "logps/rejected": -46.565147399902344, "loss": 0.0716, "rewards/accuracies": 1.0, "rewards/chosen": -0.27372512221336365, "rewards/margins": 4.666114330291748, "rewards/rejected": -4.9398393630981445, "step": 213 }, { "epoch": 2.536296296296296, "grad_norm": 6.477809311744379, "learning_rate": 1.7571973866795813e-07, "logits/chosen": -1.3275456428527832, "logits/rejected": -1.1785155534744263, "logps/chosen": -19.671016693115234, "logps/rejected": -40.520137786865234, "loss": 0.0696, "rewards/accuracies": 1.0, "rewards/chosen": 0.06844872236251831, "rewards/margins": 4.899576663970947, "rewards/rejected": -4.8311285972595215, "step": 214 }, { "epoch": 2.5481481481481483, "grad_norm": 4.940048002831371, "learning_rate": 1.7324059244954292e-07, "logits/chosen": -1.461755633354187, "logits/rejected": -1.3273966312408447, "logps/chosen": -23.988277435302734, "logps/rejected": -35.3886604309082, "loss": 0.0481, "rewards/accuracies": 1.0, "rewards/chosen": -0.3582080900669098, "rewards/margins": 4.515974044799805, "rewards/rejected": -4.874181747436523, "step": 215 }, { "epoch": 2.56, "grad_norm": 9.005658987409907, "learning_rate": 1.7076975264482433e-07, "logits/chosen": -1.2200323343276978, "logits/rejected": -1.0738322734832764, "logps/chosen": -22.159700393676758, "logps/rejected": -41.98440170288086, "loss": 0.0807, "rewards/accuracies": 1.0, "rewards/chosen": 0.023108944296836853, "rewards/margins": 4.085160255432129, "rewards/rejected": -4.062050819396973, "step": 216 }, { "epoch": 2.571851851851852, "grad_norm": 6.115258133963013, "learning_rate": 1.6830748663233303e-07, "logits/chosen": -1.135589599609375, "logits/rejected": -1.0998283624649048, "logps/chosen": -22.15255355834961, "logps/rejected": -39.37363815307617, "loss": 0.0597, "rewards/accuracies": 1.0, "rewards/chosen": -0.2580828368663788, "rewards/margins": 4.229098796844482, "rewards/rejected": -4.487181186676025, "step": 217 }, { "epoch": 2.5837037037037036, "grad_norm": 7.594741719247832, "learning_rate": 1.6585406086279846e-07, "logits/chosen": -1.3007519245147705, "logits/rejected": -1.258547306060791, "logps/chosen": -29.01621437072754, "logps/rejected": -51.67272186279297, "loss": 0.0695, "rewards/accuracies": 1.0, "rewards/chosen": 0.06502366065979004, "rewards/margins": 5.339412212371826, "rewards/rejected": -5.274388313293457, "step": 218 }, { "epoch": 2.5955555555555554, "grad_norm": 5.212981266507165, "learning_rate": 1.6340974083031523e-07, "logits/chosen": -1.2680379152297974, "logits/rejected": -1.2023954391479492, "logps/chosen": -25.777963638305664, "logps/rejected": -38.38170623779297, "loss": 0.057, "rewards/accuracies": 1.0, "rewards/chosen": 0.02264055609703064, "rewards/margins": 3.978463649749756, "rewards/rejected": -3.9558229446411133, "step": 219 }, { "epoch": 2.6074074074074076, "grad_norm": 5.672295808616577, "learning_rate": 1.6097479104361326e-07, "logits/chosen": -1.2693517208099365, "logits/rejected": -1.2250739336013794, "logps/chosen": -21.100271224975586, "logps/rejected": -41.79471969604492, "loss": 0.0562, "rewards/accuracies": 1.0, "rewards/chosen": 0.08915658295154572, "rewards/margins": 4.542138576507568, "rewards/rejected": -4.452981948852539, "step": 220 }, { "epoch": 2.6192592592592594, "grad_norm": 6.347499166452346, "learning_rate": 1.5854947499743413e-07, "logits/chosen": -1.0178323984146118, "logits/rejected": -0.9484214186668396, "logps/chosen": -18.72942543029785, "logps/rejected": -43.50739288330078, "loss": 0.058, "rewards/accuracies": 1.0, "rewards/chosen": 0.00246235728263855, "rewards/margins": 5.049181938171387, "rewards/rejected": -5.046720027923584, "step": 221 }, { "epoch": 2.631111111111111, "grad_norm": 7.517395617419555, "learning_rate": 1.5613405514401757e-07, "logits/chosen": -1.3176552057266235, "logits/rejected": -1.2037431001663208, "logps/chosen": -23.663074493408203, "logps/rejected": -38.63740158081055, "loss": 0.0784, "rewards/accuracies": 1.0, "rewards/chosen": -0.46497219800949097, "rewards/margins": 3.637241840362549, "rewards/rejected": -4.1022138595581055, "step": 222 }, { "epoch": 2.642962962962963, "grad_norm": 5.580464995595371, "learning_rate": 1.537287928647002e-07, "logits/chosen": -1.1343742609024048, "logits/rejected": -1.0372800827026367, "logps/chosen": -24.60474395751953, "logps/rejected": -35.45951843261719, "loss": 0.0552, "rewards/accuracies": 1.0, "rewards/chosen": -0.19021296501159668, "rewards/margins": 3.999257802963257, "rewards/rejected": -4.1894707679748535, "step": 223 }, { "epoch": 2.6548148148148147, "grad_norm": 6.8709626079577175, "learning_rate": 1.513339484416309e-07, "logits/chosen": -1.1663920879364014, "logits/rejected": -1.151513695716858, "logps/chosen": -34.081424713134766, "logps/rejected": -52.950035095214844, "loss": 0.0634, "rewards/accuracies": 1.0, "rewards/chosen": -0.6248299479484558, "rewards/margins": 5.083865165710449, "rewards/rejected": -5.708695411682129, "step": 224 }, { "epoch": 2.6666666666666665, "grad_norm": 5.241170241551687, "learning_rate": 1.489497810296046e-07, "logits/chosen": -1.1173107624053955, "logits/rejected": -1.0356335639953613, "logps/chosen": -23.928882598876953, "logps/rejected": -59.75672912597656, "loss": 0.0511, "rewards/accuracies": 1.0, "rewards/chosen": -0.154710054397583, "rewards/margins": 6.490203857421875, "rewards/rejected": -6.644913673400879, "step": 225 }, { "epoch": 2.6785185185185183, "grad_norm": 6.118871434229746, "learning_rate": 1.4657654862801797e-07, "logits/chosen": -1.1692712306976318, "logits/rejected": -1.1598937511444092, "logps/chosen": -21.213607788085938, "logps/rejected": -43.659019470214844, "loss": 0.0535, "rewards/accuracies": 1.0, "rewards/chosen": -0.0038762539625167847, "rewards/margins": 4.2396321296691895, "rewards/rejected": -4.243508338928223, "step": 226 }, { "epoch": 2.6903703703703705, "grad_norm": 6.573686325728602, "learning_rate": 1.4421450805295082e-07, "logits/chosen": -1.3742166757583618, "logits/rejected": -1.2483296394348145, "logps/chosen": -26.414283752441406, "logps/rejected": -36.898033142089844, "loss": 0.0631, "rewards/accuracies": 1.0, "rewards/chosen": 0.4164190888404846, "rewards/margins": 3.4958038330078125, "rewards/rejected": -3.0793848037719727, "step": 227 }, { "epoch": 2.7022222222222223, "grad_norm": 6.627117841873176, "learning_rate": 1.418639149093748e-07, "logits/chosen": -1.252882719039917, "logits/rejected": -1.1287035942077637, "logps/chosen": -27.196077346801758, "logps/rejected": -36.04934310913086, "loss": 0.0652, "rewards/accuracies": 1.0, "rewards/chosen": -0.3088911473751068, "rewards/margins": 3.228538990020752, "rewards/rejected": -3.5374302864074707, "step": 228 }, { "epoch": 2.714074074074074, "grad_norm": 5.7383606439736985, "learning_rate": 1.3952502356349323e-07, "logits/chosen": -1.134902000427246, "logits/rejected": -1.048799753189087, "logps/chosen": -24.576427459716797, "logps/rejected": -45.68292236328125, "loss": 0.0555, "rewards/accuracies": 1.0, "rewards/chosen": -0.00011165440082550049, "rewards/margins": 5.5121378898620605, "rewards/rejected": -5.512249946594238, "step": 229 }, { "epoch": 2.725925925925926, "grad_norm": 7.011579914365523, "learning_rate": 1.371980871152157e-07, "logits/chosen": -1.0634100437164307, "logits/rejected": -0.9104180335998535, "logps/chosen": -29.859907150268555, "logps/rejected": -50.70886993408203, "loss": 0.0694, "rewards/accuracies": 1.0, "rewards/chosen": 0.16749510169029236, "rewards/margins": 5.730169773101807, "rewards/rejected": -5.5626749992370605, "step": 230 }, { "epoch": 2.7377777777777776, "grad_norm": 6.154624592473375, "learning_rate": 1.3488335737076911e-07, "logits/chosen": -1.196423888206482, "logits/rejected": -1.0755786895751953, "logps/chosen": -22.506702423095703, "logps/rejected": -31.105947494506836, "loss": 0.0662, "rewards/accuracies": 1.0, "rewards/chosen": -0.14943012595176697, "rewards/margins": 3.1946725845336914, "rewards/rejected": -3.344102621078491, "step": 231 }, { "epoch": 2.74962962962963, "grad_norm": 5.957255330795934, "learning_rate": 1.3258108481544847e-07, "logits/chosen": -1.1230725049972534, "logits/rejected": -1.0154623985290527, "logps/chosen": -32.393314361572266, "logps/rejected": -46.890968322753906, "loss": 0.0571, "rewards/accuracies": 1.0, "rewards/chosen": -0.3677994906902313, "rewards/margins": 4.284073829650879, "rewards/rejected": -4.651873588562012, "step": 232 }, { "epoch": 2.7614814814814816, "grad_norm": 7.438230804694601, "learning_rate": 1.3029151858651143e-07, "logits/chosen": -1.351361632347107, "logits/rejected": -1.2523919343948364, "logps/chosen": -21.477752685546875, "logps/rejected": -47.73276138305664, "loss": 0.072, "rewards/accuracies": 1.0, "rewards/chosen": -0.33381107449531555, "rewards/margins": 5.271888256072998, "rewards/rejected": -5.60569953918457, "step": 233 }, { "epoch": 2.7733333333333334, "grad_norm": 6.539977486206468, "learning_rate": 1.2801490644621788e-07, "logits/chosen": -0.9469627141952515, "logits/rejected": -0.7967553734779358, "logps/chosen": -29.131805419921875, "logps/rejected": -47.47956085205078, "loss": 0.0694, "rewards/accuracies": 0.9375, "rewards/chosen": -0.4405498802661896, "rewards/margins": 4.784643650054932, "rewards/rejected": -5.225193500518799, "step": 234 }, { "epoch": 2.785185185185185, "grad_norm": 5.650929076564459, "learning_rate": 1.257514947550189e-07, "logits/chosen": -1.1391454935073853, "logits/rejected": -0.9985545873641968, "logps/chosen": -19.8972110748291, "logps/rejected": -33.077980041503906, "loss": 0.0546, "rewards/accuracies": 1.0, "rewards/chosen": 0.12327444553375244, "rewards/margins": 4.0027875900268555, "rewards/rejected": -3.8795135021209717, "step": 235 }, { "epoch": 2.797037037037037, "grad_norm": 7.402429067879936, "learning_rate": 1.2350152844489688e-07, "logits/chosen": -1.1549052000045776, "logits/rejected": -0.9909151792526245, "logps/chosen": -30.456247329711914, "logps/rejected": -48.731536865234375, "loss": 0.0793, "rewards/accuracies": 1.0, "rewards/chosen": -0.4372573494911194, "rewards/margins": 4.651694297790527, "rewards/rejected": -5.088951587677002, "step": 236 }, { "epoch": 2.8088888888888888, "grad_norm": 6.734173424308296, "learning_rate": 1.2126525099286108e-07, "logits/chosen": -1.180855631828308, "logits/rejected": -1.2272781133651733, "logps/chosen": -28.35424041748047, "logps/rejected": -48.205318450927734, "loss": 0.0687, "rewards/accuracies": 1.0, "rewards/chosen": -0.47708311676979065, "rewards/margins": 5.187458515167236, "rewards/rejected": -5.664542198181152, "step": 237 }, { "epoch": 2.8207407407407405, "grad_norm": 6.387888892476844, "learning_rate": 1.1904290439459971e-07, "logits/chosen": -1.1783702373504639, "logits/rejected": -1.0934996604919434, "logps/chosen": -23.247806549072266, "logps/rejected": -42.38697814941406, "loss": 0.0566, "rewards/accuracies": 1.0, "rewards/chosen": -0.272434800863266, "rewards/margins": 3.895128011703491, "rewards/rejected": -4.167562484741211, "step": 238 }, { "epoch": 2.8325925925925928, "grad_norm": 5.6141759750684015, "learning_rate": 1.1683472913829284e-07, "logits/chosen": -1.2703089714050293, "logits/rejected": -1.1347819566726685, "logps/chosen": -36.7236213684082, "logps/rejected": -49.431922912597656, "loss": 0.0502, "rewards/accuracies": 1.0, "rewards/chosen": -0.3381836414337158, "rewards/margins": 4.108426094055176, "rewards/rejected": -4.446609973907471, "step": 239 }, { "epoch": 2.8444444444444446, "grad_norm": 5.932933616519591, "learning_rate": 1.146409641785882e-07, "logits/chosen": -1.1102083921432495, "logits/rejected": -1.0604140758514404, "logps/chosen": -27.76748275756836, "logps/rejected": -34.07774353027344, "loss": 0.0608, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2582487463951111, "rewards/margins": 2.856698513031006, "rewards/rejected": -3.1149468421936035, "step": 240 }, { "epoch": 2.8562962962962963, "grad_norm": 6.7530047905552735, "learning_rate": 1.1246184691074314e-07, "logits/chosen": -1.2408270835876465, "logits/rejected": -1.1994930505752563, "logps/chosen": -28.50021743774414, "logps/rejected": -49.54254150390625, "loss": 0.0722, "rewards/accuracies": 1.0, "rewards/chosen": 0.0546003133058548, "rewards/margins": 6.17899227142334, "rewards/rejected": -6.124391555786133, "step": 241 }, { "epoch": 2.868148148148148, "grad_norm": 7.401984494431854, "learning_rate": 1.1029761314493518e-07, "logits/chosen": -1.3563504219055176, "logits/rejected": -1.2836796045303345, "logps/chosen": -29.872364044189453, "logps/rejected": -42.799747467041016, "loss": 0.0685, "rewards/accuracies": 1.0, "rewards/chosen": -0.22778728604316711, "rewards/margins": 5.134041786193848, "rewards/rejected": -5.3618292808532715, "step": 242 }, { "epoch": 2.88, "grad_norm": 7.471266580413762, "learning_rate": 1.0814849708074414e-07, "logits/chosen": -1.128278136253357, "logits/rejected": -0.9680910706520081, "logps/chosen": -38.86433792114258, "logps/rejected": -47.132667541503906, "loss": 0.0593, "rewards/accuracies": 1.0, "rewards/chosen": 0.25867849588394165, "rewards/margins": 4.5064005851745605, "rewards/rejected": -4.247722625732422, "step": 243 }, { "epoch": 2.891851851851852, "grad_norm": 6.390593039880407, "learning_rate": 1.0601473128180854e-07, "logits/chosen": -1.2510465383529663, "logits/rejected": -1.100001573562622, "logps/chosen": -33.47804260253906, "logps/rejected": -41.27080154418945, "loss": 0.0621, "rewards/accuracies": 1.0, "rewards/chosen": 0.0875249058008194, "rewards/margins": 4.39518404006958, "rewards/rejected": -4.307658672332764, "step": 244 }, { "epoch": 2.9037037037037035, "grad_norm": 8.267732345292577, "learning_rate": 1.0389654665065908e-07, "logits/chosen": -1.1220481395721436, "logits/rejected": -1.0034825801849365, "logps/chosen": -24.331592559814453, "logps/rejected": -41.46772003173828, "loss": 0.0865, "rewards/accuracies": 0.9375, "rewards/chosen": -0.26567134261131287, "rewards/margins": 4.682834148406982, "rewards/rejected": -4.948505401611328, "step": 245 }, { "epoch": 2.9155555555555557, "grad_norm": 7.488610652410469, "learning_rate": 1.0179417240373182e-07, "logits/chosen": -1.176962971687317, "logits/rejected": -1.1089400053024292, "logps/chosen": -34.5350341796875, "logps/rejected": -56.02618408203125, "loss": 0.0643, "rewards/accuracies": 1.0, "rewards/chosen": -0.9151340126991272, "rewards/margins": 5.155758380889893, "rewards/rejected": -6.070892333984375, "step": 246 }, { "epoch": 2.9274074074074075, "grad_norm": 6.376533768492628, "learning_rate": 9.970783604656383e-08, "logits/chosen": -1.3059768676757812, "logits/rejected": -1.0361342430114746, "logps/chosen": -28.046321868896484, "logps/rejected": -48.62135696411133, "loss": 0.0632, "rewards/accuracies": 1.0, "rewards/chosen": -0.13395918905735016, "rewards/margins": 5.584090232849121, "rewards/rejected": -5.718049049377441, "step": 247 }, { "epoch": 2.9392592592592592, "grad_norm": 7.764371689739165, "learning_rate": 9.763776334917398e-08, "logits/chosen": -1.3117642402648926, "logits/rejected": -1.1723650693893433, "logps/chosen": -28.31963348388672, "logps/rejected": -37.416561126708984, "loss": 0.0825, "rewards/accuracies": 1.0, "rewards/chosen": -0.4260867238044739, "rewards/margins": 2.7781217098236084, "rewards/rejected": -3.2042083740234375, "step": 248 }, { "epoch": 2.951111111111111, "grad_norm": 6.603531713615615, "learning_rate": 9.558417832163162e-08, "logits/chosen": -1.0509438514709473, "logits/rejected": -1.1028845310211182, "logps/chosen": -29.35840606689453, "logps/rejected": -39.08806610107422, "loss": 0.0572, "rewards/accuracies": 1.0, "rewards/chosen": -0.06763426959514618, "rewards/margins": 4.334118843078613, "rewards/rejected": -4.401752948760986, "step": 249 }, { "epoch": 2.962962962962963, "grad_norm": 6.641636931789762, "learning_rate": 9.354730318981561e-08, "logits/chosen": -1.269490122795105, "logits/rejected": -1.1995911598205566, "logps/chosen": -23.048587799072266, "logps/rejected": -41.5166015625, "loss": 0.0718, "rewards/accuracies": 1.0, "rewards/chosen": -0.22312739491462708, "rewards/margins": 4.370500564575195, "rewards/rejected": -4.5936279296875, "step": 250 }, { "epoch": 2.974814814814815, "grad_norm": 5.554303148575841, "learning_rate": 9.15273583713663e-08, "logits/chosen": -1.2579662799835205, "logits/rejected": -1.0015959739685059, "logps/chosen": -31.479568481445312, "logps/rejected": -56.00233459472656, "loss": 0.0555, "rewards/accuracies": 1.0, "rewards/chosen": -0.5022794008255005, "rewards/margins": 6.369531154632568, "rewards/rejected": -6.871809959411621, "step": 251 }, { "epoch": 2.986666666666667, "grad_norm": 5.613495199138643, "learning_rate": 8.95245624518336e-08, "logits/chosen": -1.2209105491638184, "logits/rejected": -1.217021107673645, "logps/chosen": -25.06351089477539, "logps/rejected": -47.17867660522461, "loss": 0.0597, "rewards/accuracies": 1.0, "rewards/chosen": -0.4572719633579254, "rewards/margins": 4.925302028656006, "rewards/rejected": -5.382573127746582, "step": 252 }, { "epoch": 2.9985185185185186, "grad_norm": 5.721091066167364, "learning_rate": 8.753913216102285e-08, "logits/chosen": -1.257638931274414, "logits/rejected": -1.1348259449005127, "logps/chosen": -28.36161231994629, "logps/rejected": -52.211952209472656, "loss": 0.0512, "rewards/accuracies": 1.0, "rewards/chosen": -0.5001235604286194, "rewards/margins": 4.936119079589844, "rewards/rejected": -5.436242580413818, "step": 253 }, { "epoch": 3.0103703703703704, "grad_norm": 6.164342961198106, "learning_rate": 8.557128234954189e-08, "logits/chosen": -1.16610848903656, "logits/rejected": -1.0525445938110352, "logps/chosen": -19.37337875366211, "logps/rejected": -44.04081344604492, "loss": 0.0608, "rewards/accuracies": 1.0, "rewards/chosen": -0.3864176273345947, "rewards/margins": 5.418819427490234, "rewards/rejected": -5.805237293243408, "step": 254 }, { "epoch": 3.022222222222222, "grad_norm": 4.836985245782948, "learning_rate": 8.362122596555088e-08, "logits/chosen": -1.1399970054626465, "logits/rejected": -0.9710614681243896, "logps/chosen": -23.326759338378906, "logps/rejected": -46.79590606689453, "loss": 0.0426, "rewards/accuracies": 1.0, "rewards/chosen": 0.2866979241371155, "rewards/margins": 6.633603572845459, "rewards/rejected": -6.3469061851501465, "step": 255 }, { "epoch": 3.034074074074074, "grad_norm": 5.427568975360207, "learning_rate": 8.16891740317189e-08, "logits/chosen": -1.2294830083847046, "logits/rejected": -1.1226603984832764, "logps/chosen": -23.196685791015625, "logps/rejected": -38.58136749267578, "loss": 0.0515, "rewards/accuracies": 1.0, "rewards/chosen": -0.16551783680915833, "rewards/margins": 4.354981899261475, "rewards/rejected": -4.520500183105469, "step": 256 }, { "epoch": 3.0459259259259257, "grad_norm": 5.924541071404178, "learning_rate": 7.977533562238838e-08, "logits/chosen": -1.1663788557052612, "logits/rejected": -1.1404701471328735, "logps/chosen": -26.776004791259766, "logps/rejected": -50.571266174316406, "loss": 0.059, "rewards/accuracies": 0.9375, "rewards/chosen": -0.18248632550239563, "rewards/margins": 5.887378692626953, "rewards/rejected": -6.069864273071289, "step": 257 }, { "epoch": 3.057777777777778, "grad_norm": 4.128731375178606, "learning_rate": 7.787991784094999e-08, "logits/chosen": -1.2448476552963257, "logits/rejected": -1.0964651107788086, "logps/chosen": -29.85052490234375, "logps/rejected": -62.34690856933594, "loss": 0.0365, "rewards/accuracies": 1.0, "rewards/chosen": -0.4742766320705414, "rewards/margins": 5.80230712890625, "rewards/rejected": -6.276583671569824, "step": 258 }, { "epoch": 3.0696296296296297, "grad_norm": 6.5179983840331825, "learning_rate": 7.60031257974316e-08, "logits/chosen": -1.1081359386444092, "logits/rejected": -1.0185449123382568, "logps/chosen": -23.463979721069336, "logps/rejected": -50.03909683227539, "loss": 0.0571, "rewards/accuracies": 1.0, "rewards/chosen": -0.14104682207107544, "rewards/margins": 5.487791538238525, "rewards/rejected": -5.628839015960693, "step": 259 }, { "epoch": 3.0814814814814815, "grad_norm": 6.670813820042167, "learning_rate": 7.414516258630244e-08, "logits/chosen": -1.0931766033172607, "logits/rejected": -0.9176234602928162, "logps/chosen": -35.09284210205078, "logps/rejected": -56.267723083496094, "loss": 0.0615, "rewards/accuracies": 1.0, "rewards/chosen": -0.34566253423690796, "rewards/margins": 5.847842216491699, "rewards/rejected": -6.193504810333252, "step": 260 }, { "epoch": 3.0933333333333333, "grad_norm": 4.886020098171949, "learning_rate": 7.230622926449564e-08, "logits/chosen": -1.2389843463897705, "logits/rejected": -1.1709716320037842, "logps/chosen": -23.021934509277344, "logps/rejected": -42.478797912597656, "loss": 0.0446, "rewards/accuracies": 1.0, "rewards/chosen": -0.25014615058898926, "rewards/margins": 5.359426975250244, "rewards/rejected": -5.6095733642578125, "step": 261 }, { "epoch": 3.105185185185185, "grad_norm": 5.470390367743688, "learning_rate": 7.048652482965078e-08, "logits/chosen": -1.250532865524292, "logits/rejected": -1.098189353942871, "logps/chosen": -33.6146354675293, "logps/rejected": -41.64539337158203, "loss": 0.0575, "rewards/accuracies": 1.0, "rewards/chosen": -0.16313320398330688, "rewards/margins": 4.307170391082764, "rewards/rejected": -4.470303535461426, "step": 262 }, { "epoch": 3.117037037037037, "grad_norm": 4.836356334775007, "learning_rate": 6.868624619858021e-08, "logits/chosen": -1.4147872924804688, "logits/rejected": -1.4524210691452026, "logps/chosen": -28.40629768371582, "logps/rejected": -56.72626495361328, "loss": 0.0447, "rewards/accuracies": 1.0, "rewards/chosen": 0.03009369969367981, "rewards/margins": 5.194394111633301, "rewards/rejected": -5.164300918579102, "step": 263 }, { "epoch": 3.128888888888889, "grad_norm": 4.750367218060603, "learning_rate": 6.690558818595943e-08, "logits/chosen": -1.2358546257019043, "logits/rejected": -1.1999270915985107, "logps/chosen": -25.05208969116211, "logps/rejected": -48.712806701660156, "loss": 0.0435, "rewards/accuracies": 1.0, "rewards/chosen": -0.637313723564148, "rewards/margins": 5.243877410888672, "rewards/rejected": -5.881191253662109, "step": 264 }, { "epoch": 3.140740740740741, "grad_norm": 4.863983890990079, "learning_rate": 6.514474348324581e-08, "logits/chosen": -1.2671034336090088, "logits/rejected": -1.1254373788833618, "logps/chosen": -32.094966888427734, "logps/rejected": -52.297821044921875, "loss": 0.0446, "rewards/accuracies": 1.0, "rewards/chosen": -0.4840225875377655, "rewards/margins": 5.210573196411133, "rewards/rejected": -5.6945953369140625, "step": 265 }, { "epoch": 3.1525925925925926, "grad_norm": 6.337695693323137, "learning_rate": 6.340390263782655e-08, "logits/chosen": -1.2698873281478882, "logits/rejected": -1.172045111656189, "logps/chosen": -24.47865867614746, "logps/rejected": -54.05537796020508, "loss": 0.0665, "rewards/accuracies": 1.0, "rewards/chosen": -0.3584892153739929, "rewards/margins": 5.420907020568848, "rewards/rejected": -5.779396057128906, "step": 266 }, { "epoch": 3.1644444444444444, "grad_norm": 6.315515433549729, "learning_rate": 6.168325403239913e-08, "logits/chosen": -1.2651307582855225, "logits/rejected": -1.1162527799606323, "logps/chosen": -19.784488677978516, "logps/rejected": -40.73728942871094, "loss": 0.0556, "rewards/accuracies": 1.0, "rewards/chosen": 0.27537021040916443, "rewards/margins": 5.313858985900879, "rewards/rejected": -5.038488388061523, "step": 267 }, { "epoch": 3.176296296296296, "grad_norm": 4.582040973118046, "learning_rate": 5.998298386458545e-08, "logits/chosen": -1.0796051025390625, "logits/rejected": -1.0264118909835815, "logps/chosen": -27.581031799316406, "logps/rejected": -49.427703857421875, "loss": 0.0417, "rewards/accuracies": 1.0, "rewards/chosen": 0.022144198417663574, "rewards/margins": 5.080024719238281, "rewards/rejected": -5.057880878448486, "step": 268 }, { "epoch": 3.188148148148148, "grad_norm": 4.952404939534042, "learning_rate": 5.830327612678265e-08, "logits/chosen": -1.0570693016052246, "logits/rejected": -1.0790140628814697, "logps/chosen": -27.206192016601562, "logps/rejected": -52.819984436035156, "loss": 0.0419, "rewards/accuracies": 1.0, "rewards/chosen": -0.8033032417297363, "rewards/margins": 4.914515495300293, "rewards/rejected": -5.717819690704346, "step": 269 }, { "epoch": 3.2, "grad_norm": 5.724512806119854, "learning_rate": 5.6644312586253044e-08, "logits/chosen": -1.0734919309616089, "logits/rejected": -1.0849241018295288, "logps/chosen": -41.63764572143555, "logps/rejected": -48.729576110839844, "loss": 0.0562, "rewards/accuracies": 1.0, "rewards/chosen": -0.17860059440135956, "rewards/margins": 4.616375923156738, "rewards/rejected": -4.794977188110352, "step": 270 }, { "epoch": 3.211851851851852, "grad_norm": 5.991455888598502, "learning_rate": 5.5006272765454056e-08, "logits/chosen": -1.2988901138305664, "logits/rejected": -1.1308969259262085, "logps/chosen": -22.436080932617188, "logps/rejected": -34.09817123413086, "loss": 0.0584, "rewards/accuracies": 1.0, "rewards/chosen": -0.005201712250709534, "rewards/margins": 3.529590606689453, "rewards/rejected": -3.534792423248291, "step": 271 }, { "epoch": 3.2237037037037037, "grad_norm": 5.413534996418431, "learning_rate": 5.338933392261158e-08, "logits/chosen": -1.222093105316162, "logits/rejected": -1.1171449422836304, "logps/chosen": -26.16643714904785, "logps/rejected": -42.16415023803711, "loss": 0.0508, "rewards/accuracies": 1.0, "rewards/chosen": -0.20804953575134277, "rewards/margins": 5.081421852111816, "rewards/rejected": -5.2894721031188965, "step": 272 }, { "epoch": 3.2355555555555555, "grad_norm": 5.91458057536832, "learning_rate": 5.1793671032538206e-08, "logits/chosen": -1.2229275703430176, "logits/rejected": -1.3230491876602173, "logps/chosen": -23.901247024536133, "logps/rejected": -45.79841995239258, "loss": 0.0586, "rewards/accuracies": 1.0, "rewards/chosen": -0.171402707695961, "rewards/margins": 4.954162120819092, "rewards/rejected": -5.125565052032471, "step": 273 }, { "epoch": 3.2474074074074073, "grad_norm": 5.22719369235926, "learning_rate": 5.021945676769859e-08, "logits/chosen": -1.2852232456207275, "logits/rejected": -1.2391951084136963, "logps/chosen": -20.282339096069336, "logps/rejected": -42.286293029785156, "loss": 0.0503, "rewards/accuracies": 1.0, "rewards/chosen": -0.014406859874725342, "rewards/margins": 4.513213157653809, "rewards/rejected": -4.5276198387146, "step": 274 }, { "epoch": 3.259259259259259, "grad_norm": 5.73422178803048, "learning_rate": 4.866686147952387e-08, "logits/chosen": -1.0481388568878174, "logits/rejected": -0.9910224676132202, "logps/chosen": -31.128089904785156, "logps/rejected": -48.627586364746094, "loss": 0.0565, "rewards/accuracies": 1.0, "rewards/chosen": 0.11383038759231567, "rewards/margins": 4.8710784912109375, "rewards/rejected": -4.757248401641846, "step": 275 }, { "epoch": 3.2711111111111113, "grad_norm": 5.655456397723797, "learning_rate": 4.71360531799774e-08, "logits/chosen": -1.1052677631378174, "logits/rejected": -1.0184680223464966, "logps/chosen": -36.36450958251953, "logps/rejected": -51.73442840576172, "loss": 0.0596, "rewards/accuracies": 1.0, "rewards/chosen": -0.5170344114303589, "rewards/margins": 5.25890588760376, "rewards/rejected": -5.775939464569092, "step": 276 }, { "epoch": 3.282962962962963, "grad_norm": 4.996738283026781, "learning_rate": 4.562719752337349e-08, "logits/chosen": -1.266676664352417, "logits/rejected": -1.1158446073532104, "logps/chosen": -33.958919525146484, "logps/rejected": -66.85248565673828, "loss": 0.0523, "rewards/accuracies": 1.0, "rewards/chosen": -0.6868615746498108, "rewards/margins": 6.892773628234863, "rewards/rejected": -7.579635143280029, "step": 277 }, { "epoch": 3.294814814814815, "grad_norm": 5.1730881424971535, "learning_rate": 4.4140457788451434e-08, "logits/chosen": -1.3682211637496948, "logits/rejected": -1.2177406549453735, "logps/chosen": -23.593040466308594, "logps/rejected": -43.28880310058594, "loss": 0.047, "rewards/accuracies": 1.0, "rewards/chosen": 0.21666671335697174, "rewards/margins": 4.890883445739746, "rewards/rejected": -4.674216270446777, "step": 278 }, { "epoch": 3.3066666666666666, "grad_norm": 4.729619192449929, "learning_rate": 4.267599486070647e-08, "logits/chosen": -1.2258741855621338, "logits/rejected": -1.1649140119552612, "logps/chosen": -31.068470001220703, "logps/rejected": -36.381038665771484, "loss": 0.0481, "rewards/accuracies": 1.0, "rewards/chosen": -0.24162916839122772, "rewards/margins": 4.573906898498535, "rewards/rejected": -4.8155364990234375, "step": 279 }, { "epoch": 3.3185185185185184, "grad_norm": 5.122216550777693, "learning_rate": 4.1233967214979764e-08, "logits/chosen": -1.198957920074463, "logits/rejected": -1.06025230884552, "logps/chosen": -33.02262496948242, "logps/rejected": -41.4984130859375, "loss": 0.049, "rewards/accuracies": 1.0, "rewards/chosen": 0.23480704426765442, "rewards/margins": 3.949801445007324, "rewards/rejected": -3.714993953704834, "step": 280 }, { "epoch": 3.33037037037037, "grad_norm": 4.305629596628497, "learning_rate": 3.9814530898309356e-08, "logits/chosen": -1.0878995656967163, "logits/rejected": -1.0379247665405273, "logps/chosen": -27.192787170410156, "logps/rejected": -46.65719223022461, "loss": 0.0366, "rewards/accuracies": 1.0, "rewards/chosen": -0.029019802808761597, "rewards/margins": 5.454700946807861, "rewards/rejected": -5.483720779418945, "step": 281 }, { "epoch": 3.3422222222222224, "grad_norm": 5.169778953020736, "learning_rate": 3.8417839513043646e-08, "logits/chosen": -1.2834384441375732, "logits/rejected": -1.2438150644302368, "logps/chosen": -30.712045669555664, "logps/rejected": -37.924110412597656, "loss": 0.0538, "rewards/accuracies": 1.0, "rewards/chosen": -0.27224576473236084, "rewards/margins": 3.7623844146728516, "rewards/rejected": -4.034629821777344, "step": 282 }, { "epoch": 3.354074074074074, "grad_norm": 6.097603815404355, "learning_rate": 3.704404420021956e-08, "logits/chosen": -1.1656073331832886, "logits/rejected": -0.950996994972229, "logps/chosen": -27.072315216064453, "logps/rejected": -46.62635040283203, "loss": 0.0607, "rewards/accuracies": 1.0, "rewards/chosen": -0.001886114478111267, "rewards/margins": 5.3247785568237305, "rewards/rejected": -5.326663970947266, "step": 283 }, { "epoch": 3.365925925925926, "grad_norm": 5.599744322780303, "learning_rate": 3.569329362320708e-08, "logits/chosen": -1.015643835067749, "logits/rejected": -0.936226487159729, "logps/chosen": -21.00103187561035, "logps/rejected": -49.05156326293945, "loss": 0.0462, "rewards/accuracies": 1.0, "rewards/chosen": -0.10715761035680771, "rewards/margins": 5.087098121643066, "rewards/rejected": -5.19425630569458, "step": 284 }, { "epoch": 3.3777777777777778, "grad_norm": 5.412291665519436, "learning_rate": 3.436573395162179e-08, "logits/chosen": -1.2125096321105957, "logits/rejected": -1.0717750787734985, "logps/chosen": -26.21784782409668, "logps/rejected": -44.80372619628906, "loss": 0.0562, "rewards/accuracies": 1.0, "rewards/chosen": -0.4322938024997711, "rewards/margins": 4.486114025115967, "rewards/rejected": -4.918407440185547, "step": 285 }, { "epoch": 3.3896296296296295, "grad_norm": 5.068684864066647, "learning_rate": 3.306150884550732e-08, "logits/chosen": -1.306767225265503, "logits/rejected": -1.136150598526001, "logps/chosen": -28.90319061279297, "logps/rejected": -48.472164154052734, "loss": 0.0491, "rewards/accuracies": 1.0, "rewards/chosen": -0.3896186947822571, "rewards/margins": 4.801982879638672, "rewards/rejected": -5.191601753234863, "step": 286 }, { "epoch": 3.4014814814814813, "grad_norm": 6.257371157657287, "learning_rate": 3.17807594397895e-08, "logits/chosen": -1.2118041515350342, "logits/rejected": -1.007792353630066, "logps/chosen": -26.383615493774414, "logps/rejected": -46.10572052001953, "loss": 0.0542, "rewards/accuracies": 1.0, "rewards/chosen": -0.43773341178894043, "rewards/margins": 5.824153900146484, "rewards/rejected": -6.261887550354004, "step": 287 }, { "epoch": 3.413333333333333, "grad_norm": 5.353883051519317, "learning_rate": 3.052362432900332e-08, "logits/chosen": -1.447021245956421, "logits/rejected": -1.2934633493423462, "logps/chosen": -25.619125366210938, "logps/rejected": -42.07542037963867, "loss": 0.0495, "rewards/accuracies": 1.0, "rewards/chosen": 0.3246709406375885, "rewards/margins": 7.011876106262207, "rewards/rejected": -6.687204360961914, "step": 288 }, { "epoch": 3.4251851851851853, "grad_norm": 5.399450219209751, "learning_rate": 2.9290239552295538e-08, "logits/chosen": -1.0401594638824463, "logits/rejected": -1.0249950885772705, "logps/chosen": -32.01249313354492, "logps/rejected": -38.693145751953125, "loss": 0.0501, "rewards/accuracies": 1.0, "rewards/chosen": 0.22764620184898376, "rewards/margins": 4.879059314727783, "rewards/rejected": -4.6514129638671875, "step": 289 }, { "epoch": 3.437037037037037, "grad_norm": 5.942445036249677, "learning_rate": 2.8080738578703052e-08, "logits/chosen": -1.2160862684249878, "logits/rejected": -1.1057730913162231, "logps/chosen": -26.857769012451172, "logps/rejected": -49.42009735107422, "loss": 0.0644, "rewards/accuracies": 1.0, "rewards/chosen": 0.0421622097492218, "rewards/margins": 7.015720844268799, "rewards/rejected": -6.973557472229004, "step": 290 }, { "epoch": 3.448888888888889, "grad_norm": 4.452390830898345, "learning_rate": 2.6895252292709974e-08, "logits/chosen": -1.0676244497299194, "logits/rejected": -1.078723669052124, "logps/chosen": -31.738510131835938, "logps/rejected": -45.86015319824219, "loss": 0.0474, "rewards/accuracies": 1.0, "rewards/chosen": -0.5407127737998962, "rewards/margins": 4.980555057525635, "rewards/rejected": -5.521267890930176, "step": 291 }, { "epoch": 3.4607407407407407, "grad_norm": 6.147853636678421, "learning_rate": 2.5733908980083984e-08, "logits/chosen": -1.2384705543518066, "logits/rejected": -1.112764835357666, "logps/chosen": -26.170108795166016, "logps/rejected": -45.731956481933594, "loss": 0.0609, "rewards/accuracies": 1.0, "rewards/chosen": -0.5723249316215515, "rewards/margins": 4.402219772338867, "rewards/rejected": -4.974545001983643, "step": 292 }, { "epoch": 3.4725925925925925, "grad_norm": 5.914419745435524, "learning_rate": 2.4596834313994037e-08, "logits/chosen": -1.1161627769470215, "logits/rejected": -1.0215301513671875, "logps/chosen": -28.129005432128906, "logps/rejected": -33.972686767578125, "loss": 0.0566, "rewards/accuracies": 1.0, "rewards/chosen": 0.1716342568397522, "rewards/margins": 4.373476028442383, "rewards/rejected": -4.201840877532959, "step": 293 }, { "epoch": 3.4844444444444447, "grad_norm": 4.7471616018558285, "learning_rate": 2.3484151341411018e-08, "logits/chosen": -1.1082960367202759, "logits/rejected": -1.0436348915100098, "logps/chosen": -20.280670166015625, "logps/rejected": -46.68223190307617, "loss": 0.0442, "rewards/accuracies": 1.0, "rewards/chosen": -0.2738330066204071, "rewards/margins": 5.264364719390869, "rewards/rejected": -5.5381975173950195, "step": 294 }, { "epoch": 3.4962962962962965, "grad_norm": 4.718228569099853, "learning_rate": 2.23959804697921e-08, "logits/chosen": -1.0989983081817627, "logits/rejected": -1.0200862884521484, "logps/chosen": -28.536529541015625, "logps/rejected": -44.35844421386719, "loss": 0.039, "rewards/accuracies": 1.0, "rewards/chosen": 0.01516886055469513, "rewards/margins": 5.231680870056152, "rewards/rejected": -5.216512680053711, "step": 295 }, { "epoch": 3.5081481481481482, "grad_norm": 4.412160626992289, "learning_rate": 2.1332439454051277e-08, "logits/chosen": -1.0349336862564087, "logits/rejected": -0.9772415161132812, "logps/chosen": -24.290695190429688, "logps/rejected": -34.85298538208008, "loss": 0.0405, "rewards/accuracies": 1.0, "rewards/chosen": 0.006944596767425537, "rewards/margins": 3.6680963039398193, "rewards/rejected": -3.661151647567749, "step": 296 }, { "epoch": 3.52, "grad_norm": 5.698184998134574, "learning_rate": 2.029364338381656e-08, "logits/chosen": -1.373365879058838, "logits/rejected": -1.2929483652114868, "logps/chosen": -34.31553649902344, "logps/rejected": -35.5068359375, "loss": 0.0555, "rewards/accuracies": 1.0, "rewards/chosen": -0.09695194661617279, "rewards/margins": 3.730624198913574, "rewards/rejected": -3.8275763988494873, "step": 297 }, { "epoch": 3.531851851851852, "grad_norm": 5.166813211580323, "learning_rate": 1.9279704670975726e-08, "logits/chosen": -1.0577523708343506, "logits/rejected": -0.9344998598098755, "logps/chosen": -25.05517578125, "logps/rejected": -48.95963668823242, "loss": 0.0435, "rewards/accuracies": 1.0, "rewards/chosen": -0.17365989089012146, "rewards/margins": 4.252870559692383, "rewards/rejected": -4.426530838012695, "step": 298 }, { "epoch": 3.5437037037037036, "grad_norm": 4.976330098589956, "learning_rate": 1.829073303751172e-08, "logits/chosen": -1.071714162826538, "logits/rejected": -1.0084483623504639, "logps/chosen": -20.396150588989258, "logps/rejected": -38.729373931884766, "loss": 0.0463, "rewards/accuracies": 1.0, "rewards/chosen": -0.11510992050170898, "rewards/margins": 5.170332431793213, "rewards/rejected": -5.2854413986206055, "step": 299 }, { "epoch": 3.5555555555555554, "grad_norm": 4.623020185136485, "learning_rate": 1.732683550362954e-08, "logits/chosen": -1.06589674949646, "logits/rejected": -1.0053000450134277, "logps/chosen": -33.81154251098633, "logps/rejected": -48.16522216796875, "loss": 0.0401, "rewards/accuracies": 1.0, "rewards/chosen": -0.0654190182685852, "rewards/margins": 4.919932842254639, "rewards/rejected": -4.985352516174316, "step": 300 }, { "epoch": 3.5674074074074076, "grad_norm": 4.373917316257469, "learning_rate": 1.6388116376174765e-08, "logits/chosen": -1.1930819749832153, "logits/rejected": -1.1007626056671143, "logps/chosen": -24.583969116210938, "logps/rejected": -48.29629898071289, "loss": 0.0404, "rewards/accuracies": 1.0, "rewards/chosen": -0.5538195371627808, "rewards/margins": 5.18541145324707, "rewards/rejected": -5.739231109619141, "step": 301 }, { "epoch": 3.5792592592592594, "grad_norm": 4.944808160984247, "learning_rate": 1.5474677237346468e-08, "logits/chosen": -1.1952768564224243, "logits/rejected": -1.1539109945297241, "logps/chosen": -29.354717254638672, "logps/rejected": -49.623294830322266, "loss": 0.0512, "rewards/accuracies": 1.0, "rewards/chosen": -0.31121665239334106, "rewards/margins": 4.585163116455078, "rewards/rejected": -4.896379470825195, "step": 302 }, { "epoch": 3.591111111111111, "grad_norm": 6.237582774941322, "learning_rate": 1.4586616933704527e-08, "logits/chosen": -1.0483250617980957, "logits/rejected": -1.0512489080429077, "logps/chosen": -36.7315788269043, "logps/rejected": -52.41490173339844, "loss": 0.063, "rewards/accuracies": 1.0, "rewards/chosen": 0.060242414474487305, "rewards/margins": 5.002007484436035, "rewards/rejected": -4.941765308380127, "step": 303 }, { "epoch": 3.602962962962963, "grad_norm": 5.366887328514776, "learning_rate": 1.372403156547311e-08, "logits/chosen": -1.2591538429260254, "logits/rejected": -1.1872644424438477, "logps/chosen": -22.69057273864746, "logps/rejected": -38.499332427978516, "loss": 0.0535, "rewards/accuracies": 1.0, "rewards/chosen": -0.3308228552341461, "rewards/margins": 4.569196701049805, "rewards/rejected": -4.900019645690918, "step": 304 }, { "epoch": 3.6148148148148147, "grad_norm": 4.383186056032288, "learning_rate": 1.2887014476141212e-08, "logits/chosen": -1.1302443742752075, "logits/rejected": -1.1017392873764038, "logps/chosen": -27.243087768554688, "logps/rejected": -47.09513473510742, "loss": 0.0441, "rewards/accuracies": 1.0, "rewards/chosen": 0.2740994989871979, "rewards/margins": 6.748981475830078, "rewards/rejected": -6.474882125854492, "step": 305 }, { "epoch": 3.626666666666667, "grad_norm": 5.520520861273014, "learning_rate": 1.2075656242361732e-08, "logits/chosen": -1.1834189891815186, "logits/rejected": -1.0502477884292603, "logps/chosen": -24.07543182373047, "logps/rejected": -44.05875778198242, "loss": 0.0493, "rewards/accuracies": 1.0, "rewards/chosen": -0.1085430383682251, "rewards/margins": 4.616766452789307, "rewards/rejected": -4.725309371948242, "step": 306 }, { "epoch": 3.6385185185185183, "grad_norm": 4.5846368218080045, "learning_rate": 1.1290044664149873e-08, "logits/chosen": -1.0908325910568237, "logits/rejected": -1.0090572834014893, "logps/chosen": -32.33647918701172, "logps/rejected": -47.15243148803711, "loss": 0.0393, "rewards/accuracies": 1.0, "rewards/chosen": -0.17506128549575806, "rewards/margins": 4.792283058166504, "rewards/rejected": -4.967344284057617, "step": 307 }, { "epoch": 3.6503703703703705, "grad_norm": 5.28209891846498, "learning_rate": 1.0530264755381824e-08, "logits/chosen": -1.2786378860473633, "logits/rejected": -1.3132318258285522, "logps/chosen": -26.759113311767578, "logps/rejected": -41.227149963378906, "loss": 0.0528, "rewards/accuracies": 1.0, "rewards/chosen": -0.08898818492889404, "rewards/margins": 3.8004322052001953, "rewards/rejected": -3.8894202709198, "step": 308 }, { "epoch": 3.6622222222222223, "grad_norm": 4.960907388580732, "learning_rate": 9.796398734595284e-09, "logits/chosen": -1.1778481006622314, "logits/rejected": -1.181472897529602, "logps/chosen": -20.444726943969727, "logps/rejected": -33.29534149169922, "loss": 0.0466, "rewards/accuracies": 1.0, "rewards/chosen": -0.2902683913707733, "rewards/margins": 3.6233019828796387, "rewards/rejected": -3.9135704040527344, "step": 309 }, { "epoch": 3.674074074074074, "grad_norm": 5.737646906284586, "learning_rate": 9.088526016092141e-09, "logits/chosen": -1.1990212202072144, "logits/rejected": -1.1145985126495361, "logps/chosen": -23.687454223632812, "logps/rejected": -40.095672607421875, "loss": 0.0516, "rewards/accuracies": 1.0, "rewards/chosen": 0.447526752948761, "rewards/margins": 5.467113018035889, "rewards/rejected": -5.019586086273193, "step": 310 }, { "epoch": 3.685925925925926, "grad_norm": 5.7150399704998245, "learning_rate": 8.40672320134489e-09, "logits/chosen": -1.146994948387146, "logits/rejected": -0.9583498239517212, "logps/chosen": -27.36312484741211, "logps/rejected": -43.72743225097656, "loss": 0.0543, "rewards/accuracies": 1.0, "rewards/chosen": -0.11620418727397919, "rewards/margins": 5.6578192710876465, "rewards/rejected": -5.774023056030273, "step": 311 }, { "epoch": 3.6977777777777776, "grad_norm": 4.67711156350355, "learning_rate": 7.751064070707247e-09, "logits/chosen": -1.3420299291610718, "logits/rejected": -1.3341833353042603, "logps/chosen": -31.239133834838867, "logps/rejected": -41.84351348876953, "loss": 0.0412, "rewards/accuracies": 1.0, "rewards/chosen": 0.3695347011089325, "rewards/margins": 4.75352668762207, "rewards/rejected": -4.3839921951293945, "step": 312 }, { "epoch": 3.70962962962963, "grad_norm": 5.331465549642304, "learning_rate": 7.12161957543006e-09, "logits/chosen": -1.1273610591888428, "logits/rejected": -1.1161746978759766, "logps/chosen": -37.207733154296875, "logps/rejected": -61.19139862060547, "loss": 0.0479, "rewards/accuracies": 1.0, "rewards/chosen": -0.4315568804740906, "rewards/margins": 5.310615539550781, "rewards/rejected": -5.742172718048096, "step": 313 }, { "epoch": 3.7214814814814816, "grad_norm": 5.478798851131127, "learning_rate": 6.518457829983559e-09, "logits/chosen": -1.3124021291732788, "logits/rejected": -1.2279609441757202, "logps/chosen": -34.83631896972656, "logps/rejected": -44.276790618896484, "loss": 0.0511, "rewards/accuracies": 1.0, "rewards/chosen": -0.18586915731430054, "rewards/margins": 3.245110511779785, "rewards/rejected": -3.4309799671173096, "step": 314 }, { "epoch": 3.7333333333333334, "grad_norm": 4.7427648272619, "learning_rate": 5.9416441046862555e-09, "logits/chosen": -1.1716216802597046, "logits/rejected": -1.2297029495239258, "logps/chosen": -21.677108764648438, "logps/rejected": -35.96882247924805, "loss": 0.0502, "rewards/accuracies": 1.0, "rewards/chosen": 0.003640979528427124, "rewards/margins": 3.7295522689819336, "rewards/rejected": -3.7259111404418945, "step": 315 }, { "epoch": 3.745185185185185, "grad_norm": 5.760686688528461, "learning_rate": 5.3912408186420064e-09, "logits/chosen": -1.038623332977295, "logits/rejected": -0.9665778875350952, "logps/chosen": -27.82607650756836, "logps/rejected": -35.596378326416016, "loss": 0.0571, "rewards/accuracies": 1.0, "rewards/chosen": 0.08153516054153442, "rewards/margins": 4.407654762268066, "rewards/rejected": -4.326119422912598, "step": 316 }, { "epoch": 3.757037037037037, "grad_norm": 4.629475217167777, "learning_rate": 4.867307532985227e-09, "logits/chosen": -1.2615653276443481, "logits/rejected": -1.1494407653808594, "logps/chosen": -40.15790557861328, "logps/rejected": -60.7736701965332, "loss": 0.0398, "rewards/accuracies": 1.0, "rewards/chosen": -0.6902495622634888, "rewards/margins": 5.601743698120117, "rewards/rejected": -6.291993141174316, "step": 317 }, { "epoch": 3.7688888888888887, "grad_norm": 6.454886951587756, "learning_rate": 4.369900944435734e-09, "logits/chosen": -1.0968234539031982, "logits/rejected": -1.026517391204834, "logps/chosen": -31.793502807617188, "logps/rejected": -60.37879180908203, "loss": 0.0663, "rewards/accuracies": 1.0, "rewards/chosen": -0.31835824251174927, "rewards/margins": 5.233622074127197, "rewards/rejected": -5.551980495452881, "step": 318 }, { "epoch": 3.7807407407407405, "grad_norm": 5.37027735834608, "learning_rate": 3.899074879163244e-09, "logits/chosen": -1.2527568340301514, "logits/rejected": -1.0810654163360596, "logps/chosen": -24.402645111083984, "logps/rejected": -39.67679977416992, "loss": 0.0538, "rewards/accuracies": 1.0, "rewards/chosen": -0.4089929461479187, "rewards/margins": 4.298420429229736, "rewards/rejected": -4.707413196563721, "step": 319 }, { "epoch": 3.7925925925925927, "grad_norm": 5.568233279162257, "learning_rate": 3.4548802869627804e-09, "logits/chosen": -1.291711688041687, "logits/rejected": -1.2471994161605835, "logps/chosen": -31.061437606811523, "logps/rejected": -49.516639709472656, "loss": 0.0473, "rewards/accuracies": 1.0, "rewards/chosen": -0.07120761275291443, "rewards/margins": 3.723219394683838, "rewards/rejected": -3.794426918029785, "step": 320 }, { "epoch": 3.8044444444444445, "grad_norm": 6.171276653233977, "learning_rate": 3.037365235741024e-09, "logits/chosen": -1.3342313766479492, "logits/rejected": -1.187886357307434, "logps/chosen": -24.079877853393555, "logps/rejected": -38.28224182128906, "loss": 0.0611, "rewards/accuracies": 1.0, "rewards/chosen": -0.4851805865764618, "rewards/margins": 4.108402252197266, "rewards/rejected": -4.593582630157471, "step": 321 }, { "epoch": 3.8162962962962963, "grad_norm": 6.301615641450496, "learning_rate": 2.6465749063149245e-09, "logits/chosen": -1.4614932537078857, "logits/rejected": -1.3210101127624512, "logps/chosen": -24.112567901611328, "logps/rejected": -51.42138671875, "loss": 0.0652, "rewards/accuracies": 1.0, "rewards/chosen": -0.5791712999343872, "rewards/margins": 6.299165725708008, "rewards/rejected": -6.8783369064331055, "step": 322 }, { "epoch": 3.828148148148148, "grad_norm": 6.87975838997433, "learning_rate": 2.282551587522441e-09, "logits/chosen": -1.406750202178955, "logits/rejected": -1.3338254690170288, "logps/chosen": -22.056568145751953, "logps/rejected": -34.89329147338867, "loss": 0.0758, "rewards/accuracies": 1.0, "rewards/chosen": -0.25121578574180603, "rewards/margins": 4.385520935058594, "rewards/rejected": -4.636736869812012, "step": 323 }, { "epoch": 3.84, "grad_norm": 4.966352446635051, "learning_rate": 1.9453346716462316e-09, "logits/chosen": -1.211751937866211, "logits/rejected": -1.1320858001708984, "logps/chosen": -27.62029457092285, "logps/rejected": -32.46119689941406, "loss": 0.0454, "rewards/accuracies": 1.0, "rewards/chosen": 0.004868373274803162, "rewards/margins": 3.807752847671509, "rewards/rejected": -3.802884578704834, "step": 324 }, { "epoch": 3.851851851851852, "grad_norm": 5.653095930506065, "learning_rate": 1.6349606501509794e-09, "logits/chosen": -1.1088950634002686, "logits/rejected": -0.9607290029525757, "logps/chosen": -28.395509719848633, "logps/rejected": -34.00682830810547, "loss": 0.0544, "rewards/accuracies": 1.0, "rewards/chosen": 0.15473833680152893, "rewards/margins": 3.99048113822937, "rewards/rejected": -3.835742473602295, "step": 325 }, { "epoch": 3.863703703703704, "grad_norm": 5.3966444428734945, "learning_rate": 1.351463109734441e-09, "logits/chosen": -1.3495458364486694, "logits/rejected": -1.0097894668579102, "logps/chosen": -22.80147933959961, "logps/rejected": -41.809940338134766, "loss": 0.0496, "rewards/accuracies": 1.0, "rewards/chosen": -0.19359610974788666, "rewards/margins": 5.500581741333008, "rewards/rejected": -5.694178104400635, "step": 326 }, { "epoch": 3.8755555555555556, "grad_norm": 5.006770074945758, "learning_rate": 1.0948727286930192e-09, "logits/chosen": -1.1479936838150024, "logits/rejected": -0.9590707421302795, "logps/chosen": -27.08885955810547, "logps/rejected": -40.10725402832031, "loss": 0.0455, "rewards/accuracies": 1.0, "rewards/chosen": 0.17611512541770935, "rewards/margins": 3.613635540008545, "rewards/rejected": -3.4375205039978027, "step": 327 }, { "epoch": 3.8874074074074074, "grad_norm": 6.085390667471827, "learning_rate": 8.652172736017816e-10, "logits/chosen": -1.1275379657745361, "logits/rejected": -1.116228461265564, "logps/chosen": -33.487083435058594, "logps/rejected": -52.050228118896484, "loss": 0.0628, "rewards/accuracies": 1.0, "rewards/chosen": -0.16681703925132751, "rewards/margins": 4.557419776916504, "rewards/rejected": -4.724237442016602, "step": 328 }, { "epoch": 3.899259259259259, "grad_norm": 6.597375260168904, "learning_rate": 6.625215963098896e-10, "logits/chosen": -1.234811782836914, "logits/rejected": -1.1153168678283691, "logps/chosen": -27.0404052734375, "logps/rejected": -34.0019416809082, "loss": 0.065, "rewards/accuracies": 1.0, "rewards/chosen": -0.32952964305877686, "rewards/margins": 4.547809600830078, "rewards/rejected": -4.8773393630981445, "step": 329 }, { "epoch": 3.911111111111111, "grad_norm": 5.399445593167999, "learning_rate": 4.868076312512515e-10, "logits/chosen": -1.1961758136749268, "logits/rejected": -1.034976840019226, "logps/chosen": -22.31209945678711, "logps/rejected": -44.69541931152344, "loss": 0.0563, "rewards/accuracies": 1.0, "rewards/chosen": 0.18147775530815125, "rewards/margins": 5.29000186920166, "rewards/rejected": -5.108523845672607, "step": 330 }, { "epoch": 3.9229629629629628, "grad_norm": 4.687101989180421, "learning_rate": 3.3809439307086463e-10, "logits/chosen": -1.204687237739563, "logits/rejected": -1.126007318496704, "logps/chosen": -24.837623596191406, "logps/rejected": -40.658023834228516, "loss": 0.0473, "rewards/accuracies": 1.0, "rewards/chosen": 0.2791484594345093, "rewards/margins": 4.140464782714844, "rewards/rejected": -3.8613169193267822, "step": 331 }, { "epoch": 3.934814814814815, "grad_norm": 5.7183873880444045, "learning_rate": 2.1639797456723952e-10, "logits/chosen": -1.2559609413146973, "logits/rejected": -1.0792549848556519, "logps/chosen": -35.796287536621094, "logps/rejected": -46.229820251464844, "loss": 0.049, "rewards/accuracies": 1.0, "rewards/chosen": 0.015116512775421143, "rewards/margins": 5.156147480010986, "rewards/rejected": -5.141030311584473, "step": 332 }, { "epoch": 3.9466666666666668, "grad_norm": 4.801576190645628, "learning_rate": 1.21731544950876e-10, "logits/chosen": -1.227901816368103, "logits/rejected": -1.2207201719284058, "logps/chosen": -31.329517364501953, "logps/rejected": -51.822059631347656, "loss": 0.0438, "rewards/accuracies": 1.0, "rewards/chosen": -0.12939153611660004, "rewards/margins": 5.30501651763916, "rewards/rejected": -5.434407711029053, "step": 333 }, { "epoch": 3.9585185185185185, "grad_norm": 5.366333281325966, "learning_rate": 5.4105348419264394e-11, "logits/chosen": -1.474123239517212, "logits/rejected": -1.370969295501709, "logps/chosen": -21.29511260986328, "logps/rejected": -37.816551208496094, "loss": 0.0584, "rewards/accuracies": 0.9375, "rewards/chosen": 0.26586639881134033, "rewards/margins": 4.2116827964782715, "rewards/rejected": -3.9458167552948, "step": 334 }, { "epoch": 3.9703703703703703, "grad_norm": 4.961233689259609, "learning_rate": 1.3526703048216682e-11, "logits/chosen": -1.2672888040542603, "logits/rejected": -1.0974268913269043, "logps/chosen": -25.828834533691406, "logps/rejected": -52.68805694580078, "loss": 0.0468, "rewards/accuracies": 1.0, "rewards/chosen": 0.10839378833770752, "rewards/margins": 6.133199691772461, "rewards/rejected": -6.024805545806885, "step": 335 }, { "epoch": 3.982222222222222, "grad_norm": 5.7068180002610625, "learning_rate": 0.0, "logits/chosen": -1.3201903104782104, "logits/rejected": -1.2799780368804932, "logps/chosen": -26.542402267456055, "logps/rejected": -42.164154052734375, "loss": 0.0559, "rewards/accuracies": 1.0, "rewards/chosen": 0.13592669367790222, "rewards/margins": 4.714659690856934, "rewards/rejected": -4.578732967376709, "step": 336 }, { "epoch": 3.982222222222222, "step": 336, "total_flos": 0.0, "train_loss": 0.19470643034825721, "train_runtime": 59934.0013, "train_samples_per_second": 0.72, "train_steps_per_second": 0.006 } ], "logging_steps": 1, "max_steps": 336, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 200, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }