{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9994837377387713, "eval_steps": 100, "global_step": 968, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 5.154639175257731e-09, "logits/chosen": -2.251229763031006, "logits/rejected": -2.2295913696289062, "logps/chosen": -269.52740478515625, "logps/rejected": -240.59812927246094, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "learning_rate": 5.154639175257731e-08, "logits/chosen": -2.223740339279175, "logits/rejected": -2.180643081665039, "logps/chosen": -284.7340087890625, "logps/rejected": -205.98194885253906, "loss": 0.694, "rewards/accuracies": 0.4305555522441864, "rewards/chosen": -0.0006893649115227163, "rewards/margins": 0.0007374237175099552, "rewards/rejected": -0.0014267880469560623, "step": 10 }, { "epoch": 0.02, "learning_rate": 1.0309278350515462e-07, "logits/chosen": -2.33476185798645, "logits/rejected": -2.2125375270843506, "logps/chosen": -320.8204040527344, "logps/rejected": -248.4267120361328, "loss": 0.692, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.0003039050498045981, "rewards/margins": 0.0023796656168997288, "rewards/rejected": -0.0020757606253027916, "step": 20 }, { "epoch": 0.03, "learning_rate": 1.5463917525773197e-07, "logits/chosen": -2.339370012283325, "logits/rejected": -2.304020404815674, "logps/chosen": -268.95074462890625, "logps/rejected": -227.067626953125, "loss": 0.6921, "rewards/accuracies": 0.46875, "rewards/chosen": 0.0005883350968360901, "rewards/margins": 0.002594549907371402, "rewards/rejected": -0.0020062148105353117, "step": 30 }, { "epoch": 0.04, "learning_rate": 2.0618556701030925e-07, "logits/chosen": -2.3392791748046875, "logits/rejected": -2.3300938606262207, "logps/chosen": -308.5113220214844, "logps/rejected": -253.8385467529297, "loss": 0.6945, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.0024464379530400038, "rewards/margins": -0.00025889737298712134, "rewards/rejected": 0.0027053358498960733, "step": 40 }, { "epoch": 0.05, "learning_rate": 2.5773195876288655e-07, "logits/chosen": -2.251412868499756, "logits/rejected": -2.2359275817871094, "logps/chosen": -297.78375244140625, "logps/rejected": -227.23556518554688, "loss": 0.6922, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.0033915191888809204, "rewards/margins": 0.0055986023508012295, "rewards/rejected": -0.0022070836275815964, "step": 50 }, { "epoch": 0.06, "learning_rate": 3.0927835051546394e-07, "logits/chosen": -2.167163848876953, "logits/rejected": -2.3376193046569824, "logps/chosen": -256.54510498046875, "logps/rejected": -229.5459747314453, "loss": 0.6917, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 0.000388039683457464, "rewards/margins": 0.007883811369538307, "rewards/rejected": -0.0074957734905183315, "step": 60 }, { "epoch": 0.07, "learning_rate": 3.608247422680412e-07, "logits/chosen": -2.3430614471435547, "logits/rejected": -2.281782627105713, "logps/chosen": -313.92608642578125, "logps/rejected": -252.57284545898438, "loss": 0.6924, "rewards/accuracies": 0.46875, "rewards/chosen": 0.0012417413527145982, "rewards/margins": 0.0001173208438558504, "rewards/rejected": 0.0011244199704378843, "step": 70 }, { "epoch": 0.08, "learning_rate": 4.123711340206185e-07, "logits/chosen": -2.337070941925049, "logits/rejected": -2.3018112182617188, "logps/chosen": -302.9524841308594, "logps/rejected": -243.9047088623047, "loss": 0.6916, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.0021400884725153446, "rewards/margins": -0.0002812549355439842, "rewards/rejected": 0.002421343233436346, "step": 80 }, { "epoch": 0.09, "learning_rate": 4.639175257731959e-07, "logits/chosen": -2.259251356124878, "logits/rejected": -2.2963995933532715, "logps/chosen": -270.1668395996094, "logps/rejected": -216.64822387695312, "loss": 0.6913, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.009941437281668186, "rewards/margins": 0.010241752490401268, "rewards/rejected": -0.00030031436472199857, "step": 90 }, { "epoch": 0.1, "learning_rate": 4.982778415614236e-07, "logits/chosen": -2.1677582263946533, "logits/rejected": -2.2741990089416504, "logps/chosen": -274.75836181640625, "logps/rejected": -226.3966064453125, "loss": 0.6901, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.006115993484854698, "rewards/margins": 0.0013887921813875437, "rewards/rejected": 0.0047272020019590855, "step": 100 }, { "epoch": 0.11, "learning_rate": 4.925373134328357e-07, "logits/chosen": -2.271916389465332, "logits/rejected": -2.197857141494751, "logps/chosen": -274.72113037109375, "logps/rejected": -232.5464324951172, "loss": 0.6886, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.005831545684486628, "rewards/margins": 0.0067709460854530334, "rewards/rejected": -0.000939400284551084, "step": 110 }, { "epoch": 0.12, "learning_rate": 4.867967853042479e-07, "logits/chosen": -2.2548232078552246, "logits/rejected": -2.322075366973877, "logps/chosen": -319.34521484375, "logps/rejected": -235.76535034179688, "loss": 0.689, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.013832703232765198, "rewards/margins": 0.01176449190825224, "rewards/rejected": 0.002068211790174246, "step": 120 }, { "epoch": 0.13, "learning_rate": 4.810562571756601e-07, "logits/chosen": -2.32174015045166, "logits/rejected": -2.3775150775909424, "logps/chosen": -296.20733642578125, "logps/rejected": -245.56655883789062, "loss": 0.6875, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.017552796751260757, "rewards/margins": 0.013545483350753784, "rewards/rejected": 0.004007314797490835, "step": 130 }, { "epoch": 0.14, "learning_rate": 4.753157290470723e-07, "logits/chosen": -2.3627283573150635, "logits/rejected": -2.310948133468628, "logps/chosen": -301.9321594238281, "logps/rejected": -239.2898406982422, "loss": 0.688, "rewards/accuracies": 0.46875, "rewards/chosen": 0.011156091466546059, "rewards/margins": 0.009668431244790554, "rewards/rejected": 0.0014876595232635736, "step": 140 }, { "epoch": 0.15, "learning_rate": 4.6957520091848447e-07, "logits/chosen": -2.2531113624572754, "logits/rejected": -2.348215341567993, "logps/chosen": -284.4292907714844, "logps/rejected": -259.6882019042969, "loss": 0.6858, "rewards/accuracies": 0.5625, "rewards/chosen": 0.017186133190989494, "rewards/margins": 0.011862866580486298, "rewards/rejected": 0.005323265679180622, "step": 150 }, { "epoch": 0.17, "learning_rate": 4.6383467278989666e-07, "logits/chosen": -2.361238956451416, "logits/rejected": -2.4430744647979736, "logps/chosen": -286.7644348144531, "logps/rejected": -221.6837158203125, "loss": 0.6857, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.01911218836903572, "rewards/margins": 0.014816234819591045, "rewards/rejected": 0.00429595448076725, "step": 160 }, { "epoch": 0.18, "learning_rate": 4.580941446613088e-07, "logits/chosen": -2.32244610786438, "logits/rejected": -2.3339757919311523, "logps/chosen": -301.54693603515625, "logps/rejected": -239.26095581054688, "loss": 0.6839, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.023171866312623024, "rewards/margins": 0.014685508795082569, "rewards/rejected": 0.00848635844886303, "step": 170 }, { "epoch": 0.19, "learning_rate": 4.52353616532721e-07, "logits/chosen": -2.347285032272339, "logits/rejected": -2.3244121074676514, "logps/chosen": -257.841552734375, "logps/rejected": -214.5565643310547, "loss": 0.6864, "rewards/accuracies": 0.5625, "rewards/chosen": 0.019994111731648445, "rewards/margins": 0.01520625315606594, "rewards/rejected": 0.004787858575582504, "step": 180 }, { "epoch": 0.2, "learning_rate": 4.4661308840413316e-07, "logits/chosen": -2.2657313346862793, "logits/rejected": -2.201254367828369, "logps/chosen": -253.98916625976562, "logps/rejected": -206.3340301513672, "loss": 0.6833, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.0245305635035038, "rewards/margins": 0.017677443102002144, "rewards/rejected": 0.006853120867162943, "step": 190 }, { "epoch": 0.21, "learning_rate": 4.408725602755453e-07, "logits/chosen": -2.284461498260498, "logits/rejected": -2.2873706817626953, "logps/chosen": -261.44427490234375, "logps/rejected": -195.59422302246094, "loss": 0.6835, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.03187788277864456, "rewards/margins": 0.024095263332128525, "rewards/rejected": 0.007782619446516037, "step": 200 }, { "epoch": 0.22, "learning_rate": 4.351320321469575e-07, "logits/chosen": -2.18426513671875, "logits/rejected": -2.1963071823120117, "logps/chosen": -302.31195068359375, "logps/rejected": -218.6005401611328, "loss": 0.6815, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.03718667849898338, "rewards/margins": 0.026892077177762985, "rewards/rejected": 0.010294605046510696, "step": 210 }, { "epoch": 0.23, "learning_rate": 4.2939150401836967e-07, "logits/chosen": -2.2150394916534424, "logits/rejected": -2.2160990238189697, "logps/chosen": -269.44769287109375, "logps/rejected": -235.6748504638672, "loss": 0.6801, "rewards/accuracies": 0.59375, "rewards/chosen": 0.038056183606386185, "rewards/margins": 0.023441683501005173, "rewards/rejected": 0.014614498242735863, "step": 220 }, { "epoch": 0.24, "learning_rate": 4.236509758897818e-07, "logits/chosen": -2.2152469158172607, "logits/rejected": -2.1862380504608154, "logps/chosen": -271.4049377441406, "logps/rejected": -242.6397247314453, "loss": 0.6826, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.03268683701753616, "rewards/margins": 0.026912549510598183, "rewards/rejected": 0.0057742842473089695, "step": 230 }, { "epoch": 0.25, "learning_rate": 4.17910447761194e-07, "logits/chosen": -2.3059380054473877, "logits/rejected": -2.2681984901428223, "logps/chosen": -309.55499267578125, "logps/rejected": -221.61703491210938, "loss": 0.6827, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.03509462997317314, "rewards/margins": 0.012767216190695763, "rewards/rejected": 0.02232741378247738, "step": 240 }, { "epoch": 0.26, "learning_rate": 4.121699196326062e-07, "logits/chosen": -2.307035446166992, "logits/rejected": -2.2920923233032227, "logps/chosen": -272.9412841796875, "logps/rejected": -237.314208984375, "loss": 0.6824, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.03535359352827072, "rewards/margins": 0.012216273695230484, "rewards/rejected": 0.023137323558330536, "step": 250 }, { "epoch": 0.27, "learning_rate": 4.0642939150401836e-07, "logits/chosen": -2.3456673622131348, "logits/rejected": -2.3194832801818848, "logps/chosen": -270.475341796875, "logps/rejected": -221.84536743164062, "loss": 0.6805, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.04569912329316139, "rewards/margins": 0.029975151643157005, "rewards/rejected": 0.015723969787359238, "step": 260 }, { "epoch": 0.28, "learning_rate": 4.006888633754305e-07, "logits/chosen": -2.385854721069336, "logits/rejected": -2.3556528091430664, "logps/chosen": -284.36029052734375, "logps/rejected": -232.5426788330078, "loss": 0.6793, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.04998317360877991, "rewards/margins": 0.032010577619075775, "rewards/rejected": 0.017972594127058983, "step": 270 }, { "epoch": 0.29, "learning_rate": 3.949483352468427e-07, "logits/chosen": -2.308225154876709, "logits/rejected": -2.259629726409912, "logps/chosen": -293.1715087890625, "logps/rejected": -236.4293975830078, "loss": 0.6771, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.057786036282777786, "rewards/margins": 0.04149205610156059, "rewards/rejected": 0.016293983906507492, "step": 280 }, { "epoch": 0.3, "learning_rate": 3.8920780711825487e-07, "logits/chosen": -2.278501033782959, "logits/rejected": -2.369293689727783, "logps/chosen": -278.4786376953125, "logps/rejected": -227.40927124023438, "loss": 0.6792, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0485750176012516, "rewards/margins": 0.02242155373096466, "rewards/rejected": 0.02615346387028694, "step": 290 }, { "epoch": 0.31, "learning_rate": 3.83467278989667e-07, "logits/chosen": -2.2661235332489014, "logits/rejected": -2.205644130706787, "logps/chosen": -254.183837890625, "logps/rejected": -221.9667510986328, "loss": 0.6772, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.05874975398182869, "rewards/margins": 0.03965791314840317, "rewards/rejected": 0.019091838970780373, "step": 300 }, { "epoch": 0.32, "learning_rate": 3.777267508610792e-07, "logits/chosen": -2.32353138923645, "logits/rejected": -2.3743112087249756, "logps/chosen": -306.22711181640625, "logps/rejected": -257.60980224609375, "loss": 0.6783, "rewards/accuracies": 0.59375, "rewards/chosen": 0.04823786020278931, "rewards/margins": 0.017192820087075233, "rewards/rejected": 0.03104504384100437, "step": 310 }, { "epoch": 0.33, "learning_rate": 3.7198622273249137e-07, "logits/chosen": -2.234679698944092, "logits/rejected": -2.211430788040161, "logps/chosen": -251.83053588867188, "logps/rejected": -193.01544189453125, "loss": 0.6739, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.06583289802074432, "rewards/margins": 0.047706056386232376, "rewards/rejected": 0.018126841634511948, "step": 320 }, { "epoch": 0.34, "learning_rate": 3.662456946039035e-07, "logits/chosen": -2.259127140045166, "logits/rejected": -2.287956714630127, "logps/chosen": -312.1918029785156, "logps/rejected": -239.03530883789062, "loss": 0.6761, "rewards/accuracies": 0.65625, "rewards/chosen": 0.07068151980638504, "rewards/margins": 0.051512353122234344, "rewards/rejected": 0.0191691592335701, "step": 330 }, { "epoch": 0.35, "learning_rate": 3.605051664753157e-07, "logits/chosen": -2.197277784347534, "logits/rejected": -2.13037109375, "logps/chosen": -244.2609100341797, "logps/rejected": -238.80953979492188, "loss": 0.6788, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.05732797831296921, "rewards/margins": 0.030042264610528946, "rewards/rejected": 0.027285713702440262, "step": 340 }, { "epoch": 0.36, "learning_rate": 3.547646383467279e-07, "logits/chosen": -2.365830421447754, "logits/rejected": -2.3728528022766113, "logps/chosen": -313.7022705078125, "logps/rejected": -248.090087890625, "loss": 0.6746, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.08016298711299896, "rewards/margins": 0.05509548634290695, "rewards/rejected": 0.025067497044801712, "step": 350 }, { "epoch": 0.37, "learning_rate": 3.4902411021814007e-07, "logits/chosen": -2.22756290435791, "logits/rejected": -2.259359121322632, "logps/chosen": -303.25250244140625, "logps/rejected": -249.8985595703125, "loss": 0.6723, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.06414168328046799, "rewards/margins": 0.04363773763179779, "rewards/rejected": 0.020503941923379898, "step": 360 }, { "epoch": 0.38, "learning_rate": 3.432835820895522e-07, "logits/chosen": -2.3700273036956787, "logits/rejected": -2.3231639862060547, "logps/chosen": -314.5257263183594, "logps/rejected": -270.7105712890625, "loss": 0.6759, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.07061124593019485, "rewards/margins": 0.03391130641102791, "rewards/rejected": 0.03669993579387665, "step": 370 }, { "epoch": 0.39, "learning_rate": 3.375430539609644e-07, "logits/chosen": -2.3212878704071045, "logits/rejected": -2.249602794647217, "logps/chosen": -291.92474365234375, "logps/rejected": -239.6724395751953, "loss": 0.677, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.07373902946710587, "rewards/margins": 0.03367278352379799, "rewards/rejected": 0.04006624594330788, "step": 380 }, { "epoch": 0.4, "learning_rate": 3.3180252583237657e-07, "logits/chosen": -2.297023057937622, "logits/rejected": -2.264172077178955, "logps/chosen": -278.0927734375, "logps/rejected": -237.13436889648438, "loss": 0.6722, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.0686158686876297, "rewards/margins": 0.051144860684871674, "rewards/rejected": 0.01747100241482258, "step": 390 }, { "epoch": 0.41, "learning_rate": 3.260619977037887e-07, "logits/chosen": -2.237035036087036, "logits/rejected": -2.2392399311065674, "logps/chosen": -263.4399108886719, "logps/rejected": -213.87451171875, "loss": 0.6707, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.06768475472927094, "rewards/margins": 0.048441771417856216, "rewards/rejected": 0.019242987036705017, "step": 400 }, { "epoch": 0.42, "learning_rate": 3.203214695752009e-07, "logits/chosen": -2.2776081562042236, "logits/rejected": -2.2924447059631348, "logps/chosen": -268.8953857421875, "logps/rejected": -252.852294921875, "loss": 0.6673, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.08111406862735748, "rewards/margins": 0.05318716913461685, "rewards/rejected": 0.027926897630095482, "step": 410 }, { "epoch": 0.43, "learning_rate": 3.145809414466131e-07, "logits/chosen": -2.3054046630859375, "logits/rejected": -2.2502362728118896, "logps/chosen": -252.5205841064453, "logps/rejected": -204.43344116210938, "loss": 0.6749, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.07272285223007202, "rewards/margins": 0.04809904843568802, "rewards/rejected": 0.024623800069093704, "step": 420 }, { "epoch": 0.44, "learning_rate": 3.0884041331802526e-07, "logits/chosen": -2.3482632637023926, "logits/rejected": -2.3258707523345947, "logps/chosen": -263.67095947265625, "logps/rejected": -241.14047241210938, "loss": 0.6741, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.07022975385189056, "rewards/margins": 0.04051927849650383, "rewards/rejected": 0.029710477218031883, "step": 430 }, { "epoch": 0.45, "learning_rate": 3.030998851894374e-07, "logits/chosen": -2.286533832550049, "logits/rejected": -2.320568084716797, "logps/chosen": -286.72894287109375, "logps/rejected": -247.65542602539062, "loss": 0.6705, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.07666246592998505, "rewards/margins": 0.05972421169281006, "rewards/rejected": 0.01693824864923954, "step": 440 }, { "epoch": 0.46, "learning_rate": 2.973593570608496e-07, "logits/chosen": -2.206477642059326, "logits/rejected": -2.315464496612549, "logps/chosen": -276.1682434082031, "logps/rejected": -230.3959197998047, "loss": 0.678, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.0718303695321083, "rewards/margins": 0.04074189439415932, "rewards/rejected": 0.03108847141265869, "step": 450 }, { "epoch": 0.47, "learning_rate": 2.9161882893226177e-07, "logits/chosen": -2.277815103530884, "logits/rejected": -2.342268705368042, "logps/chosen": -273.23773193359375, "logps/rejected": -222.5966796875, "loss": 0.6662, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.0802597850561142, "rewards/margins": 0.050464123487472534, "rewards/rejected": 0.029795657843351364, "step": 460 }, { "epoch": 0.49, "learning_rate": 2.858783008036739e-07, "logits/chosen": -2.2656216621398926, "logits/rejected": -2.2778594493865967, "logps/chosen": -248.9929656982422, "logps/rejected": -215.5894012451172, "loss": 0.6669, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.08564073592424393, "rewards/margins": 0.06490761041641235, "rewards/rejected": 0.020733121782541275, "step": 470 }, { "epoch": 0.5, "learning_rate": 2.801377726750861e-07, "logits/chosen": -2.2962255477905273, "logits/rejected": -2.27239727973938, "logps/chosen": -289.5277404785156, "logps/rejected": -231.601318359375, "loss": 0.6713, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.08144901692867279, "rewards/margins": 0.05658548325300217, "rewards/rejected": 0.024863524362444878, "step": 480 }, { "epoch": 0.51, "learning_rate": 2.743972445464983e-07, "logits/chosen": -2.445746660232544, "logits/rejected": -2.267007827758789, "logps/chosen": -293.1885986328125, "logps/rejected": -243.8875274658203, "loss": 0.6676, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.10828351974487305, "rewards/margins": 0.08175922185182571, "rewards/rejected": 0.02652430161833763, "step": 490 }, { "epoch": 0.52, "learning_rate": 2.686567164179104e-07, "logits/chosen": -2.278276205062866, "logits/rejected": -2.295633316040039, "logps/chosen": -254.94760131835938, "logps/rejected": -221.79452514648438, "loss": 0.6672, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.08227074891328812, "rewards/margins": 0.055896710604429245, "rewards/rejected": 0.026374032720923424, "step": 500 }, { "epoch": 0.53, "learning_rate": 2.629161882893226e-07, "logits/chosen": -2.202611207962036, "logits/rejected": -2.2495861053466797, "logps/chosen": -310.4443664550781, "logps/rejected": -256.72406005859375, "loss": 0.6666, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.07021793723106384, "rewards/margins": 0.040728576481342316, "rewards/rejected": 0.02948935702443123, "step": 510 }, { "epoch": 0.54, "learning_rate": 2.571756601607348e-07, "logits/chosen": -2.3376307487487793, "logits/rejected": -2.352074146270752, "logps/chosen": -278.10504150390625, "logps/rejected": -244.0722198486328, "loss": 0.6697, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0925985723733902, "rewards/margins": 0.0637633204460144, "rewards/rejected": 0.028835251927375793, "step": 520 }, { "epoch": 0.55, "learning_rate": 2.5143513203214697e-07, "logits/chosen": -2.243332624435425, "logits/rejected": -2.2513413429260254, "logps/chosen": -242.59439086914062, "logps/rejected": -224.13259887695312, "loss": 0.6716, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.07866770029067993, "rewards/margins": 0.057711243629455566, "rewards/rejected": 0.020956454798579216, "step": 530 }, { "epoch": 0.56, "learning_rate": 2.456946039035591e-07, "logits/chosen": -2.300567150115967, "logits/rejected": -2.271827220916748, "logps/chosen": -288.2174377441406, "logps/rejected": -240.34439086914062, "loss": 0.6682, "rewards/accuracies": 0.65625, "rewards/chosen": 0.10411250591278076, "rewards/margins": 0.05851038545370102, "rewards/rejected": 0.04560210928320885, "step": 540 }, { "epoch": 0.57, "learning_rate": 2.399540757749713e-07, "logits/chosen": -2.3359756469726562, "logits/rejected": -2.194058895111084, "logps/chosen": -265.052001953125, "logps/rejected": -230.23605346679688, "loss": 0.6686, "rewards/accuracies": 0.65625, "rewards/chosen": 0.0775262787938118, "rewards/margins": 0.05575944110751152, "rewards/rejected": 0.021766824647784233, "step": 550 }, { "epoch": 0.58, "learning_rate": 2.3421354764638345e-07, "logits/chosen": -2.3195242881774902, "logits/rejected": -2.283975124359131, "logps/chosen": -302.0104064941406, "logps/rejected": -252.0124053955078, "loss": 0.6708, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.10010389983654022, "rewards/margins": 0.053703296929597855, "rewards/rejected": 0.04640059918165207, "step": 560 }, { "epoch": 0.59, "learning_rate": 2.2847301951779563e-07, "logits/chosen": -2.2481091022491455, "logits/rejected": -2.400871515274048, "logps/chosen": -268.6519775390625, "logps/rejected": -223.69882202148438, "loss": 0.6654, "rewards/accuracies": 0.65625, "rewards/chosen": 0.0826568529009819, "rewards/margins": 0.05431235954165459, "rewards/rejected": 0.028344491496682167, "step": 570 }, { "epoch": 0.6, "learning_rate": 2.227324913892078e-07, "logits/chosen": -2.299408197402954, "logits/rejected": -2.22338604927063, "logps/chosen": -299.3912353515625, "logps/rejected": -236.9815216064453, "loss": 0.661, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.10458721220493317, "rewards/margins": 0.08465038239955902, "rewards/rejected": 0.019936833530664444, "step": 580 }, { "epoch": 0.61, "learning_rate": 2.1699196326061998e-07, "logits/chosen": -2.2584633827209473, "logits/rejected": -2.2311649322509766, "logps/chosen": -253.76913452148438, "logps/rejected": -218.6166534423828, "loss": 0.6687, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.07234074175357819, "rewards/margins": 0.04758009687066078, "rewards/rejected": 0.024760644882917404, "step": 590 }, { "epoch": 0.62, "learning_rate": 2.1125143513203214e-07, "logits/chosen": -2.318943738937378, "logits/rejected": -2.2511682510375977, "logps/chosen": -256.5652770996094, "logps/rejected": -206.35586547851562, "loss": 0.669, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.07542125880718231, "rewards/margins": 0.0553053617477417, "rewards/rejected": 0.020115893334150314, "step": 600 }, { "epoch": 0.63, "learning_rate": 2.055109070034443e-07, "logits/chosen": -2.3058714866638184, "logits/rejected": -2.304198741912842, "logps/chosen": -266.4674987792969, "logps/rejected": -223.82711791992188, "loss": 0.6677, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.09824246913194656, "rewards/margins": 0.06738617271184921, "rewards/rejected": 0.03085630014538765, "step": 610 }, { "epoch": 0.64, "learning_rate": 1.997703788748565e-07, "logits/chosen": -2.337787389755249, "logits/rejected": -2.2819180488586426, "logps/chosen": -313.7826232910156, "logps/rejected": -249.5704803466797, "loss": 0.6582, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.10966908931732178, "rewards/margins": 0.08016980439424515, "rewards/rejected": 0.029499292373657227, "step": 620 }, { "epoch": 0.65, "learning_rate": 1.9402985074626865e-07, "logits/chosen": -2.2067112922668457, "logits/rejected": -2.246953010559082, "logps/chosen": -259.2144775390625, "logps/rejected": -240.3810272216797, "loss": 0.6653, "rewards/accuracies": 0.6875, "rewards/chosen": 0.09941162168979645, "rewards/margins": 0.06417630612850189, "rewards/rejected": 0.035235337913036346, "step": 630 }, { "epoch": 0.66, "learning_rate": 1.8828932261768083e-07, "logits/chosen": -2.2894420623779297, "logits/rejected": -2.2385382652282715, "logps/chosen": -266.48992919921875, "logps/rejected": -217.8952178955078, "loss": 0.661, "rewards/accuracies": 0.65625, "rewards/chosen": 0.095299132168293, "rewards/margins": 0.07987986505031586, "rewards/rejected": 0.01541926246136427, "step": 640 }, { "epoch": 0.67, "learning_rate": 1.82548794489093e-07, "logits/chosen": -2.33485746383667, "logits/rejected": -2.3108019828796387, "logps/chosen": -284.7020568847656, "logps/rejected": -232.82080078125, "loss": 0.664, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.10341651737689972, "rewards/margins": 0.07464977353811264, "rewards/rejected": 0.028766745701432228, "step": 650 }, { "epoch": 0.68, "learning_rate": 1.7680826636050515e-07, "logits/chosen": -2.3347816467285156, "logits/rejected": -2.2758853435516357, "logps/chosen": -279.80059814453125, "logps/rejected": -233.2425994873047, "loss": 0.6608, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.11068934202194214, "rewards/margins": 0.07695071399211884, "rewards/rejected": 0.0337386280298233, "step": 660 }, { "epoch": 0.69, "learning_rate": 1.7106773823191734e-07, "logits/chosen": -2.2854952812194824, "logits/rejected": -2.273536205291748, "logps/chosen": -295.6964416503906, "logps/rejected": -240.4071502685547, "loss": 0.6615, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.1013779416680336, "rewards/margins": 0.060683172196149826, "rewards/rejected": 0.04069476202130318, "step": 670 }, { "epoch": 0.7, "learning_rate": 1.653272101033295e-07, "logits/chosen": -2.34243106842041, "logits/rejected": -2.2720611095428467, "logps/chosen": -289.71722412109375, "logps/rejected": -230.321533203125, "loss": 0.6729, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.09767869859933853, "rewards/margins": 0.039280109107494354, "rewards/rejected": 0.05839858204126358, "step": 680 }, { "epoch": 0.71, "learning_rate": 1.5958668197474169e-07, "logits/chosen": -2.371598482131958, "logits/rejected": -2.362656354904175, "logps/chosen": -268.17828369140625, "logps/rejected": -229.41232299804688, "loss": 0.6659, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0969640463590622, "rewards/margins": 0.06369610875844955, "rewards/rejected": 0.033267926424741745, "step": 690 }, { "epoch": 0.72, "learning_rate": 1.5384615384615385e-07, "logits/chosen": -2.2588796615600586, "logits/rejected": -2.2576823234558105, "logps/chosen": -282.4342041015625, "logps/rejected": -222.56381225585938, "loss": 0.664, "rewards/accuracies": 0.65625, "rewards/chosen": 0.10399500280618668, "rewards/margins": 0.08138440549373627, "rewards/rejected": 0.0226106159389019, "step": 700 }, { "epoch": 0.73, "learning_rate": 1.4810562571756603e-07, "logits/chosen": -2.3341283798217773, "logits/rejected": -2.2046780586242676, "logps/chosen": -272.2647399902344, "logps/rejected": -208.01364135742188, "loss": 0.666, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.10669133812189102, "rewards/margins": 0.08235933631658554, "rewards/rejected": 0.02433200553059578, "step": 710 }, { "epoch": 0.74, "learning_rate": 1.423650975889782e-07, "logits/chosen": -2.323979139328003, "logits/rejected": -2.340238094329834, "logps/chosen": -303.2074279785156, "logps/rejected": -259.44268798828125, "loss": 0.6667, "rewards/accuracies": 0.625, "rewards/chosen": 0.11533965170383453, "rewards/margins": 0.047552816569805145, "rewards/rejected": 0.06778682768344879, "step": 720 }, { "epoch": 0.75, "learning_rate": 1.3662456946039035e-07, "logits/chosen": -2.3031513690948486, "logits/rejected": -2.28584623336792, "logps/chosen": -270.1670837402344, "logps/rejected": -252.5519256591797, "loss": 0.6642, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.10461707413196564, "rewards/margins": 0.058367032557725906, "rewards/rejected": 0.04625004902482033, "step": 730 }, { "epoch": 0.76, "learning_rate": 1.3088404133180254e-07, "logits/chosen": -2.2157022953033447, "logits/rejected": -2.2670745849609375, "logps/chosen": -276.71240234375, "logps/rejected": -199.2496795654297, "loss": 0.6635, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.11176248639822006, "rewards/margins": 0.08353973925113678, "rewards/rejected": 0.02822275087237358, "step": 740 }, { "epoch": 0.77, "learning_rate": 1.251435132032147e-07, "logits/chosen": -2.2043914794921875, "logits/rejected": -2.221619129180908, "logps/chosen": -269.0702819824219, "logps/rejected": -220.8921356201172, "loss": 0.665, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.09922349452972412, "rewards/margins": 0.04318443313241005, "rewards/rejected": 0.05603905767202377, "step": 750 }, { "epoch": 0.78, "learning_rate": 1.1940298507462686e-07, "logits/chosen": -2.232959270477295, "logits/rejected": -2.2529525756835938, "logps/chosen": -267.9338684082031, "logps/rejected": -249.4876251220703, "loss": 0.6684, "rewards/accuracies": 0.625, "rewards/chosen": 0.08004304021596909, "rewards/margins": 0.04949140548706055, "rewards/rejected": 0.030551627278327942, "step": 760 }, { "epoch": 0.8, "learning_rate": 1.1366245694603903e-07, "logits/chosen": -2.293257236480713, "logits/rejected": -2.2078585624694824, "logps/chosen": -273.19671630859375, "logps/rejected": -238.57858276367188, "loss": 0.661, "rewards/accuracies": 0.625, "rewards/chosen": 0.11353409290313721, "rewards/margins": 0.06645722687244415, "rewards/rejected": 0.04707685858011246, "step": 770 }, { "epoch": 0.81, "learning_rate": 1.079219288174512e-07, "logits/chosen": -2.3507869243621826, "logits/rejected": -2.325718879699707, "logps/chosen": -290.9693298339844, "logps/rejected": -236.1486358642578, "loss": 0.6633, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0980958342552185, "rewards/margins": 0.07181811332702637, "rewards/rejected": 0.026277724653482437, "step": 780 }, { "epoch": 0.82, "learning_rate": 1.0218140068886336e-07, "logits/chosen": -2.268038272857666, "logits/rejected": -2.286581516265869, "logps/chosen": -270.3387451171875, "logps/rejected": -221.06356811523438, "loss": 0.6564, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.12088136374950409, "rewards/margins": 0.080001600086689, "rewards/rejected": 0.040879763662815094, "step": 790 }, { "epoch": 0.83, "learning_rate": 9.644087256027554e-08, "logits/chosen": -2.272735118865967, "logits/rejected": -2.2941083908081055, "logps/chosen": -284.6488952636719, "logps/rejected": -243.56796264648438, "loss": 0.6639, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.1113913282752037, "rewards/margins": 0.05327050760388374, "rewards/rejected": 0.05812082439661026, "step": 800 }, { "epoch": 0.84, "learning_rate": 9.070034443168771e-08, "logits/chosen": -2.2838375568389893, "logits/rejected": -2.289247751235962, "logps/chosen": -269.5845642089844, "logps/rejected": -230.6207275390625, "loss": 0.6617, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.09149408340454102, "rewards/margins": 0.06341233849525452, "rewards/rejected": 0.02808173932135105, "step": 810 }, { "epoch": 0.85, "learning_rate": 8.495981630309988e-08, "logits/chosen": -2.365980863571167, "logits/rejected": -2.3436598777770996, "logps/chosen": -302.0718688964844, "logps/rejected": -228.1407470703125, "loss": 0.6623, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.13062262535095215, "rewards/margins": 0.08858474344015121, "rewards/rejected": 0.04203786700963974, "step": 820 }, { "epoch": 0.86, "learning_rate": 7.921928817451206e-08, "logits/chosen": -2.342413902282715, "logits/rejected": -2.2254080772399902, "logps/chosen": -287.4922180175781, "logps/rejected": -222.5606231689453, "loss": 0.6565, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.12904855608940125, "rewards/margins": 0.08615640550851822, "rewards/rejected": 0.04289213940501213, "step": 830 }, { "epoch": 0.87, "learning_rate": 7.347876004592423e-08, "logits/chosen": -2.259397029876709, "logits/rejected": -2.227036476135254, "logps/chosen": -258.3423767089844, "logps/rejected": -216.99606323242188, "loss": 0.6714, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.10358164459466934, "rewards/margins": 0.06773830950260162, "rewards/rejected": 0.03584333881735802, "step": 840 }, { "epoch": 0.88, "learning_rate": 6.773823191733639e-08, "logits/chosen": -2.2834537029266357, "logits/rejected": -2.3872971534729004, "logps/chosen": -262.05084228515625, "logps/rejected": -231.11306762695312, "loss": 0.6647, "rewards/accuracies": 0.625, "rewards/chosen": 0.09495140612125397, "rewards/margins": 0.055265575647354126, "rewards/rejected": 0.03968583419919014, "step": 850 }, { "epoch": 0.89, "learning_rate": 6.199770378874856e-08, "logits/chosen": -2.4065003395080566, "logits/rejected": -2.3337345123291016, "logps/chosen": -295.71478271484375, "logps/rejected": -270.1822814941406, "loss": 0.6693, "rewards/accuracies": 0.65625, "rewards/chosen": 0.11348612606525421, "rewards/margins": 0.07466179132461548, "rewards/rejected": 0.03882431983947754, "step": 860 }, { "epoch": 0.9, "learning_rate": 5.6257175660160735e-08, "logits/chosen": -2.2463555335998535, "logits/rejected": -2.2443947792053223, "logps/chosen": -312.9588317871094, "logps/rejected": -237.4109344482422, "loss": 0.6644, "rewards/accuracies": 0.59375, "rewards/chosen": 0.10128283500671387, "rewards/margins": 0.053178369998931885, "rewards/rejected": 0.04810447618365288, "step": 870 }, { "epoch": 0.91, "learning_rate": 5.05166475315729e-08, "logits/chosen": -2.358501434326172, "logits/rejected": -2.313483715057373, "logps/chosen": -291.43377685546875, "logps/rejected": -240.09054565429688, "loss": 0.6632, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.10742716491222382, "rewards/margins": 0.07204015552997589, "rewards/rejected": 0.03538701683282852, "step": 880 }, { "epoch": 0.92, "learning_rate": 4.477611940298507e-08, "logits/chosen": -2.313149929046631, "logits/rejected": -2.3558261394500732, "logps/chosen": -285.90643310546875, "logps/rejected": -235.43051147460938, "loss": 0.6666, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.12259715795516968, "rewards/margins": 0.09698096662759781, "rewards/rejected": 0.02561618760228157, "step": 890 }, { "epoch": 0.93, "learning_rate": 3.903559127439724e-08, "logits/chosen": -2.3278651237487793, "logits/rejected": -2.195068836212158, "logps/chosen": -272.7381896972656, "logps/rejected": -211.40640258789062, "loss": 0.658, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.1207551583647728, "rewards/margins": 0.09316142648458481, "rewards/rejected": 0.027593741193413734, "step": 900 }, { "epoch": 0.94, "learning_rate": 3.3295063145809414e-08, "logits/chosen": -2.290696859359741, "logits/rejected": -2.3440823554992676, "logps/chosen": -238.2651824951172, "logps/rejected": -206.77969360351562, "loss": 0.6616, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.09928463399410248, "rewards/margins": 0.07226204872131348, "rewards/rejected": 0.027022594586014748, "step": 910 }, { "epoch": 0.95, "learning_rate": 2.755453501722158e-08, "logits/chosen": -2.375807762145996, "logits/rejected": -2.367743730545044, "logps/chosen": -281.56195068359375, "logps/rejected": -225.125244140625, "loss": 0.662, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.1072310209274292, "rewards/margins": 0.056608647108078, "rewards/rejected": 0.050622373819351196, "step": 920 }, { "epoch": 0.96, "learning_rate": 2.1814006888633754e-08, "logits/chosen": -2.281919002532959, "logits/rejected": -2.254122734069824, "logps/chosen": -256.39105224609375, "logps/rejected": -203.3081817626953, "loss": 0.6617, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.11211923509836197, "rewards/margins": 0.07925260812044144, "rewards/rejected": 0.03286661207675934, "step": 930 }, { "epoch": 0.97, "learning_rate": 1.6073478760045924e-08, "logits/chosen": -2.316282272338867, "logits/rejected": -2.3123340606689453, "logps/chosen": -271.6207580566406, "logps/rejected": -231.7317352294922, "loss": 0.6626, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.10637687146663666, "rewards/margins": 0.06768520176410675, "rewards/rejected": 0.0386916846036911, "step": 940 }, { "epoch": 0.98, "learning_rate": 1.0332950631458094e-08, "logits/chosen": -2.3146958351135254, "logits/rejected": -2.2793381214141846, "logps/chosen": -282.83270263671875, "logps/rejected": -233.0804443359375, "loss": 0.6612, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.11455857753753662, "rewards/margins": 0.0838586837053299, "rewards/rejected": 0.030699897557497025, "step": 950 }, { "epoch": 0.99, "learning_rate": 4.592422502870264e-09, "logits/chosen": -2.251638889312744, "logits/rejected": -2.234907627105713, "logps/chosen": -281.0075378417969, "logps/rejected": -239.98049926757812, "loss": 0.661, "rewards/accuracies": 0.65625, "rewards/chosen": 0.1062885969877243, "rewards/margins": 0.06708581745624542, "rewards/rejected": 0.03920278698205948, "step": 960 }, { "epoch": 1.0, "eval_logits/chosen": -2.4597132205963135, "eval_logits/rejected": -2.398695468902588, "eval_logps/chosen": -278.69171142578125, "eval_logps/rejected": -230.4560089111328, "eval_loss": 0.6642152070999146, "eval_rewards/accuracies": 0.6480000019073486, "eval_rewards/chosen": 0.10415761172771454, "eval_rewards/margins": 0.06405296921730042, "eval_rewards/rejected": 0.04010463133454323, "eval_runtime": 443.9432, "eval_samples_per_second": 4.505, "eval_steps_per_second": 0.282, "step": 968 }, { "epoch": 1.0, "step": 968, "total_flos": 0.0, "train_loss": 0.6728762634529555, "train_runtime": 27528.1814, "train_samples_per_second": 2.251, "train_steps_per_second": 0.035 } ], "logging_steps": 10, "max_steps": 968, "num_train_epochs": 1, "save_steps": 500, "total_flos": 0.0, "trial_name": null, "trial_params": null }