{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 478, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "eta": 0.004999999888241291, "learning_rate": 1.0416666666666666e-08, "logits/chosen": -2.745321273803711, "logits/rejected": -2.661250352859497, "logps/chosen": -321.0613098144531, "logps/rejected": -271.1681823730469, "loss": 0.7079, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02, "eta": 0.004999999422580004, "learning_rate": 1.0416666666666667e-07, "logits/chosen": -2.781073570251465, "logits/rejected": -2.7723324298858643, "logps/chosen": -242.78675842285156, "logps/rejected": -208.898193359375, "loss": 0.7064, "rewards/accuracies": 0.5069444179534912, "rewards/chosen": 0.00046142542851157486, "rewards/margins": 0.0004825991054531187, "rewards/rejected": -2.1173778804950416e-05, "step": 10 }, { "epoch": 0.04, "eta": 0.004999999422580004, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -2.7764596939086914, "logits/rejected": -2.76332426071167, "logps/chosen": -268.7160339355469, "logps/rejected": -246.1806182861328, "loss": 0.7059, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.0012417829129844904, "rewards/margins": 0.0018564596539363265, "rewards/rejected": -0.0006146768573671579, "step": 20 }, { "epoch": 0.06, "eta": 0.004999999422580004, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -2.829183578491211, "logits/rejected": -2.789580821990967, "logps/chosen": -279.50506591796875, "logps/rejected": -272.59332275390625, "loss": 0.703, "rewards/accuracies": 0.65625, "rewards/chosen": 0.001455739140510559, "rewards/margins": 0.005600649863481522, "rewards/rejected": -0.0041449107229709625, "step": 30 }, { "epoch": 0.08, "eta": 0.004999999422580004, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -2.7342190742492676, "logits/rejected": -2.7025771141052246, "logps/chosen": -287.4779968261719, "logps/rejected": -262.9581298828125, "loss": 0.6913, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.00810016505420208, "rewards/margins": 0.03219769150018692, "rewards/rejected": -0.02409752830862999, "step": 40 }, { "epoch": 0.1, "eta": 0.004999999422580004, "learning_rate": 4.999733114418725e-07, "logits/chosen": -2.70398211479187, "logits/rejected": -2.678131580352783, "logps/chosen": -306.2682189941406, "logps/rejected": -304.858642578125, "loss": 0.6757, "rewards/accuracies": 0.6875, "rewards/chosen": -0.010384158231317997, "rewards/margins": 0.06137201189994812, "rewards/rejected": -0.07175617665052414, "step": 50 }, { "epoch": 0.13, "eta": 0.004999999422580004, "learning_rate": 4.990398100856366e-07, "logits/chosen": -2.6752824783325195, "logits/rejected": -2.6444764137268066, "logps/chosen": -260.00921630859375, "logps/rejected": -241.9503173828125, "loss": 0.66, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.07034246623516083, "rewards/margins": 0.10934233665466309, "rewards/rejected": -0.17968478798866272, "step": 60 }, { "epoch": 0.15, "eta": 0.004999999422580004, "learning_rate": 4.967775735898179e-07, "logits/chosen": -2.7276742458343506, "logits/rejected": -2.7039637565612793, "logps/chosen": -294.48309326171875, "logps/rejected": -287.7945251464844, "loss": 0.6352, "rewards/accuracies": 0.65625, "rewards/chosen": -0.21306617558002472, "rewards/margins": 0.20508523285388947, "rewards/rejected": -0.4181514382362366, "step": 70 }, { "epoch": 0.17, "eta": 0.004999999422580004, "learning_rate": 4.931986719649298e-07, "logits/chosen": -2.7324066162109375, "logits/rejected": -2.7214457988739014, "logps/chosen": -303.2037048339844, "logps/rejected": -328.5882873535156, "loss": 0.6282, "rewards/accuracies": 0.71875, "rewards/chosen": -0.38951486349105835, "rewards/margins": 0.1954750418663025, "rewards/rejected": -0.5849899053573608, "step": 80 }, { "epoch": 0.19, "eta": 0.004999999422580004, "learning_rate": 4.883222001996351e-07, "logits/chosen": -2.732830286026001, "logits/rejected": -2.6980254650115967, "logps/chosen": -297.2022399902344, "logps/rejected": -315.86553955078125, "loss": 0.5956, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.3561047911643982, "rewards/margins": 0.31715089082717896, "rewards/rejected": -0.6732556819915771, "step": 90 }, { "epoch": 0.21, "eta": 0.004999999422580004, "learning_rate": 4.821741763807186e-07, "logits/chosen": -2.7799582481384277, "logits/rejected": -2.734489679336548, "logps/chosen": -322.6822204589844, "logps/rejected": -346.29632568359375, "loss": 0.5808, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.4515351355075836, "rewards/margins": 0.430880069732666, "rewards/rejected": -0.8824151158332825, "step": 100 }, { "epoch": 0.23, "eta": 0.004999999422580004, "learning_rate": 4.747874028753375e-07, "logits/chosen": -2.737361431121826, "logits/rejected": -2.7130186557769775, "logps/chosen": -354.4649353027344, "logps/rejected": -358.72894287109375, "loss": 0.5836, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.42983478307724, "rewards/margins": 0.48764386773109436, "rewards/rejected": -0.9174786806106567, "step": 110 }, { "epoch": 0.25, "eta": 0.004999999422580004, "learning_rate": 4.662012913161997e-07, "logits/chosen": -2.5539088249206543, "logits/rejected": -2.5288946628570557, "logps/chosen": -332.5165710449219, "logps/rejected": -339.154296875, "loss": 0.5704, "rewards/accuracies": 0.75, "rewards/chosen": -0.5978292226791382, "rewards/margins": 0.5392987728118896, "rewards/rejected": -1.1371279954910278, "step": 120 }, { "epoch": 0.27, "eta": 0.004999999422580004, "learning_rate": 4.5646165232345103e-07, "logits/chosen": -2.4799418449401855, "logits/rejected": -2.463563919067383, "logps/chosen": -309.03570556640625, "logps/rejected": -393.3725280761719, "loss": 0.5759, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.5238430500030518, "rewards/margins": 0.5551779866218567, "rewards/rejected": -1.0790210962295532, "step": 130 }, { "epoch": 0.29, "eta": 0.004999999422580004, "learning_rate": 4.456204510851956e-07, "logits/chosen": -2.4601731300354004, "logits/rejected": -2.4274346828460693, "logps/chosen": -369.95086669921875, "logps/rejected": -402.4490661621094, "loss": 0.5515, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.8267312049865723, "rewards/margins": 0.6819665431976318, "rewards/rejected": -1.508697748184204, "step": 140 }, { "epoch": 0.31, "eta": 0.004999999422580004, "learning_rate": 4.337355301007335e-07, "logits/chosen": -2.4591126441955566, "logits/rejected": -2.4192230701446533, "logps/chosen": -338.8128356933594, "logps/rejected": -341.96185302734375, "loss": 0.5644, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.55790776014328, "rewards/margins": 0.5498847961425781, "rewards/rejected": -1.107792615890503, "step": 150 }, { "epoch": 0.33, "eta": 0.004999999422580004, "learning_rate": 4.2087030056579986e-07, "logits/chosen": -2.324352979660034, "logits/rejected": -2.297412872314453, "logps/chosen": -333.8399963378906, "logps/rejected": -388.5691833496094, "loss": 0.5386, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.5367451906204224, "rewards/margins": 0.6883090138435364, "rewards/rejected": -1.225054144859314, "step": 160 }, { "epoch": 0.36, "eta": 0.004999999422580004, "learning_rate": 4.070934040463998e-07, "logits/chosen": -2.25808048248291, "logits/rejected": -2.191967010498047, "logps/chosen": -348.13031005859375, "logps/rejected": -380.72906494140625, "loss": 0.5827, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.0580767393112183, "rewards/margins": 0.4786924421787262, "rewards/rejected": -1.536769151687622, "step": 170 }, { "epoch": 0.38, "eta": 0.004999999422580004, "learning_rate": 3.9247834624635404e-07, "logits/chosen": -2.3127024173736572, "logits/rejected": -2.320852279663086, "logps/chosen": -367.42950439453125, "logps/rejected": -426.7998046875, "loss": 0.5569, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7815980315208435, "rewards/margins": 0.7305761575698853, "rewards/rejected": -1.512174129486084, "step": 180 }, { "epoch": 0.4, "eta": 0.004999999422580004, "learning_rate": 3.7710310482256523e-07, "logits/chosen": -2.4002134799957275, "logits/rejected": -2.3388237953186035, "logps/chosen": -328.8986511230469, "logps/rejected": -381.8426208496094, "loss": 0.5436, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5217610597610474, "rewards/margins": 0.6428496241569519, "rewards/rejected": -1.1646106243133545, "step": 190 }, { "epoch": 0.42, "eta": 0.004999999422580004, "learning_rate": 3.610497133404795e-07, "logits/chosen": -2.2329065799713135, "logits/rejected": -2.1769356727600098, "logps/chosen": -364.03607177734375, "logps/rejected": -379.78289794921875, "loss": 0.5399, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.8814976811408997, "rewards/margins": 0.5185674428939819, "rewards/rejected": -1.4000650644302368, "step": 200 }, { "epoch": 0.44, "eta": 0.004999999422580004, "learning_rate": 3.4440382358952115e-07, "logits/chosen": -2.2697091102600098, "logits/rejected": -2.2113330364227295, "logps/chosen": -344.99200439453125, "logps/rejected": -368.27581787109375, "loss": 0.554, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7441986799240112, "rewards/margins": 0.5876610279083252, "rewards/rejected": -1.331859827041626, "step": 210 }, { "epoch": 0.46, "eta": 0.004999999422580004, "learning_rate": 3.272542485937368e-07, "logits/chosen": -2.227048873901367, "logits/rejected": -2.2157351970672607, "logps/chosen": -324.1903991699219, "logps/rejected": -370.2602844238281, "loss": 0.5538, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.5380795001983643, "rewards/margins": 0.7564869523048401, "rewards/rejected": -1.2945663928985596, "step": 220 }, { "epoch": 0.48, "eta": 0.004999999422580004, "learning_rate": 3.096924887558854e-07, "logits/chosen": -2.1815030574798584, "logits/rejected": -2.1446516513824463, "logps/chosen": -329.86566162109375, "logps/rejected": -409.99267578125, "loss": 0.5485, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6506673097610474, "rewards/margins": 0.7174521684646606, "rewards/rejected": -1.3681195974349976, "step": 230 }, { "epoch": 0.5, "eta": 0.004999999422580004, "learning_rate": 2.9181224366319943e-07, "logits/chosen": -2.1926796436309814, "logits/rejected": -2.133800745010376, "logps/chosen": -328.4383850097656, "logps/rejected": -380.1564025878906, "loss": 0.5478, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8131483793258667, "rewards/margins": 0.6653987169265747, "rewards/rejected": -1.4785473346710205, "step": 240 }, { "epoch": 0.52, "eta": 0.004999999422580004, "learning_rate": 2.7370891215954565e-07, "logits/chosen": -2.198378324508667, "logits/rejected": -2.148686647415161, "logps/chosen": -343.18426513671875, "logps/rejected": -405.7811584472656, "loss": 0.5379, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8210641741752625, "rewards/margins": 0.6968287229537964, "rewards/rejected": -1.5178929567337036, "step": 250 }, { "epoch": 0.54, "eta": 0.004999999422580004, "learning_rate": 2.55479083351317e-07, "logits/chosen": -2.1465113162994385, "logits/rejected": -2.093456745147705, "logps/chosen": -347.0015869140625, "logps/rejected": -396.0328063964844, "loss": 0.5184, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.7407617568969727, "rewards/margins": 0.8684328198432922, "rewards/rejected": -1.6091945171356201, "step": 260 }, { "epoch": 0.56, "eta": 0.004999999422580004, "learning_rate": 2.3722002126275822e-07, "logits/chosen": -2.1429693698883057, "logits/rejected": -2.0827112197875977, "logps/chosen": -358.7803649902344, "logps/rejected": -440.46990966796875, "loss": 0.527, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9018821716308594, "rewards/margins": 0.9309617877006531, "rewards/rejected": -1.8328437805175781, "step": 270 }, { "epoch": 0.59, "eta": 0.004999999422580004, "learning_rate": 2.19029145890313e-07, "logits/chosen": -2.215878486633301, "logits/rejected": -2.1548526287078857, "logps/chosen": -337.1029357910156, "logps/rejected": -380.2957763671875, "loss": 0.5418, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.6890209913253784, "rewards/margins": 0.7738422155380249, "rewards/rejected": -1.4628633260726929, "step": 280 }, { "epoch": 0.61, "eta": 0.004999999422580004, "learning_rate": 2.0100351342479216e-07, "logits/chosen": -2.1938037872314453, "logits/rejected": -2.177739381790161, "logps/chosen": -353.51544189453125, "logps/rejected": -412.64306640625, "loss": 0.5398, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.726887583732605, "rewards/margins": 0.7154626250267029, "rewards/rejected": -1.442350149154663, "step": 290 }, { "epoch": 0.63, "eta": 0.004999999422580004, "learning_rate": 1.8323929841460178e-07, "logits/chosen": -2.1435744762420654, "logits/rejected": -2.082127332687378, "logps/chosen": -368.35906982421875, "logps/rejected": -436.9691467285156, "loss": 0.5324, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.8049615025520325, "rewards/margins": 0.6589316725730896, "rewards/rejected": -1.4638930559158325, "step": 300 }, { "epoch": 0.65, "eta": 0.004999999422580004, "learning_rate": 1.6583128063291573e-07, "logits/chosen": -2.188692092895508, "logits/rejected": -2.1064612865448, "logps/chosen": -354.7235412597656, "logps/rejected": -376.1325378417969, "loss": 0.5187, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.8574191331863403, "rewards/margins": 0.742445707321167, "rewards/rejected": -1.5998647212982178, "step": 310 }, { "epoch": 0.67, "eta": 0.004999999422580004, "learning_rate": 1.488723393865766e-07, "logits/chosen": -2.113239288330078, "logits/rejected": -2.0523862838745117, "logps/chosen": -317.50787353515625, "logps/rejected": -413.80426025390625, "loss": 0.522, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.7432131171226501, "rewards/margins": 1.0518220663070679, "rewards/rejected": -1.7950351238250732, "step": 320 }, { "epoch": 0.69, "eta": 0.004999999422580004, "learning_rate": 1.3245295796480788e-07, "logits/chosen": -2.2147035598754883, "logits/rejected": -2.1897997856140137, "logps/chosen": -375.3721618652344, "logps/rejected": -414.7781677246094, "loss": 0.5318, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9661127328872681, "rewards/margins": 0.6892365217208862, "rewards/rejected": -1.6553493738174438, "step": 330 }, { "epoch": 0.71, "eta": 0.004999999422580004, "learning_rate": 1.1666074087171627e-07, "logits/chosen": -2.1029276847839355, "logits/rejected": -2.0935397148132324, "logps/chosen": -336.9126892089844, "logps/rejected": -391.1408386230469, "loss": 0.5351, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.8055974841117859, "rewards/margins": 0.6507540941238403, "rewards/rejected": -1.4563515186309814, "step": 340 }, { "epoch": 0.73, "eta": 0.004999999422580004, "learning_rate": 1.0157994641835734e-07, "logits/chosen": -2.0941309928894043, "logits/rejected": -2.0219178199768066, "logps/chosen": -322.93145751953125, "logps/rejected": -389.5673828125, "loss": 0.5032, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.7798911333084106, "rewards/margins": 0.814423680305481, "rewards/rejected": -1.5943149328231812, "step": 350 }, { "epoch": 0.75, "eta": 0.004999999422580004, "learning_rate": 8.729103716819111e-08, "logits/chosen": -2.1901535987854004, "logits/rejected": -2.1034839153289795, "logps/chosen": -386.5938720703125, "logps/rejected": -423.3191833496094, "loss": 0.5109, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.7419500946998596, "rewards/margins": 0.9194036722183228, "rewards/rejected": -1.6613538265228271, "step": 360 }, { "epoch": 0.77, "eta": 0.004999999422580004, "learning_rate": 7.387025063449081e-08, "logits/chosen": -2.1681642532348633, "logits/rejected": -2.0902836322784424, "logps/chosen": -379.05096435546875, "logps/rejected": -418.29620361328125, "loss": 0.5313, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.7454948425292969, "rewards/margins": 0.9174238443374634, "rewards/rejected": -1.6629188060760498, "step": 370 }, { "epoch": 0.79, "eta": 0.004999999422580004, "learning_rate": 6.138919252022435e-08, "logits/chosen": -2.127246618270874, "logits/rejected": -2.0689470767974854, "logps/chosen": -356.09906005859375, "logps/rejected": -440.26788330078125, "loss": 0.5075, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.8266879320144653, "rewards/margins": 0.8445302844047546, "rewards/rejected": -1.6712181568145752, "step": 380 }, { "epoch": 0.82, "eta": 0.004999999422580004, "learning_rate": 4.991445467064689e-08, "logits/chosen": -2.1151671409606934, "logits/rejected": -2.071991443634033, "logps/chosen": -353.2530822753906, "logps/rejected": -428.5403747558594, "loss": 0.5323, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8702594637870789, "rewards/margins": 0.7266938090324402, "rewards/rejected": -1.5969533920288086, "step": 390 }, { "epoch": 0.84, "eta": 0.004999999422580004, "learning_rate": 3.9507259776993954e-08, "logits/chosen": -2.0943470001220703, "logits/rejected": -2.0785202980041504, "logps/chosen": -405.1844177246094, "logps/rejected": -463.93841552734375, "loss": 0.5132, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.9134857058525085, "rewards/margins": 0.8523229360580444, "rewards/rejected": -1.7658087015151978, "step": 400 }, { "epoch": 0.86, "eta": 0.004999999422580004, "learning_rate": 3.022313472693447e-08, "logits/chosen": -2.123349905014038, "logits/rejected": -2.0609166622161865, "logps/chosen": -333.0213623046875, "logps/rejected": -408.7607727050781, "loss": 0.5302, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.7784355282783508, "rewards/margins": 0.8595449328422546, "rewards/rejected": -1.6379806995391846, "step": 410 }, { "epoch": 0.88, "eta": 0.004999999422580004, "learning_rate": 2.2111614344599684e-08, "logits/chosen": -2.0923712253570557, "logits/rejected": -2.0771572589874268, "logps/chosen": -346.8676452636719, "logps/rejected": -450.01312255859375, "loss": 0.5237, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.7909084558486938, "rewards/margins": 0.9126070141792297, "rewards/rejected": -1.7035152912139893, "step": 420 }, { "epoch": 0.9, "eta": 0.004999999422580004, "learning_rate": 1.521597710086439e-08, "logits/chosen": -2.203439474105835, "logits/rejected": -2.1634104251861572, "logps/chosen": -366.4968566894531, "logps/rejected": -410.26177978515625, "loss": 0.5283, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.7541608810424805, "rewards/margins": 0.8509491682052612, "rewards/rejected": -1.6051101684570312, "step": 430 }, { "epoch": 0.92, "eta": 0.004999999422580004, "learning_rate": 9.57301420397924e-09, "logits/chosen": -2.166147232055664, "logits/rejected": -2.1166653633117676, "logps/chosen": -336.30279541015625, "logps/rejected": -413.140380859375, "loss": 0.5157, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8325687646865845, "rewards/margins": 0.7907065153121948, "rewards/rejected": -1.6232753992080688, "step": 440 }, { "epoch": 0.94, "eta": 0.004999999422580004, "learning_rate": 5.212833302556258e-09, "logits/chosen": -2.073584794998169, "logits/rejected": -2.025235652923584, "logps/chosen": -316.84906005859375, "logps/rejected": -382.339599609375, "loss": 0.5276, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.7750087976455688, "rewards/margins": 0.7785266041755676, "rewards/rejected": -1.5535353422164917, "step": 450 }, { "epoch": 0.96, "eta": 0.004999999422580004, "learning_rate": 2.158697848236607e-09, "logits/chosen": -2.1052441596984863, "logits/rejected": -2.039158582687378, "logps/chosen": -318.8851623535156, "logps/rejected": -384.3031005859375, "loss": 0.5063, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.8155075907707214, "rewards/margins": 0.7695605158805847, "rewards/rejected": -1.5850679874420166, "step": 460 }, { "epoch": 0.98, "eta": 0.004999999422580004, "learning_rate": 4.269029751107489e-10, "logits/chosen": -2.1885509490966797, "logits/rejected": -2.0868308544158936, "logps/chosen": -369.0015869140625, "logps/rejected": -435.66632080078125, "loss": 0.5031, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.7688988447189331, "rewards/margins": 1.0167477130889893, "rewards/rejected": -1.7856464385986328, "step": 470 }, { "epoch": 1.0, "step": 478, "total_flos": 0.0, "train_loss": 0.5600044772215967, "train_runtime": 8380.7405, "train_samples_per_second": 7.295, "train_steps_per_second": 0.057 } ], "logging_steps": 10, "max_steps": 478, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }