{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 385, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 63.25, "learning_rate": 1.282051282051282e-07, "logits/chosen": -2.7358343601226807, "logits/rejected": -2.7480404376983643, "logps/chosen": -27.35565757751465, "logps/rejected": -21.06114387512207, "loss": 1.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "grad_norm": 108.0, "learning_rate": 1.282051282051282e-06, "logits/chosen": -3.009650945663452, "logits/rejected": -2.998239040374756, "logps/chosen": -33.192203521728516, "logps/rejected": -31.957557678222656, "loss": 1.0141, "rewards/accuracies": 0.4166666567325592, "rewards/chosen": -0.009009478613734245, "rewards/margins": -0.014087951742112637, "rewards/rejected": 0.005078474525362253, "step": 10 }, { "epoch": 0.05, "grad_norm": 73.5, "learning_rate": 2.564102564102564e-06, "logits/chosen": -2.899263381958008, "logits/rejected": -2.894313335418701, "logps/chosen": -32.45400619506836, "logps/rejected": -28.9648494720459, "loss": 0.9967, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.016872350126504898, "rewards/margins": 0.003259001299738884, "rewards/rejected": 0.01361334603279829, "step": 20 }, { "epoch": 0.08, "grad_norm": 71.0, "learning_rate": 3.846153846153847e-06, "logits/chosen": -3.0970497131347656, "logits/rejected": -3.108996868133545, "logps/chosen": -32.78731918334961, "logps/rejected": -30.140506744384766, "loss": 0.9473, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.09709431231021881, "rewards/margins": 0.05265679210424423, "rewards/rejected": 0.044437527656555176, "step": 30 }, { "epoch": 0.1, "grad_norm": 57.75, "learning_rate": 4.999896948438434e-06, "logits/chosen": -2.8630309104919434, "logits/rejected": -2.8540406227111816, "logps/chosen": -31.542278289794922, "logps/rejected": -32.394432067871094, "loss": 0.8414, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.23043569922447205, "rewards/margins": 0.260015070438385, "rewards/rejected": -0.02957936003804207, "step": 40 }, { "epoch": 0.13, "grad_norm": 44.75, "learning_rate": 4.987541037542187e-06, "logits/chosen": -2.8809738159179688, "logits/rejected": -2.8790669441223145, "logps/chosen": -29.41156005859375, "logps/rejected": -30.12240219116211, "loss": 0.8761, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.26288196444511414, "rewards/margins": 0.2896483242511749, "rewards/rejected": -0.02676635980606079, "step": 50 }, { "epoch": 0.16, "grad_norm": 63.75, "learning_rate": 4.954691471941119e-06, "logits/chosen": -2.9108948707580566, "logits/rejected": -2.912576675415039, "logps/chosen": -29.901845932006836, "logps/rejected": -28.0941219329834, "loss": 0.7968, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.18233974277973175, "rewards/margins": 0.2930926978588104, "rewards/rejected": -0.11075299978256226, "step": 60 }, { "epoch": 0.18, "grad_norm": 72.5, "learning_rate": 4.901618883413549e-06, "logits/chosen": -2.9933552742004395, "logits/rejected": -2.9995028972625732, "logps/chosen": -29.255428314208984, "logps/rejected": -31.047006607055664, "loss": 0.9015, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.020431842654943466, "rewards/margins": 0.14459456503391266, "rewards/rejected": -0.1241627112030983, "step": 70 }, { "epoch": 0.21, "grad_norm": 70.5, "learning_rate": 4.828760511501322e-06, "logits/chosen": -2.808861255645752, "logits/rejected": -2.8243188858032227, "logps/chosen": -29.406871795654297, "logps/rejected": -29.915807723999023, "loss": 0.7973, "rewards/accuracies": 0.6875, "rewards/chosen": 0.10708501189947128, "rewards/margins": 0.30030542612075806, "rewards/rejected": -0.19322039186954498, "step": 80 }, { "epoch": 0.23, "grad_norm": 56.25, "learning_rate": 4.7367166013034295e-06, "logits/chosen": -2.8983397483825684, "logits/rejected": -2.880967617034912, "logps/chosen": -32.71396255493164, "logps/rejected": -30.347427368164062, "loss": 0.8646, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.09797407686710358, "rewards/margins": 0.35859915614128113, "rewards/rejected": -0.26062512397766113, "step": 90 }, { "epoch": 0.26, "grad_norm": 54.75, "learning_rate": 4.626245458345211e-06, "logits/chosen": -3.004662275314331, "logits/rejected": -3.005678653717041, "logps/chosen": -31.851581573486328, "logps/rejected": -30.93560791015625, "loss": 0.8401, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.1642352044582367, "rewards/margins": 0.25978168845176697, "rewards/rejected": -0.09554646909236908, "step": 100 }, { "epoch": 0.26, "eval_logits/chosen": -2.811776876449585, "eval_logits/rejected": -2.809250831604004, "eval_logps/chosen": -31.276582717895508, "eval_logps/rejected": -34.853797912597656, "eval_loss": 0.8872909545898438, "eval_rewards/accuracies": 0.6067276000976562, "eval_rewards/chosen": 0.0046949307434260845, "eval_rewards/margins": 0.12921129167079926, "eval_rewards/rejected": -0.12451635301113129, "eval_runtime": 113.4101, "eval_samples_per_second": 3.024, "eval_steps_per_second": 0.379, "step": 100 }, { "epoch": 0.29, "grad_norm": 83.0, "learning_rate": 4.498257201263691e-06, "logits/chosen": -2.9626810550689697, "logits/rejected": -2.9382669925689697, "logps/chosen": -31.831439971923828, "logps/rejected": -31.40035057067871, "loss": 0.6952, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.3082864582538605, "rewards/margins": 0.5014885663986206, "rewards/rejected": -0.1932021528482437, "step": 110 }, { "epoch": 0.31, "grad_norm": 65.0, "learning_rate": 4.353806263777678e-06, "logits/chosen": -3.0443854331970215, "logits/rejected": -3.073098659515381, "logps/chosen": -28.707149505615234, "logps/rejected": -34.32903289794922, "loss": 0.7028, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.3654031455516815, "rewards/margins": 0.5105921030044556, "rewards/rejected": -0.14518897235393524, "step": 120 }, { "epoch": 0.34, "grad_norm": 44.5, "learning_rate": 4.1940827077152755e-06, "logits/chosen": -2.7482428550720215, "logits/rejected": -2.743565082550049, "logps/chosen": -28.566293716430664, "logps/rejected": -30.31746482849121, "loss": 0.7405, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.3267093598842621, "rewards/margins": 0.47363653779029846, "rewards/rejected": -0.14692717790603638, "step": 130 }, { "epoch": 0.36, "grad_norm": 46.25, "learning_rate": 4.0204024186666215e-06, "logits/chosen": -3.022853136062622, "logits/rejected": -3.0205140113830566, "logps/chosen": -27.139602661132812, "logps/rejected": -31.89032554626465, "loss": 0.7769, "rewards/accuracies": 0.625, "rewards/chosen": 0.27729907631874084, "rewards/margins": 0.46414414048194885, "rewards/rejected": -0.1868450939655304, "step": 140 }, { "epoch": 0.39, "grad_norm": 48.0, "learning_rate": 3.834196265035119e-06, "logits/chosen": -2.816462516784668, "logits/rejected": -2.810920000076294, "logps/chosen": -27.263103485107422, "logps/rejected": -31.520715713500977, "loss": 0.6276, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.4499203562736511, "rewards/margins": 0.6875794529914856, "rewards/rejected": -0.2376590520143509, "step": 150 }, { "epoch": 0.42, "grad_norm": 68.5, "learning_rate": 3.636998309800573e-06, "logits/chosen": -3.133582592010498, "logits/rejected": -3.1159331798553467, "logps/chosen": -31.7524471282959, "logps/rejected": -29.40524673461914, "loss": 0.547, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.5914198756217957, "rewards/margins": 0.9317981600761414, "rewards/rejected": -0.3403782546520233, "step": 160 }, { "epoch": 0.44, "grad_norm": 58.25, "learning_rate": 3.4304331721118078e-06, "logits/chosen": -2.944953203201294, "logits/rejected": -2.952117681503296, "logps/chosen": -29.341304779052734, "logps/rejected": -31.5146427154541, "loss": 0.6062, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.4426456391811371, "rewards/margins": 0.7747727632522583, "rewards/rejected": -0.33212706446647644, "step": 170 }, { "epoch": 0.47, "grad_norm": 57.25, "learning_rate": 3.2162026428305436e-06, "logits/chosen": -2.795180082321167, "logits/rejected": -2.792935848236084, "logps/chosen": -29.07159423828125, "logps/rejected": -30.085384368896484, "loss": 0.5999, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.47860392928123474, "rewards/margins": 0.7323731184005737, "rewards/rejected": -0.2537691593170166, "step": 180 }, { "epoch": 0.49, "grad_norm": 35.5, "learning_rate": 2.996071664294641e-06, "logits/chosen": -2.9086050987243652, "logits/rejected": -2.9050517082214355, "logps/chosen": -29.76633644104004, "logps/rejected": -28.544025421142578, "loss": 0.778, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.3363017439842224, "rewards/margins": 0.4854954779148102, "rewards/rejected": -0.14919371902942657, "step": 190 }, { "epoch": 0.52, "grad_norm": 51.0, "learning_rate": 2.7718537898066833e-06, "logits/chosen": -2.9782276153564453, "logits/rejected": -2.9666411876678467, "logps/chosen": -32.82664108276367, "logps/rejected": -30.458984375, "loss": 0.724, "rewards/accuracies": 0.6875, "rewards/chosen": 0.6979023218154907, "rewards/margins": 0.7178188562393188, "rewards/rejected": -0.01991647481918335, "step": 200 }, { "epoch": 0.52, "eval_logits/chosen": -2.8178980350494385, "eval_logits/rejected": -2.815643548965454, "eval_logps/chosen": -31.38960838317871, "eval_logps/rejected": -34.92823791503906, "eval_loss": 0.9140273332595825, "eval_rewards/accuracies": 0.565614640712738, "eval_rewards/chosen": -0.08572381734848022, "eval_rewards/margins": 0.09834489226341248, "eval_rewards/rejected": -0.1840687096118927, "eval_runtime": 113.15, "eval_samples_per_second": 3.031, "eval_steps_per_second": 0.38, "step": 200 }, { "epoch": 0.55, "grad_norm": 52.5, "learning_rate": 2.5453962426402006e-06, "logits/chosen": -2.9144248962402344, "logits/rejected": -2.914703845977783, "logps/chosen": -32.25563049316406, "logps/rejected": -34.11750793457031, "loss": 0.5504, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6222246289253235, "rewards/margins": 0.8270590901374817, "rewards/rejected": -0.2048344612121582, "step": 210 }, { "epoch": 0.57, "grad_norm": 47.5, "learning_rate": 2.3185646976551794e-06, "logits/chosen": -2.8935537338256836, "logits/rejected": -2.909308671951294, "logps/chosen": -29.393646240234375, "logps/rejected": -28.79998779296875, "loss": 0.6005, "rewards/accuracies": 0.6875, "rewards/chosen": 0.6275160908699036, "rewards/margins": 0.8448917269706726, "rewards/rejected": -0.21737566590309143, "step": 220 }, { "epoch": 0.6, "grad_norm": 46.5, "learning_rate": 2.0932279108998323e-06, "logits/chosen": -2.9393625259399414, "logits/rejected": -2.9434802532196045, "logps/chosen": -30.659032821655273, "logps/rejected": -31.946239471435547, "loss": 0.7369, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.39426764845848083, "rewards/margins": 0.5447386503219604, "rewards/rejected": -0.15047098696231842, "step": 230 }, { "epoch": 0.62, "grad_norm": 46.75, "learning_rate": 1.8712423238279358e-06, "logits/chosen": -2.99426531791687, "logits/rejected": -3.00126314163208, "logps/chosen": -29.958850860595703, "logps/rejected": -30.447010040283203, "loss": 0.4564, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.7568598985671997, "rewards/margins": 0.9262750744819641, "rewards/rejected": -0.16941508650779724, "step": 240 }, { "epoch": 0.65, "grad_norm": 55.0, "learning_rate": 1.6544367689701824e-06, "logits/chosen": -2.8251967430114746, "logits/rejected": -2.815450429916382, "logps/chosen": -26.474361419677734, "logps/rejected": -29.600570678710938, "loss": 0.721, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.3838837444782257, "rewards/margins": 0.43902724981307983, "rewards/rejected": -0.05514346435666084, "step": 250 }, { "epoch": 0.68, "grad_norm": 21.75, "learning_rate": 1.4445974030621963e-06, "logits/chosen": -2.8065786361694336, "logits/rejected": -2.8274922370910645, "logps/chosen": -28.88662338256836, "logps/rejected": -34.49494934082031, "loss": 0.4422, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.7546705603599548, "rewards/margins": 1.0113604068756104, "rewards/rejected": -0.25668981671333313, "step": 260 }, { "epoch": 0.7, "grad_norm": 60.0, "learning_rate": 1.243452991757889e-06, "logits/chosen": -2.948761463165283, "logits/rejected": -2.955237627029419, "logps/chosen": -30.040813446044922, "logps/rejected": -30.221237182617188, "loss": 0.5322, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.5924339890480042, "rewards/margins": 0.8772123456001282, "rewards/rejected": -0.2847784161567688, "step": 270 }, { "epoch": 0.73, "grad_norm": 29.375, "learning_rate": 1.0526606671603523e-06, "logits/chosen": -2.9630208015441895, "logits/rejected": -2.949868679046631, "logps/chosen": -30.0323429107666, "logps/rejected": -28.719945907592773, "loss": 0.7384, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.5514736771583557, "rewards/margins": 0.6223500967025757, "rewards/rejected": -0.07087641209363937, "step": 280 }, { "epoch": 0.75, "grad_norm": 36.5, "learning_rate": 8.737922755071455e-07, "logits/chosen": -2.890263080596924, "logits/rejected": -2.872467279434204, "logps/chosen": -31.614023208618164, "logps/rejected": -30.969629287719727, "loss": 0.4081, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.8178254961967468, "rewards/margins": 1.1748807430267334, "rewards/rejected": -0.3570552468299866, "step": 290 }, { "epoch": 0.78, "grad_norm": 47.0, "learning_rate": 7.08321427484816e-07, "logits/chosen": -2.892866611480713, "logits/rejected": -2.8899810314178467, "logps/chosen": -31.352060317993164, "logps/rejected": -27.79092025756836, "loss": 0.6408, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.6964761018753052, "rewards/margins": 0.8463441729545593, "rewards/rejected": -0.14986807107925415, "step": 300 }, { "epoch": 0.78, "eval_logits/chosen": -2.810232400894165, "eval_logits/rejected": -2.807687520980835, "eval_logps/chosen": -31.392431259155273, "eval_logps/rejected": -34.949954986572266, "eval_loss": 0.9091227650642395, "eval_rewards/accuracies": 0.5627076625823975, "eval_rewards/chosen": -0.08798420429229736, "eval_rewards/margins": 0.11345873028039932, "eval_rewards/rejected": -0.20144294202327728, "eval_runtime": 113.1296, "eval_samples_per_second": 3.032, "eval_steps_per_second": 0.38, "step": 300 }, { "epoch": 0.81, "grad_norm": 57.5, "learning_rate": 5.576113578589035e-07, "logits/chosen": -2.771563768386841, "logits/rejected": -2.790158748626709, "logps/chosen": -28.552204132080078, "logps/rejected": -31.123676300048828, "loss": 0.4625, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.5417758226394653, "rewards/margins": 0.855111300945282, "rewards/rejected": -0.31333547830581665, "step": 310 }, { "epoch": 0.83, "grad_norm": 50.75, "learning_rate": 4.229036944380913e-07, "logits/chosen": -3.01896333694458, "logits/rejected": -3.0036964416503906, "logps/chosen": -29.334665298461914, "logps/rejected": -28.647085189819336, "loss": 0.4327, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.7015351057052612, "rewards/margins": 0.9641841650009155, "rewards/rejected": -0.2626491189002991, "step": 320 }, { "epoch": 0.86, "grad_norm": 19.25, "learning_rate": 3.053082288996112e-07, "logits/chosen": -2.9340624809265137, "logits/rejected": -2.916344404220581, "logps/chosen": -27.29937744140625, "logps/rejected": -30.881011962890625, "loss": 0.3934, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.8501566052436829, "rewards/margins": 1.2416493892669678, "rewards/rejected": -0.3914927840232849, "step": 330 }, { "epoch": 0.88, "grad_norm": 54.0, "learning_rate": 2.0579377374915805e-07, "logits/chosen": -3.1503987312316895, "logits/rejected": -3.156454563140869, "logps/chosen": -30.917760848999023, "logps/rejected": -33.217864990234375, "loss": 0.4822, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.5771540403366089, "rewards/margins": 0.994644820690155, "rewards/rejected": -0.4174906611442566, "step": 340 }, { "epoch": 0.91, "grad_norm": 27.0, "learning_rate": 1.2518018074041684e-07, "logits/chosen": -3.026543140411377, "logits/rejected": -3.0300345420837402, "logps/chosen": -29.901714324951172, "logps/rejected": -31.784900665283203, "loss": 0.5385, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.7241344451904297, "rewards/margins": 0.9414092302322388, "rewards/rejected": -0.2172747403383255, "step": 350 }, { "epoch": 0.94, "grad_norm": 47.25, "learning_rate": 6.41315865106129e-08, "logits/chosen": -2.8658933639526367, "logits/rejected": -2.8671889305114746, "logps/chosen": -27.54721450805664, "logps/rejected": -30.053890228271484, "loss": 0.4298, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.8820101022720337, "rewards/margins": 1.0603699684143066, "rewards/rejected": -0.17835985124111176, "step": 360 }, { "epoch": 0.96, "grad_norm": 62.75, "learning_rate": 2.3150941078050325e-08, "logits/chosen": -2.9409708976745605, "logits/rejected": -2.9394469261169434, "logps/chosen": -29.869876861572266, "logps/rejected": -32.248016357421875, "loss": 0.486, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.7334206700325012, "rewards/margins": 0.9893990755081177, "rewards/rejected": -0.25597840547561646, "step": 370 }, { "epoch": 0.99, "grad_norm": 56.5, "learning_rate": 2.575864278703266e-09, "logits/chosen": -2.8988537788391113, "logits/rejected": -2.8815865516662598, "logps/chosen": -28.09465980529785, "logps/rejected": -28.32416343688965, "loss": 0.4149, "rewards/accuracies": 0.875, "rewards/chosen": 0.6803036332130432, "rewards/margins": 1.027940273284912, "rewards/rejected": -0.3476366698741913, "step": 380 }, { "epoch": 1.0, "step": 385, "total_flos": 0.0, "train_loss": 0.6588526527602951, "train_runtime": 2719.293, "train_samples_per_second": 1.132, "train_steps_per_second": 0.142 } ], "logging_steps": 10, "max_steps": 385, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }