{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 478, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02092050209205021, "grad_norm": 1.1945245265960693, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -0.30859375, "logits/rejected": -0.2890625, "logps/chosen": -312.0, "logps/rejected": -288.0, "loss": 0.692, "rewards/accuracies": 0.26249998807907104, "rewards/chosen": 0.0007476806640625, "rewards/margins": 0.0005035400390625, "rewards/rejected": 0.000240325927734375, "step": 10 }, { "epoch": 0.04184100418410042, "grad_norm": 1.1291463375091553, "learning_rate": 8.333333333333333e-07, "logits/chosen": -0.330078125, "logits/rejected": -0.294921875, "logps/chosen": -302.0, "logps/rejected": -268.0, "loss": 0.6921, "rewards/accuracies": 0.3531250059604645, "rewards/chosen": -0.00098419189453125, "rewards/margins": 0.000614166259765625, "rewards/rejected": -0.00160980224609375, "step": 20 }, { "epoch": 0.06276150627615062, "grad_norm": 1.1083910465240479, "learning_rate": 9.995691082675907e-07, "logits/chosen": -0.22265625, "logits/rejected": -0.2314453125, "logps/chosen": -338.0, "logps/rejected": -306.0, "loss": 0.6914, "rewards/accuracies": 0.46875, "rewards/chosen": -0.0006103515625, "rewards/margins": 0.005645751953125, "rewards/rejected": -0.006256103515625, "step": 30 }, { "epoch": 0.08368200836820083, "grad_norm": 1.1081091165542603, "learning_rate": 9.969385700404345e-07, "logits/chosen": -0.35546875, "logits/rejected": -0.34765625, "logps/chosen": -328.0, "logps/rejected": -300.0, "loss": 0.6882, "rewards/accuracies": 0.5718749761581421, "rewards/chosen": 0.003509521484375, "rewards/margins": 0.011474609375, "rewards/rejected": -0.0079345703125, "step": 40 }, { "epoch": 0.10460251046025104, "grad_norm": 1.1411256790161133, "learning_rate": 9.91929453572245e-07, "logits/chosen": -0.259765625, "logits/rejected": -0.279296875, "logps/chosen": -342.0, "logps/rejected": -284.0, "loss": 0.6843, "rewards/accuracies": 0.640625, "rewards/chosen": 0.0086669921875, "rewards/margins": 0.02197265625, "rewards/rejected": -0.0133056640625, "step": 50 }, { "epoch": 0.12552301255230125, "grad_norm": 1.1798174381256104, "learning_rate": 9.845657348152955e-07, "logits/chosen": -0.33203125, "logits/rejected": -0.28515625, "logps/chosen": -330.0, "logps/rejected": -272.0, "loss": 0.6789, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0038909912109375, "rewards/margins": 0.033203125, "rewards/rejected": -0.029296875, "step": 60 }, { "epoch": 0.14644351464435146, "grad_norm": 1.3184058666229248, "learning_rate": 9.748826599393632e-07, "logits/chosen": -0.396484375, "logits/rejected": -0.3046875, "logps/chosen": -282.0, "logps/rejected": -268.0, "loss": 0.675, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.000690460205078125, "rewards/margins": 0.043212890625, "rewards/rejected": -0.0439453125, "step": 70 }, { "epoch": 0.16736401673640167, "grad_norm": 1.1700127124786377, "learning_rate": 9.629265766272291e-07, "logits/chosen": -0.49609375, "logits/rejected": -0.404296875, "logps/chosen": -302.0, "logps/rejected": -290.0, "loss": 0.6704, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.005828857421875, "rewards/margins": 0.03955078125, "rewards/rejected": -0.04541015625, "step": 80 }, { "epoch": 0.18828451882845187, "grad_norm": 1.171791672706604, "learning_rate": 9.487547122331964e-07, "logits/chosen": -0.4609375, "logits/rejected": -0.412109375, "logps/chosen": -314.0, "logps/rejected": -308.0, "loss": 0.6674, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.014404296875, "rewards/margins": 0.05859375, "rewards/rejected": -0.0732421875, "step": 90 }, { "epoch": 0.20920502092050208, "grad_norm": 1.2314943075180054, "learning_rate": 9.324348998664548e-07, "logits/chosen": -0.453125, "logits/rejected": -0.41796875, "logps/chosen": -312.0, "logps/rejected": -294.0, "loss": 0.6568, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.01043701171875, "rewards/margins": 0.083984375, "rewards/rejected": -0.0947265625, "step": 100 }, { "epoch": 0.2301255230125523, "grad_norm": 1.1103886365890503, "learning_rate": 9.140452537103941e-07, "logits/chosen": -0.55859375, "logits/rejected": -0.5625, "logps/chosen": -330.0, "logps/rejected": -300.0, "loss": 0.6515, "rewards/accuracies": 0.640625, "rewards/chosen": -0.045166015625, "rewards/margins": 0.0888671875, "rewards/rejected": -0.1337890625, "step": 110 }, { "epoch": 0.2510460251046025, "grad_norm": 1.180451512336731, "learning_rate": 8.936737951319275e-07, "logits/chosen": -0.609375, "logits/rejected": -0.5625, "logps/chosen": -342.0, "logps/rejected": -316.0, "loss": 0.6445, "rewards/accuracies": 0.703125, "rewards/chosen": -0.06494140625, "rewards/margins": 0.1513671875, "rewards/rejected": -0.2158203125, "step": 120 }, { "epoch": 0.2719665271966527, "grad_norm": 1.2995760440826416, "learning_rate": 8.714180313704489e-07, "logits/chosen": -0.7109375, "logits/rejected": -0.578125, "logps/chosen": -340.0, "logps/rejected": -328.0, "loss": 0.6395, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.111328125, "rewards/margins": 0.1591796875, "rewards/rejected": -0.26953125, "step": 130 }, { "epoch": 0.2928870292887029, "grad_norm": 1.2812036275863647, "learning_rate": 8.473844888230064e-07, "logits/chosen": -0.6875, "logits/rejected": -0.64453125, "logps/chosen": -332.0, "logps/rejected": -290.0, "loss": 0.6381, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1845703125, "rewards/margins": 0.158203125, "rewards/rejected": -0.34375, "step": 140 }, { "epoch": 0.3138075313807531, "grad_norm": 1.8293964862823486, "learning_rate": 8.216882031596096e-07, "logits/chosen": -0.73046875, "logits/rejected": -0.6953125, "logps/chosen": -340.0, "logps/rejected": -322.0, "loss": 0.6355, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.220703125, "rewards/margins": 0.17578125, "rewards/rejected": -0.396484375, "step": 150 }, { "epoch": 0.33472803347280333, "grad_norm": 1.3217679262161255, "learning_rate": 7.944521687092142e-07, "logits/chosen": -0.87109375, "logits/rejected": -0.8046875, "logps/chosen": -344.0, "logps/rejected": -320.0, "loss": 0.6231, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.21875, "rewards/margins": 0.240234375, "rewards/rejected": -0.458984375, "step": 160 }, { "epoch": 0.35564853556485354, "grad_norm": 1.493200659751892, "learning_rate": 7.658067497518772e-07, "logits/chosen": -0.94140625, "logits/rejected": -0.92578125, "logps/chosen": -324.0, "logps/rejected": -308.0, "loss": 0.6278, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.193359375, "rewards/margins": 0.1865234375, "rewards/rejected": -0.380859375, "step": 170 }, { "epoch": 0.37656903765690375, "grad_norm": 1.3974004983901978, "learning_rate": 7.358890565349105e-07, "logits/chosen": -0.90234375, "logits/rejected": -0.96484375, "logps/chosen": -340.0, "logps/rejected": -320.0, "loss": 0.6182, "rewards/accuracies": 0.671875, "rewards/chosen": -0.2333984375, "rewards/margins": 0.216796875, "rewards/rejected": -0.451171875, "step": 180 }, { "epoch": 0.39748953974895396, "grad_norm": 1.4906997680664062, "learning_rate": 7.048422889997115e-07, "logits/chosen": -0.96484375, "logits/rejected": -0.91796875, "logps/chosen": -342.0, "logps/rejected": -330.0, "loss": 0.632, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2578125, "rewards/margins": 0.212890625, "rewards/rejected": -0.470703125, "step": 190 }, { "epoch": 0.41841004184100417, "grad_norm": 1.4964449405670166, "learning_rate": 6.72815051360494e-07, "logits/chosen": -1.03125, "logits/rejected": -1.015625, "logps/chosen": -324.0, "logps/rejected": -314.0, "loss": 0.6232, "rewards/accuracies": 0.6875, "rewards/chosen": -0.263671875, "rewards/margins": 0.1962890625, "rewards/rejected": -0.458984375, "step": 200 }, { "epoch": 0.4393305439330544, "grad_norm": 1.7729754447937012, "learning_rate": 6.399606408156687e-07, "logits/chosen": -1.046875, "logits/rejected": -1.0078125, "logps/chosen": -316.0, "logps/rejected": -316.0, "loss": 0.6014, "rewards/accuracies": 0.684374988079071, "rewards/chosen": -0.2470703125, "rewards/margins": 0.2431640625, "rewards/rejected": -0.490234375, "step": 210 }, { "epoch": 0.4602510460251046, "grad_norm": 1.4692357778549194, "learning_rate": 6.064363137964225e-07, "logits/chosen": -1.125, "logits/rejected": -0.9765625, "logps/chosen": -348.0, "logps/rejected": -358.0, "loss": 0.6114, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.267578125, "rewards/margins": 0.30859375, "rewards/rejected": -0.578125, "step": 220 }, { "epoch": 0.4811715481171548, "grad_norm": 1.377569317817688, "learning_rate": 5.724025332645793e-07, "logits/chosen": -1.0625, "logits/rejected": -1.0390625, "logps/chosen": -350.0, "logps/rejected": -344.0, "loss": 0.6098, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.2890625, "rewards/margins": 0.234375, "rewards/rejected": -0.5234375, "step": 230 }, { "epoch": 0.502092050209205, "grad_norm": 1.5865265130996704, "learning_rate": 5.380222006625179e-07, "logits/chosen": -1.1171875, "logits/rejected": -1.078125, "logps/chosen": -334.0, "logps/rejected": -328.0, "loss": 0.6252, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.26953125, "rewards/margins": 0.263671875, "rewards/rejected": -0.53125, "step": 240 }, { "epoch": 0.5230125523012552, "grad_norm": 1.5621623992919922, "learning_rate": 5.034598761913916e-07, "logits/chosen": -1.140625, "logits/rejected": -1.1015625, "logps/chosen": -318.0, "logps/rejected": -318.0, "loss": 0.6285, "rewards/accuracies": 0.671875, "rewards/chosen": -0.248046875, "rewards/margins": 0.2314453125, "rewards/rejected": -0.478515625, "step": 250 }, { "epoch": 0.5439330543933054, "grad_norm": 1.3323726654052734, "learning_rate": 4.688809911497609e-07, "logits/chosen": -1.125, "logits/rejected": -1.109375, "logps/chosen": -346.0, "logps/rejected": -350.0, "loss": 0.6079, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": -0.240234375, "rewards/margins": 0.30859375, "rewards/rejected": -0.546875, "step": 260 }, { "epoch": 0.5648535564853556, "grad_norm": 1.4896010160446167, "learning_rate": 4.344510561027498e-07, "logits/chosen": -1.15625, "logits/rejected": -1.15625, "logps/chosen": -346.0, "logps/rejected": -330.0, "loss": 0.5974, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.271484375, "rewards/margins": 0.310546875, "rewards/rejected": -0.58203125, "step": 270 }, { "epoch": 0.5857740585774058, "grad_norm": 1.4263241291046143, "learning_rate": 4.003348686717949e-07, "logits/chosen": -1.1953125, "logits/rejected": -1.1171875, "logps/chosen": -338.0, "logps/rejected": -356.0, "loss": 0.6141, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.27734375, "rewards/margins": 0.306640625, "rewards/rejected": -0.58203125, "step": 280 }, { "epoch": 0.606694560669456, "grad_norm": 1.510299563407898, "learning_rate": 3.666957247368757e-07, "logits/chosen": -1.109375, "logits/rejected": -1.078125, "logps/chosen": -338.0, "logps/rejected": -322.0, "loss": 0.5992, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": -0.2451171875, "rewards/margins": 0.345703125, "rewards/rejected": -0.58984375, "step": 290 }, { "epoch": 0.6276150627615062, "grad_norm": 1.8962136507034302, "learning_rate": 3.3369463682677234e-07, "logits/chosen": -1.1875, "logits/rejected": -1.1484375, "logps/chosen": -362.0, "logps/rejected": -362.0, "loss": 0.6158, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": -0.267578125, "rewards/margins": 0.32421875, "rewards/rejected": -0.59375, "step": 300 }, { "epoch": 0.6485355648535565, "grad_norm": 1.5493396520614624, "learning_rate": 3.014895634385014e-07, "logits/chosen": -1.109375, "logits/rejected": -1.140625, "logps/chosen": -352.0, "logps/rejected": -340.0, "loss": 0.601, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.306640625, "rewards/margins": 0.2333984375, "rewards/rejected": -0.5390625, "step": 310 }, { "epoch": 0.6694560669456067, "grad_norm": 1.5511579513549805, "learning_rate": 2.7023465297476424e-07, "logits/chosen": -1.15625, "logits/rejected": -1.15625, "logps/chosen": -354.0, "logps/rejected": -358.0, "loss": 0.595, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -0.283203125, "rewards/margins": 0.326171875, "rewards/rejected": -0.609375, "step": 320 }, { "epoch": 0.6903765690376569, "grad_norm": 1.7871730327606201, "learning_rate": 2.4007950591826913e-07, "logits/chosen": -1.1328125, "logits/rejected": -1.125, "logps/chosen": -350.0, "logps/rejected": -352.0, "loss": 0.612, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.330078125, "rewards/margins": 0.30078125, "rewards/rejected": -0.6328125, "step": 330 }, { "epoch": 0.7112970711297071, "grad_norm": 1.6344778537750244, "learning_rate": 2.1116845877450805e-07, "logits/chosen": -1.1953125, "logits/rejected": -1.171875, "logps/chosen": -330.0, "logps/rejected": -316.0, "loss": 0.6125, "rewards/accuracies": 0.65625, "rewards/chosen": -0.32421875, "rewards/margins": 0.255859375, "rewards/rejected": -0.58203125, "step": 340 }, { "epoch": 0.7322175732217573, "grad_norm": 1.6645337343215942, "learning_rate": 1.8363989321036577e-07, "logits/chosen": -1.296875, "logits/rejected": -1.203125, "logps/chosen": -332.0, "logps/rejected": -340.0, "loss": 0.6038, "rewards/accuracies": 0.671875, "rewards/chosen": -0.333984375, "rewards/margins": 0.26953125, "rewards/rejected": -0.6015625, "step": 350 }, { "epoch": 0.7531380753138075, "grad_norm": 1.5749666690826416, "learning_rate": 1.5762557369534708e-07, "logits/chosen": -1.15625, "logits/rejected": -1.1875, "logps/chosen": -322.0, "logps/rejected": -348.0, "loss": 0.6019, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.38671875, "rewards/margins": 0.265625, "rewards/rejected": -0.65234375, "step": 360 }, { "epoch": 0.7740585774058577, "grad_norm": 1.582518458366394, "learning_rate": 1.332500168157748e-07, "logits/chosen": -1.15625, "logits/rejected": -1.203125, "logps/chosen": -360.0, "logps/rejected": -348.0, "loss": 0.6147, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.412109375, "rewards/margins": 0.25390625, "rewards/rejected": -0.6640625, "step": 370 }, { "epoch": 0.7949790794979079, "grad_norm": 1.690048098564148, "learning_rate": 1.1062989528071681e-07, "logits/chosen": -1.1953125, "logits/rejected": -1.1796875, "logps/chosen": -364.0, "logps/rejected": -378.0, "loss": 0.6108, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.349609375, "rewards/margins": 0.2890625, "rewards/rejected": -0.63671875, "step": 380 }, { "epoch": 0.8158995815899581, "grad_norm": 1.6943994760513306, "learning_rate": 8.987347947234192e-08, "logits/chosen": -1.109375, "logits/rejected": -1.1328125, "logps/chosen": -328.0, "logps/rejected": -324.0, "loss": 0.6044, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.3203125, "rewards/margins": 0.322265625, "rewards/rejected": -0.64453125, "step": 390 }, { "epoch": 0.8368200836820083, "grad_norm": 1.6517119407653809, "learning_rate": 7.108011921370727e-08, "logits/chosen": -1.1875, "logits/rejected": -1.1640625, "logps/chosen": -386.0, "logps/rejected": -382.0, "loss": 0.6041, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -0.375, "rewards/margins": 0.328125, "rewards/rejected": -0.703125, "step": 400 }, { "epoch": 0.8577405857740585, "grad_norm": 1.6941229104995728, "learning_rate": 5.433976823447262e-08, "logits/chosen": -1.203125, "logits/rejected": -1.1015625, "logps/chosen": -340.0, "logps/rejected": -352.0, "loss": 0.6026, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.392578125, "rewards/margins": 0.333984375, "rewards/rejected": -0.7265625, "step": 410 }, { "epoch": 0.8786610878661087, "grad_norm": 1.6011600494384766, "learning_rate": 3.973255361067346e-08, "logits/chosen": -1.2265625, "logits/rejected": -1.1875, "logps/chosen": -362.0, "logps/rejected": -352.0, "loss": 0.5924, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.34375, "rewards/margins": 0.2890625, "rewards/rejected": -0.6328125, "step": 420 }, { "epoch": 0.899581589958159, "grad_norm": 1.7107675075531006, "learning_rate": 2.732839223940914e-08, "logits/chosen": -1.1484375, "logits/rejected": -1.1953125, "logps/chosen": -356.0, "logps/rejected": -366.0, "loss": 0.5962, "rewards/accuracies": 0.684374988079071, "rewards/chosen": -0.373046875, "rewards/margins": 0.3671875, "rewards/rejected": -0.7421875, "step": 430 }, { "epoch": 0.9205020920502092, "grad_norm": 1.5065439939498901, "learning_rate": 1.7186656184179473e-08, "logits/chosen": -1.203125, "logits/rejected": -1.2109375, "logps/chosen": -364.0, "logps/rejected": -374.0, "loss": 0.6021, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.361328125, "rewards/margins": 0.373046875, "rewards/rejected": -0.734375, "step": 440 }, { "epoch": 0.9414225941422594, "grad_norm": 1.560874104499817, "learning_rate": 9.355888492680153e-09, "logits/chosen": -1.1328125, "logits/rejected": -1.1015625, "logps/chosen": -340.0, "logps/rejected": -346.0, "loss": 0.5957, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -0.337890625, "rewards/margins": 0.357421875, "rewards/rejected": -0.6953125, "step": 450 }, { "epoch": 0.9623430962343096, "grad_norm": 1.5713697671890259, "learning_rate": 3.873570847285012e-09, "logits/chosen": -1.234375, "logits/rejected": -1.2109375, "logps/chosen": -356.0, "logps/rejected": -356.0, "loss": 0.6004, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.40234375, "rewards/margins": 0.30078125, "rewards/rejected": -0.703125, "step": 460 }, { "epoch": 0.9832635983263598, "grad_norm": 1.5760387182235718, "learning_rate": 7.65944160348142e-10, "logits/chosen": -1.2109375, "logits/rejected": -1.140625, "logps/chosen": -378.0, "logps/rejected": -374.0, "loss": 0.6114, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.3515625, "rewards/margins": 0.330078125, "rewards/rejected": -0.6796875, "step": 470 } ], "logging_steps": 10, "max_steps": 478, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }