{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.997867803837953, "eval_steps": 100, "global_step": 117, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 627.8802337646484, "epoch": 0.008528784648187633, "grad_norm": 0.35644883946602257, "kl": 0.0, "learning_rate": 2.5e-07, "loss": -0.0129, "reward": 0.6093750298023224, "reward_std": 0.3860909380018711, "rewards/accuracy_reward": 0.6093750298023224, "rewards/format_reward": 0.0, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 587.183611869812, "epoch": 0.042643923240938165, "grad_norm": 0.34939166374507424, "kl": 0.00016704201698303223, "learning_rate": 1.25e-06, "loss": 0.0135, "reward": 0.6184895997866988, "reward_std": 0.34997194120660424, "rewards/accuracy_reward": 0.6184895997866988, "rewards/format_reward": 0.0, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 611.967724609375, "epoch": 0.08528784648187633, "grad_norm": 31.88849856902946, "kl": 0.0043338298797607425, "learning_rate": 2.5e-06, "loss": 0.0188, "reward": 0.6291666887700558, "reward_std": 0.32357291094958784, "rewards/accuracy_reward": 0.6291666887700558, "rewards/format_reward": 0.0, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 616.8541862487793, "epoch": 0.1279317697228145, "grad_norm": 0.3815627956846007, "kl": 0.010546112060546875, "learning_rate": 2.993961440992859e-06, "loss": 0.0578, "reward": 0.6953125193715095, "reward_std": 0.2850981580093503, "rewards/accuracy_reward": 0.6953125193715095, "rewards/format_reward": 0.0, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 618.81460647583, "epoch": 0.17057569296375266, "grad_norm": 0.21300730475167035, "kl": 0.0043548583984375, "learning_rate": 2.957235057439301e-06, "loss": 0.0758, "reward": 0.7281250193715095, "reward_std": 0.23656688714399934, "rewards/accuracy_reward": 0.7281250193715095, "rewards/format_reward": 0.0, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 591.3015823364258, "epoch": 0.21321961620469082, "grad_norm": 0.12113342234838173, "kl": 0.015129280090332032, "learning_rate": 2.887956450710995e-06, "loss": 0.0443, "reward": 0.7828125208616257, "reward_std": 0.177566824760288, "rewards/accuracy_reward": 0.7828125208616257, "rewards/format_reward": 0.0, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 579.9239768981934, "epoch": 0.255863539445629, "grad_norm": 0.14438849219292743, "kl": 0.005391120910644531, "learning_rate": 2.7876731904027993e-06, "loss": 0.0371, "reward": 0.7609375171363354, "reward_std": 0.16949560260400176, "rewards/accuracy_reward": 0.7609375171363354, "rewards/format_reward": 0.0, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 571.2515838623046, "epoch": 0.29850746268656714, "grad_norm": 0.15658618448015763, "kl": 0.004360771179199219, "learning_rate": 2.6586254388368995e-06, "loss": 0.0415, "reward": 0.7854166865348816, "reward_std": 0.17419785326346754, "rewards/accuracy_reward": 0.7854166865348816, "rewards/format_reward": 0.0, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 579.2099113464355, "epoch": 0.3411513859275053, "grad_norm": 0.6009157958287555, "kl": 0.026328277587890626, "learning_rate": 2.5036959095382875e-06, "loss": 0.0321, "reward": 0.7708333596587181, "reward_std": 0.17964822258800267, "rewards/accuracy_reward": 0.7708333596587181, "rewards/format_reward": 0.0, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 586.2880401611328, "epoch": 0.3837953091684435, "grad_norm": 0.46893633502563103, "kl": 0.015601730346679688, "learning_rate": 2.3263454721781537e-06, "loss": 0.0288, "reward": 0.7869791895151138, "reward_std": 0.17324934136122466, "rewards/accuracy_reward": 0.7869791895151138, "rewards/format_reward": 0.0, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 577.1146011352539, "epoch": 0.42643923240938164, "grad_norm": 0.14115724397125742, "kl": 0.00496826171875, "learning_rate": 2.1305358424643485e-06, "loss": 0.0306, "reward": 0.7510416850447654, "reward_std": 0.1911184054799378, "rewards/accuracy_reward": 0.7510416850447654, "rewards/format_reward": 0.0, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 579.0744941711425, "epoch": 0.4690831556503198, "grad_norm": 3.183002278410084, "kl": 0.018677902221679688, "learning_rate": 1.9206410839590043e-06, "loss": 0.0246, "reward": 0.7661458477377892, "reward_std": 0.19202441712841392, "rewards/accuracy_reward": 0.7661458477377892, "rewards/format_reward": 0.0, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 601.4166862487793, "epoch": 0.511727078891258, "grad_norm": 0.12757187329087855, "kl": 0.006385040283203125, "learning_rate": 1.7013498987264833e-06, "loss": 0.0345, "reward": 0.7364583522081375, "reward_std": 0.19316800702363252, "rewards/accuracy_reward": 0.7364583522081375, "rewards/format_reward": 0.0, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 581.6609550476074, "epoch": 0.5543710021321961, "grad_norm": 0.17404128901148935, "kl": 0.007928085327148438, "learning_rate": 1.4775608894771048e-06, "loss": 0.0328, "reward": 0.7505208536982536, "reward_std": 0.2075295069254935, "rewards/accuracy_reward": 0.7505208536982536, "rewards/format_reward": 0.0, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 596.1479362487793, "epoch": 0.5970149253731343, "grad_norm": 0.2944207023714681, "kl": 0.006090927124023438, "learning_rate": 1.2542731328772936e-06, "loss": 0.0339, "reward": 0.7265625208616256, "reward_std": 0.19276394164189697, "rewards/accuracy_reward": 0.7265625208616256, "rewards/format_reward": 0.0, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 565.5979385375977, "epoch": 0.6396588486140725, "grad_norm": 0.18840980741585026, "kl": 0.0075702667236328125, "learning_rate": 1.036474508437579e-06, "loss": 0.0368, "reward": 0.7677083507180213, "reward_std": 0.18664944088086485, "rewards/accuracy_reward": 0.7677083507180213, "rewards/format_reward": 0.0, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 580.122933959961, "epoch": 0.6823027718550106, "grad_norm": 0.11471347531056907, "kl": 0.006945037841796875, "learning_rate": 8.290302775265509e-07, "loss": 0.0341, "reward": 0.7583333522081375, "reward_std": 0.1750888627022505, "rewards/accuracy_reward": 0.7583333522081375, "rewards/format_reward": 0.0, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 592.2614776611329, "epoch": 0.7249466950959488, "grad_norm": 0.22116249522463094, "kl": 0.009731292724609375, "learning_rate": 6.3657440147149e-07, "loss": 0.0333, "reward": 0.7656250178813935, "reward_std": 0.21401627436280252, "rewards/accuracy_reward": 0.7656250178813935, "rewards/format_reward": 0.0, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 579.4776222229004, "epoch": 0.767590618336887, "grad_norm": 0.2030484454112882, "kl": 0.010486984252929687, "learning_rate": 4.63406026519703e-07, "loss": 0.0328, "reward": 0.7552083522081375, "reward_std": 0.16649889973923565, "rewards/accuracy_reward": 0.7552083522081375, "rewards/format_reward": 0.0, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 586.7932479858398, "epoch": 0.8102345415778252, "grad_norm": 0.11451963582699543, "kl": 0.006160736083984375, "learning_rate": 3.133934480154885e-07, "loss": 0.0294, "reward": 0.7520833507180213, "reward_std": 0.18174212109297513, "rewards/accuracy_reward": 0.7520833507180213, "rewards/format_reward": 0.0, "step": 95 }, { "epoch": 0.8528784648187633, "grad_norm": 0.1403395507058327, "learning_rate": 1.8988769907430552e-07, "loss": 0.0343, "step": 100 }, { "epoch": 0.8528784648187633, "eval_clip_ratio": 0.0, "eval_completion_length": 571.9227185058594, "eval_kl": 0.0098244873046875, "eval_loss": 0.019065221771597862, "eval_reward": 0.6759333529949189, "eval_reward_std": 0.2166214306771755, "eval_rewards/accuracy_reward": 0.6759333529949189, "eval_rewards/format_reward": 0.0, "eval_runtime": 11129.6267, "eval_samples_per_second": 0.449, "eval_steps_per_second": 0.009, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 583.2151233673096, "epoch": 0.8955223880597015, "grad_norm": 0.1561949193212127, "kl": 0.005991172790527344, "learning_rate": 9.564769404039419e-08, "loss": 0.0236, "reward": 0.7598958529531956, "reward_std": 0.1863211216405034, "rewards/accuracy_reward": 0.7598958529531956, "rewards/format_reward": 0.0, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 586.4302299499511, "epoch": 0.9381663113006397, "grad_norm": 5.136938540934993, "kl": 0.008953857421875, "learning_rate": 3.277859889929147e-08, "loss": 0.0333, "reward": 0.7864583492279053, "reward_std": 0.1887844305485487, "rewards/accuracy_reward": 0.7864583492279053, "rewards/format_reward": 0.0, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 569.3703300476075, "epoch": 0.9808102345415778, "grad_norm": 0.147469744170478, "kl": 0.00705413818359375, "learning_rate": 2.684805348397268e-09, "loss": 0.0296, "reward": 0.7828125163912774, "reward_std": 0.16681436980143188, "rewards/accuracy_reward": 0.7828125163912774, "rewards/format_reward": 0.0, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 583.3411636352539, "epoch": 0.997867803837953, "kl": 0.007679939270019531, "reward": 0.7473958507180214, "reward_std": 0.18170781643129885, "rewards/accuracy_reward": 0.7473958507180214, "rewards/format_reward": 0.0, "step": 117, "total_flos": 0.0, "train_loss": 0.034317180164094664, "train_runtime": 31905.6637, "train_samples_per_second": 0.235, "train_steps_per_second": 0.004 } ], "logging_steps": 5, "max_steps": 117, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }